diff --git a/speech/beta/pom.xml b/speech/beta/pom.xml
new file mode 100644
index 00000000000..623af887158
--- /dev/null
+++ b/speech/beta/pom.xml
@@ -0,0 +1,139 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>com.example.speech</groupId>
+  <artifactId>speech-google-cloud-samples</artifactId>
+  <packaging>jar</packaging>
+
+  <parent>
+    <groupId>com.google.cloud.samples</groupId>
+    <artifactId>shared-configuration</artifactId>
+    <version>1.0.10</version>
+  </parent>
+
+  <properties>
+    <maven.compiler.source>1.8</maven.compiler.source>
+    <maven.compiler.target>1.8</maven.compiler.target>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+  </properties>
+
+  <dependencies>
+    <dependency>
+      <groupId>com.google.cloud</groupId>
+      <artifactId>google-cloud-speech</artifactId>
+      <version>0.56.0-beta</version>
+    </dependency>
+
+    <!-- Test dependencies -->
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <version>4.12</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.google.truth</groupId>
+      <artifactId>truth</artifactId>
+      <version>0.42</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <artifactId>maven-assembly-plugin</artifactId>
+        <configuration>
+          <archive>
+            <manifest>
+              <mainClass>com.example.speech.QuickstartSample</mainClass>
+            </manifest>
+          </archive>
+          <descriptorRefs>
+            <descriptorRef>jar-with-dependencies</descriptorRef>
+          </descriptorRefs>
+        </configuration>
+      </plugin>
+    </plugins>
+  </build>
+
+  <profiles>
+    <profile>
+      <id>Quickstart</id>
+      <activation>
+        <property>
+          <name>Quickstart</name>
+        </property>
+      </activation>
+      <build>
+        <plugins>
+          <plugin>
+            <groupId>org.codehaus.mojo</groupId>
+            <artifactId>exec-maven-plugin</artifactId>
+            <version>1.6.0</version>
+            <executions>
+              <execution>
+                <goals>
+                  <goal>java</goal>
+                </goals>
+              </execution>
+            </executions>
+            <configuration>
+              <mainClass>com.example.speech.QuickstartSample</mainClass>
+              <cleanupDaemonThreads>false</cleanupDaemonThreads>
+            </configuration>
+          </plugin>
+        </plugins>
+      </build>
+    </profile>
+
+    <profile>
+      <id>Recognize</id>
+      <activation>
+        <property>
+          <name>Recognize</name>
+        </property>
+      </activation>
+      <build>
+        <plugins>
+          <plugin>
+            <groupId>org.codehaus.mojo</groupId>
+            <artifactId>exec-maven-plugin</artifactId>
+            <version>1.6.0</version>
+            <executions>
+              <execution>
+                <goals>
+                  <goal>java</goal>
+                </goals>
+              </execution>
+            </executions>
+            <configuration>
+              <mainClass>com.example.speech.Recognize</mainClass>
+              <cleanupDaemonThreads>false</cleanupDaemonThreads>
+            </configuration>
+          </plugin>
+        </plugins>
+      </build>
+    </profile>
+  </profiles>
+</project>
diff --git a/speech/beta/resources/Google_Gnome.wav b/speech/beta/resources/Google_Gnome.wav
new file mode 100644
index 00000000000..2f497b7fbe7
Binary files /dev/null and b/speech/beta/resources/Google_Gnome.wav differ
diff --git a/speech/beta/resources/audio.raw b/speech/beta/resources/audio.raw
new file mode 100644
index 00000000000..5ebf79d3c9c
Binary files /dev/null and b/speech/beta/resources/audio.raw differ
diff --git a/speech/beta/resources/commercial_mono.wav b/speech/beta/resources/commercial_mono.wav
new file mode 100644
index 00000000000..e6b9ed434f9
Binary files /dev/null and b/speech/beta/resources/commercial_mono.wav differ
diff --git a/speech/beta/resources/commercial_stereo.wav b/speech/beta/resources/commercial_stereo.wav
new file mode 100644
index 00000000000..467f3687702
Binary files /dev/null and b/speech/beta/resources/commercial_stereo.wav differ
diff --git a/speech/beta/src/main/java/com/example/speech/Recognize.java b/speech/beta/src/main/java/com/example/speech/Recognize.java
new file mode 100644
index 00000000000..7c8aaccd74a
--- /dev/null
+++ b/speech/beta/src/main/java/com/example/speech/Recognize.java
@@ -0,0 +1,487 @@
+/*
+ * Copyright 2018 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.speech;
+
+import com.google.api.gax.longrunning.OperationFuture;
+import com.google.cloud.speech.v1p1beta1.LongRunningRecognizeMetadata;
+import com.google.cloud.speech.v1p1beta1.LongRunningRecognizeResponse;
+import com.google.cloud.speech.v1p1beta1.RecognitionAudio;
+import com.google.cloud.speech.v1p1beta1.RecognitionConfig;
+import com.google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding;
+import com.google.cloud.speech.v1p1beta1.RecognitionMetadata;
+import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.InteractionType;
+import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.MicrophoneDistance;
+import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.RecordingDeviceType;
+import com.google.cloud.speech.v1p1beta1.RecognizeResponse;
+import com.google.cloud.speech.v1p1beta1.SpeechClient;
+import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative;
+import com.google.cloud.speech.v1p1beta1.SpeechRecognitionResult;
+import com.google.protobuf.ByteString;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+
+public class Recognize {
+
+ /** Run speech recognition tasks. */
+ public static void main(String... args) throws Exception {
+ if (args.length < 1) {
+ System.out.println("Usage:");
+ System.out.printf(
+ "\tjava %s \"\" \"\"\n"
+ + "Commands:\n"
+ + "\t metadata | diarization | multi-channel |\n"
+ + "\t multi-language | word-level-conf\n"
+ + "Path:\n\tA file path (ex: ./resources/audio.raw) or a URI "
+ + "for a Cloud Storage resource (gs://...)\n",
+ Recognize.class.getCanonicalName());
+ return;
+ }
+ String command = args[0];
+ String path = args.length > 1 ? args[1] : "";
+
+ // Use command and GCS path pattern to invoke transcription.
+ if (command.equals("metadata")) {
+ transcribeFileWithMetadata(path);
+ } else if (command.equals("diarization")) {
+ if (path.startsWith("gs://")) {
+ transcribeDiarizationGcs(path);
+ } else {
+ transcribeDiarization(path);
+ }
+ } else if (command.equals("multi-channel")) {
+ if (path.startsWith("gs://")) {
+ transcribeMultiChannelGcs(path);
+ } else {
+ transcribeMultiChannel(path);
+ }
+ } else if (command.equals("multi-language")) {
+ if (path.startsWith("gs://")) {
+ transcribeMultiLanguageGcs(path);
+ } else {
+ transcribeMultiLanguage(path);
+ }
+ } else if (command.equals("word-level-conf")) {
+ if (path.startsWith("gs://")) {
+ transcribeWordLevelConfidenceGcs(path);
+ } else {
+ transcribeWordLevelConfidence(path);
+ }
+ }
+ }
+
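+ // Example invocations (illustrative only -- they assume the "Recognize" exec profile
+ // defined in pom.xml and the sample files under ./resources or the
+ // cloud-samples-tests bucket used by the tests):
+ //   mvn compile exec:java -DRecognize -Dexec.args="diarization ./resources/commercial_mono.wav"
+ //   mvn compile exec:java -DRecognize -Dexec.args="word-level-conf gs://cloud-samples-tests/speech/brooklyn.flac"
+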
+ // [START speech_transcribe_recognition_metadata_beta]
+ /**
+ * Transcribe the given audio file and include recognition metadata in the request.
+ *
+ * @param fileName the path to an audio file.
+ */
+ public static void transcribeFileWithMetadata(String fileName) throws Exception {
+ Path path = Paths.get(fileName);
+ byte[] content = Files.readAllBytes(path);
+
+ try (SpeechClient speechClient = SpeechClient.create()) {
+ // Get the contents of the local audio file
+ RecognitionAudio recognitionAudio =
+ RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build();
+
+ // Construct a recognition metadata object.
+ // Most metadata fields are specified as enums that can be found
+ // in speech.enums.RecognitionMetadata
+ RecognitionMetadata metadata =
+ RecognitionMetadata.newBuilder()
+ .setInteractionType(InteractionType.DISCUSSION)
+ .setMicrophoneDistance(MicrophoneDistance.NEARFIELD)
+ .setRecordingDeviceType(RecordingDeviceType.SMARTPHONE)
+ .setRecordingDeviceName("Pixel 2 XL") // Some metadata fields are free form strings
+ // And some are integers, for instance the 6 digit NAICS code
+ // https://www.naics.com/search/
+ .setIndustryNaicsCodeOfAudio(519190)
+ .build();
+
+ // Configure request to enable enhanced models
+ RecognitionConfig config =
+ RecognitionConfig.newBuilder()
+ .setEncoding(AudioEncoding.LINEAR16)
+ .setLanguageCode("en-US")
+ .setSampleRateHertz(8000)
+ .setMetadata(metadata) // Add the metadata to the config
+ .build();
+
+ // Perform the transcription request
+ RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
+
+ // Print out the results
+ for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
+ // There can be several alternative transcripts for a given chunk of speech. Just use the
+ // first (most likely) one here.
+ SpeechRecognitionAlternative alternative = result.getAlternatives(0);
+ System.out.format("Transcript: %s\n\n", alternative.getTranscript());
+ }
+ }
+ }
+ // [END speech_transcribe_recognition_metadata_beta]
+
+ // [START speech_transcribe_diarization_beta]
+ /**
+ * Transcribe the given audio file using speaker diarization.
+ *
+ * @param fileName the path to an audio file.
+ */
+ public static void transcribeDiarization(String fileName) throws Exception {
+ Path path = Paths.get(fileName);
+ byte[] content = Files.readAllBytes(path);
+
+ try (SpeechClient speechClient = SpeechClient.create()) {
+ // Get the contents of the local audio file
+ RecognitionAudio recognitionAudio =
+ RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build();
+
+ // Configure request to enable Speaker diarization
+ RecognitionConfig config =
+ RecognitionConfig.newBuilder()
+ .setEncoding(AudioEncoding.LINEAR16)
+ .setLanguageCode("en-US")
+ .setSampleRateHertz(8000)
+ .setEnableSpeakerDiarization(true)
+ .setDiarizationSpeakerCount(2)
+ .build();
+
+ // Perform the transcription request
+ RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
+
+ // Print out the results
+ for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
+ // There can be several alternative transcripts for a given chunk of speech. Just
+ // use the first (most likely) one here.
+ SpeechRecognitionAlternative alternative = result.getAlternatives(0);
+ System.out.format("Transcript : %s\n", alternative.getTranscript());
+ // The words array contains the entire transcript up until that point.
+ // Reference the last spoken word to get the associated speaker tag.
+ System.out.format(
+ "Speaker Tag %s: %s\n",
+ alternative.getWords((alternative.getWordsCount() - 1)).getSpeakerTag(),
+ alternative.getTranscript());
+ }
+ }
+ }
+ // [END speech_transcribe_diarization_beta]
+
+ // [START speech_transcribe_diarization_gcs_beta]
+ /**
+ * Transcribe a remote audio file using speaker diarization.
+ *
+ * @param gcsUri the path to an audio file.
+ */
+ public static void transcribeDiarizationGcs(String gcsUri) throws Exception {
+ try (SpeechClient speechClient = SpeechClient.create()) {
+ // Configure request to enable Speaker diarization
+ RecognitionConfig config =
+ RecognitionConfig.newBuilder()
+ .setEncoding(AudioEncoding.LINEAR16)
+ .setLanguageCode("en-US")
+ .setSampleRateHertz(8000)
+ .setEnableSpeakerDiarization(true)
+ .setDiarizationSpeakerCount(2)
+ .build();
+
+ // Set the remote path for the audio file
+ RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build();
+
+ // Use non-blocking call for getting file transcription
+ OperationFuture<LongRunningRecognizeResponse, LongRunningRecognizeMetadata> response =
+ speechClient.longRunningRecognizeAsync(config, audio);
+
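+ // Note: calling response.get() directly would also block until the operation
+ // completes; this polling loop just prints progress while waiting.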
+ while (!response.isDone()) {
+ System.out.println("Waiting for response...");
+ Thread.sleep(10000);
+ }
+
+ for (SpeechRecognitionResult result : response.get().getResultsList()) {
+ // There can be several alternative transcripts for a given chunk of speech. Just
+ // use the first (most likely) one here.
+ SpeechRecognitionAlternative alternative = result.getAlternatives(0);
+ // The words array contains the entire transcript up until that point.
+ // Reference the last spoken word to get the associated speaker tag.
+ System.out.format(
+ "Speaker Tag %s:%s\n",
+ alternative.getWords((alternative.getWordsCount() - 1)).getSpeakerTag(),
+ alternative.getTranscript());
+ }
+ }
+ }
+ // [END speech_transcribe_diarization_gcs_beta]
+
+ // [START speech_transcribe_multichannel_beta]
+ /**
+ * Transcribe a local audio file with multi-channel recognition
+ *
+ * @param fileName the path to the local audio file
+ */
+ public static void transcribeMultiChannel(String fileName) throws Exception {
+ Path path = Paths.get(fileName);
+ byte[] content = Files.readAllBytes(path);
+
+ try (SpeechClient speechClient = SpeechClient.create()) {
+ // Get the contents of the local audio file
+ RecognitionAudio recognitionAudio =
+ RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build();
+
+ // Configure request to enable multiple channels
+ RecognitionConfig config =
+ RecognitionConfig.newBuilder()
+ .setEncoding(AudioEncoding.LINEAR16)
+ .setLanguageCode("en-US")
+ .setSampleRateHertz(44100)
+ .setAudioChannelCount(2)
+ .setEnableSeparateRecognitionPerChannel(true)
+ .build();
+
+ // Perform the transcription request
+ RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
+
+ // Print out the results
+ for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
+ // There can be several alternative transcripts for a given chunk of speech. Just use the
+ // first (most likely) one here.
+ SpeechRecognitionAlternative alternative = result.getAlternatives(0);
+ System.out.format("Transcript : %s\n", alternative.getTranscript());
+ System.out.printf("Channel Tag : %s\n\n", result.getChannelTag());
+ }
+ }
+ }
+ // [END speech_transcribe_multichannel_beta]
+
+ // [START speech_transcribe_multichannel_gcs_beta]
+ /**
+ * Transcribe a remote audio file with multi-channel recognition
+ *
+ * @param gcsUri the path to the audio file
+ */
+ public static void transcribeMultiChannelGcs(String gcsUri) throws Exception {
+
+ try (SpeechClient speechClient = SpeechClient.create()) {
+
+ // Configure request to enable multiple channels
+ RecognitionConfig config =
+ RecognitionConfig.newBuilder()
+ .setEncoding(AudioEncoding.LINEAR16)
+ .setLanguageCode("en-US")
+ .setSampleRateHertz(44100)
+ .setAudioChannelCount(2)
+ .setEnableSeparateRecognitionPerChannel(true)
+ .build();
+
+ // Set the remote path for the audio file
+ RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build();
+
+ // Use non-blocking call for getting file transcription
+ OperationFuture<LongRunningRecognizeResponse, LongRunningRecognizeMetadata> response =
+ speechClient.longRunningRecognizeAsync(config, audio);
+
+ while (!response.isDone()) {
+ System.out.println("Waiting for response...");
+ Thread.sleep(10000);
+ }
+ // Print out the results
+ for (SpeechRecognitionResult result : response.get().getResultsList()) {
+
+ // There can be several alternative transcripts for a given chunk of speech. Just use the
+ // first (most likely) one here.
+ SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
+
+ // Print out the result
+ System.out.printf("Transcript : %s\n", alternative.getTranscript());
+ System.out.printf("Channel Tag : %s\n\n", result.getChannelTag());
+ }
+ }
+ }
+ // [END speech_transcribe_multichannel_gcs_beta]
+
+ // [START speech_transcribe_multilanguage_beta]
+ /**
+ * Transcribe a local audio file with multi-language recognition
+ *
+ * @param fileName the path to the audio file
+ */
+ public static void transcribeMultiLanguage(String fileName) throws Exception {
+ Path path = Paths.get(fileName);
+ // Get the contents of the local audio file
+ byte[] content = Files.readAllBytes(path);
+
+ try (SpeechClient speechClient = SpeechClient.create()) {
+
+ RecognitionAudio recognitionAudio =
+ RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build();
+ ArrayList<String> languageList = new ArrayList<>();
+ languageList.add("es-ES");
+ languageList.add("en-US");
+
+ // Configure request to enable multiple languages
+ RecognitionConfig config =
+ RecognitionConfig.newBuilder()
+ .setEncoding(AudioEncoding.LINEAR16)
+ .setSampleRateHertz(16000)
+ .setLanguageCode("ja-JP")
+ .addAllAlternativeLanguageCodes(languageList)
+ .build();
+ // Perform the transcription request
+ RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
+
+ // Print out the results
+ for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
+ // There can be several alternative transcripts for a given chunk of speech. Just use the
+ // first (most likely) one here.
+ SpeechRecognitionAlternative alternative = result.getAlternatives(0);
+ System.out.format("Transcript : %s\n\n", alternative.getTranscript());
+ }
+ }
+ }
+ // [END speech_transcribe_multilanguage_beta]
+
+ // [START speech_transcribe_multilanguage_gcs_beta]
+ /**
+ * Transcribe a remote audio file with multi-language recognition
+ *
+ * @param gcsUri the path to the remote audio file
+ */
+ public static void transcribeMultiLanguageGcs(String gcsUri) throws Exception {
+ try (SpeechClient speechClient = SpeechClient.create()) {
+
+ ArrayList<String> languageList = new ArrayList<>();
+ languageList.add("es-ES");
+ languageList.add("en-US");
+
+ // Configure request to enable multiple languages
+ RecognitionConfig config =
+ RecognitionConfig.newBuilder()
+ .setEncoding(AudioEncoding.LINEAR16)
+ .setSampleRateHertz(16000)
+ .setLanguageCode("ja-JP")
+ .addAllAlternativeLanguageCodes(languageList)
+ .build();
+
+ // Set the remote path for the audio file
+ RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build();
+
+ // Use non-blocking call for getting file transcription
+ OperationFuture<LongRunningRecognizeResponse, LongRunningRecognizeMetadata> response =
+ speechClient.longRunningRecognizeAsync(config, audio);
+
+ while (!response.isDone()) {
+ System.out.println("Waiting for response...");
+ Thread.sleep(10000);
+ }
+
+ for (SpeechRecognitionResult result : response.get().getResultsList()) {
+
+ // There can be several alternative transcripts for a given chunk of speech. Just use the
+ // first (most likely) one here.
+ SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
+
+ // Print out the result
+ System.out.printf("Transcript : %s\n\n", alternative.getTranscript());
+ }
+ }
+ }
+ // [END speech_transcribe_multilanguage_gcs_beta]
+
+ // [START speech_transcribe_word_level_confidence_beta]
+ /**
+ * Transcribe a local audio file with word level confidence
+ *
+ * @param fileName the path to the local audio file
+ */
+ public static void transcribeWordLevelConfidence(String fileName) throws Exception {
+ Path path = Paths.get(fileName);
+ byte[] content = Files.readAllBytes(path);
+
+ try (SpeechClient speechClient = SpeechClient.create()) {
+ RecognitionAudio recognitionAudio =
+ RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build();
+ // Configure request to enable word level confidence
+ RecognitionConfig config =
+ RecognitionConfig.newBuilder()
+ .setEncoding(AudioEncoding.LINEAR16)
+ .setSampleRateHertz(16000)
+ .setLanguageCode("en-US")
+ .setEnableWordConfidence(true)
+ .build();
+ // Perform the transcription request
+ RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
+
+ // Print out the results
+ for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
+ // There can be several alternative transcripts for a given chunk of speech. Just use the
+ // first (most likely) one here.
+ SpeechRecognitionAlternative alternative = result.getAlternatives(0);
+ System.out.format("Transcript : %s\n", alternative.getTranscript());
+ System.out.format(
+ "First Word and Confidence : %s %s \n",
+ alternative.getWords(0).getWord(), alternative.getWords(0).getConfidence());
+ }
+ }
+ }
+ // [END speech_transcribe_word_level_confidence_beta]
+
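+ // For reference only -- a hypothetical sketch (not part of the snippet above) that
+ // would print every word with its confidence, given the same "alternative" variable
+ // and an extra import of com.google.cloud.speech.v1p1beta1.WordInfo:
+ //   for (WordInfo wordInfo : alternative.getWordsList()) {
+ //     System.out.printf("%s (confidence: %s)\n", wordInfo.getWord(), wordInfo.getConfidence());
+ //   }
+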
+ // [START speech_transcribe_word_level_confidence_gcs_beta]
+ /**
+ * Transcribe a remote audio file with word level confidence
+ *
+ * @param gcsUri path to the remote audio file
+ */
+ public static void transcribeWordLevelConfidenceGcs(String gcsUri) throws Exception {
+ try (SpeechClient speechClient = SpeechClient.create()) {
+
+ // Configure request to enable word level confidence
+ RecognitionConfig config =
+ RecognitionConfig.newBuilder()
+ .setEncoding(AudioEncoding.FLAC)
+ .setSampleRateHertz(16000)
+ .setLanguageCode("en-US")
+ .setEnableWordConfidence(true)
+ .build();
+
+ // Set the remote path for the audio file
+ RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build();
+
+ // Use non-blocking call for getting file transcription
+ OperationFuture<LongRunningRecognizeResponse, LongRunningRecognizeMetadata> response =
+ speechClient.longRunningRecognizeAsync(config, audio);
+
+ while (!response.isDone()) {
+ System.out.println("Waiting for response...");
+ Thread.sleep(10000);
+ }
+ // Just print the first result here.
+ SpeechRecognitionResult result = response.get().getResultsList().get(0);
+
+ // There can be several alternative transcripts for a given chunk of speech. Just use the
+ // first (most likely) one here.
+ SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
+ // Print out the result
+ System.out.printf("Transcript : %s\n", alternative.getTranscript());
+ System.out.format(
+ "First Word and Confidence : %s %s \n",
+ alternative.getWords(0).getWord(), alternative.getWords(0).getConfidence());
+ }
+ }
+ // [END speech_transcribe_word_level_confidence_gcs_beta]
+}
diff --git a/speech/beta/src/test/java/com/example/speech/RecognizeIT.java b/speech/beta/src/test/java/com/example/speech/RecognizeIT.java
new file mode 100644
index 00000000000..5219b58dc79
--- /dev/null
+++ b/speech/beta/src/test/java/com/example/speech/RecognizeIT.java
@@ -0,0 +1,127 @@
+/*
+ * Copyright 2018 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.speech;
+
+import static com.google.common.truth.Truth.assertThat;
+
+import java.io.ByteArrayOutputStream;
+import java.io.PrintStream;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/** Tests for speech recognize sample. */
+@RunWith(JUnit4.class)
+@SuppressWarnings("checkstyle:abbreviationaswordinname")
+public class RecognizeIT {
+ private static final String BUCKET = "cloud-samples-tests";
+
+ private ByteArrayOutputStream bout;
+ private PrintStream out;
+
+ // The path to the audio file to transcribe
+ private String audioFileName = "./resources/audio.raw";
+ private String multiChannelAudioFileName = "./resources/commercial_stereo.wav";
+ private String gcsMultiChannelAudioPath = "gs://" + BUCKET + "/speech/commercial_stereo.wav";
+ private String gcsAudioPath = "gs://" + BUCKET + "/speech/brooklyn.flac";
+ private String gcsDiarizationAudioPath = "gs://" + BUCKET + "/speech/commercial_mono.wav";
+
+ // The path to the video file to transcribe
+ private String videoFileName = "./resources/Google_Gnome.wav";
+ private String gcsVideoPath = "gs://" + BUCKET + "/speech/Google_Gnome.wav";
+
+ private String recognitionAudioFile = "./resources/commercial_mono.wav";
+
+ @Before
+ public void setUp() {
+ bout = new ByteArrayOutputStream();
+ out = new PrintStream(bout);
+ System.setOut(out);
+ }
+
+ @After
+ public void tearDown() {
+ System.setOut(null);
+ }
+
+ @Test
+ public void testMetadata() throws Exception {
+ Recognize.transcribeFileWithMetadata(recognitionAudioFile);
+ String got = bout.toString();
+ assertThat(got).contains("Chrome");
+ }
+
+ @Test
+ public void testTranscribeDiarization() throws Exception {
+ Recognize.transcribeDiarization(recognitionAudioFile);
+ String got = bout.toString();
+ assertThat(got).contains("Speaker Tag 2:");
+ }
+
+ @Test
+ public void testTranscribeDiarizationGcs() throws Exception {
+ Recognize.transcribeDiarizationGcs(gcsDiarizationAudioPath);
+ String got = bout.toString();
+ assertThat(got).contains("Speaker Tag 2:");
+ }
+
+ @Test
+ public void testTranscribeMultiChannel() throws Exception {
+ Recognize.transcribeMultiChannel(multiChannelAudioFileName);
+ String got = bout.toString();
+ assertThat(got).contains("Channel Tag : 1");
+ }
+
+ @Test
+ public void testTranscribeMultiChannelGcs() throws Exception {
+ Recognize.transcribeMultiChannelGcs(gcsMultiChannelAudioPath);
+ String got = bout.toString();
+ assertThat(got).contains("Channel Tag : 1");
+ }
+
+ @Test
+ public void testTranscribeMultiLanguage() throws Exception {
+ Recognize.transcribeMultiLanguage(videoFileName);
+ String got = bout.toString();
+ assertThat(got).contains("Transcript : OK Google");
+ }
+
+ @Test
+ public void testTranscribeMultiLanguageGcs() throws Exception {
+ Recognize.transcribeMultiLanguageGcs(gcsVideoPath);
+ String got = bout.toString();
+ assertThat(got).contains("Transcript : OK Google");
+ }
+
+ @Test
+ public void testTranscribeWordLevelConfidence() throws Exception {
+ Recognize.transcribeWordLevelConfidence(audioFileName);
+ String got = bout.toString();
+ assertThat(got).contains("Transcript : how old is the Brooklyn Bridge");
+ assertThat(got).contains("First Word and Confidence : how");
+ }
+
+ @Test
+ public void testTranscribeWordLevelConfidenceGcs() throws Exception {
+ Recognize.transcribeWordLevelConfidenceGcs(gcsAudioPath);
+ String got = bout.toString();
+ assertThat(got).contains("Transcript : how old is the Brooklyn Bridge");
+ assertThat(got).contains("First Word and Confidence : how");
+ }
+}
diff --git a/speech/cloud-client/pom.xml b/speech/cloud-client/pom.xml
index 68cd05bd13b..d926b40db03 100644
--- a/speech/cloud-client/pom.xml
+++ b/speech/cloud-client/pom.xml
@@ -40,7 +40,7 @@
com.google.cloud
google-cloud-speech
- 0.56.0-beta
+ 0.61.0-beta
diff --git a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java
index 3d6bfaecf29..de05ad84185 100644
--- a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java
+++ b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java
@@ -22,24 +22,20 @@
import com.google.api.gax.rpc.ClientStream;
import com.google.api.gax.rpc.ResponseObserver;
import com.google.api.gax.rpc.StreamController;
-import com.google.cloud.speech.v1p1beta1.LongRunningRecognizeMetadata;
-import com.google.cloud.speech.v1p1beta1.LongRunningRecognizeResponse;
-import com.google.cloud.speech.v1p1beta1.RecognitionAudio;
-import com.google.cloud.speech.v1p1beta1.RecognitionConfig;
-import com.google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding;
-import com.google.cloud.speech.v1p1beta1.RecognitionMetadata;
-import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.InteractionType;
-import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.MicrophoneDistance;
-import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.RecordingDeviceType;
-import com.google.cloud.speech.v1p1beta1.RecognizeResponse;
-import com.google.cloud.speech.v1p1beta1.SpeechClient;
-import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative;
-import com.google.cloud.speech.v1p1beta1.SpeechRecognitionResult;
-import com.google.cloud.speech.v1p1beta1.StreamingRecognitionConfig;
-import com.google.cloud.speech.v1p1beta1.StreamingRecognitionResult;
-import com.google.cloud.speech.v1p1beta1.StreamingRecognizeRequest;
-import com.google.cloud.speech.v1p1beta1.StreamingRecognizeResponse;
-import com.google.cloud.speech.v1p1beta1.WordInfo;
+import com.google.cloud.speech.v1.LongRunningRecognizeMetadata;
+import com.google.cloud.speech.v1.LongRunningRecognizeResponse;
+import com.google.cloud.speech.v1.RecognitionAudio;
+import com.google.cloud.speech.v1.RecognitionConfig;
+import com.google.cloud.speech.v1.RecognitionConfig.AudioEncoding;
+import com.google.cloud.speech.v1.RecognizeResponse;
+import com.google.cloud.speech.v1.SpeechClient;
+import com.google.cloud.speech.v1.SpeechRecognitionAlternative;
+import com.google.cloud.speech.v1.SpeechRecognitionResult;
+import com.google.cloud.speech.v1.StreamingRecognitionConfig;
+import com.google.cloud.speech.v1.StreamingRecognitionResult;
+import com.google.cloud.speech.v1.StreamingRecognizeRequest;
+import com.google.cloud.speech.v1.StreamingRecognizeResponse;
+import com.google.cloud.speech.v1.WordInfo;
import com.google.common.util.concurrent.SettableFuture;
import com.google.protobuf.ByteString;
@@ -67,9 +63,8 @@ public static void main(String... args) throws Exception {
"\tjava %s \"\" \"\"\n"
+ "Commands:\n"
+ "\tsyncrecognize | asyncrecognize | streamrecognize | micstreamrecognize \n"
- + "\t| wordoffsets | model-selection | auto-punctuation | stream-punctuation \n"
- + "\t| enhanced-model| metadata | diarization | multi-channel | multi-language \n"
- + "\t | word-level-conf"
+ + "\t| wordoffsets | auto-punctuation | stream-punctuation \n"
+ + "\t| enhanced-model | model-selection\n"
+ "Path:\n\tA file path (ex: ./resources/audio.raw) or a URI "
+ "for a Cloud Storage resource (gs://...)\n",
Recognize.class.getCanonicalName());
@@ -101,12 +96,6 @@ public static void main(String... args) throws Exception {
streamingRecognizeFile(path);
} else if (command.equals("micstreamrecognize")) {
streamingMicRecognize();
- } else if (command.equals("model-selection")) {
- if (path.startsWith("gs://")) {
- transcribeModelSelectionGcs(path);
- } else {
- transcribeModelSelection(path);
- }
} else if (command.equals("auto-punctuation")) {
if (path.startsWith("gs://")) {
transcribeGcsWithAutomaticPunctuation(path);
@@ -117,31 +106,11 @@ public static void main(String... args) throws Exception {
streamingTranscribeWithAutomaticPunctuation(path);
} else if (command.equals("enhanced-model")) {
transcribeFileWithEnhancedModel(path);
- } else if (command.equals("metadata")) {
- transcribeFileWithMetadata(path);
- } else if (command.equals("diarization")) {
- if (path.startsWith("gs://")) {
- transcribeDiarizationGcs(path);
- } else {
- transcribeDiarization(path);
- }
- } else if (command.equals("multi-channel")) {
- if (path.startsWith("gs://")) {
- transcribeMultiChannelGcs(path);
- } else {
- transcribeMultiChannel(path);
- }
- } else if (command.equals("multi-language")) {
- if (path.startsWith("gs://")) {
- transcribeMultiLanguageGcs(path);
- } else {
- transcribeMultiLanguage(path);
- }
- } else if (command.equals("word-level-conf")) {
+ } else if (command.equals("model-selection")) {
if (path.startsWith("gs://")) {
- transcribeWordLevelConfidenceGcs(path);
+ transcribeModelSelectionGcs(path);
} else {
- transcribeWordLevelConfidence(path);
+ transcribeModelSelection(path);
}
}
}
@@ -477,87 +446,6 @@ public SettableFuture<List<T>> future() {
}
// [END speech_transcribe_streaming]
- // [START speech_transcribe_model_selection_beta]
- /**
- * Performs transcription of the given audio file synchronously with the selected model.
- *
- * @param fileName the path to a audio file to transcribe
- */
- public static void transcribeModelSelection(String fileName) throws Exception {
- Path path = Paths.get(fileName);
- byte[] content = Files.readAllBytes(path);
-
- try (SpeechClient speech = SpeechClient.create()) {
- // Configure request with video media type
- RecognitionConfig recConfig =
- RecognitionConfig.newBuilder()
- // encoding may either be omitted or must match the value in the file header
- .setEncoding(AudioEncoding.LINEAR16)
- .setLanguageCode("en-US")
- // sample rate hertz may be either be omitted or must match the value in the file
- // header
- .setSampleRateHertz(16000)
- .setModel("video")
- .build();
-
- RecognitionAudio recognitionAudio =
- RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build();
-
- RecognizeResponse recognizeResponse = speech.recognize(recConfig, recognitionAudio);
- // Just print the first result here.
- SpeechRecognitionResult result = recognizeResponse.getResultsList().get(0);
- // There can be several alternative transcripts for a given chunk of speech. Just use the
- // first (most likely) one here.
- SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
- System.out.printf("Transcript : %s\n", alternative.getTranscript());
- }
- // [END speech_transcribe_model_selection_beta]
- }
-
- // [START speech_transcribe_model_selection_gcs_beta]
- /**
- * Performs transcription of the remote audio file asynchronously with the selected model.
- *
- * @param gcsUri the path to the remote audio file to transcribe.
- */
- public static void transcribeModelSelectionGcs(String gcsUri) throws Exception {
- try (SpeechClient speech = SpeechClient.create()) {
-
- // Configure request with video media type
- RecognitionConfig config =
- RecognitionConfig.newBuilder()
- // encoding may either be omitted or must match the value in the file header
- .setEncoding(AudioEncoding.LINEAR16)
- .setLanguageCode("en-US")
- // sample rate hertz may be either be omitted or must match the value in the file
- // header
- .setSampleRateHertz(16000)
- .setModel("video")
- .build();
-
- RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build();
-
- // Use non-blocking call for getting file transcription
- OperationFuture<LongRunningRecognizeResponse, LongRunningRecognizeMetadata> response =
- speech.longRunningRecognizeAsync(config, audio);
-
- while (!response.isDone()) {
- System.out.println("Waiting for response...");
- Thread.sleep(10000);
- }
-
- List<SpeechRecognitionResult> results = response.get().getResultsList();
-
- // Just print the first result here.
- SpeechRecognitionResult result = results.get(0);
- // There can be several alternative transcripts for a given chunk of speech. Just use the
- // first (most likely) one here.
- SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
- System.out.printf("Transcript : %s\n", alternative.getTranscript());
- }
- // [END speech_transcribe_model_selection_gcs_beta]
- }
-
// [START speech_sync_recognize_punctuation]
/**
* Performs transcription with automatic punctuation on raw PCM audio data.
@@ -598,7 +486,7 @@ public static void transcribeFileWithAutomaticPunctuation(String fileName) throw
}
// [END speech_sync_recognize_punctuation]
- // [START speech_transcribe_auto_punctuation_beta]
+ // [START speech_transcribe_auto_punctuation]
/**
* Performs transcription on remote FLAC file and prints the transcription.
*
@@ -638,7 +526,7 @@ public static void transcribeGcsWithAutomaticPunctuation(String gcsUri) throws E
System.out.printf("Transcript : %s\n", alternative.getTranscript());
}
}
- // [END speech_transcribe_auto_punctuation_beta]
+ // [END speech_transcribe_auto_punctuation]
// [START speech_stream_recognize_punctuation]
/**
@@ -820,7 +708,7 @@ public void onError(Throwable t) {
}
// [END speech_transcribe_streaming_mic]
- // [START speech_transcribe_enhanced_model_beta]
+ // [START speech_transcribe_enhanced_model]
/**
* Transcribe the given audio file using an enhanced model.
*
@@ -860,410 +748,86 @@ public static void transcribeFileWithEnhancedModel(String fileName) throws Excep
}
}
}
- // [END speech_transcribe_enhanced_model_beta]
+ // [END speech_transcribe_enhanced_model]
- // [START speech_transcribe_recognition_metadata_beta]
+ // [START speech_transcribe_model_selection]
/**
- * Transcribe the given audio file and include recognition metadata in the request.
- *
- * @param fileName the path to an audio file.
- */
- public static void transcribeFileWithMetadata(String fileName) throws Exception {
- Path path = Paths.get(fileName);
- byte[] content = Files.readAllBytes(path);
-
- try (SpeechClient speechClient = SpeechClient.create()) {
- // Get the contents of the local audio file
- RecognitionAudio recognitionAudio =
- RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build();
-
- // Construct a recognition metadata object.
- // Most metadata fields are specified as enums that can be found
- // in speech.enums.RecognitionMetadata
- RecognitionMetadata metadata =
- RecognitionMetadata.newBuilder()
- .setInteractionType(InteractionType.DISCUSSION)
- .setMicrophoneDistance(MicrophoneDistance.NEARFIELD)
- .setRecordingDeviceType(RecordingDeviceType.SMARTPHONE)
- .setRecordingDeviceName("Pixel 2 XL") // Some metadata fields are free form strings
- // And some are integers, for instance the 6 digit NAICS code
- // https://www.naics.com/search/
- .setIndustryNaicsCodeOfAudio(519190)
- .build();
-
- // Configure request to enable enhanced models
- RecognitionConfig config =
- RecognitionConfig.newBuilder()
- .setEncoding(AudioEncoding.LINEAR16)
- .setLanguageCode("en-US")
- .setSampleRateHertz(8000)
- .setMetadata(metadata) // Add the metadata to the config
- .build();
-
- // Perform the transcription request
- RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
-
- // Print out the results
- for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
- // There can be several alternative transcripts for a given chunk of speech. Just use the
- // first (most likely) one here.
- SpeechRecognitionAlternative alternative = result.getAlternatives(0);
- System.out.format("Transcript: %s\n\n", alternative.getTranscript());
- }
- }
- }
- // [END speech_transcribe_recognition_metadata_beta]
-
- // [START speech_transcribe_diarization_beta]
- /**
- * Transcribe the given audio file using speaker diarization.
+ * Performs transcription of the given audio file synchronously with the selected model.
*
- * @param fileName the path to an audio file.
+ * @param fileName the path to an audio file to transcribe
*/
- public static void transcribeDiarization(String fileName) throws Exception {
+ public static void transcribeModelSelection(String fileName) throws Exception {
Path path = Paths.get(fileName);
byte[] content = Files.readAllBytes(path);
- try (SpeechClient speechClient = SpeechClient.create()) {
- // Get the contents of the local audio file
- RecognitionAudio recognitionAudio =
- RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build();
-
- // Configure request to enable Speaker diarization
- RecognitionConfig config =
- RecognitionConfig.newBuilder()
- .setEncoding(AudioEncoding.LINEAR16)
- .setLanguageCode("en-US")
- .setSampleRateHertz(8000)
- .setEnableSpeakerDiarization(true)
- .setDiarizationSpeakerCount(2)
- .build();
-
- // Perform the transcription request
- RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
-
- // Print out the results
- for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
- // There can be several alternative transcripts for a given chunk of speech. Just
- // use the first (most likely) one here.
- SpeechRecognitionAlternative alternative = result.getAlternatives(0);
- System.out.format("Transcript : %s\n", alternative.getTranscript());
- // The words array contains the entire transcript up until that point.
- // Referencing the last spoken word to get the associated Speaker tag
- System.out.format(
- "Speaker Tag %s: %s\n",
- alternative.getWords((alternative.getWordsCount() - 1)).getSpeakerTag(),
- alternative.getTranscript());
- }
- }
- }
- // [END speech_transcribe_diarization_beta]
-
- // [START speech_transcribe_diarization_gcs_beta]
- /**
- * Transcribe a remote audio file using speaker diarization.
- *
- * @param gcsUri the path to an audio file.
- */
- public static void transcribeDiarizationGcs(String gcsUri) throws Exception {
- try (SpeechClient speechClient = SpeechClient.create()) {
- // Configure request to enable Speaker diarization
- RecognitionConfig config =
+ try (SpeechClient speech = SpeechClient.create()) {
+ // Configure request with video media type
+ RecognitionConfig recConfig =
RecognitionConfig.newBuilder()
+ // encoding may either be omitted or must match the value in the file header
.setEncoding(AudioEncoding.LINEAR16)
.setLanguageCode("en-US")
- .setSampleRateHertz(8000)
- .setEnableSpeakerDiarization(true)
- .setDiarizationSpeakerCount(2)
+ // sample rate hertz may either be omitted or must match the value in the file
+ // header
+ .setSampleRateHertz(16000)
+ .setModel("video")
.build();
- // Set the remote path for the audio file
- RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build();
-
- // Use non-blocking call for getting file transcription
- OperationFuture<LongRunningRecognizeResponse, LongRunningRecognizeMetadata> response =
- speechClient.longRunningRecognizeAsync(config, audio);
-
- while (!response.isDone()) {
- System.out.println("Waiting for response...");
- Thread.sleep(10000);
- }
-
- for (SpeechRecognitionResult result : response.get().getResultsList()) {
- // There can be several alternative transcripts for a given chunk of speech. Just
- // use the first (most likely) one here.
- SpeechRecognitionAlternative alternative = result.getAlternatives(0);
- // The words array contains the entire transcript up until that point.
- // Referencing the last spoken word to get the associated Speaker tag
- System.out.format(
- "Speaker Tag %s:%s\n",
- alternative.getWords((alternative.getWordsCount() - 1)).getSpeakerTag(),
- alternative.getTranscript());
- }
- }
- }
-
- // [END speech_transcribe_diarization_gcs_beta]
-
- // [START speech_transcribe_multichannel_beta]
-
- /**
- * Transcribe a local audio file with multi-channel recognition
- *
- * @param fileName the path to local audio file
- */
- public static void transcribeMultiChannel(String fileName) throws Exception {
- Path path = Paths.get(fileName);
- byte[] content = Files.readAllBytes(path);
-
- try (SpeechClient speechClient = SpeechClient.create()) {
- // Get the contents of the local audio file
RecognitionAudio recognitionAudio =
RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build();
- // Configure request to enable multiple channels
- RecognitionConfig config =
- RecognitionConfig.newBuilder()
- .setEncoding(AudioEncoding.LINEAR16)
- .setLanguageCode("en-US")
- .setSampleRateHertz(44100)
- .setAudioChannelCount(2)
- .setEnableSeparateRecognitionPerChannel(true)
- .build();
-
- // Perform the transcription request
- RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
-
- // Print out the results
- for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
- // There can be several alternative transcripts for a given chunk of speech. Just use the
- // first (most likely) one here.
- SpeechRecognitionAlternative alternative = result.getAlternatives(0);
- System.out.format("Transcript : %s\n", alternative.getTranscript());
- System.out.printf("Channel Tag : %s\n\n", result.getChannelTag());
- }
- }
- }
- // [END speech_transcribe_multichannel_beta]
-
- // [START speech_transcribe_multichannel_gcs_beta]
-
- /**
- * Transcribe a remote audio file with multi-channel recognition
- *
- * @param gcsUri the path to the audio file
- */
- public static void transcribeMultiChannelGcs(String gcsUri) throws Exception {
-
- try (SpeechClient speechClient = SpeechClient.create()) {
-
- // Configure request to enable multiple channels
- RecognitionConfig config =
- RecognitionConfig.newBuilder()
- .setEncoding(AudioEncoding.LINEAR16)
- .setLanguageCode("en-US")
- .setSampleRateHertz(44100)
- .setAudioChannelCount(2)
- .setEnableSeparateRecognitionPerChannel(true)
- .build();
-
- // Set the remote path for the audio file
- RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build();
-
- // Use non-blocking call for getting file transcription
- OperationFuture<LongRunningRecognizeResponse, LongRunningRecognizeMetadata> response =
- speechClient.longRunningRecognizeAsync(config, audio);
-
- while (!response.isDone()) {
- System.out.println("Waiting for response...");
- Thread.sleep(10000);
- }
+ RecognizeResponse recognizeResponse = speech.recognize(recConfig, recognitionAudio);
// Just print the first result here.
- for (SpeechRecognitionResult result : response.get().getResultsList()) {
-
- // There can be several alternative transcripts for a given chunk of speech. Just use the
- // first (most likely) one here.
- SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
-
- // Print out the result
- System.out.printf("Transcript : %s\n", alternative.getTranscript());
- System.out.printf("Channel Tag : %s\n\n", result.getChannelTag());
- }
- }
- }
- // [END speech_transcribe_multichannel_gcs_beta]
-
- // [START speech_transcribe_multilanguage_beta]
-
- /**
- * Transcribe a local audio file with multi-language recognition
- *
- * @param fileName the path to the audio file
- */
- public static void transcribeMultiLanguage(String fileName) throws Exception {
- Path path = Paths.get(fileName);
- // Get the contents of the local audio file
- byte[] content = Files.readAllBytes(path);
-
- try (SpeechClient speechClient = SpeechClient.create()) {
-
- RecognitionAudio recognitionAudio =
- RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build();
- ArrayList<String> languageList = new ArrayList<>();
- languageList.add("es-ES");
- languageList.add("en-US");
-
- // Configure request to enable multiple languages
- RecognitionConfig config =
- RecognitionConfig.newBuilder()
- .setEncoding(AudioEncoding.LINEAR16)
- .setSampleRateHertz(16000)
- .setLanguageCode("ja-JP")
- .addAllAlternativeLanguageCodes(languageList)
- .build();
- // Perform the transcription request
- RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
-
- // Print out the results
- for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
- // There can be several alternative transcripts for a given chunk of speech. Just use the
- // first (most likely) one here.
- SpeechRecognitionAlternative alternative = result.getAlternatives(0);
- System.out.format("Transcript : %s\n\n", alternative.getTranscript());
- }
+ SpeechRecognitionResult result = recognizeResponse.getResultsList().get(0);
+ // There can be several alternative transcripts for a given chunk of speech. Just use the
+ // first (most likely) one here.
+ SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
+ System.out.printf("Transcript : %s\n", alternative.getTranscript());
}
}
- // [END speech_transcribe_multilanguage_beta]
-
- // [START speech_transcribe_multilanguage_gcs_beta]
+ // [END speech_transcribe_model_selection]
+ // [START speech_transcribe_model_selection_gcs]
/**
- * Transcribe a remote audio file with multi-language recognition
+ * Performs transcription of the remote audio file asynchronously with the selected model.
*
- * @param gcsUri the path to the remote audio file
+ * @param gcsUri the path to the remote audio file to transcribe.
*/
- public static void transcribeMultiLanguageGcs(String gcsUri) throws Exception {
- try (SpeechClient speechClient = SpeechClient.create()) {
-
- ArrayList<String> languageList = new ArrayList<>();
- languageList.add("es-ES");
- languageList.add("en-US");
+ public static void transcribeModelSelectionGcs(String gcsUri) throws Exception {
+ try (SpeechClient speech = SpeechClient.create()) {
- // Configure request to enable multiple languages
+ // Configure request with video media type
RecognitionConfig config =
RecognitionConfig.newBuilder()
+ // encoding may either be omitted or must match the value in the file header
.setEncoding(AudioEncoding.LINEAR16)
+ .setLanguageCode("en-US")
+ // sample rate hertz may either be omitted or must match the value in the file
+ // header
.setSampleRateHertz(16000)
- .setLanguageCode("ja-JP")
- .addAllAlternativeLanguageCodes(languageList)
+ .setModel("video")
.build();
- // Set the remote path for the audio file
RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build();
// Use non-blocking call for getting file transcription
OperationFuture<LongRunningRecognizeResponse, LongRunningRecognizeMetadata> response =
- speechClient.longRunningRecognizeAsync(config, audio);
+ speech.longRunningRecognizeAsync(config, audio);
while (!response.isDone()) {
System.out.println("Waiting for response...");
Thread.sleep(10000);
}
- for (SpeechRecognitionResult result : response.get().getResultsList()) {
-
- // There can be several alternative transcripts for a given chunk of speech. Just use the
- // first (most likely) one here.
- SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
-
- // Print out the result
- System.out.printf("Transcript : %s\n\n", alternative.getTranscript());
- }
- }
- }
- // [END speech_transcribe_multilanguage_gcs_beta]
-
- // [START speech_transcribe_word_level_confidence_beta]
-
- /**
- * Transcribe a local audio file with word level confidence
- *
- * @param fileName the path to the local audio file
- */
- public static void transcribeWordLevelConfidence(String fileName) throws Exception {
- Path path = Paths.get(fileName);
- byte[] content = Files.readAllBytes(path);
-
- try (SpeechClient speechClient = SpeechClient.create()) {
- RecognitionAudio recognitionAudio =
- RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build();
- // Configure request to enable word level confidence
- RecognitionConfig config =
- RecognitionConfig.newBuilder()
- .setEncoding(AudioEncoding.LINEAR16)
- .setSampleRateHertz(16000)
- .setLanguageCode("en-US")
- .setEnableWordConfidence(true)
- .build();
- // Perform the transcription request
- RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
-
- // Print out the results
- for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
- // There can be several alternative transcripts for a given chunk of speech. Just use the
- // first (most likely) one here.
- SpeechRecognitionAlternative alternative = result.getAlternatives(0);
- System.out.format("Transcript : %s\n", alternative.getTranscript());
- System.out.format(
- "First Word and Confidence : %s %s \n",
- alternative.getWords(0).getWord(), alternative.getWords(0).getConfidence());
- }
- }
- }
- // [END speech_transcribe_word_level_confidence_beta]
-
- // [START speech_transcribe_word_level_confidence_gcs_beta]
-
- /**
- * Transcribe a remote audio file with word level confidence
- *
- * @param gcsUri path to the remote audio file
- */
- public static void transcribeWordLevelConfidenceGcs(String gcsUri) throws Exception {
- try (SpeechClient speechClient = SpeechClient.create()) {
-
- // Configure request to enable word level confidence
- RecognitionConfig config =
- RecognitionConfig.newBuilder()
- .setEncoding(AudioEncoding.FLAC)
- .setSampleRateHertz(16000)
- .setLanguageCode("en-US")
- .setEnableWordConfidence(true)
- .build();
-
- // Set the remote path for the audio file
- RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build();
-
- // Use non-blocking call for getting file transcription
- OperationFuture<LongRunningRecognizeResponse, LongRunningRecognizeMetadata> response =
- speechClient.longRunningRecognizeAsync(config, audio);
+ List<SpeechRecognitionResult> results = response.get().getResultsList();
- while (!response.isDone()) {
- System.out.println("Waiting for response...");
- Thread.sleep(10000);
- }
// Just print the first result here.
- SpeechRecognitionResult result = response.get().getResultsList().get(0);
-
+ SpeechRecognitionResult result = results.get(0);
// There can be several alternative transcripts for a given chunk of speech. Just use the
// first (most likely) one here.
SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
- // Print out the result
System.out.printf("Transcript : %s\n", alternative.getTranscript());
- System.out.format(
- "First Word and Confidence : %s %s \n",
- alternative.getWords(0).getWord(), alternative.getWords(0).getConfidence());
}
}
- // [END speech_transcribe_word_level_confidence_gcs_beta]
+ // [END speech_transcribe_model_selection_gcs]
}
diff --git a/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java b/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java
index 2a36ac3922a..3bc3d5f1611 100644
--- a/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java
+++ b/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java
@@ -37,17 +37,13 @@ public class RecognizeIT {
// The path to the audio file to transcribe
private String audioFileName = "./resources/audio.raw";
- private String multiChannelAudioFileName = "./resources/commercial_stereo.wav";
- private String gcsMultiChannelAudioPath = "gs://" + BUCKET + "/speech/commercial_stereo.wav";
private String gcsAudioPath = "gs://" + BUCKET + "/speech/brooklyn.flac";
- private String gcsDiarizationAudioPath = "gs://" + BUCKET + "/speech/commercial_mono.wav";
+ private String recognitionAudioFile = "./resources/commercial_mono.wav";
// The path to the video file to transcribe
private String videoFileName = "./resources/Google_Gnome.wav";
private String gcsVideoPath = "gs://" + BUCKET + "/speech/Google_Gnome.wav";
- private String recognitionAudioFile = "./resources/commercial_mono.wav";
-
@Before
public void setUp() {
bout = new ByteArrayOutputStream();
@@ -111,22 +107,6 @@ public void testStreamRecognize() throws Exception {
assertThat(got).contains("how old is the Brooklyn Bridge");
}
- @Test
- public void testModelSelection() throws Exception {
- Recognize.transcribeModelSelection(videoFileName);
- String got = bout.toString();
- assertThat(got).contains("OK Google");
- assertThat(got).contains("the weather outside is sunny");
- }
-
- @Test
- public void testGcsModelSelection() throws Exception {
- Recognize.transcribeModelSelectionGcs(gcsVideoPath);
- String got = bout.toString();
- assertThat(got).contains("OK Google");
- assertThat(got).contains("the weather outside is sunny");
- }
-
@Test
public void testAutoPunctuation() throws Exception {
Recognize.transcribeFileWithAutomaticPunctuation(audioFileName);
@@ -156,67 +136,18 @@ public void testEnhancedModel() throws Exception {
}
@Test
- public void testMetadata() throws Exception {
- Recognize.transcribeFileWithMetadata(recognitionAudioFile);
- String got = bout.toString();
- assertThat(got).contains("Chrome");
- }
-
- @Test
- public void testTranscribeDiarization() throws Exception {
- Recognize.transcribeDiarization(recognitionAudioFile);
- String got = bout.toString();
- assertThat(got).contains("Speaker Tag 2:");
- }
-
- @Test
- public void testTranscribeDiarizationGcs() throws Exception {
- Recognize.transcribeDiarizationGcs(gcsDiarizationAudioPath);
- String got = bout.toString();
- assertThat(got).contains("Speaker Tag 2:");
- }
-
- @Test
- public void testTranscribeMultiChannel() throws Exception {
- Recognize.transcribeMultiChannel(multiChannelAudioFileName);
- String got = bout.toString();
- assertThat(got).contains("Channel Tag : 1");
- }
-
- @Test
- public void testTranscribeMultiChannelGcs() throws Exception {
- Recognize.transcribeMultiChannelGcs(gcsMultiChannelAudioPath);
- String got = bout.toString();
- assertThat(got).contains("Channel Tag : 1");
- }
-
- @Test
- public void testTranscribeMultiLanguage() throws Exception {
- Recognize.transcribeMultiLanguage(videoFileName);
- String got = bout.toString();
- assertThat(got).contains("Transcript : OK Google");
- }
-
- @Test
- public void testTranscribeMultiLanguageGcs() throws Exception {
- Recognize.transcribeMultiLanguageGcs(gcsVideoPath);
- String got = bout.toString();
- assertThat(got).contains("Transcript : OK Google");
- }
-
- @Test
- public void testTranscribeWordLevelConfidence() throws Exception {
- Recognize.transcribeWordLevelConfidence(audioFileName);
+ public void testModelSelection() throws Exception {
+ Recognize.transcribeModelSelection(videoFileName);
String got = bout.toString();
- assertThat(got).contains("Transcript : how old is the Brooklyn Bridge");
- assertThat(got).contains("First Word and Confidence : how");
+ assertThat(got).contains("OK Google");
+ assertThat(got).contains("the weather outside is sunny");
}
@Test
- public void testTranscribeWordLevelConfidenceGcs() throws Exception {
- Recognize.transcribeWordLevelConfidenceGcs(gcsAudioPath);
+ public void testGcsModelSelection() throws Exception {
+ Recognize.transcribeModelSelectionGcs(gcsVideoPath);
String got = bout.toString();
- assertThat(got).contains("Transcript : how old is the Brooklyn Bridge");
- assertThat(got).contains("First Word and Confidence : how");
+ assertThat(got).contains("OK Google");
+ assertThat(got).contains("the weather outside is sunny");
}
}