diff --git a/speech/cloud-client/README.md b/speech/cloud-client/README.md
index 3f6cbc4a42e..4be5d92d080 100644
--- a/speech/cloud-client/README.md
+++ b/speech/cloud-client/README.md
@@ -91,3 +91,15 @@ Performing streaming speech transcription and punctuation on an audio file
```
mvn exec:java -DRecognize -Dexec.args="stream-punctuation ./resources/audio.raw"
```
+
+## Enhanced Model
+Transcribe an audio file using an enhanced model
+```
+mvn exec:java -DRecognize -Dexec.args="enhanced-model ./resources/commercial_mono.wav"
+```
+
+## Recognition Metadata
+Transcribe an audio file with recognition metadata
+```
+mvn exec:java -DRecognize -Dexec.args="metadata ./resources/commercial_mono.wav"
+```
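
For reference, the configuration at the heart of the new `enhanced-model` command, mirroring the sample added in `Recognize.java` below. This is a minimal sketch, not part of the diff; the class name is illustrative only:

```java
import com.google.cloud.speech.v1p1beta1.RecognitionConfig;
import com.google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding;

/** Minimal sketch: the enhanced-model configuration used by the new sample. */
class EnhancedModelConfigSketch {
  static RecognitionConfig build() {
    return RecognitionConfig.newBuilder()
        .setEncoding(AudioEncoding.LINEAR16)
        .setLanguageCode("en-US")
        .setSampleRateHertz(8000)
        // Enhanced models are only available to projects opted in to audio data collection.
        .setUseEnhanced(true)
        // An explicit model must be set when useEnhanced is true.
        .setModel("phone_call")
        .build();
  }
}
```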
diff --git a/speech/cloud-client/pom.xml b/speech/cloud-client/pom.xml
index ca80e6989bc..a761ad332b5 100644
--- a/speech/cloud-client/pom.xml
+++ b/speech/cloud-client/pom.xml
@@ -40,7 +40,7 @@
      <groupId>com.google.cloud</groupId>
      <artifactId>google-cloud-speech</artifactId>
-      <version>0.42.0-alpha</version>
+      <version>0.46.0-alpha</version>
diff --git a/speech/cloud-client/resources/commercial_mono.wav b/speech/cloud-client/resources/commercial_mono.wav
new file mode 100644
index 00000000000..e6b9ed434f9
Binary files /dev/null and b/speech/cloud-client/resources/commercial_mono.wav differ
diff --git a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java
index dab73a3aa29..396cc5110ea 100644
--- a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java
+++ b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java
@@ -24,6 +24,10 @@
import com.google.cloud.speech.v1p1beta1.RecognitionAudio;
import com.google.cloud.speech.v1p1beta1.RecognitionConfig;
import com.google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding;
+import com.google.cloud.speech.v1p1beta1.RecognitionMetadata;
+import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.InteractionType;
+import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.MicrophoneDistance;
+import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.RecordingDeviceType;
import com.google.cloud.speech.v1p1beta1.RecognizeResponse;
import com.google.cloud.speech.v1p1beta1.SpeechClient;
import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative;
@@ -53,7 +57,7 @@ public static void main(String... args) throws Exception {
"\tjava %s \"\" \"\"\n"
+ "Commands:\n"
+ "\tsyncrecognize | asyncrecognize | streamrecognize | wordoffsets | model-selection\n"
- + "\t| auto-punctuation | stream-punctuation\n"
+ + "\t| auto-punctuation | stream-punctuation | enhanced-model | metadata\n"
+ "Path:\n\tA file path (ex: ./resources/audio.raw) or a URI "
+ "for a Cloud Storage resource (gs://...)\n",
Recognize.class.getCanonicalName());
@@ -97,6 +101,10 @@ public static void main(String... args) throws Exception {
}
} else if (command.equals("stream-punctuation")) {
streamingTranscribeWithAutomaticPunctuation(path);
+    } else if (command.equals("enhanced-model")) {
+      transcribeFileWithEnhancedModel(path);
+    } else if (command.equals("metadata")) {
+      transcribeFileWithMetadata(path);
}
}
@@ -678,4 +686,97 @@ public SettableFuture<List<T>> future() {
}
}
// [END speech_stream_recognize_punctuation]
+
+  // [START speech_transcribe_file_with_enhanced_model]
+  /**
+   * Transcribe the given audio file using an enhanced model.
+   *
+   * @param fileName the path to an audio file.
+   */
+  public static void transcribeFileWithEnhancedModel(String fileName) throws Exception {
+    Path path = Paths.get(fileName);
+    byte[] content = Files.readAllBytes(path);
+
+    try (SpeechClient speechClient = SpeechClient.create()) {
+      // Get the contents of the local audio file
+      RecognitionAudio recognitionAudio = RecognitionAudio.newBuilder()
+          .setContent(ByteString.copyFrom(content))
+          .build();
+
+      // Configure the request to enable enhanced models
+      RecognitionConfig config = RecognitionConfig.newBuilder()
+          .setEncoding(AudioEncoding.LINEAR16)
+          .setLanguageCode("en-US")
+          .setSampleRateHertz(8000)
+          // Enhanced models are only available to projects that
+          // opt in for audio data collection.
+          .setUseEnhanced(true)
+          // A model must be specified to use an enhanced model.
+          .setModel("phone_call")
+          .build();
+
+      // Perform the transcription request
+      RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
+
+      // Print out the results
+      for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
+        // There can be several alternative transcripts for a given chunk of speech. Just use the
+        // first (most likely) one here.
+        SpeechRecognitionAlternative alternative = result.getAlternatives(0);
+        System.out.format("Transcript: %s\n\n", alternative.getTranscript());
+      }
+    }
+  }
+  // [END speech_transcribe_file_with_enhanced_model]
+
+  // [START speech_transcribe_file_with_metadata]
+  /**
+   * Transcribe the given audio file and include recognition metadata in the request.
+   *
+   * @param fileName the path to an audio file.
+   */
+  public static void transcribeFileWithMetadata(String fileName) throws Exception {
+    Path path = Paths.get(fileName);
+    byte[] content = Files.readAllBytes(path);
+
+    try (SpeechClient speechClient = SpeechClient.create()) {
+      // Get the contents of the local audio file
+      RecognitionAudio recognitionAudio = RecognitionAudio.newBuilder()
+          .setContent(ByteString.copyFrom(content))
+          .build();
+
+      // Construct a recognition metadata object.
+      // Most metadata fields are specified as enums nested in RecognitionMetadata
+      // (for example, RecognitionMetadata.InteractionType).
+      RecognitionMetadata metadata = RecognitionMetadata.newBuilder()
+          .setInteractionType(InteractionType.DISCUSSION)
+          .setMicrophoneDistance(MicrophoneDistance.NEARFIELD)
+          .setRecordingDeviceType(RecordingDeviceType.SMARTPHONE)
+          // Some metadata fields are free-form strings
+          .setRecordingDeviceName("Pixel 2 XL")
+          // And some are integers, for instance the 6-digit NAICS code
+          // https://www.naics.com/search/
+          .setIndustryNaicsCodeOfAudio(519190)
+          .build();
+
+      // Configure the request to include the recognition metadata
+      RecognitionConfig config = RecognitionConfig.newBuilder()
+          .setEncoding(AudioEncoding.LINEAR16)
+          .setLanguageCode("en-US")
+          .setSampleRateHertz(8000)
+          .setMetadata(metadata) // Add the metadata to the config
+          .build();
+
+      // Perform the transcription request
+      RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
+
+      // Print out the results
+      for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
+        // There can be several alternative transcripts for a given chunk of speech. Just use the
+        // first (most likely) one here.
+        SpeechRecognitionAlternative alternative = result.getAlternatives(0);
+        System.out.format("Transcript: %s\n\n", alternative.getTranscript());
+      }
+    }
+  }
+  // [END speech_transcribe_file_with_metadata]
}
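
Both new samples read a local file into `RecognitionAudio` via `setContent`. The usage text also accepts `gs://` paths, so for comparison, a minimal sketch of the Cloud Storage variant using the v1p1beta1 `setUri` builder method. This is not part of the diff, and the URI shown is hypothetical:

```java
import com.google.cloud.speech.v1p1beta1.RecognitionAudio;

/** Minimal sketch: referencing audio in Cloud Storage instead of local bytes. */
class GcsAudioSketch {
  static RecognitionAudio fromGcs(String gcsUri) {
    // Example (hypothetical): "gs://my-bucket/speech/commercial_mono.wav"
    return RecognitionAudio.newBuilder()
        .setUri(gcsUri)
        .build();
  }
}
```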
diff --git a/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java b/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java
index 022e24f7d1a..983d3a8724f 100644
--- a/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java
+++ b/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java
@@ -46,6 +46,8 @@ public class RecognizeIT {
private String videoFileName = "./resources/Google_Gnome.wav";
private String gcsVideoPath = "gs://" + BUCKET + "/speech/Google_Gnome.wav";
+  private String recognitionAudioFile = "./resources/commercial_mono.wav";
+
@Before
public void setUp() {
bout = new ByteArrayOutputStream();
@@ -145,4 +147,18 @@ public void testStreamAutoPunctuation() throws Exception {
String got = bout.toString();
assertThat(got).contains("How old is the Brooklyn Bridge?");
}
+
+  @Test
+  public void testEnhancedModel() throws Exception {
+    Recognize.transcribeFileWithEnhancedModel(recognitionAudioFile);
+    String got = bout.toString();
+    assertThat(got).contains("Chrome");
+  }
+
+  @Test
+  public void testMetadata() throws Exception {
+    Recognize.transcribeFileWithMetadata(recognitionAudioFile);
+    String got = bout.toString();
+    assertThat(got).contains("Chrome");
+  }
}
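
The new assertions read the transcript from `bout`, which the fixture's `setUp` points `System.out` at. A minimal sketch of that capture pattern, assuming the existing `RecognizeIT` also restores stdout in a `tearDown` not shown in this hunk:

```java
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import org.junit.After;
import org.junit.Before;

/** Minimal sketch of the stdout-capture pattern the integration tests rely on. */
public class StdoutCaptureSketch {
  private ByteArrayOutputStream bout;
  private PrintStream originalOut;

  @Before
  public void setUp() {
    originalOut = System.out;
    bout = new ByteArrayOutputStream();
    // Sample output now lands in bout, so tests can assert on the transcript.
    System.setOut(new PrintStream(bout));
  }

  @After
  public void tearDown() {
    System.setOut(originalOut);
  }
}
```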