Skip to content

Commit

Permalink
Add samples for enhanced models and metadata (#1093)
Browse files Browse the repository at this point in the history
  • Loading branch information
nnegrey authored and kurtisvg committed Apr 30, 2018
1 parent 310e0ab commit de00f10
Show file tree
Hide file tree
Showing 5 changed files with 131 additions and 2 deletions.
12 changes: 12 additions & 0 deletions speech/cloud-client/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,15 @@ Performing streaming speech transcription and punctuation on an audio file
```
mvn exec:java -DRecognize -Dexec.args="stream-punctuation ./resources/audio.raw"
```

## Enhanced Model
Transcribe an audio file using an enhanced model (enhanced models are only available to projects that opt in for audio data collection):
```
mvn exec:java -DRecognize -Dexec.args="enhanced-model ./resources/commercial_mono.wav"
```

## Recognition Metadata
Transcribe an audio file with recognition metadata
```
mvn exec:java -DRecognize -Dexec.args="metadata ./resources/commercial_mono.wav"
```
2 changes: 1 addition & 1 deletion speech/cloud-client/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
<dependency>
<groupId>com.google.cloud</groupId>
<artifactId>google-cloud-speech</artifactId>
<version>0.42.0-alpha</version>
<version>0.46.0-alpha</version>
</dependency>
<!-- [END dependencies] -->

Expand Down
Binary file not shown.
103 changes: 102 additions & 1 deletion speech/cloud-client/src/main/java/com/example/speech/Recognize.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@
import com.google.cloud.speech.v1p1beta1.RecognitionAudio;
import com.google.cloud.speech.v1p1beta1.RecognitionConfig;
import com.google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding;
import com.google.cloud.speech.v1p1beta1.RecognitionMetadata;
import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.InteractionType;
import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.MicrophoneDistance;
import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.RecordingDeviceType;
import com.google.cloud.speech.v1p1beta1.RecognizeResponse;
import com.google.cloud.speech.v1p1beta1.SpeechClient;
import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative;
Expand Down Expand Up @@ -53,7 +57,7 @@ public static void main(String... args) throws Exception {
"\tjava %s \"<command>\" \"<path-to-image>\"\n"
+ "Commands:\n"
+ "\tsyncrecognize | asyncrecognize | streamrecognize | wordoffsets | model-selection\n"
+ "\t| auto-punctuation | stream-punctuation\n"
+ "\t| auto-punctuation | stream-punctuation | enhanced-model | metadata\n"
+ "Path:\n\tA file path (ex: ./resources/audio.raw) or a URI "
+ "for a Cloud Storage resource (gs://...)\n",
Recognize.class.getCanonicalName());
Expand Down Expand Up @@ -97,6 +101,10 @@ public static void main(String... args) throws Exception {
}
} else if (command.equals("stream-punctuation")) {
streamingTranscribeWithAutomaticPunctuation(path);
} else if (command.equals("enhanced-model")) {
transcribeFileWithEnhancedModel(path);
} else if (command.equals("metadata")) {
transcribeFileWithMetadata(path);
}
}

Expand Down Expand Up @@ -678,4 +686,97 @@ public SettableFuture<List<T>> future() {
}
}
// [END speech_stream_recognize_punctuation]

// [START speech_transcribe_file_with_enhanced_model]
/**
 * Sends a local audio file to the Speech API using an enhanced recognition model and prints
 * the first transcript alternative of every returned result to standard output.
 *
 * <p>The request describes the audio as 8000 Hz LINEAR16 in US English and selects the
 * "phone_call" model with enhanced recognition switched on.
 *
 * @param fileName the path to an audio file.
 * @throws Exception if reading the file or calling the Speech API fails.
 */
public static void transcribeFileWithEnhancedModel(String fileName) throws Exception {
  // Load the complete audio file into memory before a client is opened.
  byte[] audioBytes = Files.readAllBytes(Paths.get(fileName));

  try (SpeechClient client = SpeechClient.create()) {
    // Describe the audio and ask for the enhanced variant of the chosen model.
    RecognitionConfig enhancedConfig = RecognitionConfig.newBuilder()
        .setLanguageCode("en-US")
        .setEncoding(AudioEncoding.LINEAR16)
        .setSampleRateHertz(8000)
        // Enhanced recognition requires an explicit model name.
        .setModel("phone_call")
        // Only projects that opted in to audio data collection may use
        // enhanced models.
        .setUseEnhanced(true)
        .build();

    // Wrap the raw bytes of the local file as the request's audio payload.
    RecognitionAudio audio = RecognitionAudio.newBuilder()
        .setContent(ByteString.copyFrom(audioBytes))
        .build();

    // Issue the synchronous recognition call.
    RecognizeResponse response = client.recognize(enhancedConfig, audio);

    // Each result may carry several alternative transcripts; report only the first
    // (most likely) one.
    for (SpeechRecognitionResult result : response.getResultsList()) {
      String transcript = result.getAlternatives(0).getTranscript();
      System.out.format("Transcript: %s\n\n", transcript);
    }
  }
}
// [END speech_transcribe_file_with_enhanced_model]

// [START speech_transcribe_file_with_metadata]
/**
 * Sends a local audio file to the Speech API together with recognition metadata and prints
 * the first transcript alternative of every returned result to standard output.
 *
 * <p>The request describes the audio as 8000 Hz LINEAR16 in US English. The attached
 * metadata records the interaction type, microphone distance, recording device type and
 * name, and a NAICS industry code.
 *
 * @param fileName the path to an audio file.
 * @throws Exception if reading the file or calling the Speech API fails.
 */
public static void transcribeFileWithMetadata(String fileName) throws Exception {
  // Load the complete audio file into memory before a client is opened.
  byte[] audioBytes = Files.readAllBytes(Paths.get(fileName));

  try (SpeechClient client = SpeechClient.create()) {
    // Describe how the recording was made. Most fields take enums nested in
    // RecognitionMetadata; a few are free-form strings or integers.
    RecognitionMetadata recordingInfo = RecognitionMetadata.newBuilder()
        .setRecordingDeviceType(RecordingDeviceType.SMARTPHONE)
        .setRecordingDeviceName("Pixel 2 XL") // free-form string field
        .setMicrophoneDistance(MicrophoneDistance.NEARFIELD)
        .setInteractionType(InteractionType.DISCUSSION)
        // Integer field: the 6 digit NAICS code, see
        // https://www.naics.com/search/
        .setIndustryNaicsCodeOfAudio(519190)
        .build();

    // Describe the audio and attach the metadata to the request configuration.
    RecognitionConfig metadataConfig = RecognitionConfig.newBuilder()
        .setLanguageCode("en-US")
        .setEncoding(AudioEncoding.LINEAR16)
        .setSampleRateHertz(8000)
        .setMetadata(recordingInfo)
        .build();

    // Wrap the raw bytes of the local file as the request's audio payload.
    RecognitionAudio audio = RecognitionAudio.newBuilder()
        .setContent(ByteString.copyFrom(audioBytes))
        .build();

    // Issue the synchronous recognition call.
    RecognizeResponse response = client.recognize(metadataConfig, audio);

    // Each result may carry several alternative transcripts; report only the first
    // (most likely) one.
    for (SpeechRecognitionResult result : response.getResultsList()) {
      String transcript = result.getAlternatives(0).getTranscript();
      System.out.format("Transcript: %s\n\n", transcript);
    }
  }
}
// [END speech_transcribe_file_with_metadata]
}
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ public class RecognizeIT {
private String videoFileName = "./resources/Google_Gnome.wav";
private String gcsVideoPath = "gs://" + BUCKET + "/speech/Google_Gnome.wav";

private String recognitionAudioFile = "./resources/commercial_mono.wav";

@Before
public void setUp() {
bout = new ByteArrayOutputStream();
Expand Down Expand Up @@ -145,4 +147,18 @@ public void testStreamAutoPunctuation() throws Exception {
String got = bout.toString();
assertThat(got).contains("How old is the Brooklyn Bridge?");
}

@Test
public void testEnhancedModel() throws Exception {
  // Run the enhanced-model sample on the recording referenced by recognitionAudioFile.
  Recognize.transcribeFileWithEnhancedModel(recognitionAudioFile);

  // The text collected in bout must mention "Chrome".
  assertThat(bout.toString()).contains("Chrome");
}

@Test
public void testMetadata() throws Exception {
  // Run the recognition-metadata sample on the recording referenced by recognitionAudioFile.
  Recognize.transcribeFileWithMetadata(recognitionAudioFile);

  // The text collected in bout must mention "Chrome".
  assertThat(bout.toString()).contains("Chrome");
}
}

0 comments on commit de00f10

Please sign in to comment.