Skip to content

Commit

Permalink
Updates to highlight word time offsets (#787)
Browse files Browse the repository at this point in the history
  • Loading branch information
gguuss authored and lesv committed Aug 3, 2017
1 parent 47dc3d2 commit 14f31f0
Show file tree
Hide file tree
Showing 4 changed files with 121 additions and 7 deletions.
12 changes: 12 additions & 0 deletions speech/cloud-client/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,15 @@ Build your project with:
java -cp target/speech-google-cloud-samples-1.0.0-jar-with-dependencies.jar \
com.example.speech.Recognize asyncrecognize gs://cloud-samples-tests/speech/vr.flac
```

### Synchronously transcribe an audio file and print word offsets
```
java -cp target/speech-google-cloud-samples-1.0.0-jar-with-dependencies.jar \
com.example.speech.Recognize wordoffsets ./resources/audio.raw
```

### Asynchronously transcribe a remote audio file and print word offsets
```
java -cp target/speech-google-cloud-samples-1.0.0-jar-with-dependencies.jar \
com.example.speech.Recognize wordoffsets gs://cloud-samples-tests/speech/vr.flac
```
7 changes: 4 additions & 3 deletions speech/cloud-client/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@

<!-- Parent defines config for testing & linting. -->
<parent>
<groupId>com.google.cloud.samples</groupId>
<artifactId>shared-configuration</artifactId>
<version>1.0.5</version>
<artifactId>doc-samples</artifactId>
<groupId>com.google.cloud</groupId>
<version>1.0.0</version>
<relativePath>../..</relativePath>
</parent>

<properties>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ public static void main(String... args) throws Exception {
System.out.printf(
"\tjava %s \"<command>\" \"<path-to-image>\"\n"
+ "Commands:\n"
+ "\tsyncrecognize | asyncrecognize | streamrecognize\n"
+ "\tsyncrecognize | asyncrecognize | streamrecognize | wordoffsets\n"
+ "Path:\n\tA file path (ex: ./resources/audio.raw) or a URI "
+ "for a Cloud Storage resource (gs://...)\n",
Recognize.class.getCanonicalName());
Expand All @@ -66,6 +66,12 @@ public static void main(String... args) throws Exception {
} else {
syncRecognizeFile(path);
}
} else if (command.equals("wordoffsets")) {
if (path.startsWith("gs://")) {
asyncRecognizeWords(path);
} else {
syncRecognizeWords(path);
}
} else if (command.equals("asyncrecognize")) {
if (path.startsWith("gs://")) {
asyncRecognizeGcs(path);
Expand Down Expand Up @@ -113,6 +119,51 @@ public static void syncRecognizeFile(String fileName) throws Exception, IOExcept
speech.close();
}

/**
 * Performs synchronous speech recognition on a local audio file and prints the
 * transcription followed by the start and end time offset of every recognized word.
 *
 * <p>Offsets are printed as whole seconds plus the first decimal digit of the
 * fractional part (the nanosecond field divided by 100,000,000).
 *
 * @param fileName the path to a local raw PCM audio file (sent as LINEAR16 at 16000 Hz)
 *     to transcribe and get word time offsets for.
 * @throws IOException if the audio file cannot be read.
 * @throws Exception if the client cannot be created or the recognition request fails.
 */
public static void syncRecognizeWords(String fileName) throws Exception, IOException {
  // try-with-resources closes the client even when reading the file or the
  // recognize call throws; the original only closed it on the success path.
  try (SpeechClient speech = SpeechClient.create()) {
    Path path = Paths.get(fileName);
    byte[] data = Files.readAllBytes(path);
    ByteString audioBytes = ByteString.copyFrom(data);

    // Configure request with local raw PCM audio and ask for per-word time offsets
    RecognitionConfig config = RecognitionConfig.newBuilder()
        .setEncoding(AudioEncoding.LINEAR16)
        .setLanguageCode("en-US")
        .setSampleRateHertz(16000)
        .setEnableWordTimeOffsets(true)
        .build();
    RecognitionAudio audio = RecognitionAudio.newBuilder()
        .setContent(audioBytes)
        .build();

    // Use blocking call to get audio transcript
    RecognizeResponse response = speech.recognize(config, audio);
    List<SpeechRecognitionResult> results = response.getResultsList();

    for (SpeechRecognitionResult result : results) {
      List<SpeechRecognitionAlternative> alternatives = result.getAlternativesList();
      for (SpeechRecognitionAlternative alternative : alternatives) {
        System.out.printf("Transcription: %s%n", alternative.getTranscript());
        for (WordInfo wordInfo : alternative.getWordsList()) {
          System.out.println(wordInfo.getWord());
          // nanos / 100000000 keeps only the tenths-of-a-second digit
          System.out.printf("\t%s.%s sec - %s.%s sec\n",
              wordInfo.getStartTime().getSeconds(),
              wordInfo.getStartTime().getNanos() / 100000000,
              wordInfo.getEndTime().getSeconds(),
              wordInfo.getEndTime().getNanos() / 100000000);
        }
      }
    }
  }
}


/**
* Performs speech recognition on remote FLAC file and prints the transcription.
*
Expand Down Expand Up @@ -193,11 +244,11 @@ public static void asyncRecognizeFile(String fileName) throws Exception, IOExcep

/**
* Performs non-blocking speech recognition on remote FLAC file and prints
* the transcription.
* the transcription as well as word time offsets.
*
* @param gcsUri the path to the remote FLAC audio file to transcribe.
*/
public static void asyncRecognizeGcs(String gcsUri) throws Exception, IOException {
public static void asyncRecognizeWords(String gcsUri) throws Exception, IOException {
// Instantiates a client with GOOGLE_APPLICATION_CREDENTIALS
SpeechClient speech = SpeechClient.create();

Expand Down Expand Up @@ -240,6 +291,47 @@ public static void asyncRecognizeGcs(String gcsUri) throws Exception, IOExceptio
speech.close();
}

/**
 * Performs non-blocking speech recognition on a remote FLAC file and prints
 * the transcription.
 *
 * <p>The long-running operation is polled every 10 seconds, printing a progress
 * message on each poll, until it reports completion.
 *
 * @param gcsUri the Cloud Storage URI (gs://...) of the remote FLAC audio file to transcribe.
 * @throws Exception if the client cannot be created, the request fails, or the wait
 *     for the operation is interrupted.
 */
public static void asyncRecognizeGcs(String gcsUri) throws Exception, IOException {
  // Instantiates a client with GOOGLE_APPLICATION_CREDENTIALS.
  // try-with-resources closes the client even if the request, the polling sleep,
  // or fetching the result throws; the original only closed it on success.
  try (SpeechClient speech = SpeechClient.create()) {
    // Configure remote file request for FLAC
    RecognitionConfig config = RecognitionConfig.newBuilder()
        .setEncoding(AudioEncoding.FLAC)
        .setLanguageCode("en-US")
        .setSampleRateHertz(16000)
        .build();
    RecognitionAudio audio = RecognitionAudio.newBuilder()
        .setUri(gcsUri)
        .build();

    // Use non-blocking call for getting file transcription
    OperationFuture<LongRunningRecognizeResponse, LongRunningRecognizeMetadata,
        Operation> response =
        speech.longRunningRecognizeAsync(config, audio);
    while (!response.isDone()) {
      System.out.println("Waiting for response...");
      Thread.sleep(10000);
    }

    List<SpeechRecognitionResult> results = response.get().getResultsList();

    for (SpeechRecognitionResult result : results) {
      List<SpeechRecognitionAlternative> alternatives = result.getAlternativesList();
      for (SpeechRecognitionAlternative alternative : alternatives) {
        System.out.printf("Transcription: %s\n", alternative.getTranscript());
      }
    }
  }
}


/**
* Performs streaming speech recognition on raw PCM audio data.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,14 @@ public void testRecognizeFile() throws Exception {
assertThat(got).contains("how old is the Brooklyn Bridge");
}

/**
 * Runs synchronous word-offset recognition on the local sample file and checks that the
 * captured output holds both the expected transcript and a word line starting at 0.0 sec.
 */
@Test
public void testRecognizeWordoffset() throws Exception {
  Recognize.syncRecognizeWords(fileName);

  // Everything the sample printed is captured in the redirected output buffer.
  final String printed = bout.toString();

  assertThat(printed).contains("how old is the Brooklyn Bridge");
  assertThat(printed).contains("\t0.0 sec -");
}

@Test
public void testRecognizeGcs() throws Exception {
Recognize.syncRecognizeGcs(gcsPath);
Expand All @@ -85,8 +93,9 @@ public void testAsyncRecognizeGcs() throws Exception {

@Test
public void testAsyncWordoffset() throws Exception {
Recognize.asyncRecognizeGcs(gcsPath);
Recognize.asyncRecognizeWords(gcsPath);
String got = bout.toString();
assertThat(got).contains("how old is the Brooklyn Bridge");
assertThat(got).contains("\t0.0 sec -");
}

Expand Down

0 comments on commit 14f31f0

Please sign in to comment.