Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speech api changes #264

Closed
wants to merge 14 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 21 additions & 6 deletions speech/grpc/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,20 +73,35 @@ note that the audio file must be in RAW format. You can use `sox`
(available, e.g. via [http://sox.sourceforge.net/](http://sox.sourceforge.net/)
or [homebrew](http://brew.sh/)) to convert audio files to raw format.

### Run the non-streaming client
### Run the sync client

You can run the batch client like this:
You can run the sync client like this:

```sh
$ bin/speech-sample-nonstreaming.sh --host=speech.googleapis.com --port=443 \
--file=<audio file path> --sampling=<sample rate>
$ bin/speech-sample-sync.sh --host=speech.googleapis.com --port=443 \
--uri=<audio file uri> --sampling=<sample rate>
```

Try a sampling rate of 16000 and the included sample audio file, as follows:

```sh
$ bin/speech-sample-nonstreaming.sh --host=speech.googleapis.com --port=443 \
--file=resources/audio.raw --sampling=16000
$ bin/speech-sample-sync.sh --host=speech.googleapis.com --port=443 \
--uri=resources/audio.raw --sampling=16000
```

### Run the async client

You can run the async client like this:

```sh
$ bin/speech-sample-async.sh --host=speech.googleapis.com --port=443 \
--uri=<audio file uri> --sampling=<sample rate>
```

Try a sampling rate of 16000 and the included sample audio file, as follows:
```sh
$ bin/speech-sample-async.sh --host=speech.googleapis.com --port=443 \
--uri=resources/audio.raw --sampling=16000
```

### Run the streaming client
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,4 @@

SRC_DIR=$(cd "$(dirname "$0")/.."; pwd)
java -cp ${SRC_DIR}/target/grpc-sample-1.0-jar-with-dependencies.jar \
com.google.cloud.speech.grpc.demos.NonStreamingRecognizeClient "$@"
com.google.cloud.speech.grpc.demos.AsyncRecognizeClient "$@"
2 changes: 1 addition & 1 deletion speech/grpc/bin/speech-sample-streaming.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,4 @@

SRC_DIR=$(cd "$(dirname "$0")/.."; pwd)
java -cp ${SRC_DIR}/target/grpc-sample-1.0-jar-with-dependencies.jar \
com.google.cloud.speech.grpc.demos.RecognizeClient "$@"
com.google.cloud.speech.grpc.demos.StreamingRecognizeClient "$@"
18 changes: 18 additions & 0 deletions speech/grpc/bin/speech-sample-sync.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Resolve the sample root (the parent of this script's bin/ directory). Using
# `&&` keeps `pwd` from reporting an unrelated directory when the `cd` fails.
SRC_DIR=$(cd "$(dirname "$0")/.." && pwd)

# Launch the sync client, forwarding all command-line arguments unchanged.
# The classpath is quoted so a checkout path containing spaces is not split
# into several words by the shell.
java -cp "${SRC_DIR}/target/grpc-sample-1.0-jar-with-dependencies.jar" \
    com.google.cloud.speech.grpc.demos.SyncRecognizeClient "$@"
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
/*
* Copyright 2016 Google Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// Client that sends audio to Speech.AsyncRecognize via gRPC and returns a long-running operation.
// The results are received via the google.longrunning.Operations interface.
//
// Uses a service account for OAuth2 authentication, which you may obtain at
// https://console.developers.google.com
// API Manager > Google Cloud Speech API > Enable
// API Manager > Credentials > Create credentials > Service account key > New service account.
//
// Then set environment variable GOOGLE_APPLICATION_CREDENTIALS to the full path of that file.

package com.google.cloud.speech.grpc.demos;

import com.google.auth.oauth2.GoogleCredentials;
import com.google.cloud.speech.v1beta1.AsyncRecognizeRequest;
import com.google.cloud.speech.v1beta1.AsyncRecognizeResponse;
import com.google.cloud.speech.v1beta1.RecognitionAudio;
import com.google.cloud.speech.v1beta1.RecognitionConfig;
import com.google.cloud.speech.v1beta1.RecognitionConfig.AudioEncoding;
import com.google.cloud.speech.v1beta1.SpeechGrpc;

import com.google.longrunning.GetOperationRequest;
import com.google.longrunning.Operation;
import com.google.longrunning.OperationsGrpc;

import io.grpc.ManagedChannel;
import io.grpc.StatusRuntimeException;
import io.grpc.auth.ClientAuthInterceptor;
import io.grpc.netty.NegotiationType;
import io.grpc.netty.NettyChannelBuilder;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

import java.io.IOException;
import java.net.URI;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
* Client that sends audio to Speech.AsyncRecognize and returns transcript.
*/
public class AsyncRecognizeClient {

  private static final Logger logger =
      Logger.getLogger(AsyncRecognizeClient.class.getName());

  private static final List<String> OAUTH2_SCOPES =
      Arrays.asList("https://www.googleapis.com/auth/cloud-platform");

  /** Delay between two polls of the long-running operation, in milliseconds. */
  private static final long POLL_INTERVAL_MILLIS = 2000;

  /** Number of consecutive failed polls tolerated before giving up on the operation. */
  private static final int MAX_CONSECUTIVE_POLL_FAILURES = 5;

  private final String host;
  private final int port;
  private final URI input;
  private final int samplingRate;

  // Kept in a field so that shutdown() can release it together with the channel.
  private final ExecutorService authExecutor;
  private final ManagedChannel channel;
  private final SpeechGrpc.SpeechBlockingStub stub;
  private final OperationsGrpc.OperationsBlockingStub statusStub;

  /**
   * Construct client connecting to Cloud Speech server at {@code host:port}.
   *
   * @param host endpoint of the speech API
   * @param port TLS port of the endpoint
   * @param input URI of the audio to recognize
   * @param samplingRate sample rate of the audio, passed to the recognition config
   * @throws IOException if the application default credentials cannot be loaded
   */
  public AsyncRecognizeClient(String host, int port, URI input, int samplingRate)
      throws IOException {
    this.host = host;
    this.port = port;
    this.input = input;
    this.samplingRate = samplingRate;

    // Load credentials first: if this throws, no executor or channel has been created yet.
    GoogleCredentials creds =
        GoogleCredentials.getApplicationDefault().createScoped(OAUTH2_SCOPES);
    authExecutor = Executors.newSingleThreadExecutor();
    channel = NettyChannelBuilder.forAddress(host, port)
        .negotiationType(NegotiationType.TLS)
        .intercept(new ClientAuthInterceptor(creds, authExecutor))
        .build();
    stub = SpeechGrpc.newBlockingStub(channel);
    statusStub = OperationsGrpc.newBlockingStub(channel);

    logger.info("Created stub for " + host + ":" + port);
  }

  /** Builds the audio payload for the configured input URI. */
  private RecognitionAudio createRecognitionAudio() throws IOException {
    return RecognitionAudioFactory.createRecognitionAudio(this.input);
  }

  /**
   * Shuts down the channel, waiting up to five seconds for it to terminate, and releases the
   * executor that was handed to the auth interceptor.
   *
   * @throws InterruptedException if interrupted while waiting for the channel to terminate
   */
  public void shutdown() throws InterruptedException {
    try {
      channel.shutdown().awaitTermination(5, TimeUnit.SECONDS);
    } finally {
      // Executors.newSingleThreadExecutor() uses non-daemon threads; without this call a started
      // worker thread would keep the JVM alive after main() returns.
      authExecutor.shutdown();
    }
  }

  /**
   * Send an async-recognize request to server, poll the resulting long-running operation until
   * it is done and log the unpacked response. Failures are logged rather than thrown.
   */
  public void recognize() {
    RecognitionAudio audio;
    try {
      audio = createRecognitionAudio();
    } catch (IOException e) {
      // Pass the exception along so the underlying cause shows up in the log.
      logger.log(Level.WARNING, "Failed to read audio uri input: " + input, e);
      return;
    }
    logger.info("Sending " + audio.getContent().size() + " bytes from audio uri input: " + input);

    RecognitionConfig config = RecognitionConfig.newBuilder()
        .setEncoding(AudioEncoding.LINEAR16)
        .setSampleRate(samplingRate)
        .build();
    AsyncRecognizeRequest request = AsyncRecognizeRequest.newBuilder()
        .setConfig(config)
        .setAudio(audio)
        .build();

    Operation operation;
    try {
      operation = stub.asyncRecognize(request);

      // Print the long running operation handle
      logger.log(Level.INFO, String.format("Operation handle: %s, URI: %s", operation.getName(),
          input.toString()));
    } catch (StatusRuntimeException e) {
      logger.log(Level.WARNING, "RPC failed: {0}", e.getStatus());
      return;
    }

    Operation finished = waitForCompletion(operation.getName());
    if (finished == null) {
      // Interrupted or gave up polling; the reason has already been logged.
      return;
    }

    try {
      AsyncRecognizeResponse asyncRes =
          finished.getResponse().unpack(AsyncRecognizeResponse.class);

      logger.info("Received response: " + asyncRes);
    } catch (com.google.protobuf.InvalidProtocolBufferException ex) {
      logger.log(Level.WARNING, "Unpack error, {0}", ex.getMessage());
    }
  }

  /**
   * Polls the operation every {@link #POLL_INTERVAL_MILLIS} milliseconds until it reports done.
   *
   * @param operationName name of the long-running operation to poll
   * @return the finished operation, or {@code null} if the thread was interrupted or
   *     {@link #MAX_CONSECUTIVE_POLL_FAILURES} polls failed in a row
   */
  private Operation waitForCompletion(String operationName) {
    // The request never changes between polls, so build it once.
    GetOperationRequest pollRequest = GetOperationRequest.newBuilder()
        .setName(operationName)
        .build();

    int consecutiveFailures = 0;
    while (true) {
      logger.log(Level.INFO, "Waiting 2s for operation, {0} processing...", operationName);
      try {
        Thread.sleep(POLL_INTERVAL_MILLIS);
      } catch (InterruptedException e) {
        // Restore the flag so callers can observe the interruption, and stop waiting.
        Thread.currentThread().interrupt();
        logger.log(Level.WARNING, "Interrupted while waiting for operation {0}", operationName);
        return null;
      }

      try {
        Operation status = statusStub.getOperation(pollRequest);
        consecutiveFailures = 0;
        if (status.getDone()) {
          return status;
        }
      } catch (StatusRuntimeException e) {
        consecutiveFailures++;
        logger.log(Level.WARNING,
            "Polling operation " + operationName + " failed: " + e.getStatus());
        // A failure that keeps repeating is unlikely to clear up; do not spin forever.
        if (consecutiveFailures >= MAX_CONSECUTIVE_POLL_FAILURES) {
          logger.log(Level.WARNING, "Giving up on operation {0} after repeated poll failures",
              operationName);
          return null;
        }
      }
    }
  }

  /** Registers a long option {@code --name} that takes one argument. */
  private static void addLongOption(
      Options options, String name, String description, String argName) {
    options.addOption(OptionBuilder.withLongOpt(name)
        .withDescription(description)
        .hasArg()
        .withArgName(argName)
        .create());
  }

  /**
   * Returns the value of a mandatory option; prints {@code missingMessage} to stderr and exits
   * with status 1 when the option is absent.
   */
  private static String requireOption(CommandLine line, String name, String missingMessage) {
    if (!line.hasOption(name)) {
      System.err.println(missingMessage);
      System.exit(1);
    }
    return line.getOptionValue(name);
  }

  /**
   * Parses an integer option value; prints a message to stderr and exits with status 1 when the
   * value is not a valid integer.
   */
  private static int parseIntOption(String name, String value) {
    try {
      return Integer.parseInt(value);
    } catch (NumberFormatException e) {
      System.err.println("Option --" + name + " must be an integer, got: " + value);
      System.exit(1);
      // Not reached; System.exit does not return. Needed to satisfy the compiler.
      throw new IllegalStateException("unreachable", e);
    }
  }

  /**
   * Command-line entry point. Requires {@code --uri}, {@code --host}, {@code --port} and
   * {@code --sampling}; exits with status 1 when one is missing or malformed.
   */
  public static void main(String[] args) throws Exception {
    Options options = new Options();
    addLongOption(options, "uri", "path to audio uri", "FILE_PATH");
    addLongOption(options, "host", "endpoint for api, e.g. speech.googleapis.com", "ENDPOINT");
    addLongOption(options, "port", "SSL port, usually 443", "PORT");
    addLongOption(options, "sampling", "Sampling Rate, i.e. 16000", "RATE");

    CommandLineParser parser = new DefaultParser();
    CommandLine line;
    try {
      line = parser.parse(options, args);
    } catch (ParseException exp) {
      System.err.println("Unexpected exception:" + exp.getMessage());
      System.exit(1);
      return;
    }

    String audioFile = requireOption(line, "uri",
        "An Audio uri must be specified (e.g. file:///foo/baz.raw).");
    String host = requireOption(line, "host",
        "An API endpoint must be specified (typically speech.googleapis.com).");
    int port = parseIntOption("port", requireOption(line, "port",
        "An SSL port must be specified (typically 443)."));
    int sampling = parseIntOption("sampling", requireOption(line, "sampling",
        "An Audio sampling rate must be specified."));

    AsyncRecognizeClient client =
        new AsyncRecognizeClient(host, port, URI.create(audioFile), sampling);
    try {
      client.recognize();
    } finally {
      client.shutdown();
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,21 @@

package com.google.cloud.speech.grpc.demos;

import com.google.cloud.speech.v1.AudioRequest;
import com.google.cloud.speech.v1beta1.RecognitionAudio;
import com.google.protobuf.ByteString;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

/*
* AudioRequestFactory takes a URI as an input and creates an AudioRequest. The URI can point to a
* local file or a file on Google Cloud Storage.
* RecognitionAudioFactory takes a URI as an input and creates a RecognitionAudio.
* The URI can point to a local file or a file on Google Cloud Storage.
*/
public class AudioRequestFactory {
public class RecognitionAudioFactory {

private static final String FILE_SCHEME = "file";
private static final String GS_SCHEME = "gs";
Expand All @@ -39,27 +40,31 @@ public class AudioRequestFactory {
* Takes an input URI of form $scheme:// and converts to audio request.
*
* @param uri input uri
* @return AudioRequest audio request
* @return RecognitionAudio recognition audio
*/
public static AudioRequest createRequest(URI uri)
public static RecognitionAudio createRecognitionAudio(URI uri)
throws IOException {
if (uri.getScheme() == null || uri.getScheme().equals(FILE_SCHEME)) {
if (uri.getScheme() == null) {
uri = new File(uri.toString()).toURI();
Path path = Paths.get(uri);
return audioFromBytes(Files.readAllBytes(path));
} else if (uri.getScheme().equals(FILE_SCHEME)) {
Path path = Paths.get(uri);
return audioFromBytes(Files.readAllBytes(path));
} else if (uri.getScheme().equals(GS_SCHEME)) {
return AudioRequest.newBuilder().setUri(uri.toString()).build();
return RecognitionAudio.newBuilder().setUri(uri.toString()).build();
}
throw new RuntimeException("scheme not supported " + uri.getScheme());
}

/**
* Convert bytes to AudioRequest.
* Convert bytes to RecognitionAudio.
*
* @param bytes input bytes
* @return AudioRequest audio request
* @return RecognitionAudio recognition audio
*/
private static AudioRequest audioFromBytes(byte[] bytes) {
return AudioRequest.newBuilder()
private static RecognitionAudio audioFromBytes(byte[] bytes) {
return RecognitionAudio.newBuilder()
.setContent(ByteString.copyFrom(bytes))
.build();
}
Expand Down
Loading