Merge pull request #22 from sled-group/enable-stt
Add `--enable-stt`
yukw777 authored Dec 14, 2023
2 parents 8e84ba0 + 07fc648 commit 6a5af3e
Showing 4 changed files with 117 additions and 66 deletions.
12 changes: 8 additions & 4 deletions README.md
@@ -61,9 +61,9 @@ $ pip install -r samples/requirements.txt
$ python samples/simple_subscriber.py
```

### Text-to-speech
### Text-to-speech and Speech-to-text

You can enable text-to-speech by passing the `--enable-tts` option. `PercepSync` relies on [Microsoft Azure Speech Service](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-sdk) to generate speech, so make sure you also pass in your Azure credentials via a config file. In the local mode, the speech will be played via the speaker, while in the HoloLens mode, the speech will be played on the HoloLens.
You can enable text-to-speech and speech-to-text by passing the `--enable-tts` and `--enable-stt` options. `PercepSync` relies on the [Microsoft Azure Speech Service](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-sdk) to handle speech, so make sure you also pass in your Azure credentials via a config file. In local mode, synthesized speech is played through the local speaker; in HoloLens mode, it is played on the HoloLens.
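
For reference, here is a minimal sketch of what such a config file might look like. The section and key names are assumptions based on the `AzureSpeechConfig` properties referenced in `Config.cs` and `PercepSync.cs`; see the Configuration section below for the authoritative format.

```toml
# Hypothetical config.toml sketch -- key names mirror the AzureSpeechConfig
# properties, but the exact casing/section layout may differ.
[AzureSpeechConfig]
SubscriptionKey = "your-azure-speech-subscription-key"
Region = "eastus"
SpeechSynthesisVoiceName = "en-US-JennyNeural"
```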

**NOTE: The Microsoft Azure Speech Service SDK relies on OpenSSL 1.x, which is no longer shipped with Ubuntu 22.04. As a result, you need to build and install OpenSSL 1.x from source. Instructions can be found [here](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/quickstarts/setup-platform?tabs=linux%2Cubuntu%2Cdotnetcli%2Cdotnet%2Cjre%2Cmaven%2Cnodejs%2Cmac%2Cpypi&pivots=programming-language-csharp#platform-requirements). Please make sure you set the environment variable `SSL_CERT_DIR=/etc/ssl/certs`.**
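
Once OpenSSL 1.x is installed per the linked instructions, setting the environment variable is a one-liner; a rough sketch of the shell setup:

```bash
# After building and installing OpenSSL 1.x per the linked instructions,
# point the Speech SDK at the system certificate store:
export SSL_CERT_DIR=/etc/ssl/certs

# Optionally persist it for the user that runs PercepSync
echo 'export SSL_CERT_DIR=/etc/ssl/certs' >> ~/.bashrc
```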

@@ -80,15 +80,19 @@ $ ./PercepSync --config-file config.toml --enable-tts local
$ ./PercepSync --config-file config.toml --enable-tts hololens
```
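
Both options can be combined in a single invocation, for example (local mode shown as a sketch; the `hololens` subcommand works the same way):

```bash
$ ./PercepSync --config-file config.toml --enable-tts --enable-stt local
```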

Now in another terminal, run the sample script.
Now in another terminal, run the sample scripts.

```bash
# Install the required packages
$ pip install -r samples/requirements.txt

# Now, run it!
# TTS
$ python samples/simple_tts.py
TTS Text: Hello, world!

# STT
$ python samples/simple_subscriber.py
Transcribed Text: Hello, world!
```

## Configuration
2 changes: 2 additions & 0 deletions src/PercepSync/Config.cs
@@ -7,13 +7,15 @@ internal class Config
public static int DefaultRdzvServerPort = 13331;
public static bool DefaultEnableTts = false;
public static string DefaultTtsAddress = "tcp://*:12346";
public static bool DefaultEnableStt = false;
public static double DefaultFps = 5;

public string PercepStreamAddress { get; set; } = DefaultPercepStreamAddress;
public bool EnablePreview { get; set; } = DefaultEnablePreview;
public int RdzvServerPort { get; set; } = DefaultRdzvServerPort;
public bool EnableTts { get; set; } = DefaultEnableTts;
public string TtsAddress { get; set; } = DefaultTtsAddress;
public bool EnableStt { get; set; } = DefaultEnableStt;
public double Fps { get; set; } = DefaultFps;
public AzureSpeechConfig AzureSpeechConfig { get; set; } = new();
public LocalConfig? LocalConfig { get; set; } = null;
150 changes: 101 additions & 49 deletions src/PercepSync/PercepSync.cs
@@ -66,6 +66,12 @@ public static void Main(string[] args)
getDefaultValue: () => Config.DefaultTtsAddress
);
rootCommand.AddOption(ttsAddressOption);
var enableSttOption = new Option<bool>(
name: "--enable-stt",
description: "Whether to enable speech-to-text or not. Make sure to set Azure creds if enabled.",
getDefaultValue: () => Config.DefaultEnableStt
);
rootCommand.AddOption(enableSttOption);

var localCommand = new Command("local", description: "Use local devices");
var localCameraDeviceIDOption = new Option<string>(
@@ -94,7 +100,8 @@ Config CreateConfig(
bool enablePreview,
int rdzvServerPort,
bool enableTts,
string? ttsAddress
string? ttsAddress,
bool enableStt
)
{
Config config;
@@ -132,6 +139,10 @@ percepStreamAddress is not null
{
config.TtsAddress = ttsAddress;
}
if (enableStt != Config.DefaultEnableStt)
{
config.EnableStt = enableStt;
}
return config;
}

@@ -144,7 +155,8 @@ percepStreamAddress is not null
context.ParseResult.GetValueForOption(enablePreviewOption),
context.ParseResult.GetValueForOption(rdzvServerPortOption),
context.ParseResult.GetValueForOption(enableTtsOption),
context.ParseResult.GetValueForOption(ttsAddressOption)
context.ParseResult.GetValueForOption(ttsAddressOption),
context.ParseResult.GetValueForOption(enableSttOption)
);
var cameraDeviceID = context.ParseResult.GetValueForOption(
localCameraDeviceIDOption
@@ -204,7 +216,8 @@ percepStreamAddress is not null
enablePreview,
rdzvServerPort,
enableTts,
ttsAddress
ttsAddress,
enableStt
) =>
{
var config = CreateConfig(
@@ -213,7 +226,8 @@ percepStreamAddress is not null
enablePreview,
rdzvServerPort,
enableTts,
ttsAddress
ttsAddress,
enableStt
);
if (config.HoloLensConfig is null)
{
@@ -232,7 +246,8 @@ percepStreamAddress is not null
enablePreviewOption,
rdzvServerPortOption,
enableTtsOption,
ttsAddressOption
ttsAddressOption,
enableSttOption
);
rootCommand.Invoke(args);
}
@@ -300,13 +315,22 @@ private static void RunPercepSync(
percepSyncPipeline,
config.TtsAddress
);
speechSynthesizer = new AzureSpeechSynthesizer(
percepSyncPipeline,
config.AzureSpeechConfig.SubscriptionKey,
config.AzureSpeechConfig.Region,
config.AzureSpeechConfig.SpeechSynthesisVoiceName,
audioBufferFrameSizeInBytes
);
try
{
speechSynthesizer = new AzureSpeechSynthesizer(
percepSyncPipeline,
config.AzureSpeechConfig.SubscriptionKey,
config.AzureSpeechConfig.Region,
config.AzureSpeechConfig.SpeechSynthesisVoiceName,
audioBufferFrameSizeInBytes
);
}
catch (Exception ex)
{
throw new Exception(
$"Error while setting up Text-to-Speech:\n\n{ex}.\n\nPlease ensure that your Azure credentials are correct."
);
}
ttsReceiver.PipeTo(speechSynthesizer);
if (config.LocalConfig is not null)
{
@@ -334,46 +358,74 @@ private static void RunPercepSync(
var audioBufferStream = sensorStreams.AudioBufferStream.Reframe(
audioBufferFrameSizeInBytes
);
var speechRecognizer = new ContinuousAzureSpeechRecognizer(
percepSyncPipeline,
config.AzureSpeechConfig.SubscriptionKey,
config.AzureSpeechConfig.Region
);
audioBufferStream.PipeTo(speechRecognizer);
var percepStream = videoFrameStream
.Join(
audioBufferStream,
Reproducible.Nearest<AudioBuffer>(
TimeSpan.FromSeconds(percepDurationInSeconds)
)
)
.Join(
speechRecognizer,
Reproducible.Nearest<string>(
TimeSpan.FromSeconds(percepDurationInSeconds / 2)
)
var videoAudioStream = videoFrameStream.Join(
audioBufferStream,
Reproducible.Nearest<AudioBuffer>(
TimeSpan.FromSeconds(percepDurationInSeconds)
)
.Select(
(tuple) =>
{
(var frame, var audioBuffer, var transcription) = tuple;

var pixelData = new byte[frame.Resource.Size];
frame.Resource.CopyTo(pixelData);
var rawPixelFrame = new RawPixelImage(
pixelData,
frame.Resource.Width,
frame.Resource.Height,
frame.Resource.Stride
);
);
IProducer<Perception> percepStream;
Perception CreatePerception(
Shared<Image> frame,
AudioBuffer audioBuffer,
string transcription = ""
)
{
var pixelData = new byte[frame.Resource.Size];
frame.Resource.CopyTo(pixelData);
var rawPixelFrame = new RawPixelImage(
pixelData,
frame.Resource.Width,
frame.Resource.Height,
frame.Resource.Stride
);

return new Perception(
rawPixelFrame,
new Audio(audioBuffer.Data),
new TranscribedText(transcription)
);
}
return new Perception(
rawPixelFrame,
new Audio(audioBuffer.Data),
new TranscribedText(transcription)
);
}
if (config.EnableStt)
{
ContinuousAzureSpeechRecognizer speechRecognizer;
try
{
speechRecognizer = new ContinuousAzureSpeechRecognizer(
percepSyncPipeline,
config.AzureSpeechConfig.SubscriptionKey,
config.AzureSpeechConfig.Region
);
}
catch (Exception ex)
{
throw new Exception(
$"Error while setting up Speech-to-Text:\n\n{ex}.\n\nPlease ensure that your Azure credentials are correct."
);
}
audioBufferStream.PipeTo(speechRecognizer);
percepStream = videoAudioStream
.Join(
speechRecognizer,
Reproducible.Nearest<string>(
TimeSpan.FromSeconds(percepDurationInSeconds / 2)
)
)
.Select(
(tuple) =>
CreatePerception(
tuple.Item1,
tuple.Item2,
transcription: tuple.Item3
)
);
}
else
{
percepStream = videoAudioStream.Select(
(tuple) => CreatePerception(tuple.Item1, tuple.Item2)
);
}
var percepStreamMQWriter = new NetMQWriter<Perception>(
percepSyncPipeline,
PerceptionTopic,
19 changes: 6 additions & 13 deletions src/PercepSync/Tts.cs
@@ -94,19 +94,12 @@ int audioBufferFrameSizeInBytes
this.region = region;
this.voiceName = voiceName;

try
{
var speechConfig = SpeechConfig.FromSubscription(this.subscriptionKey, this.region);
speechConfig.SpeechSynthesisVoiceName = this.voiceName;
speechConfig.SetSpeechSynthesisOutputFormat(
SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm
);
speechSynthesizer = new SpeechSynthesizer(speechConfig, null);
}
catch (Exception e)
{
throw new Exception($"Error while initializing SpeechSynthesizer: {e.Message}");
}
var speechConfig = SpeechConfig.FromSubscription(this.subscriptionKey, this.region);
speechConfig.SpeechSynthesisVoiceName = this.voiceName;
speechConfig.SetSpeechSynthesisOutputFormat(
SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm
);
speechSynthesizer = new SpeechSynthesizer(speechConfig, null);
In = pipeline.CreateReceiver<TtsRequest>(this, Receive, nameof(In));
audioOut = pipeline.CreateEmitter<AudioBuffer>(this, nameof(audioOut));
reframer = new Reframe(pipeline, audioBufferFrameSizeInBytes);