Merge pull request #22 from sled-group/enable-stt
Add `--enable-stt`
yukw777 authored Dec 14, 2023
2 parents 8e84ba0 + 07fc648 commit 6a5af3e
Showing 4 changed files with 117 additions and 66 deletions.
12 changes: 8 additions & 4 deletions README.md
@@ -61,9 +61,9 @@ $ pip install -r samples/requirements.txt
$ python samples/simple_subscriber.py
```

### Text-to-speech
### Text-to-speech and Speech-to-text

You can enable text-to-speech by passing the `--enable-tts` option. `PercepSync` relies on [Microsoft Azure Speech Service](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-sdk) to generate speech, so make sure you also pass in your Azure credentials via a config file. In the local mode, the speech will be played via the speaker, while in the HoloLens mode, the speech will be played on the HoloLens.
You can enable text-to-speech and speech-to-text by passing the `--enable-tts` and `--enable-stt` options. `PercepSync` relies on the [Microsoft Azure Speech Service](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-sdk) to handle speech, so make sure you also pass in your Azure credentials via a config file. In local mode, synthesized speech is played through the local speaker; in HoloLens mode, it is played on the HoloLens.
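
For reference, here is a minimal sketch of what such a config file might look like. The section and key names are assumptions based on the `AzureSpeechConfig` properties referenced in `Config.cs` and `PercepSync.cs`; see the Configuration section below for the authoritative format.

```toml
# Hypothetical config.toml sketch -- key names mirror the AzureSpeechConfig
# properties, but the exact casing/section layout may differ.
[AzureSpeechConfig]
SubscriptionKey = "your-azure-speech-subscription-key"
Region = "eastus"
SpeechSynthesisVoiceName = "en-US-JennyNeural"
```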

**NOTE: The Microsoft Azure Speech Service SDK relies on OpenSSL 1.x, which is no longer shipped with Ubuntu 22.04. As a result, you need to build and install OpenSSL 1.x from source. Instructions can be found [here](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/quickstarts/setup-platform?tabs=linux%2Cubuntu%2Cdotnetcli%2Cdotnet%2Cjre%2Cmaven%2Cnodejs%2Cmac%2Cpypi&pivots=programming-language-csharp#platform-requirements). Please make sure you set the environment variable `SSL_CERT_DIR=/etc/ssl/certs`.**
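
Once OpenSSL 1.x is installed per the linked instructions, setting the environment variable is a one-liner; a rough sketch of the shell setup:

```bash
# After building and installing OpenSSL 1.x per the linked instructions,
# point the Speech SDK at the system certificate store:
export SSL_CERT_DIR=/etc/ssl/certs

# Optionally persist it for the user that runs PercepSync
echo 'export SSL_CERT_DIR=/etc/ssl/certs' >> ~/.bashrc
```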

@@ -80,15 +80,19 @@ $ ./PercepSync --config-file config.toml --enable-tts local
$ ./PercepSync --config-file config.toml --enable-tts hololens
```
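
Both options can be combined in a single invocation, for example (local mode shown as a sketch; the `hololens` subcommand works the same way):

```bash
$ ./PercepSync --config-file config.toml --enable-tts --enable-stt local
```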

Now in another terminal, run the sample script.
Now in another terminal, run the sample scripts.

```bash
# Install the required packages
$ pip install -r samples/requirements.txt

# Now, run it!
# TTS
$ python samples/simple_tts.py
TTS Text: Hello, world!

# STT
$ python samples/simple_subscriber.py
Transcribed Text: Hello, world!
```

## Configuration
2 changes: 2 additions & 0 deletions src/PercepSync/Config.cs
@@ -7,13 +7,15 @@ internal class Config
public static int DefaultRdzvServerPort = 13331;
public static bool DefaultEnableTts = false;
public static string DefaultTtsAddress = "tcp://*:12346";
public static bool DefaultEnableStt = false;
public static double DefaultFps = 5;

public string PercepStreamAddress { get; set; } = DefaultPercepStreamAddress;
public bool EnablePreview { get; set; } = DefaultEnablePreview;
public int RdzvServerPort { get; set; } = DefaultRdzvServerPort;
public bool EnableTts { get; set; } = DefaultEnableTts;
public string TtsAddress { get; set; } = DefaultTtsAddress;
public bool EnableStt { get; set; } = DefaultEnableStt;
public double Fps { get; set; } = DefaultFps;
public AzureSpeechConfig AzureSpeechConfig { get; set; } = new();
public LocalConfig? LocalConfig { get; set; } = null;
150 changes: 101 additions & 49 deletions src/PercepSync/PercepSync.cs
@@ -66,6 +66,12 @@ public static void Main(string[] args)
getDefaultValue: () => Config.DefaultTtsAddress
);
rootCommand.AddOption(ttsAddressOption);
var enableSttOption = new Option<bool>(
name: "--enable-stt",
description: "Whether to enable speech-to-text or not. Make sure to set Azure creds if enabled.",
getDefaultValue: () => Config.DefaultEnableStt
);
rootCommand.AddOption(enableSttOption);

var localCommand = new Command("local", description: "Use local devices");
var localCameraDeviceIDOption = new Option<string>(
@@ -94,7 +100,8 @@ Config CreateConfig(
bool enablePreview,
int rdzvServerPort,
bool enableTts,
string? ttsAddress
string? ttsAddress,
bool enableStt
)
{
Config config;
@@ -132,6 +139,10 @@ percepStreamAddress is not null
{
config.TtsAddress = ttsAddress;
}
if (enableStt != Config.DefaultEnableStt)
{
config.EnableStt = enableStt;
}
return config;
}

@@ -144,7 +155,8 @@ percepStreamAddress is not null
context.ParseResult.GetValueForOption(enablePreviewOption),
context.ParseResult.GetValueForOption(rdzvServerPortOption),
context.ParseResult.GetValueForOption(enableTtsOption),
context.ParseResult.GetValueForOption(ttsAddressOption)
context.ParseResult.GetValueForOption(ttsAddressOption),
context.ParseResult.GetValueForOption(enableSttOption)
);
var cameraDeviceID = context.ParseResult.GetValueForOption(
localCameraDeviceIDOption
@@ -204,7 +216,8 @@ percepStreamAddress is not null
enablePreview,
rdzvServerPort,
enableTts,
ttsAddress
ttsAddress,
enableStt
) =>
{
var config = CreateConfig(
@@ -213,7 +226,8 @@ percepStreamAddress is not null
enablePreview,
rdzvServerPort,
enableTts,
ttsAddress
ttsAddress,
enableStt
);
if (config.HoloLensConfig is null)
{
@@ -232,7 +246,8 @@ percepStreamAddress is not null
enablePreviewOption,
rdzvServerPortOption,
enableTtsOption,
ttsAddressOption
ttsAddressOption,
enableSttOption
);
rootCommand.Invoke(args);
}
@@ -300,13 +315,22 @@ private static void RunPercepSync(
percepSyncPipeline,
config.TtsAddress
);
speechSynthesizer = new AzureSpeechSynthesizer(
percepSyncPipeline,
config.AzureSpeechConfig.SubscriptionKey,
config.AzureSpeechConfig.Region,
config.AzureSpeechConfig.SpeechSynthesisVoiceName,
audioBufferFrameSizeInBytes
);
try
{
speechSynthesizer = new AzureSpeechSynthesizer(
percepSyncPipeline,
config.AzureSpeechConfig.SubscriptionKey,
config.AzureSpeechConfig.Region,
config.AzureSpeechConfig.SpeechSynthesisVoiceName,
audioBufferFrameSizeInBytes
);
}
catch (Exception ex)
{
throw new Exception(
$"Error while setting up Text-to-Speech:\n\n{ex}.\n\nPlease ensure that your Azure credentials are correct."
);
}
ttsReceiver.PipeTo(speechSynthesizer);
if (config.LocalConfig is not null)
{
@@ -334,46 +358,74 @@ private static void RunPercepSync(
var audioBufferStream = sensorStreams.AudioBufferStream.Reframe(
audioBufferFrameSizeInBytes
);
var speechRecognizer = new ContinuousAzureSpeechRecognizer(
percepSyncPipeline,
config.AzureSpeechConfig.SubscriptionKey,
config.AzureSpeechConfig.Region
);
audioBufferStream.PipeTo(speechRecognizer);
var percepStream = videoFrameStream
.Join(
audioBufferStream,
Reproducible.Nearest<AudioBuffer>(
TimeSpan.FromSeconds(percepDurationInSeconds)
)
)
.Join(
speechRecognizer,
Reproducible.Nearest<string>(
TimeSpan.FromSeconds(percepDurationInSeconds / 2)
)
var videoAudioStream = videoFrameStream.Join(
audioBufferStream,
Reproducible.Nearest<AudioBuffer>(
TimeSpan.FromSeconds(percepDurationInSeconds)
)
.Select(
(tuple) =>
{
(var frame, var audioBuffer, var transcription) = tuple;

var pixelData = new byte[frame.Resource.Size];
frame.Resource.CopyTo(pixelData);
var rawPixelFrame = new RawPixelImage(
pixelData,
frame.Resource.Width,
frame.Resource.Height,
frame.Resource.Stride
);
);
IProducer<Perception> percepStream;
Perception CreatePerception(
Shared<Image> frame,
AudioBuffer audioBuffer,
string transcription = ""
)
{
var pixelData = new byte[frame.Resource.Size];
frame.Resource.CopyTo(pixelData);
var rawPixelFrame = new RawPixelImage(
pixelData,
frame.Resource.Width,
frame.Resource.Height,
frame.Resource.Stride
);

return new Perception(
rawPixelFrame,
new Audio(audioBuffer.Data),
new TranscribedText(transcription)
);
}
return new Perception(
rawPixelFrame,
new Audio(audioBuffer.Data),
new TranscribedText(transcription)
);
}
if (config.EnableStt)
{
ContinuousAzureSpeechRecognizer speechRecognizer;
try
{
speechRecognizer = new ContinuousAzureSpeechRecognizer(
percepSyncPipeline,
config.AzureSpeechConfig.SubscriptionKey,
config.AzureSpeechConfig.Region
);
}
catch (Exception ex)
{
throw new Exception(
$"Error while setting up Speech-to-Text:\n\n{ex}.\n\nPlease ensure that your Azure credentials are correct."
);
}
audioBufferStream.PipeTo(speechRecognizer);
percepStream = videoAudioStream
.Join(
speechRecognizer,
Reproducible.Nearest<string>(
TimeSpan.FromSeconds(percepDurationInSeconds / 2)
)
)
.Select(
(tuple) =>
CreatePerception(
tuple.Item1,
tuple.Item2,
transcription: tuple.Item3
)
);
}
else
{
percepStream = videoAudioStream.Select(
(tuple) => CreatePerception(tuple.Item1, tuple.Item2)
);
}
var percepStreamMQWriter = new NetMQWriter<Perception>(
percepSyncPipeline,
PerceptionTopic,
19 changes: 6 additions & 13 deletions src/PercepSync/Tts.cs
@@ -94,19 +94,12 @@ int audioBufferFrameSizeInBytes
this.region = region;
this.voiceName = voiceName;

try
{
var speechConfig = SpeechConfig.FromSubscription(this.subscriptionKey, this.region);
speechConfig.SpeechSynthesisVoiceName = this.voiceName;
speechConfig.SetSpeechSynthesisOutputFormat(
SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm
);
speechSynthesizer = new SpeechSynthesizer(speechConfig, null);
}
catch (Exception e)
{
throw new Exception($"Error while initializing SpeechSynthesizer: {e.Message}");
}
var speechConfig = SpeechConfig.FromSubscription(this.subscriptionKey, this.region);
speechConfig.SpeechSynthesisVoiceName = this.voiceName;
speechConfig.SetSpeechSynthesisOutputFormat(
SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm
);
speechSynthesizer = new SpeechSynthesizer(speechConfig, null);
In = pipeline.CreateReceiver<TtsRequest>(this, Receive, nameof(In));
audioOut = pipeline.CreateEmitter<AudioBuffer>(this, nameof(audioOut));
reframer = new Reframe(pipeline, audioBufferFrameSizeInBytes);