-
-
Notifications
You must be signed in to change notification settings - Fork 2
/
speechToSpeech.py
155 lines (114 loc) · 6.71 KB
/
speechToSpeech.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
from __future__ import annotations
import datetime
import sys
import pyaudio
from misc.obsSubtitles import *
from misc.translation import *
import helper
import os
from ttsProviders.ElevenlabsProvider import ElevenlabsProvider
from ttsProviders.PyttsxProvider import PyttsxProvider
from ttsProviders.__TTSProviderAbstract import TTSProvider
from speechRecognition.__SpeechRecProviderAbstract import SpeechRecProvider
from speechRecognition.WhisperProvider import WhisperProvider
from speechRecognition.VoskProvider import VoskProvider
from speechRecognition.AzureProvider import AzureProvider
#The whisper API does not return language information, so we use googletrans to detect the language from the text.
#Won't be super accurate, but it should be pretty fast.
import googletrans
#THESE IMPORTS ARE NOT UNUSED. recasepunc does some weird reflection stuff.
from misc.recasepunc import CasePuncPredictor, WordpieceTokenizer
# These two annotations exist only to "use" the reflected recasepunc classes so the
# import above is not flagged/stripped as unused (see the comment on the import).
dummyVar:CasePuncPredictor
dummyVar2:WordpieceTokenizer
# Module-level speech-recognition provider; declared here, assigned in setup().
srProvider:SpeechRecProvider
def main():
    """Wait for the user to confirm, then start the blocking recognition loop."""
    helper.show_text("Voice recognition will start once you confirm...")
    print("Recognition started...")
    # Blocks for the lifetime of the program; the provider presumably feeds
    # recognized utterances back into process_text — confirm in the provider impls.
    srProvider.recognize_loop()
# Language detection fallback: googletrans is used when the recognizer does not report a language.
def process_text(recognizedText:str, language:str, startTime:datetime.datetime, recognizedTime:datetime.datetime):
    """Handle one recognized utterance: translate it to english if needed and speak it back.

    This is the hook point — if you want to do anything else with the text (for
    example send it to chatGPT and play back the response), do it here.
    """
    cleanedText = recognizedText.strip()
    if cleanedText:  # Skip empty recognition results entirely.
        # englishText holds the (possibly translated) english text.
        englishText = translate_if_needed(cleanedText, language)
        print("\nRecognized and (possibly) translated text: " + englishText)
        helper.ttsProvider.synthesizeAndPlayAudio(englishText, helper.chosenOutput, startTime, recognizedTime)
        # Subtitle updating was moved into the TTS providers so subtitles cannot
        # update before the audio actually plays.
    print("\nListening for voice input...\n")
def setup():
    """Ask the user for the general settings and instantiate the chosen providers.

    Prompts for output device, speech-recognition provider and TTS provider
    (persisting answers via helper's config handling), then assigns the
    module-level srProvider and helper.ttsProvider/helper.chosenOutput.
    """
    helper.setup_config()
    configQuestions = dict()

    # Resolve the system default output device so it can be pre-selected.
    pa = pyaudio.PyAudio()
    try:
        defaultDeviceInfo = pa.get_default_output_device_info()
    finally:
        # PyAudio holds native PortAudio resources; release them explicitly
        # (the original leaked this throwaway instance).
        pa.terminate()
    defaultDevice = f"{defaultDeviceInfo['name']} - {defaultDeviceInfo['index']}"

    outputDeviceQuestion = {
        "widget_type": "list",
        "options": helper.get_list_of_portaudio_devices("output"),
        "label": "Output Device",
        "default_value": defaultDevice
    }
    srProviderQuestion = {
        "widget_type": "list",
        "options": ["Vosk", "Whisper", "Azure"],
        "descriptions": [
            "Good accuracy, local and fast."
            "\nDoes not include punctuation by default but includes the option to use recasepunc to add it, which does make it heavier to run."
            "\nI find it to be a very good balance of quality+speed, at least when it comes to english."
            "\nIt's worth noting that it can only support 1 language at a time (so you will be unable to speak using multiple languages)."
            "\nIn addition, its language support is limited by what models are available, especially for recasepunc."
            "\nIf you want to speak english and stick with something local that's still pretty fast, use this.\n",
            # Fixed duplicated words ("Can either can either") in the original description.
            "Can either be used locally (via faster-whisper) or online through their API (at 0.006$/minute of speech)."
            "\nThe local version REQUIRES AN NVIDIA GPU. It offers a few different model sizes, whereas the API always uses the largest."
            "\nAttempting to run the largest model size locally, assuming you even have an NVIDIA GPU with enough VRAM for it, will be pretty slow."
            "\nI recommend sticking to the medium one at most, but you should try it and see how well it works on your machine."
            "\nSupports a variety of languages, more languages are supported by the local version than the API however."
            "\nIt does not require you to specify which language you will be speaking.\n",
            "Great accuracy, online, supports a bunch of languages."
            "\nIncludes 5 hours free per month, following speech is billed at 1$/hour."
            "\nThat's 0.016$/minute of speech, which is roughly 2.5x as much as Whisper."
            "\nIt supports speaking using multiple languages, but you will have to narrow it down to 10 maximum."
            "\nThis is the best option if you need something online but don't want to pay, once you go over the 5 hours/month you should switch to Whisper for a cheaper price.\n"],
        "label":"Speech Recognition Provider"
    }
    ttsProviderQuestion = {
        "widget_type": "list",
        "options": ["ElevenLabs - High quality, online, paid", "pyttsx3 - Low quality, local, free"],
        "label": "Text To Speech Provider"
    }

    miscConfig = helper.get_misc_config()
    configQuestions["output_device"] = outputDeviceQuestion
    configQuestions["speech_provider"] = srProviderQuestion
    configQuestions["tts_provider"] = ttsProviderQuestion
    userInput = helper.ask_fetch_from_and_update_config(configQuestions, miscConfig, "General settings")

    # Map the chosen device name back to its PortAudio index.
    chosenOutputInfo = helper.get_portaudio_device_info_from_name(userInput["output_device"])
    helper.chosenOutput = chosenOutputInfo["index"]

    subtitle_setup()
    translation_setup()

    global srProvider
    # These lists hold provider CLASSES (instantiated below), hence type[...].
    # List order must match the "options" order above — indexes are shared.
    availableSRProviders: list[type[SpeechRecProvider]] = [VoskProvider, WhisperProvider, AzureProvider]
    chosenSRProviderClass: type[SpeechRecProvider] = availableSRProviders[srProviderQuestion["options"].index(userInput["speech_provider"])]
    srProvider = chosenSRProviderClass()

    # Same pattern for TTS: pick the class matching the chosen option, then instantiate.
    availableTTSProviders: list[type[TTSProvider]] = [ElevenlabsProvider, PyttsxProvider]
    chosenTTSProviderClass: type[TTSProvider] = availableTTSProviders[ttsProviderQuestion["options"].index(userInput["tts_provider"])]
    helper.ttsProvider = chosenTTSProviderClass()
if __name__ == '__main__':
    # "--cli" disables the GUI (which is on by default).
    if len(sys.argv) > 1 and sys.argv[1] == "--cli":
        helper.useGUI = False
    # Make sure the default model folders exist. makedirs with exist_ok=True is
    # race-free (no isdir/mkdir TOCTOU) and creates the parent "models" dir too.
    modelDir = os.path.join(os.getcwd(), "models")
    os.makedirs(os.path.join(modelDir, "vosk"), exist_ok=True)
    os.makedirs(os.path.join(modelDir, "recasepunc"), exist_ok=True)
    setup()
    main()