From d7d3f128ccc5e19198e15cdba2cb922d98c88072 Mon Sep 17 00:00:00 2001 From: mytja <52399966+mytja@users.noreply.github.com> Date: Fri, 18 Sep 2020 16:15:11 +0200 Subject: [PATCH 01/13] Added vosk --- speech_recognition/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index 8eaabf94..a015e7d1 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1390,7 +1390,8 @@ def recognize_tensorflow(self, audio_data, tensor_graph='tensorflow-data/conv_ac for node_id in top_k: human_string = self.tflabels[node_id] return human_string - + def recognize_vosk(self, audio_data, language='en'): + from vosk import Model, KaldiRecognizer def get_flac_converter(): """Returns the absolute path of a FLAC converter executable, or raises an OSError if none can be found.""" From f0eb1ba3573b35b24c113951c654a6fc7d75ef38 Mon Sep 17 00:00:00 2001 From: mytja <52399966+mytja@users.noreply.github.com> Date: Fri, 18 Sep 2020 16:16:54 +0200 Subject: [PATCH 02/13] Added 2 more recognition services --- README.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.rst b/README.rst index 0d0322b9..4434dc21 100644 --- a/README.rst +++ b/README.rst @@ -34,6 +34,8 @@ Speech recognition engine/API support: * `Houndify API `__ * `IBM Speech to Text `__ * `Snowboy Hotword Detection `__ (works offline) +* `Tensorflow `__ +* `Vosk API `__ (works offline) **Quickstart:** ``pip install SpeechRecognition``. See the "Installing" section for more details. 
From af9373883cf7a02f11129e458d8a86dc8364f6b4 Mon Sep 17 00:00:00 2001 From: mytja <52399966+mytja@users.noreply.github.com> Date: Fri, 18 Sep 2020 18:45:27 +0200 Subject: [PATCH 03/13] Update of outdated README --- README.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/README.rst b/README.rst index 4434dc21..b9ceb8c7 100644 --- a/README.rst +++ b/README.rst @@ -88,6 +88,7 @@ To use all of the functionality of the library, you should have: * **PocketSphinx** (required only if you need to use the Sphinx recognizer, ``recognizer_instance.recognize_sphinx``) * **Google API Client Library for Python** (required only if you need to use the Google Cloud Speech API, ``recognizer_instance.recognize_google_cloud``) * **FLAC encoder** (required only if the system is not x86-based Windows/Linux/OS X) +* **Vosk** (required only if you need to use Vosk API speech recognition ``recognizer_instance.recognize_vosk``) The following requirements are optional, but can improve or extend functionality in some situations: From f726da68c5015efe3548b59743306590167e47c3 Mon Sep 17 00:00:00 2001 From: mytja <52399966+mytja@users.noreply.github.com> Date: Fri, 18 Sep 2020 19:36:34 +0200 Subject: [PATCH 04/13] Added Vosk API You can now simply recognize with: recognize_vosk() --- speech_recognition/__init__.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index a015e7d1..f3664054 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1390,9 +1390,39 @@ def recognize_tensorflow(self, audio_data, tensor_graph='tensorflow-data/conv_ac for node_id in top_k: human_string = self.tflabels[node_id] return human_string + def recognize_vosk(self, audio_data, language='en'): from vosk import Model, KaldiRecognizer + if not os.path.exists("model"): + return "Please download the model from https://github.com/alphacep/vosk-api/blob/master/doc/models.md and unpack as 
'model' in the current folder." + exit (1) + + import pyaudio + + model = Model("model") + rec = KaldiRecognizer(model, 16000) + + p = pyaudio.PyAudio() + stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000) + stream.start_stream() + + while True: + data = stream.read(4000) + if len(data) == 0: + break + if rec.AcceptWaveform(data): + #bottom lines are for debugging + #print(rec.Result()) + break + else: + #bottom lines are for debugging + #print(rec.PartialResult()) + break + + finalRecognition = rec.FinalResult() + return finalRecognition + def get_flac_converter(): """Returns the absolute path of a FLAC converter executable, or raises an OSError if none can be found.""" flac_converter = shutil_which("flac") # check for installed version first From 44d17b1886333f7d033c190dcd0eb8b7e492d898 Mon Sep 17 00:00:00 2001 From: mytja <52399966+mytja@users.noreply.github.com> Date: Fri, 18 Sep 2020 19:52:35 +0200 Subject: [PATCH 05/13] Update of 1st review --- speech_recognition/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index f3664054..55a8ee68 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -25,6 +25,9 @@ __version__ = "3.8.1" __license__ = "BSD" +# model for Vosk +modelVosk = Model("model") + try: # attempt to use the Python 2 modules from urllib import urlencode from urllib2 import Request, urlopen, URLError, HTTPError @@ -1400,8 +1403,7 @@ def recognize_vosk(self, audio_data, language='en'): import pyaudio - model = Model("model") - rec = KaldiRecognizer(model, 16000) + rec = KaldiRecognizer(modelVosk, 16000) p = pyaudio.PyAudio() stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000) From 555cbaf854e47b6d9455a359754512047fe2d2fa Mon Sep 17 00:00:00 2001 From: mytja <52399966+mytja@users.noreply.github.com> Date: Fri, 18 Sep 2020 20:24:03 
+0200 Subject: [PATCH 06/13] Update of vosk with help of @nshmyrev Part 1 --- speech_recognition/__init__.py | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index 55a8ee68..ce2fc0da 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1400,29 +1400,13 @@ def recognize_vosk(self, audio_data, language='en'): if not os.path.exists("model"): return "Please download the model from https://github.com/alphacep/vosk-api/blob/master/doc/models.md and unpack as 'model' in the current folder." exit (1) - + + assert isinstance(audio_data, AudioData), "Data must be audio data" import pyaudio - - rec = KaldiRecognizer(modelVosk, 16000) - - p = pyaudio.PyAudio() - stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000) - stream.start_stream() - - while True: - data = stream.read(4000) - if len(data) == 0: - break - if rec.AcceptWaveform(data): - #bottom lines are for debugging - #print(rec.Result()) - break - else: - #bottom lines are for debugging - #print(rec.PartialResult()) - break - + + rec.AcceptWaveform(audio_data.get_raw_data(convert_rate=16000, convert_width=2)); finalRecognition = rec.FinalResult() + return finalRecognition def get_flac_converter(): From 696ddb286f2526b29d175c1a95c9dd76c75ccbda Mon Sep 17 00:00:00 2001 From: mytja <52399966+mytja@users.noreply.github.com> Date: Fri, 18 Sep 2020 20:26:05 +0200 Subject: [PATCH 07/13] Update of vosk with help of @nshmyrev Part 2 --- speech_recognition/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index ce2fc0da..00a34e8e 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1404,6 +1404,11 @@ def recognize_vosk(self, audio_data, language='en'): assert isinstance(audio_data, AudioData), "Data must be audio data" import pyaudio 
+ if not hasattr(self, 'vosk_model'): + self.vosk_model = Model() + + rec = KaldiRecognizer(self.vosk_model, 16000); + rec.AcceptWaveform(audio_data.get_raw_data(convert_rate=16000, convert_width=2)); finalRecognition = rec.FinalResult() From a8f270788053fa38ca58d1ac7d1ec36608169bab Mon Sep 17 00:00:00 2001 From: mytja <52399966+mytja@users.noreply.github.com> Date: Fri, 18 Sep 2020 20:27:01 +0200 Subject: [PATCH 08/13] Update of vosk with help of @nshmyrev Part 3 --- speech_recognition/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index 00a34e8e..17f4e5bb 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1402,7 +1402,6 @@ def recognize_vosk(self, audio_data, language='en'): exit (1) assert isinstance(audio_data, AudioData), "Data must be audio data" - import pyaudio if not hasattr(self, 'vosk_model'): self.vosk_model = Model() From a4c29cbf45eb3a225a91d0aa2062241e5fff4530 Mon Sep 17 00:00:00 2001 From: mytja <52399966+mytja@users.noreply.github.com> Date: Fri, 18 Sep 2020 20:28:13 +0200 Subject: [PATCH 09/13] Update of vosk with help of @nshmyrev Part 4 --- speech_recognition/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index 17f4e5bb..77f5d385 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -25,9 +25,6 @@ __version__ = "3.8.1" __license__ = "BSD" -# model for Vosk -modelVosk = Model("model") - try: # attempt to use the Python 2 modules from urllib import urlencode from urllib2 import Request, urlopen, URLError, HTTPError From ce3023f4c17aed92e8ffc957b6ba16a16335cb6c Mon Sep 17 00:00:00 2001 From: mytja <52399966+mytja@users.noreply.github.com> Date: Fri, 18 Sep 2020 20:37:09 +0200 Subject: [PATCH 10/13] Update of vosk with help of @nshmyrev Last part - Updating readme --- README.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) 
diff --git a/README.rst b/README.rst index b9ceb8c7..4b224960 100644 --- a/README.rst +++ b/README.rst @@ -54,6 +54,8 @@ The `library reference `__ for information about installing languages, compiling PocketSphinx, and building language packs from online resources. This document is also included under ``reference/pocketsphinx.rst``. +You have to install a Vosk model to use Vosk. `Here `__ are the models available. You have to unpack the chosen model as a folder named ``model`` in the directory you run your program from, like "your-project-folder/model" + Examples -------- @@ -132,6 +134,16 @@ Note that the versions available in most package repositories are outdated and w See `Notes on using PocketSphinx `__ for information about installing languages, compiling PocketSphinx, and building language packs from online resources. This document is also included under ``reference/pocketsphinx.rst``. +Vosk (for Vosk users) +~~~~~~~~~~~~~~~~~~~~~ +Vosk API is **required if and only if you want to use Vosk recognizer** (``recognizer_instance.recognize_vosk``). + +You can install it with ``python3 -m pip install vosk``. + +You also have to install Vosk Models: + +`Here `__ are the models available for download. 
You have to unpack the chosen model as a folder named ``model`` in the directory you run your program from, like "your-project-folder/model" + Google Cloud Speech Library for Python (for Google Cloud Speech API users) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From dfad80ef4563002d96a9f85bee34bea81a893ad9 Mon Sep 17 00:00:00 2001 From: mytja <52399966+mytja@users.noreply.github.com> Date: Fri, 18 Sep 2020 20:40:28 +0200 Subject: [PATCH 11/13] Update of vosk with help of @nshmyrev Hoping that it's the last part --- speech_recognition/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index 77f5d385..63ea2a39 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1393,14 +1393,14 @@ def recognize_tensorflow(self, audio_data, tensor_graph='tensorflow-data/conv_ac def recognize_vosk(self, audio_data, language='en'): from vosk import Model, KaldiRecognizer - - if not os.path.exists("model"): - return "Please download the model from https://github.com/alphacep/vosk-api/blob/master/doc/models.md and unpack as 'model' in the current folder." - exit (1) assert isinstance(audio_data, AudioData), "Data must be audio data" + assert isinstance(language, AudioData), "Language data" if not hasattr(self, 'vosk_model'): + if not os.path.exists("model"): + return "Please download the model from https://github.com/alphacep/vosk-api/blob/master/doc/models.md and unpack as 'model' in the current folder." 
+ exit (1) self.vosk_model = Model() rec = KaldiRecognizer(self.vosk_model, 16000); From a1a7a14a88bdab3fd7b49fc1b92ca887570369f0 Mon Sep 17 00:00:00 2001 From: mytja <52399966+mytja@users.noreply.github.com> Date: Fri, 18 Sep 2020 20:41:07 +0200 Subject: [PATCH 12/13] Update of vosk with help of @nshmyrev Fix, because i screwed up --- speech_recognition/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index 63ea2a39..095d8e89 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1395,7 +1395,7 @@ def recognize_vosk(self, audio_data, language='en'): from vosk import Model, KaldiRecognizer assert isinstance(audio_data, AudioData), "Data must be audio data" - assert isinstance(language, AudioData), "Language data" + assert isinstance(language, str), "Language data" if not hasattr(self, 'vosk_model'): if not os.path.exists("model"): From 274f5eb05bb56ffa591c402b19b6c230af9c5dff Mon Sep 17 00:00:00 2001 From: mytja <52399966+mytja@users.noreply.github.com> Date: Sat, 19 Sep 2020 10:33:59 +0200 Subject: [PATCH 13/13] Final update with help of @nshmyrev Finnaly final update --- speech_recognition/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index 095d8e89..37b17292 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1395,13 +1395,12 @@ def recognize_vosk(self, audio_data, language='en'): from vosk import Model, KaldiRecognizer assert isinstance(audio_data, AudioData), "Data must be audio data" - assert isinstance(language, str), "Language data" if not hasattr(self, 'vosk_model'): if not os.path.exists("model"): return "Please download the model from https://github.com/alphacep/vosk-api/blob/master/doc/models.md and unpack as 'model' in the current folder." 
exit (1) - self.vosk_model = Model() + self.vosk_model = Model("model") rec = KaldiRecognizer(self.vosk_model, 16000);