From 036a53c442b325e847df94854ae0eeafb7a6ed13 Mon Sep 17 00:00:00 2001 From: Tommy Falgout Date: Tue, 23 Oct 2018 22:58:37 -0500 Subject: [PATCH 1/6] Add recognize_azure() due to Bing Speech API deprecation --- speech_recognition/__init__.py | 94 ++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index da2c998f..c4fd8cab 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1015,6 +1015,100 @@ def recognize_wit(self, audio_data, key, show_all=False): if "_text" not in result or result["_text"] is None: raise UnknownValueError() return result["_text"] + def recognize_azure(self, audio_data, key, language="en-US", result_format="simple", profanity="masked", location="westus", show_all=False): + """ + Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Microsoft Azure Speech API. + + The Microsoft Azure Speech API key is specified by ``key``. Unfortunately, these are not available without `signing up for an account `__ with Microsoft Azure. + + To get the API key, go to the `Microsoft Azure Portal Resources `__ page, go to "All Resources" > "Add" > "See All" > Search "Speech" > "Create", and fill in the form to make a "Speech" resource. On the resulting page (which is also accessible from the "All Resources" page in the Azure Portal), go to the "Show Access Keys" page, which will have two API keys, either of which can be used for the ``key`` parameter. Microsoft Azure Speech API keys are 32-character lowercase hexadecimal strings. + + The recognition language is determined by ``language``, a BCP-47 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language values can be found in the `API documentation `__ under "Interactive and dictation mode". + + Returns the most likely transcription if ``show_all`` is false (the default). 
Otherwise, returns the `raw API response `__ as a JSON dictionary. + + Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. + """ + assert isinstance(audio_data, AudioData), "Data must be audio data" + assert isinstance(key, str), "``key`` must be a string" + assert isinstance(result_format, str), "``format`` must be a string" + assert isinstance(language, str), "``language`` must be a string" + + access_token, expire_time = getattr(self, "azure_cached_access_token", None), getattr(self, "azure_cached_access_token_expiry", None) + allow_caching = True + try: + from time import monotonic # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+ + except ImportError: + try: + from monotonic import monotonic # use time.monotonic backport for Python 2 if available (from https://pypi.python.org/pypi/monotonic) + except (ImportError, RuntimeError): + expire_time = None # monotonic time not available, don't cache access tokens + allow_caching = False # don't allow caching, since monotonic time isn't available + if expire_time is None or monotonic() > expire_time: # caching not enabled, first credential request, or the access token from the previous one expired + # get an access token using OAuth + credential_url = "https://" + location + ".api.cognitive.microsoft.com/sts/v1.0/issueToken" + credential_request = Request(credential_url, data=b"", headers={ + "Content-type": "application/x-www-form-urlencoded", + "Content-Length": "0", + "Ocp-Apim-Subscription-Key": key, + }) + + if allow_caching: + start_time = monotonic() + + try: + credential_response = urlopen(credential_request, timeout=60) # credential response can take longer, use longer timeout instead of default one + except HTTPError as e: + raise 
RequestError("credential request failed: {}".format(e.reason)) + except URLError as e: + raise RequestError("credential connection failed: {}".format(e.reason)) + access_token = credential_response.read().decode("utf-8") + + if allow_caching: + # save the token for the duration it is valid for + self.azure_cached_access_token = access_token + self.azure_cached_access_token_expiry = start_time + 600 # according to https://docs.microsoft.com/en-us/azure/cognitive-services/Speech-Service/rest-apis#authentication, the token expires in exactly 10 minutes + + wav_data = audio_data.get_wav_data( + convert_rate=16000, # audio samples must be 8kHz or 16 kHz + convert_width=2 # audio samples should be 16-bit + ) + + url = "https://" + location + ".stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1?{}".format(urlencode({ + "language": language, + "format": result_format, + "profanity": profanity + })) + + if sys.version_info >= (3, 6): # chunked-transfer requests are only supported in the standard library as of Python 3.6+, use it if possible + request = Request(url, data=io.BytesIO(wav_data), headers={ + "Authorization": "Bearer {}".format(access_token), + "Content-type": "audio/wav; codec=\"audio/pcm\"; samplerate=16000", + "Transfer-Encoding": "chunked", + }) + else: # fall back on manually formatting the POST body as a chunked request + ascii_hex_data_length = "{:X}".format(len(wav_data)).encode("utf-8") + chunked_transfer_encoding_data = ascii_hex_data_length + b"\r\n" + wav_data + b"\r\n0\r\n\r\n" + request = Request(url, data=chunked_transfer_encoding_data, headers={ + "Authorization": "Bearer {}".format(access_token), + "Content-type": "audio/wav; codec=\"audio/pcm\"; samplerate=16000", + "Transfer-Encoding": "chunked", + }) + + try: + response = urlopen(request, timeout=self.operation_timeout) + except HTTPError as e: + raise RequestError("recognition request failed: {}".format(e.reason)) + except URLError as e: + raise 
RequestError("recognition connection failed: {}".format(e.reason)) + response_text = response.read().decode("utf-8") + result = json.loads(response_text) + + # return results + if show_all: return result + if "RecognitionStatus" not in result or result["RecognitionStatus"] != "Success" or "DisplayText" not in result: raise UnknownValueError() + return result["DisplayText"] + def recognize_bing(self, audio_data, key, language="en-US", show_all=False): """ Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Microsoft Bing Speech API. From c008d584aa91a9180863762a939cffdc1d214d09 Mon Sep 17 00:00:00 2001 From: Tommy Falgout Date: Thu, 25 Oct 2018 12:22:52 -0500 Subject: [PATCH 2/6] Update documentation --- README.rst | 3 ++- examples/microphone_recognition.py | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 7eb3a059..42909375 100644 --- a/README.rst +++ b/README.rst @@ -29,7 +29,8 @@ Speech recognition engine/API support: * Google Speech Recognition * `Google Cloud Speech API `__ * `Wit.ai `__ -* `Microsoft Bing Voice Recognition `__ +* `Microsoft Bing Voice Recognition (Deprecated) `__ +* `Microsoft Azure Speech `__ * `Houndify API `__ * `IBM Speech to Text `__ * `Snowboy Hotword Detection `__ (works offline) diff --git a/examples/microphone_recognition.py b/examples/microphone_recognition.py index 3d830f5a..325074e1 100644 --- a/examples/microphone_recognition.py +++ b/examples/microphone_recognition.py @@ -56,6 +56,15 @@ except sr.RequestError as e: print("Could not request results from Microsoft Bing Voice Recognition service; {0}".format(e)) +# recognize speech using Microsoft Azure Speech +AZURE_SPEECH_KEY = "INSERT AZURE SPEECH API KEY HERE" # Microsoft Speech API keys 32-character lowercase hexadecimal strings +try: + print("Microsoft Azure Speech thinks you said " + r.recognize_azure(audio, key=AZURE_SPEECH_KEY)) +except sr.UnknownValueError: + print("Microsoft Azure 
Speech could not understand audio") +except sr.RequestError as e: + print("Could not request results from Microsoft Azure Speech service; {0}".format(e)) + # recognize speech using Houndify HOUNDIFY_CLIENT_ID = "INSERT HOUNDIFY CLIENT ID HERE" # Houndify client IDs are Base64-encoded strings HOUNDIFY_CLIENT_KEY = "INSERT HOUNDIFY CLIENT KEY HERE" # Houndify client keys are Base64-encoded strings From 135c4cb3644047b86c200e0b2015f0ffc56e947f Mon Sep 17 00:00:00 2001 From: Tommy Falgout Date: Thu, 25 Oct 2018 12:24:27 -0500 Subject: [PATCH 3/6] Fix audio_transcribe example and link to Azure Speech docs --- README.rst | 2 +- examples/audio_transcribe.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 42909375..44bdf726 100644 --- a/README.rst +++ b/README.rst @@ -29,8 +29,8 @@ Speech recognition engine/API support: * Google Speech Recognition * `Google Cloud Speech API `__ * `Wit.ai `__ +* `Microsoft Azure Speech `__ * `Microsoft Bing Voice Recognition (Deprecated) `__ -* `Microsoft Azure Speech `__ * `Houndify API `__ * `IBM Speech to Text `__ * `Snowboy Hotword Detection `__ (works offline) diff --git a/examples/audio_transcribe.py b/examples/audio_transcribe.py index 37d149c5..7806023f 100644 --- a/examples/audio_transcribe.py +++ b/examples/audio_transcribe.py @@ -50,6 +50,15 @@ except sr.RequestError as e: print("Could not request results from Wit.ai service; {0}".format(e)) +# recognize speech using Microsoft Azure Speech +AZURE_SPEECH_KEY = "INSERT AZURE SPEECH API KEY HERE" # Microsoft Speech API keys are 32-character lowercase hexadecimal strings +try: + print("Microsoft Azure Speech thinks you said " + r.recognize_azure(audio, key=AZURE_SPEECH_KEY)) +except sr.UnknownValueError: + print("Microsoft Azure Speech could not understand audio") +except sr.RequestError as e: + print("Could not request results from Microsoft Azure Speech service; {0}".format(e)) + # recognize speech using Microsoft Bing Voice 
Recognition BING_KEY = "INSERT BING API KEY HERE" # Microsoft Bing Voice Recognition API keys 32-character lowercase hexadecimal strings try: From 4e2fa4a90f59537d07aedcfbff4ffa542ec8b8d9 Mon Sep 17 00:00:00 2001 From: Tommy Falgout Date: Tue, 23 Oct 2018 22:58:37 -0500 Subject: [PATCH 4/6] Add recognize_azure() due to Bing Speech API deprecation --- speech_recognition/__init__.py | 94 ++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index 29325fb7..f1d84663 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1018,6 +1018,100 @@ def recognize_wit(self, audio_data, key, show_all=False): if "_text" not in result or result["_text"] is None: raise UnknownValueError() return result["_text"] + def recognize_azure(self, audio_data, key, language="en-US", result_format="simple", profanity="masked", location="westus", show_all=False): + """ + Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Microsoft Azure Speech API. + + The Microsoft Azure Speech API key is specified by ``key``. Unfortunately, these are not available without `signing up for an account `__ with Microsoft Azure. + + To get the API key, go to the `Microsoft Azure Portal Resources `__ page, go to "All Resources" > "Add" > "See All" > Search "Speech" > "Create", and fill in the form to make a "Speech" resource. On the resulting page (which is also accessible from the "All Resources" page in the Azure Portal), go to the "Show Access Keys" page, which will have two API keys, either of which can be used for the ``key`` parameter. Microsoft Azure Speech API keys are 32-character lowercase hexadecimal strings. + + The recognition language is determined by ``language``, a BCP-47 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. 
A list of supported language values can be found in the `API documentation `__ under "Interactive and dictation mode". + + Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response `__ as a JSON dictionary. + + Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. + """ + assert isinstance(audio_data, AudioData), "Data must be audio data" + assert isinstance(key, str), "``key`` must be a string" + assert isinstance(result_format, str), "``format`` must be a string" + assert isinstance(language, str), "``language`` must be a string" + + access_token, expire_time = getattr(self, "azure_cached_access_token", None), getattr(self, "azure_cached_access_token_expiry", None) + allow_caching = True + try: + from time import monotonic # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+ + except ImportError: + try: + from monotonic import monotonic # use time.monotonic backport for Python 2 if available (from https://pypi.python.org/pypi/monotonic) + except (ImportError, RuntimeError): + expire_time = None # monotonic time not available, don't cache access tokens + allow_caching = False # don't allow caching, since monotonic time isn't available + if expire_time is None or monotonic() > expire_time: # caching not enabled, first credential request, or the access token from the previous one expired + # get an access token using OAuth + credential_url = "https://" + location + ".api.cognitive.microsoft.com/sts/v1.0/issueToken" + credential_request = Request(credential_url, data=b"", headers={ + "Content-type": "application/x-www-form-urlencoded", + "Content-Length": "0", + "Ocp-Apim-Subscription-Key": key, + }) + + if allow_caching: + start_time = monotonic() 
+ + try: + credential_response = urlopen(credential_request, timeout=60) # credential response can take longer, use longer timeout instead of default one + except HTTPError as e: + raise RequestError("credential request failed: {}".format(e.reason)) + except URLError as e: + raise RequestError("credential connection failed: {}".format(e.reason)) + access_token = credential_response.read().decode("utf-8") + + if allow_caching: + # save the token for the duration it is valid for + self.azure_cached_access_token = access_token + self.azure_cached_access_token_expiry = start_time + 600 # according to https://docs.microsoft.com/en-us/azure/cognitive-services/Speech-Service/rest-apis#authentication, the token expires in exactly 10 minutes + + wav_data = audio_data.get_wav_data( + convert_rate=16000, # audio samples must be 8kHz or 16 kHz + convert_width=2 # audio samples should be 16-bit + ) + + url = "https://" + location + ".stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1?{}".format(urlencode({ + "language": language, + "format": result_format, + "profanity": profanity + })) + + if sys.version_info >= (3, 6): # chunked-transfer requests are only supported in the standard library as of Python 3.6+, use it if possible + request = Request(url, data=io.BytesIO(wav_data), headers={ + "Authorization": "Bearer {}".format(access_token), + "Content-type": "audio/wav; codec=\"audio/pcm\"; samplerate=16000", + "Transfer-Encoding": "chunked", + }) + else: # fall back on manually formatting the POST body as a chunked request + ascii_hex_data_length = "{:X}".format(len(wav_data)).encode("utf-8") + chunked_transfer_encoding_data = ascii_hex_data_length + b"\r\n" + wav_data + b"\r\n0\r\n\r\n" + request = Request(url, data=chunked_transfer_encoding_data, headers={ + "Authorization": "Bearer {}".format(access_token), + "Content-type": "audio/wav; codec=\"audio/pcm\"; samplerate=16000", + "Transfer-Encoding": "chunked", + }) + + try: + response = 
urlopen(request, timeout=self.operation_timeout) + except HTTPError as e: + raise RequestError("recognition request failed: {}".format(e.reason)) + except URLError as e: + raise RequestError("recognition connection failed: {}".format(e.reason)) + response_text = response.read().decode("utf-8") + result = json.loads(response_text) + + # return results + if show_all: return result + if "RecognitionStatus" not in result or result["RecognitionStatus"] != "Success" or "DisplayText" not in result: raise UnknownValueError() + return result["DisplayText"] + def recognize_bing(self, audio_data, key, language="en-US", show_all=False): """ Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Microsoft Bing Speech API. From fed05fec2668fd3ed44aea8c03978b6dcbff8b0b Mon Sep 17 00:00:00 2001 From: Tommy Falgout Date: Thu, 25 Oct 2018 12:22:52 -0500 Subject: [PATCH 5/6] Update documentation --- README.rst | 3 ++- examples/microphone_recognition.py | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 7eb3a059..42909375 100644 --- a/README.rst +++ b/README.rst @@ -29,7 +29,8 @@ Speech recognition engine/API support: * Google Speech Recognition * `Google Cloud Speech API `__ * `Wit.ai `__ -* `Microsoft Bing Voice Recognition `__ +* `Microsoft Bing Voice Recognition (Deprecated) `__ +* `Microsoft Azure Speech `__ * `Houndify API `__ * `IBM Speech to Text `__ * `Snowboy Hotword Detection `__ (works offline) diff --git a/examples/microphone_recognition.py b/examples/microphone_recognition.py index 3d830f5a..325074e1 100644 --- a/examples/microphone_recognition.py +++ b/examples/microphone_recognition.py @@ -56,6 +56,15 @@ except sr.RequestError as e: print("Could not request results from Microsoft Bing Voice Recognition service; {0}".format(e)) +# recognize speech using Microsoft Azure Speech +AZURE_SPEECH_KEY = "INSERT AZURE SPEECH API KEY HERE" # Microsoft Speech API keys 32-character lowercase 
hexadecimal strings +try: + print("Microsoft Azure Speech thinks you said " + r.recognize_azure(audio, key=AZURE_SPEECH_KEY)) +except sr.UnknownValueError: + print("Microsoft Azure Speech could not understand audio") +except sr.RequestError as e: + print("Could not request results from Microsoft Azure Speech service; {0}".format(e)) + # recognize speech using Houndify HOUNDIFY_CLIENT_ID = "INSERT HOUNDIFY CLIENT ID HERE" # Houndify client IDs are Base64-encoded strings HOUNDIFY_CLIENT_KEY = "INSERT HOUNDIFY CLIENT KEY HERE" # Houndify client keys are Base64-encoded strings From 804f197ba632755408704ad21732d5044b0b4cb2 Mon Sep 17 00:00:00 2001 From: Tommy Falgout Date: Thu, 25 Oct 2018 12:24:27 -0500 Subject: [PATCH 6/6] Fix audio_transcribe example and link to Azure Speech docs --- README.rst | 2 +- examples/audio_transcribe.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 42909375..44bdf726 100644 --- a/README.rst +++ b/README.rst @@ -29,8 +29,8 @@ Speech recognition engine/API support: * Google Speech Recognition * `Google Cloud Speech API `__ * `Wit.ai `__ +* `Microsoft Azure Speech `__ * `Microsoft Bing Voice Recognition (Deprecated) `__ -* `Microsoft Azure Speech `__ * `Houndify API `__ * `IBM Speech to Text `__ * `Snowboy Hotword Detection `__ (works offline) diff --git a/examples/audio_transcribe.py b/examples/audio_transcribe.py index 37d149c5..7806023f 100644 --- a/examples/audio_transcribe.py +++ b/examples/audio_transcribe.py @@ -50,6 +50,15 @@ except sr.RequestError as e: print("Could not request results from Wit.ai service; {0}".format(e)) +# recognize speech using Microsoft Azure Speech +AZURE_SPEECH_KEY = "INSERT AZURE SPEECH API KEY HERE" # Microsoft Speech API keys are 32-character lowercase hexadecimal strings +try: + print("Microsoft Azure Speech thinks you said " + r.recognize_azure(audio, key=AZURE_SPEECH_KEY)) +except sr.UnknownValueError: + print("Microsoft Azure Speech could not 
understand audio") +except sr.RequestError as e: + print("Could not request results from Microsoft Azure Speech service; {0}".format(e)) + # recognize speech using Microsoft Bing Voice Recognition BING_KEY = "INSERT BING API KEY HERE" # Microsoft Bing Voice Recognition API keys 32-character lowercase hexadecimal strings try: