diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index aa81884a..bf6233be 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: true matrix: - python-version: ["3.7", "3.8", "3.9", "3.10"] + python-version: ["3.8", "3.9", "3.10"] steps: - uses: actions/checkout@v3 diff --git a/README.rst b/README.rst index f9bde14e..410e289d 100644 --- a/README.rst +++ b/README.rst @@ -39,6 +39,7 @@ Speech recognition engine/API support: * `Tensorflow `__ * `Vosk API `__ (works offline) * `OpenAI whisper `__ (works offline) +* `Whisper API `__ **Quickstart:** ``pip install SpeechRecognition``. See the "Installing" section for more details. @@ -88,13 +89,14 @@ Requirements To use all of the functionality of the library, you should have: -* **Python** 3.7+ (required) +* **Python** 3.8+ (required) * **PyAudio** 0.2.11+ (required only if you need to use microphone input, ``Microphone``) * **PocketSphinx** (required only if you need to use the Sphinx recognizer, ``recognizer_instance.recognize_sphinx``) * **Google API Client Library for Python** (required only if you need to use the Google Cloud Speech API, ``recognizer_instance.recognize_google_cloud``) * **FLAC encoder** (required only if the system is not x86-based Windows/Linux/OS X) * **Vosk** (required only if you need to use Vosk API speech recognition ``recognizer_instance.recognize_vosk``) * **Whisper** (required only if you need to use Whisper ``recognizer_instance.recognize_whisper``) +* **openai** (required only if you need to use Whisper API speech recognition ``recognizer_instance.recognize_whisper_api``) The following requirements are optional, but can improve or extend functionality in some situations: @@ -105,7 +107,7 @@ The following sections go over the details of each requirement. Python ~~~~~~ -The first software requirement is `Python 3.7+ `__. This is required to use the library. 
+The first software requirement is `Python 3.8+ `__. This is required to use the library. PyAudio (for microphone users) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -169,6 +171,15 @@ Whisper is **required if and only if you want to use whisper** (``recognizer_ins You can install it with ``python3 -m pip install git+https://github.com/openai/whisper.git soundfile``. +Whisper API (for Whisper API users) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The library `openai `__ is **required if and only if you want to use Whisper API** (``recognizer_instance.recognize_whisper_api``). + +If not installed, everything in the library will still work, except calling ``recognizer_instance.recognize_whisper_api`` will raise a ``SetupError``. + +You can install it with ``python3 -m pip install openai``. + Troubleshooting --------------- diff --git a/examples/microphone_recognition.py b/examples/microphone_recognition.py index 56168b29..c46b412a 100644 --- a/examples/microphone_recognition.py +++ b/examples/microphone_recognition.py @@ -92,3 +92,10 @@ print("Whisper could not understand audio") except sr.RequestError as e: print("Could not request results from Whisper") + +# recognize speech using Whisper API +OPENAI_API_KEY = "INSERT OPENAI API KEY HERE" +try: + print(f"Whisper API thinks you said {r.recognize_whisper_api(audio, api_key=OPENAI_API_KEY)}") +except sr.RequestError as e: + print("Could not request results from Whisper API") diff --git a/reference/library-reference.rst b/reference/library-reference.rst index 7323bd9b..0aa7a8ce 100644 --- a/reference/library-reference.rst +++ b/reference/library-reference.rst @@ -314,6 +314,17 @@ You can translate the result to english with Whisper by passing translate=True Other values are passed directly to whisper.
See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options +``recognizer_instance.recognize_whisper_api(audio_data: AudioData, model: str = "whisper-1", api_key: str | None = None)`` +-------------------------------------------------------------------------------------------------------------------------- + +Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the OpenAI Whisper API. + +This function requires an OpenAI account; visit https://platform.openai.com/signup, then generate API Key in `User settings `__. + +Detail: https://platform.openai.com/docs/guides/speech-to-text + +Raises a ``speech_recognition.exceptions.SetupError`` exception if there are any issues with the openai installation, or the environment variable is missing. + ``AudioSource`` --------------- diff --git a/setup.cfg b/setup.cfg index 3af79a7b..69c937c4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,3 +2,7 @@ # the `universal` setting means that the project runs unmodified on both Python 2 and 3, # and doesn't use any C extensions to Python universal=1 + +[options.extras_require] +whisper-api = + openai diff --git a/setup.py b/setup.py index 231e390d..2b10a084 100644 --- a/setup.py +++ b/setup.py @@ -58,13 +58,12 @@ def run(self): "Operating System :: Other OS", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Topic :: Software Development :: Libraries :: Python Modules", "Topic :: Multimedia :: Sound/Audio :: Speech", ], - python_requires=">=3.7", + python_requires=">=3.8", install_requires=['requests>=2.26.0'], ) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index 66ebc04c..8365d8e3 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -15,13 +15,10 @@ import json import base64 import 
threading -import platform -import stat import hashlib import hmac import time import uuid -from pprint import pprint try: import requests @@ -36,20 +33,15 @@ from urllib.request import Request, urlopen from urllib.error import URLError, HTTPError - -class WaitTimeoutError(Exception): pass - - -class RequestError(Exception): pass - - -class UnknownValueError(Exception): pass - - -class TranscriptionNotReady(Exception): pass - - -class TranscriptionFailed(Exception): pass +from .audio import AudioData, get_flac_converter +from .exceptions import ( + RequestError, + TranscriptionFailed, + TranscriptionNotReady, + UnknownValueError, + WaitTimeoutError, +) +from .recognizers import whisper class AudioSource(object): @@ -331,180 +323,6 @@ def read(self, size=-1): return buffer -class AudioData(object): - """ - Creates a new ``AudioData`` instance, which represents mono audio data. - - The raw audio data is specified by ``frame_data``, which is a sequence of bytes representing audio samples. This is the frame data structure used by the PCM WAV format. - - The width of each sample, in bytes, is specified by ``sample_width``. Each group of ``sample_width`` bytes represents a single audio sample. - - The audio data is assumed to have a sample rate of ``sample_rate`` samples per second (Hertz). - - Usually, instances of this class are obtained from ``recognizer_instance.record`` or ``recognizer_instance.listen``, or in the callback for ``recognizer_instance.listen_in_background``, rather than instantiating them directly. 
- """ - def __init__(self, frame_data, sample_rate, sample_width): - assert sample_rate > 0, "Sample rate must be a positive integer" - assert sample_width % 1 == 0 and 1 <= sample_width <= 4, "Sample width must be between 1 and 4 inclusive" - self.frame_data = frame_data - self.sample_rate = sample_rate - self.sample_width = int(sample_width) - - def get_segment(self, start_ms=None, end_ms=None): - """ - Returns a new ``AudioData`` instance, trimmed to a given time interval. In other words, an ``AudioData`` instance with the same audio data except starting at ``start_ms`` milliseconds in and ending ``end_ms`` milliseconds in. - - If not specified, ``start_ms`` defaults to the beginning of the audio, and ``end_ms`` defaults to the end. - """ - assert start_ms is None or start_ms >= 0, "``start_ms`` must be a non-negative number" - assert end_ms is None or end_ms >= (0 if start_ms is None else start_ms), "``end_ms`` must be a non-negative number greater or equal to ``start_ms``" - if start_ms is None: - start_byte = 0 - else: - start_byte = int((start_ms * self.sample_rate * self.sample_width) // 1000) - if end_ms is None: - end_byte = len(self.frame_data) - else: - end_byte = int((end_ms * self.sample_rate * self.sample_width) // 1000) - return AudioData(self.frame_data[start_byte:end_byte], self.sample_rate, self.sample_width) - - def get_raw_data(self, convert_rate=None, convert_width=None): - """ - Returns a byte string representing the raw frame data for the audio represented by the ``AudioData`` instance. - - If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match. - - If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match. - - Writing these bytes directly to a file results in a valid `RAW/PCM audio file `__. 
- """ - assert convert_rate is None or convert_rate > 0, "Sample rate to convert to must be a positive integer" - assert convert_width is None or (convert_width % 1 == 0 and 1 <= convert_width <= 4), "Sample width to convert to must be between 1 and 4 inclusive" - - raw_data = self.frame_data - - # make sure unsigned 8-bit audio (which uses unsigned samples) is handled like higher sample width audio (which uses signed samples) - if self.sample_width == 1: - raw_data = audioop.bias(raw_data, 1, -128) # subtract 128 from every sample to make them act like signed samples - - # resample audio at the desired rate if specified - if convert_rate is not None and self.sample_rate != convert_rate: - raw_data, _ = audioop.ratecv(raw_data, self.sample_width, 1, self.sample_rate, convert_rate, None) - - # convert samples to desired sample width if specified - if convert_width is not None and self.sample_width != convert_width: - if convert_width == 3: # we're converting the audio into 24-bit (workaround for https://bugs.python.org/issue12866) - raw_data = audioop.lin2lin(raw_data, self.sample_width, 4) # convert audio into 32-bit first, which is always supported - try: audioop.bias(b"", 3, 0) # test whether 24-bit audio is supported (for example, ``audioop`` in Python 3.3 and below don't support sample width 3, while Python 3.4+ do) - except audioop.error: # this version of audioop doesn't support 24-bit audio (probably Python 3.3 or less) - raw_data = b"".join(raw_data[i + 1:i + 4] for i in range(0, len(raw_data), 4)) # since we're in little endian, we discard the first byte from each 32-bit sample to get a 24-bit sample - else: # 24-bit audio fully supported, we don't need to shim anything - raw_data = audioop.lin2lin(raw_data, self.sample_width, convert_width) - else: - raw_data = audioop.lin2lin(raw_data, self.sample_width, convert_width) - - # if the output is 8-bit audio with unsigned samples, convert the samples we've been treating as signed to unsigned again - if 
convert_width == 1: - raw_data = audioop.bias(raw_data, 1, 128) # add 128 to every sample to make them act like unsigned samples again - - return raw_data - - def get_wav_data(self, convert_rate=None, convert_width=None): - """ - Returns a byte string representing the contents of a WAV file containing the audio represented by the ``AudioData`` instance. - - If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match. - - If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match. - - Writing these bytes directly to a file results in a valid `WAV file `__. - """ - raw_data = self.get_raw_data(convert_rate, convert_width) - sample_rate = self.sample_rate if convert_rate is None else convert_rate - sample_width = self.sample_width if convert_width is None else convert_width - - # generate the WAV file contents - with io.BytesIO() as wav_file: - wav_writer = wave.open(wav_file, "wb") - try: # note that we can't use context manager, since that was only added in Python 3.4 - wav_writer.setframerate(sample_rate) - wav_writer.setsampwidth(sample_width) - wav_writer.setnchannels(1) - wav_writer.writeframes(raw_data) - wav_data = wav_file.getvalue() - finally: # make sure resources are cleaned up - wav_writer.close() - return wav_data - - def get_aiff_data(self, convert_rate=None, convert_width=None): - """ - Returns a byte string representing the contents of an AIFF-C file containing the audio represented by the ``AudioData`` instance. - - If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match. - - If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match. - - Writing these bytes directly to a file results in a valid `AIFF-C file `__. 
- """ - raw_data = self.get_raw_data(convert_rate, convert_width) - sample_rate = self.sample_rate if convert_rate is None else convert_rate - sample_width = self.sample_width if convert_width is None else convert_width - - # the AIFF format is big-endian, so we need to convert the little-endian raw data to big-endian - if hasattr(audioop, "byteswap"): # ``audioop.byteswap`` was only added in Python 3.4 - raw_data = audioop.byteswap(raw_data, sample_width) - else: # manually reverse the bytes of each sample, which is slower but works well enough as a fallback - raw_data = raw_data[sample_width - 1::-1] + b"".join(raw_data[i + sample_width:i:-1] for i in range(sample_width - 1, len(raw_data), sample_width)) - - # generate the AIFF-C file contents - with io.BytesIO() as aiff_file: - aiff_writer = aifc.open(aiff_file, "wb") - try: # note that we can't use context manager, since that was only added in Python 3.4 - aiff_writer.setframerate(sample_rate) - aiff_writer.setsampwidth(sample_width) - aiff_writer.setnchannels(1) - aiff_writer.writeframes(raw_data) - aiff_data = aiff_file.getvalue() - finally: # make sure resources are cleaned up - aiff_writer.close() - return aiff_data - - def get_flac_data(self, convert_rate=None, convert_width=None): - """ - Returns a byte string representing the contents of a FLAC file containing the audio represented by the ``AudioData`` instance. - - Note that 32-bit FLAC is not supported. If the audio data is 32-bit and ``convert_width`` is not specified, then the resulting FLAC will be a 24-bit FLAC. - - If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match. - - If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match. - - Writing these bytes directly to a file results in a valid `FLAC file `__. 
- """ - assert convert_width is None or (convert_width % 1 == 0 and 1 <= convert_width <= 3), "Sample width to convert to must be between 1 and 3 inclusive" - - if self.sample_width > 3 and convert_width is None: # resulting WAV data would be 32-bit, which is not convertable to FLAC using our encoder - convert_width = 3 # the largest supported sample width is 24-bit, so we'll limit the sample width to that - - # run the FLAC converter with the WAV data to get the FLAC data - wav_data = self.get_wav_data(convert_rate, convert_width) - flac_converter = get_flac_converter() - if os.name == "nt": # on Windows, specify that the process is to be started without showing a console window - startup_info = subprocess.STARTUPINFO() - startup_info.dwFlags |= subprocess.STARTF_USESHOWWINDOW # specify that the wShowWindow field of `startup_info` contains a value - startup_info.wShowWindow = subprocess.SW_HIDE # specify that the console window should be hidden - else: - startup_info = None # default startupinfo - process = subprocess.Popen([ - flac_converter, - "--stdout", "--totally-silent", # put the resulting FLAC file in stdout, and make sure it's not mixed with any program output - "--best", # highest level of compression available - "-", # the input FLAC file contents will be given in stdin - ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, startupinfo=startup_info) - flac_data, stderr = process.communicate(wav_data) - return flac_data - - class Recognizer(AudioSource): def __init__(self): """ @@ -1683,6 +1501,7 @@ def recognize_whisper(self, audio_data, model="base", show_dict=False, load_opti else: return result["text"] + recognize_whisper_api = whisper.recognize_whisper_api def recognize_vosk(self, audio_data, language='en'): from vosk import Model, KaldiRecognizer @@ -1702,47 +1521,6 @@ def recognize_vosk(self, audio_data, language='en'): return finalRecognition -def get_flac_converter(): - """Returns the absolute path of a FLAC converter executable, or raises an 
OSError if none can be found.""" - flac_converter = shutil_which("flac") # check for installed version first - if flac_converter is None: # flac utility is not installed - base_path = os.path.dirname(os.path.abspath(__file__)) # directory of the current module file, where all the FLAC bundled binaries are stored - system, machine = platform.system(), platform.machine() - if system == "Windows" and machine in {"i686", "i786", "x86", "x86_64", "AMD64"}: - flac_converter = os.path.join(base_path, "flac-win32.exe") - elif system == "Darwin" and machine in {"i686", "i786", "x86", "x86_64", "AMD64"}: - flac_converter = os.path.join(base_path, "flac-mac") - elif system == "Linux" and machine in {"i686", "i786", "x86"}: - flac_converter = os.path.join(base_path, "flac-linux-x86") - elif system == "Linux" and machine in {"x86_64", "AMD64"}: - flac_converter = os.path.join(base_path, "flac-linux-x86_64") - else: # no FLAC converter available - raise OSError("FLAC conversion utility not available - consider installing the FLAC command line application by running `apt-get install flac` or your operating system's equivalent") - - # mark FLAC converter as executable if possible - try: - # handle known issue when running on docker: - # run executable right after chmod() may result in OSError "Text file busy" - # fix: flush FS with sync - if not os.access(flac_converter, os.X_OK): - stat_info = os.stat(flac_converter) - os.chmod(flac_converter, stat_info.st_mode | stat.S_IEXEC) - if 'Linux' in platform.system(): - os.sync() if sys.version_info >= (3, 3) else os.system('sync') - - except OSError: pass - - return flac_converter - - -def shutil_which(pgm): - """Python 2 compatibility: backport of ``shutil.which()`` from Python 3""" - path = os.getenv('PATH') - for p in path.split(os.path.pathsep): - p = os.path.join(p, pgm) - if os.path.exists(p) and os.access(p, os.X_OK): - return p - class PortableNamedTemporaryFile(object): """Limited replacement for 
``tempfile.NamedTemporaryFile``, except unlike ``tempfile.NamedTemporaryFile``, the file can be opened again while it's currently open, even on Windows.""" diff --git a/speech_recognition/audio.py b/speech_recognition/audio.py new file mode 100644 index 00000000..732f7e01 --- /dev/null +++ b/speech_recognition/audio.py @@ -0,0 +1,317 @@ +import aifc +import audioop +import io +import os +import platform +import stat +import subprocess +import sys +import wave + + +class AudioData(object): + """ + Creates a new ``AudioData`` instance, which represents mono audio data. + + The raw audio data is specified by ``frame_data``, which is a sequence of bytes representing audio samples. This is the frame data structure used by the PCM WAV format. + + The width of each sample, in bytes, is specified by ``sample_width``. Each group of ``sample_width`` bytes represents a single audio sample. + + The audio data is assumed to have a sample rate of ``sample_rate`` samples per second (Hertz). + + Usually, instances of this class are obtained from ``recognizer_instance.record`` or ``recognizer_instance.listen``, or in the callback for ``recognizer_instance.listen_in_background``, rather than instantiating them directly. + """ + + def __init__(self, frame_data, sample_rate, sample_width): + assert sample_rate > 0, "Sample rate must be a positive integer" + assert ( + sample_width % 1 == 0 and 1 <= sample_width <= 4 + ), "Sample width must be between 1 and 4 inclusive" + self.frame_data = frame_data + self.sample_rate = sample_rate + self.sample_width = int(sample_width) + + def get_segment(self, start_ms=None, end_ms=None): + """ + Returns a new ``AudioData`` instance, trimmed to a given time interval. In other words, an ``AudioData`` instance with the same audio data except starting at ``start_ms`` milliseconds in and ending ``end_ms`` milliseconds in. + + If not specified, ``start_ms`` defaults to the beginning of the audio, and ``end_ms`` defaults to the end. 
+ """ + assert ( + start_ms is None or start_ms >= 0 + ), "``start_ms`` must be a non-negative number" + assert end_ms is None or end_ms >= ( + 0 if start_ms is None else start_ms + ), "``end_ms`` must be a non-negative number greater or equal to ``start_ms``" + if start_ms is None: + start_byte = 0 + else: + start_byte = int( + (start_ms * self.sample_rate * self.sample_width) // 1000 + ) + if end_ms is None: + end_byte = len(self.frame_data) + else: + end_byte = int( + (end_ms * self.sample_rate * self.sample_width) // 1000 + ) + return AudioData( + self.frame_data[start_byte:end_byte], + self.sample_rate, + self.sample_width, + ) + + def get_raw_data(self, convert_rate=None, convert_width=None): + """ + Returns a byte string representing the raw frame data for the audio represented by the ``AudioData`` instance. + + If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match. + + If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match. + + Writing these bytes directly to a file results in a valid `RAW/PCM audio file `__. 
+ """ + assert ( + convert_rate is None or convert_rate > 0 + ), "Sample rate to convert to must be a positive integer" + assert convert_width is None or ( + convert_width % 1 == 0 and 1 <= convert_width <= 4 + ), "Sample width to convert to must be between 1 and 4 inclusive" + + raw_data = self.frame_data + + # make sure unsigned 8-bit audio (which uses unsigned samples) is handled like higher sample width audio (which uses signed samples) + if self.sample_width == 1: + raw_data = audioop.bias( + raw_data, 1, -128 + ) # subtract 128 from every sample to make them act like signed samples + + # resample audio at the desired rate if specified + if convert_rate is not None and self.sample_rate != convert_rate: + raw_data, _ = audioop.ratecv( + raw_data, + self.sample_width, + 1, + self.sample_rate, + convert_rate, + None, + ) + + # convert samples to desired sample width if specified + if convert_width is not None and self.sample_width != convert_width: + if ( + convert_width == 3 + ): # we're converting the audio into 24-bit (workaround for https://bugs.python.org/issue12866) + raw_data = audioop.lin2lin( + raw_data, self.sample_width, 4 + ) # convert audio into 32-bit first, which is always supported + try: + audioop.bias( + b"", 3, 0 + ) # test whether 24-bit audio is supported (for example, ``audioop`` in Python 3.3 and below don't support sample width 3, while Python 3.4+ do) + except ( + audioop.error + ): # this version of audioop doesn't support 24-bit audio (probably Python 3.3 or less) + raw_data = b"".join( + raw_data[i + 1 : i + 4] + for i in range(0, len(raw_data), 4) + ) # since we're in little endian, we discard the first byte from each 32-bit sample to get a 24-bit sample + else: # 24-bit audio fully supported, we don't need to shim anything + raw_data = audioop.lin2lin( + raw_data, self.sample_width, convert_width + ) + else: + raw_data = audioop.lin2lin( + raw_data, self.sample_width, convert_width + ) + + # if the output is 8-bit audio with unsigned 
samples, convert the samples we've been treating as signed to unsigned again + if convert_width == 1: + raw_data = audioop.bias( + raw_data, 1, 128 + ) # add 128 to every sample to make them act like unsigned samples again + + return raw_data + + def get_wav_data(self, convert_rate=None, convert_width=None): + """ + Returns a byte string representing the contents of a WAV file containing the audio represented by the ``AudioData`` instance. + + If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match. + + If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match. + + Writing these bytes directly to a file results in a valid `WAV file `__. + """ + raw_data = self.get_raw_data(convert_rate, convert_width) + sample_rate = ( + self.sample_rate if convert_rate is None else convert_rate + ) + sample_width = ( + self.sample_width if convert_width is None else convert_width + ) + + # generate the WAV file contents + with io.BytesIO() as wav_file: + wav_writer = wave.open(wav_file, "wb") + try: # note that we can't use context manager, since that was only added in Python 3.4 + wav_writer.setframerate(sample_rate) + wav_writer.setsampwidth(sample_width) + wav_writer.setnchannels(1) + wav_writer.writeframes(raw_data) + wav_data = wav_file.getvalue() + finally: # make sure resources are cleaned up + wav_writer.close() + return wav_data + + def get_aiff_data(self, convert_rate=None, convert_width=None): + """ + Returns a byte string representing the contents of an AIFF-C file containing the audio represented by the ``AudioData`` instance. + + If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match. + + If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match. 
+ + Writing these bytes directly to a file results in a valid `AIFF-C file `__. + """ + raw_data = self.get_raw_data(convert_rate, convert_width) + sample_rate = ( + self.sample_rate if convert_rate is None else convert_rate + ) + sample_width = ( + self.sample_width if convert_width is None else convert_width + ) + + # the AIFF format is big-endian, so we need to convert the little-endian raw data to big-endian + if hasattr( + audioop, "byteswap" + ): # ``audioop.byteswap`` was only added in Python 3.4 + raw_data = audioop.byteswap(raw_data, sample_width) + else: # manually reverse the bytes of each sample, which is slower but works well enough as a fallback + raw_data = raw_data[sample_width - 1 :: -1] + b"".join( + raw_data[i + sample_width : i : -1] + for i in range(sample_width - 1, len(raw_data), sample_width) + ) + + # generate the AIFF-C file contents + with io.BytesIO() as aiff_file: + aiff_writer = aifc.open(aiff_file, "wb") + try: # note that we can't use context manager, since that was only added in Python 3.4 + aiff_writer.setframerate(sample_rate) + aiff_writer.setsampwidth(sample_width) + aiff_writer.setnchannels(1) + aiff_writer.writeframes(raw_data) + aiff_data = aiff_file.getvalue() + finally: # make sure resources are cleaned up + aiff_writer.close() + return aiff_data + + def get_flac_data(self, convert_rate=None, convert_width=None): + """ + Returns a byte string representing the contents of a FLAC file containing the audio represented by the ``AudioData`` instance. + + Note that 32-bit FLAC is not supported. If the audio data is 32-bit and ``convert_width`` is not specified, then the resulting FLAC will be a 24-bit FLAC. + + If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match. + + If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match. 
+ + Writing these bytes directly to a file results in a valid `FLAC file `__. + """ + assert convert_width is None or ( + convert_width % 1 == 0 and 1 <= convert_width <= 3 + ), "Sample width to convert to must be between 1 and 3 inclusive" + + if ( + self.sample_width > 3 and convert_width is None + ): # resulting WAV data would be 32-bit, which is not convertable to FLAC using our encoder + convert_width = 3 # the largest supported sample width is 24-bit, so we'll limit the sample width to that + + # run the FLAC converter with the WAV data to get the FLAC data + wav_data = self.get_wav_data(convert_rate, convert_width) + flac_converter = get_flac_converter() + if ( + os.name == "nt" + ): # on Windows, specify that the process is to be started without showing a console window + startup_info = subprocess.STARTUPINFO() + startup_info.dwFlags |= ( + subprocess.STARTF_USESHOWWINDOW + ) # specify that the wShowWindow field of `startup_info` contains a value + startup_info.wShowWindow = ( + subprocess.SW_HIDE + ) # specify that the console window should be hidden + else: + startup_info = None # default startupinfo + process = subprocess.Popen( + [ + flac_converter, + "--stdout", + "--totally-silent", # put the resulting FLAC file in stdout, and make sure it's not mixed with any program output + "--best", # highest level of compression available + "-", # the input FLAC file contents will be given in stdin + ], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + startupinfo=startup_info, + ) + flac_data, stderr = process.communicate(wav_data) + return flac_data + + +def get_flac_converter(): + """Returns the absolute path of a FLAC converter executable, or raises an OSError if none can be found.""" + flac_converter = shutil_which("flac") # check for installed version first + if flac_converter is None: # flac utility is not installed + base_path = os.path.dirname( + os.path.abspath(__file__) + ) # directory of the current module file, where all the FLAC bundled binaries 
are stored + system, machine = platform.system(), platform.machine() + if system == "Windows" and machine in { + "i686", + "i786", + "x86", + "x86_64", + "AMD64", + }: + flac_converter = os.path.join(base_path, "flac-win32.exe") + elif system == "Darwin" and machine in { + "i686", + "i786", + "x86", + "x86_64", + "AMD64", + }: + flac_converter = os.path.join(base_path, "flac-mac") + elif system == "Linux" and machine in {"i686", "i786", "x86"}: + flac_converter = os.path.join(base_path, "flac-linux-x86") + elif system == "Linux" and machine in {"x86_64", "AMD64"}: + flac_converter = os.path.join(base_path, "flac-linux-x86_64") + else: # no FLAC converter available + raise OSError( + "FLAC conversion utility not available - consider installing the FLAC command line application by running `apt-get install flac` or your operating system's equivalent" + ) + + # mark FLAC converter as executable if possible + try: + # handle known issue when running on docker: + # run executable right after chmod() may result in OSError "Text file busy" + # fix: flush FS with sync + if not os.access(flac_converter, os.X_OK): + stat_info = os.stat(flac_converter) + os.chmod(flac_converter, stat_info.st_mode | stat.S_IEXEC) + if "Linux" in platform.system(): + os.sync() if sys.version_info >= (3, 3) else os.system("sync") + + except OSError: + pass + + return flac_converter + + +def shutil_which(pgm): + """Python 2 compatibility: backport of ``shutil.which()`` from Python 3""" + path = os.getenv("PATH") + for p in path.split(os.path.pathsep): + p = os.path.join(p, pgm) + if os.path.exists(p) and os.access(p, os.X_OK): + return p diff --git a/speech_recognition/exceptions.py b/speech_recognition/exceptions.py new file mode 100644 index 00000000..3e208a12 --- /dev/null +++ b/speech_recognition/exceptions.py @@ -0,0 +1,22 @@ +class SetupError(Exception): + pass + + +class WaitTimeoutError(Exception): + pass + + +class RequestError(Exception): + pass + + +class UnknownValueError(Exception): + 
pass + + +class TranscriptionNotReady(Exception): + pass + + +class TranscriptionFailed(Exception): + pass diff --git a/speech_recognition/recognizers/__init__.py b/speech_recognition/recognizers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/speech_recognition/recognizers/whisper.py b/speech_recognition/recognizers/whisper.py new file mode 100644 index 00000000..505c60ac --- /dev/null +++ b/speech_recognition/recognizers/whisper.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +import os +from io import BytesIO + +from speech_recognition.audio import AudioData +from speech_recognition.exceptions import SetupError + + +def recognize_whisper_api( + recognizer, + audio_data: "AudioData", + *, + model: str = "whisper-1", + api_key: str | None = None, +): + """ + Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the OpenAI Whisper API. + + This function requires an OpenAI account; visit https://platform.openai.com/signup, then generate API Key in `User settings `__. + + Detail: https://platform.openai.com/docs/guides/speech-to-text + + Raises a ``speech_recognition.exceptions.SetupError`` exception if there are any issues with the openai installation, or the environment variable is missing. + """ + if not isinstance(audio_data, AudioData): + raise ValueError("``audio_data`` must be an ``AudioData`` instance") + if api_key is None and os.environ.get("OPENAI_API_KEY") is None: + raise SetupError("Set environment variable ``OPENAI_API_KEY``") + + try: + import openai + except ImportError: + raise SetupError( + "missing openai module: ensure that openai is set up correctly." 
+ ) + + wav_data = BytesIO(audio_data.get_wav_data()) + wav_data.name = "SpeechRecognition_audio.wav" + + transcript = openai.Audio.transcribe(model, wav_data, api_key=api_key) + return transcript["text"] diff --git a/tests/test_recognition.py b/tests/test_recognition.py index 5759d657..a4e5f4a0 100644 --- a/tests/test_recognition.py +++ b/tests/test_recognition.py @@ -85,7 +85,7 @@ def test_ibm_chinese(self): def test_whisper_english(self): r = sr.Recognizer() with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source) - self.assertEqual(r.recognize_whisper(audio, language="english", **self.WHISPER_CONFIG), " 1, 2, 3") + self.assertEqual(r.recognize_whisper(audio, language="english", **self.WHISPER_CONFIG), " 1, 2, 3.") def test_whisper_french(self): r = sr.Recognizer()