Merge pull request #2853 from MycroftAI/feature/consolidate_tts_cache

Consolidate TTS cache logic
MycroftAI · Mar 11, 2021 · bf2670c · bf2670c
2 parents a399b19 + ea013b2
commit bf2670c
Show file tree

Hide file tree

Showing 10 changed files with 488 additions and 54 deletions.
diff --git a/mycroft/audio/speech.py b/mycroft/audio/speech.py
@@ -130,8 +130,8 @@ def mute_and_speak(utterance, ident, listen=False):
     except RemoteTTSException as e:
         LOG.error(e)
         mimic_fallback_tts(utterance, ident, listen)
-    except Exception as e:
-        LOG.error('TTS execution failed ({})'.format(repr(e)))
+    except Exception:
+        LOG.exception('TTS execution failed.')
 
 
 def _get_mimic_fallback():

diff --git a/mycroft/tts/cache.py b/mycroft/tts/cache.py
@@ -0,0 +1,295 @@
+# Copyright 2021 Mycroft AI Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TTS cache maintenance.
+
+There are two types of cache available to a TTS engine.  Both are composed of
+audio and phoneme files.  TTS engines can use the cache to improve performance
+by not performing inference on sentences in the cache.
+
+The first type of cache is a persistent cache.  The cache is considered
+persistent because the files are stored in a location that is not cleared on
+reboot.  TTS inference on these sentences should only need to occur once.  The
+persistent cache contains commonly spoken sentences.
+
+The second cache type is a temporary cache stored in the /tmp directory,
+which is cleared when a device is rebooted.  Sentences are added to this cache
+on the fly every time a TTS engine returns audio for a sentence that is not
+already cached.
+"""
+import base64
+import hashlib
+import json
+import re
+from pathlib import Path
+from typing import List, Set, Tuple
+from urllib import parse
+
+import requests
+
+from mycroft.util.file_utils import (
+    ensure_directory_exists, get_cache_directory
+)
+from mycroft.util.log import LOG
+
+
+def _get_mimic2_audio(sentence: str, url: str) -> Tuple[bytes, str]:
+    """Use the Mimic2 API to retrieve the audio for a sentence.
+
+    Arguments:
+        sentence: The sentence to be cached
+    """
+    LOG.debug("Retrieving Mimic2 audio for sentence \"{}\'".format(sentence))
+    mimic2_url = url + parse.quote(sentence) + '&visimes=True'
+    response = requests.get(mimic2_url)
+    response_data = response.json()
+    audio = base64.b64decode(response_data["audio_base64"])
+    phonemes = response_data["visimes"]
+
+    return audio, phonemes
+
+
+def hash_sentence(sentence: str):
+    """Convert the sentence into a hash value used for the file name
+
+    Arguments:
+        sentence: The sentence to be cached
+    """
+    encoded_sentence = sentence.encode("utf-8", "ignore")
+    sentence_hash = hashlib.md5(encoded_sentence).hexdigest()
+
+    return sentence_hash
+
+
+class AudioFile:
+    def __init__(self, cache_dir: Path, sentence_hash: str, file_type: str):
+        self.name = f"{sentence_hash}.{file_type}"
+        self.path = cache_dir.joinpath(self.name)
+
+    def save(self, audio: bytes):
+        """Write a TTS cache file containing the audio to be spoken.
+
+        Arguments:
+            audio: TTS inference of a sentence
+        """
+        try:
+            with open(self.path, "wb") as audio_file:
+                audio_file.write(audio)
+        except Exception:
+            LOG.exception("Failed to write {} to cache".format(self.name))
+
+
+class PhonemeFile:
+    def __init__(self, cache_dir: Path, sentence_hash: str):
+        self.name = f"{sentence_hash}.pho"
+        self.path = cache_dir.joinpath(self.name)
+
+    def load(self) -> str:
+        """Load phonemes from cache file."""
+        phonemes = None
+        if self.path.exists():
+            try:
+                with open(self.path) as phoneme_file:
+                    phonemes = phoneme_file.read().strip()
+            except Exception:
+                LOG.exception("Failed to read phoneme from cache")
+
+        return phonemes
+
+    def save(self, phonemes):
+        """Write a TTS cache file containing the phoneme to be displayed.
+
+        Arguments:
+            phonemes: instructions for how to make the mouth on a device move
+        """
+        if type(phonemes) == str:
+            rec = phonemes
+        else:
+            rec = json.dumps(phonemes)
+        try:
+            with open(self.path, "w") as phoneme_file:
+                phoneme_file.write(rec)
+        except Exception:
+            LOG.exception("Failed to write {} to cache".format(self.name))
+
+
+class TextToSpeechCache:
+    """Class for all persistent and temporary caching operations."""
+    def __init__(self, tts_config, tts_name, audio_file_type):
+        self.config = tts_config
+        self.tts_name = tts_name
+        if "preloaded_cache" in self.config:
+            self.persistent_cache_dir = Path(self.config["preloaded_cache"])
+        else:
+            self.persistent_cache_dir = None
+        self.temporary_cache_dir = Path(
+            get_cache_directory("tts/" + tts_name)
+        )
+        self.audio_file_type = audio_file_type
+        self.resource_dir = Path(__file__).parent.parent.joinpath("res")
+        self.cached_sentences = dict()
+        ensure_directory_exists(
+            str(self.persistent_cache_dir), permissions=0o755
+        )
+        ensure_directory_exists(
+            str(self.temporary_cache_dir), permissions=0o755
+        )
+
+    def load_persistent_cache(self):
+        """Load the contents of dialog files to the persistent cache directory.
+
+        Parse the dialog files in the resource directory into sentences.  Then
+        add the audio for each sentence to the cache directory.
+
+        NOTE: There may be files pre-loaded in the persistent cache directory
+        prior to run time, such as pre-recorded audio files.  This will add
+        files that do not already exist.
+
+        ANOTHER NOTE:  Mimic2 is the only TTS engine that supports this.  This
+        logic will need to change if another TTS engine implements it.
+        """
+        if self.persistent_cache_dir is not None:
+            LOG.info("Adding dialog resources to persistent TTS cache...")
+            self._load_existing_audio_files()
+            self._load_existing_phoneme_files()
+            dialogs = self._collect_dialogs()
+            sentences = self._parse_dialogs(dialogs)
+            for sentence in sentences:
+                self._load_sentence(sentence)
+            LOG.info("Persistent TTS cache files added successfully.")
+
+    def _load_existing_audio_files(self):
+        """Find the TTS audio files already in the persistent cache."""
+        glob_pattern = "*." + self.audio_file_type
+        for file_path in self.persistent_cache_dir.glob(glob_pattern):
+            sentence_hash = file_path.name.split(".")[0]
+            audio_file = AudioFile(
+                self.persistent_cache_dir, sentence_hash, self.audio_file_type
+            )
+            self.cached_sentences[sentence_hash] = audio_file, None
+
+    def _load_existing_phoneme_files(self):
+        """Find the TTS phoneme files already in the persistent cache.
+
+        A phoneme file is no good without an audio file to pair it with.  If
+        no audio file matches, do not load the phoneme.
+        """
+        for file_path in self.persistent_cache_dir.glob("*.pho"):
+            sentence_hash = file_path.name.split(".")[0]
+            cached_sentence = self.cached_sentences.get(sentence_hash)
+            if cached_sentence is not None:
+                audio_file = cached_sentence[0]
+                phoneme_file = PhonemeFile(
+                    self.persistent_cache_dir, sentence_hash
+                )
+                self.cached_sentences[sentence_hash] = audio_file, phoneme_file
+
+    def _collect_dialogs(self) -> List:
+        """Build a list of the lines found in the dialog files.
+
+        The lines are read from the *.dialog files present in
+        mycroft/res/text/en-us; duplicates are not removed here.
+        """
+        dialogs = []
+        dialog_directory = Path(self.resource_dir, "text", "en-us")
+        for dialog_file_path in dialog_directory.glob("*.dialog"):
+            with open(dialog_file_path) as dialog_file:
+                for dialog in dialog_file.readlines():
+                    dialogs.append(dialog.strip())
+
+        return dialogs
+
+    @staticmethod
+    def _parse_dialogs(dialogs: List[str]) -> Set[str]:
+        """Split each dialog in the resources directory into sentences.
+
+        Do not consider sentences with special characters other than
+        punctuation
+            example : <<< LOADING <<<
+
+        Arguments:
+            dialogs: a list of the records in the dialog resource files
+        """
+        sentences = set()
+        dialog_split_regex = r"(?<=\.|\;|\?)\s"
+        special_characters_regex = re.compile(r"[@#$%^*()<>/|}{~:]")
+        for dialog in dialogs:
+            dialog_sentences = re.split(dialog_split_regex, dialog)
+            for sentence in dialog_sentences:
+                match = special_characters_regex.search(sentence)
+                if match is None:
+                    sentences.add(sentence)
+
+        return sentences
+
+    def _load_sentence(self, sentence: str):
+        """Build audio and phoneme files for each sentence to be cached.
+
+        Perform TTS inference on sentences parsed from dialog files.  Store
+        the results in the persistent cache directory.
+
+        ASSUMPTION: The only TTS that supports persistent cache right now is
+        Mimic2.  This method assumes a call to the Mimic2 API.  If other TTS
+        engines want to take advantage of the persistent cache, this logic
+        will need to be more dynamic.
+        """
+        sentence_hash = hash_sentence(sentence)
+        if sentence_hash not in self.cached_sentences:
+            LOG.info("Adding \"{}\" to cache".format(sentence))
+            try:
+                mimic2_url = self.config["url"]
+                audio, phonemes = _get_mimic2_audio(sentence, mimic2_url)
+            except Exception:
+                log_msg = "Failed to get audio for sentence \"{}\""
+                LOG.exception(log_msg.format(sentence))
+            else:
+                self._add_to_persistent_cache(sentence_hash, audio, phonemes)
+
+    def _add_to_persistent_cache(
+            self, sentence_hash: str, audio: bytes, phonemes: str
+    ):
+        """Add an audio/phoneme file pair to the persistent cache."""
+        audio_file = AudioFile(
+            self.persistent_cache_dir, sentence_hash, self.audio_file_type
+        )
+        audio_file.save(audio)
+        if phonemes is None:
+            phoneme_file = None
+        else:
+            phoneme_file = PhonemeFile(
+                self.persistent_cache_dir, sentence_hash
+            )
+            phoneme_file.save(phonemes)
+        self.cached_sentences[sentence_hash] = audio_file, phoneme_file
+
+    def clear(self):
+        """Remove all files from the temporary cache."""
+        for cache_file_path in self.temporary_cache_dir.iterdir():
+            if cache_file_path.is_dir():
+                for sub_path in cache_file_path.iterdir():
+                    if sub_path.is_file():
+                        sub_path.unlink()
+            elif cache_file_path.is_file():
+                cache_file_path.unlink()
+
+    def define_audio_file(self, sentence_hash: str) -> AudioFile:
+        """Build an instance of an object representing an audio file."""
+        audio_file = AudioFile(
+            self.temporary_cache_dir, sentence_hash, self.audio_file_type
+        )
+        return audio_file
+
+    def define_phoneme_file(self, sentence_hash: str) -> PhonemeFile:
+        """Build an instance of an object representing a phoneme file."""
+        phoneme_file = PhonemeFile(self.temporary_cache_dir, sentence_hash)
+        return phoneme_file
diff --git a/mycroft/tts/cache_handler.py b/mycroft/tts/cache_handler.py
@@ -17,7 +17,12 @@
 mycroft responses) and does a tts inference.
 It then saves the .wav files to mark1 device
 
+* * * *   D E P R E C A T E D   * * * *
+THIS MODULE IS DEPRECATED IN FAVOR OF tts/cache.py. IT WILL BE REMOVED
+IN THE NEXT MAJOR RELEASE, 21.08
+
 """
+# TODO: remove in 21.08
 
 import base64
 import glob
@@ -70,14 +75,15 @@ def generate_cache_text(cache_audio_dir, cache_text_file):
                 if os.path.exists(each_path):
                     write_cache_text(each_path, text_file)
             text_file.close()
-            LOG.debug("Completed generating cache")
+            LOG.info("Completed generating cache")
         else:
-            LOG.debug("Cache file 'cache_text.txt' already exists")
+            LOG.info("Cache file 'cache_text.txt' already exists")
     except Exception:
         LOG.exception("Could not open text file to write cache")
 
 
 def write_cache_text(cache_path, f):
+    # TODO: remove in 21.08
     for file in glob.glob(cache_path + "/*.dialog"):
         try:
             with open(file, 'r') as fp:
@@ -109,6 +115,7 @@ def download_audio(cache_audio_dir, cache_text_file):
         cache_audio_dir (path): path to store .wav files
         cache_text_file (file): file containing the sentences
     """
+    # TODO: remove in 21.08
     if os.path.isfile(cache_text_file) and \
             os.path.exists(cache_audio_dir):
         if not os.listdir(cache_audio_dir):
@@ -137,16 +144,14 @@ def download_audio(cache_audio_dir, cache_text_file):
                             with open(pho_file, "w") as cachefile:
                                 cachefile.write(json.dumps(vis))  # Mimic2
                                 # cachefile.write(str(vis))  # Mimic
-                    except Exception as e:
+                    except Exception:
                         # Skip this dialog and continue
-                        LOG.error("Unable to get pre-loaded cache "
-                                  "due to ({})".format(repr(e)))
+                        LOG.exception("Unable to get pre-loaded cache")
 
-            LOG.debug("Completed getting cache for {}".format(TTS))
+            LOG.info("Completed getting cache for {}".format(TTS))
 
         else:
-            LOG.debug("Pre-loaded cache for {} already exists".
-                      format(TTS))
+            LOG.info("Pre-loaded cache for {} already exists".format(TTS))
     else:
         missing_path = cache_text_file if not \
             os.path.isfile(cache_text_file)\
@@ -163,21 +168,23 @@ def copy_cache(cache_audio_dir):
     Args:
         cache_audio_dir (path): path containing .wav files
     """
+    # TODO: remove in 21.08
     if os.path.exists(cache_audio_dir):
         # get tmp directory where tts cache is stored
         dest = util.get_cache_directory('tts/' + 'Mimic2')
         files = os.listdir(cache_audio_dir)
         for f in files:
             shutil.copy2(os.path.join(cache_audio_dir, f), dest)
-        LOG.debug("Copied all pre-loaded cache for {} to {}"
-                  .format(TTS, dest))
+        LOG.info(
+            "Copied all pre-loaded cache for {} to {}".format(TTS, dest))
     else:
-        LOG.debug("No Source directory for {} pre-loaded cache"
-                  .format(TTS))
+        LOG.info(
+            "No Source directory for {} pre-loaded cache".format(TTS))
 
 
 # Start here
 def main(cache_audio_dir):
+    # TODO: remove in 21.08
     # Path where cache is stored and not cleared on reboot/TTS change
     if cache_audio_dir:
         if not os.path.exists(cache_audio_dir):