Skip to content
This repository has been archived by the owner on Sep 8, 2024. It is now read-only.

Consolidate TTS cache logic #2853

Merged
merged 9 commits into from
Mar 11, 2021
4 changes: 2 additions & 2 deletions mycroft/audio/speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,8 @@ def mute_and_speak(utterance, ident, listen=False):
except RemoteTTSException as e:
LOG.error(e)
mimic_fallback_tts(utterance, ident, listen)
except Exception as e:
LOG.error('TTS execution failed ({})'.format(repr(e)))
except Exception:
LOG.exception('TTS execution failed.')


def _get_mimic_fallback():
Expand Down
295 changes: 295 additions & 0 deletions mycroft/tts/cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,295 @@
# Copyright 2021 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TTS cache maintenance.

There are two types of cache available to a TTS engine. Both consist of
audio and phoneme files. TTS engines can use the cache to improve performance
by not performing inference on sentences in the cache.

The first type of cache is a persistent cache. The cache is considered
persistent because the files are stored in a location that is not cleared on
reboot. TTS inference on these sentences should only need to occur once. The
persistent cache contains commonly spoken sentences.

The second cache type is a temporary cache stored in the /tmp directory,
which is cleared when a device is rebooted. Sentences are added to this cache
on the fly every time a TTS engine returns audio for a sentence that is not
already cached.
"""
import base64
import hashlib
import json
import re
from pathlib import Path
from typing import List, Set, Tuple
from urllib import parse

import requests

from mycroft.util.file_utils import (
ensure_directory_exists, get_cache_directory
)
from mycroft.util.log import LOG


def _get_mimic2_audio(sentence: str, url: str) -> Tuple[bytes, str]:
    """Use the Mimic2 API to retrieve the audio for a sentence.

    Arguments:
        sentence: The sentence to be cached
        url: Mimic2 API URL; the URL-quoted sentence is appended directly
            to it, followed by the "&visimes=True" query argument

    Returns:
        The base64-decoded audio and the "visimes" value of the API response

    Raises:
        requests.RequestException: the request failed, timed out or the API
            answered with an HTTP error status
        ValueError: the response body is not valid JSON
        KeyError: the response lacks the "audio_base64" or "visimes" field
    """
    # Without a timeout a stalled connection would block the caller forever.
    request_timeout_seconds = 60
    LOG.debug('Retrieving Mimic2 audio for sentence "{}"'.format(sentence))
    mimic2_url = url + parse.quote(sentence) + '&visimes=True'
    response = requests.get(mimic2_url, timeout=request_timeout_seconds)
    # Fail with a clear HTTP error instead of an opaque JSON/KeyError later.
    response.raise_for_status()
    response_data = response.json()
    audio = base64.b64decode(response_data["audio_base64"])
    phonemes = response_data["visimes"]

    return audio, phonemes


def hash_sentence(sentence: str):
    """Return the hexadecimal MD5 digest of a sentence.

    The sentence is encoded as UTF-8 (characters that cannot be encoded are
    dropped) before hashing. The digest is used as the cache file name.

    Arguments:
        sentence: The sentence to be cached
    """
    digest = hashlib.md5()
    digest.update(sentence.encode("utf-8", "ignore"))

    return digest.hexdigest()


class AudioFile:
    """An audio file in a TTS cache directory.

    The file name is the sentence hash followed by the audio file type as
    its extension.
    """

    def __init__(self, cache_dir: Path, sentence_hash: str, file_type: str):
        """Derive the file name and its full path within the cache.

        Arguments:
            cache_dir: directory the audio file is stored in
            sentence_hash: hash of the sentence, used as the base file name
            file_type: extension of the audio file (without the dot)
        """
        file_name = f"{sentence_hash}.{file_type}"
        self.name = file_name
        self.path = cache_dir.joinpath(file_name)

    def save(self, audio: bytes):
        """Store the spoken audio of a sentence in the cache file.

        Failures are logged rather than raised, so a cache write problem
        does not interrupt the caller.

        Arguments:
            audio: TTS inference of a sentence
        """
        try:
            with self.path.open("wb") as cache_file:
                cache_file.write(audio)
        except Exception:
            LOG.exception("Failed to write {} to cache".format(self.name))


class PhonemeFile:
    """A phoneme file in a TTS cache directory.

    The file name is the sentence hash with a ".pho" extension.
    """

    def __init__(self, cache_dir: Path, sentence_hash: str):
        """Derive the file name and its full path within the cache.

        Arguments:
            cache_dir: directory the phoneme file is stored in
            sentence_hash: hash of the sentence, used as the base file name
        """
        self.name = f"{sentence_hash}.pho"
        self.path = cache_dir.joinpath(self.name)

    def load(self) -> str:
        """Load phonemes from cache file.

        Returns:
            The file content with surrounding whitespace stripped, or None
            when the file does not exist or cannot be read (read failures
            are logged, not raised).
        """
        phonemes = None
        if self.path.exists():
            try:
                with open(self.path) as phoneme_file:
                    phonemes = phoneme_file.read().strip()
            except Exception:
                LOG.exception("Failed to read phoneme from cache")

        return phonemes

    def save(self, phonemes):
        """Write a TTS cache file containing the phoneme to be displayed.

        Strings are written as they are; any other value is serialized to
        JSON first. Write failures are logged, not raised.

        Arguments:
            phonemes: instructions for how to make the mouth on a device move
        """
        # isinstance also accepts str subclasses, which an exact type
        # comparison would wrongly send through json.dumps (adding quotes).
        if isinstance(phonemes, str):
            rec = phonemes
        else:
            rec = json.dumps(phonemes)
        try:
            with open(self.path, "w") as phoneme_file:
                phoneme_file.write(rec)
        except Exception:
            LOG.exception("Failed to write {} to cache".format(self.name))


class TextToSpeechCache:
    """Class for all persistent and temporary caching operations."""

    def __init__(self, tts_config, tts_name, audio_file_type):
        """Determine the cache directories and make sure they exist.

        Arguments:
            tts_config: configuration of the TTS engine; the optional
                "preloaded_cache" value is the persistent cache directory
                and "url" is read when the persistent cache is loaded
            tts_name: name of the TTS engine, used to build the temporary
                cache directory
            audio_file_type: extension of the cached audio files
        """
        self.config = tts_config
        self.tts_name = tts_name
        if "preloaded_cache" in self.config:
            self.persistent_cache_dir = Path(self.config["preloaded_cache"])
        else:
            self.persistent_cache_dir = None
        self.temporary_cache_dir = Path(
            get_cache_directory("tts/" + tts_name)
        )
        self.audio_file_type = audio_file_type
        self.resource_dir = Path(__file__).parent.parent.joinpath("res")
        # Maps a sentence hash to an (audio file, phoneme file) pair; the
        # phoneme file is None when the sentence has no phonemes.
        self.cached_sentences = dict()
        # Only create the persistent directory when one is configured;
        # str(None) would otherwise create a directory named "None".
        if self.persistent_cache_dir is not None:
            ensure_directory_exists(
                str(self.persistent_cache_dir), permissions=0o755
            )
        ensure_directory_exists(
            str(self.temporary_cache_dir), permissions=0o755
        )

    def load_persistent_cache(self):
        """Load the contents of dialog files to the persistent cache directory.

        Parse the dialog files in the resource directory into sentences.  Then
        add the audio for each sentence to the cache directory.  Nothing is
        done when no persistent cache directory is configured.

        NOTE: There may be files pre-loaded in the persistent cache directory
        prior to run time, such as pre-recorded audio files.  This will add
        files that do not already exist.

        ANOTHER NOTE: Mimic2 is the only TTS engine that supports this.  This
        logic will need to change if another TTS engine implements it.
        """
        if self.persistent_cache_dir is not None:
            LOG.info("Adding dialog resources to persistent TTS cache...")
            self._load_existing_audio_files()
            self._load_existing_phoneme_files()
            dialogs = self._collect_dialogs()
            sentences = self._parse_dialogs(dialogs)
            for sentence in sentences:
                self._load_sentence(sentence)
            LOG.info("Persistent TTS cache files added successfully.")

    def _load_existing_audio_files(self):
        """Find the TTS audio files already in the persistent cache."""
        glob_pattern = "*." + self.audio_file_type
        for file_path in self.persistent_cache_dir.glob(glob_pattern):
            # The file name up to the first dot is the sentence hash.
            sentence_hash = file_path.name.split(".")[0]
            audio_file = AudioFile(
                self.persistent_cache_dir, sentence_hash, self.audio_file_type
            )
            self.cached_sentences[sentence_hash] = audio_file, None

    def _load_existing_phoneme_files(self):
        """Find the TTS phoneme files already in the persistent cache.

        A phoneme file is no good without an audio file to pair it with.  If
        no audio file matches, do not load the phoneme.
        """
        for file_path in self.persistent_cache_dir.glob("*.pho"):
            sentence_hash = file_path.name.split(".")[0]
            cached_sentence = self.cached_sentences.get(sentence_hash)
            if cached_sentence is not None:
                audio_file = cached_sentence[0]
                phoneme_file = PhonemeFile(
                    self.persistent_cache_dir, sentence_hash
                )
                self.cached_sentences[sentence_hash] = (
                    audio_file, phoneme_file
                )

    def _collect_dialogs(self) -> List:
        """Read every line of the dialog files in the resource directory.

        The lines are read from the *.dialog files present in
        mycroft/res/text/en-us.

        Returns:
            The lines of all dialog files, stripped of surrounding
            whitespace.  Blank lines are kept as empty strings.
        """
        dialogs = []
        dialog_directory = Path(self.resource_dir, "text", "en-us")
        for dialog_file_path in dialog_directory.glob("*.dialog"):
            with open(dialog_file_path) as dialog_file:
                for dialog in dialog_file:
                    dialogs.append(dialog.strip())

        return dialogs

    @staticmethod
    def _parse_dialogs(dialogs: List[str]) -> Set[str]:
        """Split each dialog in the resources directory into sentences.

        Do not consider sentences with special characters other than
        punctuation
            example : <<< LOADING <<<

        Empty and whitespace-only sentences are skipped as well.

        Arguments:
            dialogs: a list of the records in the dialog resource files

        Returns:
            The set of unique sentences found in the dialogs
        """
        sentences = set()
        # Split on whitespace that follows a period, semicolon or
        # question mark.
        dialog_split_regex = re.compile(r"(?<=\.|\;|\?)\s")
        special_characters_regex = re.compile(r"[@#$%^*()<>/|}{~:]")
        for dialog in dialogs:
            for sentence in dialog_split_regex.split(dialog):
                if not sentence.strip():
                    # Blank dialog lines produce empty sentences; there is
                    # nothing to synthesize for them.
                    continue
                if special_characters_regex.search(sentence) is None:
                    sentences.add(sentence)

        return sentences

    def _load_sentence(self, sentence: str):
        """Build audio and phoneme files for each sentence to be cached.

        Perform TTS inference on sentences parsed from dialog files.  Store
        the results in the persistent cache directory.  Sentences that are
        already cached are skipped; a failed inference is logged and the
        sentence is left out of the cache.

        ASSUMPTION: The only TTS that supports persistent cache right now is
        Mimic2.  This method assumes a call to the Mimic2 API.  If other TTS
        engines want to take advantage of the persistent cache, this logic
        will need to be more dynamic.

        Arguments:
            sentence: The sentence to be cached
        """
        sentence_hash = hash_sentence(sentence)
        if sentence_hash not in self.cached_sentences:
            LOG.info("Adding \"{}\" to cache".format(sentence))
            try:
                mimic2_url = self.config["url"]
                audio, phonemes = _get_mimic2_audio(sentence, mimic2_url)
            except Exception:
                log_msg = "Failed to get audio for sentence \"{}\""
                LOG.exception(log_msg.format(sentence))
            else:
                self._add_to_persistent_cache(sentence_hash, audio, phonemes)

    def _add_to_persistent_cache(
            self, sentence_hash: str, audio: bytes, phonemes: str
    ):
        """Add a audio/phoneme file pair to the persistent cache.

        Arguments:
            sentence_hash: hash of the sentence, used as the base file name
            audio: TTS inference of the sentence
            phonemes: phonemes of the sentence, or None if there are none
        """
        audio_file = AudioFile(
            self.persistent_cache_dir, sentence_hash, self.audio_file_type
        )
        audio_file.save(audio)
        if phonemes is None:
            phoneme_file = None
        else:
            phoneme_file = PhonemeFile(
                self.persistent_cache_dir, sentence_hash
            )
            phoneme_file.save(phonemes)
        self.cached_sentences[sentence_hash] = audio_file, phoneme_file

    def clear(self):
        """Remove all files from the temporary cache.

        Files directly in the temporary cache directory and files in its
        immediate sub-directories are deleted.  The sub-directories
        themselves are left in place.
        """
        for cache_file_path in self.temporary_cache_dir.iterdir():
            if cache_file_path.is_dir():
                for sub_path in cache_file_path.iterdir():
                    if sub_path.is_file():
                        sub_path.unlink()
            elif cache_file_path.is_file():
                cache_file_path.unlink()

    def define_audio_file(self, sentence_hash: str) -> AudioFile:
        """Build an instance of an object representing an audio file.

        The file is located in the temporary cache directory.
        """
        audio_file = AudioFile(
            self.temporary_cache_dir, sentence_hash, self.audio_file_type
        )
        return audio_file

    def define_phoneme_file(self, sentence_hash: str) -> PhonemeFile:
        """Build an instance of an object representing an phoneme file.

        The file is located in the temporary cache directory.
        """
        phoneme_file = PhonemeFile(self.temporary_cache_dir, sentence_hash)
        return phoneme_file
31 changes: 19 additions & 12 deletions mycroft/tts/cache_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,12 @@
mycroft responses) and does a tts inference.
It then saves the .wav files to mark1 device

* * * * D E P R E C A T E D * * * *
THIS MODULE IS DEPRECATED IN FAVOR OF tts/cache.py. IT WILL BE REMOVED
IN THE NEXT MAJOR RELEASE, 21.08

"""
# TODO: remove in 21.08

import base64
import glob
Expand Down Expand Up @@ -70,14 +75,15 @@ def generate_cache_text(cache_audio_dir, cache_text_file):
if os.path.exists(each_path):
write_cache_text(each_path, text_file)
text_file.close()
LOG.debug("Completed generating cache")
LOG.info("Completed generating cache")
else:
LOG.debug("Cache file 'cache_text.txt' already exists")
LOG.info("Cache file 'cache_text.txt' already exists")
except Exception:
LOG.exception("Could not open text file to write cache")


def write_cache_text(cache_path, f):
# TODO: remove in 21.08
for file in glob.glob(cache_path + "/*.dialog"):
try:
with open(file, 'r') as fp:
Expand Down Expand Up @@ -109,6 +115,7 @@ def download_audio(cache_audio_dir, cache_text_file):
cache_audio_dir (path): path to store .wav files
cache_text_file (file): file containing the sentences
"""
# TODO: remove in 21.08
if os.path.isfile(cache_text_file) and \
os.path.exists(cache_audio_dir):
if not os.listdir(cache_audio_dir):
Expand Down Expand Up @@ -137,16 +144,14 @@ def download_audio(cache_audio_dir, cache_text_file):
with open(pho_file, "w") as cachefile:
cachefile.write(json.dumps(vis)) # Mimic2
# cachefile.write(str(vis)) # Mimic
except Exception as e:
except Exception:
# Skip this dialog and continue
LOG.error("Unable to get pre-loaded cache "
"due to ({})".format(repr(e)))
LOG.exception("Unable to get pre-loaded cache")

LOG.debug("Completed getting cache for {}".format(TTS))
LOG.info("Completed getting cache for {}".format(TTS))

else:
LOG.debug("Pre-loaded cache for {} already exists".
format(TTS))
LOG.info("Pre-loaded cache for {} already exists".format(TTS))
else:
missing_path = cache_text_file if not \
os.path.isfile(cache_text_file)\
Expand All @@ -163,21 +168,23 @@ def copy_cache(cache_audio_dir):
Args:
cache_audio_dir (path): path containing .wav files
"""
# TODO: remove in 21.08
if os.path.exists(cache_audio_dir):
# get tmp directory where tts cache is stored
dest = util.get_cache_directory('tts/' + 'Mimic2')
files = os.listdir(cache_audio_dir)
for f in files:
shutil.copy2(os.path.join(cache_audio_dir, f), dest)
LOG.debug("Copied all pre-loaded cache for {} to {}"
.format(TTS, dest))
LOG.info(
"Copied all pre-loaded cache for {} to {}".format(TTS, dest))
else:
LOG.debug("No Source directory for {} pre-loaded cache"
.format(TTS))
LOG.info(
"No Source directory for {} pre-loaded cache".format(TTS))


# Start here
def main(cache_audio_dir):
# TODO: remove in 21.08
# Path where cache is stored and not cleared on reboot/TTS change
if cache_audio_dir:
if not os.path.exists(cache_audio_dir):
Expand Down
Loading