teiaddphonetics

#!/usr/bin/env python3
# (c) 2020 Sebastian Humenda; Licence: GPL-3+
# pylint: disable=global-statement
"""Restrictions:

-   Escaping within elements could be "lost" since the parser considers &#39;
    and other characters not to be worth escaping.
-   Namespaces are ignored.

This script depends on eSpeak-NG and /usr/share/iso-codes. On Debian-based
systems, install the packages iso-codes and espeak-ng."""
import argparse
import json
import os
import re
import shutil
import subprocess
import sys
import xml.sax as sax
import xml.sax.saxutils

ISO_639_TABLE = "/usr/share/iso-codes/json/iso_639-3.json"
# We rely on espeak-ng to output a single line of output for a given input line.
# However, there seem to be characters that trigger additional newlines. So a
# long word list, with a phrase per line, must have a one-to-one mapping on its
# output. Therefore we replace those characters causing unexpected newlines.
CHARACTER_REPLACEMENTS = {
    '"',
    "(",
    ")",
    "{",
    "}",
    ",",
    "...",
    ".",
    ":",
    ";",
    "!",
    "?",
    "-",
    "…",
    "'",
    "[",
    "´",
    "&",
    "°",
    "$",
    "]",
    "®",
    "-",
    "̩",
    "£",
    "%",
    "\\",
    "§",
    "­",
    "/",
    "+",
    "‘",
    "̃",
    "²",
    "³",
    "\n",
    "\r",
    chr(0x60C),  # from ara-eng
    "−",
    "–",
    "—",
    "↑",
    "→",
    "←",
    "＠",
    "．",
    "〜",
    "×",
    "＋",
    "＆",
    "～",
    "↓",
    "、",
}
# Maps a language code to the voice name that eSpeak-NG expects. It is applied
# by espeakng_voice() after the ISO 639 lookup.
ESPEAKNG_LANG_CORRECTIONS = {
    "swh": "sw",  # eSpeak 1.49 uses "sw" that stands for "swa", while FD uses "swh"
}
# Path of the file to which EspeakNg.ipa() appends "headword :: transcription"
# lines, or None to disable this. Set by handle_cmdline() from --debug.
DEBUG = None

# pylint: disable=too-few-public-methods
class Orth:
    """Hold the data of one <orth> element.

    ``attrs`` keeps the attributes exactly as they were passed in and ``text``
    starts out as an empty list that takes the character data of the element.
    """

    def __init__(self, attrs):
        self.text, self.attrs = [], attrs


class Entry:
    """A (mostly) serialised <entry> element.

    The content is kept in the list ``xml``, which starts with the opening
    ``<entry ...>`` tag. Further items are either strings of serialised markup
    or Orth nodes. The class mimics an output stream through write().
    """

    def __init__(self, attrs):
        """Create the entry and serialise its start tag from ``attrs``.

        ``attrs`` is a mapping of attribute names to string values.
        """
        # The SAX parser hands over decoded attribute values, so they have to
        # be escaped again. quoteattr() escapes &, < and > and picks a quote
        # character that is safe for the value.
        attrs = "".join(
            f" {key}={sax.saxutils.quoteattr(value)}" for key, value in attrs.items()
        )
        self.xml = [f"<entry{attrs}>"]

    def write(self, item):
        """Append a string of serialised markup or an Orth node."""
        self.xml.append(item)

    def contains_pronunciation(self):
        """Return True if any string item contains the text "<pron"."""
        return any(isinstance(token, str) and "<pron" in token for token in self.xml)


class EspeakNg:
    """Wrapper around a long-running espeak-ng process that emits IPA.

    Headwords are written to the standard input of the process, one per line,
    and one line of output is read back for each of them.
    """

    # Number of transcriptions after which ipa() restarts the process.
    RESTART_AFTER = 10000

    def __init__(self, language):
        """Start espeak-ng with the given voice.

        Exits with status 5 if the espeak-ng executable cannot be found.
        """
        if not shutil.which("espeak-ng"):
            print("Espeak-ng not found, please install it and rerun the operation.")
            sys.exit(5)
        # stdbuf -o0 makes the output of espeak-ng unbuffered, so that each
        # transcription can be read as soon as the headword has been written.
        self.cmd = ("stdbuf", "-o0", "espeak-ng", "--ipa", "-v", language, "-q")
        self.proc = None
        self.headwords_transcribed = 0
        # Characters seen in headwords that are neither letters, digits nor
        # whitespace; reported by close() if the output got out of sync.
        self.non_alpha = set()
        self._start()

    def _start(self):
        """Spawn the espeak-ng process and reset the transcription counter."""
        self.proc = subprocess.Popen(
            self.cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            bufsize=0,
        )
        self.headwords_transcribed = 0

    def ipa(self, orth):
        """Transcribe the given headword and return the IPA string. It attempts to
        remove any punctuation because this can trigger line breaks in the
        eSpeak-NG output. This scripts relies on a single line of output per
        headword.

        ``orth`` is an object whose ``text`` attribute is a list of strings
        that are joined to form the headword."""
        if self.headwords_transcribed >= EspeakNg.RESTART_AFTER:
            self.reset()
        headword = "".join(orth.text)
        headword = self.escape(headword)
        self.non_alpha.update(
            c for c in headword if not (c.isalpha() or c.isdigit() or c.isspace())
        )
        self.proc.stdin.write(headword.encode(sys.getdefaultencoding()) + b"\n")
        self.proc.stdin.flush()
        buf = self.proc.stdout.readline()
        word = buf.decode(sys.getdefaultencoding()).strip()
        if DEBUG:
            # Close the file after each line instead of leaking one handle per
            # headword.
            with open(DEBUG, "a") as debug_file:
                debug_file.write(f"{headword} :: {word}\n")
        self.headwords_transcribed += 1
        return word

    def escape(self, word):
        """Escape any punctuation, see ipa()."""
        for ch in CHARACTER_REPLACEMENTS:
            word = word.replace(ch, " ")
        return word.replace("  ", " ")

    def close(self):
        """Terminate the process and verify that all of its output was read.

        Exits with status 9 if espeak-ng produced output that ipa() did not
        consume, because transcriptions and headwords are out of sync then.
        Raises OSError if espeak-ng finished with a non-zero exit code.
        """
        remainder = self.proc.communicate()[0]
        if remainder:
            import textwrap

            non_alpha = "".join(self.non_alpha)
            # splitlines() does not count the empty string after the final
            # newline as a line of its own.
            message = (
                f"Error: {len(remainder.splitlines())} lines found that were not "
                "recognised by this script during IPA transcription of "
                "eSpeak-NG. This is a bug due to special characters that lead "
                "to unexpected newlines in the eSpeak-NG output. Expected is "
                "a line per headword. Use --debug to save the transcriptions to a "
                "separate file."
            )
            print(
                "\n".join(textwrap.wrap(message))
                + f"\nPotentially problematic characters: {non_alpha}"
            )
            sys.exit(9)
        ret = self.proc.wait()
        if ret != 0:
            raise OSError(f"Espeak-ng gave non-zero exit code {ret}.")

    def reset(self):
        """Reset eSpeak-NG to avoid filling the OS stdin buffer."""
        self.close()
        self._start()


class TeiConsumer(sax.handler.ContentHandler):
    """A streaming SAX handler that copies the document to an output stream.

    The content of each <entry> is buffered. When the entry ends, it is written
    out and, unless the entry already contains a pronunciation, each <orth> is
    followed by a <pron> element with the transcription obtained from the IPA
    generator."""

    def __init__(self, file_handle, ipa_generator):
        """Set up the handler.

        ``file_handle`` is the stream that receives the output; it is closed in
        endDocument(). ``ipa_generator`` provides ipa(orth) and close().
        """
        # This is either None or the Orth node of the <orth> element that is
        # currently read. While it is set, character data is appended to the
        # text list of that node.
        self.current_orth = None
        self.outstream = file_handle
        # Keeps the real output stream while an Entry buffers an <entry>.
        self.outstream_backup = None
        super().__init__()
        self.ipa_generator = ipa_generator

    @staticmethod
    def _format_attrs(attrs):
        """Serialise attributes as a string of ` name="value"` pairs.

        The parser hands over decoded values, so quoteattr() is used to escape
        them again and to pick a quote character that is safe for the value.
        """
        return "".join(
            f" {key}={sax.saxutils.quoteattr(value)}" for key, value in attrs.items()
        )

    def endDocument(self):
        """Finish the output and close both the stream and the IPA generator."""
        self.outstream.write("\n")
        self.outstream.close()
        self.ipa_generator.close()

    def startElement(self, name, attrs):
        """Signals the start of an element in non-namespace mode."""
        if not attrs:
            attrs = {}
        if name == "entry":
            # Buffer everything up to the end of the entry.
            self.outstream_backup = self.outstream
            self.outstream = Entry(attrs)
        elif name == "orth":
            self.current_orth = Orth(attrs)
        else:
            self.outstream.write(f"<{name}{self._format_attrs(attrs)}>")

    def endElement(self, name):
        """Signals the end of an element in non-namespace mode."""
        if name == "orth":
            # add the Orth to the Entry
            self.outstream.write(self.current_orth)
            self.current_orth = None
        elif name == "entry":
            entry = self.outstream
            self.outstream = self.outstream_backup
            # The buffered entry does not change while it is written out, so
            # this is checked once and not for every <orth>.
            # It could be that one orth has transcriptions, another
            # doesn't, these cases are ignored in this version
            needs_pronunciation = not entry.contains_pronunciation()
            for item in entry.xml:
                if isinstance(item, Orth):
                    attrs = self._format_attrs(item.attrs)
                    text = sax.saxutils.escape("".join(item.text))
                    self.outstream.write(f"<orth{attrs}>{text}</orth>")
                    if needs_pronunciation:
                        # transcriptions are rarely empty
                        transcribed = sax.saxutils.escape(self.ipa_generator.ipa(item))
                        if transcribed:
                            self.outstream.write(f"\n<pron>{transcribed}</pron>")
                else:
                    self.outstream.write(item)
            self.outstream.write(f"</{name}>")
        else:
            self.outstream.write(f"</{name}>")

    def characters(self, content):
        """Receive notification of character data."""
        if self.current_orth:
            # Collected unescaped; escaping happens when the entry is written.
            self.current_orth.text.append(content)
        else:
            self.outstream.write(sax.saxutils.escape(content))

    def ignorableWhitespace(self, whitespace):
        """Receive notification of ignorable whitespace in element content."""
        self.outstream.write(whitespace)


def espeakng_voice(iso_639_3_code):
    """Return the Espeak-NG voice from the given ISO 639-3 code.

    The code is looked up in the ISO 639-3 table at ISO_639_TABLE. The
    two-letter code (alpha_2) of the language is used if the table has one,
    the given code otherwise; ESPEAKNG_LANG_CORRECTIONS is applied to the
    result.

    StopIteration is raised if language is unknown. The function exits with
    status 3 if the table does not exist and with status 6 if espeak-ng does
    not list a voice for the language."""
    # Read all available languages first
    if not os.path.exists(ISO_639_TABLE):
        print(f"Error: failed to load {ISO_639_TABLE}.")
        print("This file can be found in the iso-codes package on Debian-based systems.")
        sys.exit(3)
    proc = subprocess.run(["espeak-ng", "--voices"], capture_output=True)
    # Captures the language part of the second column, e.g. "en" of "en-gb".
    voice_pattern = re.compile(r"^\s*[0-9]*\s*([a-z]+)(-[0-9]*[a-z]*)?\s.*$")
    espeakng_voices = set()
    for line in proc.stdout.decode(sys.getdefaultencoding()).split("\n"):
        match = voice_pattern.search(line)
        if match:
            espeakng_voices.add(match.group(1))
    with open(ISO_639_TABLE, encoding="utf-8") as table:
        all_iso_639 = json.load(table)["639-3"]
    # Let StopIteration propagate
    lang = next(l for l in all_iso_639 if l["alpha_3"] == iso_639_3_code)
    # get the ISO-639-1 code (alpha_2) and afterwards, try to detect whether
    # there's a correction that maps from a code used by FD to one used by
    # eSpeakNG
    lang = lang.get("alpha_2", iso_639_3_code)
    lang = ESPEAKNG_LANG_CORRECTIONS.get(lang, lang)
    if lang not in espeakng_voices:
        print(f'Error, espeak did not recognise language "{lang}".')
        sys.exit(6)
    return lang


def xml_preamble(tei_file):
    """Extract everything until <TEI since the SAX parser ignores it and it's
    part of the document. This function seeks back to the beginning afterwards.

    ``tei_file`` is a seekable text stream. Only the first 100 lines are
    searched. The empty string is returned if they do not contain "<TEI"."""
    lines = []
    found = False
    for _ in range(100):  # upper limit
        line = tei_file.readline()
        if not line:  # end of file, nothing more to search
            break
        idx = line.find("<TEI")
        if idx >= 0:
            # Keep whatever precedes the root element on the same line.
            lines.append(line[:idx])
            found = True
            break
        lines.append(line)
    tei_file.seek(0)
    return "".join(lines) if found else ""


def handle_cmdline():
    """Parse the command line and return the tuple (input path, output path).

    Without -o, the output path is the input path with its extension replaced
    by "-phonetics.tei". The global DEBUG is set if --debug is given.

    With --supports-lang, the language is only checked through
    espeakng_voice() and the process exits afterwards: with status 0 if the
    check passes and with status 4 if the language code is unknown. The
    process exits with status 9 if no input file is given.
    """
    global DEBUG
    parser = argparse.ArgumentParser(
        description="Enrich a TEI file with pronunciation info."
    )
    parser.add_argument(
        "tei_dict", metavar="TEI_DICT", nargs="?", help="input file name"
    )
    parser.add_argument(
        "-o",
        dest="outpath",
        metavar="OUTPUT_PATH",
        nargs=1,
        help="write processed TEI to given output file. default is the input "
        "file name with its extension replaced by -phonetics.tei",
    )
    parser.add_argument(
        "--supports-lang",
        dest="supports_lang",
        metavar="LANG",
        nargs=1,
        help="test whether given language (ISO 639-3) is supported by eSpeak",
    )
    parser.add_argument(
        "--debug",
        dest="debug",
        metavar="DEBUGFILE",
        nargs=1,
        help="write phoneme transcription to specified file. The "
        "file will have a headword per line, separated by colons from "
        "the transcription. This helps to find the problematic "
        "headword if transcriptions start to mismatch.",
    )

    args = parser.parse_args()
    # Handle --supports-lang that aborts after the check, return results
    # otherwise
    if args.supports_lang:
        try:
            espeakng_voice(args.supports_lang[0])
        except StopIteration:  # raised to propagate missing language
            sys.exit(4)
        else:
            sys.exit(0)
    elif not args.tei_dict:
        print("Error, need to provide input TEI file")
        sys.exit(9)
    teipath = args.tei_dict
    if args.outpath:
        outpath = args.outpath[0]
    else:
        outpath = f"{os.path.splitext(teipath)[0]}-phonetics.tei"

    if args.debug:
        DEBUG = args.debug[0]
    return (teipath, outpath)


def main():
    """Add pronunciation information to the TEI file given on the command line.

    The eSpeak-NG voice is guessed from the file name: the part of the base
    name before the first "-" is taken as the ISO 639-3 code. The process
    exits with status 4 if that code is unknown.
    """
    teipath, outpath = handle_cmdline()

    # Guess eSpeak-NG voice to use
    lang = os.path.splitext(os.path.basename(teipath))[0].split("-")[0]
    try:
        voice = espeakng_voice(lang)
    except StopIteration:  # raised by espeakng_voice() for unknown codes
        print(f'Error, "{lang}" is not a known ISO 639-3 language code.')
        sys.exit(4)
    ipa_gen = EspeakNg(voice)

    #  Extract preamble (not parsed by SAX parser), write and call SAX parser
    #  afterwards
    # TeiConsumer closes the output file at the end of the document. The with
    # statement additionally closes it if parsing fails; closing twice is
    # harmless.
    with open(teipath) as teifile, open(outpath, "w") as outfile:
        outfile.write(xml_preamble(teifile))
        parser = sax.make_parser()
        teihandler = TeiConsumer(outfile, ipa_gen)
        parser.setContentHandler(teihandler)
        parser.parse(teifile)


if __name__ == "__main__":
    main()