From b4e9357de64707d543d42d0fd8bb0d0a53f5fc2e Mon Sep 17 00:00:00 2001 From: Gijs van Tulder Date: Mon, 23 Sep 2024 15:32:32 +0200 Subject: [PATCH] (WIP) semanticate: Recursively apply regex rules to the XML tree. --- se/easy_xml.py | 23 ++ se/formatting.py | 238 ++++++++++++++++++ .../test-1/golden/semanticate.xhtml | 2 +- .../semanticate/test-1/in/semanticate.xhtml | 2 +- 4 files changed, 263 insertions(+), 2 deletions(-) diff --git a/se/easy_xml.py b/se/easy_xml.py index 157a4221..a0725dd8 100644 --- a/se/easy_xml.py +++ b/se/easy_xml.py @@ -581,6 +581,18 @@ def children(self, children) -> None: else: self.lxml_element.append(child) + @property + def first_child(self): + """ + Return an EasyXmlElement representing this node's first child. + """ + + children = self.lxml_element.getchildren() + if children: + return EasyXmlElement(children[0], self.namespaces) + else: + return None + @property def tag(self) -> str: """ @@ -597,6 +609,17 @@ def parent(self): # This returns an EasyXmlElement but we can't type hint this u return EasyXmlElement(self.lxml_element.getparent(), self.namespaces) + @property + def next(self): + """ + Return an EasyXmlElement representing this node's next sibling. + """ + + sibling = self.lxml_element.getnext() + if sibling is not None: + sibling = EasyXmlElement(sibling, self.namespaces) + return sibling + @property def text(self) -> str: """ diff --git a/se/formatting.py b/se/formatting.py index 6d361daa..8af11a87 100644 --- a/se/formatting.py +++ b/se/formatting.py @@ -49,6 +49,244 @@ def semanticate(xhtml: str) -> str: A string of XHTML with semantics added. """ + dom = EasyXmlTree(xhtml) + + for el in dom.xpath("/html/body/*"): + _semanticate_element(el, ancestors=['html', 'body']) + + return dom.to_string() + +def _semanticate_element(el: EasyXmlElement, ancestors: List[str]): + """ + Recursive helper function to semanticate an XML element and its descendants. + """ + + # Keep track of ancestors (to exclude rules inside , etc.) + ancestors.append(el.tag) + + def sub_fn(pattern: str, repl: str, **kwargs): + # Finds pattern in the text of this element and the tail of its children + # Replaces with repl, which may introduce new XML elements + # Additional arguments are passed to regex.sub + + if el.text: + # Apply the regex to the text before the first child + new_text, new_els = _sub_elements(pattern, repl, el.text, **kwargs) + el.text = new_text + # Insert new elements before the first child + for new_el in reversed(new_els): + el.lxml_element.insert(0, new_el.lxml_element) + + for child in el.children: + if child.tail: + # Apply the regex to the text after this child + new_tail, new_els = _sub_elements(pattern, repl, child.tail, **kwargs) + if child.tail != new_tail or new_els: + child.tail = new_tail + # Insert new elements after this child + for new_el in new_els: + el.lxml_element.insert(el.lxml_element.index(child.lxml_element) + 1, new_el.lxml_element) + + # Run the semanticate rules, using sub_fn as a callback to make the replacements + _semanticate_rules(el, ancestors, sub_fn) + + # Recursively process all children + for child in el.children: + _semanticate_element(child, ancestors) + + ancestors.pop() + +table = str.maketrans({ + "<": "<", + ">": ">", + "&": "&", + "'": "'", + '"': """, +}) +def xmlescape(txt): + return txt.translate(table) + +def _sub_elements(pattern: str, repl: str, text: str, **kwargs) -> List[Union[str, EasyXmlElement]]: + """ + Helper function to apply a regex to a string and return a list of new XML elements. + """ + + escaped_text = xmlescape(text) + new_text = regex.sub(pattern, repl, escaped_text, **kwargs) + if new_text != escaped_text: + root = EasyXmlElement(f"""{new_text}""") + return root.text, root.children + else: + return text, [] + +def _semanticate_rules(el, ancestors, sub): + """ + Helper function to apply the semanticate rules to an XML element. + """ + + if "abbr" not in ancestors: + # Some common abbreviations + sub(r"(?\.)", r"""\1""", titles=[ + "Capt", + "Col", + "Dr", + "Drs", + "Esq", + "Fr", + "Hon", + "Lieut", + "Lt", + "MM", + "Mdlle", + "Messers", + "Messrs", + "Mlle", + "Mlles", + "Mme", + "Mmes", + "Mon", + "Mr", + "Mrs", + "Ms", + "Prof", + "Pvt", + "Rev", + ]) + sub(r"(?\.)", r"""\1""", abbreviations=[ + "Bros", + "Mt", + "[Vv]ols?", + "Co", + "Inc", + "Ltd", + "St", + "[Gg]ov", + "MSS?", + "[Vv]iz", + "etc", + "[Cc])f", + "ed", + "(?:Jan\.|Feb\.|Mar\.|Apr\.|Jun\.|Jul\.|Aug\.|Sep\.|Sept\.|Oct\.|Nov\.|Dec\.)", + "[Vv]s", + "[Ff]f", # ff. typically used in footnotes, means "and following" + "[Ll]ib", # Lib. = Liber = Book + ]) + sub(r"(?\1\2") + sub(r"(?\1hap. \2") # The number allows us to avoid phrases like `Hello, old chap.` + sub(r"(?\1""") + sub(r"(?inst.""") # `inst.` is short for `instante mense` but it is not italicized + sub(r"(?\1.e.""") + sub(r"(?\1.g.""") + sub(r"(?N.B.""") + sub(r"(?Ph. D.""") + sub(r"(?I.O.U.""") + sub(r"(?\1""") + sub(r"(?\1""") + sub(r"(?\1.m.") + sub(r"(?\1\2") # Book sizes + sub(r"(?a.m.") + sub(r"(?p.m.") + + # this should be placed after the am/pm test, to prevent tagging just the p. in "p. m." + sub(r"(?p\1.\2") + # keep a period after TV that terminates a clause + if el.tag == "p": + sub(r"(?TV.\1""") + sub(r"(?TV.\1""") + # otherwise, get rid of any periods in TV + sub(r"(?TV""") + # keep a period after AD/BC that terminates a clause + if el.tag == "p": + sub(r"(?AD.\1""") + sub(r"(?BC.\1""") + sub(r"(?|\s+[“‘]?[\p{Uppercase_Letter}])", r"""AD.\1""") + sub(r"(?|\s+[“‘]?[\p{Uppercase_Letter}])", r"""BC.\1""") + # otherwise, get rid of any periods in AD/BC + sub(r"(?AD""") + sub(r"(?BC""") + + # Wrap £sd shorthand + sub(r"([0-9½¼¾⅙⅚⅛⅜⅝⅞]+)([sd]\.)", r"\1\2") + + # Add abbrevations around some SI measurements + sub(r"([0-9]+)\s*([cmk][mgl])\b", fr"\1{se.NO_BREAK_SPACE}\2") + + # Add abbrevations around Imperial measurements + sub(r"(?\2.") + + # Handle `in.` separately to require a period, because with an optional period there are too many false positives + sub(r"(?in.") + + # Tweak some other Imperial measurements + sub(r"([0-9]+)\s*m\.?p\.?h\.?", fr"\1{se.NO_BREAK_SPACE}mph", flags=regex.IGNORECASE) + sub(r"([0-9]+)\s*h\.?p\.?", fr"\1{se.NO_BREAK_SPACE}hp", flags=regex.IGNORECASE) + + if el.tag == "abbr": + # add eoc (End Of Clause) class + eoc = False + + # sub(r"etc\.([”’]?(?:

|\s+[“‘]?[\p{Uppercase_Letter}]))", r"""etc.\1""") + # sub(r"""([^<]+\.)([”’]?

)""", r"""\2\3""") + if el.text == "etc." and not el.children: + if el.tail and regex.match(r"[”’]?\s+[“‘]?[\p{Uppercase_Letter}]", el.tail): + eoc = True + + if el.text.endswith(".") and not el.children: + if el.parent.tag == "p" and el.next is None and (not el.tail or el.tail in "”’"): + eoc = True + + if eoc and not "eoc" in (el.get_attr("class") or ""): + el.add_attr_value("class", "eoc") + # sort attributes + el.attrs = dict(el.attrs) + + if "abbr" not in ancestors and "span" not in ancestors: + # Get Roman numerals >= 2 characters + # Ignore "numerals" followed by a dash, as they are more likely something like `x-ray` or `v-shaped` + # Note that `j` may occur only at the end of a numeral as an old-fashioned terminal `i`, like int `ij` (2), `vij` (7) + sub(r"([^\p{Letter}])([ixvIXV]{2,}j?)(\b[^\-]|st\b|nd\b|rd\b|th\b)", r"""\1\2\3""") + + # Get Roman numerals that are X or V and single characters. We can't do I for obvious reasons. + sub(r"""([^\p{Letter}\"])([vxVX])(\b[^\-]|st\b|nd\b|rd\b|th\b)""", r"""\1\2\3""") + + # We can assume a lowercase i is always a Roman numeral unless followed by ’ + sub(r"""([^\p{Letter}<>/\"])i\b(?!’)""", r"""\1i""") + + + + + + +def _semanticate_todo(): + """ + Temporary holding place for unconverted functions from the old semanticate. + """ + + # Fix obscured names starting with I, V, or X + xhtml = regex.sub(fr"""([IVX]){se.WORD_JOINER}⸺""", fr"""\1{se.WORD_JOINER}⸺""", xhtml) + + # Fix some possible errors introduced by the above + xhtml = regex.sub(fr"((?:[Nn]o\.|[Nn]umber)\s[0-9]+){se.NO_BREAK_SPACE}in\.", r"\1 in", xhtml) + + # We may have added HTML tags within title tags. Remove those here + matches = regex.findall(r".+?", xhtml) + if matches: + xhtml = regex.sub(r".+?", f"{se.formatting.remove_tags(matches[0])}", xhtml) + + return xhtml + + +def semanticate_old(xhtml: str) -> str: + """ + Add semantics to well-formed XHTML + + INPUTS + xhtml: A string of well-formed XHTML + + OUTPUTS + A string of XHTML with semantics added. + """ + # Some common abbreviations xhtml = regex.sub(r"(?]*?\>))(\L\.)", r"""\1""", xhtml, titles=[ "Capt", diff --git a/tests/draft_commands/semanticate/test-1/golden/semanticate.xhtml b/tests/draft_commands/semanticate/test-1/golden/semanticate.xhtml index 00cb212c..6ee7debb 100644 --- a/tests/draft_commands/semanticate/test-1/golden/semanticate.xhtml +++ b/tests/draft_commands/semanticate/test-1/golden/semanticate.xhtml @@ -170,4 +170,4 @@

His name was abbreviated Chas.

- \ No newline at end of file + diff --git a/tests/draft_commands/semanticate/test-1/in/semanticate.xhtml b/tests/draft_commands/semanticate/test-1/in/semanticate.xhtml index 59933729..7e97bfb3 100644 --- a/tests/draft_commands/semanticate/test-1/in/semanticate.xhtml +++ b/tests/draft_commands/semanticate/test-1/in/semanticate.xhtml @@ -170,4 +170,4 @@

His name was abbreviated Chas.

- \ No newline at end of file +