From b4e9357de64707d543d42d0fd8bb0d0a53f5fc2e Mon Sep 17 00:00:00 2001
From: Gijs van Tulder
Date: Mon, 23 Sep 2024 15:32:32 +0200
Subject: [PATCH] (WIP) semanticate: Recursively apply regex rules to the XML
tree.
---
se/easy_xml.py | 23 ++
se/formatting.py | 238 ++++++++++++++++++
.../test-1/golden/semanticate.xhtml | 2 +-
.../semanticate/test-1/in/semanticate.xhtml | 2 +-
4 files changed, 263 insertions(+), 2 deletions(-)
diff --git a/se/easy_xml.py b/se/easy_xml.py
index 157a4221..a0725dd8 100644
--- a/se/easy_xml.py
+++ b/se/easy_xml.py
@@ -581,6 +581,18 @@ def children(self, children) -> None:
else:
self.lxml_element.append(child)
+ @property
+ def first_child(self):
+     """
+     Return an EasyXmlElement representing this node's first child,
+     or None if this node has no children.
+     """
+
+     # Index the lxml element directly instead of calling the deprecated
+     # `getchildren()`, which also copies every child into a new list.
+     # Use an explicit length test: the truth value of an lxml element is
+     # itself deprecated.
+     if len(self.lxml_element) == 0:
+         return None
+
+     return EasyXmlElement(self.lxml_element[0], self.namespaces)
+
@property
def tag(self) -> str:
"""
@@ -597,6 +609,17 @@ def parent(self): # This returns an EasyXmlElement but we can't type hint this u
return EasyXmlElement(self.lxml_element.getparent(), self.namespaces)
+ @property
+ def next(self):
+     """
+     Return an EasyXmlElement wrapping this node's next sibling,
+     or None if `getnext()` reports that there is no following sibling.
+     """
+
+     following = self.lxml_element.getnext()
+     if following is None:
+         return None
+
+     return EasyXmlElement(following, self.namespaces)
+
@property
def text(self) -> str:
"""
diff --git a/se/formatting.py b/se/formatting.py
index 6d361daa..8af11a87 100644
--- a/se/formatting.py
+++ b/se/formatting.py
@@ -49,6 +49,244 @@ def semanticate(xhtml: str) -> str:
A string of XHTML with semantics added.
"""
+ dom = EasyXmlTree(xhtml)
+
+ for el in dom.xpath("/html/body/*"):
+ _semanticate_element(el, ancestors=['html', 'body'])
+
+ return dom.to_string()
+
+def _semanticate_element(el: EasyXmlElement, ancestors: List[str]) -> None:
+    """
+    Recursive helper function to semanticate an XML element and its descendants.
+
+    INPUTS
+    el: The element to process; it is modified in place
+    ancestors: Tag names of the enclosing elements. `el.tag` is appended for the duration of this call and popped again before returning.
+
+    OUTPUTS
+    None
+    """
+
+    # Keep track of ancestors (to exclude rules inside `abbr` elements, etc.)
+    ancestors.append(el.tag)
+
+    def sub_fn(pattern: str, repl: str, **kwargs):
+        # Finds pattern in the text of this element and the tail of its children
+        # Replaces with repl, which may introduce new XML elements
+        # Additional arguments are passed on to `_sub_elements`, which forwards them to regex.sub
+
+        if el.text:
+            # Apply the regex to the text before the first child
+            new_text, new_els = _sub_elements(pattern, repl, el.text, **kwargs)
+            el.text = new_text
+            # Insert new elements before the first child. Inserting at index 0
+            # in reverse order leaves them in document order.
+            for new_el in reversed(new_els):
+                el.lxml_element.insert(0, new_el.lxml_element)
+
+        for child in el.children:
+            if child.tail:
+                # Apply the regex to the text after this child
+                new_tail, new_els = _sub_elements(pattern, repl, child.tail, **kwargs)
+                if child.tail != new_tail or new_els:
+                    child.tail = new_tail
+                    # Insert new elements directly after this child, keeping
+                    # them in document order. Inserting every element at the
+                    # same index would reverse them whenever the tail produced
+                    # more than one new element, so advance the index instead.
+                    insert_at = el.lxml_element.index(child.lxml_element) + 1
+                    for offset, new_el in enumerate(new_els):
+                        el.lxml_element.insert(insert_at + offset, new_el.lxml_element)
+
+    # Run the semanticate rules, using sub_fn as a callback to make the replacements
+    _semanticate_rules(el, ancestors, sub_fn)
+
+    # Recursively process all children
+    for child in el.children:
+        _semanticate_element(child, ancestors)
+
+    ancestors.pop()
+
+# Translation table mapping each of the five XML special characters to its
+# predefined entity reference. `str.translate` applies it in a single pass, so
+# the `&` introduced by one replacement is never escaped a second time.
+table = str.maketrans({
+    "<": "&lt;",
+    ">": "&gt;",
+    "&": "&amp;",
+    "'": "&apos;",
+    '"': "&quot;",
+})
+
+
+def xmlescape(txt: str) -> str:
+    """
+    Escape the XML special characters in a string.
+
+    INPUTS
+    txt: The string to escape
+
+    OUTPUTS
+    A copy of `txt` in which `<`, `>`, `&`, `'` and `"` are replaced by their entity references.
+    """
+
+    return txt.translate(table)
+
+def _sub_elements(pattern: str, repl: str, text: str, **kwargs) -> List[Union[str, EasyXmlElement]]:
+ """
+ Helper function to apply a regex to a string and return a list of new XML elements.
+
+ INPUTS
+ pattern: The regex pattern; it is matched against the XML-escaped form of `text`
+ repl: The replacement passed to `regex.sub`
+ text: The unescaped text to search
+ kwargs: Additional keyword arguments passed through to `regex.sub`
+
+ OUTPUTS
+ A `(text, elements)` pair. If the substitution changed the escaped text, these are the `text` and `children` of the element built from the result; otherwise the original `text` and an empty list.
+ NOTE(review): the return annotation says `List[...]` but both branches return a two-item tuple; confirm and correct the annotation.
+ """
+
+ # Escape first; presumably so that literal `<`, `>`, `&` and quotes in the text are not read as markup when the result is parsed
+ escaped_text = xmlescape(text)
+ new_text = regex.sub(pattern, repl, escaped_text, **kwargs)
+ if new_text != escaped_text:
+ # NOTE(review): only the substituted text is handed to `EasyXmlElement` here; presumably it has to be wrapped in a root element to be parseable — confirm
+ root = EasyXmlElement(f"""{new_text}""")
+ return root.text, root.children
+ else:
+ # The substitution changed nothing: hand back the original, unescaped text
+ return text, []
+
+def _semanticate_rules(el, ancestors, sub):
+ """
+ Helper function to apply the semanticate rules to an XML element.
+
+ INPUTS
+ el: The element the rules are applied to
+ ancestors: A list of tag names; rules are skipped when it contains `abbr` (and, for Roman numerals, `span`)
+ sub: Callback invoked as `sub(pattern, repl, **kwargs)` to perform one regex replacement
+
+ OUTPUTS
+ None
+ """
+
+ if "abbr" not in ancestors:
+ # Some common abbreviations
+ sub(r"(?\.)", r"""\1""", titles=[
+ "Capt",
+ "Col",
+ "Dr",
+ "Drs",
+ "Esq",
+ "Fr",
+ "Hon",
+ "Lieut",
+ "Lt",
+ "MM",
+ "Mdlle",
+ "Messers",
+ "Messrs",
+ "Mlle",
+ "Mlles",
+ "Mme",
+ "Mmes",
+ "Mon",
+ "Mr",
+ "Mrs",
+ "Ms",
+ "Prof",
+ "Pvt",
+ "Rev",
+ ])
+ sub(r"(?\.)", r"""\1""", abbreviations=[
+ "Bros",
+ "Mt",
+ "[Vv]ols?",
+ "Co",
+ "Inc",
+ "Ltd",
+ "St",
+ "[Gg]ov",
+ "MSS?",
+ "[Vv]iz",
+ "etc",
+ "[Cc]f", # a stray `)` after the character class was removed from this entry
+ "ed",
+ "(?:Jan\.|Feb\.|Mar\.|Apr\.|Jun\.|Jul\.|Aug\.|Sep\.|Sept\.|Oct\.|Nov\.|Dec\.)",
+ "[Vv]s",
+ "[Ff]f", # ff. typically used in footnotes, means "and following"
+ "[Ll]ib", # Lib. = Liber = Book
+ ])
+ sub(r"(?\1\2")
+ sub(r"(?\1hap. \2") # The number allows us to avoid phrases like `Hello, old chap.`
+ sub(r"(?\1""")
+ sub(r"(?inst.""") # `inst.` is short for `instante mense` but it is not italicized
+ sub(r"(?\1.e.""")
+ sub(r"(?\1.g.""")
+ sub(r"(?N.B.""")
+ sub(r"(?Ph. D.""")
+ sub(r"(?I.O.U.""")
+ sub(r"(?\1""")
+ sub(r"(?\1""")
+ sub(r"(?\1.m.")
+ sub(r"(?\1\2") # Book sizes
+ sub(r"(?a.m.")
+ sub(r"(?p.m.")
+
+ # this should be placed after the am/pm test, to prevent tagging just the p. in "p. m."
+ sub(r"(?p\1.\2")
+ # keep a period after TV that terminates a clause
+ if el.tag == "p":
+ sub(r"(?TV.\1""")
+ sub(r"(?TV.\1""")
+ # otherwise, get rid of any periods in TV
+ sub(r"(?TV""")
+ # keep a period after AD/BC that terminates a clause
+ if el.tag == "p":
+ sub(r"(?AD.\1""")
+ sub(r"(?BC.\1""")
+ sub(r"(?|\s+[“‘]?[\p{Uppercase_Letter}])", r"""AD.\1""")
+ sub(r"(?|\s+[“‘]?[\p{Uppercase_Letter}])", r"""BC.\1""")
+ # otherwise, get rid of any periods in AD/BC
+ sub(r"(?AD""")
+ sub(r"(?BC""")
+
+ # Wrap £sd shorthand
+ sub(r"([0-9½¼¾⅙⅚⅛⅜⅝⅞]+)([sd]\.)", r"\1\2")
+
+ # Add abbreviations around some SI measurements
+ sub(r"([0-9]+)\s*([cmk][mgl])\b", fr"\1{se.NO_BREAK_SPACE}\2")
+
+ # Add abbreviations around Imperial measurements
+ sub(r"(?\2.")
+
+ # Handle `in.` separately to require a period, because with an optional period there are too many false positives
+ sub(r"(?in.")
+
+ # Tweak some other Imperial measurements
+ sub(r"([0-9]+)\s*m\.?p\.?h\.?", fr"\1{se.NO_BREAK_SPACE}mph", flags=regex.IGNORECASE)
+ sub(r"([0-9]+)\s*h\.?p\.?", fr"\1{se.NO_BREAK_SPACE}hp", flags=regex.IGNORECASE)
+
+ if el.tag == "abbr":
+ # add eoc (End Of Clause) class
+ eoc = False
+
+ # sub(r"etc\.([”’]?(?:|\s+[“‘]?[\p{Uppercase_Letter}]))", r"""etc.\1""")
+ # sub(r"""([^<]+\.)([”’]?)""", r"""\2\3""")
+ # `etc.` ends a clause when its tail is an optional closing quote, whitespace, an optional opening quote and then an uppercase letter
+ if el.text == "etc." and not el.children:
+ if el.tail and regex.match(r"[”’]?\s+[“‘]?[\p{Uppercase_Letter}]", el.tail):
+ eoc = True
+
+ # Any childless abbreviation ending in a period ends a clause when it is the last element of a `p` and nothing, or only closing quotes, follows it
+ # Check that there is text at all before calling a string method on it
+ if el.text and el.text.endswith(".") and not el.children:
+ if el.parent.tag == "p" and el.next is None and (not el.tail or el.tail in "”’"):
+ eoc = True
+
+ if eoc and "eoc" not in (el.get_attr("class") or ""):
+ el.add_attr_value("class", "eoc")
+ # sort attributes
+ # NOTE(review): `dict()` on its own does not sort; presumably the `attrs` setter does the ordering — confirm
+ el.attrs = dict(el.attrs)
+
+ if "abbr" not in ancestors and "span" not in ancestors:
+ # Get Roman numerals >= 2 characters
+ # Ignore "numerals" followed by a dash, as they are more likely something like `x-ray` or `v-shaped`
+ # Note that `j` may occur only at the end of a numeral as an old-fashioned terminal `i`, like in `ij` (2), `vij` (7)
+ sub(r"([^\p{Letter}])([ixvIXV]{2,}j?)(\b[^\-]|st\b|nd\b|rd\b|th\b)", r"""\1\2\3""")
+
+ # Get Roman numerals that are X or V and single characters. We can't do I for obvious reasons.
+ sub(r"""([^\p{Letter}\"])([vxVX])(\b[^\-]|st\b|nd\b|rd\b|th\b)""", r"""\1\2\3""")
+
+ # We can assume a lowercase i is always a Roman numeral unless followed by ’
+ sub(r"""([^\p{Letter}<>/\"])i\b(?!’)""", r"""\1i""")
+
+
+
+
+
+
+def _semanticate_todo(xhtml: str = "") -> str:
+    """
+    Temporary holding place for unconverted functions from the old semanticate.
+
+    INPUTS
+    xhtml: A string of XHTML to process. Previously the body read a local `xhtml` before assigning it, so every call raised UnboundLocalError; the default keeps argument-less calls working.
+
+    OUTPUTS
+    The XHTML string after the replacements below have been applied.
+    """
+
+    # Fix obscured names starting with I, V, or X
+    xhtml = regex.sub(fr"""([IVX]){se.WORD_JOINER}⸺""", fr"""\1{se.WORD_JOINER}⸺""", xhtml)
+
+    # Fix some possible errors introduced by the above
+    xhtml = regex.sub(fr"((?:[Nn]o\.|[Nn]umber)\s[0-9]+){se.NO_BREAK_SPACE}in\.", r"\1 in", xhtml)
+
+    # We may have added HTML tags within title tags. Remove those here
+    matches = regex.findall(r".+?", xhtml)
+    if matches:
+        xhtml = regex.sub(r".+?", f"{se.formatting.remove_tags(matches[0])}", xhtml)
+
+    return xhtml
+
+
+def semanticate_old(xhtml: str) -> str:
+ """
+ Add semantics to well-formed XHTML
+
+ INPUTS
+ xhtml: A string of well-formed XHTML
+
+ OUTPUTS
+ A string of XHTML with semantics added.
+ """
+
# Some common abbreviations
xhtml = regex.sub(r"(?]*?\>))(\L\.)", r"""\1""", xhtml, titles=[
"Capt",
diff --git a/tests/draft_commands/semanticate/test-1/golden/semanticate.xhtml b/tests/draft_commands/semanticate/test-1/golden/semanticate.xhtml
index 00cb212c..6ee7debb 100644
--- a/tests/draft_commands/semanticate/test-1/golden/semanticate.xhtml
+++ b/tests/draft_commands/semanticate/test-1/golden/semanticate.xhtml
@@ -170,4 +170,4 @@
His name was abbreviated Chas.