Skip to content

Commit

Permalink
(WIP) semanticate: Recursively apply regex rules to the XML tree.
Browse files Browse the repository at this point in the history
  • Loading branch information
gvtulder committed Sep 23, 2024
1 parent 99f028a commit b4e9357
Show file tree
Hide file tree
Showing 4 changed files with 263 additions and 2 deletions.
23 changes: 23 additions & 0 deletions se/easy_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -581,6 +581,18 @@ def children(self, children) -> None:
else:
self.lxml_element.append(child)

@property
def first_child(self):
    """
    Return an EasyXmlElement representing this node's first child,
    or None if this node has no children.
    """

    # Index the element directly instead of calling getchildren(): that
    # method is deprecated in lxml and builds a full list of children
    # only for us to read its first entry.
    if len(self.lxml_element) == 0:
        return None

    return EasyXmlElement(self.lxml_element[0], self.namespaces)

@property
def tag(self) -> str:
"""
Expand All @@ -597,6 +609,17 @@ def parent(self): # This returns an EasyXmlElement but we can't type hint this u

return EasyXmlElement(self.lxml_element.getparent(), self.namespaces)

@property
def next(self):
    """
    Return an EasyXmlElement wrapping the sibling that immediately
    follows this node, or None when there is no following sibling.
    """

    following = self.lxml_element.getnext()
    return None if following is None else EasyXmlElement(following, self.namespaces)

@property
def text(self) -> str:
"""
Expand Down
238 changes: 238 additions & 0 deletions se/formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,244 @@ def semanticate(xhtml: str) -> str:
A string of XHTML with semantics added.
"""

dom = EasyXmlTree(xhtml)

for el in dom.xpath("/html/body/*"):
_semanticate_element(el, ancestors=['html', 'body'])

return dom.to_string()

def _semanticate_element(el: EasyXmlElement, ancestors: List[str]) -> None:
    """
    Recursive helper function to semanticate an XML element and its descendants.

    INPUTS
    el: The element to process; it is modified in place.
    ancestors: The tag names of the elements enclosing `el`, outermost first.
        The list is mutated during the call (the tag of `el` is pushed on
        entry and popped on exit) and is shared with the recursive calls.

    OUTPUTS
    None.
    """

    # Keep track of ancestors (to exclude rules inside <abbr>, etc.)
    ancestors.append(el.tag)

    def sub_fn(pattern: str, repl: str, **kwargs):
        # Finds pattern in the text of this element and the tail of its children
        # Replaces with repl, which may introduce new XML elements
        # Additional arguments are passed to regex.sub

        if el.text:
            # Apply the regex to the text before the first child
            new_text, new_els = _sub_elements(pattern, repl, el.text, **kwargs)
            el.text = new_text
            # Insert new elements before the first child. Each one is pushed
            # in at index 0, so walk them backwards to keep document order.
            for new_el in reversed(new_els):
                el.lxml_element.insert(0, new_el.lxml_element)

        for child in el.children:
            if child.tail:
                # Apply the regex to the text after this child
                new_tail, new_els = _sub_elements(pattern, repl, child.tail, **kwargs)
                if child.tail != new_tail or new_els:
                    child.tail = new_tail
                    # Insert new elements after this child, in document order.
                    # The insertion index has to advance with every element:
                    # inserting each one directly after `child` would leave
                    # several matches from the same tail in reverse order.
                    insert_at = el.lxml_element.index(child.lxml_element) + 1
                    for offset, new_el in enumerate(new_els):
                        el.lxml_element.insert(insert_at + offset, new_el.lxml_element)

    # Run the semanticate rules, using sub_fn as a callback to make the replacements
    _semanticate_rules(el, ancestors, sub_fn)

    # Recursively process all children, including any elements the rules just added
    for child in el.children:
        _semanticate_element(child, ancestors)

    ancestors.pop()

# Translation table mapping each XML-reserved character (keyed by its code
# point) to the corresponding predefined entity reference.
table = {
    ord(char): entity
    for char, entity in (
        ("<", "&lt;"),
        (">", "&gt;"),
        ("&", "&amp;"),
        ("'", "&apos;"),
        ('"', "&quot;"),
    )
}


def xmlescape(txt):
    """
    Return `txt` with the characters <, >, &, ' and " replaced by their
    XML entity references.

    Every character is translated exactly once, so an ampersand that is
    already part of an entity in the input is escaped again.
    """

    return txt.translate(table)

def _sub_elements(pattern: str, repl: str, text: str, **kwargs) -> tuple:
    """
    Helper function to apply a regex to a string and return the remaining
    leading text plus a list of new XML elements.

    INPUTS
    pattern: The regex to search for; it is matched against the XML-escaped form of `text`.
    repl: The replacement, which may contain XML markup.
    text: The plain (unescaped) contents of a text node.
    kwargs: Additional arguments passed on to regex.sub.

    OUTPUTS
    A 2-tuple `(new_text, new_elements)`.
    If the substitution changed nothing, this is `(text, [])`.
    Otherwise the substituted string is wrapped in a temporary <root> element
    and handed to EasyXmlElement; `new_text` is that root's text and
    `new_elements` is the list of that root's children.
    """

    # Escape first, so that reserved characters in the original text are not
    # mistaken for markup once the replacement has introduced real tags.
    escaped_text = xmlescape(text)
    new_text = regex.sub(pattern, repl, escaped_text, **kwargs)

    if new_text == escaped_text:
        # Nothing matched: hand back the caller's original string untouched
        return text, []

    # The epub prefix is declared because replacements may use epub:type attributes
    root = EasyXmlElement(f"""<root xmlns:epub="http://www.idpf.org/2007/ops">{new_text}</root>""")
    return root.text, root.children

def _semanticate_rules(el, ancestors, sub):
    """
    Helper function to apply the semanticate rules to an XML element.

    INPUTS
    el: The element being processed.
    ancestors: The tag names of the enclosing elements, ending with the tag of `el` itself.
    sub: A callback `sub(pattern, repl, **kwargs)` that performs a regex
        replacement on the text nodes of `el`; extra keyword arguments are
        meant for regex.sub (named lists, flags).

    OUTPUTS
    None. All changes are made through `sub` or directly on `el`.
    """

    if "abbr" not in ancestors:
        # Some common abbreviations
        # The titles are plain literals, so a regex named list (\L<titles>) works here
        sub(r"(?<!(?:\.|\B))(\L<titles>\.)", r"""<abbr epub:type="z3998:name-title">\1</abbr>""", titles=[
            "Capt",
            "Col",
            "Dr",
            "Drs",
            "Esq",
            "Fr",
            "Hon",
            "Lieut",
            "Lt",
            "MM",
            "Mdlle",
            "Messers",
            "Messrs",
            "Mlle",
            "Mlles",
            "Mme",
            "Mmes",
            "Mon",
            "Mr",
            "Mrs",
            "Ms",
            "Prof",
            "Pvt",
            "Rev",
        ])

        # These entries contain regex syntax (character classes, optional
        # letters), so they cannot go in a named list: the regex module matches
        # named list items as literal strings. Build a real alternation instead.
        # Every entry is written without its trailing period; the period is
        # appended once by the pattern below.
        abbreviations = [
            r"Bros",
            r"Mt",
            r"[Vv]ols?",
            r"Co",
            r"Inc",
            r"Ltd",
            r"St",
            r"[Gg]ov",
            r"MSS?",
            r"[Vv]iz",
            r"etc",
            r"[Cc]f",
            r"ed",
            r"Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec",
            r"[Vv]s",
            r"[Ff]f",  # ff. typically used in footnotes, means "and following"
            r"[Ll]ib",  # Lib. = Liber = Book
        ]
        sub(r"(?<!(?:\.|\B))((?:" + "|".join(abbreviations) + r")\.)", r"""<abbr>\1</abbr>""")

        sub(r"(?<!(?:\.|\B))(No\.)(\s+[0-9]+)", r"<abbr>\1</abbr>\2")
        sub(r"(?<!(?:\.|\B))([Cc])hap\. ([0-9])", r"<abbr>\1hap.</abbr> \2")  # The number allows us to avoid phrases like `Hello, old chap.`
        sub(r"(?<!(?:\.|\B))(P\.(?:P\.)?S\.(?:S\.)?\B)", r"""<abbr epub:type="z3998:initialism">\1</abbr>""")
        sub(r"(?<!(?:\.|\B))inst\.", r"""<abbr xml:lang="la">inst.</abbr>""")  # `inst.` is short for `instante mense` but it is not italicized
        sub(r"(?<!(?:\.|\B))([Ii])\.e\.", r"""<abbr epub:type="z3998:initialism">\1.e.</abbr>""")
        sub(r"(?<!(?:\.|\B))([Ee])\.g\.", r"""<abbr epub:type="z3998:initialism">\1.g.</abbr>""")
        sub(r"(?<!(?:\.|\B))\bN\.?B\.\B", r"""<abbr epub:type="z3998:initialism">N.B.</abbr>""")
        sub(r"(?<!(?:\.|\B))Ph\.?\s*D\.?", r"""<abbr epub:type="z3998:name-title">Ph. D.</abbr>""")
        sub(r"(?<!(?:\.|\B))(?:IOU(?:\.|\b)|I\.O\.U\.)", r"""<abbr epub:type="z3998:initialism">I.O.U.</abbr>""")
        sub(r"(?<!(?:\.|\B))\b([1-4]D)\b", r"""<abbr epub:type="z3998:initialism">\1</abbr>""")
        sub(r"(?<!(?:\.|\B))(Thos\.|Jas\.|Chas\.|Wm\.)", r"""<abbr epub:type="z3998:given-name">\1</abbr>""")
        sub(r"(?<!(?:\.|\B))([ap])\.\s?m\.", r"<abbr>\1.m.</abbr>")
        sub(r"(?<!(?:\.|\B))(4to|8vo|12mo|16mo|18mo|32mo|48mo|64mo)(?:\.(\s+\p{Lowercase_Letter}))?", r"<abbr>\1</abbr>\2")  # Book sizes
        sub(r"(?<!(?:\.|\B))([0-9]{1,2})\s?[Aa]\.?\s?[Mm](?:\.|\b)", r"\1 <abbr>a.m.</abbr>")
        sub(r"(?<!(?:\.|\B))([0-9]{1,2})\s?[Pp]\.?\s?[Mm](?:\.|\b)", r"\1 <abbr>p.m.</abbr>")

        # this should be placed after the am/pm test, to prevent tagging just the p. in "p. m."
        sub(r"(?<!(?:\.|\B))p(p?)\.([\s0-9])", r"<abbr>p\1.</abbr>\2")

        # keep a period after TV that terminates a clause
        # NOTE(review): `sub` is applied per text node, so `$` presumably
        # anchors at the end of each text node rather than at the end of the
        # paragraph - confirm this is the intended behavior.
        if el.tag == "p":
            sub(r"(?<!(?:\.|\B))T\.?V\.([”’]?)$", r"""<abbr epub:type="z3998:initialism">TV</abbr>.\1""")
        sub(r"(?<!(?:\.|\B))T\.?V\.(\s+[“‘]?[\p{Uppercase_Letter}])", r"""<abbr epub:type="z3998:initialism">TV</abbr>.\1""")
        # otherwise, get rid of any periods in TV
        sub(r"(?<!(?:\.|\B))(?:TV\b|T\.V\.\B)", r"""<abbr epub:type="z3998:initialism">TV</abbr>""")

        # keep a period after AD/BC that terminates a clause
        if el.tag == "p":
            sub(r"(?<!(?:\.|\B))A\.?D\.([”’]?)$", r"""<abbr epub:type="se:era">AD</abbr>.\1""")
            sub(r"(?<!(?:\.|\B))B\.?C\.([”’]?)$", r"""<abbr epub:type="se:era">BC</abbr>.\1""")
        # NOTE(review): the `</p>` alternative below looks like a leftover from
        # the string-based implementation; text nodes presumably never contain
        # a literal closing tag, so only the `\s+...` alternative is expected to match.
        sub(r"(?<!(?:\.|\B))A\.?D\.([”’]?</p>|\s+[“‘]?[\p{Uppercase_Letter}])", r"""<abbr epub:type="se:era">AD</abbr>.\1""")
        sub(r"(?<!(?:\.|\B))B\.?C\.([”’]?</p>|\s+[“‘]?[\p{Uppercase_Letter}])", r"""<abbr epub:type="se:era">BC</abbr>.\1""")
        # otherwise, get rid of any periods in AD/BC
        sub(r"(?<!(?:\.|\B))(?:AD\b|A\.D\.\B)", r"""<abbr epub:type="se:era">AD</abbr>""")
        sub(r"(?<!(?:\.|\B))(?:BC\b|B\.C\.\B)", r"""<abbr epub:type="se:era">BC</abbr>""")

        # Wrap £sd shorthand
        sub(r"([0-9½¼¾⅙⅚⅛⅜⅝⅞]+)([sd]\.)", r"\1<abbr>\2</abbr>")

        # Add abbreviations around some SI measurements
        sub(r"([0-9]+)\s*([cmk][mgl])\b", fr"\1{se.NO_BREAK_SPACE}<abbr>\2</abbr>")

        # Add abbreviations around Imperial measurements
        sub(r"(?<![\$£0-9,])([0-9½¼⅙⅚⅛⅜⅝⅞]+)\s*(ft|yd|mi|pt|qt|gal|oz|lbs)\.?\b", fr"\1{se.NO_BREAK_SPACE}<abbr>\2.</abbr>")

        # Handle `in.` separately to require a period, because with an optional period there are too many false positives
        sub(r"(?<![\$£0-9,])([0-9½¼⅙⅚⅛⅜⅝⅞]+)\s*in\.(\b|\s)", fr"\1{se.NO_BREAK_SPACE}<abbr>in.</abbr>")

        # Tweak some other Imperial measurements
        sub(r"([0-9]+)\s*m\.?p\.?h\.?", fr"\1{se.NO_BREAK_SPACE}<abbr>mph</abbr>", flags=regex.IGNORECASE)
        sub(r"([0-9]+)\s*h\.?p\.?", fr"\1{se.NO_BREAK_SPACE}<abbr>hp</abbr>", flags=regex.IGNORECASE)

    if el.tag == "abbr":
        # add eoc (End Of Clause) class
        eoc = False

        # Guard against an <abbr> without leading text, whose text may be empty or None
        abbr_text = el.text or ""

        # The two checks below replace these rules of the string-based implementation:
        # sub(r"<abbr>etc\.</abbr>([”’]?(?:</p>|\s+[“‘]?[\p{Uppercase_Letter}]))", r"""<abbr class="eoc">etc.</abbr>\1""")
        # sub(r"""<abbr( epub:type="[^"]+")?>([^<]+\.)</abbr>([”’]?</p>)""", r"""<abbr class="eoc"\1>\2</abbr>\3""")
        if abbr_text == "etc." and not el.children:
            # `etc.` followed by whitespace and a capitalized word ends a clause
            if el.tail and regex.match(r"[”’]?\s+[“‘]?[\p{Uppercase_Letter}]", el.tail):
                eoc = True

        if abbr_text.endswith(".") and not el.children:
            # An abbreviation that is the last thing in its paragraph,
            # optionally followed only by closing quotation marks
            if el.parent.tag == "p" and el.next is None and (not el.tail or el.tail in "”’"):
                eoc = True

        if eoc and "eoc" not in (el.get_attr("class") or ""):
            el.add_attr_value("class", "eoc")
            # sort attributes
            # NOTE(review): dict() only copies the mapping; any sorting would
            # have to happen inside the `attrs` setter - confirm.
            el.attrs = dict(el.attrs)

    if "abbr" not in ancestors and "span" not in ancestors:
        # Get Roman numerals >= 2 characters
        # Ignore "numerals" followed by a dash, as they are more likely something like `x-ray` or `v-shaped`
        # Note that `j` may occur only at the end of a numeral as an old-fashioned terminal `i`, as in `ij` (2), `vij` (7)
        sub(r"([^\p{Letter}])([ixvIXV]{2,}j?)(\b[^\-]|st\b|nd\b|rd\b|th\b)", r"""\1<span epub:type="z3998:roman">\2</span>\3""")

        # Get Roman numerals that are X or V and single characters. We can't do I for obvious reasons.
        sub(r"""([^\p{Letter}\"])([vxVX])(\b[^\-]|st\b|nd\b|rd\b|th\b)""", r"""\1<span epub:type="z3998:roman">\2</span>\3""")

        # We can assume a lowercase i is always a Roman numeral unless followed by ’
        sub(r"""([^\p{Letter}<>/\"])i\b(?!’)""", r"""\1<span epub:type="z3998:roman">i</span>""")






def _semanticate_todo(xhtml: str = "") -> str:
    """
    Temporary holding place for unconverted functions from the old semanticate.

    These rules still operate on a serialized XHTML string instead of on the
    XML tree. The string is taken as a parameter because the body reads
    `xhtml` before assigning it; without a parameter any call would raise
    UnboundLocalError.

    INPUTS
    xhtml: A string of XHTML

    OUTPUTS
    The string with the remaining fix-ups applied.
    """

    # Fix obscured names starting with I, V, or X
    xhtml = regex.sub(fr"""<span epub:type="z3998:roman">([IVX])</span>{se.WORD_JOINER}⸺""", fr"""\1{se.WORD_JOINER}⸺""", xhtml)

    # Fix some possible errors introduced by the above
    xhtml = regex.sub(fr"((?:[Nn]o\.|[Nn]umber)\s[0-9]+){se.NO_BREAK_SPACE}<abbr>in\.</abbr>", r"\1 in", xhtml)

    # We may have added HTML tags within title tags. Remove those here
    matches = regex.findall(r"<title>.+?</title>", xhtml)
    if matches:
        xhtml = regex.sub(r"<title>.+?</title>", f"<title>{se.formatting.remove_tags(matches[0])}</title>", xhtml)

    return xhtml


def semanticate_old(xhtml: str) -> str:
"""
Add semantics to well-formed XHTML
INPUTS
xhtml: A string of well-formed XHTML
OUTPUTS
A string of XHTML with semantics added.
"""

# Some common abbreviations
xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))(\L<titles>\.)", r"""<abbr epub:type="z3998:name-title">\1</abbr>""", xhtml, titles=[
"Capt",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -170,4 +170,4 @@
<p>His name was abbreviated <abbr epub:type="z3998:given-name" class="eoc">Chas.</abbr></p>
</section>
</body>
</html>
</html>
Original file line number Diff line number Diff line change
Expand Up @@ -170,4 +170,4 @@
<p>His name was abbreviated <abbr epub:type="z3998:given-name" class="eoc">Chas.</abbr></p>
</section>
</body>
</html>
</html>

0 comments on commit b4e9357

Please sign in to comment.