Skip to content

Commit

Permalink
TEI: address change in LXML addnext method (#484)
Browse files Browse the repository at this point in the history
* Use insert() instead of addnext()

* Handle tail on parent and new sibling

* Adjust tests

Some tails on elements now contain whitespace instead of None; the affected test cases were changed
to reflect this behaviour.

* Simplify code and address warning

---------

Co-authored-by: Luise Köhler <[email protected]>
Co-authored-by: Adrien Barbaresi <[email protected]>
  • Loading branch information
3 people authored Jan 23, 2024
1 parent eec05b2 commit c703271
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 11 deletions.
10 changes: 5 additions & 5 deletions tests/xml_tei_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,9 +316,9 @@ def test_ab_with_p_parent_resolved():
</TEI>"""
)
cleaned = check_tei(xml_doc, "fake_url")
result = [(elem.tag, elem.text, elem.tail) for elem in xml_doc.iter(["p", "ab"])]
result = [(elem.tag, elem.text, elem.tail if elem.tail is None else elem.tail.strip()) for elem in xml_doc.iter(["p", "ab"])]
assert result == [
("p", "text1", None),
("p", "text1", ""),
("ab", "text2", None),
("p", "text3", None),
("ab", "text4", None),
Expand All @@ -339,10 +339,10 @@ def test_ab_with_p_parent_resolved():
</TEI>"""
)
cleaned = check_tei(xml_doc, "fake_url")
result = [(elem.tag, elem.text, elem.tail) for elem in xml_doc.iter(["p", "ab"])]
result = [(elem.tag, elem.text, elem.tail if elem.tail is None else elem.tail.strip()) for elem in xml_doc.iter(["p", "ab"])]
assert result == [
("p", "text0", None),
("ab", "text1", None),
("p", "text0", ""),
("ab", "text1", ""),
("p", None, None),
("ab", "text3", None),
("p", "text4", None),
Expand Down
24 changes: 18 additions & 6 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ def replace_element_text(element, include_formatting):
def merge_with_parent(element, include_formatting=False):
'''Merge element with its parent and convert formatting to markdown.'''
parent = element.getparent()
if not parent:
if parent is None:
return

full_text = replace_element_text(element, include_formatting)
Expand Down Expand Up @@ -491,16 +491,28 @@ def _wrap_unwanted_siblings_of_div(div_element):


def _move_element_one_level_up(element):
"""
Fix TEI compatibility issues by moving certain p-elems up in the XML tree.
There is always a n+2 nesting for p-elements with the minimal structure ./TEI/text/body/p
"""
parent = element.getparent()
grand_parent = parent.getparent()

new_elem = Element("p")
new_elem.extend(sibling for sibling in element.itersiblings())

parent.addnext(element)
grand_parent.insert(grand_parent.index(parent) + 1, element)

if element.tail is not None and element.tail.strip():
if element.tail and element.tail.strip():
new_elem.text = element.tail.strip()
element.tail = None
if len(new_elem) != 0 or new_elem.text:
element.addnext(new_elem)

if parent.tail and parent.tail.strip():
new_elem.tail = parent.tail.strip()
parent.tail = None

if len(new_elem) != 0 or new_elem.text or new_elem.tail:
grand_parent.insert(grand_parent.index(element) + 1, new_elem)

if len(parent) == 0 and parent.text is None:
parent.getparent().remove(parent)
grand_parent.remove(parent)

0 comments on commit c703271

Please sign in to comment.