Skip to content

Commit

Permalink
allow no tags in XML input
Browse files Browse the repository at this point in the history
  • Loading branch information
lukavdplas committed May 1, 2024
1 parent 3065d00 commit 33637da
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 3 deletions.
4 changes: 4 additions & 0 deletions ianalyzer_readers/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,8 @@ class XML(Extractor):
tags:
Tags to select. Each of these can be a `Tag` object, or a callable that
takes the document metadata as input and returns a `Tag`.
If no tags are provided, the extractor will work from the starting tag.
Tags represent a query to select tags from the current tag (e.g. the entry tag of
the document). If you provide multiple, they are chained: each Tag query is
Expand Down Expand Up @@ -341,6 +343,8 @@ def _select(self, tags: Iterable[TagSpecification], soup: bs4.PageElement, metad
tag = resolve_tag_specification(tags[0], metadata)
for result in tag.find_in_soup(soup):
yield result
else:
yield soup


def _apply(self, soup_top, soup_entry, *nargs, **kwargs):
Expand Down
6 changes: 3 additions & 3 deletions tests/xml/test_xml_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def test_xml_transform(tmpdir):


def test_xml_no_tag(tmpdir):
extractor = XML(CurrentTag())
extractor = XML()
reader = make_test_reader(extractor, Tag('play'), Tag('character'), basic_doc, tmpdir)
assert_extractor_output(reader, 'HAMLET')

Expand All @@ -80,7 +80,7 @@ def test_xml_no_tag(tmpdir):


def test_xml_attribute(tmpdir):
extractor = XML(CurrentTag(), attribute='character')
extractor = XML(attribute='character')
reader = make_test_reader(extractor, Tag('play'), Tag('lines'), doc_with_attribute, tmpdir)
assert_extractor_output(reader, 'HAMLET')

Expand All @@ -100,7 +100,7 @@ def test_xml_attribute(tmpdir):
'''

def test_xml_flatten(tmpdir):
extractor = XML(CurrentTag(), flatten=True)
extractor = XML(flatten=True)
reader = make_test_reader(extractor, Tag('play'), Tag('lines'), doc_multiline, tmpdir)
expected = 'My hour is almost come, When I to sulph\'rous and tormenting flames Must render up myself.'
assert_extractor_output(reader, expected)
Expand Down

0 comments on commit 33637da

Please sign in to comment.