diff --git a/ianalyzer_readers/extract.py b/ianalyzer_readers/extract.py index 7527dbb..2aa9a06 100644 --- a/ianalyzer_readers/extract.py +++ b/ianalyzer_readers/extract.py @@ -274,6 +274,8 @@ class XML(Extractor): tags: Tags to select. Each of these can be a `Tag` object, or a callable that takes the document metadata as input and returns a `Tag`. + + If no tags are provided, the extractor will work from the starting tag. Tags represent a query to select tags from current tag (e.g. the entry tag of the document). If you provide multiple, they are chained: each Tag query is @@ -341,6 +343,8 @@ def _select(self, tags: Iterable[TagSpecification], soup: bs4.PageElement, metad tag = resolve_tag_specification(tags[0], metadata) for result in tag.find_in_soup(soup): yield result + else: + yield soup def _apply(self, soup_top, soup_entry, *nargs, **kwargs): diff --git a/tests/xml/test_xml_extraction.py b/tests/xml/test_xml_extraction.py index 88bf693..ac15313 100644 --- a/tests/xml/test_xml_extraction.py +++ b/tests/xml/test_xml_extraction.py @@ -63,7 +63,7 @@ def test_xml_transform(tmpdir): def test_xml_no_tag(tmpdir): - extractor = XML(CurrentTag()) + extractor = XML() reader = make_test_reader(extractor, Tag('play'), Tag('character'), basic_doc, tmpdir) assert_extractor_output(reader, 'HAMLET') @@ -80,7 +80,7 @@ def test_xml_no_tag(tmpdir): def test_xml_attribute(tmpdir): - extractor = XML(CurrentTag(), attribute='character') + extractor = XML(attribute='character') reader = make_test_reader(extractor, Tag('play'), Tag('lines'), doc_with_attribute, tmpdir) assert_extractor_output(reader, 'HAMLET') @@ -100,7 +100,7 @@ def test_xml_attribute(tmpdir): ''' def test_xml_flatten(tmpdir): - extractor = XML(CurrentTag(), flatten=True) + extractor = XML(flatten=True) reader = make_test_reader(extractor, Tag('play'), Tag('lines'), doc_multiline, tmpdir) expected = 'My hour is almost come, When I to sulph\'rous and tormenting flames Must render up myself.' 
assert_extractor_output(reader, expected)