diff --git a/CHANGELOG.md b/CHANGELOG.md index 6da9dda568..6525da7ee4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.11.4-dev9 +## 0.11.4-dev10 ### Enhancements diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py index 68785a1996..2934b05473 100644 --- a/test_unstructured/chunking/test_title.py +++ b/test_unstructured/chunking/test_title.py @@ -5,7 +5,6 @@ import pytest from unstructured.chunking.title import ( - _NonTextSection, _SectionCombiner, _split_elements_by_title_and_table, _TableSection, @@ -233,11 +232,9 @@ def test_split_elements_by_title_and_table(): Title("A Bad Day"), Text("Today is a bad day."), Text("It is storming outside."), + CheckBox(), ] # -- - section = next(sections) - assert isinstance(section, _NonTextSection) - # -- with pytest.raises(StopIteration): next(sections) @@ -273,7 +270,6 @@ def test_chunk_by_title(): CompositeElement( "A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.", ), - CheckBox(), ] assert chunks[0].metadata == ElementMetadata(emphasized_text_contents=["Day", "day"]) assert chunks[3].metadata == ElementMetadata( @@ -315,7 +311,6 @@ def test_chunk_by_title_respects_section_change(): CompositeElement( "A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.", ), - CheckBox(), ] @@ -352,7 +347,6 @@ def test_chunk_by_title_separates_by_page_number(): CompositeElement( "A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.", ), - CheckBox(), ] @@ -470,7 +464,6 @@ def test_chunk_by_title_groups_across_pages(): CompositeElement( "A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.", ), - CheckBox(), ] @@ -703,21 +696,6 @@ def test_it_considers_separator_length_when_sectioning(): # == Sections ==================================================================================== -class Describe_NonTextSection: - """Unit-test suite for `unstructured.chunking.title._NonTextSection objects.""" - - def it_iterates_its_element_as_the_sole_chunk(self): - checkbox = CheckBox() - section = _NonTextSection(checkbox) - - chunk_iter = section.iter_chunks(maxlen=500) - - chunk = next(chunk_iter) - assert isinstance(chunk, CheckBox) - with pytest.raises(StopIteration): - next(chunk_iter) - - class Describe_TableSection: """Unit-test suite for `unstructured.chunking.title._TableSection objects.""" @@ -1240,7 +1218,7 @@ def it_combines_sequential_small_text_sections(self): with pytest.raises(StopIteration): next(section_iter) - def but_it_does_not_combine_table_or_non_text_sections(self): + def but_it_does_not_combine_table_sections(self): sections = [ _TextSection( [ @@ -1255,13 +1233,6 @@ def but_it_does_not_combine_table_or_non_text_sections(self): Text("Mauris nec urna non augue vulputate consequat eget et nisi."), ] ), - _NonTextSection(CheckBox()), - _TextSection( - [ - Title("Sed Orci"), - Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), - ] - ), ] section_iter = _SectionCombiner( @@ -1286,16 +1257,6 @@ def but_it_does_not_combine_table_or_non_text_sections(self): Text("Mauris nec urna non augue vulputate consequat eget et nisi."), ] # -- - section = next(section_iter) - assert isinstance(section, _NonTextSection) - # -- - section = next(section_iter) - assert isinstance(section, _TextSection) - assert section._elements == [ - Title("Sed Orci"), - Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), - ] - # -- with pytest.raises(StopIteration): next(section_iter) diff --git a/test_unstructured/documents/test_elements.py b/test_unstructured/documents/test_elements.py index bf8bc7a146..2487591e35 100644 --- a/test_unstructured/documents/test_elements.py +++ b/test_unstructured/documents/test_elements.py @@ -209,6 +209,7 @@ def test_element_to_dict(): }, }, "type": None, + "text": "", "element_id": "awt32t1", } diff --git a/test_unstructured/documents/test_html.py b/test_unstructured/documents/test_html.py index 3464275270..01c1201559 100644 --- a/test_unstructured/documents/test_html.py +++ b/test_unstructured/documents/test_html.py @@ -2,7 +2,7 @@ import os import pathlib -from typing import Dict, List, cast +from typing import Dict, List import pytest from lxml import etree @@ -218,7 +218,7 @@ def test_it_provides_parseable_HTML_in_text_as_html(): def test_it_does_not_extract_text_in_script_tags(): filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-with-scripts.html") doc = HTMLDocument.from_file(filename=filename) - assert all("function (" not in element.text for element in cast(List[Text], doc.elements)) + assert all("function (" not in element.text for element in doc.elements) def test_it_does_not_extract_text_in_style_tags(): diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/docx/test_docx.py index fab744449b..778fea4dc3 100644 --- a/test_unstructured/partition/docx/test_docx.py +++ b/test_unstructured/partition/docx/test_docx.py @@ -3,7 +3,7 @@ import pathlib import re from tempfile import SpooledTemporaryFile -from typing import Dict, List, cast +from typing import Dict, List import docx import pytest @@ -293,7 +293,7 @@ def test_partition_docx_raises_with_neither(): def test_parition_docx_from_team_chat(): """Docx with no sections partitions recognizing both paragraphs and tables.""" - elements = cast(List[Text], partition_docx(example_doc_path("teams_chat.docx"))) + elements = partition_docx(example_doc_path("teams_chat.docx")) assert [e.text for e in elements] == [ "0:0:0.0 --> 0:0:1.510\nSome Body\nOK. Yeah.", "0:0:3.270 --> 0:0:4.250\nJames Bond\nUmm.", @@ -681,7 +681,7 @@ def test_partition_docx_raises_TypeError_for_invalid_languages(): def test_partition_docx_includes_hyperlink_metadata(): - elements = cast(List[Text], partition_docx(example_doc_path("hlink-meta.docx"))) + elements = partition_docx(example_doc_path("hlink-meta.docx")) # -- regular paragraph, no hyperlinks -- element = elements[0] diff --git a/test_unstructured/partition/pptx/test_pptx.py b/test_unstructured/partition/pptx/test_pptx.py index 5eb72b7021..1202b18dfb 100644 --- a/test_unstructured/partition/pptx/test_pptx.py +++ b/test_unstructured/partition/pptx/test_pptx.py @@ -4,7 +4,6 @@ import os import pathlib -from typing import Iterator, Sequence, cast import pptx import pytest @@ -108,12 +107,9 @@ class DescribePptxPartitionerShapeOrderingBehaviors: """Tests related to shape inclusion and ordering based on position.""" def it_recurses_into_group_shapes(self): - elements = cast( - Iterator[Text], - _PptxPartitioner( - get_test_file_path("group-shapes-nested.pptx"), - )._iter_presentation_elements(), - ) + elements = _PptxPartitioner( + get_test_file_path("group-shapes-nested.pptx") + )._iter_presentation_elements() assert [e.text for e in elements] == ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"] @@ -254,7 +250,7 @@ def test_partition_pptx_orders_elements(tmp_path: pathlib.Path): def test_partition_pptx_grabs_tables(): filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-table.pptx") - elements = cast(Sequence[Text], partition_pptx(filename=filename)) + elements = partition_pptx(filename=filename) assert elements[1].text.startswith("Column 1") assert elements[1].text.strip().endswith("Aqua") @@ -271,10 +267,7 @@ def test_partition_pptx_grabs_tables(): ) def test_partition_pptx_infer_table_structure(infer_table_structure): filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-table.pptx") - elements = cast( - Sequence[Text], - partition_pptx(filename=filename, infer_table_structure=infer_table_structure), - ) + elements = partition_pptx(filename=filename, infer_table_structure=infer_table_structure) table_element_has_text_as_html_field = ( hasattr(elements[1].metadata, "text_as_html") and elements[1].metadata.text_as_html is not None @@ -284,7 +277,7 @@ def test_partition_pptx_infer_table_structure(infer_table_structure): def test_partition_pptx_malformed(): filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx") - elements = cast(Sequence[Text], partition_pptx(filename=filename)) + elements = partition_pptx(filename=filename) assert elements[0].text == "Problem Date Placeholder" assert elements[1].text == "Test Slide" diff --git a/test_unstructured/partition/test_text.py b/test_unstructured/partition/test_text.py index beb006d8fb..dfa29bfd13 100644 --- a/test_unstructured/partition/test_text.py +++ b/test_unstructured/partition/test_text.py @@ -5,7 +5,7 @@ import json import os import pathlib -from typing import Optional, Sequence, Type, cast +from typing import Optional, Type import pytest from pytest_mock import MockerFixture @@ -13,7 +13,7 @@ from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path from unstructured.chunking.title import chunk_by_title from unstructured.cleaners.core import group_broken_paragraphs -from unstructured.documents.elements import Address, ListItem, NarrativeText, Text, Title +from unstructured.documents.elements import Address, ListItem, NarrativeText, Title from unstructured.partition.text import ( _combine_paragraphs_less_than_min, _split_content_to_fit_max, @@ -256,7 +256,7 @@ def test_partition_text_extract_regex_metadata(): def test_partition_text_splits_long_text(): filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt") - elements = cast(Sequence[Text], partition_text(filename=filename)) + elements = partition_text(filename=filename) assert len(elements) > 0 assert elements[0].text.startswith("Iwan Roberts") assert elements[-1].text.endswith("External links") @@ -264,8 +264,8 @@ def test_partition_text_splits_long_text(): def test_partition_text_splits_long_text_max_partition(): filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt") - elements = cast(Sequence[Text], partition_text(filename=filename)) - elements_max_part = cast(Sequence[Text], partition_text(filename=filename, max_partition=500)) + elements = partition_text(filename=filename) + elements_max_part = partition_text(filename=filename, max_partition=500) # NOTE(klaijan) - I edited the operation here from < to <= # Please revert back if this does not make sense assert len(elements) <= len(elements_max_part) @@ -278,11 +278,8 @@ def test_partition_text_splits_long_text_max_partition(): def test_partition_text_splits_max_min_partition(): filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt") - elements = cast(Sequence[Text], partition_text(filename=filename)) - elements_max_part = cast( - Sequence[Text], - partition_text(filename=filename, min_partition=1000, max_partition=1500), - ) + elements = partition_text(filename=filename) + elements_max_part = partition_text(filename=filename, min_partition=1000, max_partition=1500) for i, element in enumerate(elements_max_part): # NOTE(robinson) - the last element does not have a next element to merge with, # so it can be short @@ -314,27 +311,14 @@ def test_partition_text_splits_max_min_partition(): def test_partition_text_min_max(): - segments = cast( - Sequence[Text], - partition_text( - text=SHORT_PARAGRAPHS, - min_partition=6, - ), - ) + segments = partition_text(text=SHORT_PARAGRAPHS, min_partition=6) for i, segment in enumerate(segments): # NOTE(robinson) - the last element does not have a next element to merge with, # so it can be short if i < len(segments) - 1: assert len(segment.text) >= 6 - segments = cast( - Sequence[Text], - partition_text( - text=SHORT_PARAGRAPHS, - max_partition=20, - min_partition=7, - ), - ) + segments = partition_text(text=SHORT_PARAGRAPHS, max_partition=20, min_partition=7) for i, segment in enumerate(segments): # NOTE(robinson) - the last element does not have a next element to merge with, # so it can be short @@ -368,7 +352,7 @@ def test_combine_paragraphs_less_than_min(): def test_partition_text_doesnt_get_page_breaks(): text = "--------------------" - elements = cast(Sequence[Text], partition_text(text=text)) + elements = partition_text(text=text) assert len(elements) == 1 assert elements[0].text == text assert not isinstance(elements[0], ListItem) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index e0544a08fd..6525ce9a6c 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.11.4-dev9" # pragma: no cover +__version__ = "0.11.4-dev10" # pragma: no cover diff --git a/unstructured/chunking/title.py b/unstructured/chunking/title.py index b6435d0b4a..c13591d3d3 100644 --- a/unstructured/chunking/title.py +++ b/unstructured/chunking/title.py @@ -21,12 +21,11 @@ RegexMetadata, Table, TableChunk, - Text, Title, ) from unstructured.utils import lazyproperty -_Section: TypeAlias = "_NonTextSection | _TableSection | _TextSection" +_Section: TypeAlias = "_TableSection | _TextSection" # -- goes between text of each element when element-text is concatenated to form chunk -- TEXT_SEPARATOR = "\n\n" @@ -118,7 +117,7 @@ def _split_elements_by_title_and_table( multipage_sections: bool, new_after_n_chars: int, max_characters: int, -) -> Iterator[_TextSection | _TableSection | _NonTextSection]: +) -> Iterator[_TextSection | _TableSection]: """Implements "sectioner" responsibilities. A _section_ can be thought of as a "pre-chunk", generally determining the size and contents of a @@ -155,9 +154,8 @@ def _split_elements_by_title_and_table( # -- start new section when necessary -- if ( - # -- Title, Table, and non-Text element (CheckBox) all start a new section -- + # -- Title and Table both start a new section -- isinstance(element, (Title, Table)) - or not isinstance(element, Text) # -- adding this element would exceed hard-maxlen for section -- or section_builder.remaining_space < len(str(element)) # -- section already meets or exceeds soft-maxlen -- @@ -171,8 +169,6 @@ def _split_elements_by_title_and_table( # -- emit table and checkbox immediately since they are always isolated -- if isinstance(element, Table): yield _TableSection(table=element) - elif not isinstance(element, Text): - yield _NonTextSection(element) # -- but accumulate text elements for consolidation into a composite chunk -- else: section_builder.add_element(element) @@ -262,20 +258,6 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]: # == Sections ==================================================================================== -class _NonTextSection: - """A section composed of a single `Element` that does not subclass `Text`. - - Currently, only `CheckBox` fits that description - """ - - def __init__(self, element: Element) -> None: - self._element = element - - def iter_chunks(self, maxlen: int) -> Iterator[Element]: - """Generate the non-text element of this section.""" - yield self._element - - class _TableSection: """A section composed of a single Table element.""" @@ -324,7 +306,7 @@ class _TextSection: This object is purposely immutable. """ - def __init__(self, elements: Iterable[Text]) -> None: + def __init__(self, elements: Iterable[Element]) -> None: self._elements = list(elements) def __eq__(self, other: Any) -> bool: @@ -503,7 +485,7 @@ class _TextSectionBuilder: def __init__(self, maxlen: int) -> None: self._maxlen = maxlen self._separator_len = len(TEXT_SEPARATOR) - self._elements: List[Text] = [] + self._elements: List[Element] = [] # -- these mutable working values probably represent premature optimization but improve # -- performance and I expect will be welcome when processing a million elements @@ -513,7 +495,7 @@ def __init__(self, maxlen: int) -> None: # -- combined length of text-segments, not including separators -- self._text_len: int = 0 - def add_element(self, element: Text) -> None: + def add_element(self, element: Element) -> None: """Add `element` to this section.""" self._elements.append(element) if element.text: @@ -585,8 +567,8 @@ def iter_combined_sections(self) -> Iterator[_Section]: for section in self._sections: # -- start new section under these conditions -- if ( - # -- a table or checkbox section is never combined -- - isinstance(section, (_TableSection, _NonTextSection)) + # -- a table section is never combined -- + isinstance(section, _TableSection) # -- don't add another section once length has reached combination soft-max -- or accum.text_length >= self._combine_text_under_n_chars # -- combining would exceed hard-max -- @@ -594,8 +576,8 @@ def iter_combined_sections(self) -> Iterator[_Section]: ): yield from accum.flush() - # -- a table or checkbox section is never combined so don't accumulate -- - if isinstance(section, (_TableSection, _NonTextSection)): + # -- a table section is never combined so don't accumulate -- + if isinstance(section, _TableSection): yield section else: accum.add_section(section) diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index a5370a2e16..0e152c8fa1 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -621,6 +621,8 @@ def to_dict(cls): class Element(abc.ABC): """An element is a section of a page in the document.""" + text: str + def __init__( self, element_id: Union[str, uuid.UUID, NoID, UUID] = NoID(), @@ -636,6 +638,9 @@ def __init__( points=coordinates, system=coordinate_system ) self.metadata.detection_origin = detection_origin + # -- all `Element` instances get a `text` attribute, defaults to the empty string if not + # -- defined in a subclass. + self.text = self.text if hasattr(self, "text") else "" def id_to_uuid(self): self.id = str(uuid.uuid4()) @@ -644,6 +649,7 @@ def to_dict(self) -> Dict[str, Any]: return { "type": None, "element_id": self.id, + "text": self.text, "metadata": self.metadata.to_dict(), }