Merge branch 'main' into austin/partition_csv_fix

Unstructured-IO · Dec 13, 2023 · e294bde · e294bde
2 parents aa488db + 74d089d
commit e294bde
Show file tree

Hide file tree

Showing 8 changed files with 40 additions and 113 deletions.
diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py
@@ -5,7 +5,6 @@
 import pytest
 
 from unstructured.chunking.title import (
-    _NonTextSection,
     _SectionCombiner,
     _split_elements_by_title_and_table,
     _TableSection,
@@ -233,11 +232,9 @@ def test_split_elements_by_title_and_table():
         Title("A Bad Day"),
         Text("Today is a bad day."),
         Text("It is storming outside."),
+        CheckBox(),
     ]
     # --
-    section = next(sections)
-    assert isinstance(section, _NonTextSection)
-    # --
     with pytest.raises(StopIteration):
         next(sections)
 
@@ -273,7 +270,6 @@ def test_chunk_by_title():
         CompositeElement(
             "A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
         ),
-        CheckBox(),
     ]
     assert chunks[0].metadata == ElementMetadata(emphasized_text_contents=["Day", "day"])
     assert chunks[3].metadata == ElementMetadata(
@@ -315,7 +311,6 @@ def test_chunk_by_title_respects_section_change():
         CompositeElement(
             "A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
         ),
-        CheckBox(),
     ]
 
 
@@ -352,7 +347,6 @@ def test_chunk_by_title_separates_by_page_number():
         CompositeElement(
             "A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
         ),
-        CheckBox(),
     ]
 
 
@@ -470,7 +464,6 @@ def test_chunk_by_title_groups_across_pages():
         CompositeElement(
             "A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
         ),
-        CheckBox(),
     ]
 
 
@@ -703,21 +696,6 @@ def test_it_considers_separator_length_when_sectioning():
 # == Sections ====================================================================================
 
 
-class Describe_NonTextSection:
-    """Unit-test suite for `unstructured.chunking.title._NonTextSection objects."""
-
-    def it_iterates_its_element_as_the_sole_chunk(self):
-        checkbox = CheckBox()
-        section = _NonTextSection(checkbox)
-
-        chunk_iter = section.iter_chunks(maxlen=500)
-
-        chunk = next(chunk_iter)
-        assert isinstance(chunk, CheckBox)
-        with pytest.raises(StopIteration):
-            next(chunk_iter)
-
-
 class Describe_TableSection:
     """Unit-test suite for `unstructured.chunking.title._TableSection objects."""
 
@@ -1240,7 +1218,7 @@ def it_combines_sequential_small_text_sections(self):
         with pytest.raises(StopIteration):
             next(section_iter)
 
-    def but_it_does_not_combine_table_or_non_text_sections(self):
+    def but_it_does_not_combine_table_sections(self):
         sections = [
             _TextSection(
                 [
@@ -1255,13 +1233,6 @@ def but_it_does_not_combine_table_or_non_text_sections(self):
                     Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
                 ]
             ),
-            _NonTextSection(CheckBox()),
-            _TextSection(
-                [
-                    Title("Sed Orci"),
-                    Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
-                ]
-            ),
         ]
 
         section_iter = _SectionCombiner(
@@ -1286,16 +1257,6 @@ def but_it_does_not_combine_table_or_non_text_sections(self):
             Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
         ]
         # --
-        section = next(section_iter)
-        assert isinstance(section, _NonTextSection)
-        # --
-        section = next(section_iter)
-        assert isinstance(section, _TextSection)
-        assert section._elements == [
-            Title("Sed Orci"),
-            Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
-        ]
-        # --
         with pytest.raises(StopIteration):
             next(section_iter)
 

diff --git a/test_unstructured/documents/test_elements.py b/test_unstructured/documents/test_elements.py
@@ -209,6 +209,7 @@ def test_element_to_dict():
             },
         },
         "type": None,
+        "text": "",
         "element_id": "awt32t1",
     }
 

diff --git a/test_unstructured/documents/test_html.py b/test_unstructured/documents/test_html.py
@@ -2,7 +2,7 @@
 
 import os
 import pathlib
-from typing import Dict, List, cast
+from typing import Dict, List
 
 import pytest
 from lxml import etree
@@ -218,7 +218,7 @@ def test_it_provides_parseable_HTML_in_text_as_html():
 def test_it_does_not_extract_text_in_script_tags():
     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-with-scripts.html")
     doc = HTMLDocument.from_file(filename=filename)
-    assert all("function (" not in element.text for element in cast(List[Text], doc.elements))
+    assert all("function (" not in element.text for element in doc.elements)
 
 
 def test_it_does_not_extract_text_in_style_tags():

diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/docx/test_docx.py
@@ -3,7 +3,7 @@
 import pathlib
 import re
 from tempfile import SpooledTemporaryFile
-from typing import Dict, List, cast
+from typing import Dict, List
 
 import docx
 import pytest
@@ -293,7 +293,7 @@ def test_partition_docx_raises_with_neither():
 
 def test_parition_docx_from_team_chat():
     """Docx with no sections partitions recognizing both paragraphs and tables."""
-    elements = cast(List[Text], partition_docx(example_doc_path("teams_chat.docx")))
+    elements = partition_docx(example_doc_path("teams_chat.docx"))
     assert [e.text for e in elements] == [
         "0:0:0.0 --> 0:0:1.510\nSome Body\nOK. Yeah.",
         "0:0:3.270 --> 0:0:4.250\nJames Bond\nUmm.",
@@ -681,7 +681,7 @@ def test_partition_docx_raises_TypeError_for_invalid_languages():
 
 
 def test_partition_docx_includes_hyperlink_metadata():
-    elements = cast(List[Text], partition_docx(example_doc_path("hlink-meta.docx")))
+    elements = partition_docx(example_doc_path("hlink-meta.docx"))
 
     # -- regular paragraph, no hyperlinks --
     element = elements[0]

diff --git a/test_unstructured/partition/pptx/test_pptx.py b/test_unstructured/partition/pptx/test_pptx.py
@@ -4,7 +4,6 @@
 
 import os
 import pathlib
-from typing import Iterator, Sequence, cast
 
 import pptx
 import pytest
@@ -108,12 +107,9 @@ class DescribePptxPartitionerShapeOrderingBehaviors:
     """Tests related to shape inclusion and ordering based on position."""
 
     def it_recurses_into_group_shapes(self):
-        elements = cast(
-            Iterator[Text],
-            _PptxPartitioner(
-                get_test_file_path("group-shapes-nested.pptx"),
-            )._iter_presentation_elements(),
-        )
+        elements = _PptxPartitioner(
+            get_test_file_path("group-shapes-nested.pptx")
+        )._iter_presentation_elements()
 
         assert [e.text for e in elements] == ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]
 
@@ -254,7 +250,7 @@ def test_partition_pptx_orders_elements(tmp_path: pathlib.Path):
 
 def test_partition_pptx_grabs_tables():
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-table.pptx")
-    elements = cast(Sequence[Text], partition_pptx(filename=filename))
+    elements = partition_pptx(filename=filename)
 
     assert elements[1].text.startswith("Column 1")
     assert elements[1].text.strip().endswith("Aqua")
@@ -271,10 +267,7 @@ def test_partition_pptx_grabs_tables():
 )
 def test_partition_pptx_infer_table_structure(infer_table_structure):
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-table.pptx")
-    elements = cast(
-        Sequence[Text],
-        partition_pptx(filename=filename, infer_table_structure=infer_table_structure),
-    )
+    elements = partition_pptx(filename=filename, infer_table_structure=infer_table_structure)
     table_element_has_text_as_html_field = (
         hasattr(elements[1].metadata, "text_as_html")
         and elements[1].metadata.text_as_html is not None
@@ -284,7 +277,7 @@ def test_partition_pptx_infer_table_structure(infer_table_structure):
 
 def test_partition_pptx_malformed():
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
-    elements = cast(Sequence[Text], partition_pptx(filename=filename))
+    elements = partition_pptx(filename=filename)
 
     assert elements[0].text == "Problem Date Placeholder"
     assert elements[1].text == "Test Slide"

diff --git a/test_unstructured/partition/test_text.py b/test_unstructured/partition/test_text.py
@@ -5,15 +5,15 @@
 import json
 import os
 import pathlib
-from typing import Optional, Sequence, Type, cast
+from typing import Optional, Type
 
 import pytest
 from pytest_mock import MockerFixture
 
 from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
 from unstructured.chunking.title import chunk_by_title
 from unstructured.cleaners.core import group_broken_paragraphs
-from unstructured.documents.elements import Address, ListItem, NarrativeText, Text, Title
+from unstructured.documents.elements import Address, ListItem, NarrativeText, Title
 from unstructured.partition.text import (
     _combine_paragraphs_less_than_min,
     _split_content_to_fit_max,
@@ -256,16 +256,16 @@ def test_partition_text_extract_regex_metadata():
 
 def test_partition_text_splits_long_text():
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
-    elements = cast(Sequence[Text], partition_text(filename=filename))
+    elements = partition_text(filename=filename)
     assert len(elements) > 0
     assert elements[0].text.startswith("Iwan Roberts")
     assert elements[-1].text.endswith("External links")
 
 
 def test_partition_text_splits_long_text_max_partition():
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
-    elements = cast(Sequence[Text], partition_text(filename=filename))
-    elements_max_part = cast(Sequence[Text], partition_text(filename=filename, max_partition=500))
+    elements = partition_text(filename=filename)
+    elements_max_part = partition_text(filename=filename, max_partition=500)
     # NOTE(klaijan) - I edited the operation here from < to <=
     # Please revert back if this does not make sense
     assert len(elements) <= len(elements_max_part)
@@ -278,11 +278,8 @@ def test_partition_text_splits_long_text_max_partition():
 
 def test_partition_text_splits_max_min_partition():
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
-    elements = cast(Sequence[Text], partition_text(filename=filename))
-    elements_max_part = cast(
-        Sequence[Text],
-        partition_text(filename=filename, min_partition=1000, max_partition=1500),
-    )
+    elements = partition_text(filename=filename)
+    elements_max_part = partition_text(filename=filename, min_partition=1000, max_partition=1500)
     for i, element in enumerate(elements_max_part):
         # NOTE(robinson) - the last element does not have a next element to merge with,
         # so it can be short
@@ -314,27 +311,14 @@ def test_partition_text_splits_max_min_partition():
 
 
 def test_partition_text_min_max():
-    segments = cast(
-        Sequence[Text],
-        partition_text(
-            text=SHORT_PARAGRAPHS,
-            min_partition=6,
-        ),
-    )
+    segments = partition_text(text=SHORT_PARAGRAPHS, min_partition=6)
     for i, segment in enumerate(segments):
         # NOTE(robinson) - the last element does not have a next element to merge with,
         # so it can be short
         if i < len(segments) - 1:
             assert len(segment.text) >= 6
 
-    segments = cast(
-        Sequence[Text],
-        partition_text(
-            text=SHORT_PARAGRAPHS,
-            max_partition=20,
-            min_partition=7,
-        ),
-    )
+    segments = partition_text(text=SHORT_PARAGRAPHS, max_partition=20, min_partition=7)
     for i, segment in enumerate(segments):
         # NOTE(robinson) - the last element does not have a next element to merge with,
         # so it can be short
@@ -368,7 +352,7 @@ def test_combine_paragraphs_less_than_min():
 
 def test_partition_text_doesnt_get_page_breaks():
     text = "--------------------"
-    elements = cast(Sequence[Text], partition_text(text=text))
+    elements = partition_text(text=text)
     assert len(elements) == 1
     assert elements[0].text == text
     assert not isinstance(elements[0], ListItem)
-Original file line number
+Diff line change
@@ -209,6 +209,7 @@ def test_element_to_dict():
                 },
             },
             "type": None,
+            "text": "",
             "element_id": "awt32t1",
         }
@@ Expand Down @@