Skip to content

Commit

Permalink
Merge branch 'main' into austin/partition_csv_fix
Browse files (browse the repository at this point in the history)
  • Loading branch information
awalker4 authored Dec 13, 2023
2 parents aa488db + 74d089d commit e294bde
Show file tree
Hide file tree
Showing 8 changed files with 40 additions and 113 deletions.
43 changes: 2 additions & 41 deletions test_unstructured/chunking/test_title.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import pytest

from unstructured.chunking.title import (
_NonTextSection,
_SectionCombiner,
_split_elements_by_title_and_table,
_TableSection,
Expand Down Expand Up @@ -233,11 +232,9 @@ def test_split_elements_by_title_and_table():
Title("A Bad Day"),
Text("Today is a bad day."),
Text("It is storming outside."),
CheckBox(),
]
# --
section = next(sections)
assert isinstance(section, _NonTextSection)
# --
with pytest.raises(StopIteration):
next(sections)

Expand Down Expand Up @@ -273,7 +270,6 @@ def test_chunk_by_title():
CompositeElement(
"A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
),
CheckBox(),
]
assert chunks[0].metadata == ElementMetadata(emphasized_text_contents=["Day", "day"])
assert chunks[3].metadata == ElementMetadata(
Expand Down Expand Up @@ -315,7 +311,6 @@ def test_chunk_by_title_respects_section_change():
CompositeElement(
"A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
),
CheckBox(),
]


Expand Down Expand Up @@ -352,7 +347,6 @@ def test_chunk_by_title_separates_by_page_number():
CompositeElement(
"A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
),
CheckBox(),
]


Expand Down Expand Up @@ -470,7 +464,6 @@ def test_chunk_by_title_groups_across_pages():
CompositeElement(
"A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
),
CheckBox(),
]


Expand Down Expand Up @@ -703,21 +696,6 @@ def test_it_considers_separator_length_when_sectioning():
# == Sections ====================================================================================


class Describe_NonTextSection:
"""Unit-test suite for `unstructured.chunking.title._NonTextSection objects."""

def it_iterates_its_element_as_the_sole_chunk(self):
checkbox = CheckBox()
section = _NonTextSection(checkbox)

chunk_iter = section.iter_chunks(maxlen=500)

chunk = next(chunk_iter)
assert isinstance(chunk, CheckBox)
with pytest.raises(StopIteration):
next(chunk_iter)


class Describe_TableSection:
"""Unit-test suite for `unstructured.chunking.title._TableSection objects."""

Expand Down Expand Up @@ -1240,7 +1218,7 @@ def it_combines_sequential_small_text_sections(self):
with pytest.raises(StopIteration):
next(section_iter)

def but_it_does_not_combine_table_or_non_text_sections(self):
def but_it_does_not_combine_table_sections(self):
sections = [
_TextSection(
[
Expand All @@ -1255,13 +1233,6 @@ def but_it_does_not_combine_table_or_non_text_sections(self):
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
]
),
_NonTextSection(CheckBox()),
_TextSection(
[
Title("Sed Orci"),
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
]
),
]

section_iter = _SectionCombiner(
Expand All @@ -1286,16 +1257,6 @@ def but_it_does_not_combine_table_or_non_text_sections(self):
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
]
# --
section = next(section_iter)
assert isinstance(section, _NonTextSection)
# --
section = next(section_iter)
assert isinstance(section, _TextSection)
assert section._elements == [
Title("Sed Orci"),
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
]
# --
with pytest.raises(StopIteration):
next(section_iter)

Expand Down
1 change: 1 addition & 0 deletions test_unstructured/documents/test_elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ def test_element_to_dict():
},
},
"type": None,
"text": "",
"element_id": "awt32t1",
}

Expand Down
4 changes: 2 additions & 2 deletions test_unstructured/documents/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import os
import pathlib
from typing import Dict, List, cast
from typing import Dict, List

import pytest
from lxml import etree
Expand Down Expand Up @@ -218,7 +218,7 @@ def test_it_provides_parseable_HTML_in_text_as_html():
def test_it_does_not_extract_text_in_script_tags():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-with-scripts.html")
doc = HTMLDocument.from_file(filename=filename)
assert all("function (" not in element.text for element in cast(List[Text], doc.elements))
assert all("function (" not in element.text for element in doc.elements)


def test_it_does_not_extract_text_in_style_tags():
Expand Down
6 changes: 3 additions & 3 deletions test_unstructured/partition/docx/test_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pathlib
import re
from tempfile import SpooledTemporaryFile
from typing import Dict, List, cast
from typing import Dict, List

import docx
import pytest
Expand Down Expand Up @@ -293,7 +293,7 @@ def test_partition_docx_raises_with_neither():

def test_parition_docx_from_team_chat():
"""Docx with no sections partitions recognizing both paragraphs and tables."""
elements = cast(List[Text], partition_docx(example_doc_path("teams_chat.docx")))
elements = partition_docx(example_doc_path("teams_chat.docx"))
assert [e.text for e in elements] == [
"0:0:0.0 --> 0:0:1.510\nSome Body\nOK. Yeah.",
"0:0:3.270 --> 0:0:4.250\nJames Bond\nUmm.",
Expand Down Expand Up @@ -681,7 +681,7 @@ def test_partition_docx_raises_TypeError_for_invalid_languages():


def test_partition_docx_includes_hyperlink_metadata():
elements = cast(List[Text], partition_docx(example_doc_path("hlink-meta.docx")))
elements = partition_docx(example_doc_path("hlink-meta.docx"))

# -- regular paragraph, no hyperlinks --
element = elements[0]
Expand Down
19 changes: 6 additions & 13 deletions test_unstructured/partition/pptx/test_pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import os
import pathlib
from typing import Iterator, Sequence, cast

import pptx
import pytest
Expand Down Expand Up @@ -108,12 +107,9 @@ class DescribePptxPartitionerShapeOrderingBehaviors:
"""Tests related to shape inclusion and ordering based on position."""

def it_recurses_into_group_shapes(self):
elements = cast(
Iterator[Text],
_PptxPartitioner(
get_test_file_path("group-shapes-nested.pptx"),
)._iter_presentation_elements(),
)
elements = _PptxPartitioner(
get_test_file_path("group-shapes-nested.pptx")
)._iter_presentation_elements()

assert [e.text for e in elements] == ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]

Expand Down Expand Up @@ -254,7 +250,7 @@ def test_partition_pptx_orders_elements(tmp_path: pathlib.Path):

def test_partition_pptx_grabs_tables():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-table.pptx")
elements = cast(Sequence[Text], partition_pptx(filename=filename))
elements = partition_pptx(filename=filename)

assert elements[1].text.startswith("Column 1")
assert elements[1].text.strip().endswith("Aqua")
Expand All @@ -271,10 +267,7 @@ def test_partition_pptx_grabs_tables():
)
def test_partition_pptx_infer_table_structure(infer_table_structure):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-table.pptx")
elements = cast(
Sequence[Text],
partition_pptx(filename=filename, infer_table_structure=infer_table_structure),
)
elements = partition_pptx(filename=filename, infer_table_structure=infer_table_structure)
table_element_has_text_as_html_field = (
hasattr(elements[1].metadata, "text_as_html")
and elements[1].metadata.text_as_html is not None
Expand All @@ -284,7 +277,7 @@ def test_partition_pptx_infer_table_structure(infer_table_structure):

def test_partition_pptx_malformed():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
elements = cast(Sequence[Text], partition_pptx(filename=filename))
elements = partition_pptx(filename=filename)

assert elements[0].text == "Problem Date Placeholder"
assert elements[1].text == "Test Slide"
Expand Down
36 changes: 10 additions & 26 deletions test_unstructured/partition/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@
import json
import os
import pathlib
from typing import Optional, Sequence, Type, cast
from typing import Optional, Type

import pytest
from pytest_mock import MockerFixture

from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from unstructured.chunking.title import chunk_by_title
from unstructured.cleaners.core import group_broken_paragraphs
from unstructured.documents.elements import Address, ListItem, NarrativeText, Text, Title
from unstructured.documents.elements import Address, ListItem, NarrativeText, Title
from unstructured.partition.text import (
_combine_paragraphs_less_than_min,
_split_content_to_fit_max,
Expand Down Expand Up @@ -256,16 +256,16 @@ def test_partition_text_extract_regex_metadata():

def test_partition_text_splits_long_text():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
elements = cast(Sequence[Text], partition_text(filename=filename))
elements = partition_text(filename=filename)
assert len(elements) > 0
assert elements[0].text.startswith("Iwan Roberts")
assert elements[-1].text.endswith("External links")


def test_partition_text_splits_long_text_max_partition():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
elements = cast(Sequence[Text], partition_text(filename=filename))
elements_max_part = cast(Sequence[Text], partition_text(filename=filename, max_partition=500))
elements = partition_text(filename=filename)
elements_max_part = partition_text(filename=filename, max_partition=500)
# NOTE(klaijan) - I edited the operation here from < to <=
# Please revert back if this does not make sense
assert len(elements) <= len(elements_max_part)
Expand All @@ -278,11 +278,8 @@ def test_partition_text_splits_long_text_max_partition():

def test_partition_text_splits_max_min_partition():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
elements = cast(Sequence[Text], partition_text(filename=filename))
elements_max_part = cast(
Sequence[Text],
partition_text(filename=filename, min_partition=1000, max_partition=1500),
)
elements = partition_text(filename=filename)
elements_max_part = partition_text(filename=filename, min_partition=1000, max_partition=1500)
for i, element in enumerate(elements_max_part):
# NOTE(robinson) - the last element does not have a next element to merge with,
# so it can be short
Expand Down Expand Up @@ -314,27 +311,14 @@ def test_partition_text_splits_max_min_partition():


def test_partition_text_min_max():
segments = cast(
Sequence[Text],
partition_text(
text=SHORT_PARAGRAPHS,
min_partition=6,
),
)
segments = partition_text(text=SHORT_PARAGRAPHS, min_partition=6)
for i, segment in enumerate(segments):
# NOTE(robinson) - the last element does not have a next element to merge with,
# so it can be short
if i < len(segments) - 1:
assert len(segment.text) >= 6

segments = cast(
Sequence[Text],
partition_text(
text=SHORT_PARAGRAPHS,
max_partition=20,
min_partition=7,
),
)
segments = partition_text(text=SHORT_PARAGRAPHS, max_partition=20, min_partition=7)
for i, segment in enumerate(segments):
# NOTE(robinson) - the last element does not have a next element to merge with,
# so it can be short
Expand Down Expand Up @@ -368,7 +352,7 @@ def test_combine_paragraphs_less_than_min():

def test_partition_text_doesnt_get_page_breaks():
text = "--------------------"
elements = cast(Sequence[Text], partition_text(text=text))
elements = partition_text(text=text)
assert len(elements) == 1
assert elements[0].text == text
assert not isinstance(elements[0], ListItem)
Expand Down
Loading

0 comments on commit e294bde

Please sign in to comment.