Skip to content

Commit

Permalink
rfctr(doc): organize test_doc.py
Browse files Browse the repository at this point in the history
No code changes, purely line-block moves.

- Move module-level fixtures to the bottom.
- Organize tests into related groups with markers.
  • Loading branch information
scanny committed May 14, 2024
1 parent 3f8e6b7 commit 67c1d28
Show file tree
Hide file tree
Showing 3 changed files with 140 additions and 119 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.13.8-dev5
## 0.13.8-dev6

### Enhancements

Expand Down
255 changes: 138 additions & 117 deletions test_unstructured/partition/docx/test_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,59 +20,15 @@
from unstructured.partition.docx import partition_docx


def test_partition_doc_for_deterministic_and_unique_ids():
    """Element ids for the duplicate-paragraphs document match the pinned, all-distinct values."""
    expected_ids = [
        "ade273c622c48d67a7be7b3816d5b4d8",
        "7d0b32fdf169f9578723486cb4bc1235",
        "1feb6e8e9c1662cfaef75907aeeb0900",
        "aa2a8ac10143b12f0fe2087837ea11d2",
        "da31ba7ed3919067d2c6572dc1617271",
        "1914359c179a160df921b769acf8c353",
        "f9d0d379fc791bae487b7a45f65caa50",
    ]

    elements = partition_doc("example-docs/duplicate-paragraphs.doc")

    assert [e.id for e in elements] == expected_ids


@pytest.fixture()
def mock_document():
document = docx.Document()

document.add_paragraph("These are a few of my favorite things:", style="Heading 1")
# NOTE(robinson) - this should get picked up as a list item due to the •
document.add_paragraph("• Parrots", style="Normal")
# NOTE(robinson) - this should get dropped because it's empty
document.add_paragraph("• ", style="Normal")
document.add_paragraph("Hockey", style="List Bullet")
# NOTE(robinson) - this should get dropped because it's empty
document.add_paragraph("", style="List Bullet")
# NOTE(robinson) - this should get picked up as a title
document.add_paragraph("Analysis", style="Normal")
# NOTE(robinson) - this should get dropped because it is empty
document.add_paragraph("", style="Normal")
# NOTE(robinson) - this should get picked up as a narrative text
document.add_paragraph("This is my first thought. This is my second thought.", style="Normal")
document.add_paragraph("This is my third thought.", style="Body Text")
# NOTE(robinson) - this should just be regular text
document.add_paragraph("2023")
# NOTE(robinson) - this should be an address
document.add_paragraph("DOYLESTOWN, PA 18901")
def test_partition_doc_matches_partition_docx(mock_document, expected_elements, tmpdir):
    """Partitioning the converted .doc gives the same elements as partitioning the source .docx."""
    work_dir = tmpdir.dirname
    docx_path = os.path.join(work_dir, "mock_document.docx")
    doc_path = os.path.join(work_dir, "mock_document.doc")
    # -- write the .docx, then convert it so a sibling .doc exists in the same directory --
    mock_document.save(docx_path)
    convert_office_doc(docx_path, work_dir, "doc")

    elements_from_doc = partition_doc(filename=doc_path)
    elements_from_docx = partition_docx(filename=docx_path)

    assert elements_from_doc == elements_from_docx

return document


@pytest.fixture()
def expected_elements():
    """Elements the tests expect `partition_doc()` to produce for the `mock_document` document."""
    # -- (element class, text) pairs, in document order --
    element_specs = (
        (Title, "These are a few of my favorite things:"),
        (ListItem, "Parrots"),
        (ListItem, "Hockey"),
        (Title, "Analysis"),
        (NarrativeText, "This is my first thought. This is my second thought."),
        (NarrativeText, "This is my third thought."),
        (Text, "2023"),
        (Address, "DOYLESTOWN, PA 18901"),
    )
    return [element_cls(text) for element_cls, text in element_specs]
# -- document-source (file or filename) ----------------------------------------------------------


def test_partition_doc_from_filename(mock_document, expected_elements, tmpdir, capsys):
Expand All @@ -88,36 +44,6 @@ def test_partition_doc_from_filename(mock_document, expected_elements, tmpdir, c
assert capsys.readouterr().err == ""


def test_partition_doc_from_filename_with_metadata_filename(
    mock_document,
    expected_elements,
    tmpdir,
):
    """`metadata_filename` sets `.metadata.filename` on every element when loading by filename."""
    work_dir = tmpdir.dirname
    docx_path = os.path.join(work_dir, "mock_document.docx")
    doc_path = os.path.join(work_dir, "mock_document.doc")
    # -- write the .docx, then convert it so a sibling .doc exists in the same directory --
    mock_document.save(docx_path)
    convert_office_doc(docx_path, work_dir, "doc")

    elements = partition_doc(filename=doc_path, metadata_filename="test")

    assert elements == expected_elements
    for element in elements:
        assert element.metadata.filename == "test"


def test_partition_doc_matches_partition_docx(mock_document, expected_elements, tmpdir):
    """Partitioning the converted .doc gives the same elements as partitioning the source .docx."""
    work_dir = tmpdir.dirname
    docx_path = os.path.join(work_dir, "mock_document.docx")
    doc_path = os.path.join(work_dir, "mock_document.doc")
    # -- write the .docx, then convert it so a sibling .doc exists in the same directory --
    mock_document.save(docx_path)
    convert_office_doc(docx_path, work_dir, "doc")

    elements_from_doc = partition_doc(filename=doc_path)
    elements_from_docx = partition_docx(filename=docx_path)

    assert elements_from_doc == elements_from_docx


def test_partition_raises_with_missing_doc(mock_document, expected_elements, tmpdir):
    """`partition_doc()` raises `ValueError` for a .doc path this test never creates."""
    missing_doc_path = os.path.join(tmpdir.dirname, "asdf.doc")

    with pytest.raises(ValueError):
        partition_doc(filename=missing_doc_path)


def test_partition_doc_from_file_with_filter(mock_document, expected_elements, tmpdir, capsys):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
Expand Down Expand Up @@ -148,18 +74,6 @@ def test_partition_doc_from_file_with_no_filter(mock_document, expected_elements
assert element.metadata.filename is None


def test_partition_doc_from_file_with_metadata_filename(mock_document, tmpdir):
    """`metadata_filename` sets `.metadata.filename` on every element when loading from a file."""
    work_dir = tmpdir.dirname
    docx_path = os.path.join(work_dir, "mock_document.docx")
    doc_path = os.path.join(work_dir, "mock_document.doc")
    # -- write the .docx, then convert it so a sibling .doc exists in the same directory --
    mock_document.save(docx_path)
    convert_office_doc(docx_path, work_dir, "doc")

    with open(doc_path, "rb") as doc_file:
        elements = partition_doc(file=doc_file, metadata_filename="test")

    assert all(element.metadata.filename == "test" for element in elements)


def test_partition_doc_raises_with_both_specified(mock_document, tmpdir):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
Expand All @@ -175,33 +89,76 @@ def test_partition_doc_raises_with_neither():
partition_doc()


def test_partition_doc_from_file_exclude_metadata(mock_document, tmpdir):
def test_partition_raises_with_missing_doc(mock_document, expected_elements, tmpdir):
    """`partition_doc()` raises `ValueError` for a .doc path this test never creates."""
    missing_doc_path = os.path.join(tmpdir.dirname, "asdf.doc")

    with pytest.raises(ValueError):
        partition_doc(filename=missing_doc_path)


# -- `include_metadata` arg ----------------------------------------------------------------------


def test_partition_doc_from_filename_exclude_metadata(mock_document, tmpdir):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_document.save(docx_filename)
convert_office_doc(docx_filename, tmpdir.dirname, "doc")

with open(doc_filename, "rb") as f:
elements = partition_doc(file=f, include_metadata=False)
elements = partition_doc(filename=doc_filename, include_metadata=False)

assert elements[0].metadata.filetype is None
assert elements[0].metadata.page_name is None
assert elements[0].metadata.filename is None


def test_partition_doc_from_filename_exclude_metadata(mock_document, tmpdir):
def test_partition_doc_from_file_exclude_metadata(mock_document, tmpdir):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_document.save(docx_filename)
convert_office_doc(docx_filename, tmpdir.dirname, "doc")

elements = partition_doc(filename=doc_filename, include_metadata=False)
with open(doc_filename, "rb") as f:
elements = partition_doc(file=f, include_metadata=False)

assert elements[0].metadata.filetype is None
assert elements[0].metadata.page_name is None
assert elements[0].metadata.filename is None


# -- .metadata.filename --------------------------------------------------------------------------


def test_partition_doc_from_filename_with_metadata_filename(
    mock_document,
    expected_elements,
    tmpdir,
):
    """`metadata_filename` sets `.metadata.filename` on every element when loading by filename."""
    work_dir = tmpdir.dirname
    docx_path = os.path.join(work_dir, "mock_document.docx")
    doc_path = os.path.join(work_dir, "mock_document.doc")
    # -- write the .docx, then convert it so a sibling .doc exists in the same directory --
    mock_document.save(docx_path)
    convert_office_doc(docx_path, work_dir, "doc")

    elements = partition_doc(filename=doc_path, metadata_filename="test")

    assert elements == expected_elements
    for element in elements:
        assert element.metadata.filename == "test"


def test_partition_doc_from_file_with_metadata_filename(mock_document, tmpdir):
    """`metadata_filename` sets `.metadata.filename` on every element when loading from a file."""
    work_dir = tmpdir.dirname
    docx_path = os.path.join(work_dir, "mock_document.docx")
    doc_path = os.path.join(work_dir, "mock_document.doc")
    # -- write the .docx, then convert it so a sibling .doc exists in the same directory --
    mock_document.save(docx_path)
    convert_office_doc(docx_path, work_dir, "doc")

    with open(doc_path, "rb") as doc_file:
        elements = partition_doc(file=doc_file, metadata_filename="test")

    assert all(element.metadata.filename == "test" for element in elements)


# -- .metadata.last_modified ---------------------------------------------------------------------


def test_partition_doc_metadata_date(
mocker,
filename="example-docs/fake.doc",
Expand Down Expand Up @@ -283,6 +240,19 @@ def test_partition_doc_from_file_explicit_get_metadata_date(
assert elements[0].metadata.last_modified == mocked_last_modification_date


def test_partition_doc_from_file_without_metadata_date(
    filename="example-docs/fake.doc",
):
    """`last_modified` is None when the document is passed as a `SpooledTemporaryFile`.

    The document bytes are copied into a `SpooledTemporaryFile` and partitioned with
    `date_from_file_object=True`; the test asserts the first element's `last_modified` is None.
    """
    # -- both files are context-managed so the temporary file is closed when the test ends --
    with open(filename, "rb") as f, SpooledTemporaryFile() as sf:
        sf.write(f.read())
        # -- rewind so partition_doc() reads from the start of the copied bytes --
        sf.seek(0)
        elements = partition_doc(file=sf, date_from_file_object=True)

    assert elements[0].metadata.last_modified is None


def test_partition_doc_from_file_metadata_date_with_custom_metadata(
mocker,
filename="example-docs/fake.doc",
Expand All @@ -302,17 +272,23 @@ def test_partition_doc_from_file_metadata_date_with_custom_metadata(
assert elements[0].metadata.last_modified == expected_last_modified_date


def test_partition_doc_from_file_without_metadata_date(
filename="example-docs/fake.doc",
):
"""Test partition_doc() with file that are not possible to get last modified date"""
with open(filename, "rb") as f:
sf = SpooledTemporaryFile()
sf.write(f.read())
sf.seek(0)
elements = partition_doc(file=sf, date_from_file_object=True)
# -- language-recognition metadata ---------------------------------------------------------------

assert elements[0].metadata.last_modified is None

def test_partition_doc_element_metadata_has_languages():
    """The first partitioned element reports `["eng"]` in `.metadata.languages`."""
    elements = partition_doc(filename="example-docs/fake-doc-emphasized-text.doc")
    first_element = elements[0]
    assert first_element.metadata.languages == ["eng"]


def test_partition_doc_respects_detect_language_per_element():
    """With `detect_language_per_element=True`, each element carries its own languages list."""
    expected_langs = [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]

    elements = partition_doc(
        filename="example-docs/language-docs/eng_spa_mult.doc",
        detect_language_per_element=True,
    )

    assert [e.metadata.languages for e in elements] == expected_langs


# -- miscellaneous -------------------------------------------------------------------------------


def test_partition_doc_grabs_emphasized_texts():
Expand Down Expand Up @@ -352,14 +328,59 @@ def test_add_chunking_strategy_on_partition_doc(filename="example-docs/fake.doc"
assert chunk_elements == chunks


def test_partition_doc_element_metadata_has_languages():
    """The first partitioned element reports `["eng"]` in `.metadata.languages`."""
    elements = partition_doc(filename="example-docs/fake-doc-emphasized-text.doc")
    first_element = elements[0]
    assert first_element.metadata.languages == ["eng"]
def test_partition_doc_for_deterministic_and_unique_ids():
    """Element ids for the duplicate-paragraphs document match the pinned, all-distinct values."""
    expected_ids = [
        "ade273c622c48d67a7be7b3816d5b4d8",
        "7d0b32fdf169f9578723486cb4bc1235",
        "1feb6e8e9c1662cfaef75907aeeb0900",
        "aa2a8ac10143b12f0fe2087837ea11d2",
        "da31ba7ed3919067d2c6572dc1617271",
        "1914359c179a160df921b769acf8c353",
        "f9d0d379fc791bae487b7a45f65caa50",
    ]

    elements = partition_doc("example-docs/duplicate-paragraphs.doc")

    assert [e.id for e in elements] == expected_ids

def test_partition_doc_respects_detect_language_per_element():
    """With `detect_language_per_element=True`, each element carries its own languages list."""
    expected_langs = [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]

    elements = partition_doc(
        filename="example-docs/language-docs/eng_spa_mult.doc",
        detect_language_per_element=True,
    )

    assert [e.metadata.languages for e in elements] == expected_langs

# == module-level fixtures =======================================================================


@pytest.fixture()
def expected_elements():
    """Elements the tests expect `partition_doc()` to produce for the `mock_document` document."""
    # -- (element class, text) pairs, in document order --
    element_specs = (
        (Title, "These are a few of my favorite things:"),
        (ListItem, "Parrots"),
        (ListItem, "Hockey"),
        (Title, "Analysis"),
        (NarrativeText, "This is my first thought. This is my second thought."),
        (NarrativeText, "This is my third thought."),
        (Text, "2023"),
        (Address, "DOYLESTOWN, PA 18901"),
    )
    return [element_cls(text) for element_cls, text in element_specs]


@pytest.fixture()
def mock_document():
    """Build a docx document whose paragraphs cover the partitioning cases noted inline."""
    # -- (text, style) for each paragraph, in document order; `None` means no style is named --
    paragraph_specs = (
        ("These are a few of my favorite things:", "Heading 1"),
        # NOTE(robinson) - this should get picked up as a list item due to the •
        ("• Parrots", "Normal"),
        # NOTE(robinson) - this should get dropped because it's empty
        ("• ", "Normal"),
        ("Hockey", "List Bullet"),
        # NOTE(robinson) - this should get dropped because it's empty
        ("", "List Bullet"),
        # NOTE(robinson) - this should get picked up as a title
        ("Analysis", "Normal"),
        # NOTE(robinson) - this should get dropped because it is empty
        ("", "Normal"),
        # NOTE(robinson) - this should get picked up as a narrative text
        ("This is my first thought. This is my second thought.", "Normal"),
        ("This is my third thought.", "Body Text"),
        # NOTE(robinson) - this should just be regular text
        ("2023", None),
        # NOTE(robinson) - this should be an address
        ("DOYLESTOWN, PA 18901", None),
    )

    document = docx.Document()
    for text, style in paragraph_specs:
        # -- `style=None` is add_paragraph()'s default, so unstyled rows behave as before --
        document.add_paragraph(text, style=style)

    return document
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.13.8-dev5" # pragma: no cover
__version__ = "0.13.8-dev6" # pragma: no cover

0 comments on commit 67c1d28

Please sign in to comment.