diff --git a/CHANGELOG.md b/CHANGELOG.md index 40d0576bb9..d504a2a1ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.13.8-dev5 +## 0.13.8-dev6 ### Enhancements diff --git a/test_unstructured/partition/docx/test_doc.py b/test_unstructured/partition/docx/test_doc.py index 2d80e18a47..a87722a968 100644 --- a/test_unstructured/partition/docx/test_doc.py +++ b/test_unstructured/partition/docx/test_doc.py @@ -20,59 +20,15 @@ from unstructured.partition.docx import partition_docx -def test_partition_doc_for_deterministic_and_unique_ids(): - ids = [element.id for element in partition_doc("example-docs/duplicate-paragraphs.doc")] - - assert ids == [ - "ade273c622c48d67a7be7b3816d5b4d8", - "7d0b32fdf169f9578723486cb4bc1235", - "1feb6e8e9c1662cfaef75907aeeb0900", - "aa2a8ac10143b12f0fe2087837ea11d2", - "da31ba7ed3919067d2c6572dc1617271", - "1914359c179a160df921b769acf8c353", - "f9d0d379fc791bae487b7a45f65caa50", - ] - - -@pytest.fixture() -def mock_document(): - document = docx.Document() - - document.add_paragraph("These are a few of my favorite things:", style="Heading 1") - # NOTE(robinson) - this should get picked up as a list item due to the • - document.add_paragraph("• Parrots", style="Normal") - # NOTE(robinson) - this should get dropped because it's empty - document.add_paragraph("• ", style="Normal") - document.add_paragraph("Hockey", style="List Bullet") - # NOTE(robinson) - this should get dropped because it's empty - document.add_paragraph("", style="List Bullet") - # NOTE(robinson) - this should get picked up as a title - document.add_paragraph("Analysis", style="Normal") - # NOTE(robinson) - this should get dropped because it is empty - document.add_paragraph("", style="Normal") - # NOTE(robinson) - this should get picked up as a narrative text - document.add_paragraph("This is my first thought. This is my second thought.", style="Normal") - document.add_paragraph("This is my third thought.", style="Body Text") - # NOTE(robinson) - this should just be regular text - document.add_paragraph("2023") - # NOTE(robinson) - this should be an address - document.add_paragraph("DOYLESTOWN, PA 18901") +def test_partition_doc_matches_partition_docx(mock_document, expected_elements, tmpdir): + docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") + doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") + mock_document.save(docx_filename) + convert_office_doc(docx_filename, tmpdir.dirname, "doc") + assert partition_doc(filename=doc_filename) == partition_docx(filename=docx_filename) - return document - -@pytest.fixture() -def expected_elements(): - return [ - Title("These are a few of my favorite things:"), - ListItem("Parrots"), - ListItem("Hockey"), - Title("Analysis"), - NarrativeText("This is my first thought. This is my second thought."), - NarrativeText("This is my third thought."), - Text("2023"), - Address("DOYLESTOWN, PA 18901"), - ] +# -- document-source (file or filename) ---------------------------------------------------------- def test_partition_doc_from_filename(mock_document, expected_elements, tmpdir, capsys): @@ -88,36 +44,6 @@ def test_partition_doc_from_filename(mock_document, expected_elements, tmpdir, c assert capsys.readouterr().err == "" -def test_partition_doc_from_filename_with_metadata_filename( - mock_document, - expected_elements, - tmpdir, -): - docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") - doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") - mock_document.save(docx_filename) - convert_office_doc(docx_filename, tmpdir.dirname, "doc") - - elements = partition_doc(filename=doc_filename, metadata_filename="test") - assert elements == expected_elements - assert all(element.metadata.filename == "test" for element in elements) - - -def test_partition_doc_matches_partition_docx(mock_document, expected_elements, tmpdir): - docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") - doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") - mock_document.save(docx_filename) - convert_office_doc(docx_filename, tmpdir.dirname, "doc") - assert partition_doc(filename=doc_filename) == partition_docx(filename=docx_filename) - - -def test_partition_raises_with_missing_doc(mock_document, expected_elements, tmpdir): - doc_filename = os.path.join(tmpdir.dirname, "asdf.doc") - - with pytest.raises(ValueError): - partition_doc(filename=doc_filename) - - def test_partition_doc_from_file_with_filter(mock_document, expected_elements, tmpdir, capsys): docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") @@ -148,18 +74,6 @@ def test_partition_doc_from_file_with_no_filter(mock_document, expected_elements assert element.metadata.filename is None -def test_partition_doc_from_file_with_metadata_filename(mock_document, tmpdir): - docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") - doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") - mock_document.save(docx_filename) - convert_office_doc(docx_filename, tmpdir.dirname, "doc") - - with open(doc_filename, "rb") as f: - elements = partition_doc(file=f, metadata_filename="test") - for element in elements: - assert element.metadata.filename == "test" - - def test_partition_doc_raises_with_both_specified(mock_document, tmpdir): docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") @@ -175,33 +89,76 @@ def test_partition_doc_raises_with_neither(): partition_doc() -def test_partition_doc_from_file_exclude_metadata(mock_document, tmpdir): +def test_partition_raises_with_missing_doc(mock_document, expected_elements, tmpdir): + doc_filename = os.path.join(tmpdir.dirname, "asdf.doc") + + with pytest.raises(ValueError): + partition_doc(filename=doc_filename) + + +# -- `include_metadata` arg ---------------------------------------------------------------------- + + +def test_partition_doc_from_filename_exclude_metadata(mock_document, tmpdir): docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") mock_document.save(docx_filename) convert_office_doc(docx_filename, tmpdir.dirname, "doc") - with open(doc_filename, "rb") as f: - elements = partition_doc(file=f, include_metadata=False) + elements = partition_doc(filename=doc_filename, include_metadata=False) assert elements[0].metadata.filetype is None assert elements[0].metadata.page_name is None assert elements[0].metadata.filename is None -def test_partition_doc_from_filename_exclude_metadata(mock_document, tmpdir): +def test_partition_doc_from_file_exclude_metadata(mock_document, tmpdir): docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") mock_document.save(docx_filename) convert_office_doc(docx_filename, tmpdir.dirname, "doc") - elements = partition_doc(filename=doc_filename, include_metadata=False) + with open(doc_filename, "rb") as f: + elements = partition_doc(file=f, include_metadata=False) assert elements[0].metadata.filetype is None assert elements[0].metadata.page_name is None assert elements[0].metadata.filename is None +# -- .metadata.filename -------------------------------------------------------------------------- + + +def test_partition_doc_from_filename_with_metadata_filename( + mock_document, + expected_elements, + tmpdir, +): + docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") + doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") + mock_document.save(docx_filename) + convert_office_doc(docx_filename, tmpdir.dirname, "doc") + + elements = partition_doc(filename=doc_filename, metadata_filename="test") + assert elements == expected_elements + assert all(element.metadata.filename == "test" for element in elements) + + +def test_partition_doc_from_file_with_metadata_filename(mock_document, tmpdir): + docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") + doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") + mock_document.save(docx_filename) + convert_office_doc(docx_filename, tmpdir.dirname, "doc") + + with open(doc_filename, "rb") as f: + elements = partition_doc(file=f, metadata_filename="test") + for element in elements: + assert element.metadata.filename == "test" + + +# -- .metadata.last_modified --------------------------------------------------------------------- + + def test_partition_doc_metadata_date( mocker, filename="example-docs/fake.doc", @@ -283,6 +240,19 @@ def test_partition_doc_from_file_explicit_get_metadata_date( assert elements[0].metadata.last_modified == mocked_last_modification_date +def test_partition_doc_from_file_without_metadata_date( + filename="example-docs/fake.doc", +): + """Test partition_doc() with file that are not possible to get last modified date""" + with open(filename, "rb") as f: + sf = SpooledTemporaryFile() + sf.write(f.read()) + sf.seek(0) + elements = partition_doc(file=sf, date_from_file_object=True) + + assert elements[0].metadata.last_modified is None + + def test_partition_doc_from_file_metadata_date_with_custom_metadata( mocker, filename="example-docs/fake.doc", @@ -302,17 +272,23 @@ def test_partition_doc_from_file_metadata_date_with_custom_metadata( assert elements[0].metadata.last_modified == expected_last_modified_date -def test_partition_doc_from_file_without_metadata_date( - filename="example-docs/fake.doc", -): - """Test partition_doc() with file that are not possible to get last modified date""" - with open(filename, "rb") as f: - sf = SpooledTemporaryFile() - sf.write(f.read()) - sf.seek(0) - elements = partition_doc(file=sf, date_from_file_object=True) +# -- language-recognition metadata --------------------------------------------------------------- - assert elements[0].metadata.last_modified is None + +def test_partition_doc_element_metadata_has_languages(): + filename = "example-docs/fake-doc-emphasized-text.doc" + elements = partition_doc(filename=filename) + assert elements[0].metadata.languages == ["eng"] + + +def test_partition_doc_respects_detect_language_per_element(): + filename = "example-docs/language-docs/eng_spa_mult.doc" + elements = partition_doc(filename=filename, detect_language_per_element=True) + langs = [element.metadata.languages for element in elements] + assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]] + + +# -- miscellaneous ------------------------------------------------------------------------------- def test_partition_doc_grabs_emphasized_texts(): @@ -352,14 +328,59 @@ def test_add_chunking_strategy_on_partition_doc(filename="example-docs/fake.doc" assert chunk_elements == chunks -def test_partition_doc_element_metadata_has_languages(): - filename = "example-docs/fake-doc-emphasized-text.doc" - elements = partition_doc(filename=filename) - assert elements[0].metadata.languages == ["eng"] +def test_partition_doc_for_deterministic_and_unique_ids(): + ids = [element.id for element in partition_doc("example-docs/duplicate-paragraphs.doc")] + assert ids == [ + "ade273c622c48d67a7be7b3816d5b4d8", + "7d0b32fdf169f9578723486cb4bc1235", + "1feb6e8e9c1662cfaef75907aeeb0900", + "aa2a8ac10143b12f0fe2087837ea11d2", + "da31ba7ed3919067d2c6572dc1617271", + "1914359c179a160df921b769acf8c353", + "f9d0d379fc791bae487b7a45f65caa50", + ] -def test_partition_doc_respects_detect_language_per_element(): - filename = "example-docs/language-docs/eng_spa_mult.doc" - elements = partition_doc(filename=filename, detect_language_per_element=True) - langs = [element.metadata.languages for element in elements] - assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]] + +# == module-level fixtures ======================================================================= + + +@pytest.fixture() +def expected_elements(): + return [ + Title("These are a few of my favorite things:"), + ListItem("Parrots"), + ListItem("Hockey"), + Title("Analysis"), + NarrativeText("This is my first thought. This is my second thought."), + NarrativeText("This is my third thought."), + Text("2023"), + Address("DOYLESTOWN, PA 18901"), + ] + + +@pytest.fixture() +def mock_document(): + document = docx.Document() + + document.add_paragraph("These are a few of my favorite things:", style="Heading 1") + # NOTE(robinson) - this should get picked up as a list item due to the • + document.add_paragraph("• Parrots", style="Normal") + # NOTE(robinson) - this should get dropped because it's empty + document.add_paragraph("• ", style="Normal") + document.add_paragraph("Hockey", style="List Bullet") + # NOTE(robinson) - this should get dropped because it's empty + document.add_paragraph("", style="List Bullet") + # NOTE(robinson) - this should get picked up as a title + document.add_paragraph("Analysis", style="Normal") + # NOTE(robinson) - this should get dropped because it is empty + document.add_paragraph("", style="Normal") + # NOTE(robinson) - this should get picked up as a narrative text + document.add_paragraph("This is my first thought. This is my second thought.", style="Normal") + document.add_paragraph("This is my third thought.", style="Body Text") + # NOTE(robinson) - this should just be regular text + document.add_paragraph("2023") + # NOTE(robinson) - this should be an address + document.add_paragraph("DOYLESTOWN, PA 18901") + + return document diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 1e8fd23481..0e859a38f0 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.13.8-dev5" # pragma: no cover +__version__ = "0.13.8-dev6" # pragma: no cover