chore: Table chunking (#1540)

This change extends our `add_chunking_strategy` logic so that we are able to chunk Table elements' `text` and `text_as_html` params. In order to keep the functionality under the same `by_title` chunking strategy we have renamed `combine_under_n_chars` to `max_characters`. It functions the same way for combining elements under Title elements, and it also specifies a chunk size (in chars) for TableChunk elements.

*Renaming the variable to `max_characters` will also reflect the 'hard max' we will implement for large elements in follow-up PRs.

Additionally -> some lint changes snuck in when I ran `make tidy`, hence the minor changes in unrelated files :)

TODO:
✅ add unit tests --> note: added unit tests where I could! In some unit tests I just clarified that the chunking strategy is now 'by_title', because we don't have an example file that has Table elements to test the 'by_num_characters' chunking strategy.
✅ update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"
In [2]: from unstructured.chunking.title import chunk_table_element
In [3]: from unstructured.partition.auto import partition
In [4]: elements = partition(filename)
# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)
# examine text and text_as_html params
In [6]: for c in chunks:
   ...:     print(c.text)
   ...:     print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <[email protected]>
Unstructured-IO · Oct 3, 2023 · 1fb4642 · 1fb4642
1 parent bcd0eee
commit 1fb4642
Show file tree

Hide file tree

Showing 21 changed files with 356 additions and 91 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,11 +1,13 @@
-## 0.10.19-dev6
+## 0.10.19-dev7
 
 ### Enhancements
 
 * **bump `unstructured-inference` to `0.6.6`** The updated version of `unstructured-inference` makes table extraction in `hi_res` mode configurable to fine tune table extraction performance; it also improves element detection by adding a deduplication post processing step in the `hi_res` partitioning of pdfs and images.
 * **Detect text in HTML Heading Tags as Titles** This will increase the accuracy of hierarchies in HTML documents and provide more accurate element categorization. If text is in an HTML heading tag and is not a list item, address, or narrative text, categorize it as a title.
 * **Update python-based docs** Refactor docs to use the actual unstructured code rather than using the subprocess library to run the cli command itself.
 * * **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, and Slack connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc.
+* **Adds Table support for the `add_chunking_strategy` decorator to partition functions.** In addition to combining elements under Title elements, users can now specify the `max_characters=<n>` argument to chunk Table elements into TableChunk elements with `text` and `text_as_html` of length <n> characters. This means partitioned Table results are ready for use in downstream applications without any post processing.
+
 
 ### Features 
 

diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py
@@ -31,7 +31,7 @@ def test_split_elements_by_title_and_table():
         Text("It is storming outside."),
         CheckBox(),
     ]
-    sections = _split_elements_by_title_and_table(elements, combine_under_n_chars=0)
+    sections = _split_elements_by_title_and_table(elements, combine_text_under_n_chars=0)
 
     assert sections == [
         [
@@ -75,7 +75,7 @@ def test_chunk_by_title():
         Text("It is storming outside."),
         CheckBox(),
     ]
-    chunks = chunk_by_title(elements, combine_under_n_chars=0)
+    chunks = chunk_by_title(elements, combine_text_under_n_chars=0)
 
     assert chunks == [
         CompositeElement(
@@ -112,7 +112,7 @@ def test_chunk_by_title_respects_section_change():
         Text("It is storming outside."),
         CheckBox(),
     ]
-    chunks = chunk_by_title(elements, combine_under_n_chars=0)
+    chunks = chunk_by_title(elements, combine_text_under_n_chars=0)
 
     assert chunks == [
         CompositeElement(
@@ -147,7 +147,7 @@ def test_chunk_by_title_separates_by_page_number():
         Text("It is storming outside."),
         CheckBox(),
     ]
-    chunks = chunk_by_title(elements, multipage_sections=False, combine_under_n_chars=0)
+    chunks = chunk_by_title(elements, multipage_sections=False, combine_text_under_n_chars=0)
 
     assert chunks == [
         CompositeElement(
@@ -182,7 +182,7 @@ def test_chunk_by_title_groups_across_pages():
         Text("It is storming outside."),
         CheckBox(),
     ]
-    chunks = chunk_by_title(elements, multipage_sections=True, combine_under_n_chars=0)
+    chunks = chunk_by_title(elements, multipage_sections=True, combine_text_under_n_chars=0)
 
     assert chunks == [
         CompositeElement(
@@ -212,24 +212,32 @@ def test_add_chunking_strategy_on_partition_html_respects_multipage():
         filename,
         chunking_strategy="by_title",
         multipage_sections=False,
-        combine_under_n_chars=0,
+        combine_text_under_n_chars=0,
+        new_after_n_chars=300,
+        max_characters=400,
     )
     partitioned_elements_multipage_true_combine_chars_0 = partition_html(
         filename,
         chunking_strategy="by_title",
         multipage_sections=True,
-        combine_under_n_chars=0,
+        combine_text_under_n_chars=0,
+        new_after_n_chars=300,
+        max_characters=400,
     )
     elements = partition_html(filename)
     cleaned_elements_multipage_false_combine_chars_0 = chunk_by_title(
         elements,
         multipage_sections=False,
-        combine_under_n_chars=0,
+        combine_text_under_n_chars=0,
+        new_after_n_chars=300,
+        max_characters=400,
     )
     cleaned_elements_multipage_true_combine_chars_0 = chunk_by_title(
         elements,
         multipage_sections=True,
-        combine_under_n_chars=0,
+        combine_text_under_n_chars=0,
+        new_after_n_chars=300,
+        max_characters=400,
     )
     assert (
         partitioned_elements_multipage_false_combine_chars_0
@@ -244,7 +252,21 @@ def test_add_chunking_strategy_on_partition_html_respects_multipage():
     )
 
 
-def test_add_chunking_strategy_raises_error_for_invalid_n_chars():
+@pytest.mark.parametrize(
+    ("combine_text_under_n_chars", "new_after_n_chars", "max_characters"),
+    [
+        (-1, -1, -1),
+        (0, 0, 0),
+        (-5666, -6777, -8999),
+        (-5, 40, 50),
+        (50, 100, 20),
+    ],
+)
+def test_add_chunking_strategy_raises_error_for_invalid_n_chars(
+    combine_text_under_n_chars,
+    new_after_n_chars,
+    max_characters,
+):
     elements = [
         Title("A Great Day"),
         Text("Today is a great day."),
@@ -258,7 +280,12 @@ def test_add_chunking_strategy_raises_error_for_invalid_n_chars():
         CheckBox(),
     ]
     with pytest.raises(ValueError):
-        chunk_by_title(elements, combine_under_n_chars=1, new_after_n_chars=0)
+        chunk_by_title(
+            elements,
+            combine_text_under_n_chars=combine_text_under_n_chars,
+            new_after_n_chars=new_after_n_chars,
+            max_characters=max_characters,
+        )
 
 
 def test_chunk_by_title_drops_extra_metadata():
@@ -335,7 +362,7 @@ def test_chunk_by_title_drops_extra_metadata():
         ),
     ]
 
-    chunks = chunk_by_title(elements, combine_under_n_chars=0)
+    chunks = chunk_by_title(elements, combine_text_under_n_chars=0)
 
     assert str(chunks[0]) == str(
         CompositeElement("A Great Day\n\nToday is a great day.\n\nIt is sunny outside."),

diff --git a/test_unstructured/partition/csv/test_csv.py b/test_unstructured/partition/csv/test_csv.py
@@ -8,6 +8,7 @@
     EXPECTED_TEXT,
     EXPECTED_TEXT_WITH_EMOJI,
 )
+from unstructured.chunking.title import chunk_by_title
 from unstructured.cleaners.core import clean_extra_whitespace
 from unstructured.documents.elements import Table
 from unstructured.partition.csv import partition_csv
@@ -189,3 +190,18 @@ def test_partition_csv_with_json(filename, expected_text, expected_table):
     assert elements[0].metadata.filename == test_elements[0].metadata.filename
     for i in range(len(elements)):
         assert elements[i] == test_elements[i]
+
+
+def test_add_chunking_strategy_to_partition_csv_non_default():
+    filename = "example-docs/stanley-cups.csv"
+
+    elements = partition_csv(filename=filename)
+    chunk_elements = partition_csv(
+        filename,
+        chunking_strategy="by_title",
+        max_characters=9,
+        combine_text_under_n_chars=0,
+    )
+    chunks = chunk_by_title(elements, max_characters=9, combine_text_under_n_chars=0)
+    assert chunk_elements != elements
+    assert chunk_elements == chunks
diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/docx/test_docx.py
@@ -18,6 +18,7 @@
     ListItem,
     NarrativeText,
     Table,
+    TableChunk,
     Text,
     Title,
 )
@@ -422,14 +423,6 @@ def test_partition_docx_with_json(mock_document, expected_elements, tmpdir):
         assert elements[i] == test_elements[i]
 
 
-def test_add_chunking_strategy_on_partition_docx(filename="example-docs/handbook-1p.docx"):
-    chunk_elements = partition_docx(filename, chunking_strategy="by_title")
-    elements = partition_docx(filename)
-    chunks = chunk_by_title(elements)
-    assert chunk_elements != elements
-    assert chunk_elements == chunks
-
-
 def test_parse_category_depth_by_style():
     partitioner = _DocxPartitioner("example-docs/category-level.docx", None, None, False, None)
 
@@ -489,3 +482,37 @@ def test_parse_category_depth_by_style_name():
 def test_parse_category_depth_by_style_ilvl():
     partitioner = _DocxPartitioner(None, None, None, False, None)
     assert partitioner._parse_category_depth_by_style_ilvl() == 0
+
+
+def test_add_chunking_strategy_on_partition_docx_default_args(
+    filename="example-docs/handbook-1p.docx",
+):
+    chunk_elements = partition_docx(filename, chunking_strategy="by_title")
+    elements = partition_docx(filename)
+    chunks = chunk_by_title(elements)
+
+    assert chunk_elements != elements
+    assert chunk_elements == chunks
+
+
+def test_add_chunking_strategy_on_partition_docx(
+    filename="example-docs/fake-doc-emphasized-text.docx",
+):
+    chunk_elements = partition_docx(
+        filename,
+        chunking_strategy="by_title",
+        max_characters=9,
+        combine_text_under_n_chars=5,
+    )
+    elements = partition_docx(filename)
+    chunks = chunk_by_title(elements, max_characters=9, combine_text_under_n_chars=5)
+    # remove the last element of the TableChunk list because it will be the leftover slice
+    # and not necessarily the max_characters len
+    table_chunks = [chunk for chunk in chunks if isinstance(chunk, TableChunk)][:-1]
+    other_chunks = [chunk for chunk in chunks if not isinstance(chunk, TableChunk)]
+    for table_chunk in table_chunks:
+        assert len(table_chunk.text) == 9
+    for chunk in other_chunks:
+        assert len(chunk.text) >= 5
+    assert chunk_elements != elements
+    assert chunk_elements == chunks
diff --git a/test_unstructured/partition/epub/test_epub.py b/test_unstructured/partition/epub/test_epub.py
@@ -193,3 +193,24 @@ def test_add_chunking_strategy_on_partition_epub(
     chunks = chunk_by_title(elements)
     assert chunk_elements != elements
     assert chunk_elements == chunks
+
+
+def test_add_chunking_strategy_on_partition_epub_non_default(
+    filename=os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub"),
+):
+    elements = partition_epub(filename=filename)
+    chunk_elements = partition_epub(
+        filename,
+        chunking_strategy="by_title",
+        max_characters=5,
+        new_after_n_chars=5,
+        combine_text_under_n_chars=0,
+    )
+    chunks = chunk_by_title(
+        elements,
+        max_characters=5,
+        new_after_n_chars=5,
+        combine_text_under_n_chars=0,
+    )
+    assert chunk_elements != elements
+    assert chunk_elements == chunks
diff --git a/test_unstructured/partition/markdown/test_md.py b/test_unstructured/partition/markdown/test_md.py
@@ -276,7 +276,7 @@ def test_partition_md_with_json(
         assert elements[i] == test_elements[i]
 
 
-def test_add_chunking_strategy_on_partition_md(
+def test_add_chunking_strategy_by_title_on_partition_md(
     filename="example-docs/README.md",
 ):
     elements = partition_md(filename=filename)

diff --git a/test_unstructured/partition/msg/test_msg.py b/test_unstructured/partition/msg/test_msg.py
@@ -285,7 +285,7 @@ def test_partition_msg_with_pgp_encrypted_message(
     assert "Encrypted email detected" in caplog.text
 
 
-def test_add_chunking_strategy_on_partition_msg(
+def test_add_chunking_strategy_by_title_on_partition_msg(
     filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg"),
 ):
     elements = partition_msg(filename=filename)

diff --git a/test_unstructured/partition/odt/test_odt.py b/test_unstructured/partition/odt/test_odt.py
@@ -2,7 +2,7 @@
 import pathlib
 
 from unstructured.chunking.title import chunk_by_title
-from unstructured.documents.elements import Table, Title
+from unstructured.documents.elements import Table, TableChunk, Title
 from unstructured.partition.json import partition_json
 from unstructured.partition.odt import partition_odt
 from unstructured.staging.base import elements_to_json
@@ -169,3 +169,24 @@ def test_add_chunking_strategy_on_partition_odt(
     chunks = chunk_by_title(elements)
     assert chunk_elements != elements
     assert chunk_elements == chunks
+
+
+def test_add_chunking_strategy_on_partition_odt_non_default():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
+    elements = partition_odt(filename=filename)
+    chunk_elements = partition_odt(
+        filename,
+        chunking_strategy="by_title",
+        max_characters=7,
+        combine_text_under_n_chars=5,
+    )
+    chunks = chunk_by_title(
+        elements,
+        max_characters=7,
+        combine_text_under_n_chars=5,
+    )
+    for chunk in chunk_elements:
+        if isinstance(chunk, TableChunk):
+            assert len(chunk.text) <= 7
+    assert chunk_elements != elements
+    assert chunk_elements == chunks
diff --git a/test_unstructured/partition/pdf-image/test_image.py b/test_unstructured/partition/pdf-image/test_image.py
@@ -460,6 +460,25 @@ def test_add_chunking_strategy_on_partition_image(
     assert chunk_elements == chunks
 
 
+def test_add_chunking_strategy_on_partition_image_hi_res(
+    filename="example-docs/layout-parser-paper-with-table.jpg",
+):
+    elements = image.partition_image(
+        filename=filename,
+        strategy="hi_res",
+        infer_table_structure=True,
+    )
+    chunk_elements = image.partition_image(
+        filename,
+        strategy="hi_res",
+        infer_table_structure=True,
+        chunking_strategy="by_title",
+    )
+    chunks = chunk_by_title(elements)
+    assert chunk_elements != elements
+    assert chunk_elements == chunks
+
+
 def test_partition_image_uses_model_name():
     with mock.patch.object(
         pdf,

diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py
@@ -838,7 +838,7 @@ def test_partition_pdf_with_ocr_coordinates_are_not_nan_from_file(
                     assert point[1] is not math.nan
 
 
-def test_add_chunking_strategy_on_partition_pdf(
+def test_add_chunking_strategy_by_title_on_partition_pdf(
     filename="example-docs/layout-parser-paper-fast.pdf",
 ):
     elements = pdf.partition_pdf(filename=filename)

diff --git a/test_unstructured/partition/pptx/test_ppt.py b/test_unstructured/partition/pptx/test_ppt.py
@@ -174,7 +174,7 @@ def test_partition_ppt_with_json(
         assert elements[i] == test_elements[i]
 
 
-def test_add_chunking_strategy_on_partition_ppt(
+def test_add_chunking_strategy_by_title_on_partition_ppt(
     filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt"),
 ):
     elements = partition_ppt(filename=filename)

diff --git a/test_unstructured/partition/pptx/test_pptx.py b/test_unstructured/partition/pptx/test_pptx.py
@@ -371,8 +371,8 @@ def test_partition_pptx_with_json():
         assert elements[i] == test_elements[i]
 
 
-def test_add_chunking_strategy_on_partition_pptx():
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
+def test_add_chunking_strategy_by_title_on_partition_pptx():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "science-exploration-1p.pptx")
     elements = partition_pptx(filename=filename)
     chunk_elements = partition_pptx(filename, chunking_strategy="by_title")
     chunks = chunk_by_title(elements)

diff --git a/test_unstructured/partition/pypandoc/test_org.py b/test_unstructured/partition/pypandoc/test_org.py
@@ -136,7 +136,7 @@ def test_partition_org_with_json(filename="example-docs/README.org"):
         assert elements[i] == test_elements[i]
 
 
-def test_add_chunking_strategy_on_partition_org(
+def test_add_chunking_strategy_by_title_on_partition_org(
     filename="example-docs/README.org",
 ):
     elements = partition_org(filename=filename)