From 1fb464235a2d4fb399244db4ff2ee7a34202b2d6 Mon Sep 17 00:00:00 2001 From: Amanda Cameron Date: Tue, 3 Oct 2023 09:40:34 -0700 Subject: [PATCH] chore: Table chunking (#1540) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change is adding to our `add_chunking_strategy` logic so that we are able to chunk Table elements' `text` and `text_as_html` params. In order to keep the functionality under the same `by_title` chunking strategy we have renamed the `combine_under_n_chars` to `max_characters`. It functions the same way for the combining elements under Title's, as well as specifying a chunk size (in chars) for TableChunk elements. *renaming the variable to `max_characters` will also reflect the 'hard max' we will implement for large elements in followup PRs Additionally -> some lint changes snuck in when I ran `make tidy` hence the minor changes in unrelated files :) TODO: ✅ add unit tests --> note: added where I could to unit tests! Some unit tests I just clarified that the chunking strategy was now 'by_title' because we don't have a file example that has Table elements to test the 'by_num_characters' chunking strategy ✅ update changelog To manually test: ``` In [1]: filename="example-docs/example-10k.html" In [2]: from unstructured.chunking.title import chunk_table_element In [3]: from unstructured.partition.auto import partition In [4]: elements = partition(filename) # element at -2 happens to be a Table, and we'll get chunks of char size 4 here In [5]: chunks = chunk_table_element(elements[-2], 4) # examine text and text_as_html params ln [6]: for c in chunks: print(c.text) print(c.metadata.text_as_html) ``` --------- Co-authored-by: Yao You --- CHANGELOG.md | 4 +- test_unstructured/chunking/test_title.py | 51 +++++-- test_unstructured/partition/csv/test_csv.py | 16 ++ test_unstructured/partition/docx/test_docx.py | 43 +++++- test_unstructured/partition/epub/test_epub.py | 21 +++ .../partition/markdown/test_md.py | 2 +- test_unstructured/partition/msg/test_msg.py | 2 +- test_unstructured/partition/odt/test_odt.py | 23 ++- .../partition/pdf-image/test_image.py | 19 +++ .../partition/pdf-image/test_pdf.py | 2 +- test_unstructured/partition/pptx/test_ppt.py | 2 +- test_unstructured/partition/pptx/test_pptx.py | 4 +- .../partition/pypandoc/test_org.py | 2 +- test_unstructured/partition/test_auto.py | 87 ++++++++++- unstructured/__version__.py | 2 +- unstructured/chunking/title.py | 142 +++++++++++------- unstructured/documents/elements.py | 12 ++ unstructured/ingest/interfaces.py | 8 +- unstructured/partition/csv.py | 2 + unstructured/partition/xlsx.py | 2 + unstructured/staging/weaviate.py | 1 + 21 files changed, 356 insertions(+), 91 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index febb223d00..0d06dc15f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.19-dev6 +## 0.10.19-dev7 ### Enhancements @@ -6,6 +6,8 @@ * **Detect text in HTML Heading Tags as Titles** This will increase the accuracy of hierarchies in HTML documents and provide more accurate element categorization. If text is in an HTML heading tag and is not a list item, address, or narrative text, categorize it as a title. * **Update python-based docs** Refactor docs to use the actual unstructured code rather than using the subprocess library to run the cli command itself. * * **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, and Slack connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. +* **Adds Table support for the `add_chunking_strategy` decorator to partition functions.** In addition to combining elements under Title elements, user's can now specify the `max_characters=` argument to chunk Table elements into TableChunk elements with `text` and `text_as_html` of length characters. This means partitioned Table results are ready for use in downstream applications without any post processing. + ### Features diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py index 8ccfde5af8..bc8bdcc6b0 100644 --- a/test_unstructured/chunking/test_title.py +++ b/test_unstructured/chunking/test_title.py @@ -31,7 +31,7 @@ def test_split_elements_by_title_and_table(): Text("It is storming outside."), CheckBox(), ] - sections = _split_elements_by_title_and_table(elements, combine_under_n_chars=0) + sections = _split_elements_by_title_and_table(elements, combine_text_under_n_chars=0) assert sections == [ [ @@ -75,7 +75,7 @@ def test_chunk_by_title(): Text("It is storming outside."), CheckBox(), ] - chunks = chunk_by_title(elements, combine_under_n_chars=0) + chunks = chunk_by_title(elements, combine_text_under_n_chars=0) assert chunks == [ CompositeElement( @@ -112,7 +112,7 @@ def test_chunk_by_title_respects_section_change(): Text("It is storming outside."), CheckBox(), ] - chunks = chunk_by_title(elements, combine_under_n_chars=0) + chunks = chunk_by_title(elements, combine_text_under_n_chars=0) assert chunks == [ CompositeElement( @@ -147,7 +147,7 @@ def test_chunk_by_title_separates_by_page_number(): Text("It is storming outside."), CheckBox(), ] - chunks = chunk_by_title(elements, multipage_sections=False, combine_under_n_chars=0) + chunks = chunk_by_title(elements, multipage_sections=False, combine_text_under_n_chars=0) assert chunks == [ CompositeElement( @@ -182,7 +182,7 @@ def test_chunk_by_title_groups_across_pages(): Text("It is storming outside."), CheckBox(), ] - chunks = chunk_by_title(elements, multipage_sections=True, combine_under_n_chars=0) + chunks = chunk_by_title(elements, multipage_sections=True, combine_text_under_n_chars=0) assert chunks == [ CompositeElement( @@ -212,24 +212,32 @@ def test_add_chunking_strategy_on_partition_html_respects_multipage(): filename, chunking_strategy="by_title", multipage_sections=False, - combine_under_n_chars=0, + combine_text_under_n_chars=0, + new_after_n_chars=300, + max_characters=400, ) partitioned_elements_multipage_true_combine_chars_0 = partition_html( filename, chunking_strategy="by_title", multipage_sections=True, - combine_under_n_chars=0, + combine_text_under_n_chars=0, + new_after_n_chars=300, + max_characters=400, ) elements = partition_html(filename) cleaned_elements_multipage_false_combine_chars_0 = chunk_by_title( elements, multipage_sections=False, - combine_under_n_chars=0, + combine_text_under_n_chars=0, + new_after_n_chars=300, + max_characters=400, ) cleaned_elements_multipage_true_combine_chars_0 = chunk_by_title( elements, multipage_sections=True, - combine_under_n_chars=0, + combine_text_under_n_chars=0, + new_after_n_chars=300, + max_characters=400, ) assert ( partitioned_elements_multipage_false_combine_chars_0 @@ -244,7 +252,21 @@ def test_add_chunking_strategy_on_partition_html_respects_multipage(): ) -def test_add_chunking_strategy_raises_error_for_invalid_n_chars(): +@pytest.mark.parametrize( + ("combine_text_under_n_chars", "new_after_n_chars", "max_characters"), + [ + (-1, -1, -1), + (0, 0, 0), + (-5666, -6777, -8999), + (-5, 40, 50), + (50, 100, 20), + ], +) +def test_add_chunking_strategy_raises_error_for_invalid_n_chars( + combine_text_under_n_chars, + new_after_n_chars, + max_characters, +): elements = [ Title("A Great Day"), Text("Today is a great day."), @@ -258,7 +280,12 @@ def test_add_chunking_strategy_raises_error_for_invalid_n_chars(): CheckBox(), ] with pytest.raises(ValueError): - chunk_by_title(elements, combine_under_n_chars=1, new_after_n_chars=0) + chunk_by_title( + elements, + combine_text_under_n_chars=combine_text_under_n_chars, + new_after_n_chars=new_after_n_chars, + max_characters=max_characters, + ) def test_chunk_by_title_drops_extra_metadata(): @@ -335,7 +362,7 @@ def test_chunk_by_title_drops_extra_metadata(): ), ] - chunks = chunk_by_title(elements, combine_under_n_chars=0) + chunks = chunk_by_title(elements, combine_text_under_n_chars=0) assert str(chunks[0]) == str( CompositeElement("A Great Day\n\nToday is a great day.\n\nIt is sunny outside."), diff --git a/test_unstructured/partition/csv/test_csv.py b/test_unstructured/partition/csv/test_csv.py index 050c2c2567..3f3d5e4ae0 100644 --- a/test_unstructured/partition/csv/test_csv.py +++ b/test_unstructured/partition/csv/test_csv.py @@ -8,6 +8,7 @@ EXPECTED_TEXT, EXPECTED_TEXT_WITH_EMOJI, ) +from unstructured.chunking.title import chunk_by_title from unstructured.cleaners.core import clean_extra_whitespace from unstructured.documents.elements import Table from unstructured.partition.csv import partition_csv @@ -189,3 +190,18 @@ def test_partition_csv_with_json(filename, expected_text, expected_table): assert elements[0].metadata.filename == test_elements[0].metadata.filename for i in range(len(elements)): assert elements[i] == test_elements[i] + + +def test_add_chunking_strategy_to_partition_csv_non_default(): + filename = "example-docs/stanley-cups.csv" + + elements = partition_csv(filename=filename) + chunk_elements = partition_csv( + filename, + chunking_strategy="by_title", + max_characters=9, + combine_text_under_n_chars=0, + ) + chunks = chunk_by_title(elements, max_characters=9, combine_text_under_n_chars=0) + assert chunk_elements != elements + assert chunk_elements == chunks diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/docx/test_docx.py index 9c82c9d471..c622c390b4 100644 --- a/test_unstructured/partition/docx/test_docx.py +++ b/test_unstructured/partition/docx/test_docx.py @@ -18,6 +18,7 @@ ListItem, NarrativeText, Table, + TableChunk, Text, Title, ) @@ -422,14 +423,6 @@ def test_partition_docx_with_json(mock_document, expected_elements, tmpdir): assert elements[i] == test_elements[i] -def test_add_chunking_strategy_on_partition_docx(filename="example-docs/handbook-1p.docx"): - chunk_elements = partition_docx(filename, chunking_strategy="by_title") - elements = partition_docx(filename) - chunks = chunk_by_title(elements) - assert chunk_elements != elements - assert chunk_elements == chunks - - def test_parse_category_depth_by_style(): partitioner = _DocxPartitioner("example-docs/category-level.docx", None, None, False, None) @@ -489,3 +482,37 @@ def test_parse_category_depth_by_style_name(): def test_parse_category_depth_by_style_ilvl(): partitioner = _DocxPartitioner(None, None, None, False, None) assert partitioner._parse_category_depth_by_style_ilvl() == 0 + + +def test_add_chunking_strategy_on_partition_docx_default_args( + filename="example-docs/handbook-1p.docx", +): + chunk_elements = partition_docx(filename, chunking_strategy="by_title") + elements = partition_docx(filename) + chunks = chunk_by_title(elements) + + assert chunk_elements != elements + assert chunk_elements == chunks + + +def test_add_chunking_strategy_on_partition_docx( + filename="example-docs/fake-doc-emphasized-text.docx", +): + chunk_elements = partition_docx( + filename, + chunking_strategy="by_title", + max_characters=9, + combine_text_under_n_chars=5, + ) + elements = partition_docx(filename) + chunks = chunk_by_title(elements, max_characters=9, combine_text_under_n_chars=5) + # remove the last element of the TableChunk list because it will be the leftover slice + # and not necessarily the max_characters len + table_chunks = [chunk for chunk in chunks if isinstance(chunk, TableChunk)][:-1] + other_chunks = [chunk for chunk in chunks if not isinstance(chunk, TableChunk)] + for table_chunk in table_chunks: + assert len(table_chunk.text) == 9 + for chunk in other_chunks: + assert len(chunk.text) >= 5 + assert chunk_elements != elements + assert chunk_elements == chunks diff --git a/test_unstructured/partition/epub/test_epub.py b/test_unstructured/partition/epub/test_epub.py index 7d0e741899..991ec1991f 100644 --- a/test_unstructured/partition/epub/test_epub.py +++ b/test_unstructured/partition/epub/test_epub.py @@ -193,3 +193,24 @@ def test_add_chunking_strategy_on_partition_epub( chunks = chunk_by_title(elements) assert chunk_elements != elements assert chunk_elements == chunks + + +def test_add_chunking_strategy_on_partition_epub_non_default( + filename=os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub"), +): + elements = partition_epub(filename=filename) + chunk_elements = partition_epub( + filename, + chunking_strategy="by_title", + max_characters=5, + new_after_n_chars=5, + combine_text_under_n_chars=0, + ) + chunks = chunk_by_title( + elements, + max_characters=5, + new_after_n_chars=5, + combine_text_under_n_chars=0, + ) + assert chunk_elements != elements + assert chunk_elements == chunks diff --git a/test_unstructured/partition/markdown/test_md.py b/test_unstructured/partition/markdown/test_md.py index 33d131b7a3..c73247998c 100644 --- a/test_unstructured/partition/markdown/test_md.py +++ b/test_unstructured/partition/markdown/test_md.py @@ -276,7 +276,7 @@ def test_partition_md_with_json( assert elements[i] == test_elements[i] -def test_add_chunking_strategy_on_partition_md( +def test_add_chunking_strategy_by_title_on_partition_md( filename="example-docs/README.md", ): elements = partition_md(filename=filename) diff --git a/test_unstructured/partition/msg/test_msg.py b/test_unstructured/partition/msg/test_msg.py index 6e179987a2..7678a6cda5 100644 --- a/test_unstructured/partition/msg/test_msg.py +++ b/test_unstructured/partition/msg/test_msg.py @@ -285,7 +285,7 @@ def test_partition_msg_with_pgp_encrypted_message( assert "Encrypted email detected" in caplog.text -def test_add_chunking_strategy_on_partition_msg( +def test_add_chunking_strategy_by_title_on_partition_msg( filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg"), ): elements = partition_msg(filename=filename) diff --git a/test_unstructured/partition/odt/test_odt.py b/test_unstructured/partition/odt/test_odt.py index 9fe9b4b99d..982a11f9b4 100644 --- a/test_unstructured/partition/odt/test_odt.py +++ b/test_unstructured/partition/odt/test_odt.py @@ -2,7 +2,7 @@ import pathlib from unstructured.chunking.title import chunk_by_title -from unstructured.documents.elements import Table, Title +from unstructured.documents.elements import Table, TableChunk, Title from unstructured.partition.json import partition_json from unstructured.partition.odt import partition_odt from unstructured.staging.base import elements_to_json @@ -169,3 +169,24 @@ def test_add_chunking_strategy_on_partition_odt( chunks = chunk_by_title(elements) assert chunk_elements != elements assert chunk_elements == chunks + + +def test_add_chunking_strategy_on_partition_odt_non_default(): + filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt") + elements = partition_odt(filename=filename) + chunk_elements = partition_odt( + filename, + chunking_strategy="by_title", + max_characters=7, + combine_text_under_n_chars=5, + ) + chunks = chunk_by_title( + elements, + max_characters=7, + combine_text_under_n_chars=5, + ) + for chunk in chunk_elements: + if isinstance(chunk, TableChunk): + assert len(chunk.text) <= 7 + assert chunk_elements != elements + assert chunk_elements == chunks diff --git a/test_unstructured/partition/pdf-image/test_image.py b/test_unstructured/partition/pdf-image/test_image.py index e2c9496356..721eed64dd 100644 --- a/test_unstructured/partition/pdf-image/test_image.py +++ b/test_unstructured/partition/pdf-image/test_image.py @@ -460,6 +460,25 @@ def test_add_chunking_strategy_on_partition_image( assert chunk_elements == chunks +def test_add_chunking_strategy_on_partition_image_hi_res( + filename="example-docs/layout-parser-paper-with-table.jpg", +): + elements = image.partition_image( + filename=filename, + strategy="hi_res", + infer_table_structure=True, + ) + chunk_elements = image.partition_image( + filename, + strategy="hi_res", + infer_table_structure=True, + chunking_strategy="by_title", + ) + chunks = chunk_by_title(elements) + assert chunk_elements != elements + assert chunk_elements == chunks + + def test_partition_image_uses_model_name(): with mock.patch.object( pdf, diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py index e14a793a2a..37af371598 100644 --- a/test_unstructured/partition/pdf-image/test_pdf.py +++ b/test_unstructured/partition/pdf-image/test_pdf.py @@ -838,7 +838,7 @@ def test_partition_pdf_with_ocr_coordinates_are_not_nan_from_file( assert point[1] is not math.nan -def test_add_chunking_strategy_on_partition_pdf( +def test_add_chunking_strategy_by_title_on_partition_pdf( filename="example-docs/layout-parser-paper-fast.pdf", ): elements = pdf.partition_pdf(filename=filename) diff --git a/test_unstructured/partition/pptx/test_ppt.py b/test_unstructured/partition/pptx/test_ppt.py index 1662002ddd..3750e0e9c6 100644 --- a/test_unstructured/partition/pptx/test_ppt.py +++ b/test_unstructured/partition/pptx/test_ppt.py @@ -174,7 +174,7 @@ def test_partition_ppt_with_json( assert elements[i] == test_elements[i] -def test_add_chunking_strategy_on_partition_ppt( +def test_add_chunking_strategy_by_title_on_partition_ppt( filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt"), ): elements = partition_ppt(filename=filename) diff --git a/test_unstructured/partition/pptx/test_pptx.py b/test_unstructured/partition/pptx/test_pptx.py index 3540c020e7..37e9b7ce3e 100644 --- a/test_unstructured/partition/pptx/test_pptx.py +++ b/test_unstructured/partition/pptx/test_pptx.py @@ -371,8 +371,8 @@ def test_partition_pptx_with_json(): assert elements[i] == test_elements[i] -def test_add_chunking_strategy_on_partition_pptx(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx") +def test_add_chunking_strategy_by_title_on_partition_pptx(): + filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "science-exploration-1p.pptx") elements = partition_pptx(filename=filename) chunk_elements = partition_pptx(filename, chunking_strategy="by_title") chunks = chunk_by_title(elements) diff --git a/test_unstructured/partition/pypandoc/test_org.py b/test_unstructured/partition/pypandoc/test_org.py index 9017c5e86f..81ad6d4ed2 100644 --- a/test_unstructured/partition/pypandoc/test_org.py +++ b/test_unstructured/partition/pypandoc/test_org.py @@ -136,7 +136,7 @@ def test_partition_org_with_json(filename="example-docs/README.org"): assert elements[i] == test_elements[i] -def test_add_chunking_strategy_on_partition_org( +def test_add_chunking_strategy_by_title_on_partition_org( filename="example-docs/README.org", ): elements = partition_org(filename=filename) diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index dcacf01ba2..a0c907aad3 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -17,6 +17,7 @@ ListItem, NarrativeText, Table, + TableChunk, Text, Title, ) @@ -937,37 +938,45 @@ def test_get_partition_with_extras_prompts_for_install_if_missing(): def test_add_chunking_strategy_on_partition_auto(): filename = "example-docs/example-10k-1p.html" - chunk_elements = partition(filename, chunking_strategy="by_title") elements = partition(filename) + chunk_elements = partition(filename, chunking_strategy="by_title") chunks = chunk_by_title(elements) assert chunk_elements != elements assert chunk_elements == chunks -def test_add_chunking_strategy_on_partition_auto_respects_multipage(): +def test_add_chunking_strategy_title_on_partition_auto_respects_multipage(): filename = "example-docs/example-10k-1p.html" partitioned_elements_multipage_false_combine_chars_0 = partition( filename, chunking_strategy="by_title", multipage_sections=False, - combine_under_n_chars=0, + combine_text_under_n_chars=0, + new_after_n_chars=300, + max_characters=400, ) partitioned_elements_multipage_true_combine_chars_0 = partition( filename, chunking_strategy="by_title", multipage_sections=True, - combine_under_n_chars=0, + combine_text_under_n_chars=0, + new_after_n_chars=300, + max_characters=400, ) elements = partition(filename) cleaned_elements_multipage_false_combine_chars_0 = chunk_by_title( elements, multipage_sections=False, - combine_under_n_chars=0, + combine_text_under_n_chars=0, + new_after_n_chars=300, + max_characters=400, ) cleaned_elements_multipage_true_combine_chars_0 = chunk_by_title( elements, multipage_sections=True, - combine_under_n_chars=0, + combine_text_under_n_chars=0, + new_after_n_chars=300, + max_characters=400, ) assert ( partitioned_elements_multipage_false_combine_chars_0 @@ -980,3 +989,69 @@ def test_add_chunking_strategy_on_partition_auto_respects_multipage(): assert len(partitioned_elements_multipage_true_combine_chars_0) != len( partitioned_elements_multipage_false_combine_chars_0, ) + + +def test_add_chunking_strategy_on_partition_auto_respects_max_chars(): + filename = "example-docs/example-10k-1p.html" + + # default chunk size in chars is 200 + partitioned_table_elements_200_chars = [ + e + for e in partition( + filename, + chunking_strategy="by_title", + max_characters=200, + combine_text_under_n_chars=5, + ) + if isinstance(e, (Table, TableChunk)) + ] + + partitioned_table_elements_5_chars = [ + e + for e in partition( + filename, + chunking_strategy="by_title", + max_characters=5, + combine_text_under_n_chars=5, + ) + if isinstance(e, (Table, TableChunk)) + ] + + elements = partition(filename) + + table_elements = [e for e in elements if isinstance(e, Table)] + + assert len(partitioned_table_elements_5_chars) != len(table_elements) + assert len(partitioned_table_elements_200_chars) != len(table_elements) + + assert len(partitioned_table_elements_5_chars[0].text) == 5 + assert len(partitioned_table_elements_5_chars[0].metadata.text_as_html) == 5 + + # the first table element is under 200 chars so doesn't get chunked! + assert table_elements[0] == partitioned_table_elements_200_chars[0] + assert len(partitioned_table_elements_200_chars[0].text) < 200 + assert len(partitioned_table_elements_200_chars[1].text) == 200 + assert len(partitioned_table_elements_200_chars[1].metadata.text_as_html) == 200 + + +def test_add_chunking_strategy_chars_on_partition_auto_adds_is_continuation(): + filename = "example-docs/example-10k-1p.html" + + # default chunk size in chars is 200 + partitioned_table_elements_200_chars = [ + e + for e in partition( + filename, + chunking_strategy="by_num_characters", + ) + if isinstance(e, Table) + ] + + i = 0 + for table in partitioned_table_elements_200_chars: + # have to reset the counter to 0 here when we encounter a Table element + if isinstance(table, Table): + i = 0 + if i > 0 and isinstance(table, TableChunk): + assert table.metadata.is_continuation is True + i += 1 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 5f8fd628c9..a4cf981717 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.19-dev6" # pragma: no cover +__version__ = "0.10.19-dev7" # pragma: no cover diff --git a/unstructured/chunking/title.py b/unstructured/chunking/title.py index 8fd38d62f3..0c5bde799c 100644 --- a/unstructured/chunking/title.py +++ b/unstructured/chunking/title.py @@ -1,6 +1,7 @@ +import copy import functools import inspect -from typing import Any, Callable, Dict, List, TypeVar +from typing import Any, Callable, Dict, List, Optional, TypeVar, Union from typing_extensions import ParamSpec @@ -9,96 +10,130 @@ Element, ElementMetadata, Table, + TableChunk, Text, Title, ) +def chunk_table_element( + element: Table, + max_characters: Optional[int] = 500, +) -> List[Union[Table, TableChunk]]: + text = element.text + html = getattr(element, "text_as_html", None) + + if len(text) <= max_characters and ( # type: ignore + html is None or len(html) <= max_characters # type: ignore + ): + return [element] + + chunks: List[Union[Table, TableChunk]] = [] + metadata = copy.copy(element.metadata) + is_continuation = False + + while text or html: + text_chunk, text = text[:max_characters], text[max_characters:] + table_chunk = TableChunk(text=text_chunk, metadata=copy.copy(metadata)) + + if html: + html_chunk, html = html[:max_characters], html[max_characters:] + table_chunk.metadata.text_as_html = html_chunk + + if is_continuation: + table_chunk.metadata.is_continuation = True + + chunks.append(table_chunk) + is_continuation = True + + return chunks + + def chunk_by_title( elements: List[Element], multipage_sections: bool = True, - combine_under_n_chars: int = 500, - new_after_n_chars: int = 1500, + combine_text_under_n_chars: int = 500, + new_after_n_chars: int = 500, + max_characters: int = 500, ) -> List[Element]: """Uses title elements to identify sections within the document for chunking. Splits off into a new section when a title is detected or if metadata changes, which happens when page numbers or sections change. Cuts off sections once they have exceeded - a character length of new_after_n_chars. + a character length of max_characters. Parameters ---------- elements - A list of unstructured elements. Usually the ouput of a partition functions. + A list of unstructured elements. Usually the output of a partition functions. multipage_sections If True, sections can span multiple pages. Defaults to True. - combine_under_n_chars + combine_text_under_n_chars Combines elements (for example a series of titles) until a section reaches a length of n characters. new_after_n_chars - Cuts off new sections once they reach a length of n characters + Cuts off new sections once they reach a length of n characters (soft max) + max_characters + Chunks table elements text and text_as_html into chunks of length n characters (hard max) + TODO: (amanda) extend to other elements """ if ( - combine_under_n_chars is not None + combine_text_under_n_chars is not None and new_after_n_chars is not None + and max_characters is not None and ( - combine_under_n_chars > new_after_n_chars - or combine_under_n_chars < 0 + combine_text_under_n_chars > new_after_n_chars + or combine_text_under_n_chars < 0 or new_after_n_chars < 0 + or max_characters <= 0 + or combine_text_under_n_chars > max_characters ) ): raise ValueError( - "Invalid values for combine_under_n_chars and/or new_after_n_chars.", + "Invalid values for combine_text_under_n_chars and/or max_characters.", ) chunked_elements: List[Element] = [] sections = _split_elements_by_title_and_table( elements, multipage_sections=multipage_sections, - combine_under_n_chars=combine_under_n_chars, + combine_text_under_n_chars=combine_text_under_n_chars, new_after_n_chars=new_after_n_chars, ) - for section in sections: if not section: continue - if not isinstance(section[0], Text) or isinstance(section[0], Table): - chunked_elements.extend(section) - elif isinstance(section[0], Text): - text = "" - metadata = section[0].metadata + first_element = section[0] - for i, element in enumerate(section): - if isinstance(element, Text): - text += "\n\n" if text else "" - start_char = len(text) - text += element.text + if not isinstance(first_element, Text): + chunked_elements.extend(section) + continue - for attr, value in vars(element.metadata).items(): - if not isinstance(value, list): - continue + elif isinstance(first_element, Table): + chunked_elements.extend(chunk_table_element(first_element, max_characters)) + continue - _value = getattr(metadata, attr, []) - if _value is None: - _value = [] + text = "" + metadata = first_element.metadata + start_char = 0 + for element in section: + if isinstance(element, Text): + text += "\n\n" if text else "" + start_char = len(text) + text += element.text + for attr, value in vars(element.metadata).items(): + if isinstance(value, list): + _value = getattr(metadata, attr, []) or [] if attr == "regex_metadata": for item in value: item["start"] += start_char item["end"] += start_char - if i > 0: - # NOTE(newelh): Previously, _value was extended with value. - # This caused a memory error if the content was a list of strings - # with a large number of elements -- doubling the list size each time. - # This now instead ensures that the _value list is unique and updated. - for item in value: - if item not in _value: - _value.append(item) - - setattr(metadata, attr, _value) + _value.extend(item for item in value if item not in _value) + setattr(metadata, attr, _value) - chunked_elements.append(CompositeElement(text=text, metadata=metadata)) + chunked_elements.append(CompositeElement(text=text, metadata=metadata)) return chunked_elements @@ -106,8 +141,8 @@ def chunk_by_title( def _split_elements_by_title_and_table( elements: List[Element], multipage_sections: bool = True, - combine_under_n_chars: int = 500, - new_after_n_chars: int = 1500, + combine_text_under_n_chars: int = 500, + new_after_n_chars: int = 500, ) -> List[List[Element]]: sections: List[List[Element]] = [] section: List[Element] = [] @@ -123,11 +158,11 @@ def _split_elements_by_title_and_table( ) section_length = sum([len(str(element)) for element in section]) - new_section = (isinstance(element, Title) and section_length > combine_under_n_chars) or ( - not metadata_matches or section_length > new_after_n_chars - ) + new_section = ( + isinstance(element, Title) and section_length > combine_text_under_n_chars + ) or (not metadata_matches or section_length > new_after_n_chars) - if isinstance(element, Table) or not isinstance(element, Text): + if not isinstance(element, Text) or isinstance(element, Table): sections.append(section) sections.append([element]) section = [] @@ -185,7 +220,7 @@ def add_chunking_strategy() -> Callable[[Callable[_P, List[Element]]], Callable[ """Decorator for chuncking text. Uses title elements to identify sections within the document for chunking. Splits off a new section when a title is detected or if metadata changes, which happens when page numbers or sections change. Cuts off sections once they have exceeded - a character length of new_after_n_chars.""" + a character length of max_characters.""" def decorator(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element]]: if func.__doc__ and ( @@ -199,11 +234,15 @@ def decorator(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element]]: + "\n\tAdditional Parameters:" + "\n\t\tmultipage_sections" + "\n\t\t\tIf True, sections can span multiple pages. Defaults to True." - + "\n\t\tcombine_under_n_chars" + + "\n\t\tcombine_text_under_n_chars" + "\n\t\t\tCombines elements (for example a series of titles) until a section" + "\n\t\t\treaches a length of n characters." + "\n\t\tnew_after_n_chars" - + "\n\t\t\tCuts off new sections once they reach a length of n characters" + + "\n\t\t\t Cuts off new sections once they reach a length of n characters" + + "\n\t\t\t a soft max." + + "\n\t\tmax_characters" + + "\n\t\t\tChunks table elements text and text_as_html into chunks" + + "\n\t\t\tof length n characters, a hard max." ) @functools.wraps(func) @@ -218,8 +257,9 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]: elements = chunk_by_title( elements, multipage_sections=params.get("multipage_sections", True), - combine_under_n_chars=params.get("combine_under_n_chars", 500), - new_after_n_chars=params.get("new_after_n_chars", 1500), + combine_text_under_n_chars=params.get("combine_text_under_n_chars", 500), + new_after_n_chars=params.get("new_after_n_chars", 500), + max_characters=params.get("max_characters", 500), ) return elements diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index f051e1b4f6..75c15a3e36 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -185,6 +185,10 @@ class ElementMetadata: # Metadata extracted via regex regex_metadata: Optional[Dict[str, List[RegexMetadata]]] = None + # Chunking metadata fields + num_characters: Optional[int] = None + is_continuation: Optional[bool] = None + # Detection Model Class Probabilities from Unstructured-Inference Hi-Res detection_class_prob: Optional[float] = None @@ -566,6 +570,14 @@ class Table(Text): pass +class TableChunk(Table): + """An element for capturing chunks of tables.""" + + category = "Table" + + pass + + class Header(Text): """An element for capturing document headers.""" diff --git a/unstructured/ingest/interfaces.py b/unstructured/ingest/interfaces.py index c76fdfb783..caefa50afd 100644 --- a/unstructured/ingest/interfaces.py +++ b/unstructured/ingest/interfaces.py @@ -83,16 +83,16 @@ def get_embedder(self) -> BaseEmbeddingEncoder: class ChunkingConfig(BaseConfig): chunk_elements: bool = False multipage_sections: bool = True - combine_under_n_chars: int = 500 - new_after_n_chars: int = 1500 + combine_text_under_n_chars: int = 500 + max_characters: int = 1500 def chunk(self, elements: t.List[Element]) -> t.List[Element]: if self.chunk_elements: return chunk_by_title( elements=elements, multipage_sections=self.multipage_sections, - combine_under_n_chars=self.combine_under_n_chars, - new_after_n_chars=self.new_after_n_chars, + combine_text_under_n_chars=self.combine_text_under_n_chars, + max_characters=self.max_characters, ) else: return elements diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py index 6a7314de03..2528f321cd 100644 --- a/unstructured/partition/csv.py +++ b/unstructured/partition/csv.py @@ -4,6 +4,7 @@ import pandas as pd from lxml.html.soupparser import fromstring as soupparser_fromstring +from unstructured.chunking.title import add_chunking_strategy from unstructured.documents.elements import ( Element, ElementMetadata, @@ -21,6 +22,7 @@ @process_metadata() @add_metadata_with_filetype(FileType.CSV) +@add_chunking_strategy() def partition_csv( filename: Optional[str] = None, file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None, diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py index 2f4538210f..ebffd6cdf9 100644 --- a/unstructured/partition/xlsx.py +++ b/unstructured/partition/xlsx.py @@ -4,6 +4,7 @@ import pandas as pd from lxml.html.soupparser import fromstring as soupparser_fromstring +from unstructured.chunking.title import add_chunking_strategy from unstructured.documents.elements import ( Element, ElementMetadata, @@ -21,6 +22,7 @@ @process_metadata() @add_metadata_with_filetype(FileType.XLSX) +@add_chunking_strategy() def partition_xlsx( filename: Optional[str] = None, file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None, diff --git a/unstructured/staging/weaviate.py b/unstructured/staging/weaviate.py index c6efc80bd4..4a4e15276c 100644 --- a/unstructured/staging/weaviate.py +++ b/unstructured/staging/weaviate.py @@ -15,6 +15,7 @@ class Properties(TypedDict): "regex_metadata", "emphasized_texts", "detection_class_prob", + "is_continuation", )