Commit

tidying
amanda103 committed Sep 29, 2023
1 parent 0522f9e commit c70ba59
Showing 8 changed files with 29 additions and 27 deletions.
3 changes: 1 addition & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -11,13 +11,12 @@
* **Embeddings support for the SharePoint SourceConnector via unstructured-ingest CLI** The SharePoint connector can now optionally create embeddings from the elements it pulls out during partition and upload those embeddings to an Azure Cognitive Search index.
* **Improves hierarchy from docx files by leveraging natural hierarchies built into docx documents** Hierarchy can now be detected from an indentation level for list bullets/numbers and by style name (e.g. Heading 1, List Bullet 2, List Number).
* **Chunking support for the SharePoint SourceConnector via unstructured-ingest CLI** The SharePoint connector can now optionally chunk the elements pulled out during partition via the chunking unstructured brick. This can be used as a stage before creating embeddings.

* **Adds Table support for the `add_chunking_strategy` decorator to partition functions.** In addition to combining elements under Title elements, users can now specify the `max_characters=<n>` (previously `combine_under_n_chars`) argument to chunk Table elements into TableChunk elements with `text` and `text_as_html` of at most <n> characters. This enables users to use partitioned Table results effectively in downstream applications without any post-processing.
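The table-chunking behavior described above can be sketched in isolation. This is a hypothetical standalone helper, not the library's own code: `split_table_text` is an illustrative name, and the real decorator operates on Table/TableChunk elements rather than bare strings.

```python
# Hypothetical sketch of chunking a table's text into fixed-size pieces,
# mirroring what the chunking decorator does for Table elements.
def split_table_text(text: str, max_characters: int = 1500) -> list[str]:
    """Return `text` unchanged if it fits, else slices of at most max_characters."""
    if len(text) <= max_characters:
        return [text]
    return [text[i : i + max_characters] for i in range(0, len(text), max_characters)]
```

A table whose text (or `text_as_html`) fits within the limit passes through whole; anything longer is split into consecutive slices, the last of which may be shorter.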

### Features

* **Adds `links` metadata in `partition_pdf` for the `fast` strategy.** Problem: PDF files contain rich information and hyperlinks that Unstructured did not capture earlier. Feature: `partition_pdf` can now capture embedded links within the file along with their associated text and page numbers. Importance: Providing depth in extracted elements gives users a better understanding and richer context of documents. This also enables users to map to other elements within the document when a hyperlink refers internally.
* **Adds the embedding module to be able to embed Elements** Problem: Many NLP applications require the ability to represent parts of documents in a semantic way. Until now, Unstructured did not have text embedding ability within the core library. Feature: This embedding module is able to track embedding-related data with a class, embed a list of elements, and return an updated list of Elements with the *embeddings* property. The module is also able to embed query strings. Importance: The ability to embed documents or parts of documents will enable users to make use of these semantic representations in different NLP applications, such as search, retrieval, and retrieval-augmented generation.
* **Adds `by_num_characters` as an option to the `add_chunking_strategy` decorator to partition functions.** Currently this strategy only applies to Table elements, but we plan to expand it to other element types. By specifying `chunking_strategy=by_num_characters` and optionally `num_characters=<n>`, partition will return TableChunk elements that have `text` and `text_as_html` variables in chunks of <n> characters. This enables users to use partitioned Table results effectively in downstream applications without any post-processing.
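The embedding flow described in the entry above can be illustrated with a toy sketch. `SimpleElement` and `embed_fn` are hypothetical stand-ins for the library's Element classes and encoder, not its actual API:

```python
# Toy sketch of the embedding module's flow: embed each element's text and
# return the same elements with an `embeddings` property populated.
from typing import Callable, List

class SimpleElement:
    """Hypothetical stand-in for an unstructured Element."""
    def __init__(self, text: str):
        self.text = text
        self.embeddings = None

def embed_elements(elements: List[SimpleElement],
                   embed_fn: Callable[[str], List[float]]) -> List[SimpleElement]:
    for element in elements:
        element.embeddings = embed_fn(element.text)
    return elements
```

In the real module, `embed_fn` would be backed by an embedding model; any callable mapping text to a vector fits this sketch.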

### Fixes

3 changes: 1 addition & 2 deletions test_unstructured/partition/csv/test_csv.py
@@ -202,7 +202,6 @@ def test_add_chunking_strategy_to_partition_csv_non_default():
max_characters=9,
combine_text_under_n_chars=0,
)
-chunks = chunk_by_title(elements, max_characters=9,
-combine_text_under_n_chars=0,)
+chunks = chunk_by_title(elements, max_characters=9, combine_text_under_n_chars=0)
assert chunk_elements != elements
assert chunk_elements == chunks
6 changes: 4 additions & 2 deletions test_unstructured/partition/docx/test_docx.py
@@ -483,7 +483,9 @@ def test_parse_category_depth_by_style_ilvl():
assert partitioner._parse_category_depth_by_style_ilvl() == 0


-def test_add_chunking_strategy_on_partition_docx_default_args(filename="example-docs/handbook-1p.docx"):
+def test_add_chunking_strategy_on_partition_docx_default_args(
+    filename="example-docs/handbook-1p.docx",
+):
chunk_elements = partition_docx(filename, chunking_strategy="by_title")
elements = partition_docx(filename)
chunks = chunk_by_title(elements)
@@ -499,7 +501,7 @@ def test_add_chunking_strategy_on_partition_docx(
filename,
chunking_strategy="by_title",
max_characters=9,
-combine_text_under_n_chars=5
+combine_text_under_n_chars=5,
)
elements = partition_docx(filename)
chunks = chunk_by_title(elements, max_characters=9, combine_text_under_n_chars=5)
9 changes: 5 additions & 4 deletions test_unstructured/partition/epub/test_epub.py
@@ -205,9 +205,10 @@ def test_add_chunking_strategy_on_partition_epub_non_default(
max_characters=5,
combine_text_under_n_chars=0,
)
-chunks = chunk_by_title(elements,
-max_characters=5,
-combine_text_under_n_chars=0,
-)
+chunks = chunk_by_title(
+    elements,
+    max_characters=5,
+    combine_text_under_n_chars=0,
+)
assert chunk_elements != elements
assert chunk_elements == chunks
11 changes: 6 additions & 5 deletions test_unstructured/partition/odt/test_odt.py
@@ -2,7 +2,7 @@
import pathlib

from unstructured.chunking.title import chunk_by_title
-from unstructured.documents.elements import Table, Title, TableChunk
+from unstructured.documents.elements import Table, TableChunk, Title
from unstructured.partition.json import partition_json
from unstructured.partition.odt import partition_odt
from unstructured.staging.base import elements_to_json
@@ -180,10 +180,11 @@ def test_add_chunking_strategy_on_partition_odt_non_default():
max_characters=7,
combine_text_under_n_chars=5,
)
-chunks = chunk_by_title(elements,
-max_characters=7,
-combine_text_under_n_chars=5,
-)
+chunks = chunk_by_title(
+    elements,
+    max_characters=7,
+    combine_text_under_n_chars=5,
+)
for chunk in chunk_elements:
if isinstance(chunk, TableChunk):
assert len(chunk.text) <= 7
4 changes: 2 additions & 2 deletions test_unstructured/partition/test_auto.py
@@ -995,7 +995,7 @@ def test_add_chunking_strategy_on_partition_auto_respects_max_chars():
max_characters=200,
combine_text_under_n_chars=5,
)
-if isinstance(e, Table) or isinstance(e, TableChunk)
+if isinstance(e, (Table, TableChunk))
]

partitioned_table_elements_5_chars = [
@@ -1006,7 +1006,7 @@
max_characters=5,
combine_text_under_n_chars=5,
)
-if isinstance(e, Table) or isinstance(e, TableChunk)
+if isinstance(e, (Table, TableChunk))
]

elements = partition(filename)
12 changes: 6 additions & 6 deletions unstructured/chunking/title.py
@@ -1,7 +1,7 @@
import copy
import functools
import inspect
-from typing import Any, Callable, Dict, List, Optional, TypeVar
+from typing import Any, Callable, Dict, List, Optional, TypeVar, Union

from typing_extensions import ParamSpec

@@ -19,15 +19,15 @@
def chunk_table_element(
element: Table,
max_characters: Optional[int] = 1500,
-) -> List[TableChunk]:
+) -> List[Union[Table, TableChunk]]:
chunks = []

element_char_len = len(element.text)

html_table = element.text_as_html if hasattr(element, "text_as_html") else None
if html_table:
element_char_len = len(html_table)
-if element_char_len <= max_characters:
+if element_char_len <= max_characters:  # type: ignore
chunks.append(element)
else:
text = element.text
@@ -172,9 +172,9 @@ def _split_elements_by_title_and_table(
)

section_length = sum([len(str(element)) for element in section])
-new_section = (isinstance(element, Title) and section_length > combine_text_under_n_chars) or (
-not metadata_matches or section_length > max_characters
-)
+new_section = (
+    isinstance(element, Title) and section_length > combine_text_under_n_chars
+) or (not metadata_matches or section_length > max_characters)

if not isinstance(element, Text) or isinstance(element, Table):
sections.append(section)
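The `new_section` predicate reformatted in the title.py hunk above can be sketched as a standalone function. This is a hypothetical helper for illustration only; in the library the logic sits inline in `_split_elements_by_title_and_table`:

```python
# Sketch of the section-splitting predicate: start a new section when a Title
# pushes the section past the combine threshold, when metadata stops matching,
# or when the section exceeds the hard character limit.
def starts_new_section(is_title: bool, section_length: int, metadata_matches: bool,
                       combine_text_under_n_chars: int = 500,
                       max_characters: int = 1500) -> bool:
    return (is_title and section_length > combine_text_under_n_chars) or (
        not metadata_matches or section_length > max_characters
    )
```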
8 changes: 4 additions & 4 deletions unstructured/ingest/interfaces.py
@@ -83,16 +83,16 @@ def get_embedder(self) -> BaseEmbeddingEncoder:
class ChunkingConfig(BaseConfig):
chunk_elements: bool = False
multipage_sections: bool = True
-combine_under_n_chars: int = 500
-new_after_n_chars: int = 1500
+combine_text_under_n_chars: int = 500
+max_characters: int = 1500

def chunk(self, elements: t.List[Element]) -> t.List[Element]:
if self.chunk_elements:
return chunk_by_title(
elements=elements,
multipage_sections=self.multipage_sections,
-combine_under_n_chars=self.combine_under_n_chars,
-new_after_n_chars=self.new_after_n_chars,
+combine_text_under_n_chars=self.combine_text_under_n_chars,
+max_characters=self.max_characters,
)
else:
return elements
