diff --git a/CHANGELOG.md b/CHANGELOG.md
index 201227b647..88a758d959 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,13 +11,12 @@
 * **Embeddings support for the SharePoint SourceConnector via unstructured-ingest CLI** The SharePoint connector can now optionally create embeddings from the elements it pulls out during partition and upload those embeddings to Azure Cognitive Search index.
 * **Improves hierarchy from docx files by leveraging natural hierarchies built into docx documents** Hierarchy can now be detected from an indentation level for list bullets/numbers and by style name (e.g. Heading 1, List Bullet 2, List Number).
 * **Chunking support for the SharePoint SourceConnector via unstructured-ingest CLI** The SharePoint connector can now optionally chunk the elements pulled out during partition via the chunking unstructured brick. This can be used as a stage before creating embeddings.
-
+* **Adds Table support for the `add_chunking_strategy` decorator to partition functions.** In addition to combining elements under Title elements, users can now specify the `max_characters=` (previously `new_after_n_chars`) argument to chunk Table elements into TableChunk elements whose `text` and `text_as_html` are each at most `max_characters` long. This enables users to use partitioned Table results effectively in downstream applications without any post-processing.

 ### Features

 * **Adds `links` metadata in `partition_pdf` for `fast` strategy.** Problem: PDF files contain rich information and hyperlink that Unstructured did not captured earlier. Feature: `partition_pdf` now can capture embedded links within the file along with its associated text and page number. Importance: Providing depth in extracted elements give user a better understanding and richer context of documents. This also enables user to map to other elements within the document if the hyperlink is refered internally.
 * **Adds the embedding module to be able to embed Elements** Problem: Many NLP applications require the ability to represent parts of documents in a semantic way. Until now, Unstructured did not have text embedding ability within the core library. Feature: This embedding module is able to track embeddings related data with a class, embed a list of elements, and return an updated list of Elements with the *embeddings* property. The module is also able to embed query strings. Importance: Ability to embed documents or parts of documents will enable users to make use of these semantic representations in different NLP applications, such as search, retrieval, and retrieval augmented generation.
-* **Adds `by_num_characters` as an option to the `add_chunking_strategy` decorator to partition functions.** Currently this strategy will only apply to Table elements but we plan to expand this to other element types. By specifying `chunking_strategy=by_num_characters` and optionally `num_characters=` paritition will return TableChunk elements that have `text` and `text_as_html` variables in chunks of characters. This enables users to use partitioned Table results effectively in downstream applications without any post processing.

 ### Fixes

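# A minimal sketch of the Table chunking behavior described in the changelog entry
# above. The partition call and argument values mirror the tests further down in
# this diff; whether "example-docs/handbook-1p.docx" actually contains a table is
# an assumption made for illustration.
from unstructured.documents.elements import TableChunk
from unstructured.partition.docx import partition_docx

chunks = partition_docx(
    "example-docs/handbook-1p.docx",
    chunking_strategy="by_title",
    max_characters=9,
    combine_text_under_n_chars=5,
)
for chunk in chunks:
    if isinstance(chunk, TableChunk):
        # Oversized tables are split into TableChunk pieces that respect the limit.
        assert len(chunk.text) <= 9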
diff --git a/test_unstructured/partition/csv/test_csv.py b/test_unstructured/partition/csv/test_csv.py
index fe5718886d..3f3d5e4ae0 100644
--- a/test_unstructured/partition/csv/test_csv.py
+++ b/test_unstructured/partition/csv/test_csv.py
@@ -202,7 +202,6 @@ def test_add_chunking_strategy_to_partition_csv_non_default():
         max_characters=9,
         combine_text_under_n_chars=0,
     )
-    chunks = chunk_by_title(elements, max_characters=9,
-                            combine_text_under_n_chars=0,)
+    chunks = chunk_by_title(elements, max_characters=9, combine_text_under_n_chars=0)
     assert chunk_elements != elements
     assert chunk_elements == chunks
diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/docx/test_docx.py
index 6217be239e..f58cba150c 100644
--- a/test_unstructured/partition/docx/test_docx.py
+++ b/test_unstructured/partition/docx/test_docx.py
@@ -483,7 +483,9 @@ def test_parse_category_depth_by_style_ilvl():
     assert partitioner._parse_category_depth_by_style_ilvl() == 0


-def test_add_chunking_strategy_on_partition_docx_default_args(filename="example-docs/handbook-1p.docx"):
+def test_add_chunking_strategy_on_partition_docx_default_args(
+    filename="example-docs/handbook-1p.docx",
+):
     chunk_elements = partition_docx(filename, chunking_strategy="by_title")
     elements = partition_docx(filename)
     chunks = chunk_by_title(elements)
@@ -499,7 +501,7 @@ def test_add_chunking_strategy_on_partition_docx(
         filename,
         chunking_strategy="by_title",
         max_characters=9,
-        combine_text_under_n_chars=5
+        combine_text_under_n_chars=5,
     )
     elements = partition_docx(filename)
     chunks = chunk_by_title(elements, max_characters=9, combine_text_under_n_chars=5)
diff --git a/test_unstructured/partition/epub/test_epub.py b/test_unstructured/partition/epub/test_epub.py
index 89f8470ec2..723f76ff93 100644
--- a/test_unstructured/partition/epub/test_epub.py
+++ b/test_unstructured/partition/epub/test_epub.py
@@ -205,9 +205,10 @@ def test_add_chunking_strategy_on_partition_epub_non_default(
         max_characters=5,
         combine_text_under_n_chars=0,
     )
-    chunks = chunk_by_title(elements,
-                            max_characters=5,
-                            combine_text_under_n_chars=0,
-                            )
+    chunks = chunk_by_title(
+        elements,
+        max_characters=5,
+        combine_text_under_n_chars=0,
+    )
     assert chunk_elements != elements
     assert chunk_elements == chunks
diff --git a/test_unstructured/partition/odt/test_odt.py b/test_unstructured/partition/odt/test_odt.py
index 9eb6ba7633..982a11f9b4 100644
--- a/test_unstructured/partition/odt/test_odt.py
+++ b/test_unstructured/partition/odt/test_odt.py
@@ -2,7 +2,7 @@
 import pathlib

 from unstructured.chunking.title import chunk_by_title
-from unstructured.documents.elements import Table, Title, TableChunk
+from unstructured.documents.elements import Table, TableChunk, Title
 from unstructured.partition.json import partition_json
 from unstructured.partition.odt import partition_odt
 from unstructured.staging.base import elements_to_json
@@ -180,10 +180,11 @@ def test_add_chunking_strategy_on_partition_odt_non_default():
         max_characters=7,
         combine_text_under_n_chars=5,
     )
-    chunks = chunk_by_title(elements,
-                            max_characters=7,
-                            combine_text_under_n_chars=5,
-                            )
+    chunks = chunk_by_title(
+        elements,
+        max_characters=7,
+        combine_text_under_n_chars=5,
+    )
     for chunk in chunk_elements:
         if isinstance(chunk, TableChunk):
             assert len(chunk.text) <= 7
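# A minimal sketch of the equivalence the tests above exercise: passing
# chunking_strategy to a partition function is expected to match partitioning
# first and then calling chunk_by_title with the same keyword arguments. The file
# path is reused from the docx tests; the argument values are arbitrary.
from unstructured.chunking.title import chunk_by_title
from unstructured.partition.docx import partition_docx

filename = "example-docs/handbook-1p.docx"

# One step: partition and chunk via the add_chunking_strategy decorator.
chunk_elements = partition_docx(
    filename,
    chunking_strategy="by_title",
    max_characters=9,
    combine_text_under_n_chars=5,
)

# Two steps: partition, then chunk explicitly.
elements = partition_docx(filename)
chunks = chunk_by_title(elements, max_characters=9, combine_text_under_n_chars=5)

assert chunk_elements == chunks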
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
index 7af935f2d3..4b9443729d 100644
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@@ -995,7 +995,7 @@ def test_add_chunking_strategy_on_partition_auto_respects_max_chars():
             max_characters=200,
             combine_text_under_n_chars=5,
         )
-        if isinstance(e, Table) or isinstance(e, TableChunk)
+        if isinstance(e, (Table, TableChunk))
     ]

     partitioned_table_elements_5_chars = [
@@ -1006,7 +1006,7 @@
             max_characters=5,
             combine_text_under_n_chars=5,
         )
-        if isinstance(e, Table) or isinstance(e, TableChunk)
+        if isinstance(e, (Table, TableChunk))
     ]

     elements = partition(filename)
diff --git a/unstructured/chunking/title.py b/unstructured/chunking/title.py
index 928d18a93b..83b0e3d730 100644
--- a/unstructured/chunking/title.py
+++ b/unstructured/chunking/title.py
@@ -1,7 +1,7 @@
 import copy
 import functools
 import inspect
-from typing import Any, Callable, Dict, List, Optional, TypeVar
+from typing import Any, Callable, Dict, List, Optional, TypeVar, Union

 from typing_extensions import ParamSpec

@@ -19,7 +19,7 @@
 def chunk_table_element(
     element: Table,
     max_characters: Optional[int] = 1500,
-) -> List[TableChunk]:
+) -> List[Union[Table, TableChunk]]:
     chunks = []
     element_char_len = len(element.text)

@@ -27,7 +27,7 @@
     html_table = element.text_as_html if hasattr(element, "text_as_html") else None
     if html_table:
         element_char_len = len(html_table)
-    if element_char_len <= max_characters:
+    if element_char_len <= max_characters:  # type: ignore
         chunks.append(element)
     else:
         text = element.text
@@ -172,9 +172,9 @@ def _split_elements_by_title_and_table(
         )
         section_length = sum([len(str(element)) for element in section])

-        new_section = (isinstance(element, Title) and section_length > combine_text_under_n_chars) or (
-            not metadata_matches or section_length > max_characters
-        )
+        new_section = (
+            isinstance(element, Title) and section_length > combine_text_under_n_chars
+        ) or (not metadata_matches or section_length > max_characters)

         if not isinstance(element, Text) or isinstance(element, Table):
             sections.append(section)
diff --git a/unstructured/ingest/interfaces.py b/unstructured/ingest/interfaces.py
index c76fdfb783..caefa50afd 100644
--- a/unstructured/ingest/interfaces.py
+++ b/unstructured/ingest/interfaces.py
@@ -83,16 +83,16 @@ def get_embedder(self) -> BaseEmbeddingEncoder:
 class ChunkingConfig(BaseConfig):
     chunk_elements: bool = False
     multipage_sections: bool = True
-    combine_under_n_chars: int = 500
-    new_after_n_chars: int = 1500
+    combine_text_under_n_chars: int = 500
+    max_characters: int = 1500

     def chunk(self, elements: t.List[Element]) -> t.List[Element]:
         if self.chunk_elements:
             return chunk_by_title(
                 elements=elements,
                 multipage_sections=self.multipage_sections,
-                combine_under_n_chars=self.combine_under_n_chars,
-                new_after_n_chars=self.new_after_n_chars,
+                combine_text_under_n_chars=self.combine_text_under_n_chars,
+                max_characters=self.max_characters,
             )
         else:
             return elements
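# A minimal sketch of the renamed ChunkingConfig fields above. Only the field
# names and defaults come from this diff; constructing ChunkingConfig directly
# with keyword arguments is an assumption about BaseConfig.
from unstructured.ingest.interfaces import ChunkingConfig
from unstructured.partition.docx import partition_docx

elements = partition_docx("example-docs/handbook-1p.docx")

chunking_config = ChunkingConfig(
    chunk_elements=True,
    combine_text_under_n_chars=500,  # formerly combine_under_n_chars
    max_characters=1500,  # formerly new_after_n_chars
)

# chunk() forwards the renamed fields to chunk_by_title when chunk_elements is True.
chunked_elements = chunking_config.chunk(elements)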