From 36e81c33673263f696c6e9f46b08d502fef45810 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Sat, 16 Dec 2023 09:28:15 -0800 Subject: [PATCH] rfctr(chunking): extract general-purpose objects to base (#2281) Many of the classes defined in `unstructured.chunking.title` are applicable to any chunking strategy and will shortly be used for the "by-character" chunking strategy as well. Move these and their tests to `unstructured.chunking.base`. Along the way, rename `TextPreChunkBuilder` to `PreChunkBuilder` because it will be generalized in a subsequent PR to also take `Table` elements such that inter-pre-chunk overlap can be implemented. Otherwise, no logic changes, just moves. --- CHANGELOG.md | 2 +- test_unstructured/chunking/test_base.py | 869 ++++++++++++++++++++++- test_unstructured/chunking/test_title.py | 854 +--------------------- unstructured/__version__.py | 2 +- unstructured/chunking/base.py | 418 ++++++++++- unstructured/chunking/title.py | 421 +---------- 6 files changed, 1298 insertions(+), 1268 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1dc82c3e53..fd76ccbc63 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.11.5-dev1 +## 0.11.5-dev2 ### Enhancements diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py index d2d4132eab..8988d6ded8 100644 --- a/test_unstructured/chunking/test_base.py +++ b/test_unstructured/chunking/test_base.py @@ -1,14 +1,35 @@ +# pyright: reportPrivateUsage=false + """Unit-test suite for the `unstructured.chunking.base` module.""" from __future__ import annotations +from typing import List + import pytest -from unstructured.chunking.base import ChunkingOptions +from unstructured.chunking.base import ( + ChunkingOptions, + PreChunkBuilder, + PreChunkCombiner, + TablePreChunk, + TextPreChunk, + TextPreChunkAccumulator, +) +from unstructured.documents.elements import ( + CompositeElement, + ElementMetadata, + PageBreak, + RegexMetadata, + Table, + TableChunk, + 
Text, + Title, +) class DescribeChunkingOptions: - """Unit-test suite for `unstructured.chunking.model.ChunkingOptions objects.""" + """Unit-test suite for `unstructured.chunking.base.ChunkingOptions objects.""" @pytest.mark.parametrize("max_characters", [0, -1, -42]) def it_rejects_max_characters_not_greater_than_zero(self, max_characters: int): @@ -111,3 +132,847 @@ def it_silently_accepts_new_after_n_chars_greater_than_maxchars(self): def it_knows_the_text_separator_string(self): assert ChunkingOptions.new().text_separator == "\n\n" + + +# ================================================================================================ +# PRE-CHUNK SUBTYPES +# ================================================================================================ + + +class DescribeTablePreChunk: + """Unit-test suite for `unstructured.chunking.base.TablePreChunk objects.""" + + def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self): + html_table = ( + "\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "
Header Col 1 Header Col 2
Lorem ipsum adipiscing
" + ) + text_table = "Header Col 1 Header Col 2\n" "Lorem ipsum adipiscing" + pre_chunk = TablePreChunk( + Table(text_table, metadata=ElementMetadata(text_as_html=html_table)), + opts=ChunkingOptions.new(max_characters=175), + ) + + chunk_iter = pre_chunk.iter_chunks() + + chunk = next(chunk_iter) + assert isinstance(chunk, Table) + assert chunk.text == "Header Col 1 Header Col 2\nLorem ipsum adipiscing" + assert chunk.metadata.text_as_html == ( + "\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "
Header Col 1 Header Col 2
Lorem ipsum adipiscing
" + ) + with pytest.raises(StopIteration): + next(chunk_iter) + + def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self): + # fixed-overhead = 8+8+9+8+9+8 = 50 + # per-row overhead = 27 + html_table = ( + "\n" # 8 + "\n" # 8 + "\n" + "\n" # 9 + "\n" # 8 + "\n" + "\n" + "\n" + "\n" + "\n" # 9 + "
Header Col 1 Header Col 2
Lorem ipsum A Link example
Consectetur adipiscing elit
Nunc aliquam id enim nec molestie
Vivamus quis nunc ipsum donec ac fermentum
" # 8 + ) + text_table = ( + "Header Col 1 Header Col 2\n" + "Lorem ipsum dolor sit amet\n" + "Consectetur adipiscing elit\n" + "Nunc aliquam id enim nec molestie\n" + "Vivamus quis nunc ipsum donec ac fermentum" + ) + pre_chunk = TablePreChunk( + Table(text_table, metadata=ElementMetadata(text_as_html=html_table)), + opts=ChunkingOptions.new(max_characters=100), + ) + + chunk_iter = pre_chunk.iter_chunks() + + chunk = next(chunk_iter) + assert isinstance(chunk, TableChunk) + assert chunk.text == ( + "Header Col 1 Header Col 2\n" + "Lorem ipsum dolor sit amet\n" + "Consectetur adipiscing elit\n" + "Nunc aliqua" + ) + assert chunk.metadata.text_as_html == ( + "\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "<" + ) + # -- note that text runs out but HTML continues because it's significantly longer. So two + # -- of these chunks have HTML but no text. + chunk = next(chunk_iter) + assert isinstance(chunk, TableChunk) + assert chunk.text == "" + assert chunk.metadata.text_as_html == ( + "/tr>\n" + "\n" + "\n\n
Header Col 1 Header Col 2
Lo" + ) + # -- + chunk = next(chunk_iter) + assert isinstance(chunk, TableChunk) + assert ( + chunk.text == "m id enim nec molestie\nVivamus quis nunc ipsum donec ac fermentum" + ) + assert chunk.metadata.text_as_html == ( + "rem ipsum A Link example
Consectetur adipiscing elit
Nunc aliquam id enim nec molestie
Vivamus quis " + ) + # -- + chunk = next(chunk_iter) + assert isinstance(chunk, TableChunk) + assert chunk.text == "" + assert chunk.metadata.text_as_html == ( + "nunc ipsum donec ac fermentum
" + ) + # -- + with pytest.raises(StopIteration): + next(chunk_iter) + + +class DescribeTextPreChunk: + """Unit-test suite for `unstructured.chunking.base.TextPreChunk objects.""" + + def it_can_combine_itself_with_another_TextPreChunk_instance(self): + """.combine() produces a new pre-chunk by appending the elements of `other_pre-chunk`. + + Note that neither the original or other pre_chunk are mutated. + """ + opts = ChunkingOptions.new() + pre_chunk = TextPreChunk( + [ + Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), + Text("In rhoncus ipsum sed lectus porta volutpat."), + ], + opts=opts, + ) + other_pre_chunk = TextPreChunk( + [ + Text("Donec semper facilisis metus finibus malesuada."), + Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."), + ], + opts=opts, + ) + + new_pre_chunk = pre_chunk.combine(other_pre_chunk) + + assert new_pre_chunk == TextPreChunk( + [ + Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), + Text("In rhoncus ipsum sed lectus porta volutpat."), + Text("Donec semper facilisis metus finibus malesuada."), + Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."), + ], + opts=opts, + ) + assert pre_chunk == TextPreChunk( + [ + Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), + Text("In rhoncus ipsum sed lectus porta volutpat."), + ], + opts=opts, + ) + assert other_pre_chunk == TextPreChunk( + [ + Text("Donec semper facilisis metus finibus malesuada."), + Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."), + ], + opts=opts, + ) + + def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self): + pre_chunk = TextPreChunk( + [ + Title("Introduction"), + Text( + "Lorem ipsum dolor sit amet consectetur adipiscing elit. 
In rhoncus ipsum sed" + "lectus porta volutpat.", + ), + ], + opts=ChunkingOptions.new(max_characters=200), + ) + + chunk_iter = pre_chunk.iter_chunks() + + chunk = next(chunk_iter) + assert chunk == CompositeElement( + "Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit." + " In rhoncus ipsum sedlectus porta volutpat.", + ) + assert chunk.metadata is pre_chunk._consolidated_metadata + + def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self): + # -- Chunk-splitting only occurs when a *single* element is too big to fit in the window. + # -- The pre-chunker will isolate that element in a pre_chunk of its own. + pre_chunk = TextPreChunk( + [ + Text( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod" + " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim" + " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea" + " commodo consequat." + ), + ], + opts=ChunkingOptions.new(max_characters=200), + ) + + chunk_iter = pre_chunk.iter_chunks() + + chunk = next(chunk_iter) + assert chunk == CompositeElement( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod" + " tempor incididunt ut labore et dolore magna aliqua. 
Ut enim ad minim" + " veniam, quis nostrud exercitation ullamco laboris nisi ut a" + ) + assert chunk.metadata is pre_chunk._consolidated_metadata + # -- + chunk = next(chunk_iter) + assert chunk == CompositeElement("liquip ex ea commodo consequat.") + assert chunk.metadata is pre_chunk._consolidated_metadata + # -- + with pytest.raises(StopIteration): + next(chunk_iter) + + def it_knows_the_length_of_the_combined_text_of_its_elements_which_is_the_chunk_size(self): + """.text_length is the size of chunk this pre-chunk will produce (before any splitting).""" + pre_chunk = TextPreChunk( + [PageBreak(""), Text("foo"), Text("bar")], opts=ChunkingOptions.new() + ) + assert pre_chunk.text_length == 8 + + def it_extracts_all_populated_metadata_values_from_the_elements_to_help(self): + pre_chunk = TextPreChunk( + [ + Title( + "Lorem Ipsum", + metadata=ElementMetadata( + category_depth=0, + filename="foo.docx", + languages=["lat"], + parent_id="f87731e0", + ), + ), + Text( + "'Lorem ipsum dolor' means 'Thank you very much' in Latin.", + metadata=ElementMetadata( + category_depth=1, + filename="foo.docx", + image_path="sprite.png", + languages=["lat", "eng"], + ), + ), + ], + opts=ChunkingOptions.new(), + ) + + assert pre_chunk._all_metadata_values == { + # -- scalar values are accumulated in a list in element order -- + "category_depth": [0, 1], + # -- all values are accumulated, not only unique ones -- + "filename": ["foo.docx", "foo.docx"], + # -- list-type fields produce a list of lists -- + "languages": [["lat"], ["lat", "eng"]], + # -- fields that only appear in some elements are captured -- + "image_path": ["sprite.png"], + "parent_id": ["f87731e0"], + # -- A `None` value never appears, neither does a field-name with an empty list -- + } + + def but_it_discards_ad_hoc_metadata_fields_during_consolidation(self): + metadata = ElementMetadata( + category_depth=0, + filename="foo.docx", + languages=["lat"], + parent_id="f87731e0", + ) + metadata.coefficient = 0.62 + 
metadata_2 = ElementMetadata( + category_depth=1, + filename="foo.docx", + image_path="sprite.png", + languages=["lat", "eng"], + ) + metadata_2.quotient = 1.74 + + pre_chunk = TextPreChunk( + [ + Title("Lorem Ipsum", metadata=metadata), + Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2), + ], + opts=ChunkingOptions.new(), + ) + + # -- ad-hoc fields "coefficient" and "quotient" do not appear -- + assert pre_chunk._all_metadata_values == { + "category_depth": [0, 1], + "filename": ["foo.docx", "foo.docx"], + "image_path": ["sprite.png"], + "languages": [["lat"], ["lat", "eng"]], + "parent_id": ["f87731e0"], + } + + def it_consolidates_regex_metadata_in_a_field_specific_way(self): + """regex_metadata of chunk is combined regex_metadatas of its elements. + + Also, the `start` and `end` offsets of each regex-match are adjusted to reflect their new + position in the chunk after element text has been concatenated. + """ + pre_chunk = TextPreChunk( + [ + Title( + "Lorem Ipsum", + metadata=ElementMetadata( + regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]}, + ), + ), + Text( + "Lorem ipsum dolor sit amet consectetur adipiscing elit.", + metadata=ElementMetadata( + regex_metadata={ + "dolor": [RegexMetadata(text="dolor", start=12, end=17)], + "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)], + }, + ), + ), + Text( + "In rhoncus ipsum sed lectus porta volutpat.", + metadata=ElementMetadata( + regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]}, + ), + ), + ], + opts=ChunkingOptions.new(), + ) + + regex_metadata = pre_chunk._consolidated_regex_meta + + assert regex_metadata == { + "dolor": [RegexMetadata(text="dolor", start=25, end=30)], + "ipsum": [ + RegexMetadata(text="Ipsum", start=6, end=11), + RegexMetadata(text="ipsum", start=19, end=24), + RegexMetadata(text="ipsum", start=81, end=86), + ], + } + + def 
it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strategies(self): + """._meta_kwargs is used like `ElementMetadata(**self._meta_kwargs)` to construct metadata. + + Only non-None fields should appear in the dict and each field value should be the + consolidation of the values across the pre_chunk elements. + """ + pre_chunk = TextPreChunk( + [ + PageBreak(""), + Title( + "Lorem Ipsum", + metadata=ElementMetadata( + filename="foo.docx", + # -- category_depth has DROP strategy so doesn't appear in result -- + category_depth=0, + emphasized_text_contents=["Lorem", "Ipsum"], + emphasized_text_tags=["b", "i"], + languages=["lat"], + regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]}, + ), + ), + Text( + "'Lorem ipsum dolor' means 'Thank you very much' in Latin.", + metadata=ElementMetadata( + # -- filename change doesn't happen IRL but demonstrates FIRST strategy -- + filename="bar.docx", + # -- emphasized_text_contents has LIST_CONCATENATE strategy, so "Lorem" + # -- appears twice in consolidated-meta (as it should) and length matches + # -- that of emphasized_text_tags both before and after consolidation. 
+ emphasized_text_contents=["Lorem", "ipsum"], + emphasized_text_tags=["i", "b"], + # -- languages has LIST_UNIQUE strategy, so "lat(in)" appears only once -- + languages=["eng", "lat"], + # -- regex_metadata has its own dedicated consolidation-strategy (REGEX) -- + regex_metadata={ + "dolor": [RegexMetadata(text="dolor", start=12, end=17)], + "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)], + }, + ), + ), + ], + opts=ChunkingOptions.new(), + ) + + meta_kwargs = pre_chunk._meta_kwargs + + assert meta_kwargs == { + "filename": "foo.docx", + "emphasized_text_contents": ["Lorem", "Ipsum", "Lorem", "ipsum"], + "emphasized_text_tags": ["b", "i", "i", "b"], + "languages": ["lat", "eng"], + "regex_metadata": { + "ipsum": [ + RegexMetadata(text="Ipsum", start=6, end=11), + RegexMetadata(text="ipsum", start=19, end=24), + ], + "dolor": [RegexMetadata(text="dolor", start=25, end=30)], + }, + } + + @pytest.mark.parametrize( + ("elements", "expected_value"), + [ + ([Text("foo"), Text("bar")], "foo\n\nbar"), + ([Text("foo"), PageBreak(""), Text("bar")], "foo\n\nbar"), + ([PageBreak(""), Text("foo"), Text("bar")], "foo\n\nbar"), + ([Text("foo"), Text("bar"), PageBreak("")], "foo\n\nbar"), + ], + ) + def it_knows_the_concatenated_text_of_the_pre_chunk( + self, elements: List[Text], expected_value: str + ): + """._text is the "joined" text of the pre-chunk elements. + + The text-segment contributed by each element is separated from the next by a blank line + ("\n\n"). An element that contributes no text does not give rise to a separator. 
+ """ + pre_chunk = TextPreChunk(elements, opts=ChunkingOptions.new()) + assert pre_chunk._text == expected_value + + +# ================================================================================================ +# PRE-CHUNKING ACCUMULATORS +# ================================================================================================ + + +class DescribePreChunkBuilder: + """Unit-test suite for `unstructured.chunking.base.PreChunkBuilder`.""" + + def it_is_empty_on_construction(self): + builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50)) + + assert builder.text_length == 0 + assert builder.remaining_space == 50 + + def it_accumulates_elements_added_to_it(self): + builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150)) + + builder.add_element(Title("Introduction")) + assert builder.text_length == 12 + assert builder.remaining_space == 136 + + builder.add_element( + Text( + "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed" + "lectus porta volutpat.", + ), + ) + assert builder.text_length == 112 + assert builder.remaining_space == 36 + + def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self): + builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150)) + builder.add_element(Title("Introduction")) + builder.add_element( + Text( + "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed" + "lectus porta volutpat.", + ), + ) + + pre_chunk = next(builder.flush()) + + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [ + Title("Introduction"), + Text( + "Lorem ipsum dolor sit amet consectetur adipiscing elit. 
In rhoncus ipsum sed" + "lectus porta volutpat.", + ), + ] + assert builder.text_length == 0 + assert builder.remaining_space == 150 + + def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self): + builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150)) + + pre_chunks = list(builder.flush()) + + assert pre_chunks == [] + assert builder.text_length == 0 + assert builder.remaining_space == 150 + + def it_considers_separator_length_when_computing_text_length_and_remaining_space(self): + builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50)) + builder.add_element(Text("abcde")) + builder.add_element(Text("fghij")) + + # -- .text_length includes a separator ("\n\n", len==2) between each text-segment, + # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10 + assert builder.text_length == 12 + # -- .remaining_space is reduced by the length (2) of the trailing separator which would go + # -- between the current text and that of the next element if one was added. 
+ # -- So 50 - 12 - 2 = 36 here, not 50 - 12 = 38 + assert builder.remaining_space == 36 + + +class DescribePreChunkCombiner: + """Unit-test suite for `unstructured.chunking.base.PreChunkCombiner`.""" + + def it_combines_sequential_small_text_pre_chunks(self): + opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250) + pre_chunks = [ + TextPreChunk( + [ + Title("Lorem Ipsum"), # 11 + Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55 + ], + opts=opts, + ), + TextPreChunk( + [ + Title("Mauris Nec"), # 10 + Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59 + ], + opts=opts, + ), + TextPreChunk( + [ + Title("Sed Orci"), # 8 + Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63 + ], + opts=opts, + ), + ] + + pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks() + + pre_chunk = next(pre_chunk_iter) + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [ + Title("Lorem Ipsum"), + Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), + Title("Mauris Nec"), + Text("Mauris nec urna non augue vulputate consequat eget et nisi."), + Title("Sed Orci"), + Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), + ] + with pytest.raises(StopIteration): + next(pre_chunk_iter) + + def but_it_does_not_combine_table_pre_chunks(self): + opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250) + pre_chunks = [ + TextPreChunk( + [ + Title("Lorem Ipsum"), + Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), + ], + opts=opts, + ), + TablePreChunk(Table("Heading\nCell text"), opts=opts), + TextPreChunk( + [ + Title("Mauris Nec"), + Text("Mauris nec urna non augue vulputate consequat eget et nisi."), + ], + opts=opts, + ), + ] + + pre_chunk_iter = PreChunkCombiner( + pre_chunks, ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250) + ).iter_combined_pre_chunks() + + 
pre_chunk = next(pre_chunk_iter) + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [ + Title("Lorem Ipsum"), + Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), + ] + # -- + pre_chunk = next(pre_chunk_iter) + assert isinstance(pre_chunk, TablePreChunk) + assert pre_chunk._table == Table("Heading\nCell text") + # -- + pre_chunk = next(pre_chunk_iter) + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [ + Title("Mauris Nec"), + Text("Mauris nec urna non augue vulputate consequat eget et nisi."), + ] + # -- + with pytest.raises(StopIteration): + next(pre_chunk_iter) + + def it_respects_the_specified_combination_threshold(self): + opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=80) + pre_chunks = [ + TextPreChunk( # 68 + [ + Title("Lorem Ipsum"), # 11 + Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55 + ], + opts=opts, + ), + TextPreChunk( # 71 + [ + Title("Mauris Nec"), # 10 + Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59 + ], + opts=opts, + ), + # -- len == 139 + TextPreChunk( + [ + Title("Sed Orci"), # 8 + Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63 + ], + opts=opts, + ), + ] + + pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks() + + pre_chunk = next(pre_chunk_iter) + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [ + Title("Lorem Ipsum"), + Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), + Title("Mauris Nec"), + Text("Mauris nec urna non augue vulputate consequat eget et nisi."), + ] + # -- + pre_chunk = next(pre_chunk_iter) + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [ + Title("Sed Orci"), + Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), + ] + # -- + with pytest.raises(StopIteration): + next(pre_chunk_iter) + + def 
it_respects_the_hard_maximum_window_length(self): + opts = ChunkingOptions.new(max_characters=200, combine_text_under_n_chars=200) + pre_chunks = [ + TextPreChunk( # 68 + [ + Title("Lorem Ipsum"), # 11 + Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55 + ], + opts=opts, + ), + TextPreChunk( # 71 + [ + Title("Mauris Nec"), # 10 + Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59 + ], + opts=opts, + ), + # -- len == 139 + TextPreChunk( + [ + Title("Sed Orci"), # 8 + Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63 + ], + opts=opts, + ), + # -- len == 214 + ] + + pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks() + + pre_chunk = next(pre_chunk_iter) + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [ + Title("Lorem Ipsum"), + Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), + Title("Mauris Nec"), + Text("Mauris nec urna non augue vulputate consequat eget et nisi."), + ] + # -- + pre_chunk = next(pre_chunk_iter) + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [ + Title("Sed Orci"), + Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), + ] + # -- + with pytest.raises(StopIteration): + next(pre_chunk_iter) + + def it_accommodates_and_isolates_an_oversized_pre_chunk(self): + """Such as occurs when a single element exceeds the window size.""" + opts = ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150) + pre_chunks = [ + TextPreChunk([Title("Lorem Ipsum")], opts=opts), + TextPreChunk( # 179 + [ + Text( + "Lorem ipsum dolor sit amet consectetur adipiscing elit." # 55 + " Mauris nec urna non augue vulputate consequat eget et nisi." # 60 + " Sed orci quam, eleifend sit amet vehicula, elementum ultricies." 
# 64 + ) + ], + opts=opts, + ), + TextPreChunk([Title("Vulputate Consequat")], opts=opts), + ] + + pre_chunk_iter = PreChunkCombiner( + pre_chunks, ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150) + ).iter_combined_pre_chunks() + + pre_chunk = next(pre_chunk_iter) + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [Title("Lorem Ipsum")] + # -- + pre_chunk = next(pre_chunk_iter) + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [ + Text( + "Lorem ipsum dolor sit amet consectetur adipiscing elit." + " Mauris nec urna non augue vulputate consequat eget et nisi." + " Sed orci quam, eleifend sit amet vehicula, elementum ultricies." + ) + ] + # -- + pre_chunk = next(pre_chunk_iter) + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [Title("Vulputate Consequat")] + # -- + with pytest.raises(StopIteration): + next(pre_chunk_iter) + + +class DescribeTextPreChunkAccumulator: + """Unit-test suite for `unstructured.chunking.base.TextPreChunkAccumulator`.""" + + def it_is_empty_on_construction(self): + accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=100)) + + assert accum.text_length == 0 + assert accum.remaining_space == 100 + + def it_accumulates_pre_chunks_added_to_it(self): + opts = ChunkingOptions.new(max_characters=500) + accum = TextPreChunkAccumulator(opts=opts) + + accum.add_pre_chunk( + TextPreChunk( + [ + Title("Lorem Ipsum"), + Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), + ], + opts=opts, + ) + ) + assert accum.text_length == 68 + assert accum.remaining_space == 430 + + accum.add_pre_chunk( + TextPreChunk( + [ + Title("Mauris Nec"), + Text("Mauris nec urna non augue vulputate consequat eget et nisi."), + ], + opts=opts, + ) + ) + assert accum.text_length == 141 + assert accum.remaining_space == 357 + + def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self): + opts = 
ChunkingOptions.new(max_characters=150) + accum = TextPreChunkAccumulator(opts=opts) + accum.add_pre_chunk( + TextPreChunk( + [ + Title("Lorem Ipsum"), + Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), + ], + opts=opts, + ) + ) + accum.add_pre_chunk( + TextPreChunk( + [ + Title("Mauris Nec"), + Text("Mauris nec urna non augue vulputate consequat eget et nisi."), + ], + opts=opts, + ) + ) + accum.add_pre_chunk( + TextPreChunk( + [ + Title("Sed Orci"), + Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."), + ], + opts=opts, + ) + ) + + pre_chunk_iter = accum.flush() + + # -- iterator generates exactly one pre_chunk -- + pre_chunk = next(pre_chunk_iter) + with pytest.raises(StopIteration): + next(pre_chunk_iter) + # -- and it is a _TextPreChunk containing all the elements -- + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [ + Title("Lorem Ipsum"), + Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), + Title("Mauris Nec"), + Text("Mauris nec urna non augue vulputate consequat eget et nisi."), + Title("Sed Orci"), + Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."), + ] + assert accum.text_length == 0 + assert accum.remaining_space == 150 + + def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self): + accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=150)) + + pre_chunks = list(accum.flush()) + + assert pre_chunks == [] + assert accum.text_length == 0 + assert accum.remaining_space == 150 + + def it_considers_separator_length_when_computing_text_length_and_remaining_space(self): + opts = ChunkingOptions.new(max_characters=100) + accum = TextPreChunkAccumulator(opts=opts) + accum.add_pre_chunk(TextPreChunk([Text("abcde")], opts=opts)) + accum.add_pre_chunk(TextPreChunk([Text("fghij")], opts=opts)) + + # -- .text_length includes a separator ("\n\n", len==2) between each text-segment, + # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 
10 + assert accum.text_length == 12 + # -- .remaining_space is reduced by the length (2) of the trailing separator which would + # -- go between the current text and that of the next pre-chunk if one was added. + # -- So 100 - 12 - 2 = 86 here, not 100 - 12 = 88 + assert accum.remaining_space == 86 diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py index df12220b6e..dfb196b069 100644 --- a/test_unstructured/chunking/test_title.py +++ b/test_unstructured/chunking/test_title.py @@ -4,16 +4,8 @@ import pytest -from unstructured.chunking.base import ChunkingOptions -from unstructured.chunking.title import ( - PreChunkCombiner, - TablePreChunk, - TextPreChunk, - TextPreChunkAccumulator, - TextPreChunkBuilder, - _split_elements_by_title_and_table, - chunk_by_title, -) +from unstructured.chunking.base import ChunkingOptions, TablePreChunk, TextPreChunk +from unstructured.chunking.title import _split_elements_by_title_and_table, chunk_by_title from unstructured.documents.coordinates import CoordinateSystem from unstructured.documents.elements import ( CheckBox, @@ -22,10 +14,8 @@ Element, ElementMetadata, ListItem, - PageBreak, RegexMetadata, Table, - TableChunk, Text, Title, ) @@ -552,843 +542,3 @@ def test_it_considers_separator_length_when_pre_chunking(): ), CompositeElement("Minimize mid-text chunk-splitting"), ] - - -# == PreChunks =================================================================================== - - -class DescribeTablePreChunk: - """Unit-test suite for `unstructured.chunking.title.TablePreChunk objects.""" - - def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self): - html_table = ( - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "
Header Col 1 Header Col 2
Lorem ipsum adipiscing
" - ) - text_table = "Header Col 1 Header Col 2\n" "Lorem ipsum adipiscing" - pre_chunk = TablePreChunk( - Table(text_table, metadata=ElementMetadata(text_as_html=html_table)), - opts=ChunkingOptions.new(max_characters=175), - ) - - chunk_iter = pre_chunk.iter_chunks() - - chunk = next(chunk_iter) - assert isinstance(chunk, Table) - assert chunk.text == "Header Col 1 Header Col 2\nLorem ipsum adipiscing" - assert chunk.metadata.text_as_html == ( - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "
Header Col 1 Header Col 2
Lorem ipsum adipiscing
" - ) - with pytest.raises(StopIteration): - next(chunk_iter) - - def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self): - # fixed-overhead = 8+8+9+8+9+8 = 50 - # per-row overhead = 27 - html_table = ( - "\n" # 8 - "\n" # 8 - "\n" - "\n" # 9 - "\n" # 8 - "\n" - "\n" - "\n" - "\n" - "\n" # 9 - "
Header Col 1 Header Col 2
Lorem ipsum A Link example
Consectetur adipiscing elit
Nunc aliquam id enim nec molestie
Vivamus quis nunc ipsum donec ac fermentum
" # 8 - ) - text_table = ( - "Header Col 1 Header Col 2\n" - "Lorem ipsum dolor sit amet\n" - "Consectetur adipiscing elit\n" - "Nunc aliquam id enim nec molestie\n" - "Vivamus quis nunc ipsum donec ac fermentum" - ) - pre_chunk = TablePreChunk( - Table(text_table, metadata=ElementMetadata(text_as_html=html_table)), - opts=ChunkingOptions.new(max_characters=100), - ) - - chunk_iter = pre_chunk.iter_chunks() - - chunk = next(chunk_iter) - assert isinstance(chunk, TableChunk) - assert chunk.text == ( - "Header Col 1 Header Col 2\n" - "Lorem ipsum dolor sit amet\n" - "Consectetur adipiscing elit\n" - "Nunc aliqua" - ) - assert chunk.metadata.text_as_html == ( - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "<" - ) - # -- note that text runs out but HTML continues because it's significantly longer. So two - # -- of these chunks have HTML but no text. - chunk = next(chunk_iter) - assert isinstance(chunk, TableChunk) - assert chunk.text == "" - assert chunk.metadata.text_as_html == ( - "/tr>\n" - "\n" - "\n\n
Header Col 1 Header Col 2
Lo" - ) - # -- - chunk = next(chunk_iter) - assert isinstance(chunk, TableChunk) - assert ( - chunk.text == "m id enim nec molestie\nVivamus quis nunc ipsum donec ac fermentum" - ) - assert chunk.metadata.text_as_html == ( - "rem ipsum A Link example
Consectetur adipiscing elit
Nunc aliquam id enim nec molestie
Vivamus quis " - ) - # -- - chunk = next(chunk_iter) - assert isinstance(chunk, TableChunk) - assert chunk.text == "" - assert chunk.metadata.text_as_html == ( - "nunc ipsum donec ac fermentum
" - ) - # -- - with pytest.raises(StopIteration): - next(chunk_iter) - - -class DescribeTextPreChunk: - """Unit-test suite for `unstructured.chunking.title.TextPreChunk objects.""" - - def it_can_combine_itself_with_another_TextPreChunk_instance(self): - """.combine() produces a new pre-chunk by appending the elements of `other_pre-chunk`. - - Note that neither the original or other pre_chunk are mutated. - """ - opts = ChunkingOptions.new() - pre_chunk = TextPreChunk( - [ - Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), - Text("In rhoncus ipsum sed lectus porta volutpat."), - ], - opts=opts, - ) - other_pre_chunk = TextPreChunk( - [ - Text("Donec semper facilisis metus finibus malesuada."), - Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."), - ], - opts=opts, - ) - - new_pre_chunk = pre_chunk.combine(other_pre_chunk) - - assert new_pre_chunk == TextPreChunk( - [ - Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), - Text("In rhoncus ipsum sed lectus porta volutpat."), - Text("Donec semper facilisis metus finibus malesuada."), - Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."), - ], - opts=opts, - ) - assert pre_chunk == TextPreChunk( - [ - Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), - Text("In rhoncus ipsum sed lectus porta volutpat."), - ], - opts=opts, - ) - assert other_pre_chunk == TextPreChunk( - [ - Text("Donec semper facilisis metus finibus malesuada."), - Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."), - ], - opts=opts, - ) - - def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self): - pre_chunk = TextPreChunk( - [ - Title("Introduction"), - Text( - "Lorem ipsum dolor sit amet consectetur adipiscing elit. 
In rhoncus ipsum sed" - "lectus porta volutpat.", - ), - ], - opts=ChunkingOptions.new(max_characters=200), - ) - - chunk_iter = pre_chunk.iter_chunks() - - chunk = next(chunk_iter) - assert chunk == CompositeElement( - "Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit." - " In rhoncus ipsum sedlectus porta volutpat.", - ) - assert chunk.metadata is pre_chunk._consolidated_metadata - - def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self): - # -- Chunk-splitting only occurs when a *single* element is too big to fit in the window. - # -- The pre-chunker will isolate that element in a pre_chunk of its own. - pre_chunk = TextPreChunk( - [ - Text( - "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod" - " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim" - " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea" - " commodo consequat." - ), - ], - opts=ChunkingOptions.new(max_characters=200), - ) - - chunk_iter = pre_chunk.iter_chunks() - - chunk = next(chunk_iter) - assert chunk == CompositeElement( - "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod" - " tempor incididunt ut labore et dolore magna aliqua. 
Ut enim ad minim" - " veniam, quis nostrud exercitation ullamco laboris nisi ut a" - ) - assert chunk.metadata is pre_chunk._consolidated_metadata - # -- - chunk = next(chunk_iter) - assert chunk == CompositeElement("liquip ex ea commodo consequat.") - assert chunk.metadata is pre_chunk._consolidated_metadata - # -- - with pytest.raises(StopIteration): - next(chunk_iter) - - def it_knows_the_length_of_the_combined_text_of_its_elements_which_is_the_chunk_size(self): - """.text_length is the size of chunk this pre-chunk will produce (before any splitting).""" - pre_chunk = TextPreChunk( - [PageBreak(""), Text("foo"), Text("bar")], opts=ChunkingOptions.new() - ) - assert pre_chunk.text_length == 8 - - def it_extracts_all_populated_metadata_values_from_the_elements_to_help(self): - pre_chunk = TextPreChunk( - [ - Title( - "Lorem Ipsum", - metadata=ElementMetadata( - category_depth=0, - filename="foo.docx", - languages=["lat"], - parent_id="f87731e0", - ), - ), - Text( - "'Lorem ipsum dolor' means 'Thank you very much' in Latin.", - metadata=ElementMetadata( - category_depth=1, - filename="foo.docx", - image_path="sprite.png", - languages=["lat", "eng"], - ), - ), - ], - opts=ChunkingOptions.new(), - ) - - assert pre_chunk._all_metadata_values == { - # -- scalar values are accumulated in a list in element order -- - "category_depth": [0, 1], - # -- all values are accumulated, not only unique ones -- - "filename": ["foo.docx", "foo.docx"], - # -- list-type fields produce a list of lists -- - "languages": [["lat"], ["lat", "eng"]], - # -- fields that only appear in some elements are captured -- - "image_path": ["sprite.png"], - "parent_id": ["f87731e0"], - # -- A `None` value never appears, neither does a field-name with an empty list -- - } - - def but_it_discards_ad_hoc_metadata_fields_during_consolidation(self): - metadata = ElementMetadata( - category_depth=0, - filename="foo.docx", - languages=["lat"], - parent_id="f87731e0", - ) - metadata.coefficient = 0.62 - 
metadata_2 = ElementMetadata( - category_depth=1, - filename="foo.docx", - image_path="sprite.png", - languages=["lat", "eng"], - ) - metadata_2.quotient = 1.74 - - pre_chunk = TextPreChunk( - [ - Title("Lorem Ipsum", metadata=metadata), - Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2), - ], - opts=ChunkingOptions.new(), - ) - - # -- ad-hoc fields "coefficient" and "quotient" do not appear -- - assert pre_chunk._all_metadata_values == { - "category_depth": [0, 1], - "filename": ["foo.docx", "foo.docx"], - "image_path": ["sprite.png"], - "languages": [["lat"], ["lat", "eng"]], - "parent_id": ["f87731e0"], - } - - def it_consolidates_regex_metadata_in_a_field_specific_way(self): - """regex_metadata of chunk is combined regex_metadatas of its elements. - - Also, the `start` and `end` offsets of each regex-match are adjusted to reflect their new - position in the chunk after element text has been concatenated. - """ - pre_chunk = TextPreChunk( - [ - Title( - "Lorem Ipsum", - metadata=ElementMetadata( - regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]}, - ), - ), - Text( - "Lorem ipsum dolor sit amet consectetur adipiscing elit.", - metadata=ElementMetadata( - regex_metadata={ - "dolor": [RegexMetadata(text="dolor", start=12, end=17)], - "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)], - }, - ), - ), - Text( - "In rhoncus ipsum sed lectus porta volutpat.", - metadata=ElementMetadata( - regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]}, - ), - ), - ], - opts=ChunkingOptions.new(), - ) - - regex_metadata = pre_chunk._consolidated_regex_meta - - assert regex_metadata == { - "dolor": [RegexMetadata(text="dolor", start=25, end=30)], - "ipsum": [ - RegexMetadata(text="Ipsum", start=6, end=11), - RegexMetadata(text="ipsum", start=19, end=24), - RegexMetadata(text="ipsum", start=81, end=86), - ], - } - - def 
it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strategies(self): - """._meta_kwargs is used like `ElementMetadata(**self._meta_kwargs)` to construct metadata. - - Only non-None fields should appear in the dict and each field value should be the - consolidation of the values across the pre_chunk elements. - """ - pre_chunk = TextPreChunk( - [ - PageBreak(""), - Title( - "Lorem Ipsum", - metadata=ElementMetadata( - filename="foo.docx", - # -- category_depth has DROP strategy so doesn't appear in result -- - category_depth=0, - emphasized_text_contents=["Lorem", "Ipsum"], - emphasized_text_tags=["b", "i"], - languages=["lat"], - regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]}, - ), - ), - Text( - "'Lorem ipsum dolor' means 'Thank you very much' in Latin.", - metadata=ElementMetadata( - # -- filename change doesn't happen IRL but demonstrates FIRST strategy -- - filename="bar.docx", - # -- emphasized_text_contents has LIST_CONCATENATE strategy, so "Lorem" - # -- appears twice in consolidated-meta (as it should) and length matches - # -- that of emphasized_text_tags both before and after consolidation. 
- emphasized_text_contents=["Lorem", "ipsum"], - emphasized_text_tags=["i", "b"], - # -- languages has LIST_UNIQUE strategy, so "lat(in)" appears only once -- - languages=["eng", "lat"], - # -- regex_metadata has its own dedicated consolidation-strategy (REGEX) -- - regex_metadata={ - "dolor": [RegexMetadata(text="dolor", start=12, end=17)], - "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)], - }, - ), - ), - ], - opts=ChunkingOptions.new(), - ) - - meta_kwargs = pre_chunk._meta_kwargs - - assert meta_kwargs == { - "filename": "foo.docx", - "emphasized_text_contents": ["Lorem", "Ipsum", "Lorem", "ipsum"], - "emphasized_text_tags": ["b", "i", "i", "b"], - "languages": ["lat", "eng"], - "regex_metadata": { - "ipsum": [ - RegexMetadata(text="Ipsum", start=6, end=11), - RegexMetadata(text="ipsum", start=19, end=24), - ], - "dolor": [RegexMetadata(text="dolor", start=25, end=30)], - }, - } - - @pytest.mark.parametrize( - ("elements", "expected_value"), - [ - ([Text("foo"), Text("bar")], "foo\n\nbar"), - ([Text("foo"), PageBreak(""), Text("bar")], "foo\n\nbar"), - ([PageBreak(""), Text("foo"), Text("bar")], "foo\n\nbar"), - ([Text("foo"), Text("bar"), PageBreak("")], "foo\n\nbar"), - ], - ) - def it_knows_the_concatenated_text_of_the_pre_chunk( - self, elements: List[Text], expected_value: str - ): - """._text is the "joined" text of the pre-chunk elements. - - The text-segment contributed by each element is separated from the next by a blank line - ("\n\n"). An element that contributes no text does not give rise to a separator. 
- """ - pre_chunk = TextPreChunk(elements, opts=ChunkingOptions.new()) - assert pre_chunk._text == expected_value - - -class DescribeTextPreChunkBuilder: - """Unit-test suite for `unstructured.chunking.title.TextPreChunkBuilder`.""" - - def it_is_empty_on_construction(self): - builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=50)) - - assert builder.text_length == 0 - assert builder.remaining_space == 50 - - def it_accumulates_elements_added_to_it(self): - builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=150)) - - builder.add_element(Title("Introduction")) - assert builder.text_length == 12 - assert builder.remaining_space == 136 - - builder.add_element( - Text( - "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed" - "lectus porta volutpat.", - ), - ) - assert builder.text_length == 112 - assert builder.remaining_space == 36 - - def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self): - builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=150)) - builder.add_element(Title("Introduction")) - builder.add_element( - Text( - "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed" - "lectus porta volutpat.", - ), - ) - - pre_chunk = next(builder.flush()) - - assert isinstance(pre_chunk, TextPreChunk) - assert pre_chunk._elements == [ - Title("Introduction"), - Text( - "Lorem ipsum dolor sit amet consectetur adipiscing elit. 
In rhoncus ipsum sed" - "lectus porta volutpat.", - ), - ] - assert builder.text_length == 0 - assert builder.remaining_space == 150 - - def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self): - builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=150)) - - pre_chunks = list(builder.flush()) - - assert pre_chunks == [] - assert builder.text_length == 0 - assert builder.remaining_space == 150 - - def it_considers_separator_length_when_computing_text_length_and_remaining_space(self): - builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=50)) - builder.add_element(Text("abcde")) - builder.add_element(Text("fghij")) - - # -- .text_length includes a separator ("\n\n", len==2) between each text-segment, - # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10 - assert builder.text_length == 12 - # -- .remaining_space is reduced by the length (2) of the trailing separator which would go - # -- between the current text and that of the next element if one was added. 
- # -- So 50 - 12 - 2 = 36 here, not 50 - 12 = 38 - assert builder.remaining_space == 36 - - -# == PreChunkCombiner ============================================================================= - - -class DescribePreChunkCombiner: - """Unit-test suite for `unstructured.chunking.title.PreChunkCombiner`.""" - - def it_combines_sequential_small_text_pre_chunks(self): - opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250) - pre_chunks = [ - TextPreChunk( - [ - Title("Lorem Ipsum"), # 11 - Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55 - ], - opts=opts, - ), - TextPreChunk( - [ - Title("Mauris Nec"), # 10 - Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59 - ], - opts=opts, - ), - TextPreChunk( - [ - Title("Sed Orci"), # 8 - Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63 - ], - opts=opts, - ), - ] - - pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks() - - pre_chunk = next(pre_chunk_iter) - assert isinstance(pre_chunk, TextPreChunk) - assert pre_chunk._elements == [ - Title("Lorem Ipsum"), - Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), - Title("Mauris Nec"), - Text("Mauris nec urna non augue vulputate consequat eget et nisi."), - Title("Sed Orci"), - Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), - ] - with pytest.raises(StopIteration): - next(pre_chunk_iter) - - def but_it_does_not_combine_table_pre_chunks(self): - opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250) - pre_chunks = [ - TextPreChunk( - [ - Title("Lorem Ipsum"), - Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), - ], - opts=opts, - ), - TablePreChunk(Table("Heading\nCell text"), opts=opts), - TextPreChunk( - [ - Title("Mauris Nec"), - Text("Mauris nec urna non augue vulputate consequat eget et nisi."), - ], - opts=opts, - ), - ] - - pre_chunk_iter = PreChunkCombiner( - pre_chunks, 
ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250) - ).iter_combined_pre_chunks() - - pre_chunk = next(pre_chunk_iter) - assert isinstance(pre_chunk, TextPreChunk) - assert pre_chunk._elements == [ - Title("Lorem Ipsum"), - Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), - ] - # -- - pre_chunk = next(pre_chunk_iter) - assert isinstance(pre_chunk, TablePreChunk) - assert pre_chunk._table == Table("Heading\nCell text") - # -- - pre_chunk = next(pre_chunk_iter) - assert isinstance(pre_chunk, TextPreChunk) - assert pre_chunk._elements == [ - Title("Mauris Nec"), - Text("Mauris nec urna non augue vulputate consequat eget et nisi."), - ] - # -- - with pytest.raises(StopIteration): - next(pre_chunk_iter) - - def it_respects_the_specified_combination_threshold(self): - opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=80) - pre_chunks = [ - TextPreChunk( # 68 - [ - Title("Lorem Ipsum"), # 11 - Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55 - ], - opts=opts, - ), - TextPreChunk( # 71 - [ - Title("Mauris Nec"), # 10 - Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59 - ], - opts=opts, - ), - # -- len == 139 - TextPreChunk( - [ - Title("Sed Orci"), # 8 - Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63 - ], - opts=opts, - ), - ] - - pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks() - - pre_chunk = next(pre_chunk_iter) - assert isinstance(pre_chunk, TextPreChunk) - assert pre_chunk._elements == [ - Title("Lorem Ipsum"), - Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), - Title("Mauris Nec"), - Text("Mauris nec urna non augue vulputate consequat eget et nisi."), - ] - # -- - pre_chunk = next(pre_chunk_iter) - assert isinstance(pre_chunk, TextPreChunk) - assert pre_chunk._elements == [ - Title("Sed Orci"), - Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), - ] - # -- - 
with pytest.raises(StopIteration): - next(pre_chunk_iter) - - def it_respects_the_hard_maximum_window_length(self): - opts = ChunkingOptions.new(max_characters=200, combine_text_under_n_chars=200) - pre_chunks = [ - TextPreChunk( # 68 - [ - Title("Lorem Ipsum"), # 11 - Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55 - ], - opts=opts, - ), - TextPreChunk( # 71 - [ - Title("Mauris Nec"), # 10 - Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59 - ], - opts=opts, - ), - # -- len == 139 - TextPreChunk( - [ - Title("Sed Orci"), # 8 - Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63 - ], - opts=opts, - ), - # -- len == 214 - ] - - pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks() - - pre_chunk = next(pre_chunk_iter) - assert isinstance(pre_chunk, TextPreChunk) - assert pre_chunk._elements == [ - Title("Lorem Ipsum"), - Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), - Title("Mauris Nec"), - Text("Mauris nec urna non augue vulputate consequat eget et nisi."), - ] - # -- - pre_chunk = next(pre_chunk_iter) - assert isinstance(pre_chunk, TextPreChunk) - assert pre_chunk._elements == [ - Title("Sed Orci"), - Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), - ] - # -- - with pytest.raises(StopIteration): - next(pre_chunk_iter) - - def it_accommodates_and_isolates_an_oversized_pre_chunk(self): - """Such as occurs when a single element exceeds the window size.""" - opts = ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150) - pre_chunks = [ - TextPreChunk([Title("Lorem Ipsum")], opts=opts), - TextPreChunk( # 179 - [ - Text( - "Lorem ipsum dolor sit amet consectetur adipiscing elit." # 55 - " Mauris nec urna non augue vulputate consequat eget et nisi." # 60 - " Sed orci quam, eleifend sit amet vehicula, elementum ultricies." 
# 64 - ) - ], - opts=opts, - ), - TextPreChunk([Title("Vulputate Consequat")], opts=opts), - ] - - pre_chunk_iter = PreChunkCombiner( - pre_chunks, ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150) - ).iter_combined_pre_chunks() - - pre_chunk = next(pre_chunk_iter) - assert isinstance(pre_chunk, TextPreChunk) - assert pre_chunk._elements == [Title("Lorem Ipsum")] - # -- - pre_chunk = next(pre_chunk_iter) - assert isinstance(pre_chunk, TextPreChunk) - assert pre_chunk._elements == [ - Text( - "Lorem ipsum dolor sit amet consectetur adipiscing elit." - " Mauris nec urna non augue vulputate consequat eget et nisi." - " Sed orci quam, eleifend sit amet vehicula, elementum ultricies." - ) - ] - # -- - pre_chunk = next(pre_chunk_iter) - assert isinstance(pre_chunk, TextPreChunk) - assert pre_chunk._elements == [Title("Vulputate Consequat")] - # -- - with pytest.raises(StopIteration): - next(pre_chunk_iter) - - -class DescribeTextPreChunkAccumulator: - """Unit-test suite for `unstructured.chunking.title.TextPreChunkAccumulator`.""" - - def it_is_empty_on_construction(self): - accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=100)) - - assert accum.text_length == 0 - assert accum.remaining_space == 100 - - def it_accumulates_pre_chunks_added_to_it(self): - opts = ChunkingOptions.new(max_characters=500) - accum = TextPreChunkAccumulator(opts=opts) - - accum.add_pre_chunk( - TextPreChunk( - [ - Title("Lorem Ipsum"), - Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), - ], - opts=opts, - ) - ) - assert accum.text_length == 68 - assert accum.remaining_space == 430 - - accum.add_pre_chunk( - TextPreChunk( - [ - Title("Mauris Nec"), - Text("Mauris nec urna non augue vulputate consequat eget et nisi."), - ], - opts=opts, - ) - ) - assert accum.text_length == 141 - assert accum.remaining_space == 357 - - def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self): - opts = 
ChunkingOptions.new(max_characters=150) - accum = TextPreChunkAccumulator(opts=opts) - accum.add_pre_chunk( - TextPreChunk( - [ - Title("Lorem Ipsum"), - Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), - ], - opts=opts, - ) - ) - accum.add_pre_chunk( - TextPreChunk( - [ - Title("Mauris Nec"), - Text("Mauris nec urna non augue vulputate consequat eget et nisi."), - ], - opts=opts, - ) - ) - accum.add_pre_chunk( - TextPreChunk( - [ - Title("Sed Orci"), - Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."), - ], - opts=opts, - ) - ) - - pre_chunk_iter = accum.flush() - - # -- iterator generates exactly one pre_chunk -- - pre_chunk = next(pre_chunk_iter) - with pytest.raises(StopIteration): - next(pre_chunk_iter) - # -- and it is a _TextPreChunk containing all the elements -- - assert isinstance(pre_chunk, TextPreChunk) - assert pre_chunk._elements == [ - Title("Lorem Ipsum"), - Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), - Title("Mauris Nec"), - Text("Mauris nec urna non augue vulputate consequat eget et nisi."), - Title("Sed Orci"), - Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."), - ] - assert accum.text_length == 0 - assert accum.remaining_space == 150 - - def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self): - accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=150)) - - pre_chunks = list(accum.flush()) - - assert pre_chunks == [] - assert accum.text_length == 0 - assert accum.remaining_space == 150 - - def it_considers_separator_length_when_computing_text_length_and_remaining_space(self): - opts = ChunkingOptions.new(max_characters=100) - accum = TextPreChunkAccumulator(opts=opts) - accum.add_pre_chunk(TextPreChunk([Text("abcde")], opts=opts)) - accum.add_pre_chunk(TextPreChunk([Text("fghij")], opts=opts)) - - # -- .text_length includes a separator ("\n\n", len==2) between each text-segment, - # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 
10 - assert accum.text_length == 12 - # -- .remaining_space is reduced by the length (2) of the trailing separator which would - # -- go between the current text and that of the next pre-chunk if one was added. - # -- So 100 - 12 - 2 = 86 here, not 100 - 12 = 88 - assert accum.remaining_space == 86 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 3ff8b04c20..b056c56d9a 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.11.5-dev1" # pragma: no cover +__version__ = "0.11.5-dev2" # pragma: no cover diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py index a36f437d1b..6e64344a63 100644 --- a/unstructured/chunking/base.py +++ b/unstructured/chunking/base.py @@ -2,12 +2,25 @@ from __future__ import annotations -from typing import Optional +import collections +import copy +from typing import Any, DefaultDict, Dict, Iterable, Iterator, List, Optional, Tuple, cast -from typing_extensions import Self +from typing_extensions import Self, TypeAlias +from unstructured.documents.elements import ( + CompositeElement, + ConsolidationStrategy, + Element, + ElementMetadata, + RegexMetadata, + Table, + TableChunk, +) from unstructured.utils import lazyproperty +PreChunk: TypeAlias = "TablePreChunk | TextPreChunk" + class ChunkingOptions: """Specifies parameters of optional chunking behaviors.""" @@ -150,3 +163,404 @@ def _validate(self) -> None: # loop (I think). 
if self._overlap >= max_characters: raise ValueError(f"'overlap' must be less than max_characters," f" got {self._overlap}") + + +# ================================================================================================ +# PRE-CHUNK SUB-TYPES +# ================================================================================================ + + +class TablePreChunk: + """A pre-chunk composed of a single Table element.""" + + def __init__(self, table: Table, opts: ChunkingOptions) -> None: + self._table = table + self._opts = opts + + def iter_chunks(self) -> Iterator[Table | TableChunk]: + """Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller.""" + text = self._table.text + html = self._table.metadata.text_as_html or "" + maxlen = self._opts.hard_max + + # -- only chunk a table when it's too big to swallow whole -- + if len(text) <= maxlen and len(html) <= maxlen: + yield self._table + return + + is_continuation = False + + while text or html: + # -- split off the next maxchars into the next TableChunk -- + text_chunk, text = text[:maxlen], text[maxlen:] + table_chunk = TableChunk(text=text_chunk, metadata=copy.deepcopy(self._table.metadata)) + + # -- Attach maxchars of the html to the chunk. Note no attempt is made to add only the + # -- HTML elements that *correspond* to the TextChunk.text fragment. + if html: + html_chunk, html = html[:maxlen], html[maxlen:] + table_chunk.metadata.text_as_html = html_chunk + + # -- mark second and later chunks as a continuation -- + if is_continuation: + table_chunk.metadata.is_continuation = True + + yield table_chunk + + is_continuation = True + + +class TextPreChunk: + """A sequence of elements that belong to the same semantic unit within a document. + + The name "section" derives from the idea of a document-section, a heading followed by the + paragraphs "under" that heading. That structure is not found in all documents and actual section + content can vary, but that's the concept. 
+ + This object is purposely immutable. + """ + + def __init__(self, elements: Iterable[Element], opts: ChunkingOptions) -> None: + self._elements = list(elements) + self._opts = opts + + def __eq__(self, other: Any) -> bool: + if not isinstance(other, TextPreChunk): + return False + return self._elements == other._elements + + def combine(self, other_pre_chunk: TextPreChunk) -> TextPreChunk: + """Return new `TextPreChunk` that combines this and `other_pre_chunk`.""" + return TextPreChunk(self._elements + other_pre_chunk._elements, opts=self._opts) + + def iter_chunks(self) -> Iterator[CompositeElement]: + """Split this pre-chunk into one or more `CompositeElement` objects maxlen or smaller.""" + text = self._text + text_len = len(text) + maxlen = self._opts.hard_max + start = 0 + remaining = text_len + + while remaining > 0: + end = min(start + maxlen, text_len) + yield CompositeElement(text=text[start:end], metadata=self._consolidated_metadata) + start = end + remaining = text_len - end + + @lazyproperty + def text_length(self) -> int: + """Length of concatenated text of this pre-chunk, including separators.""" + # -- used by pre-chunk-combiner to identify combination candidates -- + return len(self._text) + + @lazyproperty + def _all_metadata_values(self) -> Dict[str, List[Any]]: + """Collection of all populated metadata values across elements. + + The resulting dict has one key for each `ElementMetadata` field that had a non-None value in + at least one of the elements in this pre-chunk. The value of that key is a list of all those + populated values, in element order, for example: + + { + "filename": ["sample.docx", "sample.docx"], + "languages": [["lat"], ["lat", "eng"]] + ... + } + + This preprocessing step provides the input for a specified consolidation strategy that will + resolve the list of values for each field to a single consolidated value. 
+ """ + + def iter_populated_fields(metadata: ElementMetadata) -> Iterator[Tuple[str, Any]]: + """(field_name, value) pair for each non-None field in single `ElementMetadata`.""" + return ( + (field_name, value) + for field_name, value in metadata.known_fields.items() + if value is not None + ) + + field_values: DefaultDict[str, List[Any]] = collections.defaultdict(list) + + # -- collect all non-None field values in a list for each field, in element-order -- + for e in self._elements: + for field_name, value in iter_populated_fields(e.metadata): + field_values[field_name].append(value) + + return dict(field_values) + + @lazyproperty + def _consolidated_metadata(self) -> ElementMetadata: + """Metadata applicable to this pre-chunk as a single chunk. + + Formed by applying consolidation rules to all metadata fields across the elements of this + pre-chunk. + + For the sake of consistency, the same rules are applied (for example, for dropping values) + to a single-element pre-chunk too, even though metadata for such a pre-chunk is already + "consolidated". + """ + return ElementMetadata(**self._meta_kwargs) + + @lazyproperty + def _consolidated_regex_meta(self) -> Dict[str, List[RegexMetadata]]: + """Consolidate the regex-metadata in `regex_metadata_dicts` into a single dict. + + This consolidated value is suitable for use in the chunk metadata. `start` and `end` + offsets of each regex match are also adjusted for their new positions. 
+ """ + chunk_regex_metadata: Dict[str, List[RegexMetadata]] = {} + separator_len = len(self._opts.text_separator) + running_text_len = 0 + start_offset = 0 + + for element in self._elements: + text_len = len(element.text) + # -- skip empty elements like `PageBreak("")` -- + if not text_len: + continue + # -- account for blank line between "squashed" elements, but not before first element -- + running_text_len += separator_len if running_text_len else 0 + start_offset = running_text_len + running_text_len += text_len + + if not element.metadata.regex_metadata: + continue + + # -- consolidate any `regex_metadata` matches, adjusting the match start/end offsets -- + element_regex_metadata = copy.deepcopy(element.metadata.regex_metadata) + for regex_name, matches in element_regex_metadata.items(): + for m in matches: + m["start"] += start_offset + m["end"] += start_offset + chunk_matches = chunk_regex_metadata.get(regex_name, []) + chunk_matches.extend(matches) + chunk_regex_metadata[regex_name] = chunk_matches + + return chunk_regex_metadata + + @lazyproperty + def _meta_kwargs(self) -> Dict[str, Any]: + """The consolidated metadata values as a dict suitable for constructing ElementMetadata. + + This is where consolidation strategies are actually applied. The output is suitable for use + in constructing an `ElementMetadata` object like `ElementMetadata(**self._meta_kwargs)`. 
+ """ + CS = ConsolidationStrategy + field_consolidation_strategies = ConsolidationStrategy.field_consolidation_strategies() + + def iter_kwarg_pairs() -> Iterator[Tuple[str, Any]]: + """Generate (field-name, value) pairs for each field in consolidated metadata.""" + for field_name, values in self._all_metadata_values.items(): + strategy = field_consolidation_strategies.get(field_name) + if strategy is CS.FIRST: + yield field_name, values[0] + # -- concatenate lists from each element that had one, in order -- + elif strategy is CS.LIST_CONCATENATE: + yield field_name, sum(values, cast(List[Any], [])) + # -- union lists from each element, preserving order of appearance -- + elif strategy is CS.LIST_UNIQUE: + # -- Python 3.7+ maintains dict insertion order -- + ordered_unique_keys = {key: None for val_list in values for key in val_list} + yield field_name, list(ordered_unique_keys.keys()) + elif strategy is CS.REGEX: + yield field_name, self._consolidated_regex_meta + elif strategy is CS.DROP: + continue + else: + # -- not likely to hit this since we have a test in `text_elements.py` that + # -- ensures every ElementMetadata field has an assigned strategy. + raise NotImplementedError( + f"metadata field {repr(field_name)} has no defined consolidation strategy" + ) + + return dict(iter_kwarg_pairs()) + + @lazyproperty + def _text(self) -> str: + """The concatenated text of all elements in this pre-chunk. + + Each element-text is separated from the next by a blank line ("\n\n").
+ """ + text_separator = self._opts.text_separator + return text_separator.join(e.text for e in self._elements if e.text) + + +# ================================================================================================ +# PRE-CHUNKING ACCUMULATORS +# ------------------------------------------------------------------------------------------------ +# Accumulators encapsulate the work of grouping elements and later pre-chunks to form the larger +# pre-chunk and combined-pre-chunk items central to unstructured chunking. +# ================================================================================================ + + +class PreChunkBuilder: + """An element accumulator suitable for incrementally forming a pre-chunk. + + Provides monitoring properties like `.remaining_space` and `.text_length` a pre-chunker can use + to determine whether it should add the next element in the element stream. + + `.flush()` is used to build a PreChunk object from the accumulated elements. This method + returns an iterator that generates zero-or-one `TextPreChunk` or `TablePreChunk` object and is + used like so: + + yield from builder.flush() + + If no elements have been accumulated, no `PreChunk` instance is generated. Flushing the builder + clears the elements it contains so it is ready to build the next pre-chunk. + """ + + def __init__(self, opts: ChunkingOptions) -> None: + self._opts = opts + self._separator_len = len(opts.text_separator) + self._elements: List[Element] = [] + + # -- only includes non-empty element text, e.g. 
PageBreak.text=="" is not included -- + self._text_segments: List[str] = [] + # -- combined length of text-segments, not including separators -- + self._text_len: int = 0 + + def add_element(self, element: Element) -> None: + """Add `element` to this section.""" + self._elements.append(element) + if element.text: + self._text_segments.append(element.text) + self._text_len += len(element.text) + + def flush(self) -> Iterator[TextPreChunk]: + """Generate zero-or-one `PreChunk` object and clear the accumulator. + + Suitable for use to emit a PreChunk when the maximum size has been reached or a semantic + boundary has been reached. Also to clear out a terminal pre-chunk at the end of an element + stream. + """ + if not self._elements: + return + # -- clear builder before yield so we're not sensitive to the timing of how/when this + # -- iterator is exhausted and can add elements for the next pre-chunk immediately. + elements = self._elements[:] + self._elements.clear() + self._text_segments.clear() + self._text_len = 0 + yield TextPreChunk(elements, self._opts) + + @property + def remaining_space(self) -> int: + """Maximum text-length of an element that can be added without exceeding maxlen.""" + # -- include length of trailing separator that will go before next element text -- + separators_len = self._separator_len * len(self._text_segments) + return self._opts.hard_max - self._text_len - separators_len + + @property + def text_length(self) -> int: + """Length of the text in this pre-chunk. + + This value represents the chunk-size that would result if this pre-chunk was flushed in its + current state. In particular, it does not include the length of a trailing separator (since + that would only appear if an additional element was added). + + Not suitable for judging remaining space, use `.remaining_space` for that value. + """ + # -- number of text separators present in joined text of elements.
This includes only + # -- separators *between* text segments, not one at the end. Note there are zero separators + # -- for both 0 and 1 text-segments. + n = len(self._text_segments) + separator_count = n - 1 if n else 0 + return self._text_len + (separator_count * self._separator_len) + + +class PreChunkCombiner: + """Filters pre-chunk stream to combine small pre-chunks where possible.""" + + def __init__(self, pre_chunks: Iterable[PreChunk], opts: ChunkingOptions): + self._pre_chunks = pre_chunks + self._opts = opts + + def iter_combined_pre_chunks(self) -> Iterator[PreChunk]: + """Generate pre-chunk objects, combining TextPreChunk objects when they'll fit in window.""" + accum = TextPreChunkAccumulator(self._opts) + combine_text_under_n_chars = self._opts.combine_text_under_n_chars + + for pre_chunk in self._pre_chunks: + # -- start new pre-chunk under these conditions -- + if ( + # -- a table pre-chunk is never combined -- + isinstance(pre_chunk, TablePreChunk) + # -- don't add another pre-chunk once length has reached combination soft-max -- + or accum.text_length >= combine_text_under_n_chars + # -- combining would exceed hard-max -- + or accum.remaining_space < pre_chunk.text_length + ): + yield from accum.flush() + + # -- a table pre-chunk is never combined so don't accumulate -- + if isinstance(pre_chunk, TablePreChunk): + yield pre_chunk + else: + accum.add_pre_chunk(pre_chunk) + + yield from accum.flush() + + +class TextPreChunkAccumulator: + """Accumulates, measures, and combines pre-chunk objects. + + Provides monitoring properties `.remaining_space` and `.text_length` suitable for deciding + whether to add another pre-chunk. + + `.flush()` is used to combine the accumulated pre-chunks into a single `TextPreChunk` object. + This method returns an iterator that generates zero-or-one `TextPreChunk` objects and is used + like so: + + yield from accum.flush() + + If no pre-chunks have been accumulated, no `TextPreChunk` is generated.
Flushing the builder + clears the pre-chunks it contains so it is ready to accept the next text-pre-chunk. + """ + + def __init__(self, opts: ChunkingOptions) -> None: + self._opts = opts + self._pre_chunks: List[TextPreChunk] = [] + + def add_pre_chunk(self, pre_chunk: TextPreChunk) -> None: + """Add a pre-chunk to the accumulator for possible combination with next pre-chunk.""" + self._pre_chunks.append(pre_chunk) + + def flush(self) -> Iterator[TextPreChunk]: + """Generate all accumulated pre-chunks as a single combined pre-chunk.""" + pre_chunks = self._pre_chunks + + # -- nothing to do if no pre-chunks have been accumulated -- + if not pre_chunks: + return + + # -- otherwise combine all accumulated pre-chunk into one -- + pre_chunk = pre_chunks[0] + for other_pre_chunk in pre_chunks[1:]: + pre_chunk = pre_chunk.combine(other_pre_chunk) + yield pre_chunk + + # -- and reset the accumulator (to empty) -- + pre_chunks.clear() + + @property + def remaining_space(self) -> int: + """Maximum size of pre-chunk that can be added without exceeding maxlen.""" + maxlen = self._opts.hard_max + return ( + maxlen + if not self._pre_chunks + # -- an additional pre-chunk will also incur an additional separator -- + else maxlen - self.text_length - len(self._opts.text_separator) + ) + + @property + def text_length(self) -> int: + """Size of concatenated text in all pre-chunks in accumulator.""" + n = len(self._pre_chunks) + + if n == 0: + return 0 + + total_text_length = sum(s.text_length for s in self._pre_chunks) + total_separator_length = len(self._opts.text_separator) * (n - 1) + return total_text_length + total_separator_length diff --git a/unstructured/chunking/title.py b/unstructured/chunking/title.py index 93301e963c..ebc1a92069 100644 --- a/unstructured/chunking/title.py +++ b/unstructured/chunking/title.py @@ -5,26 +5,20 @@ from __future__ import annotations -import collections -import copy -from typing import Any, DefaultDict, Dict, Iterable, Iterator, List, Optional, 
Tuple, cast - -from typing_extensions import TypeAlias - -from unstructured.chunking.base import ChunkingOptions +from typing import Iterator, List, Optional + +from unstructured.chunking.base import ( + ChunkingOptions, + PreChunk, + PreChunkBuilder, + PreChunkCombiner, + TablePreChunk, +) from unstructured.documents.elements import ( - CompositeElement, - ConsolidationStrategy, Element, - ElementMetadata, - RegexMetadata, Table, - TableChunk, Title, ) -from unstructured.utils import lazyproperty - -PreChunk: TypeAlias = "TablePreChunk | TextPreChunk" def chunk_by_title( @@ -78,7 +72,7 @@ def chunk_by_title( def _split_elements_by_title_and_table( elements: List[Element], opts: ChunkingOptions -) -> Iterator[TextPreChunk | TablePreChunk]: +) -> Iterator[PreChunk]: """Implements "pre-chunker" responsibilities. A _section_ can be thought of as a "pre-chunk", generally determining the size and contents of a @@ -102,7 +96,7 @@ def _split_elements_by_title_and_table( A Table or Checkbox element is placed into a pre-chunk by itself. 
""" - pre_chunk_builder = TextPreChunkBuilder(opts) + pre_chunk_builder = PreChunkBuilder(opts) prior_element = None @@ -156,396 +150,3 @@ def _metadata_differs( if ignore_page_numbers: return False return metadata1.page_number != metadata2.page_number - - -# == PreChunks =================================================================================== - - -class TablePreChunk: - """A pre-chunk composed of a single Table element.""" - - def __init__(self, table: Table, opts: ChunkingOptions) -> None: - self._table = table - self._opts = opts - - def iter_chunks(self) -> Iterator[Table | TableChunk]: - """Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller.""" - text = self._table.text - html = self._table.metadata.text_as_html or "" - maxlen = self._opts.hard_max - - # -- only chunk a table when it's too big to swallow whole -- - if len(text) <= maxlen and len(html) <= maxlen: - yield self._table - return - - is_continuation = False - - while text or html: - # -- split off the next maxchars into the next TableChunk -- - text_chunk, text = text[:maxlen], text[maxlen:] - table_chunk = TableChunk(text=text_chunk, metadata=copy.deepcopy(self._table.metadata)) - - # -- Attach maxchars of the html to the chunk. Note no attempt is made to add only the - # -- HTML elements that *correspond* to the TextChunk.text fragment. - if html: - html_chunk, html = html[:maxlen], html[maxlen:] - table_chunk.metadata.text_as_html = html_chunk - - # -- mark second and later chunks as a continuation -- - if is_continuation: - table_chunk.metadata.is_continuation = True - - yield table_chunk - - is_continuation = True - - -class TextPreChunk: - """A sequence of elements that belong to the same semantic unit within a document. - - The name "section" derives from the idea of a document-section, a heading followed by the - paragraphs "under" that heading. That structure is not found in all documents and actual section - content can vary, but that's the concept. 
- - This object is purposely immutable. - """ - - def __init__(self, elements: Iterable[Element], opts: ChunkingOptions) -> None: - self._elements = list(elements) - self._opts = opts - - def __eq__(self, other: Any) -> bool: - if not isinstance(other, TextPreChunk): - return False - return self._elements == other._elements - - def combine(self, other_pre_chunk: TextPreChunk) -> TextPreChunk: - """Return new `TextPreChunk` that combines this and `other_pre_chunk`.""" - return TextPreChunk(self._elements + other_pre_chunk._elements, opts=self._opts) - - def iter_chunks(self) -> Iterator[CompositeElement]: - """Split this pre-chunk into one or more `CompositeElement` objects maxlen or smaller.""" - text = self._text - text_len = len(text) - maxlen = self._opts.hard_max - start = 0 - remaining = text_len - - while remaining > 0: - end = min(start + maxlen, text_len) - yield CompositeElement(text=text[start:end], metadata=self._consolidated_metadata) - start = end - remaining = text_len - end - - @lazyproperty - def text_length(self) -> int: - """Length of concatenated text of this pre-chunk, including separators.""" - # -- used by pre-chunk-combiner to identify combination candidates -- - return len(self._text) - - @lazyproperty - def _all_metadata_values(self) -> Dict[str, List[Any]]: - """Collection of all populated metadata values across elements. - - The resulting dict has one key for each `ElementMetadata` field that had a non-None value in - at least one of the elements in this pre-chunk. The value of that key is a list of all those - populated values, in element order, for example: - - { - "filename": ["sample.docx", "sample.docx"], - "languages": [["lat"], ["lat", "eng"]] - ... - } - - This preprocessing step provides the input for a specified consolidation strategy that will - resolve the list of values for each field to a single consolidated value. 
- """ - - def iter_populated_fields(metadata: ElementMetadata) -> Iterator[Tuple[str, Any]]: - """(field_name, value) pair for each non-None field in single `ElementMetadata`.""" - return ( - (field_name, value) - for field_name, value in metadata.known_fields.items() - if value is not None - ) - - field_values: DefaultDict[str, List[Any]] = collections.defaultdict(list) - - # -- collect all non-None field values in a list for each field, in element-order -- - for e in self._elements: - for field_name, value in iter_populated_fields(e.metadata): - field_values[field_name].append(value) - - return dict(field_values) - - @lazyproperty - def _consolidated_metadata(self) -> ElementMetadata: - """Metadata applicable to this pre-chunk as a single chunk. - - Formed by applying consolidation rules to all metadata fields across the elements of this - pre-chunk. - - For the sake of consistency, the same rules are applied (for example, for dropping values) - to a single-element pre-chunk too, even though metadata for such a pre-chunk is already - "consolidated". - """ - return ElementMetadata(**self._meta_kwargs) - - @lazyproperty - def _consolidated_regex_meta(self) -> Dict[str, List[RegexMetadata]]: - """Consolidate the regex-metadata in `regex_metadata_dicts` into a single dict. - - This consolidated value is suitable for use in the chunk metadata. `start` and `end` - offsets of each regex match are also adjusted for their new positions. 
- """ - chunk_regex_metadata: Dict[str, List[RegexMetadata]] = {} - separator_len = len(self._opts.text_separator) - running_text_len = 0 - start_offset = 0 - - for element in self._elements: - text_len = len(element.text) - # -- skip empty elements like `PageBreak("")` -- - if not text_len: - continue - # -- account for blank line between "squashed" elements, but not before first element -- - running_text_len += separator_len if running_text_len else 0 - start_offset = running_text_len - running_text_len += text_len - - if not element.metadata.regex_metadata: - continue - - # -- consolidate any `regex_metadata` matches, adjusting the match start/end offsets -- - element_regex_metadata = copy.deepcopy(element.metadata.regex_metadata) - for regex_name, matches in element_regex_metadata.items(): - for m in matches: - m["start"] += start_offset - m["end"] += start_offset - chunk_matches = chunk_regex_metadata.get(regex_name, []) - chunk_matches.extend(matches) - chunk_regex_metadata[regex_name] = chunk_matches - - return chunk_regex_metadata - - @lazyproperty - def _meta_kwargs(self) -> Dict[str, Any]: - """The consolidated metadata values as a dict suitable for constructing ElementMetadata. - - This is where consolidation strategies are actually applied. The output is suitable for use - in constructing an `ElementMetadata` object like `ElementMetadata(**self._meta_kwargs)`. 
- """ - CS = ConsolidationStrategy - field_consolidation_strategies = ConsolidationStrategy.field_consolidation_strategies() - - def iter_kwarg_pairs() -> Iterator[Tuple[str, Any]]: - """Generate (field-name, value) pairs for each field in consolidated metadata.""" - for field_name, values in self._all_metadata_values.items(): - strategy = field_consolidation_strategies.get(field_name) - if strategy is CS.FIRST: - yield field_name, values[0] - # -- concatenate lists from each element that had one, in order -- - elif strategy is CS.LIST_CONCATENATE: - yield field_name, sum(values, cast(List[Any], [])) - # -- union lists from each element, preserving order of appearance -- - elif strategy is CS.LIST_UNIQUE: - # -- Python 3.7+ maintains dict insertion order -- - ordered_unique_keys = {key: None for val_list in values for key in val_list} - yield field_name, list(ordered_unique_keys.keys()) - elif strategy is CS.REGEX: - yield field_name, self._consolidated_regex_meta - elif strategy is CS.DROP: - continue - else: - # -- not likely to hit this since we have a test in `text_elements.py` that - # -- ensures every ElementMetadata fields has an assigned strategy. - raise NotImplementedError( - f"metadata field {repr(field_name)} has no defined consolidation strategy" - ) - - return dict(iter_kwarg_pairs()) - - @lazyproperty - def _text(self) -> str: - """The concatenated text of all elements in this pre-chunk. - - Each element-text is separated from the next by a blank line ("\n\n"). - """ - text_separator = self._opts.text_separator - return text_separator.join(e.text for e in self._elements if e.text) - - -class TextPreChunkBuilder: - """An element accumulator suitable for incrementally forming a pre-chunk. - - Provides monitoring properties like `.remaining_space` and `.text_length` a pre-chunker can use - to determine whether it should add the next element in the element stream. - - `.flush()` is used to build a `TextPreChunk` object from the accumulated elements. 
This method - returns an interator that generates zero-or-one `TextPreChunk` object and is used like so: - - yield from builder.flush() - - If no elements have been accumulated, no `TextPreChunk` is generated. Flushing the builder - clears the elements it contains so it is ready to build the next text-pre-chunk. - """ - - def __init__(self, opts: ChunkingOptions) -> None: - self._opts = opts - self._separator_len = len(opts.text_separator) - self._elements: List[Element] = [] - - # -- only includes non-empty element text, e.g. PageBreak.text=="" is not included -- - self._text_segments: List[str] = [] - # -- combined length of text-segments, not including separators -- - self._text_len: int = 0 - - def add_element(self, element: Element) -> None: - """Add `element` to this section.""" - self._elements.append(element) - if element.text: - self._text_segments.append(element.text) - self._text_len += len(element.text) - - def flush(self) -> Iterator[TextPreChunk]: - """Generate zero-or-one `PreChunk` object and clear the accumulator. - - Suitable for use to emit a PreChunk when the maximum size has been reached or a semantic - boundary has been reached. Also to clear out a terminal pre-chunk at the end of an element - stream. - """ - if not self._elements: - return - # -- clear builder before yield so we're not sensitive to the timing of how/when this - # -- iterator is exhausted and can add eleemnts for the next pre-chunk immediately. 
- elements = self._elements[:] - self._elements.clear() - self._text_segments.clear() - self._text_len = 0 - yield TextPreChunk(elements, self._opts) - - @property - def remaining_space(self) -> int: - """Maximum text-length of an element that can be added without exceeding maxlen.""" - # -- include length of trailing separator that will go before next element text -- - separators_len = self._separator_len * len(self._text_segments) - return self._opts.hard_max - self._text_len - separators_len - - @property - def text_length(self) -> int: - """Length of the text in this pre-chunk. - - This value represents the chunk-size that would result if this pre-chunk was flushed in its - current state. In particular, it does not include the length of a trailing separator (since - that would only appear if an additional element was added). - - Not suitable for judging remaining space, use `.remaining_space` for that value. - """ - # -- number of text separators present in joined text of elements. This includes only - # -- separators *between* text segments, not one at the end. Note there are zero separators - # -- for both 0 and 1 text-segments. 
- n = len(self._text_segments) - separator_count = n - 1 if n else 0 - return self._text_len + (separator_count * self._separator_len) - - -# == PreChunkCombiner ============================================================================ - - -class PreChunkCombiner: - """Filters pre-chunk stream to combine small pre-chunks where possible.""" - - def __init__(self, pre_chunks: Iterable[PreChunk], opts: ChunkingOptions): - self._pre_chunks = pre_chunks - self._opts = opts - - def iter_combined_pre_chunks(self) -> Iterator[PreChunk]: - """Generate pre-chunk objects, combining TextPreChunk objects when they'll fit in window.""" - accum = TextPreChunkAccumulator(self._opts) - combine_text_under_n_chars = self._opts.combine_text_under_n_chars - - for pre_chunk in self._pre_chunks: - # -- start new pre-chunk under these conditions -- - if ( - # -- a table pre-chunk is never combined -- - isinstance(pre_chunk, TablePreChunk) - # -- don't add another pre-chunk once length has reached combination soft-max -- - or accum.text_length >= combine_text_under_n_chars - # -- combining would exceed hard-max -- - or accum.remaining_space < pre_chunk.text_length - ): - yield from accum.flush() - - # -- a table pre-chunk is never combined so don't accumulate -- - if isinstance(pre_chunk, TablePreChunk): - yield pre_chunk - else: - accum.add_pre_chunk(pre_chunk) - - yield from accum.flush() - - -class TextPreChunkAccumulator: - """Accumulates, measures, and combines pre-chunk objects. - - Provides monitoring properties `.remaining_space` and `.text_length` suitable for deciding - whether to add another pre-chunk. - - `.flush()` is used to combine the accumulated pre-chunks into a single `TextPreChunk` object. - This method returns an interator that generates zero-or-one `TextPreChunk` objects and is used - like so: - - yield from accum.flush() - - If no pre-chunks have been accumulated, no `TextPreChunk` is generated. 
Flushing the builder - clears the pre-chunks it contains so it is ready to accept the next text-pre-chunk. - """ - - def __init__(self, opts: ChunkingOptions) -> None: - self._opts = opts - self._pre_chunks: List[TextPreChunk] = [] - - def add_pre_chunk(self, pre_chunk: TextPreChunk) -> None: - """Add a pre-chunk to the accumulator for possible combination with next pre-chunk.""" - self._pre_chunks.append(pre_chunk) - - def flush(self) -> Iterator[TextPreChunk]: - """Generate all accumulated pre-chunks as a single combined pre-chunk.""" - pre_chunks = self._pre_chunks - - # -- nothing to do if no pre-chunks have been accumulated -- - if not pre_chunks: - return - - # -- otherwise combine all accumulated pre-chunk into one -- - pre_chunk = pre_chunks[0] - for other_pre_chunk in pre_chunks[1:]: - pre_chunk = pre_chunk.combine(other_pre_chunk) - yield pre_chunk - - # -- and reset the accumulator (to empty) -- - pre_chunks.clear() - - @property - def remaining_space(self) -> int: - """Maximum size of pre-chunk that can be added without exceeding maxlen.""" - maxlen = self._opts.hard_max - return ( - maxlen - if not self._pre_chunks - # -- an additional pre-chunk will also incur an additional separator -- - else maxlen - self.text_length - len(self._opts.text_separator) - ) - - @property - def text_length(self) -> int: - """Size of concatenated text in all pre-chunks in accumulator.""" - n = len(self._pre_chunks) - - if n == 0: - return 0 - - total_text_length = sum(s.text_length for s in self._pre_chunks) - total_separator_length = len(self._opts.text_separator) * (n - 1) - return total_text_length + total_separator_length