diff --git a/CHANGELOG.md b/CHANGELOG.md index 736a70186a..4ea8a8187b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +## 0.11.5-dev0 + +### Enhancements + +### Features + +### Fixes + ## 0.11.4 ### Enhancements diff --git a/test_unstructured/chunking/__init__.py b/test_unstructured/chunking/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py new file mode 100644 index 0000000000..d2d4132eab --- /dev/null +++ b/test_unstructured/chunking/test_base.py @@ -0,0 +1,113 @@ +"""Unit-test suite for the `unstructured.chunking.base` module.""" + +from __future__ import annotations + +import pytest + +from unstructured.chunking.base import ChunkingOptions + + +class DescribeChunkingOptions: + """Unit-test suite for `unstructured.chunking.base.ChunkingOptions` objects.""" + + @pytest.mark.parametrize("max_characters", [0, -1, -42]) + def it_rejects_max_characters_not_greater_than_zero(self, max_characters: int): + with pytest.raises( + ValueError, + match=f"'max_characters' argument must be > 0, got {max_characters}", + ): + ChunkingOptions.new(max_characters=max_characters) + + def it_does_not_complain_when_specifying_max_characters_by_itself(self): + """Caller can specify `max_characters` arg without specifying any others. + + In particular, when `combine_text_under_n_chars` is not specified it defaults to the value + of `max_characters`; it has no fixed default value that can be greater than `max_characters` + and trigger an exception. 
+ """ + try: + ChunkingOptions.new(max_characters=50) + except ValueError: + pytest.fail("did not accept `max_characters` as option by itself") + + @pytest.mark.parametrize("n_chars", [-1, -42]) + def it_rejects_combine_text_under_n_chars_for_n_less_than_zero(self, n_chars: int): + with pytest.raises( + ValueError, + match=f"'combine_text_under_n_chars' argument must be >= 0, got {n_chars}", + ): + ChunkingOptions.new(combine_text_under_n_chars=n_chars) + + def it_accepts_0_for_combine_text_under_n_chars_to_disable_chunk_combining(self): + """Specifying `combine_text_under_n_chars=0` is how a caller disables chunk-combining.""" + opts = ChunkingOptions.new(combine_text_under_n_chars=0) + assert opts.combine_text_under_n_chars == 0 + + def it_does_not_complain_when_specifying_combine_text_under_n_chars_by_itself(self): + """Caller can specify `combine_text_under_n_chars` arg without specifying other options.""" + try: + opts = ChunkingOptions.new(combine_text_under_n_chars=50) + except ValueError: + pytest.fail("did not accept `combine_text_under_n_chars` as option by itself") + + assert opts.combine_text_under_n_chars == 50 + + def it_silently_accepts_combine_text_under_n_chars_greater_than_maxchars(self): + """`combine_text_under_n_chars` > `max_characters` doesn't affect chunking behavior. + + So rather than raising an exception or warning, we just cap that value at `max_characters` + which is the behavioral equivalent. 
+ """ + try: + opts = ChunkingOptions.new(max_characters=500, combine_text_under_n_chars=600) + except ValueError: + pytest.fail("did not accept `combine_text_under_n_chars` greater than `max_characters`") + + assert opts.combine_text_under_n_chars == 500 + + @pytest.mark.parametrize("n_chars", [-1, -42]) + def it_rejects_new_after_n_chars_for_n_less_than_zero(self, n_chars: int): + with pytest.raises( + ValueError, + match=f"'new_after_n_chars' argument must be >= 0, got {n_chars}", + ): + ChunkingOptions.new(new_after_n_chars=n_chars) + + def it_does_not_complain_when_specifying_new_after_n_chars_by_itself(self): + """Caller can specify `new_after_n_chars` arg without specifying any other options. + + In particular, `combine_text_under_n_chars` value is adjusted down to the + `new_after_n_chars` value when the default for `combine_text_under_n_chars` exceeds the + value of `new_after_n_chars`. + """ + try: + opts = ChunkingOptions.new(new_after_n_chars=200) + except ValueError: + pytest.fail("did not accept `new_after_n_chars` as option by itself") + + assert opts.soft_max == 200 + assert opts.combine_text_under_n_chars == 200 + + def it_accepts_0_for_new_after_n_chars_to_put_each_element_into_its_own_chunk(self): + """Specifying `new_after_n_chars=0` places each element into its own pre-chunk. + + This puts each element into its own chunk, although long chunks are still split. + """ + opts = ChunkingOptions.new(new_after_n_chars=0) + assert opts.soft_max == 0 + + def it_silently_accepts_new_after_n_chars_greater_than_maxchars(self): + """`new_after_n_chars` > `max_characters` doesn't affect chunking behavior. + + So rather than raising an exception or warning, we just cap that value at `max_characters` + which is the behavioral equivalent. 
+ """ + try: + opts = ChunkingOptions.new(max_characters=444, new_after_n_chars=555) + except ValueError: + pytest.fail("did not accept `new_after_n_chars` greater than `max_characters`") + + assert opts.soft_max == 444 + + def it_knows_the_text_separator_string(self): + assert ChunkingOptions.new().text_separator == "\n\n" diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py index 709057c581..df12220b6e 100644 --- a/test_unstructured/chunking/test_title.py +++ b/test_unstructured/chunking/test_title.py @@ -4,6 +4,7 @@ import pytest +from unstructured.chunking.base import ChunkingOptions from unstructured.chunking.title import ( PreChunkCombiner, TablePreChunk, @@ -30,141 +31,6 @@ ) from unstructured.partition.html import partition_html -# == chunk_by_title() validation behaviors ======================================================= - - -@pytest.mark.parametrize("max_characters", [0, -1, -42]) -def test_it_rejects_max_characters_not_greater_than_zero(max_characters: int): - elements: List[Element] = [Text("Lorem ipsum dolor.")] - - with pytest.raises( - ValueError, - match=f"'max_characters' argument must be > 0, got {max_characters}", - ): - chunk_by_title(elements, max_characters=max_characters) - - -def test_it_does_not_complain_when_specifying_max_characters_by_itself(): - """Caller can specify `max_characters` arg without specifying any others. - - In particular, When `combine_text_under_n_chars` is not specified it defaults to the value of - `max_characters`; it has no fixed default value that can be greater than `max_characters` and - trigger an exception. 
- """ - elements: List[Element] = [Text("Lorem ipsum dolor.")] - - try: - chunk_by_title(elements, max_characters=50) - except ValueError: - pytest.fail("did not accept `max_characters` as option by itself") - - -@pytest.mark.parametrize("n_chars", [-1, -42]) -def test_it_rejects_combine_text_under_n_chars_for_n_less_than_zero(n_chars: int): - elements: List[Element] = [Text("Lorem ipsum dolor.")] - - with pytest.raises( - ValueError, - match=f"'combine_text_under_n_chars' argument must be >= 0, got {n_chars}", - ): - chunk_by_title(elements, combine_text_under_n_chars=n_chars) - - -def test_it_accepts_0_for_combine_text_under_n_chars_to_disable_chunk_combining(): - """Specifying `combine_text_under_n_chars=0` is how a caller disables chunk-combining.""" - elements: List[Element] = [Text("Lorem ipsum dolor.")] - - chunks = chunk_by_title(elements, max_characters=50, combine_text_under_n_chars=0) - - assert chunks == [CompositeElement("Lorem ipsum dolor.")] - - -def test_it_does_not_complain_when_specifying_combine_text_under_n_chars_by_itself(): - """Caller can specify `combine_text_under_n_chars` arg without specifying any other options.""" - elements: List[Element] = [Text("Lorem ipsum dolor.")] - - try: - chunk_by_title(elements, combine_text_under_n_chars=50) - except ValueError: - pytest.fail("did not accept `combine_text_under_n_chars` as option by itself") - - -def test_it_silently_accepts_combine_text_under_n_chars_greater_than_maxchars(): - """`combine_text_under_n_chars` > `max_characters` doesn't affect chunking behavior. - - So rather than raising an exception or warning, we just cap that value at `max_characters` which - is the behavioral equivalent. 
- """ - elements: List[Element] = [Text("Lorem ipsum dolor.")] - - try: - chunk_by_title(elements, max_characters=500, combine_text_under_n_chars=600) - except ValueError: - pytest.fail("did not accept `new_after_n_chars` greater than `max_characters`") - - -@pytest.mark.parametrize("n_chars", [-1, -42]) -def test_it_rejects_new_after_n_chars_for_n_less_than_zero(n_chars: int): - elements: List[Element] = [Text("Lorem ipsum dolor.")] - - with pytest.raises( - ValueError, - match=f"'new_after_n_chars' argument must be >= 0, got {n_chars}", - ): - chunk_by_title(elements, new_after_n_chars=n_chars) - - -def test_it_does_not_complain_when_specifying_new_after_n_chars_by_itself(): - """Caller can specify `new_after_n_chars` arg without specifying any other options. - - In particular, `combine_text_under_n_chars` value is adjusted down to the `new_after_n_chars` - value when the default for `combine_text_under_n_chars` exceeds the value of - `new_after_n_chars`. - """ - elements: List[Element] = [Text("Lorem ipsum dolor.")] - - try: - chunk_by_title(elements, new_after_n_chars=50) - except ValueError: - pytest.fail("did not accept `new_after_n_chars` as option by itself") - - -def test_it_accepts_0_for_new_after_n_chars_to_put_each_element_into_its_own_chunk(): - """Specifying `new_after_n_chars=0` places each element into its own pre-chunk. - - This puts each element into its own chunk, although long chunks are still split. - """ - elements: List[Element] = [ - Text("Lorem"), - Text("ipsum"), - Text("dolor"), - ] - - chunks = chunk_by_title(elements, max_characters=50, new_after_n_chars=0) - - assert chunks == [ - CompositeElement("Lorem"), - CompositeElement("ipsum"), - CompositeElement("dolor"), - ] - - -def test_it_silently_accepts_new_after_n_chars_greater_than_maxchars(): - """`new_after_n_chars` > `max_characters` doesn't affect chunking behavior. 
- - So rather than raising an exception or warning, we just cap that value at `max_characters` which - is the behavioral equivalent. - """ - elements: List[Element] = [Text("Lorem ipsum dolor.")] - - try: - chunk_by_title(elements, max_characters=500, new_after_n_chars=600) - except ValueError: - pytest.fail("did not accept `new_after_n_chars` greater than `max_characters`") - - -# ================================================================================================ - def test_it_splits_a_large_element_into_multiple_chunks(): elements: List[Element] = [ @@ -199,12 +65,7 @@ def test_split_elements_by_title_and_table(): CheckBox(), ] - pre_chunks = _split_elements_by_title_and_table( - elements, - multipage_sections=True, - new_after_n_chars=500, - max_characters=500, - ) + pre_chunks = _split_elements_by_title_and_table(elements, opts=ChunkingOptions.new()) pre_chunk = next(pre_chunks) assert isinstance(pre_chunk, TextPreChunk) @@ -712,10 +573,11 @@ def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self): ) text_table = "Header Col 1 Header Col 2\n" "Lorem ipsum adipiscing" pre_chunk = TablePreChunk( - Table(text_table, metadata=ElementMetadata(text_as_html=html_table)) + Table(text_table, metadata=ElementMetadata(text_as_html=html_table)), + opts=ChunkingOptions.new(max_characters=175), ) - chunk_iter = pre_chunk.iter_chunks(maxlen=175) + chunk_iter = pre_chunk.iter_chunks() chunk = next(chunk_iter) assert isinstance(chunk, Table) @@ -757,10 +619,11 @@ def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_win "Vivamus quis nunc ipsum donec ac fermentum" ) pre_chunk = TablePreChunk( - Table(text_table, metadata=ElementMetadata(text_as_html=html_table)) + Table(text_table, metadata=ElementMetadata(text_as_html=html_table)), + opts=ChunkingOptions.new(max_characters=100), ) - chunk_iter = pre_chunk.iter_chunks(maxlen=100) + chunk_iter = pre_chunk.iter_chunks() chunk = next(chunk_iter) assert isinstance(chunk, 
TableChunk) @@ -818,17 +681,20 @@ def it_can_combine_itself_with_another_TextPreChunk_instance(self): Note that neither the original or other pre_chunk are mutated. """ + opts = ChunkingOptions.new() pre_chunk = TextPreChunk( [ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), Text("In rhoncus ipsum sed lectus porta volutpat."), - ] + ], + opts=opts, ) other_pre_chunk = TextPreChunk( [ Text("Donec semper facilisis metus finibus malesuada."), Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."), - ] + ], + opts=opts, ) new_pre_chunk = pre_chunk.combine(other_pre_chunk) @@ -839,19 +705,22 @@ def it_can_combine_itself_with_another_TextPreChunk_instance(self): Text("In rhoncus ipsum sed lectus porta volutpat."), Text("Donec semper facilisis metus finibus malesuada."), Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."), - ] + ], + opts=opts, ) assert pre_chunk == TextPreChunk( [ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), Text("In rhoncus ipsum sed lectus porta volutpat."), - ] + ], + opts=opts, ) assert other_pre_chunk == TextPreChunk( [ Text("Donec semper facilisis metus finibus malesuada."), Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."), - ] + ], + opts=opts, ) def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self): @@ -862,10 +731,11 @@ def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed" "lectus porta volutpat.", ), - ] + ], + opts=ChunkingOptions.new(max_characters=200), ) - chunk_iter = pre_chunk.iter_chunks(maxlen=200) + chunk_iter = pre_chunk.iter_chunks() chunk = next(chunk_iter) assert chunk == CompositeElement( @@ -885,10 +755,11 @@ def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(se " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea" " commodo consequat." 
), - ] + ], + opts=ChunkingOptions.new(max_characters=200), ) - chunk_iter = pre_chunk.iter_chunks(maxlen=200) + chunk_iter = pre_chunk.iter_chunks() chunk = next(chunk_iter) assert chunk == CompositeElement( @@ -907,7 +778,9 @@ def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(se def it_knows_the_length_of_the_combined_text_of_its_elements_which_is_the_chunk_size(self): """.text_length is the size of chunk this pre-chunk will produce (before any splitting).""" - pre_chunk = TextPreChunk([PageBreak(""), Text("foo"), Text("bar")]) + pre_chunk = TextPreChunk( + [PageBreak(""), Text("foo"), Text("bar")], opts=ChunkingOptions.new() + ) assert pre_chunk.text_length == 8 def it_extracts_all_populated_metadata_values_from_the_elements_to_help(self): @@ -931,7 +804,8 @@ def it_extracts_all_populated_metadata_values_from_the_elements_to_help(self): languages=["lat", "eng"], ), ), - ] + ], + opts=ChunkingOptions.new(), ) assert pre_chunk._all_metadata_values == { @@ -967,7 +841,8 @@ def but_it_discards_ad_hoc_metadata_fields_during_consolidation(self): [ Title("Lorem Ipsum", metadata=metadata), Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2), - ] + ], + opts=ChunkingOptions.new(), ) # -- ad-hoc fields "coefficient" and "quotient" do not appear -- @@ -1008,7 +883,8 @@ def it_consolidates_regex_metadata_in_a_field_specific_way(self): regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]}, ), ), - ] + ], + opts=ChunkingOptions.new(), ) regex_metadata = pre_chunk._consolidated_regex_meta @@ -1062,7 +938,8 @@ def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strate }, ), ), - ] + ], + opts=ChunkingOptions.new(), ) meta_kwargs = pre_chunk._meta_kwargs @@ -1098,7 +975,7 @@ def it_knows_the_concatenated_text_of_the_pre_chunk( The text-segment contributed by each element is separated from the next by a blank line ("\n\n"). 
An element that contributes no text does not give rise to a separator. """ - pre_chunk = TextPreChunk(elements) + pre_chunk = TextPreChunk(elements, opts=ChunkingOptions.new()) assert pre_chunk._text == expected_value @@ -1106,13 +983,13 @@ class DescribeTextPreChunkBuilder: """Unit-test suite for `unstructured.chunking.title.TextPreChunkBuilder`.""" def it_is_empty_on_construction(self): - builder = TextPreChunkBuilder(maxlen=50) + builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=50)) assert builder.text_length == 0 assert builder.remaining_space == 50 def it_accumulates_elements_added_to_it(self): - builder = TextPreChunkBuilder(maxlen=150) + builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=150)) builder.add_element(Title("Introduction")) assert builder.text_length == 12 @@ -1128,7 +1005,7 @@ def it_accumulates_elements_added_to_it(self): assert builder.remaining_space == 36 def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self): - builder = TextPreChunkBuilder(maxlen=150) + builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=150)) builder.add_element(Title("Introduction")) builder.add_element( Text( @@ -1151,7 +1028,7 @@ def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self): assert builder.remaining_space == 150 def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self): - builder = TextPreChunkBuilder(maxlen=150) + builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=150)) pre_chunks = list(builder.flush()) @@ -1160,7 +1037,7 @@ def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self): assert builder.remaining_space == 150 def it_considers_separator_length_when_computing_text_length_and_remaining_space(self): - builder = TextPreChunkBuilder(maxlen=50) + builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=50)) builder.add_element(Text("abcde")) builder.add_element(Text("fghij")) @@ -1180,30 +1057,32 
@@ class DescribePreChunkCombiner: """Unit-test suite for `unstructured.chunking.title.PreChunkCombiner`.""" def it_combines_sequential_small_text_pre_chunks(self): + opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250) pre_chunks = [ TextPreChunk( [ Title("Lorem Ipsum"), # 11 Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55 - ] + ], + opts=opts, ), TextPreChunk( [ Title("Mauris Nec"), # 10 Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59 - ] + ], + opts=opts, ), TextPreChunk( [ Title("Sed Orci"), # 8 Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63 - ] + ], + opts=opts, ), ] - pre_chunk_iter = PreChunkCombiner( - pre_chunks, maxlen=250, combine_text_under_n_chars=250 - ).iter_combined_pre_chunks() + pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks() pre_chunk = next(pre_chunk_iter) assert isinstance(pre_chunk, TextPreChunk) @@ -1219,24 +1098,27 @@ def it_combines_sequential_small_text_pre_chunks(self): next(pre_chunk_iter) def but_it_does_not_combine_table_pre_chunks(self): + opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250) pre_chunks = [ TextPreChunk( [ Title("Lorem Ipsum"), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), - ] + ], + opts=opts, ), - TablePreChunk(Table("Heading\nCell text")), + TablePreChunk(Table("Heading\nCell text"), opts=opts), TextPreChunk( [ Title("Mauris Nec"), Text("Mauris nec urna non augue vulputate consequat eget et nisi."), - ] + ], + opts=opts, ), ] pre_chunk_iter = PreChunkCombiner( - pre_chunks, maxlen=250, combine_text_under_n_chars=250 + pre_chunks, ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250) ).iter_combined_pre_chunks() pre_chunk = next(pre_chunk_iter) @@ -1261,31 +1143,33 @@ def but_it_does_not_combine_table_pre_chunks(self): next(pre_chunk_iter) def it_respects_the_specified_combination_threshold(self): + opts = 
ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=80) pre_chunks = [ TextPreChunk( # 68 [ Title("Lorem Ipsum"), # 11 Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55 - ] + ], + opts=opts, ), TextPreChunk( # 71 [ Title("Mauris Nec"), # 10 Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59 - ] + ], + opts=opts, ), # -- len == 139 TextPreChunk( [ Title("Sed Orci"), # 8 Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63 - ] + ], + opts=opts, ), ] - pre_chunk_iter = PreChunkCombiner( - pre_chunks, maxlen=250, combine_text_under_n_chars=80 - ).iter_combined_pre_chunks() + pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks() pre_chunk = next(pre_chunk_iter) assert isinstance(pre_chunk, TextPreChunk) @@ -1307,32 +1191,34 @@ def it_respects_the_specified_combination_threshold(self): next(pre_chunk_iter) def it_respects_the_hard_maximum_window_length(self): + opts = ChunkingOptions.new(max_characters=200, combine_text_under_n_chars=200) pre_chunks = [ TextPreChunk( # 68 [ Title("Lorem Ipsum"), # 11 Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55 - ] + ], + opts=opts, ), TextPreChunk( # 71 [ Title("Mauris Nec"), # 10 Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59 - ] + ], + opts=opts, ), # -- len == 139 TextPreChunk( [ Title("Sed Orci"), # 8 Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63 - ] + ], + opts=opts, ), # -- len == 214 ] - pre_chunk_iter = PreChunkCombiner( - pre_chunks, maxlen=200, combine_text_under_n_chars=200 - ).iter_combined_pre_chunks() + pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks() pre_chunk = next(pre_chunk_iter) assert isinstance(pre_chunk, TextPreChunk) @@ -1355,9 +1241,9 @@ def it_respects_the_hard_maximum_window_length(self): def it_accommodates_and_isolates_an_oversized_pre_chunk(self): """Such as occurs when a single 
element exceeds the window size.""" - + opts = ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150) pre_chunks = [ - TextPreChunk([Title("Lorem Ipsum")]), + TextPreChunk([Title("Lorem Ipsum")], opts=opts), TextPreChunk( # 179 [ Text( @@ -1365,13 +1251,14 @@ def it_accommodates_and_isolates_an_oversized_pre_chunk(self): " Mauris nec urna non augue vulputate consequat eget et nisi." # 60 " Sed orci quam, eleifend sit amet vehicula, elementum ultricies." # 64 ) - ] + ], + opts=opts, ), - TextPreChunk([Title("Vulputate Consequat")]), + TextPreChunk([Title("Vulputate Consequat")], opts=opts), ] pre_chunk_iter = PreChunkCombiner( - pre_chunks, maxlen=150, combine_text_under_n_chars=150 + pre_chunks, ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150) ).iter_combined_pre_chunks() pre_chunk = next(pre_chunk_iter) @@ -1400,20 +1287,22 @@ class DescribeTextPreChunkAccumulator: """Unit-test suite for `unstructured.chunking.title.TextPreChunkAccumulator`.""" def it_is_empty_on_construction(self): - accum = TextPreChunkAccumulator(maxlen=100) + accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=100)) assert accum.text_length == 0 assert accum.remaining_space == 100 def it_accumulates_pre_chunks_added_to_it(self): - accum = TextPreChunkAccumulator(maxlen=500) + opts = ChunkingOptions.new(max_characters=500) + accum = TextPreChunkAccumulator(opts=opts) accum.add_pre_chunk( TextPreChunk( [ Title("Lorem Ipsum"), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), - ] + ], + opts=opts, ) ) assert accum.text_length == 68 @@ -1424,20 +1313,23 @@ def it_accumulates_pre_chunks_added_to_it(self): [ Title("Mauris Nec"), Text("Mauris nec urna non augue vulputate consequat eget et nisi."), - ] + ], + opts=opts, ) ) assert accum.text_length == 141 assert accum.remaining_space == 357 def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self): - accum = TextPreChunkAccumulator(maxlen=150) + opts = 
ChunkingOptions.new(max_characters=150) + accum = TextPreChunkAccumulator(opts=opts) accum.add_pre_chunk( TextPreChunk( [ Title("Lorem Ipsum"), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), - ] + ], + opts=opts, ) ) accum.add_pre_chunk( @@ -1445,7 +1337,8 @@ def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self): [ Title("Mauris Nec"), Text("Mauris nec urna non augue vulputate consequat eget et nisi."), - ] + ], + opts=opts, ) ) accum.add_pre_chunk( @@ -1453,7 +1346,8 @@ def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self): [ Title("Sed Orci"), Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."), - ] + ], + opts=opts, ) ) @@ -1477,7 +1371,7 @@ def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self): assert accum.remaining_space == 150 def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self): - accum = TextPreChunkAccumulator(maxlen=150) + accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=150)) pre_chunks = list(accum.flush()) @@ -1486,9 +1380,10 @@ def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self): assert accum.remaining_space == 150 def it_considers_separator_length_when_computing_text_length_and_remaining_space(self): - accum = TextPreChunkAccumulator(maxlen=100) - accum.add_pre_chunk(TextPreChunk([Text("abcde")])) - accum.add_pre_chunk(TextPreChunk([Text("fghij")])) + opts = ChunkingOptions.new(max_characters=100) + accum = TextPreChunkAccumulator(opts=opts) + accum.add_pre_chunk(TextPreChunk([Text("abcde")], opts=opts)) + accum.add_pre_chunk(TextPreChunk([Text("fghij")], opts=opts)) # -- .text_length includes a separator ("\n\n", len==2) between each text-segment, # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 58c11be740..5c4cdf0b1e 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ 
-__version__ = "0.11.4" # pragma: no cover +__version__ = "0.11.5-dev0" # pragma: no cover diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py new file mode 100644 index 0000000000..a36f437d1b --- /dev/null +++ b/unstructured/chunking/base.py @@ -0,0 +1,152 @@ +"""Chunking objects not specific to a particular chunking strategy.""" + +from __future__ import annotations + +from typing import Optional + +from typing_extensions import Self + +from unstructured.utils import lazyproperty + + +class ChunkingOptions: + """Specifies parameters of optional chunking behaviors.""" + + def __init__( + self, + combine_text_under_n_chars: Optional[int] = None, + max_characters: int = 500, + multipage_sections: bool = True, + new_after_n_chars: Optional[int] = None, + overlap: int = 0, + ): + self._combine_text_under_n_chars_arg = combine_text_under_n_chars + self._max_characters = max_characters + self._multipage_sections = multipage_sections + self._new_after_n_chars_arg = new_after_n_chars + self._overlap = overlap + + @classmethod + def new( + cls, + combine_text_under_n_chars: Optional[int] = None, + max_characters: int = 500, + multipage_sections: bool = True, + new_after_n_chars: Optional[int] = None, + overlap: int = 0, + ) -> Self: + """Construct validated instance. + + Raises `ValueError` on invalid arguments like `overlap` >= `max_characters`. + """ + self = cls( + combine_text_under_n_chars, + max_characters, + multipage_sections, + new_after_n_chars, + overlap, + ) + self._validate() + return self + + @lazyproperty + def combine_text_under_n_chars(self) -> int: + """Combine consecutive text pre-chunks if former is smaller than this and both will fit. + + - Does not combine table chunks with text chunks even if they would both fit in the + chunking window. + - Does not combine text chunks if together they would exceed the chunking window. + - Defaults to `max_characters` when not specified. + - Is reduced to `new_after_n_chars` when it exceeds that value. 
+ """ + max_characters = self._max_characters + soft_max = self.soft_max + arg = self._combine_text_under_n_chars_arg + + # -- `combine_text_under_n_chars` defaults to `max_characters` when not specified and is + # -- capped at max-chars + combine_text_under_n_chars = max_characters if arg is None or arg > max_characters else arg + + # -- `new_after_n_chars` takes precedence on conflict with `combine_text_under_n_chars` -- + return soft_max if combine_text_under_n_chars > soft_max else combine_text_under_n_chars + + @lazyproperty + def hard_max(self) -> int: + """The maximum size for a chunk. + + A pre-chunk will only exceed this size when it contains exactly one element which by itself + exceeds this size. Such a pre-chunk is subject to mid-text splitting later in the chunking + process. + """ + return self._max_characters + + @lazyproperty + def multipage_sections(self) -> bool: + """When False, break pre-chunks on page-boundaries.""" + return self._multipage_sections + + @lazyproperty + def overlap(self) -> int: + """The number of characters to overlap text when splitting chunks mid-text. + + The actual overlap will not exceed this number of characters but may be less as required to + respect splitting-character boundaries. + """ + return self._overlap + + @lazyproperty + def soft_max(self) -> int: + """A pre-chunk of this size or greater is considered full. + + A value of 0 is valid; it produces the behavior: "put each element into its own + chunk". + """ + max_chars = self._max_characters + new_after_n_chars = self._new_after_n_chars_arg + return ( + max_chars + if (new_after_n_chars is None or new_after_n_chars < 0 or new_after_n_chars > max_chars) + else new_after_n_chars + ) + + @lazyproperty + def text_separator(self) -> str: + """The string to insert between elements when concatenating their text for a chunk. 
+ + Right now this is just "\n\n" (a blank line in plain text), but having this here rather + than as a module-level constant provides a way for us to easily make it user-configurable + in future if we want to. + """ + return "\n\n" + + def _validate(self) -> None: + """Raise ValueError if requested option-set is invalid.""" + max_characters = self._max_characters + # -- chunking window must have positive length -- + if max_characters <= 0: + raise ValueError(f"'max_characters' argument must be > 0," f" got {max_characters}") + + # -- `combine_text_under_n_chars == 0` is valid (suppresses chunk combination) + # -- but a negative value is not + combine_text_under_n_chars = self._combine_text_under_n_chars_arg + if combine_text_under_n_chars is not None and combine_text_under_n_chars < 0: + raise ValueError( + f"'combine_text_under_n_chars' argument must be >= 0," + f" got {combine_text_under_n_chars}" + ) + + # -- a negative value for `new_after_n_chars` is assumed to + # -- be a mistake the caller will want to know about + new_after_n_chars = self._new_after_n_chars_arg + if new_after_n_chars is not None and new_after_n_chars < 0: + raise ValueError( + f"'new_after_n_chars' argument must be >= 0," f" got {new_after_n_chars}" + ) + + # -- overlap must be less than max-chars or the chunk text will + # -- never be consumed + # TODO: consider a heuristic like never overlap more than half, + # otherwise there could be corner cases leading to an infinite + # loop (I think). 
+ if self._overlap >= max_characters: + raise ValueError(f"'overlap' must be less than max_characters," f" got {self._overlap}") diff --git a/unstructured/chunking/title.py b/unstructured/chunking/title.py index cf40620c4a..93301e963c 100644 --- a/unstructured/chunking/title.py +++ b/unstructured/chunking/title.py @@ -11,6 +11,7 @@ from typing_extensions import TypeAlias +from unstructured.chunking.base import ChunkingOptions from unstructured.documents.elements import ( CompositeElement, ConsolidationStrategy, @@ -25,9 +26,6 @@ PreChunk: TypeAlias = "TablePreChunk | TextPreChunk" -# -- goes between text of each element when element-text is concatenated to form chunk -- -TEXT_SEPARATOR = "\n\n" - def chunk_by_title( elements: List[Element], @@ -64,57 +62,22 @@ def chunk_by_title( Chunks elements text and text_as_html (if present) into chunks of length n characters (hard max) """ - - # -- validation and arg pre-processing --------------------------- - - # -- chunking window must have positive length -- - if max_characters <= 0: - raise ValueError(f"'max_characters' argument must be > 0, got {max_characters}") - - # -- `combine_text_under_n_chars` defaults to `max_characters` when not specified and is - # -- capped at max-chars - if combine_text_under_n_chars is None or combine_text_under_n_chars > max_characters: - combine_text_under_n_chars = max_characters - - # -- `combine_text_under_n_chars == 0` is valid (suppresses chunk combination) - # -- but a negative value is not - if combine_text_under_n_chars < 0: - raise ValueError( - f"'combine_text_under_n_chars' argument must be >= 0, got {combine_text_under_n_chars}", - ) - - # -- same with `new_after_n_chars` -- - if new_after_n_chars is None or new_after_n_chars > max_characters: - new_after_n_chars = max_characters - - if new_after_n_chars < 0: - raise ValueError(f"'new_after_n_chars' argument must be >= 0, got {new_after_n_chars}") - - # -- `new_after_n_chars` takes precendence on conflict with 
`combine_text_under_n_chars` -- - if combine_text_under_n_chars > new_after_n_chars: - combine_text_under_n_chars = new_after_n_chars - - # ---------------------------------------------------------------- + opts = ChunkingOptions.new( + combine_text_under_n_chars=combine_text_under_n_chars, + max_characters=max_characters, + multipage_sections=multipage_sections, + new_after_n_chars=new_after_n_chars, + ) pre_chunks = PreChunkCombiner( - _split_elements_by_title_and_table( - elements, - multipage_sections=multipage_sections, - new_after_n_chars=new_after_n_chars, - max_characters=max_characters, - ), - max_characters, - combine_text_under_n_chars, + _split_elements_by_title_and_table(elements, opts), opts=opts ).iter_combined_pre_chunks() - return [chunk for pre_chunk in pre_chunks for chunk in pre_chunk.iter_chunks(max_characters)] + return [chunk for pre_chunk in pre_chunks for chunk in pre_chunk.iter_chunks()] def _split_elements_by_title_and_table( - elements: List[Element], - multipage_sections: bool, - new_after_n_chars: int, - max_characters: int, + elements: List[Element], opts: ChunkingOptions ) -> Iterator[TextPreChunk | TablePreChunk]: """Implements "pre-chunker" responsibilities. @@ -139,13 +102,13 @@ def _split_elements_by_title_and_table( A Table or Checkbox element is placed into a pre-chunk by itself. 
""" - pre_chunk_builder = TextPreChunkBuilder(max_characters) + pre_chunk_builder = TextPreChunkBuilder(opts) prior_element = None for element in elements: metadata_differs = ( - _metadata_differs(element, prior_element, ignore_page_numbers=multipage_sections) + _metadata_differs(element, prior_element, ignore_page_numbers=opts.multipage_sections) if prior_element else False ) @@ -157,7 +120,7 @@ def _split_elements_by_title_and_table( # -- adding this element would exceed hard-maxlen for pre_chunk -- or pre_chunk_builder.remaining_space < len(str(element)) # -- pre_chunk already meets or exceeds soft-maxlen -- - or pre_chunk_builder.text_length >= new_after_n_chars + or pre_chunk_builder.text_length >= opts.soft_max # -- a semantic boundary is indicated by metadata change since prior element -- or metadata_differs ): @@ -166,7 +129,7 @@ def _split_elements_by_title_and_table( # -- emit table and checkbox immediately since they are always isolated -- if isinstance(element, Table): - yield TablePreChunk(table=element) + yield TablePreChunk(table=element, opts=opts) # -- but accumulate text elements for consolidation into a composite chunk -- else: pre_chunk_builder.add_element(element) @@ -201,13 +164,15 @@ def _metadata_differs( class TablePreChunk: """A pre-chunk composed of a single Table element.""" - def __init__(self, table: Table) -> None: + def __init__(self, table: Table, opts: ChunkingOptions) -> None: self._table = table + self._opts = opts - def iter_chunks(self, maxlen: int) -> Iterator[Table | TableChunk]: + def iter_chunks(self) -> Iterator[Table | TableChunk]: """Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller.""" text = self._table.text html = self._table.metadata.text_as_html or "" + maxlen = self._opts.hard_max # -- only chunk a table when it's too big to swallow whole -- if len(text) <= maxlen and len(html) <= maxlen: @@ -246,8 +211,9 @@ class TextPreChunk: This object is purposely immutable. 
""" - def __init__(self, elements: Iterable[Element]) -> None: + def __init__(self, elements: Iterable[Element], opts: ChunkingOptions) -> None: self._elements = list(elements) + self._opts = opts def __eq__(self, other: Any) -> bool: if not isinstance(other, TextPreChunk): @@ -256,12 +222,13 @@ def __eq__(self, other: Any) -> bool: def combine(self, other_pre_chunk: TextPreChunk) -> TextPreChunk: """Return new `TextPreChunk` that combines this and `other_pre_chunk`.""" - return TextPreChunk(self._elements + other_pre_chunk._elements) + return TextPreChunk(self._elements + other_pre_chunk._elements, opts=self._opts) - def iter_chunks(self, maxlen: int) -> Iterator[CompositeElement]: + def iter_chunks(self) -> Iterator[CompositeElement]: """Split this pre-chunk into one or more `CompositeElement` objects maxlen or smaller.""" text = self._text text_len = len(text) + maxlen = self._opts.hard_max start = 0 remaining = text_len @@ -333,6 +300,7 @@ def _consolidated_regex_meta(self) -> Dict[str, List[RegexMetadata]]: offsets of each regex match are also adjusted for their new positions. """ chunk_regex_metadata: Dict[str, List[RegexMetadata]] = {} + separator_len = len(self._opts.text_separator) running_text_len = 0 start_offset = 0 @@ -342,7 +310,7 @@ def _consolidated_regex_meta(self) -> Dict[str, List[RegexMetadata]]: if not text_len: continue # -- account for blank line between "squashed" elements, but not before first element -- - running_text_len += len(TEXT_SEPARATOR) if running_text_len else 0 + running_text_len += separator_len if running_text_len else 0 start_offset = running_text_len running_text_len += text_len @@ -404,7 +372,8 @@ def _text(self) -> str: Each element-text is separated from the next by a blank line ("\n\n"). 
""" - return TEXT_SEPARATOR.join(e.text for e in self._elements if e.text) + text_separator = self._opts.text_separator + return text_separator.join(e.text for e in self._elements if e.text) class TextPreChunkBuilder: @@ -422,14 +391,11 @@ class TextPreChunkBuilder: clears the elements it contains so it is ready to build the next text-pre-chunk. """ - def __init__(self, maxlen: int) -> None: - self._maxlen = maxlen - self._separator_len = len(TEXT_SEPARATOR) + def __init__(self, opts: ChunkingOptions) -> None: + self._opts = opts + self._separator_len = len(opts.text_separator) self._elements: List[Element] = [] - # -- these mutable working values probably represent premature optimization but improve - # -- performance and I expect will be welcome when processing a million elements - # -- only includes non-empty element text, e.g. PageBreak.text=="" is not included -- self._text_segments: List[str] = [] # -- combined length of text-segments, not including separators -- @@ -457,14 +423,14 @@ def flush(self) -> Iterator[TextPreChunk]: self._elements.clear() self._text_segments.clear() self._text_len = 0 - yield TextPreChunk(elements) + yield TextPreChunk(elements, self._opts) @property def remaining_space(self) -> int: """Maximum text-length of an element that can be added without exceeding maxlen.""" # -- include length of trailing separator that will go before next element text -- separators_len = self._separator_len * len(self._text_segments) - return self._maxlen - self._text_len - separators_len + return self._opts.hard_max - self._text_len - separators_len @property def text_length(self) -> int: @@ -490,19 +456,14 @@ def text_length(self) -> int: class PreChunkCombiner: """Filters pre-chunk stream to combine small pre-chunks where possible.""" - def __init__( - self, - pre_chunks: Iterable[PreChunk], - maxlen: int, - combine_text_under_n_chars: int, - ): + def __init__(self, pre_chunks: Iterable[PreChunk], opts: ChunkingOptions): self._pre_chunks = pre_chunks - 
self._maxlen = maxlen - self._combine_text_under_n_chars = combine_text_under_n_chars + self._opts = opts def iter_combined_pre_chunks(self) -> Iterator[PreChunk]: """Generate pre-chunk objects, combining TextPreChunk objects when they'll fit in window.""" - accum = TextPreChunkAccumulator(self._maxlen) + accum = TextPreChunkAccumulator(self._opts) + combine_text_under_n_chars = self._opts.combine_text_under_n_chars for pre_chunk in self._pre_chunks: # -- start new pre-chunk under these conditions -- @@ -510,7 +471,7 @@ def iter_combined_pre_chunks(self) -> Iterator[PreChunk]: # -- a table pre-chunk is never combined -- isinstance(pre_chunk, TablePreChunk) # -- don't add another pre-chunk once length has reached combination soft-max -- - or accum.text_length >= self._combine_text_under_n_chars + or accum.text_length >= combine_text_under_n_chars # -- combining would exceed hard-max -- or accum.remaining_space < pre_chunk.text_length ): @@ -541,8 +502,8 @@ class TextPreChunkAccumulator: clears the pre-chunks it contains so it is ready to accept the next text-pre-chunk. 
""" - def __init__(self, maxlen: int) -> None: - self._maxlen = maxlen + def __init__(self, opts: ChunkingOptions) -> None: + self._opts = opts self._pre_chunks: List[TextPreChunk] = [] def add_pre_chunk(self, pre_chunk: TextPreChunk) -> None: @@ -569,19 +530,22 @@ def flush(self) -> Iterator[TextPreChunk]: @property def remaining_space(self) -> int: """Maximum size of pre-chunk that can be added without exceeding maxlen.""" + maxlen = self._opts.hard_max return ( - self._maxlen + maxlen if not self._pre_chunks # -- an additional pre-chunk will also incur an additional separator -- - else self._maxlen - self.text_length - len(TEXT_SEPARATOR) + else maxlen - self.text_length - len(self._opts.text_separator) ) @property def text_length(self) -> int: """Size of concatenated text in all pre-chunks in accumulator.""" n = len(self._pre_chunks) - return ( - 0 - if n == 0 - else sum(s.text_length for s in self._pre_chunks) + len(TEXT_SEPARATOR) * (n - 1) - ) + + if n == 0: + return 0 + + total_text_length = sum(s.text_length for s in self._pre_chunks) + total_separator_length = len(self._opts.text_separator) * (n - 1) + return total_text_length + total_separator_length