diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py index 8ee63062d8..7951d38f1d 100644 --- a/test_unstructured/chunking/test_base.py +++ b/test_unstructured/chunking/test_base.py @@ -16,6 +16,7 @@ PreChunkCombiner, PreChunker, _CellAccumulator, + _Chunker, _HtmlTableSplitter, _PreChunkAccumulator, _RowAccumulator, @@ -498,6 +499,52 @@ def it_can_combine_itself_with_another_PreChunk_instance(self): opts=opts, ) + @pytest.mark.parametrize( + ("text", "expected_value"), + [ + # -- normally it splits exactly on overlap size |------- 20 -------| + ("In rhoncus ipsum sed lectus porta volutpat.", "ctus porta volutpat."), + # -- but it strips leading and trailing whitespace when the tail includes it -- + ("In rhoncus ipsum sed lect us portas volutpat. ", "us portas volutpat."), + ], + ) + def it_computes_its_overlap_tail_for_use_in_inter_pre_chunk_overlap( + self, text: str, expected_value: str + ): + pre_chunk = PreChunk( + [Text(text)], overlap_prefix="", opts=ChunkingOptions(overlap=20, overlap_all=True) + ) + assert pre_chunk.overlap_tail == expected_value + + @pytest.mark.parametrize( + ("elements", "overlap_prefix", "expected_value"), + [ + ([Text("foo"), Text("bar")], "bah da bing.", "bah da bing.\n\nfoo\n\nbar"), + ([Text("foo"), PageBreak(""), Text("bar")], "da bang.", "da bang.\n\nfoo\n\nbar"), + ([PageBreak(""), Text("foo")], "bah da boom.", "bah da boom.\n\nfoo"), + ([Text("foo"), Text("bar"), PageBreak("")], "", "foo\n\nbar"), + ], + ) + def it_knows_the_concatenated_text_of_the_pre_chunk_to_help( + self, elements: list[Text], overlap_prefix: str, expected_value: str + ): + """._text is the "joined" text of the pre-chunk elements. + + The text-segment contributed by each element is separated from the next by a blank line + ("\n\n"). An element that contributes no text does not give rise to a separator. + """ + pre_chunk = PreChunk(elements, overlap_prefix=overlap_prefix, opts=ChunkingOptions()) + assert pre_chunk._text == expected_value + + +# ================================================================================================ +# CHUNKING HELPER/SPLITTERS +# ================================================================================================ + + +class Describe_Chunker: + """Unit-test suite for `unstructured.chunking.base._Chunker` objects.""" + def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self): elements = [ Title("Introduction"), @@ -507,16 +554,23 @@ def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window ), ] opts = ChunkingOptions(max_characters=200, include_orig_elements=True) - pre_chunk = PreChunk(elements, overlap_prefix="e feugiat efficitur.", opts=opts) + chunker = _Chunker( + elements, + text=( + "e feugiat efficitur.\n\nIntroduction\n\nLorem ipsum dolor sit amet consectetur" + " adipiscing elit. In rhoncus ipsum sed lectus porta volutpat." + ), + opts=opts, + ) - chunk_iter = pre_chunk.iter_chunks() + chunk_iter = chunker._iter_chunks() chunk = next(chunk_iter) assert chunk == CompositeElement( "e feugiat efficitur.\n\nIntroduction\n\nLorem ipsum dolor sit amet consectetur" " adipiscing elit. 
In rhoncus ipsum sed lectus porta volutpat.", ) - assert chunk.metadata is pre_chunk._consolidated_metadata + assert chunk.metadata is chunker._consolidated_metadata assert chunk.metadata.orig_elements == elements # -- with pytest.raises(StopIteration): @@ -524,19 +578,17 @@ def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self): # -- Chunk-splitting only occurs when a *single* element is too big to fit in the window. - # -- The pre-chunker will isolate that element in a pre_chunk of its own. - elements = [ - Text( - "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod" - " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim" - " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea" - " commodo consequat." - ) - ] + # -- The pre-chunker will automatically isolate that element in a pre_chunk of its own. + text = ( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor" + " incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud" + " exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat." + ) + elements = [Text(text)] opts = ChunkingOptions(max_characters=200, include_orig_elements=True) - pre_chunk = PreChunk(elements, overlap_prefix="", opts=opts) + chunker = _Chunker(elements, text=text, opts=opts) - chunk_iter = pre_chunk.iter_chunks() + chunk_iter = chunker._iter_chunks() # -- Note that .metadata.orig_elements is the same single original element, "repeated" for # -- each text-split chunk. This behavior emerges without explicit command as a consequence @@ -548,93 +600,70 @@ def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(se " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim" " veniam, quis nostrud exercitation ullamco laboris nisi ut" ) - assert chunk.metadata is pre_chunk._consolidated_metadata + assert chunk.metadata is chunker._consolidated_metadata assert chunk.metadata.orig_elements == elements # -- chunk = next(chunk_iter) assert chunk == CompositeElement("aliquip ex ea commodo consequat.") - assert chunk.metadata is pre_chunk._continuation_metadata + assert chunk.metadata is chunker._continuation_metadata assert chunk.metadata.orig_elements == elements # -- with pytest.raises(StopIteration): next(chunk_iter) def and_it_adds_the_is_continuation_flag_for_second_and_later_split_chunks(self): + # -- |--------------------- 48 ---------------------| + text = "'Lorem ipsum dolor' means 'Thank you very much'." 
metadata = ElementMetadata( category_depth=0, filename="foo.docx", languages=["lat"], parent_id="f87731e0", ) + elements = [Text(text, metadata=metadata)] - pre_chunk = PreChunk( - # -- |--------------------- 48 ---------------------| - [Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata)], - overlap_prefix="", - opts=ChunkingOptions(max_characters=20), - ) - - chunk_iter = pre_chunk.iter_chunks() + chunk_iter = _Chunker.iter_chunks(elements, text, opts=ChunkingOptions(max_characters=20)) assert [c.metadata.is_continuation for c in chunk_iter] == [None, True, True] def but_it_generates_no_chunks_when_the_pre_chunk_contains_no_text(self): metadata = ElementMetadata() - pre_chunk = PreChunk( + + chunk_iter = _Chunker.iter_chunks( [PageBreak(" ", metadata=metadata)], - overlap_prefix="", + text="", opts=ChunkingOptions(), ) - chunk_iter = pre_chunk.iter_chunks() - with pytest.raises(StopIteration): next(chunk_iter) - @pytest.mark.parametrize( - ("text", "expected_value"), - [ - # -- normally it splits exactly on overlap size |------- 20 -------| - ("In rhoncus ipsum sed lectus porta volutpat.", "ctus porta volutpat."), - # -- but it strips leading and trailing whitespace when the tail includes it -- - ("In rhoncus ipsum sed lect us portas volutpat. ", "us portas volutpat."), - ], - ) - def it_computes_its_overlap_tail_for_use_in_inter_pre_chunk_overlap( - self, text: str, expected_value: str - ): - pre_chunk = PreChunk( - [Text(text)], overlap_prefix="", opts=ChunkingOptions(overlap=20, overlap_all=True) - ) - assert pre_chunk.overlap_tail == expected_value - def it_extracts_all_populated_metadata_values_from_the_elements_to_help(self): - pre_chunk = PreChunk( - [ - Title( - "Lorem Ipsum", - metadata=ElementMetadata( - category_depth=0, - filename="foo.docx", - languages=["lat"], - parent_id="f87731e0", - ), + elements = [ + Title( + "Lorem Ipsum", + metadata=ElementMetadata( + category_depth=0, + filename="foo.docx", + languages=["lat"], + parent_id="f87731e0", ), - Text( - "'Lorem ipsum dolor' means 'Thank you very much' in Latin.", - metadata=ElementMetadata( - category_depth=1, - filename="foo.docx", - image_path="sprite.png", - languages=["lat", "eng"], - ), + ), + Text( + "'Lorem ipsum dolor' means 'Thank you very much' in Latin.", + metadata=ElementMetadata( + category_depth=1, + filename="foo.docx", + image_path="sprite.png", + languages=["lat", "eng"], ), - ], - overlap_prefix="", - opts=ChunkingOptions(), - ) + ), + ] + text = "Lorem Ipsum\n\n'Lorem ipsum dolor' means 'Thank you very much' in Latin." + + chunker = _Chunker(elements, text=text, opts=ChunkingOptions()) - assert pre_chunk._all_metadata_values == { + assert chunker._all_metadata_values == { # -- scalar values are accumulated in a list in element order -- "category_depth": [0, 1], # -- all values are accumulated, not only unique ones -- @@ -662,18 +691,16 @@ def but_it_discards_ad_hoc_metadata_fields_during_consolidation(self): languages=["lat", "eng"], ) metadata_2.quotient = 1.74 + elements = [ + Title("Lorem Ipsum", metadata=metadata), + Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2), + ] + text = "Lorem Ipsum\n\n'Lorem ipsum dolor' means 'Thank you very much' in Latin." 
- pre_chunk = PreChunk( - [ - Title("Lorem Ipsum", metadata=metadata), - Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2), - ], - overlap_prefix="", - opts=ChunkingOptions(), - ) + chunker = _Chunker(elements, text=text, opts=ChunkingOptions()) # -- ad-hoc fields "coefficient" and "quotient" do not appear -- - assert pre_chunk._all_metadata_values == { + assert chunker._all_metadata_values == { "category_depth": [0, 1], "filename": ["foo.docx", "foo.docx"], "image_path": ["sprite.png"], @@ -686,9 +713,11 @@ def and_it_adds_the_pre_chunk_elements_to_metadata_when_so_instructed(self): metadata = ElementMetadata(filename="foo.pdf") element = Title("Lorem Ipsum", metadata=metadata) element_2 = Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata) - pre_chunk = PreChunk([element, element_2], overlap_prefix="", opts=opts) + elements = [element, element_2] + text = "Lorem Ipsum\n\n'Lorem ipsum dolor' means 'Thank you very much' in Latin." + chunker = _Chunker(elements, text=text, opts=opts) - consolidated_metadata = pre_chunk._consolidated_metadata + consolidated_metadata = chunker._consolidated_metadata # -- pre-chunk elements are included as metadata -- orig_elements = consolidated_metadata.orig_elements @@ -704,40 +733,38 @@ def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strate Only non-None fields should appear in the dict and each field value should be the consolidation of the values across the pre_chunk elements. """ - pre_chunk = PreChunk( - [ - PageBreak(""), - Title( - "Lorem Ipsum", - metadata=ElementMetadata( - filename="foo.docx", - # -- category_depth has DROP strategy so doesn't appear in result -- - category_depth=0, - emphasized_text_contents=["Lorem", "Ipsum"], - emphasized_text_tags=["b", "i"], - languages=["lat"], - ), + elements = [ + PageBreak(""), + Title( + "Lorem Ipsum", + metadata=ElementMetadata( + filename="foo.docx", + # -- category_depth has DROP strategy so doesn't appear in result -- + category_depth=0, + emphasized_text_contents=["Lorem", "Ipsum"], + emphasized_text_tags=["b", "i"], + languages=["lat"], ), - Text( - "'Lorem ipsum dolor' means 'Thank you very much' in Latin.", - metadata=ElementMetadata( - # -- filename change doesn't happen IRL but demonstrates FIRST strategy -- - filename="bar.docx", - # -- emphasized_text_contents has LIST_CONCATENATE strategy, so "Lorem" - # -- appears twice in consolidated-meta (as it should) and length matches - # -- that of emphasized_text_tags both before and after consolidation. - emphasized_text_contents=["Lorem", "ipsum"], - emphasized_text_tags=["i", "b"], - # -- languages has LIST_UNIQUE strategy, so "lat(in)" appears only once -- - languages=["eng", "lat"], - ), + ), + Text( + "'Lorem ipsum dolor' means 'Thank you very much' in Latin.", + metadata=ElementMetadata( + # -- filename change doesn't happen IRL but demonstrates FIRST strategy -- + filename="bar.docx", + # -- emphasized_text_contents has LIST_CONCATENATE strategy, so "Lorem" + # -- appears twice in consolidated-meta (as it should) and length matches + # -- that of emphasized_text_tags both before and after consolidation. + emphasized_text_contents=["Lorem", "ipsum"], + emphasized_text_tags=["i", "b"], + # -- languages has LIST_UNIQUE strategy, so "lat(in)" appears only once -- + languages=["eng", "lat"], ), - ], - overlap_prefix="", - opts=ChunkingOptions(), - ) + ), + ] + text = "Lorem Ipsum\n\n'Lorem ipsum dolor' means 'Thank you very much' in Latin." 
+ chunker = _Chunker(elements, text=text, opts=ChunkingOptions()) - meta_kwargs = pre_chunk._meta_kwargs + meta_kwargs = chunker._meta_kwargs assert meta_kwargs == { "filename": "foo.docx", @@ -747,19 +774,21 @@ def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strate } def it_computes_the_original_elements_list_to_help(self): + opts = ChunkingOptions(include_orig_elements=True) element = Title("Introduction") element_2 = Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") element_3 = CompositeElement( "In rhoncus ipsum sed lectus porta volutpat.", metadata=ElementMetadata(orig_elements=[Text("Porta volupat.")]), ) - pre_chunk = PreChunk( - [element, element_2, element_3], - overlap_prefix="", - opts=ChunkingOptions(include_orig_elements=True), + elements = [element, element_2, element_3] + text = ( + "Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn" + " rhoncus ipsum sed lectus porta volutpat." ) + chunker = _Chunker(elements, text=text, opts=opts) - orig_elements = pre_chunk._orig_elements + orig_elements = chunker._orig_elements # -- all elements of pre-chunk are included -- assert orig_elements == [element, element_2, element_3] @@ -770,32 +799,7 @@ def it_computes_the_original_elements_list_to_help(self): assert orig_elements[2] is not element_3 assert orig_elements[2].metadata.orig_elements is None # -- computation is only on first call, all chunks get exactly the same orig-elements -- - assert pre_chunk._orig_elements is orig_elements - - @pytest.mark.parametrize( - ("elements", "overlap_prefix", "expected_value"), - [ - ([Text("foo"), Text("bar")], "bah da bing.", "bah da bing.\n\nfoo\n\nbar"), - ([Text("foo"), PageBreak(""), Text("bar")], "da bang.", "da bang.\n\nfoo\n\nbar"), - ([PageBreak(""), Text("foo")], "bah da boom.", "bah da boom.\n\nfoo"), - ([Text("foo"), Text("bar"), PageBreak("")], "", "foo\n\nbar"), - ], - ) - def it_knows_the_concatenated_text_of_the_pre_chunk_to_help( - self, elements: list[Text], overlap_prefix: str, expected_value: str - ): - """._text is the "joined" text of the pre-chunk elements. - - The text-segment contributed by each element is separated from the next by a blank line - ("\n\n"). An element that contributes no text does not give rise to a separator. - """ - pre_chunk = PreChunk(elements, overlap_prefix=overlap_prefix, opts=ChunkingOptions()) - assert pre_chunk._text == expected_value - - -# ================================================================================================ -# CHUNKING HELPER/SPLITTERS -# ================================================================================================ + assert chunker._orig_elements is orig_elements class Describe_TableChunker: diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py index 318b747640..27fd7e62fb 100644 --- a/unstructured/chunking/base.py +++ b/unstructured/chunking/base.py @@ -481,13 +481,78 @@ def iter_chunks(self) -> Iterator[CompositeElement | Table | TableChunk]: yield from _TableChunker.iter_chunks( self._elements[0], self._overlap_prefix, self._opts ) - return + else: + yield from _Chunker.iter_chunks(self._elements, self._text, self._opts) + + @lazyproperty + def overlap_tail(self) -> str: + """The portion of this chunk's text to be repeated as a prefix in the next chunk. + + This value is the empty-string ("") when either the `.overlap` length option is `0` or + `.overlap_all` is `False`. When there is a text value, it is stripped of both leading and + trailing whitespace. 
+ """ + overlap = self._opts.inter_chunk_overlap + return self._text[-overlap:].strip() if overlap else "" + + def _iter_text_segments(self) -> Iterator[str]: + """Generate overlap text and each element text segment in order. + + Empty text segments are not included. + """ + if self._overlap_prefix: + yield self._overlap_prefix + for e in self._elements: + text = " ".join(e.text.strip().split()) + if not text: + continue + yield text + + @lazyproperty + def _text(self) -> str: + """The concatenated text of all elements in this pre-chunk, including any overlap. + + Whitespace is normalized to a single space. The text of each element is separated from + that of the next by a blank line ("\n\n"). + """ + return self._opts.text_separator.join(self._iter_text_segments()) + +# ================================================================================================ +# CHUNKING HELPER/SPLITTERS +# ================================================================================================ + + +class _Chunker: + """Forms chunks from a pre-chunk other than one containing only a `Table`. + + Produces zero-or-more `CompositeElement` objects. + """ + + def __init__(self, elements: Iterable[Element], text: str, opts: ChunkingOptions) -> None: + self._elements = list(elements) + self._text = text + self._opts = opts + + @classmethod + def iter_chunks( + cls, elements: Iterable[Element], text: str, opts: ChunkingOptions + ) -> Iterator[CompositeElement]: + """Form zero or more chunks from `elements`. + + One `CompositeElement` is produced when all `elements` will fit. Otherwise there is a + single `Text`-subtype element and chunks are formed by splitting. + """ + return cls(elements, text, opts)._iter_chunks() + + def _iter_chunks(self) -> Iterator[CompositeElement]: + """Form zero or more chunks from `elements`.""" # -- a pre-chunk containing no text (maybe only a PageBreak element for example) does not # -- generate any chunks. if not self._text: return + # -- `split()` is the text-splitting function used to split an oversized element -- split = self._opts.split # -- emit first chunk -- @@ -500,17 +565,6 @@ def iter_chunks(self) -> Iterator[CompositeElement | Table | TableChunk]: s, remainder = split(remainder) yield CompositeElement(text=s, metadata=self._continuation_metadata) - @lazyproperty - def overlap_tail(self) -> str: - """The portion of this chunk's text to be repeated as a prefix in the next chunk. - - This value is the empty-string ("") when either the `.overlap` length option is `0` or - `.overlap_all` is `False`. When there is a text value, it is stripped of both leading and - trailing whitespace. - """ - overlap = self._opts.inter_chunk_overlap - return self._text[-overlap:].strip() if overlap else "" - @lazyproperty def _all_metadata_values(self) -> dict[str, list[Any]]: """Collection of all populated metadata values across elements. @@ -576,19 +630,6 @@ def _continuation_metadata(self) -> ElementMetadata: continuation_metadata.is_continuation = True return continuation_metadata - def _iter_text_segments(self) -> Iterator[str]: - """Generate overlap text and each element text segment in order. - - Empty text segments are not included. - """ - if self._overlap_prefix: - yield self._overlap_prefix - for e in self._elements: - text = " ".join(e.text.strip().split()) - if not text: - continue - yield text - @lazyproperty def _meta_kwargs(self) -> dict[str, Any]: """The consolidated metadata values as a dict suitable for constructing ElementMetadata. 
@@ -645,21 +686,6 @@ def iter_orig_elements(): return list(iter_orig_elements()) - @lazyproperty - def _text(self) -> str: - """The concatenated text of all elements in this pre-chunk, including any overlap. - - Whitespace is normalized to a single space. The text of each element is separated from - that of the next by a blank line ("\n\n"). - """ - text_separator = self._opts.text_separator - return text_separator.join(self._iter_text_segments()) - - -# ================================================================================================ -# CHUNKING HELPER/SPLITTERS -# ================================================================================================ - class _TableChunker: """Responsible for forming chunks, especially splits, from a single-table pre-chunk.
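For illustration only (not part of the patch): a minimal sketch of driving the extracted `_Chunker` helper directly, the same way the updated tests and the delegating `PreChunk.iter_chunks()` do. `_Chunker` and `ChunkingOptions` come from `unstructured.chunking.base` as shown in this diff; the element-class import path below is an assumption based on the released `unstructured` package.

    from unstructured.chunking.base import ChunkingOptions, _Chunker
    from unstructured.documents.elements import Text, Title  # assumed import path

    elements = [
        Title("Introduction"),
        Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
    ]
    # After this refactor the caller (normally PreChunk) supplies the pre-joined
    # text, overlap prefix included, rather than _Chunker deriving it itself.
    text = "Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit."
    opts = ChunkingOptions(max_characters=200, include_orig_elements=True)

    for chunk in _Chunker.iter_chunks(elements, text, opts=opts):
        # With max_characters=200 both elements fit in the window, so a single
        # CompositeElement is produced and, because include_orig_elements=True,
        # chunk.metadata.orig_elements records the two input elements.
        print(type(chunk).__name__, chunk.text)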