diff --git a/CHANGELOG.md b/CHANGELOG.md index d603e41945..000b853330 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ * **Fix wrong logger for paddle info** Replace the logger from unstructured-inference with the logger from unstructured for paddle_ocr.py module. * **Fix ingest pipeline to be able to use chunking and embedding together** Problem: When ingest pipeline was using chunking and embedding together, embedding outputs were empty and the outputs of chunking couldn't be re-read into memory and be forwarded to embeddings. Fix: Added CompositeElement type to TYPE_TO_TEXT_ELEMENT_MAP to be able to process CompositeElements with unstructured.staging.base.isd_to_elements * **Fix unnecessary mid-text chunk-splitting.** The "pre-chunker" did not consider separator blank-line ("\n\n") length when grouping elements for a single chunk. As a result, sections were frequently over-populated producing a over-sized chunk that required mid-text splitting. +* **Fix frequent dissociation of title from chunk.** The sectioning algorithm included the title of the next section with the prior section whenever it would fit, frequently producing association of a section title with the prior section and dissociating it from its actual section. Fix this by performing combination of whole sections only. 
## 0.10.27 diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py index 3853872388..db77d0703e 100644 --- a/test_unstructured/chunking/test_title.py +++ b/test_unstructured/chunking/test_title.py @@ -6,9 +6,11 @@ from unstructured.chunking.title import ( _NonTextSection, + _SectionCombiner, _split_elements_by_title_and_table, _TableSection, _TextSection, + _TextSectionAccumulator, _TextSectionBuilder, chunk_by_title, ) @@ -199,7 +201,6 @@ def test_split_elements_by_title_and_table(): sections = _split_elements_by_title_and_table( elements, multipage_sections=True, - combine_text_under_n_chars=0, new_after_n_chars=500, max_characters=500, ) @@ -734,7 +735,7 @@ def it_provides_access_to_its_elements(self): class Describe_TextSectionBuilder: - """Unit-test suite for `unstructured.chunking.title._TextSection objects.""" + """Unit-test suite for `unstructured.chunking.title._TextSectionBuilder`.""" def it_is_empty_on_construction(self): builder = _TextSectionBuilder(maxlen=50) @@ -802,3 +803,347 @@ def it_considers_separator_length_when_computing_text_length_and_remaining_space # -- between the current text and that of the next element if one was added. 
# == SectionCombiner =============================================================================


def _lorem_ipsum_elements():
    """Fresh elements for a section whose text is 11 + 2 (separator) + 55 = 68 chars."""
    return [
        Title("Lorem Ipsum"),
        Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
    ]


def _mauris_nec_elements():
    """Fresh elements for a section whose text is 10 + 2 (separator) + 59 = 71 chars."""
    return [
        Title("Mauris Nec"),
        Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
    ]


def _sed_orci_elements():
    """Fresh elements for a section whose text is 8 + 2 (separator) + 63 = 73 chars."""
    return [
        Title("Sed Orci"),
        Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
    ]


class Describe_SectionCombiner:
    """Unit-test suite for `unstructured.chunking.title._SectionCombiner`."""

    def it_combines_sequential_small_text_sections(self):
        combiner = _SectionCombiner(
            [
                _TextSection(_lorem_ipsum_elements()),
                _TextSection(_mauris_nec_elements()),
                _TextSection(_sed_orci_elements()),
            ],
            maxlen=250,
            combine_text_under_n_chars=250,
        )

        combined = list(combiner.iter_combined_sections())

        # -- 68 + 2 + 71 + 2 + 73 = 216 fits in the 250-char window, so a single section results --
        assert len(combined) == 1
        (section,) = combined
        assert isinstance(section, _TextSection)
        assert section._elements == (
            _lorem_ipsum_elements() + _mauris_nec_elements() + _sed_orci_elements()
        )

    def but_it_does_not_combine_table_or_non_text_sections(self):
        # NOTE(review): in the reviewed text this literal is a bare line break between the
        # quotes -- confirm the intended table text. The test only needs both uses to agree.
        table_text = "\n"
        combiner = _SectionCombiner(
            [
                _TextSection(_lorem_ipsum_elements()),
                _TableSection(Table(table_text)),
                _TextSection(_mauris_nec_elements()),
                _NonTextSection(CheckBox()),
                _TextSection(_sed_orci_elements()),
            ],
            maxlen=250,
            combine_text_under_n_chars=250,
        )

        emitted = list(combiner.iter_combined_sections())

        # -- every input section comes out on its own, in its original position --
        assert len(emitted) == 5
        first, table_section, third, checkbox_section, fifth = emitted
        # --
        assert isinstance(first, _TextSection)
        assert first._elements == _lorem_ipsum_elements()
        # --
        assert isinstance(table_section, _TableSection)
        assert table_section.table == Table(table_text)
        # --
        assert isinstance(third, _TextSection)
        assert third._elements == _mauris_nec_elements()
        # --
        assert isinstance(checkbox_section, _NonTextSection)
        assert checkbox_section.element == CheckBox()
        # --
        assert isinstance(fifth, _TextSection)
        assert fifth._elements == _sed_orci_elements()

    def it_respects_the_specified_combination_threshold(self):
        combiner = _SectionCombiner(
            [
                _TextSection(_lorem_ipsum_elements()),  # 68, under the 80-char threshold
                _TextSection(_mauris_nec_elements()),  # 68 + 2 + 71 = 141, now over it
                _TextSection(_sed_orci_elements()),
            ],
            maxlen=250,
            combine_text_under_n_chars=80,
        )

        emitted = list(combiner.iter_combined_sections())

        assert len(emitted) == 2
        head, tail = emitted
        # --
        assert isinstance(head, _TextSection)
        assert head._elements == _lorem_ipsum_elements() + _mauris_nec_elements()
        # --
        assert isinstance(tail, _TextSection)
        assert tail._elements == _sed_orci_elements()

    def it_respects_the_hard_maximum_window_length(self):
        combiner = _SectionCombiner(
            [
                _TextSection(_lorem_ipsum_elements()),  # 68
                _TextSection(_mauris_nec_elements()),  # 68 + 2 + 71 = 141
                _TextSection(_sed_orci_elements()),  # 141 + 2 + 73 = 216, exceeds 200
            ],
            maxlen=200,
            combine_text_under_n_chars=200,
        )

        emitted = list(combiner.iter_combined_sections())

        assert len(emitted) == 2
        head, tail = emitted
        # --
        assert isinstance(head, _TextSection)
        assert head._elements == _lorem_ipsum_elements() + _mauris_nec_elements()
        # --
        assert isinstance(tail, _TextSection)
        assert tail._elements == _sed_orci_elements()

    def it_accommodates_and_isolates_an_oversized_section(self):
        """Such as occurs when a single element exceeds the window size."""
        oversized_text = (
            "Lorem ipsum dolor sit amet consectetur adipiscing elit."  # 55
            " Mauris nec urna non augue vulputate consequat eget et nisi."  # 60
            " Sed orci quam, eleifend sit amet vehicula, elementum ultricies."  # 64
        )
        combiner = _SectionCombiner(
            [
                _TextSection([Title("Lorem Ipsum")]),
                _TextSection([Text(oversized_text)]),  # 179, larger than the 150-char window
                _TextSection([Title("Vulputate Consequat")]),
            ],
            maxlen=150,
            combine_text_under_n_chars=150,
        )

        emitted = list(combiner.iter_combined_sections())

        assert len(emitted) == 3
        before, oversized, after = emitted
        # --
        assert isinstance(before, _TextSection)
        assert before._elements == [Title("Lorem Ipsum")]
        # --
        assert isinstance(oversized, _TextSection)
        assert oversized._elements == [Text(oversized_text)]
        # --
        assert isinstance(after, _TextSection)
        assert after._elements == [Title("Vulputate Consequat")]


class Describe_TextSectionAccumulator:
    """Unit-test suite for `unstructured.chunking.title._TextSectionAccumulator`."""

    def it_is_empty_on_construction(self):
        accumulator = _TextSectionAccumulator(maxlen=100)

        assert accumulator.text_length == 0
        assert accumulator.remaining_space == 100

    def it_accumulates_sections_added_to_it(self):
        accumulator = _TextSectionAccumulator(maxlen=500)

        accumulator.add_section(_TextSection(_lorem_ipsum_elements()))

        assert accumulator.text_length == 68
        assert accumulator.remaining_space == 430

        accumulator.add_section(_TextSection(_mauris_nec_elements()))

        assert accumulator.text_length == 141
        assert accumulator.remaining_space == 357

    def it_generates_a_TextSection_when_flushed_and_resets_itself_to_empty(self):
        sed_orci_quam_elements = [
            Title("Sed Orci"),
            Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
        ]
        accumulator = _TextSectionAccumulator(maxlen=150)
        for elements in (
            _lorem_ipsum_elements(),
            _mauris_nec_elements(),
            sed_orci_quam_elements,
        ):
            accumulator.add_section(_TextSection(elements))

        flushed = list(accumulator.flush())

        # -- flushing generates exactly one section --
        assert len(flushed) == 1
        (section,) = flushed
        # -- and it is a _TextSection containing all the elements, in order --
        assert isinstance(section, _TextSection)
        assert section._elements == [
            Title("Lorem Ipsum"),
            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
            Title("Mauris Nec"),
            Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
            Title("Sed Orci"),
            Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
        ]
        # -- after which the accumulator is empty again --
        assert accumulator.text_length == 0
        assert accumulator.remaining_space == 150

    def but_it_does_not_generate_a_TextSection_on_flush_when_empty(self):
        accumulator = _TextSectionAccumulator(maxlen=150)

        flushed = list(accumulator.flush())

        assert flushed == []
        assert accumulator.text_length == 0
        assert accumulator.remaining_space == 150

    def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
        accumulator = _TextSectionAccumulator(maxlen=100)
        for text in ("abcde", "fghij"):
            accumulator.add_section(_TextSection([Text(text)]))

        # -- a 2-char blank-line separator sits between the two text-segments, so
        # -- .text_length is 5 + 2 + 5 = 12, not 5 + 5 = 10
        assert accumulator.text_length == 12
        # -- adding one more section would cost another 2-char separator, so
        # -- .remaining_space is 100 - 12 - 2 = 86, not 100 - 12 = 88
        assert accumulator.remaining_space == 86
This is fixed in the next PR. - ( - isinstance(element, Title) - and section_builder.text_length > combine_text_under_n_chars - ) + # -- Title, Table, and non-Text element (CheckBox) all start a new section -- + isinstance(element, (Title, Table)) + or not isinstance(element, Text) # -- adding this element would exceed hard-maxlen for section -- or section_builder.remaining_space < len(str(element)) # -- section already meets or exceeds soft-maxlen -- or section_builder.text_length >= new_after_n_chars # -- a semantic boundary is indicated by metadata change since prior element -- or metadata_differs - # -- table and non-text elements go in a section by themselves -- - or isinstance(element, Table) - or not isinstance(element, Text) ): # -- complete any work-in-progress section -- yield from section_builder.flush() @@ -341,6 +340,9 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]: return decorator +# == Sections ==================================================================================== + + class _NonTextSection: """A section composed of a single `Element` that does not subclass `Text`. @@ -381,11 +383,28 @@ class _TextSection: def __init__(self, elements: Iterable[Element]) -> None: self._elements = list(elements) + def combine(self, other_section: _TextSection) -> _TextSection: + """Return new `_TextSection` that combines this and `other_section`.""" + return _TextSection(self._elements + other_section._elements) + @property def elements(self) -> List[Element]: """The elements of this text-section.""" return self._elements + @lazyproperty + def text_length(self) -> int: + """Length of concatenated text of this section, including separators.""" + return len(self._text) + + @lazyproperty + def _text(self) -> str: + """The concatenated text of all elements in this section. + + Each element-text is separated from the next by a blank line ("\n\n"). 
# == SectionCombiner =============================================================================


class _SectionCombiner:
    """Filters section stream to combine small sections where possible.

    `sections` is the stream of section objects to filter. `maxlen` is the hard maximum text
    length of a combined section. `combine_text_under_n_chars` is the soft threshold: once the
    accumulated text has reached this length no further section is combined into it.

    Table and non-text sections are never combined; they pass through unchanged, in position.
    """

    def __init__(
        self,
        sections: Iterable[_Section],
        maxlen: int,
        combine_text_under_n_chars: int,
    ) -> None:
        self._sections = sections
        self._maxlen = maxlen
        self._combine_text_under_n_chars = combine_text_under_n_chars

    def iter_combined_sections(self) -> Iterator[_Section]:
        """Generate section objects, combining TextSection objects when they will fit in window."""
        accum = _TextSectionAccumulator(self._maxlen)

        for section in self._sections:
            # -- a table or checkbox section is never combined: emit any pending combined
            # -- text-section first so document order is preserved, then the section itself
            if isinstance(section, (_TableSection, _NonTextSection)):
                yield from accum.flush()
                yield section
                continue

            # -- start a new combined section under these conditions --
            if (
                # -- don't add another section once length has reached combination soft-max --
                accum.text_length >= self._combine_text_under_n_chars
                # -- combining would exceed hard-max --
                or accum.remaining_space < section.text_length
            ):
                yield from accum.flush()

            # -- an over-sized section is still accepted here; the accumulator is empty at this
            # -- point in that case, so the section ends up isolated in a section of its own
            accum.add_section(section)

        # -- emit whatever is still pending once the input is exhausted --
        yield from accum.flush()


class _TextSectionAccumulator:
    """Accumulates, measures, and combines section objects.

    Provides monitoring properties `.remaining_space` and `.text_length` suitable for deciding
    whether to add another section. `maxlen` is not enforced by `.add_section()`; respecting it
    is up to the caller.

    `.flush()` is used to combine the accumulated sections into a single `_TextSection` object.
    This method returns an iterator that generates zero-or-one `_TextSection` objects and is used
    like so:

        yield from accum.flush()

    If no sections have been accumulated, no `_TextSection` is generated. Flushing the accumulator
    clears the sections it contains so it is ready to accept the next text-section.
    """

    def __init__(self, maxlen: int) -> None:
        self._maxlen = maxlen
        self._sections: List[_TextSection] = []

    def add_section(self, section: _TextSection) -> None:
        """Add a section to the accumulator for possible combination with next section."""
        self._sections.append(section)

    def flush(self) -> Iterator[_TextSection]:
        """Generate all accumulated sections as a single combined section.

        Generates nothing when the accumulator is empty. Because this is a generator, nothing
        happens until it is iterated.
        """
        # -- nothing to do if no sections have been accumulated --
        if not self._sections:
            return

        # -- take the pending sections and reset the accumulator (to empty) *before* yielding.
        # -- A generator is suspended at `yield`, so a reset placed after it would never run for
        # -- a caller that takes the section with a single `next()` and drops the iterator.
        pending = list(self._sections)
        self._sections.clear()

        # -- combine all accumulated sections into one --
        combined = pending[0]
        for other_section in pending[1:]:
            combined = combined.combine(other_section)
        yield combined

    @property
    def remaining_space(self) -> int:
        """Maximum size of section that can be added without exceeding maxlen."""
        if not self._sections:
            return self._maxlen
        # -- an additional section will also incur an additional separator --
        return self._maxlen - self.text_length - len(TEXT_SEPARATOR)

    @property
    def text_length(self) -> int:
        """Size of concatenated text in all sections in accumulator, separators included."""
        n = len(self._sections)
        if n == 0:
            return 0
        # -- n sections are joined by n - 1 separators --
        return sum(s.text_length for s in self._sections) + len(TEXT_SEPARATOR) * (n - 1)