diff --git a/CHANGELOG.md b/CHANGELOG.md
index d603e41945..000b853330 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,7 @@
* **Fix wrong logger for paddle info** Replace the logger from unstructured-inference with the logger from unstructured for paddle_ocr.py module.
* **Fix ingest pipeline to be able to use chunking and embedding together** Problem: When ingest pipeline was using chunking and embedding together, embedding outputs were empty and the outputs of chunking couldn't be re-read into memory and be forwarded to embeddings. Fix: Added CompositeElement type to TYPE_TO_TEXT_ELEMENT_MAP to be able to process CompositeElements with unstructured.staging.base.isd_to_elements
* **Fix unnecessary mid-text chunk-splitting.** The "pre-chunker" did not consider separator blank-line ("\n\n") length when grouping elements for a single chunk. As a result, sections were frequently over-populated producing a over-sized chunk that required mid-text splitting.
+* **Fix frequent dissociation of title from chunk.** The sectioning algorithm included the title of the next section with the prior section whenever it would fit, frequently producing association of a section title with the prior section and dissociating it from its actual section. Fix this by performing combination of whole sections only.
## 0.10.27
diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py
index 3853872388..db77d0703e 100644
--- a/test_unstructured/chunking/test_title.py
+++ b/test_unstructured/chunking/test_title.py
@@ -6,9 +6,11 @@
from unstructured.chunking.title import (
_NonTextSection,
+ _SectionCombiner,
_split_elements_by_title_and_table,
_TableSection,
_TextSection,
+ _TextSectionAccumulator,
_TextSectionBuilder,
chunk_by_title,
)
@@ -199,7 +201,6 @@ def test_split_elements_by_title_and_table():
sections = _split_elements_by_title_and_table(
elements,
multipage_sections=True,
- combine_text_under_n_chars=0,
new_after_n_chars=500,
max_characters=500,
)
@@ -734,7 +735,7 @@ def it_provides_access_to_its_elements(self):
class Describe_TextSectionBuilder:
- """Unit-test suite for `unstructured.chunking.title._TextSection objects."""
+ """Unit-test suite for `unstructured.chunking.title._TextSectionBuilder`."""
def it_is_empty_on_construction(self):
builder = _TextSectionBuilder(maxlen=50)
@@ -802,3 +803,347 @@ def it_considers_separator_length_when_computing_text_length_and_remaining_space
# -- between the current text and that of the next element if one was added.
# -- So 50 - 12 - 2 = 36 here, not 50 - 12 = 38
assert builder.remaining_space == 36
+
+
+# == SectionCombiner =============================================================================
+
+
+class Describe_SectionCombiner:
+ """Unit-test suite for `unstructured.chunking.title._SectionCombiner`."""
+
+ def it_combines_sequential_small_text_sections(self):
+ sections = [
+ _TextSection(
+ [
+ Title("Lorem Ipsum"), # 11
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
+ ]
+ ),
+ _TextSection(
+ [
+ Title("Mauris Nec"), # 10
+ Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
+ ]
+ ),
+ _TextSection(
+ [
+ Title("Sed Orci"), # 8
+ Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
+ ]
+ ),
+ ]
+
+ section_iter = _SectionCombiner(
+ sections, maxlen=250, combine_text_under_n_chars=250
+ ).iter_combined_sections()
+
+ section = next(section_iter)
+ assert isinstance(section, _TextSection)
+ assert section._elements == [
+ Title("Lorem Ipsum"),
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
+ Title("Mauris Nec"),
+ Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
+ Title("Sed Orci"),
+ Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
+ ]
+ with pytest.raises(StopIteration):
+ next(section_iter)
+
+ def but_it_does_not_combine_table_or_non_text_sections(self):
+ sections = [
+ _TextSection(
+ [
+ Title("Lorem Ipsum"),
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
+ ]
+ ),
+            _TableSection(Table("")),
+ _TextSection(
+ [
+ Title("Mauris Nec"),
+ Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
+ ]
+ ),
+ _NonTextSection(CheckBox()),
+ _TextSection(
+ [
+ Title("Sed Orci"),
+ Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
+ ]
+ ),
+ ]
+
+ section_iter = _SectionCombiner(
+ sections, maxlen=250, combine_text_under_n_chars=250
+ ).iter_combined_sections()
+
+ section = next(section_iter)
+ assert isinstance(section, _TextSection)
+ assert section._elements == [
+ Title("Lorem Ipsum"),
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
+ ]
+ # --
+ section = next(section_iter)
+ assert isinstance(section, _TableSection)
+ assert section.table == Table("")
+ # --
+ section = next(section_iter)
+ assert isinstance(section, _TextSection)
+ assert section._elements == [
+ Title("Mauris Nec"),
+ Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
+ ]
+ # --
+ section = next(section_iter)
+ assert isinstance(section, _NonTextSection)
+ assert section.element == CheckBox()
+ # --
+ section = next(section_iter)
+ assert isinstance(section, _TextSection)
+ assert section._elements == [
+ Title("Sed Orci"),
+ Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
+ ]
+ # --
+ with pytest.raises(StopIteration):
+ next(section_iter)
+
+ def it_respects_the_specified_combination_threshold(self):
+ sections = [
+ _TextSection( # 68
+ [
+ Title("Lorem Ipsum"), # 11
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
+ ]
+ ),
+ _TextSection( # 71
+ [
+ Title("Mauris Nec"), # 10
+ Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
+ ]
+ ),
+            # -- len == 141 (68 + 2 + 71, counting the "\n\n" separator)
+ _TextSection(
+ [
+ Title("Sed Orci"), # 8
+ Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
+ ]
+ ),
+ ]
+
+ section_iter = _SectionCombiner(
+ sections, maxlen=250, combine_text_under_n_chars=80
+ ).iter_combined_sections()
+
+ section = next(section_iter)
+ assert isinstance(section, _TextSection)
+ assert section._elements == [
+ Title("Lorem Ipsum"),
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
+ Title("Mauris Nec"),
+ Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
+ ]
+ # --
+ section = next(section_iter)
+ assert isinstance(section, _TextSection)
+ assert section._elements == [
+ Title("Sed Orci"),
+ Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
+ ]
+ # --
+ with pytest.raises(StopIteration):
+ next(section_iter)
+
+ def it_respects_the_hard_maximum_window_length(self):
+ sections = [
+ _TextSection( # 68
+ [
+ Title("Lorem Ipsum"), # 11
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
+ ]
+ ),
+ _TextSection( # 71
+ [
+ Title("Mauris Nec"), # 10
+ Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
+ ]
+ ),
+            # -- len == 141 (68 + 2 + 71, counting the "\n\n" separator)
+ _TextSection(
+ [
+ Title("Sed Orci"), # 8
+ Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
+ ]
+ ),
+            # -- len == 216 (141 + 2 + 73, counting the "\n\n" separator)
+ ]
+
+ section_iter = _SectionCombiner(
+ sections, maxlen=200, combine_text_under_n_chars=200
+ ).iter_combined_sections()
+
+ section = next(section_iter)
+ assert isinstance(section, _TextSection)
+ assert section._elements == [
+ Title("Lorem Ipsum"),
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
+ Title("Mauris Nec"),
+ Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
+ ]
+ # --
+ section = next(section_iter)
+ assert isinstance(section, _TextSection)
+ assert section._elements == [
+ Title("Sed Orci"),
+ Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
+ ]
+ # --
+ with pytest.raises(StopIteration):
+ next(section_iter)
+
+ def it_accommodates_and_isolates_an_oversized_section(self):
+ """Such as occurs when a single element exceeds the window size."""
+
+ sections = [
+ _TextSection([Title("Lorem Ipsum")]),
+ _TextSection( # 179
+ [
+ Text(
+ "Lorem ipsum dolor sit amet consectetur adipiscing elit." # 55
+ " Mauris nec urna non augue vulputate consequat eget et nisi." # 60
+ " Sed orci quam, eleifend sit amet vehicula, elementum ultricies." # 64
+ )
+ ]
+ ),
+ _TextSection([Title("Vulputate Consequat")]),
+ ]
+
+ section_iter = _SectionCombiner(
+ sections, maxlen=150, combine_text_under_n_chars=150
+ ).iter_combined_sections()
+
+ section = next(section_iter)
+ assert isinstance(section, _TextSection)
+ assert section._elements == [Title("Lorem Ipsum")]
+ # --
+ section = next(section_iter)
+ assert isinstance(section, _TextSection)
+ assert section._elements == [
+ Text(
+ "Lorem ipsum dolor sit amet consectetur adipiscing elit."
+ " Mauris nec urna non augue vulputate consequat eget et nisi."
+ " Sed orci quam, eleifend sit amet vehicula, elementum ultricies."
+ )
+ ]
+ # --
+ section = next(section_iter)
+ assert isinstance(section, _TextSection)
+ assert section._elements == [Title("Vulputate Consequat")]
+ # --
+ with pytest.raises(StopIteration):
+ next(section_iter)
+
+
+class Describe_TextSectionAccumulator:
+ """Unit-test suite for `unstructured.chunking.title._TextSectionAccumulator`."""
+
+ def it_is_empty_on_construction(self):
+ accum = _TextSectionAccumulator(maxlen=100)
+
+ assert accum.text_length == 0
+ assert accum.remaining_space == 100
+
+ def it_accumulates_sections_added_to_it(self):
+ accum = _TextSectionAccumulator(maxlen=500)
+
+ accum.add_section(
+ _TextSection(
+ [
+ Title("Lorem Ipsum"),
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
+ ]
+ )
+ )
+ assert accum.text_length == 68
+ assert accum.remaining_space == 430
+
+ accum.add_section(
+ _TextSection(
+ [
+ Title("Mauris Nec"),
+ Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
+ ]
+ )
+ )
+ assert accum.text_length == 141
+ assert accum.remaining_space == 357
+
+ def it_generates_a_TextSection_when_flushed_and_resets_itself_to_empty(self):
+ accum = _TextSectionAccumulator(maxlen=150)
+ accum.add_section(
+ _TextSection(
+ [
+ Title("Lorem Ipsum"),
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
+ ]
+ )
+ )
+ accum.add_section(
+ _TextSection(
+ [
+ Title("Mauris Nec"),
+ Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
+ ]
+ )
+ )
+ accum.add_section(
+ _TextSection(
+ [
+ Title("Sed Orci"),
+ Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
+ ]
+ )
+ )
+
+ section_iter = accum.flush()
+
+ # -- iterator generates exactly one section --
+ section = next(section_iter)
+ with pytest.raises(StopIteration):
+ next(section_iter)
+ # -- and it is a _TextSection containing all the elements --
+ assert isinstance(section, _TextSection)
+ assert section._elements == [
+ Title("Lorem Ipsum"),
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
+ Title("Mauris Nec"),
+ Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
+ Title("Sed Orci"),
+ Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
+ ]
+ assert accum.text_length == 0
+ assert accum.remaining_space == 150
+
+ def but_it_does_not_generate_a_TextSection_on_flush_when_empty(self):
+ accum = _TextSectionAccumulator(maxlen=150)
+
+ sections = list(accum.flush())
+
+ assert sections == []
+ assert accum.text_length == 0
+ assert accum.remaining_space == 150
+
+ def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
+ accum = _TextSectionAccumulator(maxlen=100)
+ accum.add_section(_TextSection([Text("abcde")]))
+ accum.add_section(_TextSection([Text("fghij")]))
+
+ # -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
+ # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
+ assert accum.text_length == 12
+ # -- .remaining_space is reduced by the length (2) of the trailing separator which would
+ # -- go between the current text and that of the next section if one was added.
+ # -- So 100 - 12 - 2 = 86 here, not 100 - 12 = 88
+ assert accum.remaining_space == 86
diff --git a/unstructured/chunking/title.py b/unstructured/chunking/title.py
index ef78684aee..b70e5b3c4c 100644
--- a/unstructured/chunking/title.py
+++ b/unstructured/chunking/title.py
@@ -10,7 +10,7 @@
import inspect
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, cast
-from typing_extensions import ParamSpec
+from typing_extensions import ParamSpec, TypeAlias
from unstructured.documents.elements import (
CompositeElement,
@@ -20,6 +20,9 @@
Text,
Title,
)
+from unstructured.utils import lazyproperty
+
+_Section: TypeAlias = "_NonTextSection | _TableSection | _TextSection"
# -- goes between text of each element when element-text is concatenated to form chunk --
TEXT_SEPARATOR = "\n\n"
@@ -124,13 +127,16 @@ def chunk_by_title(
chunked_elements: List[Element] = []
- sections = _split_elements_by_title_and_table(
- elements,
- multipage_sections=multipage_sections,
- combine_text_under_n_chars=combine_text_under_n_chars,
- new_after_n_chars=new_after_n_chars,
- max_characters=max_characters,
- )
+ sections = _SectionCombiner(
+ _split_elements_by_title_and_table(
+ elements,
+ multipage_sections=multipage_sections,
+ new_after_n_chars=new_after_n_chars,
+ max_characters=max_characters,
+ ),
+ max_characters,
+ combine_text_under_n_chars,
+ ).iter_combined_sections()
for section in sections:
if isinstance(section, _NonTextSection):
@@ -195,7 +201,6 @@ def chunk_by_title(
def _split_elements_by_title_and_table(
elements: List[Element],
multipage_sections: bool,
- combine_text_under_n_chars: int,
new_after_n_chars: int,
max_characters: int,
) -> Iterator[_TextSection | _TableSection | _NonTextSection]:
@@ -233,21 +238,15 @@ def _split_elements_by_title_and_table(
# -- start new section when necessary --
if (
- # TODO(scanny): this is where disassociated-titles are coming from (attempting to
- # combine sections at the element level). This is fixed in the next PR.
- (
- isinstance(element, Title)
- and section_builder.text_length > combine_text_under_n_chars
- )
+ # -- Title, Table, and non-Text element (CheckBox) all start a new section --
+ isinstance(element, (Title, Table))
+ or not isinstance(element, Text)
# -- adding this element would exceed hard-maxlen for section --
or section_builder.remaining_space < len(str(element))
# -- section already meets or exceeds soft-maxlen --
or section_builder.text_length >= new_after_n_chars
# -- a semantic boundary is indicated by metadata change since prior element --
or metadata_differs
- # -- table and non-text elements go in a section by themselves --
- or isinstance(element, Table)
- or not isinstance(element, Text)
):
# -- complete any work-in-progress section --
yield from section_builder.flush()
@@ -341,6 +340,9 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]:
return decorator
+# == Sections ====================================================================================
+
+
class _NonTextSection:
"""A section composed of a single `Element` that does not subclass `Text`.
@@ -381,11 +383,28 @@ class _TextSection:
def __init__(self, elements: Iterable[Element]) -> None:
self._elements = list(elements)
+ def combine(self, other_section: _TextSection) -> _TextSection:
+ """Return new `_TextSection` that combines this and `other_section`."""
+ return _TextSection(self._elements + other_section._elements)
+
@property
def elements(self) -> List[Element]:
"""The elements of this text-section."""
return self._elements
+ @lazyproperty
+ def text_length(self) -> int:
+ """Length of concatenated text of this section, including separators."""
+ return len(self._text)
+
+ @lazyproperty
+ def _text(self) -> str:
+ """The concatenated text of all elements in this section.
+
+ Each element-text is separated from the next by a blank line ("\n\n").
+ """
+ return TEXT_SEPARATOR.join(e.text for e in self._elements if isinstance(e, Text) and e.text)
+
class _TextSectionBuilder:
"""An element accumulator suitable for incrementally forming a section.
@@ -462,3 +481,106 @@ def text_length(self) -> int:
n = len(self._text_segments)
separator_count = n - 1 if n else 0
return self._text_len + (separator_count * self._separator_len)
+
+
+# == SectionCombiner =============================================================================
+
+
+class _SectionCombiner:
+    """Combines runs of small text-sections in a section stream into larger ones.
+
+    Table and non-text sections pass through unchanged and are never merged with a neighbor.
+    """
+
+    def __init__(
+        self,
+        sections: Iterable[_Section],
+        maxlen: int,
+        combine_text_under_n_chars: int,
+    ):
+        self._sections = sections
+        self._maxlen = maxlen
+        self._combine_text_under_n_chars = combine_text_under_n_chars
+
+    def iter_combined_sections(self) -> Iterator[_Section]:
+        """Generate sections, merging consecutive `_TextSection` objects that fit in the window."""
+        soft_max = self._combine_text_under_n_chars
+        pending = _TextSectionAccumulator(self._maxlen)
+
+        for section in self._sections:
+            # -- a table or checkbox section is emitted alone, right after any pending text --
+            if isinstance(section, (_TableSection, _NonTextSection)):
+                yield from pending.flush()
+                yield section
+                continue
+
+            # -- close out the pending text when it has already reached the combination soft-max
+            # -- or when adding this section would push it past the hard-max --
+            if pending.text_length >= soft_max or pending.remaining_space < section.text_length:
+                yield from pending.flush()
+
+            pending.add_section(section)
+
+        # -- emit whatever text is still pending once the stream is exhausted --
+        yield from pending.flush()
+
+
+class _TextSectionAccumulator:
+    """Accumulates, measures, and combines section objects.
+
+    Provides monitoring properties `.remaining_space` and `.text_length` suitable for deciding
+    whether to add another section.
+
+    `.flush()` is used to combine the accumulated sections into a single `TextSection` object. This
+    method returns an iterator that generates zero-or-one `TextSection` objects and is used like
+    so:
+
+        yield from accum.flush()
+
+    If no sections have been accumulated, no `TextSection` is generated. Flushing the accumulator
+    clears the sections it contains so it is ready to accept the next text-section.
+    """
+
+    def __init__(self, maxlen: int) -> None:
+        # -- hard maximum length of the combined text, separators included --
+        self._maxlen = maxlen
+        # -- sections accumulated since the last flush, in the order they were added --
+        self._sections: List[_TextSection] = []
+
+    def add_section(self, section: _TextSection) -> None:
+        """Add a section to the accumulator for possible combination with next section."""
+        self._sections.append(section)
+
+    def flush(self) -> Iterator[_TextSection]:
+        """Generate all accumulated sections as a single combined section.
+
+        Generates nothing when the accumulator is empty. Because this is a generator, no work is
+        done until the returned iterator is advanced. The accumulator is emptied *before* the
+        combined section is yielded, so it is reset even when the caller takes that single item
+        and never resumes the iterator.
+        """
+        sections = self._sections
+
+        # -- nothing to do if no sections have been accumulated --
+        if not sections:
+            return
+
+        # -- otherwise combine all accumulated sections into one --
+        section = sections[0]
+        for other_section in sections[1:]:
+            section = section.combine(other_section)
+
+        # -- reset the accumulator (to empty) prior to yielding; code placed after a `yield` only
+        # -- runs if the consumer resumes the generator, which is not guaranteed --
+        sections.clear()
+
+        yield section
+
+    @property
+    def remaining_space(self) -> int:
+        """Maximum size of section that can be added without exceeding maxlen."""
+        return (
+            self._maxlen
+            if not self._sections
+            # -- an additional section will also incur an additional separator --
+            else self._maxlen - self.text_length - len(TEXT_SEPARATOR)
+        )
+
+    @property
+    def text_length(self) -> int:
+        """Size of concatenated text in all sections in accumulator, separators included."""
+        n = len(self._sections)
+        return (
+            0
+            if n == 0
+            # -- one separator goes between each pair of adjacent sections --
+            else sum(s.text_length for s in self._sections) + len(TEXT_SEPARATOR) * (n - 1)
+        )