diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1dc82c3e53..fd76ccbc63 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.11.5-dev1
+## 0.11.5-dev2
### Enhancements
diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py
index d2d4132eab..8988d6ded8 100644
--- a/test_unstructured/chunking/test_base.py
+++ b/test_unstructured/chunking/test_base.py
@@ -1,14 +1,35 @@
+# pyright: reportPrivateUsage=false
+
"""Unit-test suite for the `unstructured.chunking.base` module."""
from __future__ import annotations
+from typing import List
+
import pytest
-from unstructured.chunking.base import ChunkingOptions
+from unstructured.chunking.base import (
+ ChunkingOptions,
+ PreChunkBuilder,
+ PreChunkCombiner,
+ TablePreChunk,
+ TextPreChunk,
+ TextPreChunkAccumulator,
+)
+from unstructured.documents.elements import (
+ CompositeElement,
+ ElementMetadata,
+ PageBreak,
+ RegexMetadata,
+ Table,
+ TableChunk,
+ Text,
+ Title,
+)
class DescribeChunkingOptions:
- """Unit-test suite for `unstructured.chunking.model.ChunkingOptions objects."""
+ """Unit-test suite for `unstructured.chunking.base.ChunkingOptions objects."""
@pytest.mark.parametrize("max_characters", [0, -1, -42])
def it_rejects_max_characters_not_greater_than_zero(self, max_characters: int):
@@ -111,3 +132,847 @@ def it_silently_accepts_new_after_n_chars_greater_than_maxchars(self):
def it_knows_the_text_separator_string(self):
assert ChunkingOptions.new().text_separator == "\n\n"
+
+
+# ================================================================================================
+# PRE-CHUNK SUBTYPES
+# ================================================================================================
+
+
+class DescribeTablePreChunk:
+ """Unit-test suite for `unstructured.chunking.base.TablePreChunk objects."""
+
+ def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self):
+        html_table = (
+            "<table>\n"
+            "<thead>\n"
+            "<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
+            "</thead>\n"
+            "<tbody>\n"
+            "<tr><td>Lorem ipsum </td><td>adipiscing </td></tr>\n"
+            "</tbody>\n"
+            "</table>"
+        )
+ text_table = "Header Col 1 Header Col 2\n" "Lorem ipsum adipiscing"
+ pre_chunk = TablePreChunk(
+ Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
+ opts=ChunkingOptions.new(max_characters=175),
+ )
+
+ chunk_iter = pre_chunk.iter_chunks()
+
+ chunk = next(chunk_iter)
+ assert isinstance(chunk, Table)
+ assert chunk.text == "Header Col 1 Header Col 2\nLorem ipsum adipiscing"
+        assert chunk.metadata.text_as_html == (
+            "<table>\n"
+            "<thead>\n"
+            "<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
+            "</thead>\n"
+            "<tbody>\n"
+            "<tr><td>Lorem ipsum </td><td>adipiscing </td></tr>\n"
+            "</tbody>\n"
+            "</table>"
+        )
+ with pytest.raises(StopIteration):
+ next(chunk_iter)
+
+ def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
+ # fixed-overhead = 8+8+9+8+9+8 = 50
+ # per-row overhead = 27
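+        # -- (each of "<table>\n", "<thead>\n", "<tbody>\n", "</table>" is 8 chars; "</thead>\n"
+        # -- and "</tbody>\n" are 9; per row, "<tr><td>" + "</td><td>" + "</td></tr>" is 27) --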
+        html_table = (
+            "<table>\n"  # 8
+            "<thead>\n"  # 8
+            "<tr><th>Header Col 1   </th><th>Header Col 2  </th></tr>\n"
+            "</thead>\n"  # 9
+            "<tbody>\n"  # 8
+            "<tr><td>Lorem ipsum    </td><td>A Link example</td></tr>\n"
+            "<tr><td>Consectetur    </td><td>adipiscing elit</td></tr>\n"
+            "<tr><td>Nunc aliquam   </td><td>id enim nec molestie</td></tr>\n"
+            "<tr><td>Vivamus quis   </td><td>nunc ipsum donec ac fermentum</td></tr>\n"
+            "</tbody>\n"  # 9
+            "</table>"  # 8
+        )
+        text_table = (
+            "Header Col 1   Header Col 2\n"
+            "Lorem ipsum    dolor sit amet\n"
+            "Consectetur    adipiscing elit\n"
+            "Nunc aliquam   id enim nec molestie\n"
+            "Vivamus quis   nunc ipsum donec ac fermentum"
+        )
+ pre_chunk = TablePreChunk(
+ Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
+ opts=ChunkingOptions.new(max_characters=100),
+ )
+
+ chunk_iter = pre_chunk.iter_chunks()
+
+ chunk = next(chunk_iter)
+ assert isinstance(chunk, TableChunk)
+        assert chunk.text == (
+            "Header Col 1   Header Col 2\n"
+            "Lorem ipsum    dolor sit amet\n"
+            "Consectetur    adipiscing elit\n"
+            "Nunc aliqua"
+        )
+        assert chunk.metadata.text_as_html == (
+            "<table>\n"
+            "<thead>\n"
+            "<tr><th>Header Col 1   </th><th>Header Col 2  </th></tr>\n"
+            "</thead>\n"
+            "<tbody>\n"
+            "<tr><td>Lo"
+        )
+ # --
+ chunk = next(chunk_iter)
+ assert isinstance(chunk, TableChunk)
+ assert (
+ chunk.text == "m id enim nec molestie\nVivamus quis nunc ipsum donec ac fermentum"
+ )
+        assert chunk.metadata.text_as_html == (
+            "rem ipsum    </td><td>A Link example</td></tr>\n"
+            "<tr><td>Consectetur    </td><td>adipiscing elit</td><"
+        )
+ # -- note that text runs out but HTML continues because it's significantly longer. So two
+ # -- of these chunks have HTML but no text.
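+        # -- (the text fills only the first two chunks; the longer HTML spans all four) --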
+ chunk = next(chunk_iter)
+ assert isinstance(chunk, TableChunk)
+ assert chunk.text == ""
+        assert chunk.metadata.text_as_html == (
+            "/tr>\n"
+            "<tr><td>Nunc aliquam   </td><td>id enim nec molestie</td></tr>\n"
+            "<tr><td>Vivamus quis   </td><td>"
+        )
+ # --
+ chunk = next(chunk_iter)
+ assert isinstance(chunk, TableChunk)
+ assert chunk.text == ""
+        assert chunk.metadata.text_as_html == (
+            "nunc ipsum donec ac fermentum</td></tr>\n</tbody>\n</table>"
+        )
+ # --
+ with pytest.raises(StopIteration):
+ next(chunk_iter)
+
+
+class DescribeTextPreChunk:
+ """Unit-test suite for `unstructured.chunking.base.TextPreChunk objects."""
+
+ def it_can_combine_itself_with_another_TextPreChunk_instance(self):
+ """.combine() produces a new pre-chunk by appending the elements of `other_pre-chunk`.
+
+        Note that neither the original nor the other pre-chunk is mutated.
+ """
+ opts = ChunkingOptions.new()
+ pre_chunk = TextPreChunk(
+ [
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
+ Text("In rhoncus ipsum sed lectus porta volutpat."),
+ ],
+ opts=opts,
+ )
+ other_pre_chunk = TextPreChunk(
+ [
+ Text("Donec semper facilisis metus finibus malesuada."),
+ Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
+ ],
+ opts=opts,
+ )
+
+ new_pre_chunk = pre_chunk.combine(other_pre_chunk)
+
+ assert new_pre_chunk == TextPreChunk(
+ [
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
+ Text("In rhoncus ipsum sed lectus porta volutpat."),
+ Text("Donec semper facilisis metus finibus malesuada."),
+ Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
+ ],
+ opts=opts,
+ )
+ assert pre_chunk == TextPreChunk(
+ [
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
+ Text("In rhoncus ipsum sed lectus porta volutpat."),
+ ],
+ opts=opts,
+ )
+ assert other_pre_chunk == TextPreChunk(
+ [
+ Text("Donec semper facilisis metus finibus malesuada."),
+ Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
+ ],
+ opts=opts,
+ )
+
+ def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self):
+ pre_chunk = TextPreChunk(
+ [
+ Title("Introduction"),
+ Text(
+ "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
+ "lectus porta volutpat.",
+ ),
+ ],
+ opts=ChunkingOptions.new(max_characters=200),
+ )
+
+ chunk_iter = pre_chunk.iter_chunks()
+
+ chunk = next(chunk_iter)
+ assert chunk == CompositeElement(
+ "Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit."
+ " In rhoncus ipsum sedlectus porta volutpat.",
+ )
+ assert chunk.metadata is pre_chunk._consolidated_metadata
+
+ def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self):
+ # -- Chunk-splitting only occurs when a *single* element is too big to fit in the window.
+ # -- The pre-chunker will isolate that element in a pre_chunk of its own.
+ pre_chunk = TextPreChunk(
+ [
+ Text(
+ "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
+ " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
+ " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
+ " commodo consequat."
+ ),
+ ],
+ opts=ChunkingOptions.new(max_characters=200),
+ )
+
+ chunk_iter = pre_chunk.iter_chunks()
+
+ chunk = next(chunk_iter)
+ assert chunk == CompositeElement(
+ "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
+ " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
+ " veniam, quis nostrud exercitation ullamco laboris nisi ut a"
+ )
+ assert chunk.metadata is pre_chunk._consolidated_metadata
+ # --
+ chunk = next(chunk_iter)
+ assert chunk == CompositeElement("liquip ex ea commodo consequat.")
+ assert chunk.metadata is pre_chunk._consolidated_metadata
+ # --
+ with pytest.raises(StopIteration):
+ next(chunk_iter)
+
+ def it_knows_the_length_of_the_combined_text_of_its_elements_which_is_the_chunk_size(self):
+ """.text_length is the size of chunk this pre-chunk will produce (before any splitting)."""
+ pre_chunk = TextPreChunk(
+ [PageBreak(""), Text("foo"), Text("bar")], opts=ChunkingOptions.new()
+ )
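+        # -- 3 ("foo") + 2 (separator) + 3 ("bar"); the empty PageBreak contributes no text --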
+ assert pre_chunk.text_length == 8
+
+ def it_extracts_all_populated_metadata_values_from_the_elements_to_help(self):
+ pre_chunk = TextPreChunk(
+ [
+ Title(
+ "Lorem Ipsum",
+ metadata=ElementMetadata(
+ category_depth=0,
+ filename="foo.docx",
+ languages=["lat"],
+ parent_id="f87731e0",
+ ),
+ ),
+ Text(
+ "'Lorem ipsum dolor' means 'Thank you very much' in Latin.",
+ metadata=ElementMetadata(
+ category_depth=1,
+ filename="foo.docx",
+ image_path="sprite.png",
+ languages=["lat", "eng"],
+ ),
+ ),
+ ],
+ opts=ChunkingOptions.new(),
+ )
+
+ assert pre_chunk._all_metadata_values == {
+ # -- scalar values are accumulated in a list in element order --
+ "category_depth": [0, 1],
+ # -- all values are accumulated, not only unique ones --
+ "filename": ["foo.docx", "foo.docx"],
+ # -- list-type fields produce a list of lists --
+ "languages": [["lat"], ["lat", "eng"]],
+ # -- fields that only appear in some elements are captured --
+ "image_path": ["sprite.png"],
+ "parent_id": ["f87731e0"],
+ # -- A `None` value never appears, neither does a field-name with an empty list --
+ }
+
+ def but_it_discards_ad_hoc_metadata_fields_during_consolidation(self):
+ metadata = ElementMetadata(
+ category_depth=0,
+ filename="foo.docx",
+ languages=["lat"],
+ parent_id="f87731e0",
+ )
+ metadata.coefficient = 0.62
+ metadata_2 = ElementMetadata(
+ category_depth=1,
+ filename="foo.docx",
+ image_path="sprite.png",
+ languages=["lat", "eng"],
+ )
+ metadata_2.quotient = 1.74
+
+ pre_chunk = TextPreChunk(
+ [
+ Title("Lorem Ipsum", metadata=metadata),
+ Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2),
+ ],
+ opts=ChunkingOptions.new(),
+ )
+
+ # -- ad-hoc fields "coefficient" and "quotient" do not appear --
+ assert pre_chunk._all_metadata_values == {
+ "category_depth": [0, 1],
+ "filename": ["foo.docx", "foo.docx"],
+ "image_path": ["sprite.png"],
+ "languages": [["lat"], ["lat", "eng"]],
+ "parent_id": ["f87731e0"],
+ }
+
+ def it_consolidates_regex_metadata_in_a_field_specific_way(self):
+ """regex_metadata of chunk is combined regex_metadatas of its elements.
+
+ Also, the `start` and `end` offsets of each regex-match are adjusted to reflect their new
+ position in the chunk after element text has been concatenated.
+ """
+ pre_chunk = TextPreChunk(
+ [
+ Title(
+ "Lorem Ipsum",
+ metadata=ElementMetadata(
+ regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
+ ),
+ ),
+ Text(
+ "Lorem ipsum dolor sit amet consectetur adipiscing elit.",
+ metadata=ElementMetadata(
+ regex_metadata={
+ "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
+ "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
+ },
+ ),
+ ),
+ Text(
+ "In rhoncus ipsum sed lectus porta volutpat.",
+ metadata=ElementMetadata(
+ regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
+ ),
+ ),
+ ],
+ opts=ChunkingOptions.new(),
+ )
+
+ regex_metadata = pre_chunk._consolidated_regex_meta
+
+ assert regex_metadata == {
+ "dolor": [RegexMetadata(text="dolor", start=25, end=30)],
+ "ipsum": [
+ RegexMetadata(text="Ipsum", start=6, end=11),
+ RegexMetadata(text="ipsum", start=19, end=24),
+ RegexMetadata(text="ipsum", start=81, end=86),
+ ],
+ }
+
+ def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strategies(self):
+ """._meta_kwargs is used like `ElementMetadata(**self._meta_kwargs)` to construct metadata.
+
+ Only non-None fields should appear in the dict and each field value should be the
+ consolidation of the values across the pre_chunk elements.
+ """
+ pre_chunk = TextPreChunk(
+ [
+ PageBreak(""),
+ Title(
+ "Lorem Ipsum",
+ metadata=ElementMetadata(
+ filename="foo.docx",
+ # -- category_depth has DROP strategy so doesn't appear in result --
+ category_depth=0,
+ emphasized_text_contents=["Lorem", "Ipsum"],
+ emphasized_text_tags=["b", "i"],
+ languages=["lat"],
+ regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
+ ),
+ ),
+ Text(
+ "'Lorem ipsum dolor' means 'Thank you very much' in Latin.",
+ metadata=ElementMetadata(
+ # -- filename change doesn't happen IRL but demonstrates FIRST strategy --
+ filename="bar.docx",
+ # -- emphasized_text_contents has LIST_CONCATENATE strategy, so "Lorem"
+ # -- appears twice in consolidated-meta (as it should) and length matches
+ # -- that of emphasized_text_tags both before and after consolidation.
+ emphasized_text_contents=["Lorem", "ipsum"],
+ emphasized_text_tags=["i", "b"],
+ # -- languages has LIST_UNIQUE strategy, so "lat(in)" appears only once --
+ languages=["eng", "lat"],
+ # -- regex_metadata has its own dedicated consolidation-strategy (REGEX) --
+ regex_metadata={
+ "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
+ "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
+ },
+ ),
+ ),
+ ],
+ opts=ChunkingOptions.new(),
+ )
+
+ meta_kwargs = pre_chunk._meta_kwargs
+
+ assert meta_kwargs == {
+ "filename": "foo.docx",
+ "emphasized_text_contents": ["Lorem", "Ipsum", "Lorem", "ipsum"],
+ "emphasized_text_tags": ["b", "i", "i", "b"],
+ "languages": ["lat", "eng"],
+ "regex_metadata": {
+ "ipsum": [
+ RegexMetadata(text="Ipsum", start=6, end=11),
+ RegexMetadata(text="ipsum", start=19, end=24),
+ ],
+ "dolor": [RegexMetadata(text="dolor", start=25, end=30)],
+ },
+ }
+
+ @pytest.mark.parametrize(
+ ("elements", "expected_value"),
+ [
+ ([Text("foo"), Text("bar")], "foo\n\nbar"),
+ ([Text("foo"), PageBreak(""), Text("bar")], "foo\n\nbar"),
+ ([PageBreak(""), Text("foo"), Text("bar")], "foo\n\nbar"),
+ ([Text("foo"), Text("bar"), PageBreak("")], "foo\n\nbar"),
+ ],
+ )
+ def it_knows_the_concatenated_text_of_the_pre_chunk(
+ self, elements: List[Text], expected_value: str
+ ):
+ """._text is the "joined" text of the pre-chunk elements.
+
+ The text-segment contributed by each element is separated from the next by a blank line
+ ("\n\n"). An element that contributes no text does not give rise to a separator.
+ """
+ pre_chunk = TextPreChunk(elements, opts=ChunkingOptions.new())
+ assert pre_chunk._text == expected_value
+
+
+# ================================================================================================
+# PRE-CHUNKING ACCUMULATORS
+# ================================================================================================
+
+
+class DescribePreChunkBuilder:
+ """Unit-test suite for `unstructured.chunking.base.PreChunkBuilder`."""
+
+ def it_is_empty_on_construction(self):
+ builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
+
+ assert builder.text_length == 0
+ assert builder.remaining_space == 50
+
+ def it_accumulates_elements_added_to_it(self):
+ builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
+
+ builder.add_element(Title("Introduction"))
+ assert builder.text_length == 12
+ assert builder.remaining_space == 136
+
+ builder.add_element(
+ Text(
+ "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
+ "lectus porta volutpat.",
+ ),
+ )
+ assert builder.text_length == 112
+ assert builder.remaining_space == 36
+
+ def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
+ builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
+ builder.add_element(Title("Introduction"))
+ builder.add_element(
+ Text(
+ "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
+ "lectus porta volutpat.",
+ ),
+ )
+
+ pre_chunk = next(builder.flush())
+
+ assert isinstance(pre_chunk, TextPreChunk)
+ assert pre_chunk._elements == [
+ Title("Introduction"),
+ Text(
+ "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
+ "lectus porta volutpat.",
+ ),
+ ]
+ assert builder.text_length == 0
+ assert builder.remaining_space == 150
+
+ def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
+ builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
+
+ pre_chunks = list(builder.flush())
+
+ assert pre_chunks == []
+ assert builder.text_length == 0
+ assert builder.remaining_space == 150
+
+ def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
+ builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
+ builder.add_element(Text("abcde"))
+ builder.add_element(Text("fghij"))
+
+ # -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
+ # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
+ assert builder.text_length == 12
+ # -- .remaining_space is reduced by the length (2) of the trailing separator which would go
+ # -- between the current text and that of the next element if one was added.
+ # -- So 50 - 12 - 2 = 36 here, not 50 - 12 = 38
+ assert builder.remaining_space == 36
+
+
+class DescribePreChunkCombiner:
+ """Unit-test suite for `unstructured.chunking.base.PreChunkCombiner`."""
+
+ def it_combines_sequential_small_text_pre_chunks(self):
+ opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
+ pre_chunks = [
+ TextPreChunk(
+ [
+ Title("Lorem Ipsum"), # 11
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
+ ],
+ opts=opts,
+ ),
+ TextPreChunk(
+ [
+ Title("Mauris Nec"), # 10
+ Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
+ ],
+ opts=opts,
+ ),
+ TextPreChunk(
+ [
+ Title("Sed Orci"), # 8
+ Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
+ ],
+ opts=opts,
+ ),
+ ]
+
+ pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()
+
+ pre_chunk = next(pre_chunk_iter)
+ assert isinstance(pre_chunk, TextPreChunk)
+ assert pre_chunk._elements == [
+ Title("Lorem Ipsum"),
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
+ Title("Mauris Nec"),
+ Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
+ Title("Sed Orci"),
+ Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
+ ]
+ with pytest.raises(StopIteration):
+ next(pre_chunk_iter)
+
+ def but_it_does_not_combine_table_pre_chunks(self):
+ opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
+ pre_chunks = [
+ TextPreChunk(
+ [
+ Title("Lorem Ipsum"),
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
+ ],
+ opts=opts,
+ ),
+ TablePreChunk(Table("Heading\nCell text"), opts=opts),
+ TextPreChunk(
+ [
+ Title("Mauris Nec"),
+ Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
+ ],
+ opts=opts,
+ ),
+ ]
+
+ pre_chunk_iter = PreChunkCombiner(
+ pre_chunks, ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
+ ).iter_combined_pre_chunks()
+
+ pre_chunk = next(pre_chunk_iter)
+ assert isinstance(pre_chunk, TextPreChunk)
+ assert pre_chunk._elements == [
+ Title("Lorem Ipsum"),
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
+ ]
+ # --
+ pre_chunk = next(pre_chunk_iter)
+ assert isinstance(pre_chunk, TablePreChunk)
+ assert pre_chunk._table == Table("Heading\nCell text")
+ # --
+ pre_chunk = next(pre_chunk_iter)
+ assert isinstance(pre_chunk, TextPreChunk)
+ assert pre_chunk._elements == [
+ Title("Mauris Nec"),
+ Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
+ ]
+ # --
+ with pytest.raises(StopIteration):
+ next(pre_chunk_iter)
+
+ def it_respects_the_specified_combination_threshold(self):
+ opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=80)
+ pre_chunks = [
+ TextPreChunk( # 68
+ [
+ Title("Lorem Ipsum"), # 11
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
+ ],
+ opts=opts,
+ ),
+ TextPreChunk( # 71
+ [
+ Title("Mauris Nec"), # 10
+ Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
+ ],
+ opts=opts,
+ ),
+ # -- len == 139
+ TextPreChunk(
+ [
+ Title("Sed Orci"), # 8
+ Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
+ ],
+ opts=opts,
+ ),
+ ]
+
+ pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()
+
+ pre_chunk = next(pre_chunk_iter)
+ assert isinstance(pre_chunk, TextPreChunk)
+ assert pre_chunk._elements == [
+ Title("Lorem Ipsum"),
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
+ Title("Mauris Nec"),
+ Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
+ ]
+ # --
+ pre_chunk = next(pre_chunk_iter)
+ assert isinstance(pre_chunk, TextPreChunk)
+ assert pre_chunk._elements == [
+ Title("Sed Orci"),
+ Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
+ ]
+ # --
+ with pytest.raises(StopIteration):
+ next(pre_chunk_iter)
+
+ def it_respects_the_hard_maximum_window_length(self):
+ opts = ChunkingOptions.new(max_characters=200, combine_text_under_n_chars=200)
+ pre_chunks = [
+ TextPreChunk( # 68
+ [
+ Title("Lorem Ipsum"), # 11
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
+ ],
+ opts=opts,
+ ),
+ TextPreChunk( # 71
+ [
+ Title("Mauris Nec"), # 10
+ Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
+ ],
+ opts=opts,
+ ),
+ # -- len == 139
+ TextPreChunk(
+ [
+ Title("Sed Orci"), # 8
+ Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
+ ],
+ opts=opts,
+ ),
+ # -- len == 214
+ ]
+
+ pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()
+
+ pre_chunk = next(pre_chunk_iter)
+ assert isinstance(pre_chunk, TextPreChunk)
+ assert pre_chunk._elements == [
+ Title("Lorem Ipsum"),
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
+ Title("Mauris Nec"),
+ Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
+ ]
+ # --
+ pre_chunk = next(pre_chunk_iter)
+ assert isinstance(pre_chunk, TextPreChunk)
+ assert pre_chunk._elements == [
+ Title("Sed Orci"),
+ Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
+ ]
+ # --
+ with pytest.raises(StopIteration):
+ next(pre_chunk_iter)
+
+ def it_accommodates_and_isolates_an_oversized_pre_chunk(self):
+ """Such as occurs when a single element exceeds the window size."""
+ opts = ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
+ pre_chunks = [
+ TextPreChunk([Title("Lorem Ipsum")], opts=opts),
+ TextPreChunk( # 179
+ [
+ Text(
+ "Lorem ipsum dolor sit amet consectetur adipiscing elit." # 55
+ " Mauris nec urna non augue vulputate consequat eget et nisi." # 60
+ " Sed orci quam, eleifend sit amet vehicula, elementum ultricies." # 64
+ )
+ ],
+ opts=opts,
+ ),
+ TextPreChunk([Title("Vulputate Consequat")], opts=opts),
+ ]
+
+ pre_chunk_iter = PreChunkCombiner(
+ pre_chunks, ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
+ ).iter_combined_pre_chunks()
+
+ pre_chunk = next(pre_chunk_iter)
+ assert isinstance(pre_chunk, TextPreChunk)
+ assert pre_chunk._elements == [Title("Lorem Ipsum")]
+ # --
+ pre_chunk = next(pre_chunk_iter)
+ assert isinstance(pre_chunk, TextPreChunk)
+ assert pre_chunk._elements == [
+ Text(
+ "Lorem ipsum dolor sit amet consectetur adipiscing elit."
+ " Mauris nec urna non augue vulputate consequat eget et nisi."
+ " Sed orci quam, eleifend sit amet vehicula, elementum ultricies."
+ )
+ ]
+ # --
+ pre_chunk = next(pre_chunk_iter)
+ assert isinstance(pre_chunk, TextPreChunk)
+ assert pre_chunk._elements == [Title("Vulputate Consequat")]
+ # --
+ with pytest.raises(StopIteration):
+ next(pre_chunk_iter)
+
+
+class DescribeTextPreChunkAccumulator:
+ """Unit-test suite for `unstructured.chunking.base.TextPreChunkAccumulator`."""
+
+ def it_is_empty_on_construction(self):
+ accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=100))
+
+ assert accum.text_length == 0
+ assert accum.remaining_space == 100
+
+ def it_accumulates_pre_chunks_added_to_it(self):
+ opts = ChunkingOptions.new(max_characters=500)
+ accum = TextPreChunkAccumulator(opts=opts)
+
+ accum.add_pre_chunk(
+ TextPreChunk(
+ [
+ Title("Lorem Ipsum"),
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
+ ],
+ opts=opts,
+ )
+ )
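+        # -- 11 ("Lorem Ipsum") + 2 (separator) + 55 --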
+ assert accum.text_length == 68
+ assert accum.remaining_space == 430
+
+ accum.add_pre_chunk(
+ TextPreChunk(
+ [
+ Title("Mauris Nec"),
+ Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
+ ],
+ opts=opts,
+ )
+ )
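+        # -- 68 + 2 (separator) + 71 --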
+ assert accum.text_length == 141
+ assert accum.remaining_space == 357
+
+ def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
+ opts = ChunkingOptions.new(max_characters=150)
+ accum = TextPreChunkAccumulator(opts=opts)
+ accum.add_pre_chunk(
+ TextPreChunk(
+ [
+ Title("Lorem Ipsum"),
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
+ ],
+ opts=opts,
+ )
+ )
+ accum.add_pre_chunk(
+ TextPreChunk(
+ [
+ Title("Mauris Nec"),
+ Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
+ ],
+ opts=opts,
+ )
+ )
+ accum.add_pre_chunk(
+ TextPreChunk(
+ [
+ Title("Sed Orci"),
+ Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
+ ],
+ opts=opts,
+ )
+ )
+
+ pre_chunk_iter = accum.flush()
+
+ # -- iterator generates exactly one pre_chunk --
+ pre_chunk = next(pre_chunk_iter)
+ with pytest.raises(StopIteration):
+ next(pre_chunk_iter)
+        # -- and it is a TextPreChunk containing all the elements --
+ assert isinstance(pre_chunk, TextPreChunk)
+ assert pre_chunk._elements == [
+ Title("Lorem Ipsum"),
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
+ Title("Mauris Nec"),
+ Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
+ Title("Sed Orci"),
+ Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
+ ]
+ assert accum.text_length == 0
+ assert accum.remaining_space == 150
+
+ def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
+ accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=150))
+
+ pre_chunks = list(accum.flush())
+
+ assert pre_chunks == []
+ assert accum.text_length == 0
+ assert accum.remaining_space == 150
+
+ def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
+ opts = ChunkingOptions.new(max_characters=100)
+ accum = TextPreChunkAccumulator(opts=opts)
+ accum.add_pre_chunk(TextPreChunk([Text("abcde")], opts=opts))
+ accum.add_pre_chunk(TextPreChunk([Text("fghij")], opts=opts))
+
+ # -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
+ # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
+ assert accum.text_length == 12
+ # -- .remaining_space is reduced by the length (2) of the trailing separator which would
+ # -- go between the current text and that of the next pre-chunk if one was added.
+ # -- So 100 - 12 - 2 = 86 here, not 100 - 12 = 88
+ assert accum.remaining_space == 86
diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py
index df12220b6e..dfb196b069 100644
--- a/test_unstructured/chunking/test_title.py
+++ b/test_unstructured/chunking/test_title.py
@@ -4,16 +4,8 @@
import pytest
-from unstructured.chunking.base import ChunkingOptions
-from unstructured.chunking.title import (
- PreChunkCombiner,
- TablePreChunk,
- TextPreChunk,
- TextPreChunkAccumulator,
- TextPreChunkBuilder,
- _split_elements_by_title_and_table,
- chunk_by_title,
-)
+from unstructured.chunking.base import ChunkingOptions, TablePreChunk, TextPreChunk
+from unstructured.chunking.title import _split_elements_by_title_and_table, chunk_by_title
from unstructured.documents.coordinates import CoordinateSystem
from unstructured.documents.elements import (
CheckBox,
@@ -22,10 +14,8 @@
Element,
ElementMetadata,
ListItem,
- PageBreak,
RegexMetadata,
Table,
- TableChunk,
Text,
Title,
)
@@ -552,843 +542,3 @@ def test_it_considers_separator_length_when_pre_chunking():
),
CompositeElement("Minimize mid-text chunk-splitting"),
]
-
-
-# == PreChunks ===================================================================================
-
-
-class DescribeTablePreChunk:
- """Unit-test suite for `unstructured.chunking.title.TablePreChunk objects."""
-
- def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self):
-        html_table = (
-            "<table>\n"
-            "<thead>\n"
-            "<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
-            "</thead>\n"
-            "<tbody>\n"
-            "<tr><td>Lorem ipsum </td><td>adipiscing </td></tr>\n"
-            "</tbody>\n"
-            "</table>"
-        )
- text_table = "Header Col 1 Header Col 2\n" "Lorem ipsum adipiscing"
- pre_chunk = TablePreChunk(
- Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
- opts=ChunkingOptions.new(max_characters=175),
- )
-
- chunk_iter = pre_chunk.iter_chunks()
-
- chunk = next(chunk_iter)
- assert isinstance(chunk, Table)
- assert chunk.text == "Header Col 1 Header Col 2\nLorem ipsum adipiscing"
-        assert chunk.metadata.text_as_html == (
-            "<table>\n"
-            "<thead>\n"
-            "<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
-            "</thead>\n"
-            "<tbody>\n"
-            "<tr><td>Lorem ipsum </td><td>adipiscing </td></tr>\n"
-            "</tbody>\n"
-            "</table>"
-        )
- with pytest.raises(StopIteration):
- next(chunk_iter)
-
- def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
- # fixed-overhead = 8+8+9+8+9+8 = 50
- # per-row overhead = 27
-        html_table = (
-            "<table>\n"  # 8
-            "<thead>\n"  # 8
-            "<tr><th>Header Col 1   </th><th>Header Col 2  </th></tr>\n"
-            "</thead>\n"  # 9
-            "<tbody>\n"  # 8
-            "<tr><td>Lorem ipsum    </td><td>A Link example</td></tr>\n"
-            "<tr><td>Consectetur    </td><td>adipiscing elit</td></tr>\n"
-            "<tr><td>Nunc aliquam   </td><td>id enim nec molestie</td></tr>\n"
-            "<tr><td>Vivamus quis   </td><td>nunc ipsum donec ac fermentum</td></tr>\n"
-            "</tbody>\n"  # 9
-            "</table>"  # 8
-        )
-        text_table = (
-            "Header Col 1   Header Col 2\n"
-            "Lorem ipsum    dolor sit amet\n"
-            "Consectetur    adipiscing elit\n"
-            "Nunc aliquam   id enim nec molestie\n"
-            "Vivamus quis   nunc ipsum donec ac fermentum"
-        )
- pre_chunk = TablePreChunk(
- Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
- opts=ChunkingOptions.new(max_characters=100),
- )
-
- chunk_iter = pre_chunk.iter_chunks()
-
- chunk = next(chunk_iter)
- assert isinstance(chunk, TableChunk)
-        assert chunk.text == (
-            "Header Col 1   Header Col 2\n"
-            "Lorem ipsum    dolor sit amet\n"
-            "Consectetur    adipiscing elit\n"
-            "Nunc aliqua"
-        )
-        assert chunk.metadata.text_as_html == (
-            "<table>\n"
-            "<thead>\n"
-            "<tr><th>Header Col 1   </th><th>Header Col 2  </th></tr>\n"
-            "</thead>\n"
-            "<tbody>\n"
-            "<tr><td>Lo"
-        )
- # --
- chunk = next(chunk_iter)
- assert isinstance(chunk, TableChunk)
- assert (
- chunk.text == "m id enim nec molestie\nVivamus quis nunc ipsum donec ac fermentum"
- )
-        assert chunk.metadata.text_as_html == (
-            "rem ipsum    </td><td>A Link example</td></tr>\n"
-            "<tr><td>Consectetur    </td><td>adipiscing elit</td><"
-        )
- # -- note that text runs out but HTML continues because it's significantly longer. So two
- # -- of these chunks have HTML but no text.
- chunk = next(chunk_iter)
- assert isinstance(chunk, TableChunk)
- assert chunk.text == ""
-        assert chunk.metadata.text_as_html == (
-            "/tr>\n"
-            "<tr><td>Nunc aliquam   </td><td>id enim nec molestie</td></tr>\n"
-            "<tr><td>Vivamus quis   </td><td>"
-        )
- # --
- chunk = next(chunk_iter)
- assert isinstance(chunk, TableChunk)
- assert chunk.text == ""
-        assert chunk.metadata.text_as_html == (
-            "nunc ipsum donec ac fermentum</td></tr>\n</tbody>\n</table>"
-        )
- # --
- with pytest.raises(StopIteration):
- next(chunk_iter)
-
-
-class DescribeTextPreChunk:
- """Unit-test suite for `unstructured.chunking.title.TextPreChunk objects."""
-
- def it_can_combine_itself_with_another_TextPreChunk_instance(self):
- """.combine() produces a new pre-chunk by appending the elements of `other_pre-chunk`.
-
- Note that neither the original or other pre_chunk are mutated.
- """
- opts = ChunkingOptions.new()
- pre_chunk = TextPreChunk(
- [
- Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
- Text("In rhoncus ipsum sed lectus porta volutpat."),
- ],
- opts=opts,
- )
- other_pre_chunk = TextPreChunk(
- [
- Text("Donec semper facilisis metus finibus malesuada."),
- Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
- ],
- opts=opts,
- )
-
- new_pre_chunk = pre_chunk.combine(other_pre_chunk)
-
- assert new_pre_chunk == TextPreChunk(
- [
- Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
- Text("In rhoncus ipsum sed lectus porta volutpat."),
- Text("Donec semper facilisis metus finibus malesuada."),
- Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
- ],
- opts=opts,
- )
- assert pre_chunk == TextPreChunk(
- [
- Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
- Text("In rhoncus ipsum sed lectus porta volutpat."),
- ],
- opts=opts,
- )
- assert other_pre_chunk == TextPreChunk(
- [
- Text("Donec semper facilisis metus finibus malesuada."),
- Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
- ],
- opts=opts,
- )
-
- def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self):
- pre_chunk = TextPreChunk(
- [
- Title("Introduction"),
- Text(
- "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
- "lectus porta volutpat.",
- ),
- ],
- opts=ChunkingOptions.new(max_characters=200),
- )
-
- chunk_iter = pre_chunk.iter_chunks()
-
- chunk = next(chunk_iter)
- assert chunk == CompositeElement(
- "Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit."
- " In rhoncus ipsum sedlectus porta volutpat.",
- )
- assert chunk.metadata is pre_chunk._consolidated_metadata
-
- def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self):
- # -- Chunk-splitting only occurs when a *single* element is too big to fit in the window.
- # -- The pre-chunker will isolate that element in a pre_chunk of its own.
- pre_chunk = TextPreChunk(
- [
- Text(
- "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
- " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
- " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
- " commodo consequat."
- ),
- ],
- opts=ChunkingOptions.new(max_characters=200),
- )
-
- chunk_iter = pre_chunk.iter_chunks()
-
- chunk = next(chunk_iter)
- assert chunk == CompositeElement(
- "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
- " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
- " veniam, quis nostrud exercitation ullamco laboris nisi ut a"
- )
- assert chunk.metadata is pre_chunk._consolidated_metadata
- # --
- chunk = next(chunk_iter)
- assert chunk == CompositeElement("liquip ex ea commodo consequat.")
- assert chunk.metadata is pre_chunk._consolidated_metadata
- # --
- with pytest.raises(StopIteration):
- next(chunk_iter)
-
- def it_knows_the_length_of_the_combined_text_of_its_elements_which_is_the_chunk_size(self):
- """.text_length is the size of chunk this pre-chunk will produce (before any splitting)."""
- pre_chunk = TextPreChunk(
- [PageBreak(""), Text("foo"), Text("bar")], opts=ChunkingOptions.new()
- )
- assert pre_chunk.text_length == 8
-
- def it_extracts_all_populated_metadata_values_from_the_elements_to_help(self):
- pre_chunk = TextPreChunk(
- [
- Title(
- "Lorem Ipsum",
- metadata=ElementMetadata(
- category_depth=0,
- filename="foo.docx",
- languages=["lat"],
- parent_id="f87731e0",
- ),
- ),
- Text(
- "'Lorem ipsum dolor' means 'Thank you very much' in Latin.",
- metadata=ElementMetadata(
- category_depth=1,
- filename="foo.docx",
- image_path="sprite.png",
- languages=["lat", "eng"],
- ),
- ),
- ],
- opts=ChunkingOptions.new(),
- )
-
- assert pre_chunk._all_metadata_values == {
- # -- scalar values are accumulated in a list in element order --
- "category_depth": [0, 1],
- # -- all values are accumulated, not only unique ones --
- "filename": ["foo.docx", "foo.docx"],
- # -- list-type fields produce a list of lists --
- "languages": [["lat"], ["lat", "eng"]],
- # -- fields that only appear in some elements are captured --
- "image_path": ["sprite.png"],
- "parent_id": ["f87731e0"],
- # -- A `None` value never appears, neither does a field-name with an empty list --
- }
-
- def but_it_discards_ad_hoc_metadata_fields_during_consolidation(self):
- metadata = ElementMetadata(
- category_depth=0,
- filename="foo.docx",
- languages=["lat"],
- parent_id="f87731e0",
- )
- metadata.coefficient = 0.62
- metadata_2 = ElementMetadata(
- category_depth=1,
- filename="foo.docx",
- image_path="sprite.png",
- languages=["lat", "eng"],
- )
- metadata_2.quotient = 1.74
-
- pre_chunk = TextPreChunk(
- [
- Title("Lorem Ipsum", metadata=metadata),
- Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2),
- ],
- opts=ChunkingOptions.new(),
- )
-
- # -- ad-hoc fields "coefficient" and "quotient" do not appear --
- assert pre_chunk._all_metadata_values == {
- "category_depth": [0, 1],
- "filename": ["foo.docx", "foo.docx"],
- "image_path": ["sprite.png"],
- "languages": [["lat"], ["lat", "eng"]],
- "parent_id": ["f87731e0"],
- }
-
- def it_consolidates_regex_metadata_in_a_field_specific_way(self):
- """regex_metadata of chunk is combined regex_metadatas of its elements.
-
- Also, the `start` and `end` offsets of each regex-match are adjusted to reflect their new
- position in the chunk after element text has been concatenated.
- """
- pre_chunk = TextPreChunk(
- [
- Title(
- "Lorem Ipsum",
- metadata=ElementMetadata(
- regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
- ),
- ),
- Text(
- "Lorem ipsum dolor sit amet consectetur adipiscing elit.",
- metadata=ElementMetadata(
- regex_metadata={
- "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
- "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
- },
- ),
- ),
- Text(
- "In rhoncus ipsum sed lectus porta volutpat.",
- metadata=ElementMetadata(
- regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
- ),
- ),
- ],
- opts=ChunkingOptions.new(),
- )
-
- regex_metadata = pre_chunk._consolidated_regex_meta
-
- assert regex_metadata == {
- "dolor": [RegexMetadata(text="dolor", start=25, end=30)],
- "ipsum": [
- RegexMetadata(text="Ipsum", start=6, end=11),
- RegexMetadata(text="ipsum", start=19, end=24),
- RegexMetadata(text="ipsum", start=81, end=86),
- ],
- }
-
- def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strategies(self):
- """._meta_kwargs is used like `ElementMetadata(**self._meta_kwargs)` to construct metadata.
-
- Only non-None fields should appear in the dict and each field value should be the
- consolidation of the values across the pre_chunk elements.
- """
- pre_chunk = TextPreChunk(
- [
- PageBreak(""),
- Title(
- "Lorem Ipsum",
- metadata=ElementMetadata(
- filename="foo.docx",
- # -- category_depth has DROP strategy so doesn't appear in result --
- category_depth=0,
- emphasized_text_contents=["Lorem", "Ipsum"],
- emphasized_text_tags=["b", "i"],
- languages=["lat"],
- regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
- ),
- ),
- Text(
- "'Lorem ipsum dolor' means 'Thank you very much' in Latin.",
- metadata=ElementMetadata(
- # -- filename change doesn't happen IRL but demonstrates FIRST strategy --
- filename="bar.docx",
- # -- emphasized_text_contents has LIST_CONCATENATE strategy, so "Lorem"
- # -- appears twice in consolidated-meta (as it should) and length matches
- # -- that of emphasized_text_tags both before and after consolidation.
- emphasized_text_contents=["Lorem", "ipsum"],
- emphasized_text_tags=["i", "b"],
- # -- languages has LIST_UNIQUE strategy, so "lat(in)" appears only once --
- languages=["eng", "lat"],
- # -- regex_metadata has its own dedicated consolidation-strategy (REGEX) --
- regex_metadata={
- "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
- "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
- },
- ),
- ),
- ],
- opts=ChunkingOptions.new(),
- )
-
- meta_kwargs = pre_chunk._meta_kwargs
-
- assert meta_kwargs == {
- "filename": "foo.docx",
- "emphasized_text_contents": ["Lorem", "Ipsum", "Lorem", "ipsum"],
- "emphasized_text_tags": ["b", "i", "i", "b"],
- "languages": ["lat", "eng"],
- "regex_metadata": {
- "ipsum": [
- RegexMetadata(text="Ipsum", start=6, end=11),
- RegexMetadata(text="ipsum", start=19, end=24),
- ],
- "dolor": [RegexMetadata(text="dolor", start=25, end=30)],
- },
- }
-
- @pytest.mark.parametrize(
- ("elements", "expected_value"),
- [
- ([Text("foo"), Text("bar")], "foo\n\nbar"),
- ([Text("foo"), PageBreak(""), Text("bar")], "foo\n\nbar"),
- ([PageBreak(""), Text("foo"), Text("bar")], "foo\n\nbar"),
- ([Text("foo"), Text("bar"), PageBreak("")], "foo\n\nbar"),
- ],
- )
- def it_knows_the_concatenated_text_of_the_pre_chunk(
- self, elements: List[Text], expected_value: str
- ):
- """._text is the "joined" text of the pre-chunk elements.
-
- The text-segment contributed by each element is separated from the next by a blank line
- ("\n\n"). An element that contributes no text does not give rise to a separator.
- """
- pre_chunk = TextPreChunk(elements, opts=ChunkingOptions.new())
- assert pre_chunk._text == expected_value
-
-
-class DescribeTextPreChunkBuilder:
- """Unit-test suite for `unstructured.chunking.title.TextPreChunkBuilder`."""
-
- def it_is_empty_on_construction(self):
- builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
-
- assert builder.text_length == 0
- assert builder.remaining_space == 50
-
- def it_accumulates_elements_added_to_it(self):
- builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
-
- builder.add_element(Title("Introduction"))
- assert builder.text_length == 12
- assert builder.remaining_space == 136
-
- builder.add_element(
- Text(
- "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
- "lectus porta volutpat.",
- ),
- )
- assert builder.text_length == 112
- assert builder.remaining_space == 36
-
- def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
- builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
- builder.add_element(Title("Introduction"))
- builder.add_element(
- Text(
- "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
- "lectus porta volutpat.",
- ),
- )
-
- pre_chunk = next(builder.flush())
-
- assert isinstance(pre_chunk, TextPreChunk)
- assert pre_chunk._elements == [
- Title("Introduction"),
- Text(
- "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
- "lectus porta volutpat.",
- ),
- ]
- assert builder.text_length == 0
- assert builder.remaining_space == 150
-
- def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
- builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
-
- pre_chunks = list(builder.flush())
-
- assert pre_chunks == []
- assert builder.text_length == 0
- assert builder.remaining_space == 150
-
- def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
- builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
- builder.add_element(Text("abcde"))
- builder.add_element(Text("fghij"))
-
- # -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
- # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
- assert builder.text_length == 12
- # -- .remaining_space is reduced by the length (2) of the trailing separator which would go
- # -- between the current text and that of the next element if one was added.
- # -- So 50 - 12 - 2 = 36 here, not 50 - 12 = 38
- assert builder.remaining_space == 36
-
-
-# == PreChunkCombiner =============================================================================
-
-
-class DescribePreChunkCombiner:
- """Unit-test suite for `unstructured.chunking.title.PreChunkCombiner`."""
-
- def it_combines_sequential_small_text_pre_chunks(self):
- opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
- pre_chunks = [
- TextPreChunk(
- [
- Title("Lorem Ipsum"), # 11
- Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
- ],
- opts=opts,
- ),
- TextPreChunk(
- [
- Title("Mauris Nec"), # 10
- Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
- ],
- opts=opts,
- ),
- TextPreChunk(
- [
- Title("Sed Orci"), # 8
- Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
- ],
- opts=opts,
- ),
- ]
-
- pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()
-
- pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TextPreChunk)
- assert pre_chunk._elements == [
- Title("Lorem Ipsum"),
- Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
- Title("Mauris Nec"),
- Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
- Title("Sed Orci"),
- Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
- ]
- with pytest.raises(StopIteration):
- next(pre_chunk_iter)
-
- def but_it_does_not_combine_table_pre_chunks(self):
- opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
- pre_chunks = [
- TextPreChunk(
- [
- Title("Lorem Ipsum"),
- Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
- ],
- opts=opts,
- ),
- TablePreChunk(Table("Heading\nCell text"), opts=opts),
- TextPreChunk(
- [
- Title("Mauris Nec"),
- Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
- ],
- opts=opts,
- ),
- ]
-
- pre_chunk_iter = PreChunkCombiner(
- pre_chunks, ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
- ).iter_combined_pre_chunks()
-
- pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TextPreChunk)
- assert pre_chunk._elements == [
- Title("Lorem Ipsum"),
- Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
- ]
- # --
- pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TablePreChunk)
- assert pre_chunk._table == Table("Heading\nCell text")
- # --
- pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TextPreChunk)
- assert pre_chunk._elements == [
- Title("Mauris Nec"),
- Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
- ]
- # --
- with pytest.raises(StopIteration):
- next(pre_chunk_iter)
-
- def it_respects_the_specified_combination_threshold(self):
- opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=80)
- pre_chunks = [
- TextPreChunk( # 68
- [
- Title("Lorem Ipsum"), # 11
- Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
- ],
- opts=opts,
- ),
- TextPreChunk( # 71
- [
- Title("Mauris Nec"), # 10
- Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
- ],
- opts=opts,
- ),
- # -- len == 139
- TextPreChunk(
- [
- Title("Sed Orci"), # 8
- Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
- ],
- opts=opts,
- ),
- ]
-
- pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()
-
- pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TextPreChunk)
- assert pre_chunk._elements == [
- Title("Lorem Ipsum"),
- Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
- Title("Mauris Nec"),
- Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
- ]
- # --
- pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TextPreChunk)
- assert pre_chunk._elements == [
- Title("Sed Orci"),
- Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
- ]
- # --
- with pytest.raises(StopIteration):
- next(pre_chunk_iter)
-
- def it_respects_the_hard_maximum_window_length(self):
- opts = ChunkingOptions.new(max_characters=200, combine_text_under_n_chars=200)
- pre_chunks = [
- TextPreChunk( # 68
- [
- Title("Lorem Ipsum"), # 11
- Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
- ],
- opts=opts,
- ),
- TextPreChunk( # 71
- [
- Title("Mauris Nec"), # 10
- Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
- ],
- opts=opts,
- ),
- # -- len == 139
- TextPreChunk(
- [
- Title("Sed Orci"), # 8
- Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
- ],
- opts=opts,
- ),
- # -- len == 214
- ]
-
- pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()
-
- pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TextPreChunk)
- assert pre_chunk._elements == [
- Title("Lorem Ipsum"),
- Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
- Title("Mauris Nec"),
- Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
- ]
- # --
- pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TextPreChunk)
- assert pre_chunk._elements == [
- Title("Sed Orci"),
- Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
- ]
- # --
- with pytest.raises(StopIteration):
- next(pre_chunk_iter)
-
- def it_accommodates_and_isolates_an_oversized_pre_chunk(self):
- """Such as occurs when a single element exceeds the window size."""
- opts = ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
- pre_chunks = [
- TextPreChunk([Title("Lorem Ipsum")], opts=opts),
- TextPreChunk( # 179
- [
- Text(
- "Lorem ipsum dolor sit amet consectetur adipiscing elit." # 55
- " Mauris nec urna non augue vulputate consequat eget et nisi." # 60
- " Sed orci quam, eleifend sit amet vehicula, elementum ultricies." # 64
- )
- ],
- opts=opts,
- ),
- TextPreChunk([Title("Vulputate Consequat")], opts=opts),
- ]
-
- pre_chunk_iter = PreChunkCombiner(
- pre_chunks, ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
- ).iter_combined_pre_chunks()
-
- pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TextPreChunk)
- assert pre_chunk._elements == [Title("Lorem Ipsum")]
- # --
- pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TextPreChunk)
- assert pre_chunk._elements == [
- Text(
- "Lorem ipsum dolor sit amet consectetur adipiscing elit."
- " Mauris nec urna non augue vulputate consequat eget et nisi."
- " Sed orci quam, eleifend sit amet vehicula, elementum ultricies."
- )
- ]
- # --
- pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TextPreChunk)
- assert pre_chunk._elements == [Title("Vulputate Consequat")]
- # --
- with pytest.raises(StopIteration):
- next(pre_chunk_iter)
-
-
-class DescribeTextPreChunkAccumulator:
- """Unit-test suite for `unstructured.chunking.title.TextPreChunkAccumulator`."""
-
- def it_is_empty_on_construction(self):
- accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=100))
-
- assert accum.text_length == 0
- assert accum.remaining_space == 100
-
- def it_accumulates_pre_chunks_added_to_it(self):
- opts = ChunkingOptions.new(max_characters=500)
- accum = TextPreChunkAccumulator(opts=opts)
-
- accum.add_pre_chunk(
- TextPreChunk(
- [
- Title("Lorem Ipsum"),
- Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
- ],
- opts=opts,
- )
- )
- assert accum.text_length == 68
- assert accum.remaining_space == 430
-
- accum.add_pre_chunk(
- TextPreChunk(
- [
- Title("Mauris Nec"),
- Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
- ],
- opts=opts,
- )
- )
- assert accum.text_length == 141
- assert accum.remaining_space == 357
-
- def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
- opts = ChunkingOptions.new(max_characters=150)
- accum = TextPreChunkAccumulator(opts=opts)
- accum.add_pre_chunk(
- TextPreChunk(
- [
- Title("Lorem Ipsum"),
- Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
- ],
- opts=opts,
- )
- )
- accum.add_pre_chunk(
- TextPreChunk(
- [
- Title("Mauris Nec"),
- Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
- ],
- opts=opts,
- )
- )
- accum.add_pre_chunk(
- TextPreChunk(
- [
- Title("Sed Orci"),
- Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
- ],
- opts=opts,
- )
- )
-
- pre_chunk_iter = accum.flush()
-
- # -- iterator generates exactly one pre_chunk --
- pre_chunk = next(pre_chunk_iter)
- with pytest.raises(StopIteration):
- next(pre_chunk_iter)
- # -- and it is a _TextPreChunk containing all the elements --
- assert isinstance(pre_chunk, TextPreChunk)
- assert pre_chunk._elements == [
- Title("Lorem Ipsum"),
- Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
- Title("Mauris Nec"),
- Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
- Title("Sed Orci"),
- Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
- ]
- assert accum.text_length == 0
- assert accum.remaining_space == 150
-
- def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
- accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=150))
-
- pre_chunks = list(accum.flush())
-
- assert pre_chunks == []
- assert accum.text_length == 0
- assert accum.remaining_space == 150
-
- def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
- opts = ChunkingOptions.new(max_characters=100)
- accum = TextPreChunkAccumulator(opts=opts)
- accum.add_pre_chunk(TextPreChunk([Text("abcde")], opts=opts))
- accum.add_pre_chunk(TextPreChunk([Text("fghij")], opts=opts))
-
- # -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
- # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
- assert accum.text_length == 12
- # -- .remaining_space is reduced by the length (2) of the trailing separator which would
- # -- go between the current text and that of the next pre-chunk if one was added.
- # -- So 100 - 12 - 2 = 86 here, not 100 - 12 = 88
- assert accum.remaining_space == 86
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 3ff8b04c20..b056c56d9a 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.11.5-dev1" # pragma: no cover
+__version__ = "0.11.5-dev2" # pragma: no cover
diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py
index a36f437d1b..6e64344a63 100644
--- a/unstructured/chunking/base.py
+++ b/unstructured/chunking/base.py
@@ -2,12 +2,25 @@
from __future__ import annotations
-from typing import Optional
+import collections
+import copy
+from typing import Any, DefaultDict, Dict, Iterable, Iterator, List, Optional, Tuple, cast
-from typing_extensions import Self
+from typing_extensions import Self, TypeAlias
+from unstructured.documents.elements import (
+ CompositeElement,
+ ConsolidationStrategy,
+ Element,
+ ElementMetadata,
+ RegexMetadata,
+ Table,
+ TableChunk,
+)
from unstructured.utils import lazyproperty
+PreChunk: TypeAlias = "TablePreChunk | TextPreChunk"
+
class ChunkingOptions:
"""Specifies parameters of optional chunking behaviors."""
@@ -150,3 +163,404 @@ def _validate(self) -> None:
# loop (I think).
if self._overlap >= max_characters:
raise ValueError(f"'overlap' must be less than max_characters," f" got {self._overlap}")
+
+
+# ================================================================================================
+# PRE-CHUNK SUB-TYPES
+# ================================================================================================
+
+
+class TablePreChunk:
+ """A pre-chunk composed of a single Table element."""
+
+ def __init__(self, table: Table, opts: ChunkingOptions) -> None:
+ self._table = table
+ self._opts = opts
+
+ def iter_chunks(self) -> Iterator[Table | TableChunk]:
+ """Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller."""
+ text = self._table.text
+ html = self._table.metadata.text_as_html or ""
+ maxlen = self._opts.hard_max
+
+ # -- only chunk a table when it's too big to swallow whole --
+ if len(text) <= maxlen and len(html) <= maxlen:
+ yield self._table
+ return
+
+ is_continuation = False
+
+ while text or html:
+ # -- split off the next maxchars into the next TableChunk --
+ text_chunk, text = text[:maxlen], text[maxlen:]
+ table_chunk = TableChunk(text=text_chunk, metadata=copy.deepcopy(self._table.metadata))
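+            # -- each chunk gets its own deep copy of the table metadata, so assigning
+            # -- `text_as_html` and `is_continuation` below doesn't affect the other chunks --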
+
+ # -- Attach maxchars of the html to the chunk. Note no attempt is made to add only the
+            # -- HTML elements that *correspond* to the TableChunk.text fragment.
+ if html:
+ html_chunk, html = html[:maxlen], html[maxlen:]
+ table_chunk.metadata.text_as_html = html_chunk
+
+ # -- mark second and later chunks as a continuation --
+ if is_continuation:
+ table_chunk.metadata.is_continuation = True
+
+ yield table_chunk
+
+ is_continuation = True
+
+
+class TextPreChunk:
+ """A sequence of elements that belong to the same semantic unit within a document.
+
+ The name "section" derives from the idea of a document-section, a heading followed by the
+ paragraphs "under" that heading. That structure is not found in all documents and actual section
+ content can vary, but that's the concept.
+
+ This object is purposely immutable.
+ """
+
+ def __init__(self, elements: Iterable[Element], opts: ChunkingOptions) -> None:
+ self._elements = list(elements)
+ self._opts = opts
+
+ def __eq__(self, other: Any) -> bool:
+ if not isinstance(other, TextPreChunk):
+ return False
+ return self._elements == other._elements
+
+ def combine(self, other_pre_chunk: TextPreChunk) -> TextPreChunk:
+ """Return new `TextPreChunk` that combines this and `other_pre_chunk`."""
+ return TextPreChunk(self._elements + other_pre_chunk._elements, opts=self._opts)
+
+ def iter_chunks(self) -> Iterator[CompositeElement]:
+ """Split this pre-chunk into one or more `CompositeElement` objects maxlen or smaller."""
+ text = self._text
+ text_len = len(text)
+ maxlen = self._opts.hard_max
+ start = 0
+ remaining = text_len
+
+ while remaining > 0:
+ end = min(start + maxlen, text_len)
+ yield CompositeElement(text=text[start:end], metadata=self._consolidated_metadata)
+ start = end
+ remaining = text_len - end
+
+ @lazyproperty
+ def text_length(self) -> int:
+ """Length of concatenated text of this pre-chunk, including separators."""
+ # -- used by pre-chunk-combiner to identify combination candidates --
+ return len(self._text)
+
+ @lazyproperty
+ def _all_metadata_values(self) -> Dict[str, List[Any]]:
+ """Collection of all populated metadata values across elements.
+
+ The resulting dict has one key for each `ElementMetadata` field that had a non-None value in
+ at least one of the elements in this pre-chunk. The value of that key is a list of all those
+ populated values, in element order, for example:
+
+ {
+ "filename": ["sample.docx", "sample.docx"],
+ "languages": [["lat"], ["lat", "eng"]]
+ ...
+ }
+
+ This preprocessing step provides the input for a specified consolidation strategy that will
+ resolve the list of values for each field to a single consolidated value.
+ """
+
+ def iter_populated_fields(metadata: ElementMetadata) -> Iterator[Tuple[str, Any]]:
+ """(field_name, value) pair for each non-None field in single `ElementMetadata`."""
+ return (
+ (field_name, value)
+ for field_name, value in metadata.known_fields.items()
+ if value is not None
+ )
+
+ field_values: DefaultDict[str, List[Any]] = collections.defaultdict(list)
+
+ # -- collect all non-None field values in a list for each field, in element-order --
+ for e in self._elements:
+ for field_name, value in iter_populated_fields(e.metadata):
+ field_values[field_name].append(value)
+
+ return dict(field_values)
+
+ @lazyproperty
+ def _consolidated_metadata(self) -> ElementMetadata:
+ """Metadata applicable to this pre-chunk as a single chunk.
+
+ Formed by applying consolidation rules to all metadata fields across the elements of this
+ pre-chunk.
+
+ For the sake of consistency, the same rules are applied (for example, for dropping values)
+ to a single-element pre-chunk too, even though metadata for such a pre-chunk is already
+ "consolidated".
+ """
+ return ElementMetadata(**self._meta_kwargs)
+
+ @lazyproperty
+ def _consolidated_regex_meta(self) -> Dict[str, List[RegexMetadata]]:
+ """Consolidate the regex-metadata in `regex_metadata_dicts` into a single dict.
+
+ This consolidated value is suitable for use in the chunk metadata. `start` and `end`
+ offsets of each regex match are also adjusted for their new positions.
+ """
+ chunk_regex_metadata: Dict[str, List[RegexMetadata]] = {}
+ separator_len = len(self._opts.text_separator)
+ running_text_len = 0
+ start_offset = 0
+
+ for element in self._elements:
+ text_len = len(element.text)
+ # -- skip empty elements like `PageBreak("")` --
+ if not text_len:
+ continue
+ # -- account for blank line between "squashed" elements, but not before first element --
+ running_text_len += separator_len if running_text_len else 0
+ start_offset = running_text_len
+ running_text_len += text_len
+
+ if not element.metadata.regex_metadata:
+ continue
+
+ # -- consolidate any `regex_metadata` matches, adjusting the match start/end offsets --
+ element_regex_metadata = copy.deepcopy(element.metadata.regex_metadata)
+ for regex_name, matches in element_regex_metadata.items():
+ for m in matches:
+ m["start"] += start_offset
+ m["end"] += start_offset
+ chunk_matches = chunk_regex_metadata.get(regex_name, [])
+ chunk_matches.extend(matches)
+ chunk_regex_metadata[regex_name] = chunk_matches
+
+ return chunk_regex_metadata
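+
+    # -- Illustrative offset arithmetic (texts and match positions made up): elements "foo bar"
+    # -- and "baz qux" concatenate to "foo bar\n\nbaz qux", so a regex match at start=0, end=3
+    # -- in the second element is re-based to start=9, end=12 (7 characters of prior text plus
+    # -- the 2-character separator).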
+
+ @lazyproperty
+ def _meta_kwargs(self) -> Dict[str, Any]:
+ """The consolidated metadata values as a dict suitable for constructing ElementMetadata.
+
+ This is where consolidation strategies are actually applied. The output is suitable for use
+ in constructing an `ElementMetadata` object like `ElementMetadata(**self._meta_kwargs)`.
+ """
+ CS = ConsolidationStrategy
+ field_consolidation_strategies = ConsolidationStrategy.field_consolidation_strategies()
+
+ def iter_kwarg_pairs() -> Iterator[Tuple[str, Any]]:
+ """Generate (field-name, value) pairs for each field in consolidated metadata."""
+ for field_name, values in self._all_metadata_values.items():
+ strategy = field_consolidation_strategies.get(field_name)
+ if strategy is CS.FIRST:
+ yield field_name, values[0]
+ # -- concatenate lists from each element that had one, in order --
+ elif strategy is CS.LIST_CONCATENATE:
+ yield field_name, sum(values, cast(List[Any], []))
+ # -- union lists from each element, preserving order of appearance --
+ elif strategy is CS.LIST_UNIQUE:
+ # -- Python 3.7+ maintains dict insertion order --
+ ordered_unique_keys = {key: None for val_list in values for key in val_list}
+ yield field_name, list(ordered_unique_keys.keys())
+ elif strategy is CS.REGEX:
+ yield field_name, self._consolidated_regex_meta
+ elif strategy is CS.DROP:
+ continue
+ else:
+ # -- not likely to hit this since we have a test in `text_elements.py` that
+                    # -- ensures every ElementMetadata field has an assigned strategy.
+ raise NotImplementedError(
+ f"metadata field {repr(field_name)} has no defined consolidation strategy"
+ )
+
+ return dict(iter_kwarg_pairs())
+
+ @lazyproperty
+ def _text(self) -> str:
+ """The concatenated text of all elements in this pre-chunk.
+
+ Each element-text is separated from the next by a blank line ("\n\n").
+ """
+ text_separator = self._opts.text_separator
+ return text_separator.join(e.text for e in self._elements if e.text)
+
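+# -- Illustrative sketch, not part of the module (`Title`/`Text` come from
+# -- `unstructured.documents.elements`; texts are made up):
+#
+#     opts = ChunkingOptions.new(max_characters=100)
+#     pre_chunk = TextPreChunk([Title("Intro"), Text("Lorem ipsum.")], opts=opts)
+#     pre_chunk.text_length                       # -> 19  ("Intro" + "\n\n" + "Lorem ipsum.")
+#     [c.text for c in pre_chunk.iter_chunks()]   # -> ["Intro\n\nLorem ipsum."]
+#     # A single CompositeElement results because 19 <= 100; a longer concatenated text would
+#     # be split into successive 100-character CompositeElement chunks.
+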
+
+# ================================================================================================
+# PRE-CHUNKING ACCUMULATORS
+# ------------------------------------------------------------------------------------------------
+# Accumulators encapsulate the work of grouping elements and later pre-chunks to form the larger
+# pre-chunk and combined-pre-chunk items central to unstructured chunking.
+# ================================================================================================
+
+
+class PreChunkBuilder:
+ """An element accumulator suitable for incrementally forming a pre-chunk.
+
+ Provides monitoring properties like `.remaining_space` and `.text_length` a pre-chunker can use
+ to determine whether it should add the next element in the element stream.
+
+ `.flush()` is used to build a PreChunk object from the accumulated elements. This method
+ returns an iterator that generates zero-or-one `TextPreChunk` or `TablePreChunk` object and is
+ used like so:
+
+ yield from builder.flush()
+
+ If no elements have been accumulated, no `PreChunk` instance is generated. Flushing the builder
+ clears the elements it contains so it is ready to build the next pre-chunk.
+ """
+
+ def __init__(self, opts: ChunkingOptions) -> None:
+ self._opts = opts
+ self._separator_len = len(opts.text_separator)
+ self._elements: List[Element] = []
+
+ # -- only includes non-empty element text, e.g. PageBreak.text=="" is not included --
+ self._text_segments: List[str] = []
+ # -- combined length of text-segments, not including separators --
+ self._text_len: int = 0
+
+ def add_element(self, element: Element) -> None:
+ """Add `element` to this section."""
+ self._elements.append(element)
+ if element.text:
+ self._text_segments.append(element.text)
+ self._text_len += len(element.text)
+
+ def flush(self) -> Iterator[TextPreChunk]:
+ """Generate zero-or-one `PreChunk` object and clear the accumulator.
+
+        Suitable for emitting a PreChunk when the maximum size or a semantic boundary has been
+        reached, and for clearing out a terminal pre-chunk at the end of an element stream.
+ """
+ if not self._elements:
+ return
+ # -- clear builder before yield so we're not sensitive to the timing of how/when this
+        # -- iterator is exhausted and can add elements for the next pre-chunk immediately.
+ elements = self._elements[:]
+ self._elements.clear()
+ self._text_segments.clear()
+ self._text_len = 0
+ yield TextPreChunk(elements, self._opts)
+
+ @property
+ def remaining_space(self) -> int:
+ """Maximum text-length of an element that can be added without exceeding maxlen."""
+ # -- include length of trailing separator that will go before next element text --
+ separators_len = self._separator_len * len(self._text_segments)
+ return self._opts.hard_max - self._text_len - separators_len
+
+ @property
+ def text_length(self) -> int:
+ """Length of the text in this pre-chunk.
+
+ This value represents the chunk-size that would result if this pre-chunk was flushed in its
+ current state. In particular, it does not include the length of a trailing separator (since
+ that would only appear if an additional element was added).
+
+ Not suitable for judging remaining space, use `.remaining_space` for that value.
+ """
+ # -- number of text separators present in joined text of elements. This includes only
+ # -- separators *between* text segments, not one at the end. Note there are zero separators
+ # -- for both 0 and 1 text-segments.
+ n = len(self._text_segments)
+ separator_count = n - 1 if n else 0
+ return self._text_len + (separator_count * self._separator_len)
+
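+# -- Illustrative sketch, not part of the module: the difference between `.text_length` and
+# -- `.remaining_space` with a 50-character window and the 2-character "\n\n" separator.
+#
+#     builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
+#     builder.add_element(Title("Intro"))   # element text is 5 characters
+#     builder.text_length       # -> 5   (one text segment, so no separators yet)
+#     builder.remaining_space   # -> 43  (50 - 5 - 2, reserving the separator a next element adds)
+#     list(builder.flush())     # -> one TextPreChunk of [Title("Intro")]; the builder is cleared
+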
+
+class PreChunkCombiner:
+ """Filters pre-chunk stream to combine small pre-chunks where possible."""
+
+ def __init__(self, pre_chunks: Iterable[PreChunk], opts: ChunkingOptions):
+ self._pre_chunks = pre_chunks
+ self._opts = opts
+
+ def iter_combined_pre_chunks(self) -> Iterator[PreChunk]:
+ """Generate pre-chunk objects, combining TextPreChunk objects when they'll fit in window."""
+ accum = TextPreChunkAccumulator(self._opts)
+ combine_text_under_n_chars = self._opts.combine_text_under_n_chars
+
+ for pre_chunk in self._pre_chunks:
+ # -- start new pre-chunk under these conditions --
+ if (
+ # -- a table pre-chunk is never combined --
+ isinstance(pre_chunk, TablePreChunk)
+ # -- don't add another pre-chunk once length has reached combination soft-max --
+ or accum.text_length >= combine_text_under_n_chars
+ # -- combining would exceed hard-max --
+ or accum.remaining_space < pre_chunk.text_length
+ ):
+ yield from accum.flush()
+
+ # -- a table pre-chunk is never combined so don't accumulate --
+ if isinstance(pre_chunk, TablePreChunk):
+ yield pre_chunk
+ else:
+ accum.add_pre_chunk(pre_chunk)
+
+ yield from accum.flush()
+
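+# -- Illustrative sketch, not part of the module: small consecutive TextPreChunk objects merge
+# -- while any TablePreChunk passes through unchanged (sizes assumed to fit the window and the
+# -- combination soft-max).
+#
+#     combiner = PreChunkCombiner([text_pre_chunk_a, text_pre_chunk_b, table_pre_chunk], opts)
+#     list(combiner.iter_combined_pre_chunks())
+#     # -> [text_pre_chunk_a combined with text_pre_chunk_b, table_pre_chunk]
+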
+
+class TextPreChunkAccumulator:
+ """Accumulates, measures, and combines pre-chunk objects.
+
+ Provides monitoring properties `.remaining_space` and `.text_length` suitable for deciding
+ whether to add another pre-chunk.
+
+ `.flush()` is used to combine the accumulated pre-chunks into a single `TextPreChunk` object.
+    This method returns an iterator that generates zero-or-one `TextPreChunk` object and is used
+ like so:
+
+ yield from accum.flush()
+
+    If no pre-chunks have been accumulated, no `TextPreChunk` is generated. Flushing the
+    accumulator clears the pre-chunks it contains so it is ready to accept the next text-pre-chunk.
+ """
+
+ def __init__(self, opts: ChunkingOptions) -> None:
+ self._opts = opts
+ self._pre_chunks: List[TextPreChunk] = []
+
+ def add_pre_chunk(self, pre_chunk: TextPreChunk) -> None:
+ """Add a pre-chunk to the accumulator for possible combination with next pre-chunk."""
+ self._pre_chunks.append(pre_chunk)
+
+ def flush(self) -> Iterator[TextPreChunk]:
+ """Generate all accumulated pre-chunks as a single combined pre-chunk."""
+ pre_chunks = self._pre_chunks
+
+ # -- nothing to do if no pre-chunks have been accumulated --
+ if not pre_chunks:
+ return
+
+        # -- otherwise combine all accumulated pre-chunks into one --
+ pre_chunk = pre_chunks[0]
+ for other_pre_chunk in pre_chunks[1:]:
+ pre_chunk = pre_chunk.combine(other_pre_chunk)
+ yield pre_chunk
+
+ # -- and reset the accumulator (to empty) --
+ pre_chunks.clear()
+
+ @property
+ def remaining_space(self) -> int:
+ """Maximum size of pre-chunk that can be added without exceeding maxlen."""
+ maxlen = self._opts.hard_max
+ return (
+ maxlen
+ if not self._pre_chunks
+ # -- an additional pre-chunk will also incur an additional separator --
+ else maxlen - self.text_length - len(self._opts.text_separator)
+ )
+
+ @property
+ def text_length(self) -> int:
+ """Size of concatenated text in all pre-chunks in accumulator."""
+ n = len(self._pre_chunks)
+
+ if n == 0:
+ return 0
+
+ total_text_length = sum(s.text_length for s in self._pre_chunks)
+ total_separator_length = len(self._opts.text_separator) * (n - 1)
+ return total_text_length + total_separator_length
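+
+# -- Illustrative arithmetic (sizes made up): with max_characters=100 and the 2-character "\n\n"
+# -- separator, an accumulator holding two text pre-chunks of lengths 10 and 20 reports
+# -- `.text_length == 32` (10 + 20 + one separator between them) and `.remaining_space == 66`
+# -- (100 - 32 - 2, reserving the separator a further pre-chunk would introduce).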
diff --git a/unstructured/chunking/title.py b/unstructured/chunking/title.py
index 93301e963c..ebc1a92069 100644
--- a/unstructured/chunking/title.py
+++ b/unstructured/chunking/title.py
@@ -5,26 +5,20 @@
from __future__ import annotations
-import collections
-import copy
-from typing import Any, DefaultDict, Dict, Iterable, Iterator, List, Optional, Tuple, cast
-
-from typing_extensions import TypeAlias
-
-from unstructured.chunking.base import ChunkingOptions
+from typing import Iterator, List, Optional
+
+from unstructured.chunking.base import (
+ ChunkingOptions,
+ PreChunk,
+ PreChunkBuilder,
+ PreChunkCombiner,
+ TablePreChunk,
+)
from unstructured.documents.elements import (
- CompositeElement,
- ConsolidationStrategy,
Element,
- ElementMetadata,
- RegexMetadata,
Table,
- TableChunk,
Title,
)
-from unstructured.utils import lazyproperty
-
-PreChunk: TypeAlias = "TablePreChunk | TextPreChunk"
def chunk_by_title(
@@ -78,7 +72,7 @@ def chunk_by_title(
def _split_elements_by_title_and_table(
elements: List[Element], opts: ChunkingOptions
-) -> Iterator[TextPreChunk | TablePreChunk]:
+) -> Iterator[PreChunk]:
"""Implements "pre-chunker" responsibilities.
A _section_ can be thought of as a "pre-chunk", generally determining the size and contents of a
@@ -102,7 +96,7 @@ def _split_elements_by_title_and_table(
A Table or Checkbox element is placed into a pre-chunk by itself.
"""
- pre_chunk_builder = TextPreChunkBuilder(opts)
+ pre_chunk_builder = PreChunkBuilder(opts)
prior_element = None
@@ -156,396 +150,3 @@ def _metadata_differs(
if ignore_page_numbers:
return False
return metadata1.page_number != metadata2.page_number
-
-
-# == PreChunks ===================================================================================
-
-
-class TablePreChunk:
- """A pre-chunk composed of a single Table element."""
-
- def __init__(self, table: Table, opts: ChunkingOptions) -> None:
- self._table = table
- self._opts = opts
-
- def iter_chunks(self) -> Iterator[Table | TableChunk]:
- """Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller."""
- text = self._table.text
- html = self._table.metadata.text_as_html or ""
- maxlen = self._opts.hard_max
-
- # -- only chunk a table when it's too big to swallow whole --
- if len(text) <= maxlen and len(html) <= maxlen:
- yield self._table
- return
-
- is_continuation = False
-
- while text or html:
- # -- split off the next maxchars into the next TableChunk --
- text_chunk, text = text[:maxlen], text[maxlen:]
- table_chunk = TableChunk(text=text_chunk, metadata=copy.deepcopy(self._table.metadata))
-
- # -- Attach maxchars of the html to the chunk. Note no attempt is made to add only the
- # -- HTML elements that *correspond* to the TextChunk.text fragment.
- if html:
- html_chunk, html = html[:maxlen], html[maxlen:]
- table_chunk.metadata.text_as_html = html_chunk
-
- # -- mark second and later chunks as a continuation --
- if is_continuation:
- table_chunk.metadata.is_continuation = True
-
- yield table_chunk
-
- is_continuation = True
-
-
-class TextPreChunk:
- """A sequence of elements that belong to the same semantic unit within a document.
-
- The name "section" derives from the idea of a document-section, a heading followed by the
- paragraphs "under" that heading. That structure is not found in all documents and actual section
- content can vary, but that's the concept.
-
- This object is purposely immutable.
- """
-
- def __init__(self, elements: Iterable[Element], opts: ChunkingOptions) -> None:
- self._elements = list(elements)
- self._opts = opts
-
- def __eq__(self, other: Any) -> bool:
- if not isinstance(other, TextPreChunk):
- return False
- return self._elements == other._elements
-
- def combine(self, other_pre_chunk: TextPreChunk) -> TextPreChunk:
- """Return new `TextPreChunk` that combines this and `other_pre_chunk`."""
- return TextPreChunk(self._elements + other_pre_chunk._elements, opts=self._opts)
-
- def iter_chunks(self) -> Iterator[CompositeElement]:
- """Split this pre-chunk into one or more `CompositeElement` objects maxlen or smaller."""
- text = self._text
- text_len = len(text)
- maxlen = self._opts.hard_max
- start = 0
- remaining = text_len
-
- while remaining > 0:
- end = min(start + maxlen, text_len)
- yield CompositeElement(text=text[start:end], metadata=self._consolidated_metadata)
- start = end
- remaining = text_len - end
-
- @lazyproperty
- def text_length(self) -> int:
- """Length of concatenated text of this pre-chunk, including separators."""
- # -- used by pre-chunk-combiner to identify combination candidates --
- return len(self._text)
-
- @lazyproperty
- def _all_metadata_values(self) -> Dict[str, List[Any]]:
- """Collection of all populated metadata values across elements.
-
- The resulting dict has one key for each `ElementMetadata` field that had a non-None value in
- at least one of the elements in this pre-chunk. The value of that key is a list of all those
- populated values, in element order, for example:
-
- {
- "filename": ["sample.docx", "sample.docx"],
- "languages": [["lat"], ["lat", "eng"]]
- ...
- }
-
- This preprocessing step provides the input for a specified consolidation strategy that will
- resolve the list of values for each field to a single consolidated value.
- """
-
- def iter_populated_fields(metadata: ElementMetadata) -> Iterator[Tuple[str, Any]]:
- """(field_name, value) pair for each non-None field in single `ElementMetadata`."""
- return (
- (field_name, value)
- for field_name, value in metadata.known_fields.items()
- if value is not None
- )
-
- field_values: DefaultDict[str, List[Any]] = collections.defaultdict(list)
-
- # -- collect all non-None field values in a list for each field, in element-order --
- for e in self._elements:
- for field_name, value in iter_populated_fields(e.metadata):
- field_values[field_name].append(value)
-
- return dict(field_values)
-
- @lazyproperty
- def _consolidated_metadata(self) -> ElementMetadata:
- """Metadata applicable to this pre-chunk as a single chunk.
-
- Formed by applying consolidation rules to all metadata fields across the elements of this
- pre-chunk.
-
- For the sake of consistency, the same rules are applied (for example, for dropping values)
- to a single-element pre-chunk too, even though metadata for such a pre-chunk is already
- "consolidated".
- """
- return ElementMetadata(**self._meta_kwargs)
-
- @lazyproperty
- def _consolidated_regex_meta(self) -> Dict[str, List[RegexMetadata]]:
- """Consolidate the regex-metadata in `regex_metadata_dicts` into a single dict.
-
- This consolidated value is suitable for use in the chunk metadata. `start` and `end`
- offsets of each regex match are also adjusted for their new positions.
- """
- chunk_regex_metadata: Dict[str, List[RegexMetadata]] = {}
- separator_len = len(self._opts.text_separator)
- running_text_len = 0
- start_offset = 0
-
- for element in self._elements:
- text_len = len(element.text)
- # -- skip empty elements like `PageBreak("")` --
- if not text_len:
- continue
- # -- account for blank line between "squashed" elements, but not before first element --
- running_text_len += separator_len if running_text_len else 0
- start_offset = running_text_len
- running_text_len += text_len
-
- if not element.metadata.regex_metadata:
- continue
-
- # -- consolidate any `regex_metadata` matches, adjusting the match start/end offsets --
- element_regex_metadata = copy.deepcopy(element.metadata.regex_metadata)
- for regex_name, matches in element_regex_metadata.items():
- for m in matches:
- m["start"] += start_offset
- m["end"] += start_offset
- chunk_matches = chunk_regex_metadata.get(regex_name, [])
- chunk_matches.extend(matches)
- chunk_regex_metadata[regex_name] = chunk_matches
-
- return chunk_regex_metadata
-
- @lazyproperty
- def _meta_kwargs(self) -> Dict[str, Any]:
- """The consolidated metadata values as a dict suitable for constructing ElementMetadata.
-
- This is where consolidation strategies are actually applied. The output is suitable for use
- in constructing an `ElementMetadata` object like `ElementMetadata(**self._meta_kwargs)`.
- """
- CS = ConsolidationStrategy
- field_consolidation_strategies = ConsolidationStrategy.field_consolidation_strategies()
-
- def iter_kwarg_pairs() -> Iterator[Tuple[str, Any]]:
- """Generate (field-name, value) pairs for each field in consolidated metadata."""
- for field_name, values in self._all_metadata_values.items():
- strategy = field_consolidation_strategies.get(field_name)
- if strategy is CS.FIRST:
- yield field_name, values[0]
- # -- concatenate lists from each element that had one, in order --
- elif strategy is CS.LIST_CONCATENATE:
- yield field_name, sum(values, cast(List[Any], []))
- # -- union lists from each element, preserving order of appearance --
- elif strategy is CS.LIST_UNIQUE:
- # -- Python 3.7+ maintains dict insertion order --
- ordered_unique_keys = {key: None for val_list in values for key in val_list}
- yield field_name, list(ordered_unique_keys.keys())
- elif strategy is CS.REGEX:
- yield field_name, self._consolidated_regex_meta
- elif strategy is CS.DROP:
- continue
- else:
- # -- not likely to hit this since we have a test in `text_elements.py` that
- # -- ensures every ElementMetadata fields has an assigned strategy.
- raise NotImplementedError(
- f"metadata field {repr(field_name)} has no defined consolidation strategy"
- )
-
- return dict(iter_kwarg_pairs())
-
- @lazyproperty
- def _text(self) -> str:
- """The concatenated text of all elements in this pre-chunk.
-
- Each element-text is separated from the next by a blank line ("\n\n").
- """
- text_separator = self._opts.text_separator
- return text_separator.join(e.text for e in self._elements if e.text)
-
-
-class TextPreChunkBuilder:
- """An element accumulator suitable for incrementally forming a pre-chunk.
-
- Provides monitoring properties like `.remaining_space` and `.text_length` a pre-chunker can use
- to determine whether it should add the next element in the element stream.
-
- `.flush()` is used to build a `TextPreChunk` object from the accumulated elements. This method
- returns an interator that generates zero-or-one `TextPreChunk` object and is used like so:
-
- yield from builder.flush()
-
- If no elements have been accumulated, no `TextPreChunk` is generated. Flushing the builder
- clears the elements it contains so it is ready to build the next text-pre-chunk.
- """
-
- def __init__(self, opts: ChunkingOptions) -> None:
- self._opts = opts
- self._separator_len = len(opts.text_separator)
- self._elements: List[Element] = []
-
- # -- only includes non-empty element text, e.g. PageBreak.text=="" is not included --
- self._text_segments: List[str] = []
- # -- combined length of text-segments, not including separators --
- self._text_len: int = 0
-
- def add_element(self, element: Element) -> None:
- """Add `element` to this section."""
- self._elements.append(element)
- if element.text:
- self._text_segments.append(element.text)
- self._text_len += len(element.text)
-
- def flush(self) -> Iterator[TextPreChunk]:
- """Generate zero-or-one `PreChunk` object and clear the accumulator.
-
- Suitable for use to emit a PreChunk when the maximum size has been reached or a semantic
- boundary has been reached. Also to clear out a terminal pre-chunk at the end of an element
- stream.
- """
- if not self._elements:
- return
- # -- clear builder before yield so we're not sensitive to the timing of how/when this
- # -- iterator is exhausted and can add eleemnts for the next pre-chunk immediately.
- elements = self._elements[:]
- self._elements.clear()
- self._text_segments.clear()
- self._text_len = 0
- yield TextPreChunk(elements, self._opts)
-
- @property
- def remaining_space(self) -> int:
- """Maximum text-length of an element that can be added without exceeding maxlen."""
- # -- include length of trailing separator that will go before next element text --
- separators_len = self._separator_len * len(self._text_segments)
- return self._opts.hard_max - self._text_len - separators_len
-
- @property
- def text_length(self) -> int:
- """Length of the text in this pre-chunk.
-
- This value represents the chunk-size that would result if this pre-chunk was flushed in its
- current state. In particular, it does not include the length of a trailing separator (since
- that would only appear if an additional element was added).
-
- Not suitable for judging remaining space, use `.remaining_space` for that value.
- """
- # -- number of text separators present in joined text of elements. This includes only
- # -- separators *between* text segments, not one at the end. Note there are zero separators
- # -- for both 0 and 1 text-segments.
- n = len(self._text_segments)
- separator_count = n - 1 if n else 0
- return self._text_len + (separator_count * self._separator_len)
-
-
-# == PreChunkCombiner ============================================================================
-
-
-class PreChunkCombiner:
- """Filters pre-chunk stream to combine small pre-chunks where possible."""
-
- def __init__(self, pre_chunks: Iterable[PreChunk], opts: ChunkingOptions):
- self._pre_chunks = pre_chunks
- self._opts = opts
-
- def iter_combined_pre_chunks(self) -> Iterator[PreChunk]:
- """Generate pre-chunk objects, combining TextPreChunk objects when they'll fit in window."""
- accum = TextPreChunkAccumulator(self._opts)
- combine_text_under_n_chars = self._opts.combine_text_under_n_chars
-
- for pre_chunk in self._pre_chunks:
- # -- start new pre-chunk under these conditions --
- if (
- # -- a table pre-chunk is never combined --
- isinstance(pre_chunk, TablePreChunk)
- # -- don't add another pre-chunk once length has reached combination soft-max --
- or accum.text_length >= combine_text_under_n_chars
- # -- combining would exceed hard-max --
- or accum.remaining_space < pre_chunk.text_length
- ):
- yield from accum.flush()
-
- # -- a table pre-chunk is never combined so don't accumulate --
- if isinstance(pre_chunk, TablePreChunk):
- yield pre_chunk
- else:
- accum.add_pre_chunk(pre_chunk)
-
- yield from accum.flush()
-
-
-class TextPreChunkAccumulator:
- """Accumulates, measures, and combines pre-chunk objects.
-
- Provides monitoring properties `.remaining_space` and `.text_length` suitable for deciding
- whether to add another pre-chunk.
-
- `.flush()` is used to combine the accumulated pre-chunks into a single `TextPreChunk` object.
- This method returns an interator that generates zero-or-one `TextPreChunk` objects and is used
- like so:
-
- yield from accum.flush()
-
- If no pre-chunks have been accumulated, no `TextPreChunk` is generated. Flushing the builder
- clears the pre-chunks it contains so it is ready to accept the next text-pre-chunk.
- """
-
- def __init__(self, opts: ChunkingOptions) -> None:
- self._opts = opts
- self._pre_chunks: List[TextPreChunk] = []
-
- def add_pre_chunk(self, pre_chunk: TextPreChunk) -> None:
- """Add a pre-chunk to the accumulator for possible combination with next pre-chunk."""
- self._pre_chunks.append(pre_chunk)
-
- def flush(self) -> Iterator[TextPreChunk]:
- """Generate all accumulated pre-chunks as a single combined pre-chunk."""
- pre_chunks = self._pre_chunks
-
- # -- nothing to do if no pre-chunks have been accumulated --
- if not pre_chunks:
- return
-
- # -- otherwise combine all accumulated pre-chunk into one --
- pre_chunk = pre_chunks[0]
- for other_pre_chunk in pre_chunks[1:]:
- pre_chunk = pre_chunk.combine(other_pre_chunk)
- yield pre_chunk
-
- # -- and reset the accumulator (to empty) --
- pre_chunks.clear()
-
- @property
- def remaining_space(self) -> int:
- """Maximum size of pre-chunk that can be added without exceeding maxlen."""
- maxlen = self._opts.hard_max
- return (
- maxlen
- if not self._pre_chunks
- # -- an additional pre-chunk will also incur an additional separator --
- else maxlen - self.text_length - len(self._opts.text_separator)
- )
-
- @property
- def text_length(self) -> int:
- """Size of concatenated text in all pre-chunks in accumulator."""
- n = len(self._pre_chunks)
-
- if n == 0:
- return 0
-
- total_text_length = sum(s.text_length for s in self._pre_chunks)
- total_separator_length = len(self._opts.text_separator) * (n - 1)
- return total_text_length + total_separator_length