Skip to content

Commit

Permalink
fix: sectioner does not consider separator length
Browse files Browse the repository at this point in the history
  • Loading branch information
scanny committed Oct 24, 2023
1 parent 63d6ce0 commit d8542d6
Show file tree
Hide file tree
Showing 2 changed files with 371 additions and 73 deletions.
189 changes: 167 additions & 22 deletions test_unstructured/chunking/test_title.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@
import pytest

from unstructured.chunking.title import (
_NonTextSection,
_split_elements_by_title_and_table,
_TableSection,
_TextSection,
_TextSectionBuilder,
chunk_by_title,
)
from unstructured.documents.coordinates import CoordinateSystem
Expand All @@ -15,6 +19,7 @@
CoordinatesMetadata,
Element,
ElementMetadata,
ListItem,
RegexMetadata,
Table,
Text,
Expand Down Expand Up @@ -187,6 +192,7 @@ def test_split_elements_by_title_and_table():
Text("It is storming outside."),
CheckBox(),
]

sections = _split_elements_by_title_and_table(
elements,
multipage_sections=True,
Expand All @@ -195,29 +201,40 @@ def test_split_elements_by_title_and_table():
max_characters=500,
)

assert sections == [
[
Title("A Great Day"),
Text("Today is a great day."),
Text("It is sunny outside."),
],
[
Table("<table></table>"),
],
[
Title("An Okay Day"),
Text("Today is an okay day."),
Text("It is rainy outside."),
],
[
Title("A Bad Day"),
Text("Today is a bad day."),
Text("It is storming outside."),
],
[
CheckBox(),
],
section = next(sections)
assert isinstance(section, _TextSection)
assert section.elements == [
Title("A Great Day"),
Text("Today is a great day."),
Text("It is sunny outside."),
]
# --
section = next(sections)
assert isinstance(section, _TableSection)
assert section.table == Table("<table></table>")
# ==
section = next(sections)
assert isinstance(section, _TextSection)
assert section.elements == [
Title("An Okay Day"),
Text("Today is an okay day."),
Text("It is rainy outside."),
]
# --
section = next(sections)
assert isinstance(section, _TextSection)
assert section.elements == [
Title("A Bad Day"),
Text("Today is a bad day."),
Text("It is storming outside."),
]
# --
section = next(sections)
assert isinstance(section, _NonTextSection)
assert section.element == CheckBox()
# --
with pytest.raises(StopIteration):
next(sections)


def test_chunk_by_title():
Expand Down Expand Up @@ -654,3 +671,131 @@ def test_chunk_by_title_drops_extra_metadata():
)

assert str(chunks[1]) == str(CompositeElement("An Okay Day\n\nToday is an okay day."))


def test_it_considers_separator_length_when_sectioning():
"""Sectioner includes length of separators when computing remaining space."""
elements: List[Element] = [
Title("Chunking Priorities"), # 19 chars
ListItem("Divide text into manageable chunks"), # 34 chars
ListItem("Preserve semantic boundaries"), # 28 chars
ListItem("Minimize mid-text chunk-splitting"), # 33 chars
] # 114 chars total but 120 chars with separators

chunks = chunk_by_title(elements, max_characters=115)

assert chunks == [
CompositeElement(
"Chunking Priorities"
"\n\nDivide text into manageable chunks"
"\n\nPreserve semantic boundaries"
),
CompositeElement("Minimize mid-text chunk-splitting"),
]


# == Sections ====================================================================================


class Describe_NonTextSection:
"""Unit-test suite for `unstructured.chunking.title._NonTextSection objects."""

def it_provides_access_to_its_element(self):
checkbox = CheckBox()
section = _NonTextSection(checkbox)
assert section.element is checkbox


class Describe_TableSection:
"""Unit-test suite for `unstructured.chunking.title._TableSection objects."""

def it_provides_access_to_its_table(self):
table = Table("<table></table>")
section = _TableSection(table)
assert section.table is table


class Describe_TextSection:
"""Unit-test suite for `unstructured.chunking.title._TextSection objects."""

def it_provides_access_to_its_elements(self):
elements: List[Element] = [
Title("Introduction"),
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
"lectus porta volutpat.",
),
]
section = _TextSection(elements)
assert section.elements == elements


class Describe_TextSectionBuilder:
"""Unit-test suite for `unstructured.chunking.title._TextSection objects."""

def it_is_empty_on_construction(self):
builder = _TextSectionBuilder(maxlen=50)

assert builder.text_length == 0
assert builder.remaining_space == 50

def it_accumulates_elements_added_to_it(self):
builder = _TextSectionBuilder(maxlen=150)

builder.add_element(Title("Introduction"))
assert builder.text_length == 12
assert builder.remaining_space == 136

builder.add_element(
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
"lectus porta volutpat."
)
)
assert builder.text_length == 112
assert builder.remaining_space == 36

def it_generates_a_TextSection_when_flushed_and_resets_itself_to_empty(self):
builder = _TextSectionBuilder(maxlen=150)
builder.add_element(Title("Introduction"))
builder.add_element(
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
"lectus porta volutpat."
)
)

section = next(builder.flush())

assert isinstance(section, _TextSection)
assert section.elements == [
Title("Introduction"),
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
"lectus porta volutpat.",
),
]
assert builder.text_length == 0
assert builder.remaining_space == 150

def but_it_does_not_generate_a_TextSection_on_flush_when_empty(self):
builder = _TextSectionBuilder(maxlen=150)

sections = list(builder.flush())

assert sections == []
assert builder.text_length == 0
assert builder.remaining_space == 150

def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
builder = _TextSectionBuilder(maxlen=50)
builder.add_element(Text("abcde"))
builder.add_element(Text("fghij"))

# -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
# -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
assert builder.text_length == 12
# -- .remaining_space is reduced by the length (2) of the trailing separator which would go
# -- between the current text and that of the next element if one was added.
# -- So 50 - 12 - 2 = 36 here, not 50 - 12 = 38
assert builder.remaining_space == 36
Loading

0 comments on commit d8542d6

Please sign in to comment.