Skip to content

Commit

Permalink
fix: chunk_by_title() interface is rude
Browse files Browse the repository at this point in the history
  • Loading branch information
scanny committed Oct 23, 2023
1 parent a2af72b commit 27035d8
Show file tree
Hide file tree
Showing 2 changed files with 135 additions and 52 deletions.
133 changes: 95 additions & 38 deletions test_unstructured/chunking/test_title.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,100 @@
)
from unstructured.partition.html import partition_html

# == chunk_by_title() validation behaviors =======================================================


@pytest.mark.parametrize("max_characters", [0, -1, -42])
def test_it_rejects_max_characters_not_greater_than_zero(max_characters: int):
elements: List[Element] = [Text("Lorem ipsum dolor.")]

with pytest.raises(
ValueError, match=f"'max_characters' argument must be > 0, got {max_characters}"
):
chunk_by_title(elements, max_characters=max_characters)


def test_it_does_not_complain_when_specifying_max_characters_by_itself():
"""Caller can specify `max_characters` arg without specifying any others.
In particular, When `combine_text_under_n_chars` is not specified it defaults to the value of
`max_characters`; it has no fixed default value that can be greater than `max_characters` and
trigger an exception.
"""
elements: List[Element] = [Text("Lorem ipsum dolor.")]

try:
chunk_by_title(elements, max_characters=50)
except Exception:
pytest.fail("did not accept `max_characters` as option by itself")


@pytest.mark.parametrize("n_chars", [-1, -42])
def test_it_rejects_combine_text_under_n_chars_for_n_less_than_zero(n_chars: int):
elements: List[Element] = [Text("Lorem ipsum dolor.")]

with pytest.raises(
ValueError, match=f"'combine_text_under_n_chars' argument must be >= 0, got {n_chars}"
):
chunk_by_title(elements, combine_text_under_n_chars=n_chars)


def test_it_accepts_0_for_combine_text_under_n_chars_to_disable_chunk_combining():
"""Specifying `combine_text_under_n_chars=0` is how a caller disables chunk-combining."""
elements: List[Element] = [Text("Lorem ipsum dolor.")]

chunks = chunk_by_title(elements, max_characters=50, combine_text_under_n_chars=0)

assert chunks == [CompositeElement("Lorem ipsum dolor.")]


@pytest.mark.parametrize("n_chars", [-1, -42])
def test_it_rejects_new_after_n_chars_for_n_less_than_zero(n_chars: int):
elements: List[Element] = [Text("Lorem ipsum dolor.")]

with pytest.raises(
ValueError, match=f"'new_after_n_chars' argument must be >= 0, got {n_chars}"
):
chunk_by_title(elements, new_after_n_chars=n_chars)


def test_it_does_not_complain_when_specifying_new_after_n_chars_by_itself():
"""Caller can specify `new_after_n_chars` arg without specifying any other options.
In particular, `combine_text_under_n_chars` value is adjusted down to the `new_after_n_chars`
value when the default for `combine_text_under_n_chars` exceeds the value of
`new_after_n_chars`.
"""
elements: List[Element] = [Text("Lorem ipsum dolor.")]

try:
chunk_by_title(elements, new_after_n_chars=50)
except Exception:
pytest.fail("did not accept `new_after_n_chars` as option by itself")


def test_it_accepts_0_for_new_after_n_chars_to_put_each_element_into_its_own_chunk():
"""Specifying `new_after_n_chars=0` places each element into its own section.
This puts each element into its own chunk, although long chunks are still split.
"""
elements: List[Element] = [
Text("Lorem"),
Text("ipsum"),
Text("dolor"),
]

chunks = chunk_by_title(elements, max_characters=50, new_after_n_chars=0)

assert chunks == [
CompositeElement("Lorem"),
CompositeElement("ipsum"),
CompositeElement("dolor"),
]


# ================================================================================================


def test_it_splits_a_large_section_into_multiple_chunks():
elements: List[Element] = [
Expand All @@ -32,7 +126,7 @@ def test_it_splits_a_large_section_into_multiple_chunks():
),
]

chunks = chunk_by_title(elements, combine_text_under_n_chars=50, max_characters=50)
chunks = chunk_by_title(elements, max_characters=50)

assert chunks == [
CompositeElement("Introduction"),
Expand Down Expand Up @@ -395,43 +489,6 @@ def test_add_chunking_strategy_on_partition_html_respects_multipage():
)


@pytest.mark.parametrize(
("combine_text_under_n_chars", "new_after_n_chars", "max_characters"),
[
(-1, -1, -1), # invalid chunk size
(0, 0, 0), # invalid max_characters
(-5666, -6777, -8999), # invalid chunk size
(-5, 40, 50), # invalid chunk size
(50, 70, 20), # max_characters needs to be greater than new_after_n_chars
(70, 50, 50), # combine_text_under_n_chars needs to be les than new_after_n_chars
],
)
def test_add_chunking_strategy_raises_error_for_invalid_n_chars(
combine_text_under_n_chars: int,
new_after_n_chars: int,
max_characters: int,
):
elements: List[Element] = [
Title("A Great Day"),
Text("Today is a great day."),
Text("It is sunny outside."),
Table("<table></table>"),
Title("An Okay Day"),
Text("Today is an okay day."),
Text("It is rainy outside."),
Title("A Bad Day"),
Text("It is storming outside."),
CheckBox(),
]
with pytest.raises(ValueError):
chunk_by_title(
elements,
combine_text_under_n_chars=combine_text_under_n_chars,
new_after_n_chars=new_after_n_chars,
max_characters=max_characters,
)


def test_chunk_by_title_drops_detection_class_prob():
elements: List[Element] = [
Title(
Expand Down
54 changes: 40 additions & 14 deletions unstructured/chunking/title.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ def chunk_table_element(element: Table, max_characters: int = 500) -> List[Table
def chunk_by_title(
elements: List[Element],
multipage_sections: bool = True,
combine_text_under_n_chars: int = 500,
new_after_n_chars: int = 500,
combine_text_under_n_chars: int | None = None,
new_after_n_chars: int | None = None,
max_characters: int = 500,
) -> List[Element]:
"""Uses title elements to identify sections within the document for chunking.
Expand All @@ -73,27 +73,53 @@ def chunk_by_title(
multipage_sections
If True, sections can span multiple pages. Defaults to True.
combine_text_under_n_chars
Combines elements (for example a series of titles) until a section reaches
a length of n characters.
Combines elements (for example a series of titles) until a section reaches a length of
n characters. Defaults to `max_characters` which combines chunks whenever space allows.
Specifying 0 for this argument suppresses combining of small chunks. Note this value is
"capped" at the `new_after_n_chars` value since a value higher than that would not change
this parameter's effect.
new_after_n_chars
Cuts off new sections once they reach a length of n characters (soft max)
Cuts off new sections once they reach a length of n characters (soft max). Defaults to
`max_characters` when not specified, which effectively disables any soft window.
Specifying 0 for this argument causes each element to appear in a chunk by itself (although
an element with text longer than `max_characters` will be still be split into two or more
chunks).
max_characters
Chunks elements text and text_as_html (if present) into chunks of length
n characters (hard max)
"""

if not (
max_characters > 0
and combine_text_under_n_chars >= 0
and new_after_n_chars >= 0
and combine_text_under_n_chars <= new_after_n_chars
and combine_text_under_n_chars <= max_characters
):
# -- validation and arg pre-processing ---------------------------

# -- chunking window must have positive length --
if max_characters <= 0:
raise ValueError(f"'max_characters' argument must be > 0, got {max_characters}")

# -- `combine_text_under_n_chars` defaults to `max_characters` when not specified and is
# -- capped at max-chars
if combine_text_under_n_chars is None or combine_text_under_n_chars > max_characters:
combine_text_under_n_chars = max_characters

# -- `combine_text_under_n_chars == 0` is valid (suppresses chunk combination)
# -- but a negative value is not
if combine_text_under_n_chars < 0:
raise ValueError(
"Invalid values for combine_text_under_n_chars, "
"new_after_n_chars, and/or max_characters.",
f"'combine_text_under_n_chars' argument must be >= 0, got {combine_text_under_n_chars}"
)

# -- same with `new_after_n_chars` --
if new_after_n_chars is None or new_after_n_chars > max_characters:
new_after_n_chars = max_characters

if new_after_n_chars < 0:
raise ValueError(f"'new_after_n_chars' argument must be >= 0, got {new_after_n_chars}")

# -- `new_after_n_chars` takes precendence on conflict with `combine_text_under_n_chars` --
if combine_text_under_n_chars > new_after_n_chars:
combine_text_under_n_chars = new_after_n_chars

# ----------------------------------------------------------------

chunked_elements: List[Element] = []
sections = _split_elements_by_title_and_table(
elements,
Expand Down

0 comments on commit 27035d8

Please sign in to comment.