Skip to content

Commit

Permalink
fix: remove unused ElementMetadata.section
Browse files Browse the repository at this point in the history
The `.section` field in `ElementMetadata` is dead code, left over from
a prior attempt at partitioning EPUB documents. It is no longer used.
Remove it and any code that uses it.
  • Loading branch information
scanny committed Apr 22, 2024
1 parent 305247b commit 0145311
Show file tree
Hide file tree
Showing 7 changed files with 3 additions and 159 deletions.
7 changes: 0 additions & 7 deletions docs/source/core/chunking.rst
Original file line number Diff line number Diff line change
Expand Up @@ -152,13 +152,6 @@ following behaviors:
``Title`` element would fit in the prior chunk. This implements the first aspect of the "preserve
section boundaries" contract.

- **Detect metadata.section change.** An element with a new value in ``element.metadata.section`` is
considered to start a new section. When a change in this value is encountered a new chunk is
started. This implements the second aspect of preserving section boundaries. This metadata is not
present in all document formats so is not used alone. An element having ``None`` for this metadata
field is considered to be part of the prior section; a section break is only detected on an
explicit change in value.

- **Respect page boundaries.** Page boundaries can optionally also be respected using the
``multipage_sections`` argument. This defaults to ``True`` meaning that a page break does *not*
start a new chunk. Setting this to ``False`` will separate elements that occur on different pages
Expand Down
63 changes: 0 additions & 63 deletions test_unstructured/chunking/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
TextPreChunk,
TextPreChunkAccumulator,
_TextSplitter,
is_in_next_section,
is_on_next_page,
is_title,
)
Expand Down Expand Up @@ -1514,68 +1513,6 @@ def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
# ================================================================================================


class Describe_is_in_next_section:
    """Unit-test suite for `unstructured.chunking.base.is_in_next_section()` function.

    `is_in_next_section()` is a predicate *factory*: each call produces a fresh
    `Callable[[Element], bool]` that is then applied to successive elements of one element
    stream to detect section changes.
    """

    def it_is_false_for_the_first_element_when_it_has_a_non_None_section(self):
        """A named first section opens the document; it is not a section break."""
        is_new_section = is_in_next_section()
        first = Text("abcd", metadata=ElementMetadata(section="Introduction"))
        assert not is_new_section(first)

    def and_it_is_false_for_the_first_element_when_it_has_a_None_section(self):
        """An anonymous first section is not a section break either."""
        is_new_section = is_in_next_section()
        first = Text("abcd")
        assert not is_new_section(first)

    def it_is_false_for_None_section_elements_that_follow_an_explicit_first_section(self):
        """Elements with a `None` section continue the section that precedes them."""
        is_new_section = is_in_next_section()
        assert not is_new_section(Text("abcd", metadata=ElementMetadata(section="Introduction")))
        for text in ("efgh", "ijkl"):
            assert not is_new_section(Text(text))

    def and_it_is_false_for_None_section_elements_that_follow_an_anonymous_first_section(self):
        """Elements with a `None` section continue the section that precedes them."""
        is_new_section = is_in_next_section()
        for text in ("abcd", "efgh", "ijkl"):
            assert not is_new_section(Text(text))

    def it_is_false_for_matching_section_elements_that_follow_an_explicit_first_section(self):
        is_new_section = is_in_next_section()
        for text in ("abcd", "efgh", "ijkl"):
            element = Text(text, metadata=ElementMetadata(section="Introduction"))
            assert not is_new_section(element)

    def it_is_true_for_an_explicit_section_element_that_follows_an_anonymous_first_section(self):
        is_new_section = is_in_next_section()
        for text in ("abcd", "efgh"):
            assert not is_new_section(Text(text))
        assert is_new_section(Text("ijkl", metadata=ElementMetadata(section="Introduction")))

    def and_it_is_true_for_a_different_explicit_section_that_follows_an_explicit_section(self):
        is_new_section = is_in_next_section()
        introduction = Text("abcd", metadata=ElementMetadata(section="Introduction"))
        summary = Text("efgh", metadata=ElementMetadata(section="Summary"))
        assert not is_new_section(introduction)
        assert is_new_section(summary)

    def it_is_true_whenever_the_section_explicitly_changes_except_at_the_start(self):
        is_new_section = is_in_next_section()
        # -- (text, section, expected-to-be-a-section-break), in stream order --
        scenario = [
            ("abcd", None, False),
            ("efgh", "Introduction", True),
            ("ijkl", None, False),
            ("mnop", "Introduction", False),
            ("qrst", None, False),
            ("uvwx", "Summary", True),
            ("yzab", "Summary", False),
            ("cdef", None, False),
            ("ghij", "Appendix", True),
        ]
        for text, section, is_break in scenario:
            # -- an element without a section is built with no metadata argument at all --
            if section is None:
                element = Text(text)
            else:
                element = Text(text, metadata=ElementMetadata(section=section))

            if is_break:
                assert is_new_section(element)
            else:
                assert not is_new_section(element)


class Describe_is_on_next_page:
"""Unit-test suite for `unstructured.chunking.base.is_on_next_page()` function.
Expand Down
37 changes: 0 additions & 37 deletions test_unstructured/chunking/test_title.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,43 +139,6 @@ def test_chunk_by_title():
)


def test_chunk_by_title_respects_section_change():
    """A change in `metadata.section` starts a new chunk even without an intervening `Title`."""
    elements: list[Element] = [
        # -- "first" section holds only the title; the "second" section follows immediately --
        Title("A Great Day", metadata=ElementMetadata(section="first")),
        Text("Today is a great day.", metadata=ElementMetadata(section="second")),
        Text("It is sunny outside.", metadata=ElementMetadata(section="second")),
        Table("Heading\nCell text"),
        # -- remaining elements carry no section metadata --
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
        Title("A Bad Day"),
        Text(
            "Today is a bad day.",
            metadata=ElementMetadata(
                regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
            ),
        ),
        Text("It is storming outside."),
        CheckBox(),
    ]

    chunks = chunk_by_title(elements, combine_text_under_n_chars=0)

    expected_chunks = [
        CompositeElement("A Great Day"),
        CompositeElement("Today is a great day.\n\nIt is sunny outside."),
        Table("Heading\nCell text"),
        CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
        CompositeElement("A Bad Day\n\nToday is a bad day.\n\nIt is storming outside."),
    ]
    assert chunks == expected_chunks


def test_chunk_by_title_separates_by_page_number():
elements: list[Element] = [
Title("A Great Day", metadata=ElementMetadata(page_number=1)),
Expand Down
2 changes: 0 additions & 2 deletions test_unstructured/partition/epub/test_epub.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,6 @@ def test_partition_epub_from_filename_exclude_metadata():
assert elements[0].metadata.filetype is None
assert elements[0].metadata.page_name is None
assert elements[0].metadata.filename is None
assert elements[0].metadata.section is None


def test_partition_epub_from_file_exlcude_metadata():
Expand All @@ -87,7 +86,6 @@ def test_partition_epub_from_file_exlcude_metadata():
assert elements[0].metadata.filetype is None
assert elements[0].metadata.page_name is None
assert elements[0].metadata.filename is None
assert elements[0].metadata.section is None


def test_partition_epub_metadata_date(
Expand Down
45 changes: 0 additions & 45 deletions unstructured/chunking/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1022,51 +1022,6 @@ def will_fit(self, pre_chunk: TextPreChunk) -> bool:
# ================================================================================================


def is_in_next_section() -> BoundaryPredicate:
    """Build and return a stateful predicate that fires on each explicit section change.

    This function is a factory, not a predicate. The callable it returns remembers the section
    it is currently in, so it must only be used for a single element-stream; create a new one
    for each stream.

    A "section" here is the value of `element.metadata.section` (so far particular to the EPUB
    format). It is unrelated to the kind of "section" formed by a section-heading (`Title`
    element) and the content elements that follow it.

    The returned predicate returns `True` only when called with an element whose
    `metadata.section` is non-`None` and differs from the remembered section, and it then
    remembers that new section. The first element it is called with never triggers; its section
    (possibly `None`) simply becomes the remembered one.
    """
    seen_first_element = False
    active_section: Optional[str] = None

    def section_changed(element: Element) -> bool:
        nonlocal seen_first_element, active_section

        candidate = element.metadata.section

        # -- The very first element opens the first section, named or anonymous, and so can
        # -- never be a section *break*. Just record its section and move on.
        if not seen_first_element:
            seen_first_element = True
            active_section = candidate
            return False

        # -- A `None` section continues whatever section is active (the active section is
        # -- "sticky"), as does a section equal to the active one. Only an explicit, different
        # -- section value is a boundary.
        starts_new_section = candidate is not None and candidate != active_section
        if starts_new_section:
            active_section = candidate
        return starts_new_section

    return section_changed


def is_on_next_page() -> BoundaryPredicate:
"""Not a predicate itself, calling this returns a predicate that triggers on each new page.
Expand Down
2 changes: 0 additions & 2 deletions unstructured/chunking/title.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
ChunkingOptions,
PreChunkCombiner,
PreChunker,
is_in_next_section,
is_on_next_page,
is_title,
)
Expand Down Expand Up @@ -121,7 +120,6 @@ def boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:

def iter_boundary_predicates() -> Iterator[BoundaryPredicate]:
yield is_title
yield is_in_next_section()
if not self.multipage_sections:
yield is_on_next_page()

Expand Down
6 changes: 3 additions & 3 deletions unstructured/documents/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,6 @@ class ElementMetadata:
parent_id: Optional[str]
# -- "fields" e.g. status, dept.no, etc. extracted from text via regex --
regex_metadata: Optional[dict[str, list[RegexMetadata]]]
# -- EPUB document section --
section: Optional[str]

# -- e-mail specific metadata fields --
sent_from: Optional[list[str]]
Expand Down Expand Up @@ -694,7 +692,9 @@ def __init__(
metadata: Optional[ElementMetadata] = None,
detection_origin: Optional[str] = None,
):
if element_id is not None and not isinstance(element_id, str):
if element_id is not None and not isinstance(
element_id, str
): # pyright: ignore[reportUnnecessaryIsInstance]
raise ValueError("element_id must be of type str or None.")

self._element_id = element_id
Expand Down

0 comments on commit 0145311

Please sign in to comment.