Skip to content

Commit

Permalink
rfctr: extract ._iter_maybe_paragraph_page_breaks()
Browse files Browse the repository at this point in the history
The page-break detection approach still needs some refinement, but this
extraction localizes any future changes to the new method.
  • Loading branch information
scanny committed Sep 19, 2023
1 parent d5c0861 commit 51690b4
Showing 1 changed file with 30 additions and 3 deletions.
33 changes: 30 additions & 3 deletions unstructured/partition/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,11 +249,13 @@ def _iter_document_elements(self) -> Iterator[Element]:
yield from self._iter_section_headers(section)

for block_item in _SectBlockItemIterator.iter_sect_block_items(section, self._document):
# -- a block-item can only be a Paragraph ... --
if isinstance(block_item, Paragraph):
yield from self._iter_paragraph_elements(block_item)
if self._element_contains_pagebreak(block_item._element):
yield from self._increment_page_number()
else: # -- it's a Table object --
# -- a paragraph can contain a page-break --
yield from self._iter_maybe_paragraph_page_breaks(block_item)
# -- ... or a Table --
else:
yield from self._iter_table_element(block_item)

yield from self._iter_section_footers(section)
Expand Down Expand Up @@ -346,6 +348,31 @@ def _iter_paragraph_elements(self, paragraph: Paragraph) -> Iterator[Element]:
# -- if all that fails we give it the default `Text` element-type --
yield Text(text, metadata=metadata)

def _iter_maybe_paragraph_page_breaks(self, paragraph: Paragraph) -> Iterator[PageBreak]:
    """Generate page-break output when `paragraph` contains a page-break.

    Checks for both "hard" page breaks (page breaks explicitly inserted by the user)
    and "soft" page breaks, which are sometimes inserted by the MS Word renderer.
    Note that soft page breaks aren't always present. Whether or not pages are
    tracked may depend on your Word renderer.

    The current implementation delegates to `self._increment_page_number()` at most
    once per paragraph, no matter how many page-break markers the paragraph contains.

    NOTE: detection is a plain substring test against the paragraph XML, so it can
    report a false positive, e.g. when "w:br" and 'type="page"' both occur in the
    XML but on different elements, or when a marker string occurs in the text.
    """
    # -- read the paragraph XML once; every indicator below is matched against it --
    paragraph_xml = paragraph._p.xml

    # -- each group matches only when ALL of its markers occur in the XML --
    page_break_indicators = (
        ("w:br", 'type="page"'),  # -- "hard" page break inserted by user --
        ("lastRenderedPageBreak",),  # -- "soft" page break inserted by renderer --
    )

    def has_page_break_implementation_we_have_so_far() -> bool:
        """True when any indicator group matches. Needs to become more sophisticated."""
        return any(
            all(indicator in paragraph_xml for indicator in indicators)
            for indicators in page_break_indicators
        )

    # -- no page-break marker found, so generate nothing --
    if not has_page_break_implementation_we_have_so_far():
        return

    yield from self._increment_page_number()

def _iter_paragraph_emphasis(self, paragraph: Paragraph) -> Iterator[Dict[str, str]]:
"""Generate e.g. {"text": "MUST", "tag": "b"} for each emphasis in `paragraph`."""
for run in paragraph.runs:
Expand Down

0 comments on commit 51690b4

Please sign in to comment.