def _iter_maybe_paragraph_page_breaks(self, paragraph: Paragraph) -> Iterator[PageBreak]:
    """Generate a `PageBreak` document element when `paragraph` contains a page-break.

    Checks for both "hard" page breaks (page breaks explicitly inserted by the user)
    and "soft" page breaks, which are sometimes inserted by the MS Word renderer.
    Note that soft page breaks aren't always present. Whether or not pages are
    tracked may depend on your Word renderer.

    Detection is a substring search of the paragraph XML, so a paragraph is treated as
    containing either zero or one page-break: `._increment_page_number()` is delegated to
    at most once per paragraph, no matter how many break markers appear in it.
    """
    # -- each entry lists the substrings that must ALL appear in the paragraph XML for
    # -- that kind of page-break to be considered present
    page_break_indicators = (
        ("w:br", 'type="page"'),  # -- "hard" page break inserted by user --
        ("lastRenderedPageBreak",),  # -- "soft" page break inserted by renderer --
    )

    def has_page_break_implementation_we_have_so_far() -> bool:
        """Needs to become more sophisticated."""
        # -- read `.xml` once rather than once per indicator; the value does not change
        # -- during the search (presumably it is re-serialized on each access -- confirm
        # -- against python-docx)
        paragraph_xml = paragraph._p.xml
        return any(
            all(indicator in paragraph_xml for indicator in indicators)
            for indicators in page_break_indicators
        )

    if not has_page_break_implementation_we_have_so_far():
        return

    yield from self._increment_page_number()