support office files

dataelement · Sep 17, 2023 · e680580 · e680580
2 parents c0118ee + 32a8b5e
commit e680580
Show file tree

Hide file tree

Showing 15 changed files with 322 additions and 33 deletions.
diff --git a/docker/prepare.sh b/docker/prepare.sh
@@ -31,7 +31,7 @@ function install_deps() {
 }
 
 function twine_upload() {
-  twine upload dist/* -u __token__ -p ${PYPI_PASSWORD} --repository pypi
+  twine upload dist/* -u __token__ -p ${PYPI_PASSWORD} --repository pypi --skip-existing
 }
 
 

diff --git a/examples/docs/maoxuan_sample.docx b/examples/docs/maoxuan_sample.docx
diff --git a/examples/docs/test.xlsx b/examples/docs/test.xlsx
diff --git a/src/bisheng_unstructured/documents/html_utils.py b/src/bisheng_unstructured/documents/html_utils.py
@@ -41,7 +41,8 @@ def visualize_html(elements, output_file):
             text = el.metadata.text_as_html
             text = text.replace("\n", " ")
         else:
-            text = f"<p {styles[idx % 2]}>{el.text}</p>"
+            text = el.text.replace("\n", "<br>")
+            text = f"<p {styles[idx % 2]}>{text}</p>"
             idx += 1
 
         if text:
@@ -57,19 +58,24 @@ def save_to_txt(elements, output_file):
     text_elem_sep = "\n"
     content_page = []
     is_first_elem = True
+    last_label = ""
     for el in elements:
         label, text = el.category, el.text
         if is_first_elem:
             f_text = text + "\n" if label == "Title" else text
             content_page.append(f_text)
             is_first_elem = False
         else:
-            if label == "Title":
+            if last_label == "Title" and label == "Title":
+                content_page.append("\n" + text + "\n")
+            elif label == "Title":
                 content_page.append("\n\n" + text + "\n")
             elif label == "Table":
                 content_page.append("\n\n" + text + "\n")
             else:
                 content_page.append(text_elem_sep + text)
 
+        last_label = label
+
     with open(output_file, "w") as fout:
         fout.write("".join(content_page))
diff --git a/src/bisheng_unstructured/nlp/patterns.py b/src/bisheng_unstructured/nlp/patterns.py
@@ -143,3 +143,6 @@
 # For zh variables
 ZH_PUNC_NOT_IN_TITLE_PATTERN = r"[，。、；！|,;!]"
 ZH_PUNC_NOT_IN_TITLE_RE = re.compile(ZH_PUNC_NOT_IN_TITLE_PATTERN)
+
+ZH_PUNC_NOT_IN_PPTX_TITLE_PATTERN = r"[。；！|;!]"
+ZH_PUNC_NOT_IN_PPTX_TITLE_RE = re.compile(ZH_PUNC_NOT_IN_PPTX_TITLE_PATTERN)
diff --git a/src/bisheng_unstructured/partition/docx.py b/src/bisheng_unstructured/partition/docx.py
@@ -8,6 +8,7 @@
 from docx.table import Table as DocxTable
 from docx.text.paragraph import Paragraph
 from docx.text.run import Run
+from lxml import etree
 
 from bisheng_unstructured.cleaners.core import clean_bullets
 from bisheng_unstructured.documents.elements import (
@@ -25,6 +26,7 @@
     Title,
     process_metadata,
 )
+from bisheng_unstructured.documents.markdown import transform_html_table_to_md
 from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from bisheng_unstructured.partition.common import (
     convert_ms_office_table_to_text,
@@ -140,6 +142,7 @@ def partition_docx(
 
     # Verify that only one of the arguments was provided
     exactly_one(filename=filename, file=file)
+    language = kwargs.get("language", "zh")
 
     last_modification_date = None
     if filename is not None:
@@ -168,14 +171,16 @@ def partition_docx(
     section = 0
     is_list = False
     for element_item in document.element.body:
+        # print("---element_item---", element_item, element_item.tag, element_item.xml)
         if element_item.tag.endswith("tbl"):
             table = document.tables[table_index]
             emphasized_texts = _get_emphasized_texts_from_table(table)
             emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags(
                 emphasized_texts,
             )
             html_table = convert_ms_office_table_to_text(table, as_html=True)
-            text_table = convert_ms_office_table_to_text(table, as_html=False)
+            # text_table = convert_ms_office_table_to_text(table, as_html=False)
+            text_table = transform_html_table_to_md(html_table)["text"]
             element = Table(text_table)
             if element is not None:
                 element.metadata = ElementMetadata(
@@ -196,7 +201,7 @@ def partition_docx(
             emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags(
                 emphasized_texts,
             )
-            para_element: Optional[Text] = _paragraph_to_element(paragraph, is_list)
+            para_element: Optional[Text] = _paragraph_to_element(paragraph, is_list, language)
             if para_element is not None:
                 para_element.metadata = ElementMetadata(
                     filename=metadata_filename,
@@ -207,6 +212,14 @@ def partition_docx(
                 )
                 elements.append(para_element)
             is_list = False
+            # print(
+            #     "---p---",
+            #     emphasized_texts,
+            #     emphasized_text_contents,
+            #     emphasized_text_tags,
+            #     para_element,
+            #     paragraph.style,
+            # )
         elif element_item.tag.endswith("sectPr"):
             if len(headers_and_footers) > section:
                 footers = headers_and_footers[section][1]
@@ -226,26 +239,30 @@ def partition_docx(
 
 
 def _paragraph_to_element(
-    paragraph: docx.text.paragraph.Paragraph,
-    is_list=False,
+    paragraph: docx.text.paragraph.Paragraph, is_list=False, language="eng"
 ) -> Optional[Text]:
     """Converts a docx Paragraph object into the appropriate unstructured document element.
     If the paragraph style is "Normal" or unknown, we try to predict the element type from the
     raw text."""
     text = paragraph.text
+    # normalize the text
+    text = text.strip("\n")
     style_name = paragraph.style and paragraph.style.name  # .style can be None
 
     if len(text.strip()) == 0:
         return None
 
+    if "Heading" in paragraph.style.name:
+        return Title(text)
+
     element_class = STYLE_TO_ELEMENT_MAPPING.get(style_name)
 
     # NOTE(robinson) - The "Normal" style name will return None since it's in the mapping.
     # Unknown style names will also return None
     if is_list:
-        return _text_to_element(text, is_list)
+        return _text_to_element(text, is_list, language=language)
     elif element_class is None:
-        return _text_to_element(text)
+        return _text_to_element(text, language=language)
     else:
         return element_class(text)
 
@@ -266,7 +283,7 @@ def _element_contains_pagebreak(element) -> bool:
     return False
 
 
-def _text_to_element(text: str, is_list=False) -> Optional[Text]:
+def _text_to_element(text: str, is_list=False, language="eng") -> Optional[Text]:
     """Converts raw text into an unstructured Text element."""
     if is_bulleted_text(text) or is_list:
         clean_text = clean_bullets(text).strip()
@@ -280,8 +297,8 @@ def _text_to_element(text: str, is_list=False) -> Optional[Text]:
         return None
     elif is_possible_narrative_text(text):
         return NarrativeText(text)
-    elif is_possible_title(text):
-        return Title(text)
+    # elif is_possible_title(text, title_max_word_length=20, language=language):
+    #     return Title(text)
     else:
         return Text(text)
 

diff --git a/src/bisheng_unstructured/partition/pptx.py b/src/bisheng_unstructured/partition/pptx.py
@@ -1,3 +1,5 @@
+import json
+import re
 from tempfile import SpooledTemporaryFile
 from typing import IO, BinaryIO, List, Optional, Union, cast
 
@@ -15,6 +17,7 @@
     Title,
     process_metadata,
 )
+from bisheng_unstructured.documents.markdown import transform_html_table_to_md
 from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from bisheng_unstructured.partition.common import (
     convert_ms_office_table_to_text,
@@ -31,6 +34,10 @@
 
 OPENXML_SCHEMA_NAME = "{http://schemas.openxmlformats.org/drawingml/2006/main}"
 
+RE_MULTLINES = re.compile(pattern=r"\n+", flags=re.DOTALL)
+RE_SPACES = re.compile(pattern=r"[ \t\r\f\v]+", flags=re.DOTALL)
+RE_NORMAL_SPACES = re.compile(pattern=r"\s+", flags=re.DOTALL)
+
 
 @process_metadata()
 @add_metadata_with_filetype(FileType.PPTX)
@@ -85,6 +92,10 @@ def partition_pptx(
     elements: List[Element] = []
     metadata = ElementMetadata(filename=metadata_filename or filename)
     num_slides = len(presentation.slides)
+    slide_height = presentation.slide_height
+    slide_width = presentation.slide_width
+    page_bbox = [slide_width, slide_height]
+    sel_i = 30
     for i, slide in enumerate(presentation.slides):
         metadata = ElementMetadata.from_dict(metadata.to_dict())
         metadata.last_modified = metadata_last_modified or last_modification_date
@@ -97,11 +108,16 @@ def partition_pptx(
                 if notes_text.strip() != "":
                     elements.append(NarrativeText(text=notes_text, metadata=metadata))
 
+        shape_infos = []
+        shape_index = -1
         for shape in _order_shapes(slide.shapes):
+            shape_index += 1
             if shape.has_table:
                 table: pptx.table.Table = shape.table
                 html_table = convert_ms_office_table_to_text(table, as_html=True)
-                text_table = convert_ms_office_table_to_text(table, as_html=False)
+                # text_table = convert_ms_office_table_to_text(table, as_html=False)
+                text_table = transform_html_table_to_md(html_table)["text"]
+                # print('---table---', html_table, text_table)
                 if (text_table := text_table.strip()) != "":
                     metadata = ElementMetadata(
                         filename=metadata_filename or filename,
@@ -113,24 +129,46 @@ def partition_pptx(
                 continue
             if not shape.has_text_frame:
                 continue
-            # NOTE(robinson) - avoid processing shapes that are not on the actual slide
-            # NOTE - skip check if no top or left position (shape displayed top left)
-            if (shape.top and shape.left) and (shape.top < 0 or shape.left < 0):
+
+            bbox = [shape.left, shape.top, shape.width, shape.height]
+            shape_info = []
+            shape_infos.append({"runs": shape_info, "bbox": bbox})
+            metadata = {"bbox": bbox, "page_bbox": page_bbox}
+            metadata = ElementMetadata(
+                page_number=i, text_as_html=json.dumps(metadata), page_name="paragraph"
+            )
+
+            TITLE_AREA_THRESHOLD = 0.2
+            ratio = abs(bbox[3] - bbox[1]) * 1.0 / page_bbox[1]
+            # print('bbox', bbox, page_bbox, ratio)
+
+            is_title = False
+            text = None
+            if shape_index == 0 and ratio <= TITLE_AREA_THRESHOLD:
+                text = re.sub(RE_NORMAL_SPACES, " ", shape.text_frame.text)
+                is_title = is_possible_title(
+                    text, language="zh", title_max_word_length=30, is_pptx=True
+                )
+
+            if not is_title:
+                text = shape.text_frame.text.replace("\x0b", "\n")
+                text = re.sub(RE_MULTLINES, "\n", text).strip()
+                text = re.sub(RE_SPACES, " ", text)
+
+            if text == "":
                 continue
-            for paragraph in shape.text_frame.paragraphs:
-                text = paragraph.text
-                if text.strip() == "":
-                    continue
-                if _is_bulleted_paragraph(paragraph):
-                    elements.append(ListItem(text=text, metadata=metadata))
-                elif is_email_address(text):
-                    elements.append(EmailAddress(text=text))
-                elif is_possible_narrative_text(text):
-                    elements.append(NarrativeText(text=text, metadata=metadata))
-                elif is_possible_title(text):
-                    elements.append(Title(text=text, metadata=metadata))
-                else:
-                    elements.append(Text(text=text, metadata=metadata))
+
+            # for paragraph in shape.text_frame.paragraphs:
+            #     print('is_bulleted', _is_bulleted_paragraph(paragraph))
+
+            if is_email_address(text):
+                elements.append(EmailAddress(text=text))
+            elif is_possible_narrative_text(text):
+                elements.append(NarrativeText(text=text, metadata=metadata))
+            elif is_title:
+                elements.append(Title(text=text, metadata=metadata))
+            else:
+                elements.append(Text(text=text, metadata=metadata))
 
         if include_page_breaks and i < num_slides - 1:
             elements.append(PageBreak(text=""))

diff --git a/src/bisheng_unstructured/partition/text_type.py b/src/bisheng_unstructured/partition/text_type.py
@@ -18,6 +18,7 @@
     UNICODE_BULLETS_RE,
     US_CITY_STATE_ZIP_RE,
     US_PHONE_NUMBERS_RE,
+    ZH_PUNC_NOT_IN_PPTX_TITLE_RE,
     ZH_PUNC_NOT_IN_TITLE_RE,
 )
 from bisheng_unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
@@ -99,6 +100,7 @@ def is_possible_title(
     non_alpha_threshold: float = 0.5,
     language: str = "en",
     language_checks: bool = False,
+    is_pptx: bool = False,
 ) -> bool:
     """Checks to see if the text passes all of the checks for a valid title.
 
@@ -127,7 +129,8 @@ def is_possible_title(
         return False
 
     if language == "zh":
-        if ZH_PUNC_NOT_IN_TITLE_RE.search(text) is not None:
+        PUNK_RE = ZH_PUNC_NOT_IN_PPTX_TITLE_RE if is_pptx else ZH_PUNC_NOT_IN_TITLE_RE
+        if PUNK_RE.search(text) is not None:
             return False
 
         title_max_word_length = int(

diff --git a/src/bisheng_unstructured/partition/xlsx.py b/src/bisheng_unstructured/partition/xlsx.py
@@ -10,6 +10,7 @@
     Table,
     process_metadata,
 )
+from bisheng_unstructured.documents.markdown import transform_html_table_to_md
 from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from bisheng_unstructured.partition.common import (
     exactly_one,
@@ -63,7 +64,9 @@ def partition_xlsx(
     for sheet_name, table in sheets.items():
         page_number += 1
         html_text = table.to_html(index=False, header=include_header, na_rep="")
-        text = soupparser_fromstring(html_text).text_content()
+        text = transform_html_table_to_md(html_text)["text"]
+
+        # text = soupparser_fromstring(html_text).text_content()
 
         if include_metadata:
             metadata = ElementMetadata(

diff --git a/tests/test_doc.py b/tests/test_doc.py
@@ -0,0 +1,53 @@
+from bisheng_unstructured.documents.html_utils import save_to_txt, visualize_html
+from bisheng_unstructured.partition.docx import partition_docx
+
+
+def test_docx():
+    filename = "./examples/docs/maoxuan_sample.docx"
+    elements = partition_docx(filename=filename)
+
+    output_file = "./data/maoxuan_sample_docx.html"
+    output_file2 = "./data/maoxuan_sample_docx.txt"
+    visualize_html(elements, output_file)
+    save_to_txt(elements, output_file2)
+
+
+def test_docx2():
+    filename = "./examples/docs/handbook-1p.docx"
+    elements = partition_docx(filename=filename)
+
+    output_file = "./data/handbook-1p.html"
+    visualize_html(elements, output_file)
+
+
+def test_docx3():
+    import docx
+
+    filename = "./examples/docs/handbook-1p.docx"
+    output = "./examples/docs/handbook-1p.pdf"
+
+    # Open the .docx file
+    doc = docx.Document(filename)
+    # Save the file as pdf
+    doc.save(output)
+
+
+def test4():
+    inp = "./examples/docs/handbook-1p.docx"
+    outp = "./examples/docs/handbook-1p.pdf"
+
+    import pypandoc
+
+    pypandoc.convert_file(inp, "pdf", outputfile=outp)
+
+
+def test5():
+    inp = "./examples/docs/maoxuan_sample.docx"
+    outp = "./data/maoxuan_sample.pdf"
+
+
+test_docx()
+# test_docx2()
+# test_docx3()
+# test4()
+# test5()
-Original file line number
+Diff line change
@@ Expand Up / @@ -31,7 +31,7 @@ function install_deps() { @@
     }
     function twine_upload() {
-      twine upload dist/* -u __token__ -p ${PYPI_PASSWORD} --repository pypi
+      twine upload dist/* -u __token__ -p ${PYPI_PASSWORD} --repository pypi --skip-existing
     }
@@ Expand Down @@