Skip to content

Commit

Permalink
support office files
Browse files Browse the repository at this point in the history
support office files
  • Loading branch information
hrfng authored Sep 17, 2023
2 parents c0118ee + 32a8b5e commit e680580
Show file tree
Hide file tree
Showing 15 changed files with 322 additions and 33 deletions.
2 changes: 1 addition & 1 deletion docker/prepare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ function install_deps() {
}

# Upload every built distribution under dist/ to PyPI with token authentication.
# Reads the API token from the PYPI_PASSWORD environment variable.
function twine_upload() {
  # The token is quoted so that whitespace or glob characters in it are never
  # word-split or expanded by the shell; dist/* stays unquoted on purpose so
  # the shell expands it to the built files.
  # --skip-existing lets a re-run continue past files already on the index.
  twine upload dist/* -u __token__ -p "${PYPI_PASSWORD}" --repository pypi --skip-existing
}


Expand Down
Binary file modified examples/docs/maoxuan_sample.docx
Binary file not shown.
Binary file added examples/docs/test.xlsx
Binary file not shown.
10 changes: 8 additions & 2 deletions src/bisheng_unstructured/documents/html_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ def visualize_html(elements, output_file):
text = el.metadata.text_as_html
text = text.replace("\n", " ")
else:
text = f"<p {styles[idx % 2]}>{el.text}</p>"
text = el.text.replace("\n", "<br>")
text = f"<p {styles[idx % 2]}>{text}</p>"
idx += 1

if text:
Expand All @@ -57,19 +58,24 @@ def save_to_txt(elements, output_file):
text_elem_sep = "\n"
content_page = []
is_first_elem = True
last_label = ""
for el in elements:
label, text = el.category, el.text
if is_first_elem:
f_text = text + "\n" if label == "Title" else text
content_page.append(f_text)
is_first_elem = False
else:
if label == "Title":
if last_label == "Title" and label == "Title":
content_page.append("\n" + text + "\n")
elif label == "Title":
content_page.append("\n\n" + text + "\n")
elif label == "Table":
content_page.append("\n\n" + text + "\n")
else:
content_page.append(text_elem_sep + text)

last_label = label

with open(output_file, "w") as fout:
fout.write("".join(content_page))
3 changes: 3 additions & 0 deletions src/bisheng_unstructured/nlp/patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,6 @@
# For zh variables
ZH_PUNC_NOT_IN_TITLE_PATTERN = r"[,。、;!|,;!]"
ZH_PUNC_NOT_IN_TITLE_RE = re.compile(ZH_PUNC_NOT_IN_TITLE_PATTERN)

# Looser variant of ZH_PUNC_NOT_IN_TITLE_PATTERN: the comma and "、" characters
# are left out of the class, so text containing them can still count as a title.
# NOTE(review): ";" and "!" each appear twice in the class as shown here —
# presumably half-/full-width variants in the real file; confirm.
ZH_PUNC_NOT_IN_PPTX_TITLE_PATTERN = r"[。;!|;!]"
ZH_PUNC_NOT_IN_PPTX_TITLE_RE = re.compile(ZH_PUNC_NOT_IN_PPTX_TITLE_PATTERN)
35 changes: 26 additions & 9 deletions src/bisheng_unstructured/partition/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from docx.table import Table as DocxTable
from docx.text.paragraph import Paragraph
from docx.text.run import Run
from lxml import etree

from bisheng_unstructured.cleaners.core import clean_bullets
from bisheng_unstructured.documents.elements import (
Expand All @@ -25,6 +26,7 @@
Title,
process_metadata,
)
from bisheng_unstructured.documents.markdown import transform_html_table_to_md
from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from bisheng_unstructured.partition.common import (
convert_ms_office_table_to_text,
Expand Down Expand Up @@ -140,6 +142,7 @@ def partition_docx(

# Verify that only one of the arguments was provided
exactly_one(filename=filename, file=file)
language = kwargs.get("language", "zh")

last_modification_date = None
if filename is not None:
Expand Down Expand Up @@ -168,14 +171,16 @@ def partition_docx(
section = 0
is_list = False
for element_item in document.element.body:
# print("---element_item---", element_item, element_item.tag, element_item.xml)
if element_item.tag.endswith("tbl"):
table = document.tables[table_index]
emphasized_texts = _get_emphasized_texts_from_table(table)
emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags(
emphasized_texts,
)
html_table = convert_ms_office_table_to_text(table, as_html=True)
text_table = convert_ms_office_table_to_text(table, as_html=False)
# text_table = convert_ms_office_table_to_text(table, as_html=False)
text_table = transform_html_table_to_md(html_table)["text"]
element = Table(text_table)
if element is not None:
element.metadata = ElementMetadata(
Expand All @@ -196,7 +201,7 @@ def partition_docx(
emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags(
emphasized_texts,
)
para_element: Optional[Text] = _paragraph_to_element(paragraph, is_list)
para_element: Optional[Text] = _paragraph_to_element(paragraph, is_list, language)
if para_element is not None:
para_element.metadata = ElementMetadata(
filename=metadata_filename,
Expand All @@ -207,6 +212,14 @@ def partition_docx(
)
elements.append(para_element)
is_list = False
# print(
# "---p---",
# emphasized_texts,
# emphasized_text_contents,
# emphasized_text_tags,
# para_element,
# paragraph.style,
# )
elif element_item.tag.endswith("sectPr"):
if len(headers_and_footers) > section:
footers = headers_and_footers[section][1]
Expand All @@ -226,26 +239,30 @@ def partition_docx(


def _paragraph_to_element(
    paragraph: docx.text.paragraph.Paragraph, is_list=False, language="eng"
) -> Optional[Text]:
    """Converts a docx Paragraph object into the appropriate unstructured document element.

    A paragraph whose style name contains "Heading" becomes a Title. Otherwise the
    style name is looked up in STYLE_TO_ELEMENT_MAPPING; if the paragraph is a list
    item, or no element class is mapped for the style, the element type is predicted
    from the raw text via _text_to_element.

    Args:
        paragraph: The python-docx paragraph to convert.
        is_list: When True, the text is always classified through
            _text_to_element with its list flag set.
        language: Forwarded to _text_to_element.

    Returns:
        The element built for this paragraph, or None when the paragraph contains
        no non-whitespace text (or _text_to_element itself returns None).
    """
    text = paragraph.text
    # Normalize the text: only leading/trailing newlines are removed.
    text = text.strip("\n")
    style_name = paragraph.style and paragraph.style.name  # .style can be None

    if len(text.strip()) == 0:
        return None

    # Use the already-guarded style_name here: dereferencing paragraph.style.name
    # directly would raise AttributeError for paragraphs without a style.
    if style_name and "Heading" in style_name:
        return Title(text)

    element_class = STYLE_TO_ELEMENT_MAPPING.get(style_name)

    # NOTE(robinson) - The "Normal" style name will return None since it's in the mapping.
    # Unknown style names will also return None
    if is_list:
        return _text_to_element(text, is_list, language=language)
    elif element_class is None:
        return _text_to_element(text, language=language)
    else:
        return element_class(text)

Expand All @@ -266,7 +283,7 @@ def _element_contains_pagebreak(element) -> bool:
return False


def _text_to_element(text: str, is_list=False) -> Optional[Text]:
def _text_to_element(text: str, is_list=False, language="eng") -> Optional[Text]:
"""Converts raw text into an unstructured Text element."""
if is_bulleted_text(text) or is_list:
clean_text = clean_bullets(text).strip()
Expand All @@ -280,8 +297,8 @@ def _text_to_element(text: str, is_list=False) -> Optional[Text]:
return None
elif is_possible_narrative_text(text):
return NarrativeText(text)
elif is_possible_title(text):
return Title(text)
# elif is_possible_title(text, title_max_word_length=20, language=language):
# return Title(text)
else:
return Text(text)

Expand Down
74 changes: 56 additions & 18 deletions src/bisheng_unstructured/partition/pptx.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import json
import re
from tempfile import SpooledTemporaryFile
from typing import IO, BinaryIO, List, Optional, Union, cast

Expand All @@ -15,6 +17,7 @@
Title,
process_metadata,
)
from bisheng_unstructured.documents.markdown import transform_html_table_to_md
from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from bisheng_unstructured.partition.common import (
convert_ms_office_table_to_text,
Expand All @@ -31,6 +34,10 @@

OPENXML_SCHEMA_NAME = "{http://schemas.openxmlformats.org/drawingml/2006/main}"

# Whitespace-normalization patterns.
# NOTE(review): re.DOTALL only changes what "." matches and none of these
# patterns contain ".", so the flag looks redundant — confirm before removing.
# A run of one or more newline characters.
RE_MULTLINES = re.compile(pattern=r"\n+", flags=re.DOTALL)
# A run of spaces, tabs, carriage returns, form feeds or vertical tabs (no "\n").
RE_SPACES = re.compile(pattern=r"[ \t\r\f\v]+", flags=re.DOTALL)
# A run of any whitespace characters, newlines included.
RE_NORMAL_SPACES = re.compile(pattern=r"\s+", flags=re.DOTALL)


@process_metadata()
@add_metadata_with_filetype(FileType.PPTX)
Expand Down Expand Up @@ -85,6 +92,10 @@ def partition_pptx(
elements: List[Element] = []
metadata = ElementMetadata(filename=metadata_filename or filename)
num_slides = len(presentation.slides)
slide_height = presentation.slide_height
slide_width = presentation.slide_width
page_bbox = [slide_width, slide_height]
sel_i = 30
for i, slide in enumerate(presentation.slides):
metadata = ElementMetadata.from_dict(metadata.to_dict())
metadata.last_modified = metadata_last_modified or last_modification_date
Expand All @@ -97,11 +108,16 @@ def partition_pptx(
if notes_text.strip() != "":
elements.append(NarrativeText(text=notes_text, metadata=metadata))

shape_infos = []
shape_index = -1
for shape in _order_shapes(slide.shapes):
shape_index += 1
if shape.has_table:
table: pptx.table.Table = shape.table
html_table = convert_ms_office_table_to_text(table, as_html=True)
text_table = convert_ms_office_table_to_text(table, as_html=False)
# text_table = convert_ms_office_table_to_text(table, as_html=False)
text_table = transform_html_table_to_md(html_table)["text"]
# print('---table---', html_table, text_table)
if (text_table := text_table.strip()) != "":
metadata = ElementMetadata(
filename=metadata_filename or filename,
Expand All @@ -113,24 +129,46 @@ def partition_pptx(
continue
if not shape.has_text_frame:
continue
# NOTE(robinson) - avoid processing shapes that are not on the actual slide
# NOTE - skip check if no top or left position (shape displayed top left)
if (shape.top and shape.left) and (shape.top < 0 or shape.left < 0):

bbox = [shape.left, shape.top, shape.width, shape.height]
shape_info = []
shape_infos.append({"runs": shape_info, "bbox": bbox})
metadata = {"bbox": bbox, "page_bbox": page_bbox}
metadata = ElementMetadata(
page_number=i, text_as_html=json.dumps(metadata), page_name="paragraph"
)

TITLE_AREA_THRESHOLD = 0.2
ratio = abs(bbox[3] - bbox[1]) * 1.0 / page_bbox[1]
# print('bbox', bbox, page_bbox, ratio)

is_title = False
text = None
if shape_index == 0 and ratio <= TITLE_AREA_THRESHOLD:
text = re.sub(RE_NORMAL_SPACES, " ", shape.text_frame.text)
is_title = is_possible_title(
text, language="zh", title_max_word_length=30, is_pptx=True
)

if not is_title:
text = shape.text_frame.text.replace("\x0b", "\n")
text = re.sub(RE_MULTLINES, "\n", text).strip()
text = re.sub(RE_SPACES, " ", text)

if text == "":
continue
for paragraph in shape.text_frame.paragraphs:
text = paragraph.text
if text.strip() == "":
continue
if _is_bulleted_paragraph(paragraph):
elements.append(ListItem(text=text, metadata=metadata))
elif is_email_address(text):
elements.append(EmailAddress(text=text))
elif is_possible_narrative_text(text):
elements.append(NarrativeText(text=text, metadata=metadata))
elif is_possible_title(text):
elements.append(Title(text=text, metadata=metadata))
else:
elements.append(Text(text=text, metadata=metadata))

# for paragraph in shape.text_frame.paragraphs:
# print('is_bulleted', _is_bulleted_paragraph(paragraph))

if is_email_address(text):
elements.append(EmailAddress(text=text))
elif is_possible_narrative_text(text):
elements.append(NarrativeText(text=text, metadata=metadata))
elif is_title:
elements.append(Title(text=text, metadata=metadata))
else:
elements.append(Text(text=text, metadata=metadata))

if include_page_breaks and i < num_slides - 1:
elements.append(PageBreak(text=""))
Expand Down
5 changes: 4 additions & 1 deletion src/bisheng_unstructured/partition/text_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
UNICODE_BULLETS_RE,
US_CITY_STATE_ZIP_RE,
US_PHONE_NUMBERS_RE,
ZH_PUNC_NOT_IN_PPTX_TITLE_RE,
ZH_PUNC_NOT_IN_TITLE_RE,
)
from bisheng_unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
Expand Down Expand Up @@ -99,6 +100,7 @@ def is_possible_title(
non_alpha_threshold: float = 0.5,
language: str = "en",
language_checks: bool = False,
is_pptx: bool = False,
) -> bool:
"""Checks to see if the text passes all of the checks for a valid title.
Expand Down Expand Up @@ -127,7 +129,8 @@ def is_possible_title(
return False

if language == "zh":
if ZH_PUNC_NOT_IN_TITLE_RE.search(text) is not None:
PUNK_RE = ZH_PUNC_NOT_IN_PPTX_TITLE_RE if is_pptx else ZH_PUNC_NOT_IN_TITLE_RE
if PUNK_RE.search(text) is not None:
return False

title_max_word_length = int(
Expand Down
5 changes: 4 additions & 1 deletion src/bisheng_unstructured/partition/xlsx.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
Table,
process_metadata,
)
from bisheng_unstructured.documents.markdown import transform_html_table_to_md
from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from bisheng_unstructured.partition.common import (
exactly_one,
Expand Down Expand Up @@ -63,7 +64,9 @@ def partition_xlsx(
for sheet_name, table in sheets.items():
page_number += 1
html_text = table.to_html(index=False, header=include_header, na_rep="")
text = soupparser_fromstring(html_text).text_content()
text = transform_html_table_to_md(html_text)["text"]

# text = soupparser_fromstring(html_text).text_content()

if include_metadata:
metadata = ElementMetadata(
Expand Down
53 changes: 53 additions & 0 deletions tests/test_doc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from bisheng_unstructured.documents.html_utils import save_to_txt, visualize_html
from bisheng_unstructured.partition.docx import partition_docx


def test_docx():
    """Partition the maoxuan sample .docx and dump the result as HTML and text.

    Both outputs are written under ./data; nothing is asserted on the elements.
    """
    parsed = partition_docx(filename="./examples/docs/maoxuan_sample.docx")

    visualize_html(parsed, "./data/maoxuan_sample_docx.html")
    save_to_txt(parsed, "./data/maoxuan_sample_docx.txt")


def test_docx2():
    """Partition handbook-1p.docx and write an HTML visualization to ./data."""
    parsed = partition_docx(filename="./examples/docs/handbook-1p.docx")

    visualize_html(parsed, "./data/handbook-1p.html")


def test_docx3():
    """Load handbook-1p.docx with python-docx and save it under a .pdf file name.

    NOTE(review): Document.save() presumably writes the document in docx format
    whatever the extension is — confirm whether the output is a real PDF.
    """
    import docx

    source_path = "./examples/docs/handbook-1p.docx"
    target_path = "./examples/docs/handbook-1p.pdf"

    # Open the .docx file and write it straight back out to the target path.
    docx.Document(source_path).save(target_path)


def test4():
    """Convert handbook-1p.docx to a PDF next to it using pypandoc."""
    import pypandoc

    docx_path = "./examples/docs/handbook-1p.docx"
    pdf_path = "./examples/docs/handbook-1p.pdf"

    pypandoc.convert_file(docx_path, "pdf", outputfile=pdf_path)


def test5():
    """Placeholder: only names the input and output paths; nothing is converted."""
    source_path, target_path = (
        "./examples/docs/maoxuan_sample.docx",
        "./data/maoxuan_sample.pdf",
    )


# NOTE: this call sits at module top level, so it runs whenever the file is
# executed or imported (importing the module for test collection triggers it).
# TODO(review): consider moving it under an `if __name__ == "__main__":` guard.
test_docx()
# Other entry points, currently disabled:
# test_docx2()
# test_docx3()
# test4()
# test5()
Loading

0 comments on commit e680580

Please sign in to comment.