Skip to content

Commit

Permalink
support excel, powerpoint to pdf
Browse files Browse the repository at this point in the history
  • Loading branch information
hrfng committed Nov 8, 2023
1 parent 49a3e3e commit 7633494
Show file tree
Hide file tree
Showing 13 changed files with 333 additions and 15 deletions.
11 changes: 10 additions & 1 deletion docker/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,15 @@ function temp_build_image_v002() {
}


function upload_image() {
  # Export the dataelement/bisheng-unstructured:0.0.2 image as a
  # gzip-compressed tarball under IMAGE_DIR, then pass the archive path to
  # the `upload-data` command. Returns 1 without uploading if the export fails.
  IMAGE_DIR="/home/public/bisheng-images"
  IMAGE_FILE="${IMAGE_DIR}/dataelement-bisheng-unstructured-v0.0.2.tar.gz"

  # A pipeline only reports the status of its last command (gzip), so the
  # status of `docker save` is checked explicitly; otherwise a failed export
  # would still upload a truncated archive.
  docker save dataelement/bisheng-unstructured:0.0.2 | gzip > "$IMAGE_FILE"
  local statuses=("${PIPESTATUS[@]}")
  if [ "${statuses[0]}" -ne 0 ] || [ "${statuses[1]}" -ne 0 ]; then
    echo "upload_image: failed to export image to ${IMAGE_FILE}" >&2
    return 1
  fi

  # Quoted so the path is passed as a single argument.
  upload-data "$IMAGE_FILE"
}


# create_dev_image
# temp_build_image
temp_build_image_v002
# temp_build_image_v002
upload_image
10 changes: 7 additions & 3 deletions src/bisheng_unstructured/api/any2pdf.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import base64

from bisheng_unstructured.documents.pdf_parser.pdf_creator import PdfCreator
from bisheng_unstructured.topdf import DocxToPDF
from bisheng_unstructured.topdf import DocxToPDFV1, ExcelToPDF, PptxToPDF


class Any2PdfCreator(object):
Expand All @@ -12,8 +12,12 @@ def __init__(self, kwargs):
"jpg": PdfCreator,
"tiff": PdfCreator,
"bmp": PdfCreator,
"doc": DocxToPDF,
"docx": DocxToPDF,
"doc": DocxToPDFV1,
"docx": DocxToPDFV1,
"ppt": PptxToPDF,
"pptx": PptxToPDF,
"xlsx": ExcelToPDF,
"xls": ExcelToPDF,
}
self.model_params = kwargs

Expand Down
6 changes: 4 additions & 2 deletions src/bisheng_unstructured/topdf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from .docx2pdf import DocxToPDF
from .docx2pdf import DocxToPDF, DocxToPDFV1
from .excel2pdf import ExcelToPDF
from .pptx2pdf import PptxToPDF

__all__ = ["DocxToPDF"]
__all__ = ["DocxToPDF", "DocxToPDFV1", "PptxToPDF", "ExcelToPDF"]
41 changes: 40 additions & 1 deletion src/bisheng_unstructured/topdf/docx2pdf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import shutil

from bisheng_unstructured.partition.common import convert_office_doc

Expand Down Expand Up @@ -32,7 +33,10 @@ def render(self, input_file, output_file=None, to_bytes=False):
output_file = os.path.join(temp_dir, output_filename)

if type_ext == "doc":
convert_office_doc(input_file, temp_dir)
convert_office_doc(
input_file, temp_dir, target_format="docx", target_filter="MS Word 2007 XML"
)

input_file = os.path.join(temp_dir, filename.rsplit(".", 1)[0] + ".docx")

cmd = self.cmd_template.format(input_file, output_file)
Expand All @@ -45,3 +49,38 @@ def render(self, input_file, output_file=None, to_bytes=False):

if to_bytes:
return open(output_file, "rb").read()


class DocxToPDFV1(object):
    """Convert ``.doc``/``.docx`` files to PDF by running ``soffice`` headlessly."""

    def __init__(self, kwargs=None):
        """Build the normalized ``soffice`` command template.

        Args:
            kwargs: Optional configuration object. It is accepted but not
                read by this class.
        """
        cmd_template = """
            soffice --headless --convert-to pdf --outdir {1} {0}
        """

        def _norm_cmd(cmd):
            return " ".join([p.strip() for p in cmd.strip().split()])

        # Placeholders: {0} is the input document, {1} the output directory.
        self.cmd_template = _norm_cmd(cmd_template)

    def render(self, input_file, output_file=None, to_bytes=False):
        """Convert ``input_file`` to PDF.

        The PDF is first written next to ``input_file`` (same base name,
        ``.pdf`` extension) and then moved to ``output_file`` when one is given.

        Args:
            input_file: Path to a ``.doc`` or ``.docx`` file (extension is
                matched case-insensitively).
            output_file: Optional destination path for the PDF.
            to_bytes: When true, return the PDF content as ``bytes``.

        Returns:
            The PDF bytes when ``to_bytes`` is true, otherwise ``None``.

        Raises:
            AssertionError: If the extension is not ``doc``/``docx``.
            Exception: If the conversion command cannot be started or exits
                with a non-zero status.
        """
        import shlex
        import subprocess

        type_ext = input_file.rsplit(".", 1)[-1].lower()
        # Raised explicitly (instead of ``assert``) so the check survives
        # ``python -O`` while keeping the exception type unchanged.
        if type_ext not in ("docx", "doc"):
            raise AssertionError("unsupported file type: %r" % type_ext)

        filename = os.path.basename(input_file)
        # A bare file name has an empty dirname; fall back to the current
        # directory so ``--outdir`` never receives an empty value.
        temp_dir = os.path.dirname(input_file) or "."
        output_filename = filename.rsplit(".", 1)[0] + ".pdf"
        temp_output_file = os.path.join(temp_dir, output_filename)

        # Tokenize the template first and substitute afterwards, so paths
        # containing spaces or shell metacharacters stay single arguments and
        # no shell is involved.
        argv = [
            token.format(input_file, temp_dir)
            for token in shlex.split(self.cmd_template)
        ]
        try:
            exit_code = subprocess.run(argv).returncode
        except OSError as err:
            raise Exception("error in transforming doc to pdf") from err
        if exit_code != 0:
            raise Exception("error in transforming doc to pdf")

        result_file = temp_output_file
        if output_file is not None:
            shutil.move(temp_output_file, output_file)
            # The PDF no longer exists at the temporary location after the move.
            result_file = output_file

        if to_bytes:
            with open(result_file, "rb") as fh:
                return fh.read()
65 changes: 65 additions & 0 deletions src/bisheng_unstructured/topdf/excel2pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import os
import shutil
import tempfile

import openpyxl


class ExcelToPDF(object):
    """Convert ``.xls``/``.xlsx`` workbooks to PDF with ``soffice``.

    Grid-line printing is switched on for every worksheet (via openpyxl)
    before the PDF export.
    """

    def __init__(self, kwargs=None):
        """Build the normalized ``soffice`` command prefixes.

        Args:
            kwargs: Optional configuration object. It is accepted but not
                read by this class.
        """
        # The export filter options are JSON. They are wrapped in single
        # quotes so the inner double quotes survive tokenization; previously
        # the quotes were stripped and soffice received invalid JSON.
        cmd_template = """
            soffice --convert-to
            'pdf:calc_pdf_Export:{"SinglePageSheets":{"type":"boolean","value":"true"}}'
            --outdir
        """
        cmd_template2 = """
            soffice --headless --convert-to xlsx --outdir
        """

        def _norm_cmd(cmd):
            return " ".join([p.strip() for p in cmd.strip().split()])

        # Both prefixes end with ``--outdir``; the output directory and the
        # input file are appended by ``render``.
        self.cmd_template = _norm_cmd(cmd_template)
        self.cmd_template2 = _norm_cmd(cmd_template2)

    @staticmethod
    def run(cmd):
        """Run a conversion command and fail loudly on error.

        Args:
            cmd: Either a shell command string (executed through the shell,
                as before) or a sequence of arguments (executed directly,
                without a shell).

        Raises:
            Exception: If the command cannot be started or exits non-zero.
        """
        import subprocess

        try:
            exit_code = subprocess.run(cmd, shell=isinstance(cmd, str)).returncode
        except OSError as err:
            raise Exception("error in transforming xlsx to pdf") from err
        if exit_code != 0:
            raise Exception("error in transforming xlsx to pdf")

    def render(self, input_file, output_file=None, to_bytes=False):
        """Convert ``input_file`` to PDF inside a temporary directory.

        Args:
            input_file: Path to an ``.xls`` or ``.xlsx`` file (extension is
                matched case-insensitively). The original file is not modified.
            output_file: Optional destination path for the PDF.
            to_bytes: When true, return the PDF content as ``bytes``.

        Returns:
            The PDF bytes when ``to_bytes`` is true, otherwise ``None``.

        Raises:
            AssertionError: If the extension is not ``xls``/``xlsx``.
            Exception: If a conversion command fails.
        """
        import shlex

        type_ext = input_file.rsplit(".", 1)[-1].lower()
        # Raised explicitly (instead of ``assert``) so the check survives
        # ``python -O`` while keeping the exception type unchanged.
        if type_ext not in ("xlsx", "xls"):
            raise AssertionError("unsupported file type: %r" % type_ext)

        filename = os.path.basename(input_file)
        stem = filename.rsplit(".", 1)[0]
        output_filename = stem + ".pdf"

        with tempfile.TemporaryDirectory() as temp_dir:
            if type_ext == "xls":
                # openpyxl is used below, so legacy .xls is first converted
                # to .xlsx in the temp dir.
                ExcelToPDF.run(
                    shlex.split(self.cmd_template2) + [temp_dir, input_file]
                )
                filename = stem + ".xlsx"
                input_file = os.path.join(temp_dir, filename)

            temp_output_file = os.path.join(temp_dir, output_filename)
            wb = openpyxl.load_workbook(input_file)
            for ws in wb:
                ws.print_options.gridLines = True
                ws.print_options.gridLinesSet = True

            # Always save into the temp dir so the caller's file is untouched.
            input_file = os.path.join(temp_dir, filename)
            wb.save(input_file)

            # Argument list, no shell: paths with spaces stay intact.
            ExcelToPDF.run(shlex.split(self.cmd_template) + [temp_dir, input_file])

            result_file = temp_output_file
            if output_file is not None:
                shutil.move(temp_output_file, output_file)
                # After the move the PDF only exists at ``output_file``.
                result_file = output_file

            if to_bytes:
                with open(result_file, "rb") as fh:
                    return fh.read()
37 changes: 37 additions & 0 deletions src/bisheng_unstructured/topdf/pptx2pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import os
import shutil


class PptxToPDF(object):
    """Convert ``.ppt``/``.pptx`` files to PDF by running ``soffice`` headlessly."""

    def __init__(self, kwargs=None):
        """Build the normalized ``soffice`` command template.

        Args:
            kwargs: Optional configuration object. It is accepted but not
                read by this class.
        """
        cmd_template = """
            soffice --headless --convert-to pdf --outdir {1} {0}
        """

        def _norm_cmd(cmd):
            return " ".join([p.strip() for p in cmd.strip().split()])

        # Placeholders: {0} is the input presentation, {1} the output directory.
        self.cmd_template = _norm_cmd(cmd_template)

    def render(self, input_file, output_file=None, to_bytes=False):
        """Convert ``input_file`` to PDF.

        The PDF is first written next to ``input_file`` (same base name,
        ``.pdf`` extension) and then moved to ``output_file`` when one is given.

        Args:
            input_file: Path to a ``.ppt`` or ``.pptx`` file (extension is
                matched case-insensitively).
            output_file: Optional destination path for the PDF.
            to_bytes: When true, return the PDF content as ``bytes``.

        Returns:
            The PDF bytes when ``to_bytes`` is true, otherwise ``None``.

        Raises:
            AssertionError: If the extension is not ``ppt``/``pptx``.
            Exception: If the conversion command cannot be started or exits
                with a non-zero status.
        """
        import shlex
        import subprocess

        type_ext = input_file.rsplit(".", 1)[-1].lower()
        # Raised explicitly (instead of ``assert``) so the check survives
        # ``python -O`` while keeping the exception type unchanged.
        if type_ext not in ("pptx", "ppt"):
            raise AssertionError("unsupported file type: %r" % type_ext)

        filename = os.path.basename(input_file)
        # A bare file name has an empty dirname; fall back to the current
        # directory so ``--outdir`` never receives an empty value.
        temp_dir = os.path.dirname(input_file) or "."
        output_filename = filename.rsplit(".", 1)[0] + ".pdf"
        temp_output_file = os.path.join(temp_dir, output_filename)

        # Tokenize the template first and substitute afterwards, so paths
        # containing spaces or shell metacharacters stay single arguments and
        # no shell is involved.
        argv = [
            token.format(input_file, temp_dir)
            for token in shlex.split(self.cmd_template)
        ]
        try:
            exit_code = subprocess.run(argv).returncode
        except OSError as err:
            raise Exception("error in transforming pptx to pdf") from err
        if exit_code != 0:
            raise Exception("error in transforming pptx to pdf")

        result_file = temp_output_file
        if output_file is not None:
            shutil.move(temp_output_file, output_file)
            # The PDF no longer exists at the temporary location after the move.
            result_file = output_file

        if to_bytes:
            with open(result_file, "rb") as fh:
                return fh.read()
53 changes: 53 additions & 0 deletions src/bisheng_unstructured/topdf/text2pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import os


class Text2PDF(object):
    """Convert ``.txt``/``.md``/``.html`` files to PDF with ``pandoc`` (xelatex engine)."""

    def __init__(self, kwargs=None):
        """Build the ``pandoc`` command templates.

        Args:
            kwargs: Optional configuration object. It is accepted but not
                read by this class.
        """
        cmd_template = """
            pandoc -o {1} --pdf-engine=xelatex {0}
            -V mainfont="Alibaba PuHuiTi"
            -V sansfont="Alibaba PuHuiTi"
            -V monofont="Cascadia Mono"
            -V CJKmainfont="Alibaba PuHuiTi"
            -V CJKsansfont="Alibaba PuHuiTi"
            -V CJKmonofont="Adobe Heiti Std"
        """

        def _norm_cmd(cmd):
            return " ".join([p.strip() for p in cmd.strip().split()])

        # Placeholders in both templates: {0} is the input, {1} the output.
        self.cmd_template = _norm_cmd(cmd_template)
        # Plain pandoc conversion, used for the intermediate txt -> md step.
        self.cmd_template2 = "pandoc -o {1} {0}"

    @staticmethod
    def run(cmd):
        """Run a conversion command and fail loudly on error.

        Args:
            cmd: Either a shell command string (executed through the shell,
                as before) or a sequence of arguments (executed directly,
                without a shell).

        Raises:
            Exception: If the command cannot be started or exits non-zero.
        """
        import subprocess

        try:
            exit_code = subprocess.run(cmd, shell=isinstance(cmd, str)).returncode
        except OSError as err:
            raise Exception("error in transforming text to pdf") from err
        if exit_code != 0:
            raise Exception("error in transforming text to pdf")

    def render(self, input_file, output_file=None, to_bytes=False):
        """Convert ``input_file`` to PDF.

        ``.txt`` input is first converted to Markdown in a private temporary
        directory, then rendered to PDF.

        Args:
            input_file: Path to a ``.txt``, ``.md`` or ``.html`` file
                (extension is matched case-insensitively).
            output_file: Destination PDF path. Defaults to the input's
                directory and base name with a ``.pdf`` extension.
            to_bytes: When true, return the PDF content as ``bytes``.

        Returns:
            The PDF bytes when ``to_bytes`` is true, otherwise ``None``.

        Raises:
            AssertionError: If the extension is not supported.
            Exception: If a conversion command fails.
        """
        import shlex
        import tempfile

        type_ext = input_file.rsplit(".", 1)[-1].lower()
        # Raised explicitly (instead of ``assert``) so the check survives
        # ``python -O`` while keeping the exception type unchanged.
        if type_ext not in ("txt", "md", "html"):
            raise AssertionError("unsupported file type: %r" % type_ext)

        filename = os.path.basename(input_file)
        stem = filename.rsplit(".", 1)[0]
        temp_dir = os.path.dirname(input_file)

        if output_file is None:
            output_file = os.path.join(temp_dir, stem + ".pdf")

        def _argv(template, src, dst):
            # Tokenize first, substitute afterwards: quoted font names and
            # paths with spaces each stay a single argument.
            return [token.format(src, dst) for token in shlex.split(template)]

        if type_ext == "txt":
            # A per-call scratch directory replaces the fixed
            # "./data/xxxx.md" path, which needed ./data to exist and was
            # shared by every concurrent conversion.
            with tempfile.TemporaryDirectory() as scratch_dir:
                markdown_file = os.path.join(scratch_dir, stem + ".md")
                Text2PDF.run(_argv(self.cmd_template2, input_file, markdown_file))
                Text2PDF.run(_argv(self.cmd_template, markdown_file, output_file))
        else:
            Text2PDF.run(_argv(self.cmd_template, input_file, output_file))

        if to_bytes:
            with open(output_file, "rb") as fh:
                return fh.read()
8 changes: 4 additions & 4 deletions tests/regression/test_container.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@ function test_cases() {


function test_container() {
# image="dataelement/bisheng-unstructured:0.0.1"
# temp_ctn="bisheng_uns_v001_test"
image="dataelement/bisheng-unstructured:0.0.1"
temp_ctn="bisheng_uns_v001_test"

image="dataelement/bisheng-unstructured:0.0.2"
temp_ctn="bisheng_uns_v002_test"

pushd $(cd $(dirname $0); pwd)
docker run -p 10005:10001 -itd --workdir /opt/bisheng-unstructured --name ${temp_ctn} $image bash bin/entrypoint.sh
# docker run -p 10005:10001 -itd --workdir /opt/bisheng-unstructured --name ${temp_ctn} $image bash bin/entrypoint.sh
UNS_EP="127.0.0.1:10005"

sleep 5
Expand All @@ -26,7 +26,7 @@ function test_container() {
curl -X GET http://${UNS_EP}/v1/config
UNS_EP=${UNS_EP} python3 test_etl4llm.py

# docker stop ${temp_ctn} && docker rm ${temp_ctn}
docker stop ${temp_ctn} && docker rm ${temp_ctn}
}


Expand Down
16 changes: 14 additions & 2 deletions tests/regression/test_etl4llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ def test_part():
parameters={"start": 0, "n": 5},
)
resp = requests.post(url, json=inp).json()
print(resp)
assert resp["status_code"] == 200, resp
# print(resp)


def test_any2pdf():
Expand All @@ -33,10 +34,21 @@ def test_any2pdf():
)
resp = requests.post(url, json=inp).json()

assert resp["status_code"] == 200, resp

filename = "../../examples/docs/maoxuan_sample.doc"
b64_data = base64.b64encode(open(filename, "rb").read()).decode()
inp = dict(
filename=os.path.basename(filename),
b64_data=[b64_data],
mode="topdf",
)
resp = requests.post(url, json=inp).json()

assert resp["status_code"] == 200, resp
# with open('test.pdf', 'wb') as fout:
# fout.write(base64.b64decode(resp['b64_pdf']))


test_part()
test_any2pdf()
# test_part()
36 changes: 34 additions & 2 deletions tests/test_docx2pdf.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from bisheng_unstructured.partition.common import convert_office_doc
from bisheng_unstructured.topdf.docx2pdf import DocxToPDF
from bisheng_unstructured.topdf.docx2pdf import DocxToPDF, DocxToPDFV1


def test1():
Expand All @@ -22,6 +22,38 @@ def test3():
engine.render(input_file, output_file)


test1()
def test4():
    """Render the sample .doc file to PDF with DocxToPDFV1."""
    source_doc = "./examples/docs/maoxuan_sample.doc"
    target_pdf = "./data/maoxuan_sample-v1.pdf"
    DocxToPDFV1().render(source_doc, target_pdf)


def test5():
    """Render the sample .docx file to PDF with DocxToPDFV1."""
    source_doc = "./examples/docs/maoxuan_sample.docx"
    target_pdf = "./data/maoxuan_sample-v2.pdf"
    DocxToPDFV1().render(source_doc, target_pdf)


def test6():
    """Render a .docx with a non-ASCII file name to PDF with DocxToPDFV1."""
    source_doc = "./examples/docs/UI自动化测试说明文档V1.0.docx"
    target_pdf = "./data/UI自动化测试说明文档V1.0.pdf"
    DocxToPDFV1().render(source_doc, target_pdf)


def test7():
    """Render the same non-ASCII-named .docx with the older DocxToPDF engine."""
    # not supported
    source_doc = "./examples/docs/UI自动化测试说明文档V1.0.docx"
    target_pdf = "./data/UI自动化测试说明文档V1.0-v0.pdf"
    DocxToPDF().render(source_doc, target_pdf)


# test1()
# test2()
# test3()
# test4()
# test6()
# test7()
19 changes: 19 additions & 0 deletions tests/test_excel2pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from bisheng_unstructured.topdf.excel2pdf import ExcelToPDF


def test1():
    """Render the sample .xlsx workbook to PDF with ExcelToPDF."""
    source_book = "./examples/docs/tests-example.xlsx"
    target_pdf = "./data/tests-example-v1.0.pdf"
    ExcelToPDF().render(source_book, target_pdf)


def test2():
    """Render the sample legacy .xls workbook to PDF with ExcelToPDF."""
    source_book = "./examples/docs/tests-example.xls"
    target_pdf = "./data/tests-example-v1.1.pdf"
    ExcelToPDF().render(source_book, target_pdf)


# Run both conversion cases, in order, whenever this module is executed.
for _case in (test1, test2):
    _case()
Loading

0 comments on commit 7633494

Please sign in to comment.