Skip to content

Commit

Permalink
support word to pdf
Browse files Browse the repository at this point in the history
support word to pdf
  • Loading branch information
hrfng authored Nov 8, 2023
2 parents 4b91848 + ef1256b commit 49a3e3e
Show file tree
Hide file tree
Showing 10 changed files with 198 additions and 10 deletions.
5 changes: 5 additions & 0 deletions docker/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,9 @@ unset https_proxy
unset HTTP_PROXY
unset HTTPS_PROXY

# Prepend the TeX Live 2023 binaries to PATH and register its man/info pages.
export PATH=/opt/texlive/2023/bin/x86_64-linux:$PATH
export MANPATH=/opt/texlive/2023/texmf-dist/doc/man:$MANPATH
export INFOPATH=/opt/texlive/2023/texmf-dist/doc/info:$INFOPATH
# Prepend the pandoc 3.1.9 binaries to PATH.
export PATH=/opt/pandoc/pandoc-3.1.9/bin:$PATH

uvicorn --host 0.0.0.0 --port 10001 bisheng_unstructured.api.main:app
20 changes: 19 additions & 1 deletion docker/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,22 @@ function temp_build_image() {
}


temp_build_image
function create_dev_image() {
    # Start a detached, interactive development container from the 0.0.1 image.
    image="dataelement/bisheng-unstructured:0.0.1"

    # Host port 50001 is forwarded to container port 10001; two host
    # directories are bind-mounted at the same paths inside the container.
    docker run --interactive --tty --detach \
        --name bisheng-uns-v002-dev \
        --publish 50001:10001 \
        --volume /home/hanfeng:/home/hanfeng \
        --volume /home/public:/home/public \
        "${image}" bash
}


function temp_build_image_v002() {
    # Snapshot the running dev container as the 0.0.2 image.
    image="dataelement/bisheng-unstructured:0.0.2"

    # Drop any previous image with this tag before committing a new one.
    docker rmi "${image}"

    docker commit \
        --author "[email protected]" \
        --message "commit bisheng-unstructured image" \
        bisheng-uns-v002-dev "${image}"

    # Publishing is left as a manual step:
    #docker push $image
}


# Script entry point: the steps are toggled by (un)commenting them.
# Only the 0.0.2 commit step is currently active.
# create_dev_image
# temp_build_image
temp_build_image_v002
63 changes: 63 additions & 0 deletions scripts/install_latex.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/bin/bash


function install_texlive() {
    # Install TeX Live 2023 into /opt/texlive/2023 from a local copy of the
    # installation media.
    #
    # ISO: https://mirrors.tuna.tsinghua.edu.cn/CTAN/systems/texlive/Images/
    # To install straight from the ISO instead of the extracted mirror:
    #   ISO_FILE="/home/hanfeng/tars/texlive.iso"
    #   mount -o loop $ISO_FILE /mnt
    MOUNT_PATH="/home/hanfeng/tars/texlive_iso_mirror"

    # Abort if the media directory is missing, rather than running
    # ./install-tl from whatever directory we happen to be in.
    pushd "${MOUNT_PATH}" || return 1
    ./install-tl -no-gui -texdir /opt/texlive/2023
    local rc=$?
    popd

    # The command is `umount` (there is no `unmount`). The mount step above is
    # optional, so only unmount when /mnt really is a mount point.
    if mountpoint -q /mnt; then
        umount /mnt
    fi

    # Report the installer's status, not the status of the cleanup.
    return ${rc}
}


function install_deps() {
    # Install fontconfig and LibreOffice.
    #
    # Refresh the package index first: the clean step in this script deletes
    # /var/lib/apt/lists, after which packages cannot be located.
    # -y answers the confirmation prompt so the step also works unattended.
    apt-get update && apt-get install -y libfontconfig1 fontconfig libreoffice
}


function update_path() {
    # Append the TeX Live 2023 search paths to root's .bashrc.
    # The heredoc delimiter is quoted so $PATH, $MANPATH and $INFOPATH are
    # written literally and expanded only when .bashrc is sourced.
    cat >> /root/.bashrc <<'EOF'
export PATH=/opt/texlive/2023/bin/x86_64-linux:$PATH
export MANPATH=/opt/texlive/2023/texmf-dist/doc/man:$MANPATH
export INFOPATH=/opt/texlive/2023/texmf-dist/doc/info:$INFOPATH
EOF
}


function update_fonts() {
    # Copy the extra font directory into the system font tree, rebuild the
    # fontconfig cache and list the fonts that cover zh-cn.
    #
    # Related steps that are currently not run:
    #   mktexlsr      (add tex pkg)
    #   mkfontscale
    #   mkfontdir
    EXTR_FONT_DIR="/home/hanfeng/tars/texlive_fonts"

    cp --force --recursive "${EXTR_FONT_DIR}" /usr/share/fonts/

    # System directories only; alternative without that restriction:
    #   fc-cache -fv
    fc-cache --force --system-only --verbose

    fc-list :lang=zh-cn
}


function install_pandoc() {
    # Register the pandoc 3.1.9 binaries on root's PATH.
    #
    # Unpacking the release tarball is a manual step:
    #   PANDOC_TAR_FILE="/home/hanfeng/tars/pandoc-3.1.9-linux-amd64.tar.gz"
    #   tar zxf ${PANDOC_TAR_FILE} -C /opt/pandoc
    #
    # pandoc template
    #   commit f7d8b629330074a4400d1f2795b101d14491c968
    #   (HEAD -> master, tag: 3.1.9, origin/master, origin/HEAD)

    # Single quotes keep $PATH literal; it is expanded when .bashrc is sourced.
    printf '%s\n' 'export PATH=/opt/pandoc/pandoc-3.1.9/bin:$PATH' >> /root/.bashrc
}


function clean() {
    # Remove the apt package cache, the apt index lists and pip's cache.
    printf '%s\n' "clean"

    # Each step runs only if the previous one succeeded.
    apt-get clean \
        && rm --recursive --force /var/lib/apt/lists/* \
        && rm --recursive --force /root/.cache/pip
}


# Script entry point: the steps are toggled by (un)commenting them.
# Only the clean step is currently active.
# update_fonts
# install_texlive
# install_pandoc
clean
3 changes: 3 additions & 0 deletions src/bisheng_unstructured/api/any2pdf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import base64

from bisheng_unstructured.documents.pdf_parser.pdf_creator import PdfCreator
from bisheng_unstructured.topdf import DocxToPDF


class Any2PdfCreator(object):
Expand All @@ -11,6 +12,8 @@ def __init__(self, kwargs):
"jpg": PdfCreator,
"tiff": PdfCreator,
"bmp": PdfCreator,
"doc": DocxToPDF,
"docx": DocxToPDF,
}
self.model_params = kwargs

Expand Down
6 changes: 3 additions & 3 deletions src/bisheng_unstructured/api/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,14 @@ def to_pdf(self, inp: UnstructuredInput) -> UnstructuredOutput:
return UnstructuredOutput(status_code=400, status_message=str(e))

def predict(self, inp: UnstructuredInput) -> UnstructuredOutput:
if inp.mode == "topdf":
return self.to_pdf(inp)

if inp.file_type not in PARTITION_MAP:
raise Exception(f"file type[{inp.file_type}] not supported")
filename = inp.file_path
file_type = inp.file_type

if inp.mode == "topdf":
return self.to_pdf(inp)

# part_params = inp.parameters
part_inp = {"filename": filename, **inp.parameters}
part_func = PARTITION_MAP.get(file_type)
Expand Down
3 changes: 3 additions & 0 deletions src/bisheng_unstructured/topdf/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .docx2pdf import DocxToPDF

__all__ = ["DocxToPDF"]
47 changes: 47 additions & 0 deletions src/bisheng_unstructured/topdf/docx2pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import os
import shlex
import subprocess

from bisheng_unstructured.partition.common import convert_office_doc


class DocxToPDF(object):
    """Convert ``.doc`` / ``.docx`` files to PDF with pandoc and xelatex.

    The pandoc invocation is kept as a single-line template in
    ``self.cmd_template``; ``{0}`` is the input path and ``{1}`` the output
    path. The fonts for Latin and CJK text are fixed through ``-V`` variables.
    """

    def __init__(self, kwargs=None):
        """Build the normalised pandoc command template.

        Args:
            kwargs: Optional parameter mapping. It is not read by this class;
                the default is ``None`` rather than a shared mutable ``{}``.
        """
        cmd_template = """
          pandoc -o {1} --pdf-engine=xelatex {0}
          -V mainfont="Alibaba PuHuiTi"
          -V sansfont="Alibaba PuHuiTi"
          -V monofont="Cascadia Mono"
          -V CJKmainfont="Alibaba PuHuiTi"
          -V CJKsansfont="Alibaba PuHuiTi"
          -V CJKmonofont="Cascadia Mono"
        """

        def _norm_cmd(cmd):
            # Collapse all runs of whitespace (including newlines) to one space.
            return " ".join(cmd.split())

        self.cmd_template = _norm_cmd(cmd_template)

    def _build_command(self, input_file, output_file):
        """Return the pandoc argument list for one conversion.

        The template is tokenised first and the paths are substituted
        afterwards, so a path containing spaces or shell metacharacters always
        stays a single argument and is never interpreted by a shell.
        """
        return [
            token.format(input_file, output_file)
            for token in shlex.split(self.cmd_template)
        ]

    def render(self, input_file, output_file=None, to_bytes=False):
        """Convert ``input_file`` to PDF.

        Args:
            input_file: Path to a ``.doc`` or ``.docx`` file.
            output_file: Destination PDF path. Defaults to the input file's
                name with a ``.pdf`` extension, in the input file's directory.
            to_bytes: When true, return the content of the generated PDF.

        Returns:
            The PDF content as ``bytes`` when ``to_bytes`` is true, else
            ``None``.

        Raises:
            ValueError: If the file extension is not ``doc`` or ``docx``.
            RuntimeError: If pandoc cannot be started or exits with a
                non-zero status.
        """
        filename = os.path.basename(input_file)
        temp_dir = os.path.dirname(input_file)
        stem, ext = os.path.splitext(filename)
        type_ext = ext[1:]

        # Explicit check instead of `assert`, which is stripped under -O.
        if type_ext not in ("doc", "docx"):
            raise ValueError(
                f"unsupported file type [{type_ext}], expected doc or docx"
            )

        if output_file is None:
            output_file = os.path.join(temp_dir, stem + ".pdf")

        if type_ext == "doc":
            # Legacy .doc is converted to .docx first; the converted file is
            # expected next to the input, under the same base name.
            convert_office_doc(input_file, temp_dir)
            input_file = os.path.join(temp_dir, stem + ".docx")

        cmd = self._build_command(input_file, output_file)
        try:
            subprocess.run(cmd, check=True)
        except (OSError, subprocess.CalledProcessError) as e:
            raise RuntimeError("error in transforming doc to pdf") from e

        if to_bytes:
            with open(output_file, "rb") as fin:
                return fin.read()
        return None
13 changes: 8 additions & 5 deletions tests/regression/test_container.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,15 @@ function test_cases() {


function test_container() {
image="dataelement/bisheng-unstructured:0.0.1"
temp_ctn="bisheng_uns_v001_test"
# image="dataelement/bisheng-unstructured:0.0.1"
# temp_ctn="bisheng_uns_v001_test"

image="dataelement/bisheng-unstructured:0.0.2"
temp_ctn="bisheng_uns_v002_test"

pushd $(cd $(dirname $0); pwd)
docker run -p 10002:10001 -itd --workdir /opt/bisheng-unstructured --name ${temp_ctn} $image bash bin/entrypoint.sh
UNS_EP="127.0.0.1:10002"
docker run -p 10005:10001 -itd --workdir /opt/bisheng-unstructured --name ${temp_ctn} $image bash bin/entrypoint.sh
UNS_EP="127.0.0.1:10005"

sleep 5
# test_cases $UNS_EP
Expand All @@ -23,7 +26,7 @@ function test_container() {
curl -X GET http://${UNS_EP}/v1/config
UNS_EP=${UNS_EP} python3 test_etl4llm.py

docker stop ${temp_ctn} && docker rm ${temp_ctn}
# docker stop ${temp_ctn} && docker rm ${temp_ctn}
}


Expand Down
21 changes: 20 additions & 1 deletion tests/regression/test_etl4llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,23 @@ def test_part():
print(resp)


test_part()
def test_any2pdf():
    """POST a sample .docx in ``topdf`` mode and expect status code 200.

    The service endpoint is read from the ``UNS_EP`` environment variable and
    defaults to ``127.0.0.1:10001``. The sample path is relative, so the test
    must be run from this file's directory.
    """
    uns_ep = os.environ.get("UNS_EP", "127.0.0.1:10001")

    url = f"http://{uns_ep}/v1/etl4llm/predict"
    filename = "../../examples/docs/maoxuan_sample.docx"

    # Close the sample file deterministically instead of leaking the handle.
    with open(filename, "rb") as fin:
        b64_data = base64.b64encode(fin.read()).decode()

    inp = dict(
        filename=os.path.basename(filename),
        b64_data=[b64_data],
        mode="topdf",
    )
    resp = requests.post(url, json=inp).json()

    assert resp["status_code"] == 200, resp
    # To inspect the generated PDF locally:
    # with open('test.pdf', 'wb') as fout:
    #     fout.write(base64.b64decode(resp['b64_pdf']))


# Run only when executed as a script (e.g. `python3 test_etl4llm.py`), so that
# importing this module does not fire a network request.
if __name__ == "__main__":
    test_any2pdf()
    # test_part()
27 changes: 27 additions & 0 deletions tests/test_docx2pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from bisheng_unstructured.partition.common import convert_office_doc
from bisheng_unstructured.topdf.docx2pdf import DocxToPDF


def test1():
    """Render the sample .docx to ./data/maoxuan_sample.pdf."""
    DocxToPDF().render(
        "./examples/docs/maoxuan_sample.docx",
        "./data/maoxuan_sample.pdf",
    )


def test2():
    """Convert the sample .doc with ``convert_office_doc`` into ./data/."""
    input_file = "./examples/docs/maoxuan_sample.doc"
    # The converted file is expected at ./data/maoxuan_sample.docx.
    convert_office_doc(input_file, "./data/")


def test3():
    """Render the sample .doc to ./data/maoxuan_sample-v1.pdf."""
    DocxToPDF().render(
        "./examples/docs/maoxuan_sample.doc",
        "./data/maoxuan_sample-v1.pdf",
    )


# Run only when executed as a script, so that importing this module (for
# example during test collection) does not start a conversion.
if __name__ == "__main__":
    test1()
    # test2()
    # test3()

0 comments on commit 49a3e3e

Please sign in to comment.