-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
support word to pdf
- Loading branch information
Showing
10 changed files
with
198 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,4 +9,22 @@ function temp_build_image() { | |
} | ||
|
||
|
||
temp_build_image | ||
create_dev_image() {
    # Start the development container from the 0.0.1 image.
    # -itd: interactive, with a TTY, detached. Host port 50001 is published
    # to container port 10001, and two host directories are bind-mounted at
    # the same paths inside the container.
    image="dataelement/bisheng-unstructured:0.0.1"
    docker run -itd \
        --name bisheng-uns-v002-dev \
        -p 50001:10001 \
        -v /home/hanfeng:/home/hanfeng \
        -v /home/public:/home/public \
        "$image" bash
}
|
||
|
||
temp_build_image_v002() {
    # Snapshot the bisheng-uns-v002-dev container as the 0.0.2 image.
    # The existing image with that tag is removed before committing.
    image="dataelement/bisheng-unstructured:0.0.2"
    docker rmi "$image"
    # NOTE(review): the author value below looks like a redacted placeholder
    # - confirm the intended address.
    docker commit \
        -a "[email protected]" \
        -m "commit bisheng-unstructured image" \
        bisheng-uns-v002-dev "$image"
    # Pushing is intentionally left disabled:
    #   docker push $image
}
|
||
|
||
# create_dev_image | ||
# temp_build_image | ||
temp_build_image_v002 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
#!/bin/bash | ||
|
||
|
||
function install_texlive() {
    # Install TeX Live into /opt/texlive/2023 using the installer found in
    # a local copy of the TeX Live image.
    # ISO: https://mirrors.tuna.tsinghua.edu.cn/CTAN/systems/texlive/Images/
    # To install straight from the ISO instead, loop-mount it first:
    #   ISO_FILE="/home/hanfeng/tars/texlive.iso"
    #   mount -o loop $ISO_FILE /mnt
    MOUNT_PATH="/home/hanfeng/tars/texlive_iso_mirror"

    # Stop if the mirror directory is missing, so ./install-tl is never
    # resolved against whatever directory we happened to be in.
    pushd "${MOUNT_PATH}" || return 1
    ./install-tl -no-gui -texdir /opt/texlive/2023
    popd

    # The command is `umount` (there is no `unmount`). Only unmount when
    # /mnt is actually a mount point, i.e. when the ISO route was used.
    if mountpoint -q /mnt; then
        umount /mnt
    fi
}
|
||
|
||
function install_deps() {
    # Install fontconfig (library and tools) and LibreOffice.
    # -y answers apt's confirmation prompt so the function also works when
    # the script runs unattended (no TTY to answer on).
    apt-get install -y libfontconfig1 fontconfig libreoffice
}
|
||
|
||
update_path() {
    # Append the TeX Live 2023 binary, man and info directories to root's
    # shell startup file. Single quotes keep $PATH / $MANPATH / $INFOPATH
    # unexpanded, so they are resolved when .bashrc is sourced.
    {
        echo 'export PATH=/opt/texlive/2023/bin/x86_64-linux:$PATH'
        echo 'export MANPATH=/opt/texlive/2023/texmf-dist/doc/man:$MANPATH'
        echo 'export INFOPATH=/opt/texlive/2023/texmf-dist/doc/info:$INFOPATH'
    } >> /root/.bashrc
}
|
||
|
||
update_fonts() {
    # Copy the extra font directory into /usr/share/fonts, rebuild the
    # fontconfig cache, then list the fonts that cover zh-cn.
    #
    # Disabled steps kept for reference:
    #   mktexlsr        (add tex pkg)
    #   mkfontscale
    #   mkfontdir
    #   fc-cache -fv
    EXTR_FONT_DIR="/home/hanfeng/tars/texlive_fonts"
    cp -rf "$EXTR_FONT_DIR" /usr/share/fonts/

    # -f: force a rescan, -s: system-wide directories only, -v: verbose.
    fc-cache -f -s -v
    fc-list :lang=zh-cn
}
|
||
|
||
install_pandoc() {
    # Put the pandoc 3.1.9 bin directory on root's PATH.
    #
    # The extraction step is disabled here (presumably done by hand):
    #   PANDOC_TAR_FILE="/home/hanfeng/tars/pandoc-3.1.9-linux-amd64.tar.gz"
    #   tar zxf ${PANDOC_TAR_FILE} -C /opt/pandoc
    #
    # pandoc template reference:
    #   commit f7d8b629330074a4400d1f2795b101d14491c968
    #   (HEAD -> master, tag: 3.1.9, origin/master, origin/HEAD)

    # Single quotes keep $PATH literal so it expands when .bashrc is sourced.
    printf '%s\n' 'export PATH=/opt/pandoc/pandoc-3.1.9/bin:$PATH' >> /root/.bashrc
}
|
||
|
||
clean() {
    # Drop apt's package cache and index lists, then pip's cache for root.
    # Each removal only runs if the previous step succeeded.
    printf '%s\n' "clean"
    apt-get clean \
        && rm -rf /var/lib/apt/lists/* \
        && rm -rf /root/.cache/pip
}
|
||
|
||
# update_fonts | ||
# install_texlive | ||
# install_pandoc | ||
clean |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from .docx2pdf import DocxToPDF | ||
|
||
__all__ = ["DocxToPDF"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
import os | ||
|
||
from bisheng_unstructured.partition.common import convert_office_doc | ||
|
||
|
||
class DocxToPDF(object):
    """Convert Word documents (``.doc`` / ``.docx``) to PDF with pandoc + xelatex."""

    def __init__(self, kwargs=None):
        """Build the pandoc command template.

        Args:
            kwargs: Accepted for interface compatibility; not used by this
                class. Defaults to ``None`` rather than a shared mutable dict.
        """
        cmd_template = """
          pandoc -o {1} --pdf-engine=xelatex {0}
          -V mainfont="Alibaba PuHuiTi"
          -V sansfont="Alibaba PuHuiTi"
          -V monofont="Cascadia Mono"
          -V CJKmainfont="Alibaba PuHuiTi"
          -V CJKsansfont="Alibaba PuHuiTi"
          -V CJKmonofont="Cascadia Mono"
        """

        def _norm_cmd(cmd):
            # Collapse the multi-line template into one space-separated line.
            return " ".join(cmd.split())

        # ``{0}`` is the input path and ``{1}`` the output path.
        self.cmd_template = _norm_cmd(cmd_template)

    def _build_command(self, input_file, output_file):
        """Return the pandoc invocation as an argument list.

        The template is tokenised first and the paths are substituted per
        token, so a path containing spaces or shell metacharacters always
        stays a single argument.
        """
        import shlex

        return [
            token.format(input_file, output_file)
            for token in shlex.split(self.cmd_template)
        ]

    def render(self, input_file, output_file=None, to_bytes=False):
        """Convert ``input_file`` to PDF.

        Args:
            input_file: Path to a ``.doc`` or ``.docx`` file.
            output_file: Destination PDF path. Defaults to the input path
                with its extension replaced by ``.pdf``.
            to_bytes: When true, return the content of the generated PDF.

        Returns:
            The PDF content as ``bytes`` when ``to_bytes`` is true,
            otherwise ``None``.

        Raises:
            ValueError: If the input extension is neither ``doc`` nor ``docx``.
            Exception: If pandoc cannot be started or exits with a non-zero
                status.
        """
        import subprocess

        type_ext = input_file.rsplit(".", 1)[-1]
        filename = os.path.basename(input_file)
        temp_dir = os.path.dirname(input_file)
        stem = filename.rsplit(".", 1)[0]

        # Explicit raise instead of ``assert`` so validation survives ``-O``.
        if type_ext not in ("doc", "docx"):
            raise ValueError(
                "unsupported input type {!r}, expected doc or docx".format(type_ext)
            )

        if output_file is None:
            output_file = os.path.join(temp_dir, stem + ".pdf")

        if type_ext == "doc":
            # pandoc is fed a .docx, so legacy .doc files are converted
            # first; the result is expected next to the input file.
            convert_office_doc(input_file, temp_dir)
            input_file = os.path.join(temp_dir, stem + ".docx")

        cmd = self._build_command(input_file, output_file)
        try:
            # No shell is involved: arguments are passed through verbatim.
            completed = subprocess.run(cmd)
        except OSError as err:
            # e.g. pandoc is not installed / not on PATH.
            raise Exception("error in transforming doc to pdf") from err
        if completed.returncode != 0:
            raise Exception("error in transforming doc to pdf")

        if to_bytes:
            with open(output_file, "rb") as pdf:
                return pdf.read()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
from bisheng_unstructured.partition.common import convert_office_doc | ||
from bisheng_unstructured.topdf.docx2pdf import DocxToPDF | ||
|
||
|
||
def test1():
    """Render the sample .docx to ./data/maoxuan_sample.pdf."""
    source_docx = "./examples/docs/maoxuan_sample.docx"
    target_pdf = "./data/maoxuan_sample.pdf"
    DocxToPDF().render(source_docx, target_pdf)
|
||
|
||
def test2():
    """Run convert_office_doc on the sample .doc, targeting ./data/."""
    input_file = "./examples/docs/maoxuan_sample.doc"
    # The second argument is the target directory; the unused local that
    # only named the expected "./data/maoxuan_sample.docx" was removed.
    convert_office_doc(input_file, "./data/")
|
||
|
||
def test3():
    """Render the sample .doc to ./data/maoxuan_sample-v1.pdf."""
    source_doc = "./examples/docs/maoxuan_sample.doc"
    target_pdf = "./data/maoxuan_sample-v1.pdf"
    DocxToPDF().render(source_doc, target_pdf)
|
||
|
||
if __name__ == "__main__":
    # Manual driver: only runs when the file is executed as a script, so
    # importing the module does not kick off a conversion.
    test1()
    # test2()
    # test3()