Commit

support pdf and image document
hrfng committed Sep 5, 2023
1 parent 98e87b3 commit 7bd0c45
Showing 25 changed files with 2,468 additions and 73 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
data/
36 changes: 36 additions & 0 deletions docker/prepare.sh
@@ -0,0 +1,36 @@
#!/bin/bash

function start_docker() {
docker run --gpus=all --shm-size 2g --net=host -itd --name bisheng_unstr_dev1 \
-v /home/hanfeng:/home/hanfeng -v /home/public:/home/public ubuntu:20.04 bash
}

function prepare_env() {
# Install basic dependencies
export DEBIAN_FRONTEND=noninteractive
apt update && apt install -y nasm zlib1g-dev libssl-dev libre2-dev libb64-dev locales libsm6 libxext6 libxrender-dev libgl1 python3-dev python3-pip git

# Configure language
locale-gen en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LANG=en_US.UTF-8
export LANGUAGE=en_US.UTF-8

# Configure timezone
export TZ=Asia/Shanghai
ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
}

function install_deps() {
# apt install -y python3-dev python3-pip git
# pip3 install git+https://github.com/pypdfium2-team/ctypesgen@pypdfium2
pip3 install -r requirements.txt -i https://mirrors.tencent.com/pypi/simple
python3 -c "import nltk; nltk.download('punkt')" && \
python3 -c "import nltk; nltk.download('averaged_perceptron_tagger')"

}

prepare_env
# install_deps


17 changes: 17 additions & 0 deletions docker/runtime.Dockerfile
@@ -0,0 +1,17 @@
FROM ubuntu:20.04
MAINTAINER "dataelem inc."

# Install system library dependencies
ENV DEBIAN_FRONTEND=noninteractive
RUN apt update && apt install -y nasm zlib1g-dev libssl-dev libre2-dev libb64-dev locales libsm6 libxext6 libxrender-dev libgl1 python3-dev python3-pip git

# Configure language
RUN locale-gen en_US.UTF-8
ENV LC_ALL=en_US.UTF-8 \
LANG=en_US.UTF-8 \
LANGUAGE=en_US.UTF-8

# Configure timezone
ENV TZ=Asia/Shanghai
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

Binary file added examples/docs/maoxuan_intro_with_table.jpg
Binary file added examples/docs/maoxuan_v1.pdf
Binary file not shown.
68 changes: 67 additions & 1 deletion requirements.txt
@@ -1 +1,67 @@
emoji
# base
chardet==5.1.0
filetype==1.2.0
python-magic==0.4.27
nltk==3.8.1
tabulate==0.9.0
requests==2.31.0
urllib3==1.26.16
beautifulsoup4==4.12.2
emoji==2.8.0

# doc and docx
lxml==4.9.3
python-docx==0.8.11

# csv, tsv
numpy==1.24.4
pandas==2.0.3
python-dateutil==2.8.2
pytz==2023.3
six==1.16.0
tzdata==2023.3

# epub
ebooklib==0.18

# markdown
importlib-metadata==6.8.0
markdown==3.4.4
zipp==3.16.2

# msg
msg-parser==1.2.0
olefile==0.46

# odt, pandoc, rtf, rst, org
pypandoc==1.11

# pdf
pdf2image==1.16.3
pdfminer-six==20221105
pdfplumber==0.10.2
wheel==0.41.0
pypdfium2==4.18.0
PyMuPDF==1.23.2
opencv-python==4.8.0.76
certifi==2023.7.22
cffi==1.15.1
charset-normalizer==3.2.0
contourpy==1.1.0
cryptography==41.0.3
cycler==0.11.0
fonttools==4.42.1
idna==3.4
scipy==1.10.1
shapely==2.0.1
pydantic==1.10.12

# pptx
pillow==10.0.0
python-pptx==0.6.21
xlsxwriter==3.1.2

# xlsx
et-xmlfile==1.1.0
openpyxl==3.1.2
xlrd==2.0.1
2 changes: 1 addition & 1 deletion setup.py
@@ -1,4 +1,4 @@
# Copyright (c) 2020 pybackend libs Authors. All Rights Reserved.
# Copyright (c) 2020 Dataelem Inc. Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
78 changes: 78 additions & 0 deletions src/unstructured/documents/html_utils.py
@@ -0,0 +1,78 @@
from unstructured.documents.markdown import transform_html_table_to_md


def visualize_html(elements, output_file):
html_prefix = """
<html>
<head>
<style>
table {
font-family: arial, sans-serif;
border-collapse: collapse;
width: 100%;
}
td, th {
border: 1px solid #dddddd;
text-align: left;
padding: 8px;
}
tr:nth-child(even) {
background-color: #dddddd;
}
</style>
</head>
<body>
"""

html_suffix = "</body></html>"

styles = [
'style="background-color: #EBEBEB;"',
'style="background-color: #ABBAEA;"'
]
idx = 0

table_style = 'style="border:1px solid black;"'

texts = []
for el in elements:
if el.category == 'Title':
text = f'<h1>{el.text}</h1>'
elif el.category == 'Table':
text = el.metadata.text_as_html
text = text.replace('\n', ' ')
else:
text = f'<p {styles[idx % 2]}>{el.text}</p>'
idx += 1

if text:
texts.append(text)

body_content = '\n'.join(texts)
html_str = html_prefix + body_content + html_suffix
with open(output_file, 'w') as fout:
fout.write(html_str)


def save_to_txt(elements, output_file):
text_elem_sep = '\n'
content_page = []
is_first_elem = True
for el in elements:
label, text = el.category, el.text
if is_first_elem:
f_text = text + '\n' if label == 'Title' else text
content_page.append(f_text)
is_first_elem = False
else:
if label == 'Title':
content_page.append('\n\n' + text + '\n')
elif label == 'Table':
content_page.append('\n\n' + text + '\n')
else:
content_page.append(text_elem_sep + text)

with open(output_file, 'w') as fout:
fout.write(''.join(content_page))
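A minimal usage sketch of the two helpers above (not part of this commit): it builds stand-in elements that mimic the .category / .text / .metadata.text_as_html interface the functions expect, and the output file names are made up.

# Hypothetical usage sketch, not part of this commit.
from types import SimpleNamespace

from unstructured.documents.html_utils import save_to_txt, visualize_html

# Stand-in elements mimicking the interface used by visualize_html/save_to_txt.
elements = [
    SimpleNamespace(category='Title', text='Chapter 1', metadata=None),
    SimpleNamespace(
        category='Table',
        text='a b',
        metadata=SimpleNamespace(
            text_as_html='<table><tr><td>a</td><td>b</td></tr></table>'),
    ),
    SimpleNamespace(category='NarrativeText', text='Some body text.',
                    metadata=None),
]

visualize_html(elements, 'preview.html')  # titles as <h1>, tables kept as HTML
save_to_txt(elements, 'preview.txt')      # titles and tables padded with blank lines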
160 changes: 160 additions & 0 deletions src/unstructured/documents/markdown.py
@@ -0,0 +1,160 @@
from lxml.html.clean import Cleaner
import lxml
from lxml import etree
from lxml.builder import E

import re

RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL)


def norm_text(e):
return re.sub(RE_MULTISPACE_INCLUDING_NEWLINES, ' ', str(e) or "").strip()


def markdown_table(rows):
def _format_row(r):
content = ' | '.join(r)
content = '| ' + content + ' |'
return content

def _format_header(n):
r = ['---'] * n
content = ' | '.join(r)
content = '| ' + content + ' |'
return content

if not rows: return ''
r0 = rows[0]
max_cols = max(map(len, rows))
first_cols = len(r0)
    head_cols_threshold = 3
    if max_cols - first_cols <= head_cols_threshold:
first_cols = max_cols

content = [_format_row(r0)]
content.append(_format_header(first_cols))
for r in rows[1:]:
content.append(_format_row(r))

return '\n'.join(content)


def transform_html_table_to_md(html_table_str, field_sep = ' '):
table_node = lxml.html.fromstring(html_table_str)
rows = []
for thead_node in table_node.xpath('.//thead'):
row = []
texts = tuple(thead_node.xpath('.//th//text()'))
texts = list(map(norm_text, texts))
row = texts

if row: rows.append(row)

for tr in table_node.xpath('.//tr'):
row = []
for e in tr.getchildren():
texts = tuple(e.xpath('.//text()'))
texts = map(norm_text, texts)
texts = [t for t in texts if t]
field_text = field_sep.join(texts)
row.append(field_text)

if row: rows.append(row)

table_html = etree.tostring(table_node)

cleaner = Cleaner(
remove_unknown_tags=False,
allow_tags=[
"table", "thead", "tbody", "td", "tr", 'th',
],
style=True,
page_structure=False)
clean_table_html = cleaner.clean_html(table_html).decode()
text = markdown_table(rows)

return dict(text=text, html=clean_table_html)


def merge_md_tables(tables, has_header=False) -> str:
if not tables: return ''
content = tables[0]
for t in tables[1:]:
rows = t.split('\n')
rows = rows[2:] if has_header else [rows[0]] + rows[2:]
content += '\n' + '\n'.join(rows)

return content


def merge_html_tables(tables, has_header=False) -> str:
if not tables: return ''

# print('---table0/1---', has_header)
# print(tables[0])
# print(tables[1])

contents = ['<table>']
table_node = lxml.html.fromstring(tables[0])

for thead_node in table_node.xpath('.//thead'):
contents.append(etree.tostring(thead_node))

for tr in table_node.xpath('./tbody//tr'):
contents.append(etree.tostring(tr))

for t in tables[1:]:
table_node = lxml.html.fromstring(t)
if has_header:
for tr in table_node.xpath('./tbody//tr'):
contents.append(etree.tostring(tr))
else:
tds = []
trs = []
for thead_node in table_node.xpath('.//thead'):
row = []
texts = tuple(thead_node.xpath('.//th//text()'))
for text in texts:
tds.append("<td>{}</td>".format(text))

for tr in thead_node.xpath('.//tr'):
trs.append(etree.tostring(tr))

if tds:
tr = "<tr>{}</tr>".format(''.join(tds))
contents.append(tr)

if trs:
tr = b'\n'.join(trs)
contents.append(tr)

for tr in table_node.xpath('./tbody//tr'):
contents.append(etree.tostring(tr))

contents.append('</table>')

tables = []
for e in contents:
tables.append(e.decode().strip() if isinstance(e, bytes) else e)
return '\n'.join(tables)


def transform_list_to_table(cols):
contents = ['<table><thead>']
for col in cols:
contents.append("<th>{}</th>".format(col))

contents.append('</thead></table>')
return '\n'.join(contents)


def clean_html_table(table_html):
cleaner = Cleaner(
remove_unknown_tags=False,
allow_tags=[
"table", "td", "tr", 'th',
],
style=True,
page_structure=False)
return cleaner.clean_html(table_html)
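A short sketch (not part of this commit) showing how the table helpers above combine; the HTML table string is a toy input made up for illustration.

# Hypothetical usage sketch, not part of this commit.
from unstructured.documents.markdown import (merge_html_tables,
                                              transform_html_table_to_md)

table_html = ('<table><thead><tr><th>name</th><th>count</th></tr></thead>'
              '<tbody><tr><td>pdf</td><td>3</td></tr></tbody></table>')

result = transform_html_table_to_md(table_html)
print(result['text'])  # pipe-delimited Markdown rows with a '---' rule after the first row
print(result['html'])  # same table with only table/thead/tbody/tr/th/td tags kept

# Stitch two page-split fragments of one table back together;
# has_header=True keeps only the first fragment's header rows.
merged = merge_html_tables([table_html, table_html], has_header=True)
print(merged)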
Empty file.
