-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
25 changed files
with
2,468 additions
and
73 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
data/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
#!/bin/bash | ||
|
||
# Launch the detached development container used for this project.
# The container is created from ubuntu:20.04 with all GPUs exposed, host
# networking, 2g of shared memory, and the two host directories bind-mounted
# at identical paths inside the container.
function start_docker() {
  local mounts=(
    -v /home/hanfeng:/home/hanfeng
    -v /home/public:/home/public
  )

  docker run --gpus=all --shm-size 2g --net=host -itd \
    --name bisheng_unstr_dev1 "${mounts[@]}" ubuntu:20.04 bash
}
|
||
# Prepare the system-level environment: install build/runtime packages,
# generate and export the en_US.UTF-8 locale, and set the timezone.
# Intended to run as root (it writes to /etc and installs packages).
function prepare_env() {
  # Install basic dependencies.
  # apt-get is used instead of apt because apt's command-line interface is
  # meant for interactive use and prints a warning when scripted.
  export DEBIAN_FRONTEND=noninteractive
  apt-get update && apt-get install -y nasm zlib1g-dev libssl-dev libre2-dev libb64-dev locales libsm6 libxext6 libxrender-dev libgl1 python3-dev python3-pip git

  # Configure language
  locale-gen en_US.UTF-8
  export LC_ALL=en_US.UTF-8
  export LANG=en_US.UTF-8
  export LANGUAGE=en_US.UTF-8

  # Configure timezone (expansions are quoted so the value is never word-split)
  export TZ=Asia/Shanghai
  ln -snf "/usr/share/zoneinfo/${TZ}" /etc/localtime && echo "${TZ}" > /etc/timezone
}
|
||
# Install the Python requirements from the Tencent PyPI mirror, then fetch
# the NLTK resources one after another. A failed download stops the sequence
# and its exit status becomes the function's status.
function install_deps() {
  # Optional steps kept for reference:
  # apt install -y python3-dev python3-pip git
  # pip3 install git+https://github.com/pypdfium2-team/ctypesgen@pypdfium2
  pip3 install -r requirements.txt -i https://mirrors.tencent.com/pypi/simple

  local resource
  for resource in punkt averaged_perceptron_tagger; do
    python3 -c "import nltk; nltk.download('${resource}')" || return
  done
}
|
||
# Script entry point: only the system-level setup runs by default.
prepare_env
# The install_deps call below is commented out, so the Python requirements
# and NLTK data are not installed unless it is re-enabled.
# install_deps
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
FROM ubuntu:20.04
# MAINTAINER is deprecated; the maintainer is recorded as a label instead.
LABEL maintainer="dataelem inc."

# Install system library dependencies.
# apt-get is used because apt's CLI is intended for interactive sessions.
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y nasm zlib1g-dev libssl-dev libre2-dev libb64-dev locales libsm6 libxext6 libxrender-dev libgl1 python3-dev python3-pip git

# Configure language
RUN locale-gen en_US.UTF-8
ENV LC_ALL=en_US.UTF-8 \
    LANG=en_US.UTF-8 \
    LANGUAGE=en_US.UTF-8

# Configure timezone
ENV TZ=Asia/Shanghai
RUN ln -snf "/usr/share/zoneinfo/$TZ" /etc/localtime && echo "$TZ" > /etc/timezone
|
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,67 @@ | ||
emoji | ||
# base | ||
chardet==5.1.0 | ||
filetype==1.2.0 | ||
python-magic==0.4.27 | ||
nltk==3.8.1 | ||
tabulate==0.9.0 | ||
requests==2.31.0 | ||
urllib3==1.26.16 | ||
beautifulsoup4==4.12.2 | ||
emoji==2.8.0 | ||
|
||
# doc and docx | ||
lxml==4.9.3 | ||
python-docx==0.8.11 | ||
|
||
# csv, tsv | ||
numpy==1.24.4 | ||
pandas==2.0.3 | ||
python-dateutil==2.8.2 | ||
pytz==2023.3 | ||
six==1.16.0 | ||
tzdata==2023.3 | ||
|
||
# epub | ||
ebooklib==0.18 | ||
|
||
# markdown | ||
importlib-metadata==6.8.0 | ||
markdown==3.4.4 | ||
zipp==3.16.2 | ||
|
||
# msg | ||
msg-parser==1.2.0 | ||
olefile==0.46 | ||
|
||
# odt, pandoc, rtf, rst, org | ||
pypandoc==1.11 | ||
|
||
pdf2image==1.16.3 | ||
pdfminer-six==20221105 | ||
pdfplumber==0.10.2 | ||
wheel==0.41.0 | ||
pypdfium2==4.18.0 | ||
PyMuPDF==1.23.2 | ||
opencv-python==4.8.0.76 | ||
certifi==2023.7.22 | ||
cffi==1.15.1 | ||
charset-normalizer==3.2.0 | ||
contourpy==1.1.0 | ||
cryptography==41.0.3 | ||
cycler==0.11.0 | ||
fonttools==4.42.1 | ||
idna==3.4 | ||
scipy==1.10.1 | ||
shapely==2.0.1 | ||
pydantic==1.10.12 | ||
|
||
# pptx | ||
pillow==10.0.0 | ||
python-pptx==0.6.21 | ||
xlsxwriter==3.1.2 | ||
|
||
# xlsx | ||
et-xmlfile==1.1.0 | ||
openpyxl==3.1.2 | ||
xlrd==2.0.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
from unstructured.documents.markdown import transform_html_table_to_md | ||
|
||
|
||
def visualize_html(elements, output_file):
    """Write *elements* to *output_file* as a standalone HTML page.

    Each element is rendered according to its ``category``:

    * ``'Title'`` becomes an ``<h1>`` heading.
    * ``'Table'`` is emitted from ``el.metadata.text_as_html`` with newlines
      replaced by spaces; when that attribute is ``None`` the element's
      plain text is shown in a paragraph instead of raising.
    * Anything else becomes a ``<p>`` whose background alternates between
      two colours so neighbouring elements are easy to tell apart.

    Element text is HTML-escaped before insertion so characters such as
    ``<`` and ``&`` are displayed literally. The file is written as UTF-8
    and the page declares that charset.

    Args:
        elements: Iterable of objects exposing ``category``, ``text`` and
            ``metadata.text_as_html``.
        output_file: Path of the HTML file to create or overwrite.
    """
    # Function-scope import keeps this block self-contained.
    import html

    html_prefix = """
<html>
<head>
<meta charset="utf-8">
<style>
table {
font-family: arial, sans-serif;
border-collapse: collapse;
width: 100%;
}
td, th {
border: 1px solid #dddddd;
text-align: left;
padding: 8px;
}
tr:nth-child(even) {
background-color: #dddddd;
}
</style>
</head>
<body>
"""

    html_suffix = "</body></html>"

    styles = [
        'style="background-color: #EBEBEB;"',
        'style="background-color: #ABBAEA;"',
    ]
    idx = 0  # counts striped paragraphs only, so the colours alternate

    texts = []
    for el in elements:
        escaped_text = html.escape(el.text or '', quote=False)
        if el.category == 'Title':
            text = f'<h1>{escaped_text}</h1>'
        elif el.category == 'Table':
            table_html = el.metadata.text_as_html
            if table_html is None:
                # No HTML rendering is available: fall back to the raw text.
                text = f'<p>{escaped_text}</p>'
            else:
                text = table_html.replace('\n', ' ')
        else:
            text = f'<p {styles[idx % 2]}>{escaped_text}</p>'
            idx += 1

        if text:
            texts.append(text)

    body_content = '\n'.join(texts)
    html_str = html_prefix + body_content + html_suffix
    # Explicit encoding: the default depends on the locale and may not be
    # able to represent non-ASCII document text.
    with open(output_file, 'w', encoding='utf-8') as fout:
        fout.write(html_str)
|
||
|
||
def save_to_txt(elements, output_file):
    """Write the text of *elements* to *output_file* as plain text.

    Layout rules:

    * The first element is written without a leading separator; a Title is
      followed by a newline.
    * Later ``'Title'`` and ``'Table'`` elements are preceded by a blank
      line and followed by a newline.
    * Every other element starts on a new line.

    The file is written as UTF-8 so the result does not depend on the
    locale's default encoding.

    Args:
        elements: Iterable of objects exposing ``category`` and ``text``.
        output_file: Path of the text file to create or overwrite.
    """
    text_elem_sep = '\n'
    chunks = []
    for position, el in enumerate(elements):
        label, text = el.category, el.text
        if position == 0:
            chunks.append(text + '\n' if label == 'Title' else text)
        elif label in ('Title', 'Table'):
            chunks.append('\n\n' + text + '\n')
        else:
            chunks.append(text_elem_sep + text)

    with open(output_file, 'w', encoding='utf-8') as fout:
        fout.write(''.join(chunks))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,160 @@ | ||
from lxml.html.clean import Cleaner | ||
import lxml | ||
from lxml import etree | ||
from lxml.builder import E | ||
|
||
import re | ||
|
||
RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL)


def norm_text(e):
    """Return *e* as text with whitespace runs collapsed and ends trimmed.

    Every run of whitespace (spaces, tabs, newlines) becomes one space.
    ``None`` yields an empty string; previously it was stringified to the
    literal text ``"None"`` because ``str(e)`` is never falsy for ``None``.

    Args:
        e: A string, or any object whose ``str()`` form should be normalised.

    Returns:
        The normalised string.
    """
    if e is None:
        return ""
    return RE_MULTISPACE_INCLUDING_NEWLINES.sub(' ', str(e)).strip()
|
||
|
||
def markdown_table(rows):
    """Render *rows* as a pipe-delimited Markdown table.

    The first row is the header and is followed by a ``---`` delimiter row.
    If the widest row has at most three more cells than the header, the
    delimiter row is widened to that width and the header is padded with
    empty cells to match. Markdown renderers require the header and
    delimiter rows to have the same number of cells. If the difference is
    larger, the delimiter keeps the header's own width.

    Args:
        rows: Sequence of rows, each a sequence of cell strings.

    Returns:
        The Markdown table, or ``''`` when *rows* is empty.
    """
    def _format_row(cells):
        return '| ' + ' | '.join(cells) + ' |'

    if not rows:
        return ''

    header = list(rows[0])
    max_cols = max(map(len, rows))
    header_cols = len(header)
    head_cols_threshold = 3
    if max_cols - header_cols <= head_cols_threshold:
        # Pad the header so it agrees with the widened delimiter row.
        header.extend([''] * (max_cols - header_cols))
        header_cols = max_cols

    lines = [_format_row(header), _format_row(['---'] * header_cols)]
    lines.extend(_format_row(r) for r in rows[1:])
    return '\n'.join(lines)
|
||
|
||
def transform_html_table_to_md(html_table_str, field_sep=' '):
    """Convert an HTML table into Markdown text plus a cleaned HTML copy.

    Header cells come from the ``<th>`` elements under each ``<thead>``;
    body cells come from the children of each ``<tr>``. In both cases one
    element yields exactly one cell: its non-empty text fragments are
    normalised with ``norm_text`` and joined with *field_sep*. Building the
    header per ``<th>`` (not per text node) keeps empty header cells and
    avoids splitting a header cell that contains several text nodes.

    NOTE(review): ``.//tr`` also matches rows nested inside ``<thead>``, so
    a header written as ``<thead><tr><th>..`` appears to be emitted twice.
    Confirm against the producers of these tables before changing that.

    Args:
        html_table_str: HTML markup of the table.
        field_sep: Separator placed between text fragments of one cell.

    Returns:
        ``dict(text=<markdown table>, html=<cleaned table html>)``.
    """
    def _cell_text(node):
        fragments = (norm_text(t) for t in node.xpath('.//text()'))
        return field_sep.join(f for f in fragments if f)

    table_node = lxml.html.fromstring(html_table_str)
    rows = []
    for thead_node in table_node.xpath('.//thead'):
        header = [_cell_text(th) for th in thead_node.xpath('.//th')]
        if header:
            rows.append(header)

    for tr in table_node.xpath('.//tr'):
        row = [_cell_text(cell) for cell in tr.getchildren()]
        if row:
            rows.append(row)

    table_html = etree.tostring(table_node)

    # Keep only the structural table tags in the HTML copy.
    cleaner = Cleaner(
        remove_unknown_tags=False,
        allow_tags=[
            "table", "thead", "tbody", "td", "tr", 'th',
        ],
        style=True,
        page_structure=False)
    clean_table_html = cleaner.clean_html(table_html).decode()
    text = markdown_table(rows)

    return dict(text=text, html=clean_table_html)
|
||
|
||
def merge_md_tables(tables, has_header=False) -> str:
    """Concatenate Markdown tables into one table.

    The first table is kept whole. For each following table the delimiter
    row (second line) is dropped. With ``has_header=True`` its first line
    is dropped too, being a repeated header; otherwise the first line is
    kept as a data row. Empty lines are never appended, so a continuation
    table that contributes no rows cannot insert a blank line into the
    merged table.

    Args:
        tables: Sequence of Markdown table strings.
        has_header: Whether continuation tables repeat the header row.

    Returns:
        The merged Markdown table, or ``''`` when *tables* is empty.
    """
    if not tables:
        return ''

    merged = [tables[0]]
    for table in tables[1:]:
        lines = table.split('\n')
        kept = lines[2:] if has_header else lines[:1] + lines[2:]
        merged.extend(line for line in kept if line.strip())

    return '\n'.join(merged)
|
||
|
||
def merge_html_tables(tables, has_header=False) -> str:
    """Merge several HTML tables into a single ``<table>`` string.

    The first table contributes its ``<thead>`` elements and the ``<tr>``
    rows under its ``<tbody>``. Each following table contributes its
    ``<tbody>`` rows. When ``has_header`` is false, the content of its
    ``<thead>`` is first added as ordinary rows: one ``<td>`` row built from
    the header cell texts, then any ``<tr>`` found inside the ``<thead>``.

    Header texts are HTML-escaped before being wrapped in ``<td>``, because
    ``text()`` yields decoded text and raw ``<`` or ``&`` would otherwise
    corrupt the markup.

    NOTE(review): rows are taken from ``./tbody//tr`` only, so a table with
    no ``<tbody>`` element appears to contribute no body rows. Also, a
    ``<thead>`` containing ``<tr><th>..`` yields both the rebuilt ``<td>``
    row and the original ``<tr>``. Confirm both against the callers.

    Args:
        tables: Sequence of HTML table strings.
        has_header: Whether continuation tables repeat the header.

    Returns:
        The merged table markup, or ``''`` when *tables* is empty.
    """
    # Function-scope import keeps this block self-contained.
    import html

    if not tables:
        return ''

    def _body_rows(node):
        return [etree.tostring(tr) for tr in node.xpath('./tbody//tr')]

    contents = ['<table>']
    table_node = lxml.html.fromstring(tables[0])

    for thead_node in table_node.xpath('.//thead'):
        contents.append(etree.tostring(thead_node))
    contents.extend(_body_rows(table_node))

    for t in tables[1:]:
        table_node = lxml.html.fromstring(t)
        if not has_header:
            # The continuation's header is real data: demote it to rows.
            tds = []
            trs = []
            for thead_node in table_node.xpath('.//thead'):
                for text in thead_node.xpath('.//th//text()'):
                    tds.append(
                        "<td>{}</td>".format(html.escape(text, quote=False)))

                for tr in thead_node.xpath('.//tr'):
                    trs.append(etree.tostring(tr))

            if tds:
                contents.append("<tr>{}</tr>".format(''.join(tds)))

            if trs:
                contents.append(b'\n'.join(trs))

        contents.extend(_body_rows(table_node))

    contents.append('</table>')

    # etree.tostring returns bytes; normalise every piece to str.
    return '\n'.join(
        e.decode().strip() if isinstance(e, bytes) else e for e in contents)
|
||
|
||
def transform_list_to_table(cols):
    """Build a header-only HTML table from a list of column values.

    Each value becomes one ``<th>`` placed directly inside ``<thead>``.
    Values are HTML-escaped so text containing ``<``, ``>`` or ``&`` cannot
    break the generated markup.

    Args:
        cols: Iterable of column values; each is converted with ``str()``.

    Returns:
        The table markup, one element per line.
    """
    # Function-scope import keeps this block self-contained.
    import html

    cells = [
        "<th>{}</th>".format(html.escape(str(col), quote=False))
        for col in cols
    ]
    return '\n'.join(['<table><thead>', *cells, '</thead></table>'])
|
||
|
||
def clean_html_table(table_html):
    """Run *table_html* through an lxml ``Cleaner`` limited to table tags.

    The cleaner is configured with ``allow_tags`` of ``table``, ``td``,
    ``tr`` and ``th``, with ``style=True``, ``page_structure=False`` and
    ``remove_unknown_tags=False``.

    Args:
        table_html: The table markup to clean.

    Returns:
        Whatever ``Cleaner.clean_html`` returns for *table_html*.
    """
    kept_tags = ["table", "td", "tr", 'th']
    table_cleaner = Cleaner(
        remove_unknown_tags=False,
        allow_tags=kept_tags,
        style=True,
        page_structure=False,
    )
    cleaned = table_cleaner.clean_html(table_html)
    return cleaned
Empty file.
Oops, something went wrong.