Support PDF and image documents

dataelement · Sep 5, 2023 · 7bd0c45 · 7bd0c45
1 parent 98e87b3
commit 7bd0c45
Show file tree

Hide file tree

Showing 25 changed files with 2,468 additions and 73 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+data/
diff --git a/docker/prepare.sh b/docker/prepare.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+# Start the detached development container `bisheng_unstr_dev1` from
+# ubuntu:20.04 with all GPUs, host networking, 2g of shared memory and the
+# two host directories bind-mounted at the same paths.
+function start_docker() {
+  local -a run_opts=(
+    --gpus=all
+    --shm-size 2g
+    --net=host
+    -itd
+    --name bisheng_unstr_dev1
+    -v /home/hanfeng:/home/hanfeng
+    -v /home/public:/home/public
+  )
+  docker run "${run_opts[@]}" ubuntu:20.04 bash
+}
+
+# Install the system packages and configure locale and timezone.
+# Presumably meant to run as root inside the Ubuntu container: it installs
+# packages and writes /etc/localtime and /etc/timezone.
+function prepare_env() {
+  # Install basic dependencies.
+  # apt-get is used instead of apt: apt's command-line interface is not
+  # guaranteed to be stable for use in scripts.
+  export DEBIAN_FRONTEND=noninteractive
+  apt-get update && apt-get install -y nasm zlib1g-dev libssl-dev libre2-dev libb64-dev locales libsm6 libxext6 libxrender-dev libgl1 python3-dev python3-pip git
+
+  # Configure language
+  locale-gen en_US.UTF-8
+  export LC_ALL=en_US.UTF-8
+  export LANG=en_US.UTF-8
+  export LANGUAGE=en_US.UTF-8
+
+  # Configure timezone (expansions quoted so the value is never word-split)
+  export TZ=Asia/Shanghai
+  ln -snf "/usr/share/zoneinfo/$TZ" /etc/localtime && echo "$TZ" > /etc/timezone
+}
+
+# Install the Python requirements from the Tencent PyPI mirror, then download
+# the NLTK 'punkt' and 'averaged_perceptron_tagger' resources.
+# NOTE(review): the call to this function at the bottom of the script is
+# commented out, so it does not run unless invoked explicitly.
+function install_deps() {
+  # apt install -y python3-dev python3-pip git
+  # pip3 install git+https://github.com/pypdfium2-team/ctypesgen@pypdfium2
+  pip3 install -r requirements.txt -i https://mirrors.tencent.com/pypi/simple
+  # The second download only starts if the first python3 process exits with 0.
+  python3 -c "import nltk; nltk.download('punkt')" && \
+    python3 -c "import nltk; nltk.download('averaged_perceptron_tagger')"
+
+}
+
+# Entry point: only the system-level setup runs by default. start_docker is
+# never called here, and install_deps is left disabled.
+prepare_env
+# install_deps
+
+
diff --git a/docker/runtime.Dockerfile b/docker/runtime.Dockerfile
@@ -0,0 +1,17 @@
+FROM ubuntu:20.04
+# MAINTAINER is deprecated; the maintainer is recorded as a label instead.
+LABEL maintainer="dataelem inc."
+
+# Install system library dependencies
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt-get update && apt-get install -y nasm zlib1g-dev libssl-dev libre2-dev libb64-dev locales libsm6 libxext6 libxrender-dev libgl1 python3-dev python3-pip git
+
+# Configure language
+RUN locale-gen en_US.UTF-8
+ENV LC_ALL=en_US.UTF-8 \
+    LANG=en_US.UTF-8 \
+    LANGUAGE=en_US.UTF-8
+
+# Configure timezone
+ENV TZ=Asia/Shanghai
+RUN ln -snf "/usr/share/zoneinfo/$TZ" /etc/localtime && echo "$TZ" > /etc/timezone
+
diff --git a/examples/docs/maoxuan_intro_with_table.jpg b/examples/docs/maoxuan_intro_with_table.jpg
diff --git a/examples/docs/maoxuan_v1.pdf b/examples/docs/maoxuan_v1.pdf
diff --git a/requirements.txt b/requirements.txt
@@ -1 +1,67 @@
-emoji
+# base
+chardet==5.1.0
+filetype==1.2.0
+python-magic==0.4.27
+nltk==3.8.1
+tabulate==0.9.0
+requests==2.31.0
+urllib3==1.26.16
+beautifulsoup4==4.12.2
+emoji==2.8.0
+
+# doc and docx
+lxml==4.9.3
+python-docx==0.8.11
+
+# csv, tsv
+numpy==1.24.4
+pandas==2.0.3
+python-dateutil==2.8.2
+pytz==2023.3
+six==1.16.0
+tzdata==2023.3
+
+# epub
+ebooklib==0.18
+
+# markdown
+importlib-metadata==6.8.0
+markdown==3.4.4
+zipp==3.16.2
+
+# msg
+msg-parser==1.2.0
+olefile==0.46
+
+# odt, pandoc, rtf, rst, org
+pypandoc==1.11
+
+# pdf
+pdf2image==1.16.3
+pdfminer-six==20221105
+pdfplumber==0.10.2
+wheel==0.41.0
+pypdfium2==4.18.0
+PyMuPDF==1.23.2
+opencv-python==4.8.0.76
+certifi==2023.7.22
+cffi==1.15.1
+charset-normalizer==3.2.0
+contourpy==1.1.0
+cryptography==41.0.3
+cycler==0.11.0
+fonttools==4.42.1
+idna==3.4
+scipy==1.10.1
+shapely==2.0.1
+pydantic==1.10.12
+
+# pptx
+pillow==10.0.0
+python-pptx==0.6.21
+xlsxwriter==3.1.2
+
+# xlsx
+et-xmlfile==1.1.0
+openpyxl==3.1.2
+xlrd==2.0.1
diff --git a/setup.py b/setup.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 pybackend libs Authors. All Rights Reserved.
+# Copyright (c) 2020 Dataelem Inc. Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/src/unstructured/documents/html_utils.py b/src/unstructured/documents/html_utils.py
@@ -0,0 +1,78 @@
+from unstructured.documents.markdown import transform_html_table_to_md
+
+
+def visualize_html(elements, output_file):
+    """Write *elements* to *output_file* as a standalone HTML page.
+
+    Elements whose category is ``'Title'`` become ``<h1>`` headings,
+    ``'Table'`` elements are emitted from ``metadata.text_as_html`` (with
+    newlines flattened to spaces), and every other element becomes a ``<p>``
+    whose background alternates between two colours so neighbouring
+    paragraphs can be told apart.
+
+    Args:
+        elements: Iterable of objects exposing ``category``, ``text`` and,
+            for tables, ``metadata.text_as_html``.
+        output_file: Path of the HTML file to create or overwrite. The file
+            is written as UTF-8.
+    """
+    # Function-scope import keeps this block self-contained.
+    from html import escape
+
+    html_prefix = """
+    <html>
+    <head>
+    <meta charset="utf-8">
+    <style>
+        table {
+          font-family: arial, sans-serif;
+          border-collapse: collapse;
+          width: 100%;
+        }
+
+        td, th {
+          border: 1px solid #dddddd;
+          text-align: left;
+          padding: 8px;
+        }
+
+        tr:nth-child(even) {
+          background-color: #dddddd;
+        }
+    </style>
+    </head>
+    <body>
+    """
+
+    html_suffix = "</body></html>"
+
+    styles = [
+      'style="background-color: #EBEBEB;"',
+      'style="background-color: #ABBAEA;"'
+    ]
+    idx = 0
+
+    texts = []
+    for el in elements:
+        if el.category == 'Title':
+            # Element text is plain text: escape it so characters such as
+            # '<' or '&' cannot break the generated markup.
+            text = f'<h1>{escape(el.text or "", quote=False)}</h1>'
+        elif el.category == 'Table':
+            # The table markup is already HTML and is embedded as-is; a
+            # table without markup yields '' and is skipped below.
+            text = (el.metadata.text_as_html or '').replace('\n', ' ')
+        else:
+            text = f'<p {styles[idx % 2]}>{escape(el.text or "", quote=False)}</p>'
+            idx += 1
+
+        if text:
+            texts.append(text)
+
+    body_content = '\n'.join(texts)
+    html_str = html_prefix + body_content + html_suffix
+    # Explicit encoding: the output must not depend on the platform locale,
+    # and it has to match the charset declared in <head>.
+    with open(output_file, 'w', encoding='utf-8') as fout:
+        fout.write(html_str)
+
+
+def save_to_txt(elements, output_file):
+    """Write the text of *elements* to *output_file* as plain text.
+
+    The first element is written without a leading separator (a title gets
+    a trailing newline). After that, ``'Title'`` and ``'Table'`` elements
+    are set off by a blank line before and a newline after, and every other
+    element is appended on a new line.
+
+    Args:
+        elements: Iterable of objects exposing ``category`` and ``text``.
+        output_file: Path of the text file to create or overwrite. The file
+            is written as UTF-8.
+    """
+    text_elem_sep = '\n'
+    # Categories that are rendered as their own visually separated block.
+    block_labels = ('Title', 'Table')
+
+    content_page = []
+    for position, el in enumerate(elements):
+        label = el.category
+        # A missing text is treated as empty instead of raising TypeError.
+        text = el.text or ''
+        if position == 0:
+            content_page.append(text + '\n' if label == 'Title' else text)
+        elif label in block_labels:
+            content_page.append('\n\n' + text + '\n')
+        else:
+            content_page.append(text_elem_sep + text)
+
+    # Explicit encoding so the output does not depend on the platform locale.
+    with open(output_file, 'w', encoding='utf-8') as fout:
+        fout.write(''.join(content_page))
diff --git a/src/unstructured/documents/markdown.py b/src/unstructured/documents/markdown.py
@@ -0,0 +1,160 @@
+from lxml.html.clean import Cleaner
+import lxml
+from lxml import etree
+from lxml.builder import E
+
+import re
+
+# Matches any run of whitespace. "\s" already covers newlines, so re.DOTALL
+# (which only affects ".") has no effect on this pattern.
+RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL)
+
+
+def norm_text(e):
+    """Return ``str(e)`` with whitespace runs collapsed to one space and trimmed.
+
+    ``None`` is normalised to the empty string. The previous
+    ``str(e) or ""`` could never fall back, because ``str(None)`` is the
+    non-empty string ``"None"``.
+    """
+    text = "" if e is None else str(e)
+    return RE_MULTISPACE_INCLUDING_NEWLINES.sub(' ', text).strip()
+
+
+def markdown_table(rows):
+    """Render *rows* as a Markdown pipe table.
+
+    The first row is the header and is followed by a ``---`` delimiter row.
+    When the widest row has at most three more cells than the header, the
+    table is widened to that row's cell count. In that case the header is
+    padded with empty cells so that the header and delimiter rows always
+    have the same number of cells, which pipe-table parsers require in
+    order to recognise the table.
+
+    Args:
+        rows: Sequence of rows, each a sequence of cell strings.
+
+    Returns:
+        The table as a newline-joined string, or ``''`` when *rows* is empty.
+    """
+    def _format_row(cells):
+        return '| ' + ' | '.join(cells) + ' |'
+
+    if not rows: return ''
+
+    header = list(rows[0])
+    max_cols = max(map(len, rows))
+    n_cols = len(header)
+    # Tolerate a header that is a few cells short of the widest row.
+    head_cols_threshold = 3
+    if max_cols - n_cols <= head_cols_threshold:
+        n_cols = max_cols
+
+    # Keep the header as wide as the delimiter row (no-op when already equal).
+    header.extend([''] * (n_cols - len(header)))
+
+    content = [_format_row(header), _format_row(['---'] * n_cols)]
+    content.extend(_format_row(r) for r in rows[1:])
+    return '\n'.join(content)
+
+
+def transform_html_table_to_md(html_table_str, field_sep=' '):
+    """Convert an HTML table into Markdown plus a cleaned copy of the HTML.
+
+    Rows are collected in two passes:
+
+    1. ``<th>`` cells that sit directly under ``<thead>`` (no ``<tr>``
+       wrapper) form one header row.
+    2. Every ``<tr>`` in the table, including those inside ``<thead>``,
+       contributes one row with one cell per child element.
+
+    Header cells wrapped in ``<thead><tr>`` are therefore taken from the
+    second pass only. Collecting every ``<th>`` in the first pass as well
+    would emit that header twice.
+
+    Args:
+        html_table_str: HTML markup of a single table.
+        field_sep: Separator used to join the text fragments of one cell.
+
+    Returns:
+        A dict with ``text`` (the Markdown table) and ``html`` (the markup
+        after passing through a ``Cleaner`` that allows only the
+        table/thead/tbody/tr/td/th tags).
+    """
+    def _cell_text(cell):
+        # One cell = all of its non-empty, whitespace-normalised text
+        # fragments joined by field_sep (an empty cell yields '').
+        fragments = (norm_text(t) for t in cell.xpath('.//text()'))
+        return field_sep.join(f for f in fragments if f)
+
+    table_node = lxml.html.fromstring(html_table_str)
+    rows = []
+    for thead_node in table_node.xpath('.//thead'):
+        # Only <th> directly under <thead>; <tr>-wrapped ones are picked up
+        # by the .//tr pass below.
+        row = [_cell_text(th) for th in thead_node.xpath('./th')]
+        if row: rows.append(row)
+
+    for tr in table_node.xpath('.//tr'):
+        row = [_cell_text(cell) for cell in tr]
+        if row: rows.append(row)
+
+    table_html = etree.tostring(table_node)
+
+    # allow_tags cannot be combined with remove_unknown_tags, hence False.
+    cleaner = Cleaner(
+        remove_unknown_tags=False,
+        allow_tags=[
+            "table", "thead", "tbody", "td", "tr", 'th',
+        ],
+        style=True,
+        page_structure=False)
+    # tostring() returned bytes, so the cleaned result is bytes as well.
+    clean_table_html = cleaner.clean_html(table_html).decode()
+    text = markdown_table(rows)
+
+    return dict(text=text, html=clean_table_html)
+
+
+def merge_md_tables(tables, has_header=False) -> str:
+    """Concatenate several Markdown tables into one.
+
+    The first table is kept verbatim. For each following table the second
+    line (the ``---`` delimiter row) is always dropped. Its first line is
+    also dropped when ``has_header`` is true, and kept as an ordinary row
+    otherwise.
+
+    Args:
+        tables: Markdown table strings, in order.
+        has_header: Whether the tables after the first one start with a
+            header row that should be discarded.
+
+    Returns:
+        The merged table, or ``''`` when *tables* is empty.
+    """
+    if not tables: return ''
+
+    first, remaining = tables[0], tables[1:]
+    appended = []
+    for table in remaining:
+        lines = table.split('\n')
+        kept = lines[2:] if has_header else lines[:1] + lines[2:]
+        appended.append('\n' + '\n'.join(kept))
+
+    return first + ''.join(appended)
+
+
+def merge_html_tables(tables, has_header=False) -> str:
+    """Merge several HTML tables into a single ``<table>`` string.
+
+    From the first table, every ``<thead>`` is copied verbatim, followed by
+    the ``<tr>`` rows found under its ``<tbody>``. For each following table:
+
+    * ``has_header=True``: only the rows under ``<tbody>`` are appended.
+    * ``has_header=False``: the text of the ``<th>`` cells in ``<thead>`` is
+      re-emitted as one ``<tr>`` of ``<td>`` cells, any ``<tr>`` inside
+      ``<thead>`` is appended verbatim, and then the ``<tbody>`` rows follow.
+
+    Returns ``''`` when *tables* is empty.
+
+    NOTE(review): rows are selected with ``./tbody//tr``, so a table whose
+    rows are not wrapped in ``<tbody>`` contributes no body rows. Confirm
+    the upstream markup always has ``<tbody>``.
+    NOTE(review): with ``has_header=False`` and a ``<thead>`` that wraps its
+    ``<th>`` cells in ``<tr>``, the header content is appended twice (once
+    as ``<td>`` cells, once as the original ``<tr>``). Verify this is intended.
+    NOTE(review): ``<th>`` text is inserted into ``<td>`` without HTML
+    escaping.
+    """
+    if not tables: return ''
+
+    # `contents` mixes str (literal tags) and bytes (etree.tostring output);
+    # everything is normalised to str at the end.
+    contents = ['<table>']
+    table_node = lxml.html.fromstring(tables[0])
+
+    # First table: keep its header as-is, then its body rows.
+    for thead_node in table_node.xpath('.//thead'):
+        contents.append(etree.tostring(thead_node))
+
+    for tr in table_node.xpath('./tbody//tr'):
+        contents.append(etree.tostring(tr))
+
+    for t in tables[1:]:
+        table_node = lxml.html.fromstring(t)
+        if has_header:
+            # The repeated header is dropped; only body rows are appended.
+            for tr in table_node.xpath('./tbody//tr'):
+                contents.append(etree.tostring(tr))
+        else:
+            # The <thead> content is kept, re-emitted as rows of the merged table.
+            tds = []
+            trs = []
+            for thead_node in table_node.xpath('.//thead'):
+                row = []
+                texts = tuple(thead_node.xpath('.//th//text()'))
+                for text in texts:
+                    tds.append("<td>{}</td>".format(text))
+
+                for tr in thead_node.xpath('.//tr'):
+                    trs.append(etree.tostring(tr))
+
+            if tds:
+                tr = "<tr>{}</tr>".format(''.join(tds))
+                contents.append(tr)
+
+            if trs:
+                tr = b'\n'.join(trs)
+                contents.append(tr)
+
+            for tr in table_node.xpath('./tbody//tr'):
+                contents.append(etree.tostring(tr))
+
+    contents.append('</table>')
+
+    # Decode the serialised (bytes) fragments and join everything as text.
+    tables = []
+    for e in contents:
+        tables.append(e.decode().strip() if isinstance(e, bytes) else e)
+    return '\n'.join(tables)
+
+
+def transform_list_to_table(cols):
+    """Build a header-only HTML table from a list of column titles.
+
+    Each title becomes one ``<th>`` placed directly inside ``<thead>``.
+    Titles are HTML-escaped so that characters such as ``<`` or ``&`` in
+    the text cannot corrupt the generated markup.
+
+    Args:
+        cols: Iterable of column titles (converted with ``str``).
+
+    Returns:
+        The table markup, one tag group per line.
+    """
+    # Function-scope import keeps this block self-contained.
+    from html import escape
+
+    contents = ['<table><thead>']
+    contents.extend(
+        "<th>{}</th>".format(escape(str(col), quote=False)) for col in cols)
+    contents.append('</thead></table>')
+    return '\n'.join(contents)
+
+
+def clean_html_table(table_html):
+    """Return *table_html* passed through a ``Cleaner`` limited to table tags.
+
+    The cleaner is configured to allow only ``table``, ``td``, ``tr`` and
+    ``th`` (note: not ``thead``/``tbody``), with ``style=True`` and
+    ``page_structure=False``.
+    """
+    kept_tags = [
+        "table", "td", "tr", 'th',
+    ]
+    # allow_tags cannot be combined with remove_unknown_tags, hence False.
+    table_cleaner = Cleaner(
+        page_structure=False,
+        style=True,
+        allow_tags=kept_tags,
+        remove_unknown_tags=False)
+    cleaned = table_cleaner.clean_html(table_html)
+    return cleaned
diff --git a/src/unstructured/documents/pdf_parser/__init__.py b/src/unstructured/documents/pdf_parser/__init__.py