reconstruct the code

dataelement · Sep 5, 2023 · 79a8fce · 79a8fce
1 parent 7bd0c45
commit 79a8fce
Show file tree

Hide file tree

Showing 230 changed files with 402 additions and 10,753 deletions.
diff --git a/docker/prepare.sh b/docker/prepare.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 function start_docker() {
-  docker run --gpus=all --shm-size 2g --net=host -itd --name bisheng_unstr_dev1 \
+  docker run --net=host -itd --name bisheng_unstr_dev1 \
    -v /home/hanfeng:/home/hanfeng -v /home/public:/home/public ubuntu:20.04 bash
 }
 

diff --git a/src/unstructured/__init__.py → src/bisheng_unstructured/__init__.py b/src/unstructured/__init__.py → src/bisheng_unstructured/__init__.py
diff --git a/src/unstructured/__version__.py → src/bisheng_unstructured/__version__.py b/src/unstructured/__version__.py → src/bisheng_unstructured/__version__.py
diff --git a/src/unstructured/cleaners/__init__.py → ...bisheng_unstructured/cleaners/__init__.py b/src/unstructured/cleaners/__init__.py → ...bisheng_unstructured/cleaners/__init__.py
diff --git a/src/unstructured/cleaners/core.py → src/bisheng_unstructured/cleaners/core.py b/src/unstructured/cleaners/core.py → src/bisheng_unstructured/cleaners/core.py
@@ -3,10 +3,10 @@
 import sys
 import unicodedata
 
-from unstructured.file_utils.encoding import (
+from bisheng_unstructured.file_utils.encoding import (
     format_encoding_str,
 )
-from unstructured.nlp.patterns import (
+from bisheng_unstructured.nlp.patterns import (
     DOUBLE_PARAGRAPH_PATTERN_RE,
     E_BULLET_PATTERN,
     LINE_BREAK_RE,

diff --git a/src/unstructured/cleaners/extract.py → src/bisheng_unstructured/cleaners/extract.py b/src/unstructured/cleaners/extract.py → src/bisheng_unstructured/cleaners/extract.py
@@ -2,7 +2,7 @@
 import re
 from typing import List, Optional
 
-from unstructured.nlp.patterns import (
+from bisheng_unstructured.nlp.patterns import (
     EMAIL_ADDRESS_PATTERN,
     EMAIL_DATETIMETZ_PATTERN,
     IP_ADDRESS_NAME_PATTERN,

diff --git a/src/unstructured/cleaners/translate.py → ...isheng_unstructured/cleaners/translate.py b/src/unstructured/cleaners/translate.py → ...isheng_unstructured/cleaners/translate.py
@@ -4,8 +4,8 @@
 import langdetect
 from transformers import MarianMTModel, MarianTokenizer
 
-from unstructured.nlp.tokenize import sent_tokenize
-from unstructured.staging.huggingface import chunk_by_attention_window
+from bisheng_unstructured.nlp.tokenize import sent_tokenize
+from bisheng_unstructured.staging.huggingface import chunk_by_attention_window
 
 
 def _get_opus_mt_model_name(source_lang: str, target_lang: str):

diff --git a/src/unstructured/documents/__init__.py → ...isheng_unstructured/documents/__init__.py b/src/unstructured/documents/__init__.py → ...isheng_unstructured/documents/__init__.py
diff --git a/src/unstructured/documents/base.py → src/bisheng_unstructured/documents/base.py b/src/unstructured/documents/base.py → src/bisheng_unstructured/documents/base.py
@@ -3,7 +3,7 @@
 from abc import ABC
 from typing import List, Optional
 
-from unstructured.documents.elements import Element, NarrativeText
+from bisheng_unstructured.documents.elements import Element, NarrativeText
 
 
 class Document(ABC):

diff --git a/src/unstructured/documents/coordinates.py → ...eng_unstructured/documents/coordinates.py b/src/unstructured/documents/coordinates.py → ...eng_unstructured/documents/coordinates.py
diff --git a/src/unstructured/documents/elements.py → ...isheng_unstructured/documents/elements.py b/src/unstructured/documents/elements.py → ...isheng_unstructured/documents/elements.py
@@ -13,7 +13,7 @@
 from functools import wraps
 from typing import Any, Callable, Dict, List, Optional, Tuple, TypedDict, Union, cast
 
-from unstructured.documents.coordinates import (
+from bisheng_unstructured.documents.coordinates import (
     TYPE_TO_COORDINATE_SYSTEM_MAP,
     CoordinateSystem,
     RelativeCoordinateSystem,

diff --git a/src/unstructured/documents/email_elements.py → ..._unstructured/documents/email_elements.py b/src/unstructured/documents/email_elements.py → ..._unstructured/documents/email_elements.py
@@ -4,7 +4,7 @@
 from datetime import datetime
 from typing import Callable, List, Union
 
-from unstructured.documents.elements import UUID, Element, NoID, Text
+from bisheng_unstructured.documents.elements import UUID, Element, NoID, Text
 
 
 class NoDatestamp(ABC):

diff --git a/src/unstructured/documents/html.py → src/bisheng_unstructured/documents/html.py b/src/unstructured/documents/html.py → src/bisheng_unstructured/documents/html.py
@@ -12,9 +12,11 @@
 
 
 
-from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes
-from unstructured.documents.base import Page
-from unstructured.documents.elements import (
+from bisheng_unstructured.cleaners.core import (
+    clean_bullets, replace_unicode_quotes
+)
+from bisheng_unstructured.documents.base import Page
+from bisheng_unstructured.documents.elements import (
     Address,
     Element,
     EmailAddress,
@@ -26,9 +28,9 @@
     Table,
     ElementMetadata
 )
-from unstructured.documents.xml import VALID_PARSERS, XMLDocument
-from unstructured.logger import logger
-from unstructured.partition.text_type import (
+from bisheng_unstructured.documents.xml import VALID_PARSERS, XMLDocument
+from bisheng_unstructured.logger import logger
+from bisheng_unstructured.partition.text_type import (
     is_bulleted_text,
     is_email_address,
     is_possible_narrative_text,

diff --git a/src/unstructured/documents/html_utils.py → ...heng_unstructured/documents/html_utils.py b/src/unstructured/documents/html_utils.py → ...heng_unstructured/documents/html_utils.py
@@ -1,4 +1,4 @@
-from unstructured.documents.markdown import transform_html_table_to_md
+from bisheng_unstructured.documents.markdown import transform_html_table_to_md
 
 
 def visualize_html(elements, output_file):

diff --git a/src/bisheng_unstructured/documents/layout.py b/src/bisheng_unstructured/documents/layout.py
@@ -0,0 +1,145 @@
+from __future__ import annotations
+
+import os
+import tempfile
+from pathlib import PurePath
+from typing import Any, BinaryIO, Collection, List, Optional, Tuple, Union, cast
+
+import numpy as np
+from PIL import Image
+
+from bisheng_unstructured.logger import logger
+
+
+class DocumentLayout:
+    """Container for the per-page layouts of a document.
+
+    The loaders in this module are placeholders: ``from_file`` and
+    ``from_image_file`` only log the request and return a layout with an
+    empty page list. No PDF/image decoding or model inference happens here.
+    """
+
+    def __init__(self, pages=None):
+        # Stored exactly as given; remains ``None`` when no pages are supplied.
+        self._pages = pages
+
+    def __str__(self) -> str:
+        """Join the string form of every page with blank lines.
+
+        An unset page list (``None``) is treated as empty so that ``str()`` on
+        a default-constructed instance returns ``""`` instead of raising.
+        """
+        return "\n\n".join(str(page) for page in (self.pages or []))
+
+    @property
+    def pages(self) -> List[PageLayout]:
+        """Return the stored pages in order (``None`` if they were never set)."""
+        return self._pages
+
+    @classmethod
+    def from_pages(cls, pages: List[PageLayout]) -> DocumentLayout:
+        """Generate a new instance of the class from a list of ``PageLayout`` objects."""
+        doc_layout = cls()
+        doc_layout._pages = pages
+        return doc_layout
+
+    @classmethod
+    def from_file(
+        cls,
+        filename: str,
+        detection_model: Optional[Any] = None,
+        element_extraction_model: Optional[Any] = None,
+        fixed_layouts: Optional[List[Optional[List[Any]]]] = None,
+        ocr_strategy: str = "auto",
+        ocr_languages: str = "eng",
+        extract_tables: bool = False,
+        pdf_image_dpi: int = 200,
+    ) -> DocumentLayout:
+        """Create a DocumentLayout for a pdf file.
+
+        Placeholder implementation: the file name is logged and a layout with
+        no pages is returned. All other parameters are accepted for interface
+        compatibility and are not used.
+        """
+        logger.info(f"Reading PDF for file: {filename} ...")
+        return cls.from_pages([])
+
+    @classmethod
+    def from_image_file(
+        cls,
+        filename: str,
+        detection_model: Optional[Any] = None,
+        element_extraction_model: Optional[Any] = None,
+        ocr_strategy: str = "auto",
+        ocr_languages: str = "eng",
+        fixed_layout: Optional[List[Any]] = None,
+        extract_tables: bool = False,
+    ) -> DocumentLayout:
+        """Create a DocumentLayout for an image file.
+
+        Placeholder implementation: the file name is logged and a layout with
+        no pages is returned. All other parameters are accepted for interface
+        compatibility and are not used.
+        """
+        logger.info(f"Reading image file: {filename} ...")
+        return cls.from_pages([])
+
+
+class PageLayout:
+    """Layout state for an individual page.
+
+    The element-extraction methods in this module are placeholders that
+    produce empty results; no detection, OCR or table extraction is run here.
+    """
+
+    def __init__(
+        self,
+        number: int,
+        image: Image.Image,
+        layout: Optional[List[Any]],
+        image_metadata: Optional[dict] = None,
+        image_path: Optional[Union[str, PurePath]] = None,
+        detection_model: Optional[Any] = None,
+        element_extraction_model: Optional[Any] = None,
+        ocr_strategy: str = "auto",
+        ocr_languages: str = "eng",
+        extract_tables: bool = False,
+    ):
+        """Record the page inputs and start with an empty element list."""
+        self.elements: Collection[Any] = []
+        # Keep every constructor argument on the instance; `_get_image_array`
+        # reads `image`, `image_path` and `image_array`.
+        self.number = number
+        self.image = image
+        self.image_array: Union[np.ndarray, None] = None  # filled lazily
+        self.image_metadata = image_metadata
+        self.image_path = image_path
+        self.layout = layout
+        self.detection_model = detection_model
+        self.element_extraction_model = element_extraction_model
+        self.ocr_strategy = ocr_strategy
+        self.ocr_languages = ocr_languages
+        self.extract_tables = extract_tables
+
+    def __str__(self) -> str:
+        """Join the string form of every element with blank lines."""
+        return "\n\n".join(str(element) for element in self.elements)
+
+    def get_elements_using_image_extraction(
+        self,
+        inplace=True,
+    ) -> Optional[List[Any]]:
+        """Placeholder for end-to-end element extraction from the page image.
+
+        Always returns an empty list; ``inplace`` is accepted but not used and
+        ``self.elements`` is left untouched.
+        """
+        return []
+
+    def get_elements_with_detection_model(self, inplace=True) -> Optional[List[Any]]:
+        """Placeholder for model-based element detection.
+
+        Produces an empty element list. With ``inplace=True`` the list is
+        stored on ``self.elements`` and ``None`` is returned; otherwise the
+        list is returned and the instance is not modified.
+        """
+        elements: List[Any] = []
+        if inplace:
+            self.elements = elements
+            return None
+        return elements
+
+    def get_elements_from_layout(self, layout: List[Any]) -> List[Any]:
+        """Placeholder for splitting page text into elements from a given layout.
+
+        Always returns an empty list; ``layout`` is not used.
+        """
+        return []
+
+    def _get_image_array(self) -> Union[np.ndarray, None]:
+        """Return the page image as a numpy array, converting it on first use.
+
+        The in-memory ``image`` is preferred; otherwise the image is opened
+        from ``image_path``. The result is cached on ``image_array``.
+
+        Raises:
+            ValueError: if neither ``image`` nor ``image_path`` is available.
+        """
+        if self.image_array is None:
+            if self.image is not None:
+                self.image_array = np.array(self.image)
+            elif self.image_path is not None:
+                # Close the file handle as soon as the pixels are copied out.
+                with Image.open(self.image_path) as image:
+                    self.image_array = np.array(image)
+            else:
+                raise ValueError("PageLayout has neither an image nor an image_path.")
+        return self.image_array
+
+    @classmethod
+    def from_image(
+        cls,
+        image: Image.Image,
+        image_path: Optional[Union[str, PurePath]],
+        number: int = 1,
+        detection_model: Optional[Any] = None,
+        element_extraction_model: Optional[Any] = None,
+        layout: Optional[List[Any]] = None,
+        ocr_strategy: str = "auto",
+        ocr_languages: str = "eng",
+        extract_tables: bool = False,
+        fixed_layout: Optional[List[Any]] = None,
+    ):
+        """Create a PageLayout from an already-loaded PIL Image.
+
+        ``image_path`` is kept on the page so the image can be re-opened
+        later. ``fixed_layout`` is accepted for interface compatibility and is
+        not used by this placeholder implementation.
+        """
+        return cls(
+            number=number,
+            image=image,
+            layout=layout,
+            image_path=image_path,
+            detection_model=detection_model,
+            element_extraction_model=element_extraction_model,
+            ocr_strategy=ocr_strategy,
+            ocr_languages=ocr_languages,
+            extract_tables=extract_tables,
+        )
diff --git a/src/unstructured/documents/markdown.py → ...isheng_unstructured/documents/markdown.py b/src/unstructured/documents/markdown.py → ...isheng_unstructured/documents/markdown.py
diff --git a/...ructured/documents/pdf_parser/__init__.py → ...ructured/documents/pdf_parser/__init__.py b/...ructured/documents/pdf_parser/__init__.py → ...ructured/documents/pdf_parser/__init__.py
diff --git a/...unstructured/documents/pdf_parser/blob.py → ...unstructured/documents/pdf_parser/blob.py b/...unstructured/documents/pdf_parser/blob.py → ...unstructured/documents/pdf_parser/blob.py
diff --git a/...nstructured/documents/pdf_parser/image.py → ...nstructured/documents/pdf_parser/image.py b/...nstructured/documents/pdf_parser/image.py → ...nstructured/documents/pdf_parser/image.py
@@ -1,9 +1,9 @@
 from typing import Any, Iterator, List, Mapping, Optional, Union
 import base64
 
-from unstructured.documents.base import Page
+from bisheng_unstructured.documents.base import Page
 
-from unstructured.models import (
+from bisheng_unstructured.models import (
     LayoutAgent, TableAgent, OCRAgent, TableDetAgent)
 
 from .blob import Blob

diff --git a/src/unstructured/documents/pdf_parser/pdf.py → ..._unstructured/documents/pdf_parser/pdf.py b/src/unstructured/documents/pdf_parser/pdf.py → ..._unstructured/documents/pdf_parser/pdf.py
@@ -22,19 +22,19 @@
 import pypdfium2
 import fitz
 
-from unstructured.models import (
+from bisheng_unstructured.models import (
     LayoutAgent, TableAgent, OCRAgent, TableDetAgent)
 
-from unstructured.documents.base import Document, Page
-from unstructured.documents.markdown import (
+from bisheng_unstructured.documents.base import Document, Page
+from bisheng_unstructured.documents.markdown import (
     transform_html_table_to_md,
     merge_md_tables,
     merge_html_tables,
     transform_list_to_table,
     clean_html_table
 )
 
-from unstructured.documents.elements import (
+from bisheng_unstructured.documents.elements import (
     ListItem,
     NarrativeText,
     Text,

diff --git a/...ructured/documents/pdf_parser/test_pdf.py → ...ructured/documents/pdf_parser/test_pdf.py b/...ructured/documents/pdf_parser/test_pdf.py → ...ructured/documents/pdf_parser/test_pdf.py
@@ -8,8 +8,8 @@
 import pypdfium2
 import fitz
 
-from unstructured.models import LayoutAgent, TableAgent, OCRAgent
-from unstructured.documents.pdf_parser.blob import Blob
+from bisheng_unstructured.models import LayoutAgent, TableAgent, OCRAgent
+from bisheng_unstructured.documents.pdf_parser.blob import Blob
 
 
 def draw_polygon(image, bbox, text=None, color=(255, 0, 0), thickness=1):

diff --git a/src/unstructured/documents/xml.py → src/bisheng_unstructured/documents/xml.py b/src/unstructured/documents/xml.py → src/bisheng_unstructured/documents/xml.py
@@ -2,10 +2,10 @@
 
 from lxml import etree
 
-from unstructured.documents.base import Document, Page
-from unstructured.file_utils.encoding import read_txt_file
-from unstructured.logger import logger
-from unstructured.partition.text import (
+from bisheng_unstructured.documents.base import Document, Page
+from bisheng_unstructured.file_utils.encoding import read_txt_file
+from bisheng_unstructured.logger import logger
+from bisheng_unstructured.partition.text import (
     element_from_text,
     partition_text,
 )

diff --git a/src/unstructured/file_utils/__init__.py → ...sheng_unstructured/file_utils/__init__.py b/src/unstructured/file_utils/__init__.py → ...sheng_unstructured/file_utils/__init__.py
diff --git a/src/unstructured/file_utils/encoding.py → ...sheng_unstructured/file_utils/encoding.py b/src/unstructured/file_utils/encoding.py → ...sheng_unstructured/file_utils/encoding.py
@@ -2,7 +2,7 @@
 
 import chardet
 
-from unstructured.partition.common import convert_to_bytes
+from bisheng_unstructured.partition.common import convert_to_bytes
 
 ENCODE_REC_THRESHOLD = 0.8
 

diff --git a/src/unstructured/file_utils/exploration.py → ...ng_unstructured/file_utils/exploration.py b/src/unstructured/file_utils/exploration.py → ...ng_unstructured/file_utils/exploration.py
@@ -5,7 +5,7 @@
 
 import pandas as pd
 
-from unstructured.file_utils.filetype import detect_filetype
+from bisheng_unstructured.file_utils.filetype import detect_filetype
 
 
 def get_directory_file_info(directory: str) -> pd.DataFrame:

diff --git a/...nstructured/file_utils/file_conversion.py → ...nstructured/file_utils/file_conversion.py b/...nstructured/file_utils/file_conversion.py → ...nstructured/file_utils/file_conversion.py
@@ -1,8 +1,8 @@
 import tempfile
 from typing import IO, Optional
 
-from unstructured.partition.common import exactly_one
-from unstructured.utils import dependency_exists, requires_dependencies
+from bisheng_unstructured.partition.common import exactly_one
+from bisheng_unstructured.utils import dependency_exists, requires_dependencies
 
 if dependency_exists("pypandoc"):
     import pypandoc

diff --git a/src/unstructured/file_utils/filetype.py → ...sheng_unstructured/file_utils/filetype.py b/src/unstructured/file_utils/filetype.py → ...sheng_unstructured/file_utils/filetype.py
@@ -7,21 +7,20 @@
 import zipfile
 from enum import Enum
 from functools import wraps
-from typing import IO, TYPE_CHECKING, Callable, List, Optional
+from typing import IO, Callable, List, Optional
 
-from unstructured.documents.coordinates import PixelSpace
-from unstructured.documents.elements import Element, PageBreak
-from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
-from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
-from unstructured.partition.common import (
+from bisheng_unstructured.documents.coordinates import PixelSpace
+from bisheng_unstructured.documents.elements import Element, PageBreak
+from bisheng_unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
+from bisheng_unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
+from bisheng_unstructured.partition.common import (
     _add_element_metadata,
     _remove_element_metadata,
     exactly_one,
     normalize_layout_element,
 )
 
-if TYPE_CHECKING:
-    from unstructured_inference.inference.layout import DocumentLayout, PageLayout
+from bisheng_unstructured.documents.layout import DocumentLayout, PageLayout
 
 try:
     import magic
@@ -30,8 +29,8 @@
 except ImportError:  # pragma: nocover
     LIBMAGIC_AVAILABLE = False  # pragma: nocover
 
-from unstructured.logger import logger
-from unstructured.nlp.patterns import EMAIL_HEAD_RE
+from bisheng_unstructured.logger import logger
+from bisheng_unstructured.nlp.patterns import EMAIL_HEAD_RE
 
 TXT_MIME_TYPES = [
     "text/plain",

diff --git a/...nstructured/file_utils/google_filetype.py → ...nstructured/file_utils/google_filetype.py b/...nstructured/file_utils/google_filetype.py → ...nstructured/file_utils/google_filetype.py
diff --git a/src/unstructured/file_utils/metadata.py → ...sheng_unstructured/file_utils/metadata.py b/src/unstructured/file_utils/metadata.py → ...sheng_unstructured/file_utils/metadata.py
diff --git a/src/unstructured/logger.py → src/bisheng_unstructured/logger.py b/src/unstructured/logger.py → src/bisheng_unstructured/logger.py
diff --git a/src/unstructured/models/__init__.py → src/bisheng_unstructured/models/__init__.py b/src/unstructured/models/__init__.py → src/bisheng_unstructured/models/__init__.py
diff --git a/src/unstructured/models/layout_agent.py → ...sheng_unstructured/models/layout_agent.py b/src/unstructured/models/layout_agent.py → ...sheng_unstructured/models/layout_agent.py
diff --git a/src/unstructured/models/ocr_agent.py → src/bisheng_unstructured/models/ocr_agent.py b/src/unstructured/models/ocr_agent.py → src/bisheng_unstructured/models/ocr_agent.py
diff --git a/src/unstructured/models/table_agent.py → ...isheng_unstructured/models/table_agent.py b/src/unstructured/models/table_agent.py → ...isheng_unstructured/models/table_agent.py
diff --git a/src/unstructured/ingest/__init__.py → src/bisheng_unstructured/nlp/__init__.py b/src/unstructured/ingest/__init__.py → src/bisheng_unstructured/nlp/__init__.py
diff --git a/src/unstructured/nlp/english-words.txt → ...isheng_unstructured/nlp/english-words.txt b/src/unstructured/nlp/english-words.txt → ...isheng_unstructured/nlp/english-words.txt
diff --git a/src/unstructured/nlp/english_words.py → ...bisheng_unstructured/nlp/english_words.py b/src/unstructured/nlp/english_words.py → ...bisheng_unstructured/nlp/english_words.py
diff --git a/src/bisheng_unstructured/nlp/partition.py b/src/bisheng_unstructured/nlp/partition.py
@@ -0,0 +1,7 @@
+# flake8: noqa
+from bisheng_unstructured.partition.pdf import partition_pdf
+from bisheng_unstructured.partition.text_type import (
+    is_bulleted_text,
+    is_possible_narrative_text,
+    is_possible_title,
+)
diff --git a/src/unstructured/nlp/patterns.py → src/bisheng_unstructured/nlp/patterns.py b/src/unstructured/nlp/patterns.py → src/bisheng_unstructured/nlp/patterns.py
diff --git a/src/unstructured/nlp/tokenize.py → src/bisheng_unstructured/nlp/tokenize.py b/src/unstructured/nlp/tokenize.py → src/bisheng_unstructured/nlp/tokenize.py
diff --git a/src/unstructured/ingest/cli/__init__.py → ...isheng_unstructured/partition/__init__.py b/src/unstructured/ingest/cli/__init__.py → ...isheng_unstructured/partition/__init__.py