Skip to content

Commit

Permalink
reconstruct the code
Browse files Browse the repository at this point in the history
  • Loading branch information
hrfng committed Sep 5, 2023
1 parent 7bd0c45 commit 79a8fce
Show file tree
Hide file tree
Showing 230 changed files with 402 additions and 10,753 deletions.
2 changes: 1 addition & 1 deletion docker/prepare.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash

function start_docker() {
docker run --gpus=all --shm-size 2g --net=host -itd --name bisheng_unstr_dev1 \
docker run --net=host -itd --name bisheng_unstr_dev1 \
-v /home/hanfeng:/home/hanfeng -v /home/public:/home/public ubuntu:20.04 bash
}

Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
import sys
import unicodedata

from unstructured.file_utils.encoding import (
from bisheng_unstructured.file_utils.encoding import (
format_encoding_str,
)
from unstructured.nlp.patterns import (
from bisheng_unstructured.nlp.patterns import (
DOUBLE_PARAGRAPH_PATTERN_RE,
E_BULLET_PATTERN,
LINE_BREAK_RE,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import re
from typing import List, Optional

from unstructured.nlp.patterns import (
from bisheng_unstructured.nlp.patterns import (
EMAIL_ADDRESS_PATTERN,
EMAIL_DATETIMETZ_PATTERN,
IP_ADDRESS_NAME_PATTERN,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
import langdetect
from transformers import MarianMTModel, MarianTokenizer

from unstructured.nlp.tokenize import sent_tokenize
from unstructured.staging.huggingface import chunk_by_attention_window
from bisheng_unstructured.nlp.tokenize import sent_tokenize
from bisheng_unstructured.staging.huggingface import chunk_by_attention_window


def _get_opus_mt_model_name(source_lang: str, target_lang: str):
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from abc import ABC
from typing import List, Optional

from unstructured.documents.elements import Element, NarrativeText
from bisheng_unstructured.documents.elements import Element, NarrativeText


class Document(ABC):
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from functools import wraps
from typing import Any, Callable, Dict, List, Optional, Tuple, TypedDict, Union, cast

from unstructured.documents.coordinates import (
from bisheng_unstructured.documents.coordinates import (
TYPE_TO_COORDINATE_SYSTEM_MAP,
CoordinateSystem,
RelativeCoordinateSystem,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from datetime import datetime
from typing import Callable, List, Union

from unstructured.documents.elements import UUID, Element, NoID, Text
from bisheng_unstructured.documents.elements import UUID, Element, NoID, Text


class NoDatestamp(ABC):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,11 @@



from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes
from unstructured.documents.base import Page
from unstructured.documents.elements import (
from bisheng_unstructured.cleaners.core import (
clean_bullets, replace_unicode_quotes
)
from bisheng_unstructured.documents.base import Page
from bisheng_unstructured.documents.elements import (
Address,
Element,
EmailAddress,
Expand All @@ -26,9 +28,9 @@
Table,
ElementMetadata
)
from unstructured.documents.xml import VALID_PARSERS, XMLDocument
from unstructured.logger import logger
from unstructured.partition.text_type import (
from bisheng_unstructured.documents.xml import VALID_PARSERS, XMLDocument
from bisheng_unstructured.logger import logger
from bisheng_unstructured.partition.text_type import (
is_bulleted_text,
is_email_address,
is_possible_narrative_text,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from unstructured.documents.markdown import transform_html_table_to_md
from bisheng_unstructured.documents.markdown import transform_html_table_to_md


def visualize_html(elements, output_file):
Expand Down
145 changes: 145 additions & 0 deletions src/bisheng_unstructured/documents/layout.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
from __future__ import annotations

import os
import tempfile
from pathlib import PurePath
from typing import Any, BinaryIO, Collection, List, Optional, Tuple, Union, cast

import numpy as np
from PIL import Image


class DocumentLayout:
    """Ordered collection of `PageLayout` objects describing one document.

    The `from_file` and `from_image_file` constructors in this class are
    placeholders: they log the file name and return a layout without pages.
    Their model/OCR parameters are accepted for interface compatibility but
    are not used by the code below.
    """

    def __init__(self, pages=None):
        # Stored exactly as given; stays None until a page list is supplied.
        self._pages = pages

    def __str__(self) -> str:
        """Return the string form of every page, separated by blank lines.

        A layout whose page list has not been set (``pages`` is None) renders
        as an empty string instead of failing while iterating None.
        """
        return "\n\n".join(str(page) for page in (self.pages or ()))

    @property
    def pages(self) -> List[PageLayout]:
        """Return the stored page list, in the order it was supplied.

        This is None when the instance was created without pages.
        """
        return self._pages

    @classmethod
    def from_pages(cls, pages: List[PageLayout]) -> DocumentLayout:
        """Build a new instance of the class from a list of `PageLayout`s."""
        doc_layout = cls()
        doc_layout._pages = pages
        return doc_layout

    @classmethod
    def from_file(
        cls,
        filename: str,
        detection_model: Optional[Any] = None,
        element_extraction_model: Optional[Any] = None,
        fixed_layouts: Optional[List[Optional[List[Any]]]] = None,
        ocr_strategy: str = "auto",
        ocr_languages: str = "eng",
        extract_tables: bool = False,
        pdf_image_dpi: int = 200,
    ) -> DocumentLayout:
        """Create a DocumentLayout for a pdf file.

        Only `filename` is used (for the log message); the remaining
        parameters are currently ignored and the returned layout has an
        empty page list.
        """
        # The module never imported `logger`, so this call raised NameError.
        # Imported locally to keep the fix self-contained; this is the same
        # logger the sibling modules of the package import.
        from bisheng_unstructured.logger import logger

        logger.info(f"Reading PDF for file: {filename} ...")
        pages: List[PageLayout] = []
        return cls.from_pages(pages)

    @classmethod
    def from_image_file(
        cls,
        filename: str,
        detection_model: Optional[Any] = None,
        element_extraction_model: Optional[Any] = None,
        ocr_strategy: str = "auto",
        ocr_languages: str = "eng",
        fixed_layout: Optional[List[Any]] = None,
        extract_tables: bool = False,
    ) -> DocumentLayout:
        """Create a DocumentLayout for an image file.

        Only `filename` is used (for the log message); the remaining
        parameters are currently ignored and the returned layout has an
        empty page list.
        """
        # See from_file: `logger` was previously an undefined name here.
        from bisheng_unstructured.logger import logger

        logger.info(f"Reading image file: {filename} ...")
        return cls.from_pages([])


class PageLayout:
    """Layout information for a single page image.

    The element-extraction methods of this class are placeholders that
    produce empty element lists; the constructor keeps its arguments as
    attributes so that `_get_image_array` can work on them.
    """

    def __init__(
        self,
        number: int,
        image: Image.Image,
        layout: Optional[List[Any]],
        image_metadata: Optional[dict] = None,
        image_path: Optional[Union[str, PurePath]] = None,
        detection_model: Optional[Any] = None,
        element_extraction_model: Optional[Any] = None,
        ocr_strategy: str = "auto",
        ocr_languages: str = "eng",
        extract_tables: bool = False,
    ):
        """Store the page description.

        Every argument is kept on the instance under its own name.
        `image_metadata` defaults to an empty dict, `image_array` starts as
        None (filled lazily by `_get_image_array`), and `elements` starts as
        an empty list.
        """
        # Previously none of these were stored, so `_get_image_array` failed
        # with AttributeError on its first attribute read.
        self.number = number
        self.image = image
        self.image_array: Union[np.ndarray, None] = None
        self.image_metadata = {} if image_metadata is None else image_metadata
        self.image_path = image_path
        self.layout = layout
        self.detection_model = detection_model
        self.element_extraction_model = element_extraction_model
        self.ocr_strategy = ocr_strategy
        self.ocr_languages = ocr_languages
        self.extract_tables = extract_tables
        self.elements: Collection[Any] = []

    def __str__(self) -> str:
        """Return the string form of every element, separated by blank lines."""
        return "\n\n".join(str(element) for element in self.elements)

    def get_elements_using_image_extraction(
        self,
        inplace=True,
    ) -> Optional[List[Any]]:
        """Placeholder for end-to-end element extraction from the page image.

        Always returns an empty list; `inplace` is accepted but not used and
        `self.elements` is left untouched.
        """
        return []

    def get_elements_with_detection_model(self, inplace=True) -> Optional[List[Any]]:
        """Placeholder for detection-model based element extraction.

        No detection is performed; the result is always an empty list. With
        `inplace` true the list is assigned to `self.elements` and None is
        returned, otherwise the list itself is returned.
        """
        elements: List[Any] = []
        if inplace:
            self.elements = elements
            return None
        return elements

    def get_elements_from_layout(self, layout: List[Any]) -> List[Any]:
        """Placeholder for splitting page text into elements from a layout.

        `layout` is not used; an empty list is always returned.
        """
        return []

    def _get_image_array(self) -> Union[np.ndarray, None]:
        """Return the page image as a numpy array, converting it on first use.

        The array is cached in `self.image_array`. The in-memory `self.image`
        is used when it is set; otherwise the image is read from
        `self.image_path`.
        """
        if self.image_array is None:
            # Identity check, not truthiness: array-like images have no
            # unambiguous truth value.
            if self.image is not None:
                self.image_array = np.array(self.image)
            else:
                # Close the file handle once the pixel data has been copied.
                with Image.open(self.image_path) as image:
                    self.image_array = np.array(image)
        return self.image_array

    @classmethod
    def from_image(
        cls,
        image: Image.Image,
        image_path: Optional[Union[str, PurePath]],
        number: int = 1,
        detection_model: Optional[Any] = None,
        element_extraction_model: Optional[Any] = None,
        layout: Optional[List[Any]] = None,
        ocr_strategy: str = "auto",
        ocr_languages: str = "eng",
        extract_tables: bool = False,
        fixed_layout: Optional[List[Any]] = None,
    ):
        """Create a PageLayout from an already-loaded image.

        All arguments except `fixed_layout` are forwarded to the constructor;
        `fixed_layout` is accepted but currently unused. No element
        extraction is run, so the returned page has no elements.
        """
        page = cls(
            number=number,
            image=image,
            layout=layout,
            # Previously dropped even though the constructor accepts it.
            image_path=image_path,
            detection_model=detection_model,
            element_extraction_model=element_extraction_model,
            ocr_strategy=ocr_strategy,
            ocr_languages=ocr_languages,
            extract_tables=extract_tables,
        )
        return page
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from typing import Any, Iterator, List, Mapping, Optional, Union
import base64

from unstructured.documents.base import Page
from bisheng_unstructured.documents.base import Page

from unstructured.models import (
from bisheng_unstructured.models import (
LayoutAgent, TableAgent, OCRAgent, TableDetAgent)

from .blob import Blob
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,19 @@
import pypdfium2
import fitz

from unstructured.models import (
from bisheng_unstructured.models import (
LayoutAgent, TableAgent, OCRAgent, TableDetAgent)

from unstructured.documents.base import Document, Page
from unstructured.documents.markdown import (
from bisheng_unstructured.documents.base import Document, Page
from bisheng_unstructured.documents.markdown import (
transform_html_table_to_md,
merge_md_tables,
merge_html_tables,
transform_list_to_table,
clean_html_table
)

from unstructured.documents.elements import (
from bisheng_unstructured.documents.elements import (
ListItem,
NarrativeText,
Text,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
import pypdfium2
import fitz

from unstructured.models import LayoutAgent, TableAgent, OCRAgent
from unstructured.documents.pdf_parser.blob import Blob
from bisheng_unstructured.models import LayoutAgent, TableAgent, OCRAgent
from bisheng_unstructured.documents.pdf_parser.blob import Blob


def draw_polygon(image, bbox, text=None, color=(255, 0, 0), thickness=1):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@

from lxml import etree

from unstructured.documents.base import Document, Page
from unstructured.file_utils.encoding import read_txt_file
from unstructured.logger import logger
from unstructured.partition.text import (
from bisheng_unstructured.documents.base import Document, Page
from bisheng_unstructured.file_utils.encoding import read_txt_file
from bisheng_unstructured.logger import logger
from bisheng_unstructured.partition.text import (
element_from_text,
partition_text,
)
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import chardet

from unstructured.partition.common import convert_to_bytes
from bisheng_unstructured.partition.common import convert_to_bytes

ENCODE_REC_THRESHOLD = 0.8

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import pandas as pd

from unstructured.file_utils.filetype import detect_filetype
from bisheng_unstructured.file_utils.filetype import detect_filetype


def get_directory_file_info(directory: str) -> pd.DataFrame:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import tempfile
from typing import IO, Optional

from unstructured.partition.common import exactly_one
from unstructured.utils import dependency_exists, requires_dependencies
from bisheng_unstructured.partition.common import exactly_one
from bisheng_unstructured.utils import dependency_exists, requires_dependencies

if dependency_exists("pypandoc"):
import pypandoc
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,20 @@
import zipfile
from enum import Enum
from functools import wraps
from typing import IO, TYPE_CHECKING, Callable, List, Optional
from typing import IO, Callable, List, Optional

from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import Element, PageBreak
from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
from unstructured.partition.common import (
from bisheng_unstructured.documents.coordinates import PixelSpace
from bisheng_unstructured.documents.elements import Element, PageBreak
from bisheng_unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
from bisheng_unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
from bisheng_unstructured.partition.common import (
_add_element_metadata,
_remove_element_metadata,
exactly_one,
normalize_layout_element,
)

if TYPE_CHECKING:
from unstructured_inference.inference.layout import DocumentLayout, PageLayout
from bisheng_unstructured.documents.layout import DocumentLayout, PageLayout

try:
import magic
Expand All @@ -30,8 +29,8 @@
except ImportError: # pragma: nocover
LIBMAGIC_AVAILABLE = False # pragma: nocover

from unstructured.logger import logger
from unstructured.nlp.patterns import EMAIL_HEAD_RE
from bisheng_unstructured.logger import logger
from bisheng_unstructured.nlp.patterns import EMAIL_HEAD_RE

TXT_MIME_TYPES = [
"text/plain",
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
7 changes: 7 additions & 0 deletions src/bisheng_unstructured/nlp/partition.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# flake8: noqa
from bisheng_unstructured.partition.pdf import partition_pdf
from bisheng_unstructured.partition.text_type import (
is_bulleted_text,
is_possible_narrative_text,
is_possible_title,
)
File renamed without changes.
File renamed without changes.
File renamed without changes.
Loading

0 comments on commit 79a8fce

Please sign in to comment.