Skip to content

Commit

Permalink
update idp compatible pdf and image parser
Browse files Browse the repository at this point in the history
  • Loading branch information
hrfng committed Mar 28, 2024
1 parent 966427f commit 9d33ce5
Show file tree
Hide file tree
Showing 6 changed files with 499 additions and 8 deletions.
4 changes: 2 additions & 2 deletions docker/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ function upload_image() {


function run_dev_docker() {
image="dataelement/bisheng-unstructured:0.0.2"
cnt_name="bisheng_uns_v002_rd_dev"
image="dataelement/bisheng-unstructured:0.0.3.4"
cnt_name="bisheng_uns_dev"
MOUNT="-v $HOME:$HOME -v /public:/public"
pushd $(cd $(dirname $0); pwd)
docker run -p 22001:10001 -itd --name ${cnt_name} $MOUNT $image bash
Expand Down
Empty file.
57 changes: 57 additions & 0 deletions src/bisheng_unstructured/documents/pdf_parser/idp/image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import base64
from typing import Any, Iterator, List, Mapping, Optional, Union

from bisheng_unstructured.documents.base import Page
from bisheng_unstructured.models.idp.dummy_ocr_agent import OCRAgent

from ..blob import Blob
from ..idp.pdf import PDFDocument

# from bisheng_unstructured.common import Timer


class ImageDocument(PDFDocument):
    """Document loader that runs a single image file through an OCR agent.

    The class reuses ``PDFDocument`` for page assembly but treats the input
    file as one image: the raw bytes are base64-encoded, handed to the OCR
    agent, and the returned blocks are turned into pages.
    """

    def __init__(
        self,
        file: str,
        model_params: dict,
        with_columns: bool = False,
        text_elem_sep: str = "\n",
        enhance_table: bool = True,
        keep_text_in_image: bool = True,
        lang: str = "zh",
        verbose: bool = False,
        n_parallel: int = 10,
        **kwargs
    ) -> None:
        """Store the parsing options and build the OCR agent.

        Args:
            file: Path of the image file to load.
            model_params: Keyword arguments forwarded to ``OCRAgent``.
            with_columns: Stored on the instance as ``with_columns``.
            text_elem_sep: Stored on the instance as ``text_elem_sep``.
            enhance_table: Stored on the instance as ``enhance_table``.
            keep_text_in_image: Stored on the instance as
                ``keep_text_in_image``.
            lang: Language code passed to ``_save_to_pages`` by ``load``.
            verbose: Stored on the instance as ``verbose``.
            n_parallel: Stored on the instance as ``n_parallel``.
            **kwargs: Accepted for call compatibility; not used here.
        """
        # The agent is created first so that a bad ``model_params`` fails
        # before any other state is touched.
        self.ocr_agent = OCRAgent(**model_params)

        # Caller-supplied options.
        self.file = file
        self.lang = lang
        self.verbose = verbose
        self.with_columns = with_columns
        self.text_elem_sep = text_elem_sep
        self.enhance_table = enhance_table
        self.keep_text_in_image = keep_text_in_image
        self.n_parallel = n_parallel

        # Fixed flags for image input.
        # NOTE(review): presumably read by logic inherited from PDFDocument;
        # confirm against the parent class.
        self.is_scan = True
        self.support_rotate = False
        self.is_join_table = False

        # Starts the MRO lookup *after* PDFDocument, so PDFDocument.__init__
        # itself is not executed.
        # NOTE(review): looks intentional (avoids PDF-specific setup) - confirm.
        super(PDFDocument, self).__init__()

    def load(self) -> List[Page]:
        """Load given path as pages."""
        image_bytes = Blob.from_path(self.file).as_bytes()
        encoded_image = base64.b64encode(image_bytes).decode()

        ocr_blocks = self.ocr_agent.predict({"b64_image": encoded_image})

        # A single image yields at most one group of blocks, recorded with
        # page index 1; an empty OCR result yields no groups at all.
        block_groups = []
        page_numbers = []
        if ocr_blocks:
            block_groups.append(ocr_blocks)
            page_numbers.append(1)

        return self._save_to_pages(block_groups, page_numbers, self.lang)
Loading

0 comments on commit 9d33ce5

Please sign in to comment.