Skip to content

Commit

Permalink
Merge pull request #58 from dataelement/feat/v0.0.3.11
Browse files Browse the repository at this point in the history
Feat/v0.0.3.11
  • Loading branch information
zgqgit authored Oct 12, 2024
2 parents 9c43a91 + f931be3 commit 5757892
Show file tree
Hide file tree
Showing 7 changed files with 90 additions and 44 deletions.
47 changes: 23 additions & 24 deletions .drone.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,13 @@ steps: # 定义流水线执行步骤,这些步骤将顺序执行
https_proxy:
from_secret: PROXY
commands:
- git config --global core.compression 0
- git clone https://github.com/dataelement/bisheng-unstructured.git .
- git checkout $DRONE_COMMIT
- git config --global core.compression 0
- git clone https://github.com/dataelement/bisheng-unstructured.git .
- git checkout $DRONE_COMMIT

- name: build_docker_release
pull: if-not-exists
image: plugins/docker
image: docker:24.0.6
privileged: true
volumes: # 将容器内目录挂载到宿主机,仓库需要开启Trusted设置
- name: apt-cache
Expand All @@ -33,19 +33,18 @@ steps: # 定义流水线执行步骤,这些步骤将顺序执行
from_secret: PROXY
https_proxy:
from_secret: PROXY
no_proxy: 192.168.106.8,archive.ubuntu.com
settings:
registry: http://192.168.106.8:6082
insecure: true
purge: true
repo: 192.168.106.8:6082/dataelement/bisheng-unstructured
tags: [ release ]
context: ./
dockerfile: ./docker/Dockerfile
username:
no_proxy: 192.168.106.8
version: release
docker_repo: 192.168.106.8:6082/dataelement/bisheng-unstructured
docker_registry: http://192.168.106.8:6082
docker_user:
from_secret: NEXUS_USER
password:
docker_password:
from_secret: NEXUS_PASSWORD
commands:
- docker login -u $docker_user -p $docker_password $docker_registry
- docker build -t $docker_repo:$version -f ./docker/Dockerfile .
- docker push $docker_repo:$version
when:
status:
- success
Expand Down Expand Up @@ -96,12 +95,12 @@ steps: # 定义流水线执行步骤,这些步骤将顺序执行
- refs/tags/v*

volumes:
- name: bisheng-cache
host:
path: /opt/drone/data/bisheng/
- name: apt-cache
host:
path: /opt/drone/data/bisheng/apt/
- name: socket
host:
path: /var/run/docker.sock
- name: bisheng-cache
host:
path: /opt/drone/data/bisheng/
- name: apt-cache
host:
path: /opt/drone/data/bisheng/apt/
- name: socket
host:
path: /var/run/docker.sock
2 changes: 1 addition & 1 deletion config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ pdf_model_params:
table_model_ep: "http://192.168.106.12:9001/v2.1/models/elem_table_detect_v1/infer"
ocr_model_ep: "http://192.168.106.12:9001/v2.1/models/elem_ocr_collection_v3/infer"


is_all_ocr: false
# ocr识别需要的配置项
ocr_conf:
params:
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ pdfplumber==0.10.2
wheel==0.41.0
pypdfium2==4.23.1
pypdf==4.3.0
PyMuPDF==1.23.2
PyMuPDF==1.23.8
opencv-python==4.8.0.76
certifi==2023.7.22
cffi==1.15.1
Expand Down
2 changes: 1 addition & 1 deletion src/bisheng_unstructured/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ pdf_model_params:
ocr_model_ep: "http://192.168.106.12:9001/v2.1/models/elem_ocr_collection_v3/infer"

# 是否全部走ocr识别, false的话则由代码逻辑判断是否需要走ocr识别
is_all_ocr: true
is_all_ocr: false
# ocr识别需要的配置项
ocr_conf:
params:
Expand Down
28 changes: 16 additions & 12 deletions src/bisheng_unstructured/documents/pdf_parser/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,19 @@


class ImageDocument(PDFDocument):

def __init__(self,
file: str,
model_params: dict,
with_columns: bool = False,
text_elem_sep: str = "\n",
enhance_table: bool = True,
keep_text_in_image: bool = True,
lang: str = "zh",
verbose: bool = False,
n_parallel: int = 10,
**kwargs) -> None:
def __init__(
self,
file: str,
model_params: dict,
with_columns: bool = False,
text_elem_sep: str = "\n",
enhance_table: bool = True,
keep_text_in_image: bool = True,
lang: str = "zh",
verbose: bool = False,
n_parallel: int = 10,
**kwargs
) -> None:
super(ImageDocument, self).__init__(file=file, model_params=model_params)
rt_type = kwargs.get("rt_type", "sdk")
if rt_type in {"ocr_sdk", "idp", "sdk"}:
Expand Down Expand Up @@ -74,6 +75,9 @@ def load(self) -> List[Page]:
# timer.toc()

if blocks:
for tmp_block in blocks:
tmp_block.pages = [1 for _ in tmp_block.rs]
tmp_block.bbox_text = None
if self.with_columns:
sub_groups = self._divide_blocks_into_groups(blocks)
groups.extend(sub_groups)
Expand Down
52 changes: 47 additions & 5 deletions src/bisheng_unstructured/documents/pdf_parser/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1072,9 +1072,30 @@ def _get_elem(blocks, is_first=True):
has_header = np.all([e0 == e1 for e0, e1 in zip(row0, row1)])
new_text = merge_md_tables([b0.block_text, b1.block_text], has_header)
new_html_text = merge_html_tables([b0.html_text, b1.html_text], has_header)
new_pages = b0.pages.copy()
new_pages.extend(b1.pages)
joined_lines = np.hstack([b0.ts, b1.ts])
joined_bboxes = np.vstack([b0.rs, b1.rs])
new_block = b1
new_bbox = []
new_bbox_text = []
if b0.bbox_text is not None:
new_bbox_text.extend(b0.bbox_text)
else:
new_bbox_text.append(b0.block_text)
if b1.bbox_text is not None:
new_bbox_text.extend(b1.bbox_text)
else:
new_bbox_text.append(b1.block_text)
new_bbox.extend(b0.bbox)
new_bbox.extend(b1.bbox)
new_block.bbox = new_bbox
new_block.pages = new_pages
new_block.ts = joined_lines
new_block.rs = joined_bboxes
new_block.block_text = new_text
new_block.html_text = new_html_text
new_block.bbox_text = new_bbox_text
groups[i][0] = new_block
groups[i - 1].pop(-1)

Expand Down Expand Up @@ -1103,12 +1124,32 @@ def _save_to_pages(self, groups, page_inds, lang):
# html = b[-1]
html = b.html_text
clean_html = clean_html_table(html)
extra_data.update({"types": ["table"]})
prev_ind = 0
s = prev_ind
e = prev_ind + len(text) - 1
indexes = [[s, e]]
extra_data.update({"indexes": indexes, "pages": [idx]})
table_bbox = [
[bbox[i], bbox[i + 1], bbox[i + 2], bbox[i + 3]]
for i in range(0, len(bbox) - 3, 4)
]
indexes = [[0, len(b.block_text) - 1]]
metadata_types = ["table"]
if len(table_bbox) > 1:
indexes = []
metadata_types = []
for bbox_index, tmp_text in enumerate(b.bbox_text):
metadata_types.append("table")
if bbox_index == 0:
continue
next_index = b.block_text.find(tmp_text.split("\n")[0])
indexes.append([prev_ind, next_index - 1])
prev_ind = next_index
indexes.append([prev_ind, len(b.block_text) - 1])
extra_data.update(
{
"indexes": indexes,
"bboxes": table_bbox,
"pages": list(set(b.pages)),
"types": metadata_types,
}
)
metadata = ElementMetadata(text_as_html=clean_html, extra_data=extra_data)
element = Table(text=text, metadata=metadata)
else:
Expand Down Expand Up @@ -1257,6 +1298,7 @@ def _task(textpage_info, bytes_img, img, is_scan, lang, rot_matirx, page_index:
blocks = one[1]
for tmp_block in blocks:
tmp_block.pages = [idx + 1 for _ in tmp_block.rs]
tmp_block.bbox_text = None
if self.with_columns:
sub_groups = self._divide_blocks_into_groups(blocks)
groups.extend(sub_groups)
Expand Down
1 change: 1 addition & 0 deletions src/bisheng_unstructured/models/idp/dummy_ocr_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class BlockInfo:
layout_type: int = None # 3: title, 4: paragraph, 5: table
html_text: str = None
pages: List[int] = None # records which page each rs item belongs to
bbox_text: List[str] = None # records which text each bbox relates to


def find_xy(box):
Expand Down

0 comments on commit 5757892

Please sign in to comment.