Skip to content

Commit

Permalink
support rowcol cell table route
Browse files Browse the repository at this point in the history
  • Loading branch information
hrfng committed Sep 12, 2024
1 parent b4858b4 commit a938ae7
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 13 deletions.
35 changes: 27 additions & 8 deletions src/bisheng_unstructured/documents/pdf_parser/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -507,11 +507,18 @@ def _enhance_table_layout(self, b64_image, layout_blocks):
TABLE_ID = 5
inp = {"b64_image": b64_image}
result = self.table_det_agent.predict(inp)
# table category labels: 1 = cell, 2 = rowcol
DEFAULT_TABLE_CATE = 2
table_layout_cats = []
table_layout = []
for bb in result["bboxes"]:
for i, bb in enumerate(result["bboxes"]):
coords = ((bb[0], bb[1]), (bb[2], bb[3]), (bb[4], bb[5]), (bb[6], bb[7]))
poly = Polygon(coords)
table_layout.append((poly, TABLE_ID))
if "labels" in result:
table_layout_cats.append(result["labels"][i])
else:
table_layout_cats.append(DEFAULT_TABLE_CATE)

general_table_layout = []
result_layout = []
Expand All @@ -535,16 +542,21 @@ def _enhance_table_layout(self, b64_image, layout_blocks):
mask[i] = 1
break

semantic_table_cate = [
None,
] * len(result_layout) + table_layout_cats
for e in table_layout:
result_layout.append(e)

for i, e in enumerate(general_table_layout):
if mask[i] == 0:
result_layout.append(e)
semantic_table_cate.append(DEFAULT_TABLE_CATE)

semantic_polys = [e[0] for e in result_layout]
semantic_labels = [e[1] for e in result_layout]

return semantic_polys, semantic_labels
return semantic_polys, semantic_labels, semantic_table_cate

def _enhance_texts_info_with_formula(self, b64_image, img, textpage_info):
if self.support_formula:
Expand Down Expand Up @@ -656,14 +668,21 @@ def _allocate_semantic(
layout_info = layout
# print('layout_info', layout_info)
if self.enhance_table:
semantic_polys, semantic_labels = self._enhance_table_layout(b64_image, layout)
semantic_polys, semantic_labels, semantic_table_cate = self._enhance_table_layout(
b64_image, layout
)
else:
for info in layout_info["result"]:
bbs = info["bbox"]
coords = ((bbs[0], bbs[1]), (bbs[2], bbs[3]), (bbs[4], bbs[5]), (bbs[6], bbs[7]))
semantic_polys.append(Polygon(coords))
semantic_labels.append(info["category_id"])

semantic_table_cate = [
2,
] * len(semantic_labels)

print("---semantic_table_cate", semantic_table_cate)
timer.toc()

# phase 1. merge continuous text blocks by the containing matrix
Expand Down Expand Up @@ -836,7 +855,8 @@ def _allocate_semantic(
bboxes.append(b_)

table_bbox = semantic_bboxes[ind[0]]
table_infos.append((j, texts, bboxes, table_bbox))
table_cate = semantic_table_cate[ind[0]]
table_infos.append((j, texts, bboxes, table_bbox, table_cate))

new_blocks[j].layout_type = sem_label

Expand All @@ -847,16 +867,17 @@ def _allocate_semantic(
# Parse the table layout
table_layout = []
for table_info in table_infos:
block_ind, texts, bboxes, table_bbox = table_info
block_ind, texts, bboxes, table_bbox, table_cate = table_info
if not texts:
continue
ocr_result = {"texts": texts, "bboxes": rect2polygon(bboxes)}

scene = "cell" if table_cate == 1 else "rowcol"
inp = {
"b64_image": b64_image,
"ocr_result": json.dumps(ocr_result),
"table_bboxes": [table_bbox],
"scene": "cell",
"scene": scene,
}
table_result = self.table_agent.predict(inp)
# print('---table--', ocr_result, table_bbox, table_result)
Expand Down Expand Up @@ -1150,7 +1171,6 @@ def _task(textpage_info, bytes_img, img, is_scan, lang, rot_matirx, page_index:
max_page = fitz_doc.page_count - start
n = self.n if self.n else max_page
n = min(n, max_page)

sample_n = min(5, fitz_doc.page_count)
type_texts = [page.get_text() for page in fitz_doc.pages(0, sample_n)]
type_texts = "".join(type_texts)
Expand Down Expand Up @@ -1211,7 +1231,6 @@ def _task(textpage_info, bytes_img, img, is_scan, lang, rot_matirx, page_index:
textpage_info = self._extract_lines_v2(textpage)
else:
textpage_info = (None, None)

# blocks = _task(textpage_info, bytes_img, img, is_scan, lang, rot_matrix)
futures.append(
executor.submit(
Expand Down
18 changes: 13 additions & 5 deletions tests/test_pdf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,28 +236,36 @@ def test_regress():


def test_pdf_doc10():
url = TEST_RT_URL
# url = TEST_RT_URL
url = "http://192.168.106.20:10502/v2/models/"
layout_ep = url + "elem_layout_v1/infer"
cell_model_ep = url + "elem_table_cell_detect_v1/infer"
rowcol_model_ep = url + "elem_table_rowcol_detect_v1/infer"
table_model_ep = url + "elem_table_detect_v1/infer"
# table_model_ep = url + "elem_table_detect_v1/infer"
table_model_ep = url + "elem_table_multiclass_v1/infer"

model_params = {
"layout_ep": layout_ep,
"cell_model_ep": cell_model_ep,
"rowcol_model_ep": rowcol_model_ep,
"table_model_ep": table_model_ep,
"ocr_model_ep": f"{TEST_RT_URL}elem_ocr_collection_v3/infer",
"ocr_model_ep": "http://192.168.106.125:10502/v2/idp/idp_app/infer",
}
print("model_params", model_params)

filename = "examples/docs/南陵电子2022.pdf"
pdf_doc = PDFDocument(
file=filename, model_params=model_params, start=0, verbose=True, n_parallel=10
file=filename,
model_params=model_params,
start=0,
verbose=True,
n_parallel=10,
n=50,
mode="server",
)
pages = pdf_doc.pages
elements = pdf_doc.elements
visualize_html(elements, "data/南陵电子2022-2.html")
# visualize_html(elements, "data/南陵电子2022-2.html")
save_to_txt(elements, "data/南陵电子2022-2.txt")


Expand Down

0 comments on commit a938ae7

Please sign in to comment.