Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add unstructured_inference dependency decorator, remove more_itertool…
Browse files Browse the repository at this point in the history
…s dependency
plutasnyy committed Apr 18, 2024
1 parent c73714b commit 8b71376
Showing 4 changed files with 33 additions and 25 deletions.
18 changes: 14 additions & 4 deletions test_unstructured/metrics/test_table_formats.py
Original file line number Diff line number Diff line change
@@ -10,7 +10,17 @@ def test_simple_table_cell_parsing_from_table_transformer_when_expected_input():
assert expected_cell == transformed_cell


def test_simple_table_cell_parsing_from_table_transformer_when_missing_input():
table_transformer_cell = {"row_nums": [], "column_nums": [], "cell text": "text"}
with pytest.raises(ValueError):
SimpleTableCell.from_table_transformer_cell(table_transformer_cell)
def test_simple_table_cell_parsing_from_table_transformer_when_missing_row_nums():
cell = {"row_nums": [], "column_nums": [1], "cell text": "text"}
with pytest.raises(ValueError) as exception_info:
SimpleTableCell.from_table_transformer_cell(cell)
assert str(exception_info.value) == f'Cell {str(cell)} has missing values under "row_nums" key'


def test_simple_table_cell_parsing_from_table_transformer_when_missing_column_nums():
cell = {"row_nums": [1], "column_nums": [], "cell text": "text"}
with pytest.raises(ValueError) as exception_info:
SimpleTableCell.from_table_transformer_cell(cell)
assert (
str(exception_info.value) == f'Cell {str(cell)} has missing values under "column_nums" key'
)
22 changes: 13 additions & 9 deletions unstructured/metrics/table/table_formats.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from dataclasses import dataclass

from more_itertools import first


@dataclass
class SimpleTableCell:
@@ -26,13 +24,19 @@ def from_table_transformer_cell(cls, tatr_table_cell: dict[str, list[int] | str]
"cell text": "Text inside cell"
}
"""
rows_sorted = sorted(tatr_table_cell["row_nums"])
columns_sorted = sorted(tatr_table_cell["column_nums"])

x = first(columns_sorted)
y = first(rows_sorted)
row_nums = tatr_table_cell.get("row_nums", [])
column_nums = tatr_table_cell.get("column_nums", [])

if not row_nums:
raise ValueError(f'Cell {tatr_table_cell} has missing values under "row_nums" key')
if not column_nums:
raise ValueError(f'Cell {tatr_table_cell} has missing values under "column_nums" key')

x = sorted(column_nums)[0]
y = sorted(row_nums)[0]

width = len(columns_sorted)
height = len(rows_sorted)
width = len(column_nums)
height = len(row_nums)

return cls(x=x, y=y, w=width, h=height, content=tatr_table_cell["cell text"])
return cls(x=x, y=y, w=width, h=height, content=tatr_table_cell.get("cell text", ""))
10 changes: 2 additions & 8 deletions unstructured/partition/common.py
Original file line number Diff line number Diff line change
@@ -578,14 +578,8 @@ def document_to_element_list(
else:
if last_modification_date:
element.metadata.last_modified = last_modification_date
element.metadata.text_as_html = (
layout_element.text_as_html if hasattr(layout_element, "text_as_html") else None
)
element.metadata.table_as_cells = (
layout_element.table_as_cells
if hasattr(layout_element, "table_as_cells")
else None
)
element.metadata.text_as_html = getattr(layout_element, "text_as_html", None)
element.metadata.table_as_cells = getattr(layout_element, "table_as_cells", None)
try:
if (
isinstance(element, Title) and element.metadata.category_depth is None
8 changes: 4 additions & 4 deletions unstructured/partition/pdf_image/ocr.py
Original file line number Diff line number Diff line change
@@ -8,7 +8,6 @@
# unstructured.documents.elements.Image
from PIL import Image as PILImage
from PIL import ImageSequence
from unstructured_inference.models.tables import cells_to_html

from unstructured.documents.elements import ElementType
from unstructured.logger import logger
@@ -254,6 +253,7 @@ def supplement_page_layout_with_ocr(
return page_layout


@requires_dependencies("unstructured_inference")
def supplement_element_with_table_extraction(
elements: List["LayoutElement"],
image: PILImage,
@@ -267,6 +267,7 @@ def supplement_element_with_table_extraction(
the table's text content is rendered into a html string and "table_as_cells"
with the raw table cells output from table agent
"""
from unstructured_inference.models.tables import cells_to_html

table_elements = [el for el in elements if el.type == ElementType.TABLE]
for element in table_elements:
@@ -292,12 +293,11 @@ def supplement_element_with_table_extraction(
)
text_as_html = cells_to_html(tatr_cells)
simple_table_cells = [
SimpleTableCell.from_table_transformer_cell(cell) for cell in tatr_cells
SimpleTableCell.from_table_transformer_cell(cell).to_dict() for cell in tatr_cells
]
serializable_simple_cells = [cell.to_dict() for cell in simple_table_cells]

element.text_as_html = text_as_html
element.table_as_cells = serializable_simple_cells
element.table_as_cells = simple_table_cells

return elements

0 comments on commit 8b71376

Please sign in to comment.