Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat: form parsing placeholders #3034

Merged
merged 14 commits into from
May 16, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
* **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive to the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy.

### Features
* **Add form extraction basics (document elements and placeholder code in partition).**
MillCheck marked this conversation as resolved.
Show resolved Hide resolved

### Fixes

Expand Down
149 changes: 149 additions & 0 deletions example-docs/test_evaluate_files/unstructured_output/form.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
[
{
"type": "FormKeysValues",
"element_id": "MOCK_FORM_ID",
"text": "",
"metadata": {
"coordinates": {
"points": [
[
35.15625,
95.556640625
],
[
710.357666015625,
95.556640625
],
[
710.357666015625,
887.890625
],
[
35.15625,
887.890625
]
],
"system": "PixelSpace",
"layout_width": 754,
"layout_height": 1000
},
"page_number": 1,
"key_value_pairs": [
{
"key": {
"text": "MOCK KEY",
"custom_element": {
"type": "UncategorizedText",
"element_id": "MOCK_KEY_ID_1",
"text": "MOCK KEY",
"metadata": {
"coordinates": {
"points": [
[
503.271484375,
96.3897705078125
],
[
503.271484375,
107.5164794921875
],
[
606.103515625,
107.5164794921875
],
[
606.103515625,
96.3897705078125
]
],
"system": "PixelSpace",
"layout_width": 754,
"layout_height": 1000
},
"page_number": 1
}
},
"layout_element_id": null
},
"value": {
"text": "MOCK VALUE",
"custom_element": {
"type": "UncategorizedText",
"element_id": "MOCK_VALUE_ID",
"text": "MOCK VALUE",
"metadata": {
"coordinates": {
"points": [
[
557.568359375,
124.8626708984375
],
[
557.568359375,
136.6607666015625
],
[
595.556640625,
136.6607666015625
],
[
595.556640625,
124.8626708984375
]
],
"system": "PixelSpace",
"layout_width": 754,
"layout_height": 1000
},
"page_number": 1
}
},
"layout_element_id": null
},
"confidence": 0.0
},
{
"key": {
"text": "MOCK KEY 2",
"custom_element": {
"type": "UncategorizedText",
"element_id": "MOCK_KEY_ID_2",
"text": "MOCK KEY 2",
"metadata": {
"coordinates": {
"points": [
[
428.52783203125,
124.0478515625
],
[
428.52783203125,
136.6943359375
],
[
473.81591796875,
136.6943359375
],
[
473.81591796875,
124.0478515625
]
],
"system": "PixelSpace",
"layout_width": 754,
"layout_height": 1000
},
"page_number": 1
}
},
"layout_element_id": null
},
"value": null,
"confidence": 0.0
}
],
"file_directory": "dataset/testing_data/images",
"filename": "MOCK.png"
}
}
]
14 changes: 13 additions & 1 deletion test_unstructured/documents/test_elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@
from __future__ import annotations

import copy
import io
import json
import pathlib
from functools import partial

import pytest

from test_unstructured.unit_utils import assign_hash_ids
from test_unstructured.unit_utils import assign_hash_ids, example_doc_path
from unstructured.cleaners.core import clean_bullets, clean_prefix
from unstructured.documents.coordinates import (
CoordinateSystem,
Expand All @@ -31,6 +32,7 @@
Title,
assign_and_map_hash_ids,
)
from unstructured.partition.json import partition_json


@pytest.mark.parametrize("element", [Element(), Text(text=""), CheckBox()])
Expand Down Expand Up @@ -744,3 +746,13 @@ def test_id_to_hash_calculates(text, sequence_number, filename, page_number, exp
)
assert element.id_to_hash(sequence_number) == expected_hash, "Returned ID does not match"
assert element.id == expected_hash, "ID should be set"


def test_formskeysvalues_reads_saves():
    """Elements read from the form fixture survive a to_dict -> JSON -> partition_json cycle."""
    fixture_path = example_doc_path("test_evaluate_files/unstructured_output/form.json")
    original_elements = partition_json(filename=fixture_path)

    # -- serialize to an in-memory JSON document; a freshly built StringIO reads from 0 --
    serialized = json.dumps([element.to_dict() for element in original_elements])
    round_tripped = partition_json(file=io.StringIO(serialized))

    assert original_elements == round_tripped
64 changes: 64 additions & 0 deletions unstructured/documents/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,18 @@ class Link(TypedDict):
start_index: int


class FormKeyOrValue(TypedDict):
    """One side (the key or the value) of an extracted form key-value pair."""

    # -- text content of the key or value --
    text: str
    # -- NOTE(review): presumably the id of an already-partitioned layout element this
    # -- key/value corresponds to; None when there is none -- confirm with the producer --
    layout_element_id: Optional[str]
    # -- nested Text element describing this key/value; stored as an element in memory and
    # -- converted to/from a plain dict when the enclosing metadata is (de)serialized --
    custom_element: Optional[Text]


class FormKeyValuePair(TypedDict):
    """A form key with its optional value; the item type of `ElementMetadata.key_value_pairs`."""

    # -- the key side of the pair; always present --
    key: FormKeyOrValue
    # -- the value side of the pair; None when the key has no associated value --
    value: Optional[FormKeyOrValue]
    # -- NOTE(review): presumably the extractor's score for this pairing; the valid range is
    # -- not shown here -- confirm with the producer --
    confidence: float


class ElementMetadata:
"""Fully-dynamic replacement for dataclass-based ElementMetadata."""

Expand Down Expand Up @@ -176,6 +188,7 @@ class ElementMetadata:
header_footer_type: Optional[str]
# -- used in chunks only, when chunk must be split mid-text to fit window --
is_continuation: Optional[bool]
key_value_pairs: Optional[list[FormKeyValuePair]]
languages: Optional[list[str]]
last_modified: Optional[str]
link_texts: Optional[list[str]]
Expand Down Expand Up @@ -327,6 +340,8 @@ def from_dict(cls, meta_dict: dict[str, Any]) -> ElementMetadata:
self.data_source = DataSourceMetadata.from_dict(field_value)
elif field_name == "orig_elements":
self.orig_elements = elements_from_base64_gzipped_json(field_value)
elif field_name == "key_value_pairs":
self.key_value_pairs = _kvform_rehydrate_internal_elements(field_value)
else:
setattr(self, field_name, field_value)

Expand Down Expand Up @@ -392,6 +407,8 @@ def to_dict(self) -> dict[str, Any]:
meta_dict["data_source"] = self.data_source.to_dict()
if self.orig_elements is not None:
meta_dict["orig_elements"] = elements_to_base64_gzipped_json(self.orig_elements)
if self.key_value_pairs is not None:
meta_dict["key_value_pairs"] = _kvform_pairs_to_dict(self.key_value_pairs)

return meta_dict

Expand Down Expand Up @@ -494,6 +511,7 @@ def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]:
"text_as_html": cls.FIRST, # -- only occurs in Table --
"table_as_cells": cls.FIRST, # -- only occurs in Table --
"url": cls.FIRST,
"key_value_pairs": cls.DROP, # -- only occurs in FormKeysValues --
}


Expand Down Expand Up @@ -660,6 +678,7 @@ class ElementType:
PAGE_FOOTER = "Page-footer"
PAGE_NUMBER = "PageNumber"
CODE_SNIPPET = "CodeSnippet"
FORM_KEYS_VALUES = "FormKeysValues"

@classmethod
def to_dict(cls):
Expand Down Expand Up @@ -992,6 +1011,12 @@ class PageNumber(Text):
category = "PageNumber"


class FormKeysValues(Text):
    """An element for capturing Key-Value dicts (forms).

    The key/value pairs are carried in the `key_value_pairs` metadata field, which only occurs
    on this element type.
    """

    category = "FormKeysValues"


TYPE_TO_TEXT_ELEMENT_MAP: dict[str, type[Text]] = {
ElementType.TITLE: Title,
ElementType.SECTION_HEADER: Title,
Expand Down Expand Up @@ -1029,4 +1054,43 @@ class PageNumber(Text):
ElementType.PAGE_BREAK: PageBreak,
ElementType.CODE_SNIPPET: CodeSnippet,
ElementType.PAGE_NUMBER: PageNumber,
ElementType.FORM_KEYS_VALUES: FormKeysValues,
}


def _kvform_rehydrate_internal_elements(kv_pairs: list[dict]) -> list[FormKeyValuePair]:
    """Turn the serialized `custom_element` dicts inside `kv_pairs` back into elements.

    The key_value_pairs metadata field contains (in the vast majority of cases) nested Text
    elements. When the metadata is loaded from its dict form, e.g. when partition_json is
    used, those nested elements arrive as plain dicts and must be converted explicitly.

    The pairs are updated in place and the same list object is returned.
    """
    # -- imported at call time rather than at module import time --
    from unstructured.staging.base import elements_from_dicts

    def _rehydrate(part: dict) -> None:
        """Replace a non-None serialized `custom_element` of `part` with an element."""
        serialized = part["custom_element"]
        if serialized is not None:
            # -- single-target unpacking insists on exactly one element coming back --
            (part["custom_element"],) = elements_from_dicts([serialized])

    # -- safe to overwrite - deepcopy already happened --
    for pair in kv_pairs:
        _rehydrate(pair["key"])
        value_part = pair["value"]
        if value_part is not None:
            _rehydrate(value_part)

    return kv_pairs


def _kvform_pairs_to_dict(kv_pairs: list[FormKeyValuePair]) -> list[dict]:
    """Return a serializable copy of `kv_pairs` with nested elements converted to dicts.

    The key_value_pairs metadata field contains (in the vast majority of cases) nested Text
    elements. Each non-None `custom_element` found under "key" or "value" is replaced by the
    result of its `.to_dict()`, e.g. when FormKeysValues.to_dict() is used.

    Args:
        kv_pairs: Key/value pairs whose `custom_element` entries are elements or None. The
            "value" of a pair may itself be None.

    Returns:
        A deep copy of `kv_pairs` with elements replaced by dicts; the argument is not modified.
    """
    # -- the copy gets its own name: its item type differs from the parameter's, and rebinding
    # -- the parameter with a new annotation is rejected by type checkers --
    pair_dicts: list[dict] = copy.deepcopy(kv_pairs)
    for pair in pair_dicts:
        # -- "key" is required; "value" is skipped when the key has no value --
        parts = [pair["key"]]
        if pair["value"] is not None:
            parts.append(pair["value"])
        for part in parts:
            element = part["custom_element"]
            if element is not None:
                part["custom_element"] = element.to_dict()

    return pair_dicts
13 changes: 10 additions & 3 deletions unstructured/partition/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,7 @@
from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import add_metadata
from unstructured.partition.common import exactly_one
from unstructured.partition.lang import (
check_language_args,
)
from unstructured.partition.lang import check_language_args
from unstructured.partition.pdf import partition_pdf_or_image
from unstructured.partition.utils.constants import PartitionStrategy

Expand All @@ -33,6 +31,8 @@ def partition_image(
extract_image_block_to_payload: bool = False,
date_from_file_object: bool = False,
starting_page_number: int = 1,
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
**kwargs: Any,
) -> list[Element]:
"""Parses an image into a list of interpreted elements.
Expand Down Expand Up @@ -90,6 +90,11 @@ def partition_image(
date_from_file_object
Applies only when providing file via `file` parameter. If this option is True, attempt
infer last_modified metadata from bytes, otherwise set it to None.
extract_forms
Whether the form extraction logic should be run
(results in adding FormKeysValues elements to output).
form_extraction_skip_tables
Whether the form extraction logic should ignore regions designated as Tables.
"""
exactly_one(filename=filename, file=file)

Expand All @@ -111,5 +116,7 @@ def partition_image(
extract_image_block_to_payload=extract_image_block_to_payload,
date_from_file_object=date_from_file_object,
starting_page_number=starting_page_number,
extract_forms=extract_forms,
form_extraction_skip_tables=form_extraction_skip_tables,
**kwargs,
)
Loading
Loading