Skip to content

Commit

Permalink
add sample code about extracting table cell values
Browse files Browse the repository at this point in the history
  • Loading branch information
TakaValley committed Mar 8, 2024
1 parent 56c346a commit a4e99fa
Show file tree
Hide file tree
Showing 7 changed files with 158 additions and 76 deletions.
57 changes: 33 additions & 24 deletions sdk/documentintelligence/azure-ai-documentintelligence/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,7 @@ For best results, you should only analyze documents of the same document type th
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult
from helper import utils

endpoint = os.environ["DOCUMENTINTELLIGENCE_ENDPOINT"]
key = os.environ["DOCUMENTINTELLIGENCE_API_KEY"]
Expand Down Expand Up @@ -435,31 +436,39 @@ if result.documents:
f"......found field of type '{field.type}' with value '{field_value}' and with confidence {field.confidence}"
)

# iterate over tables, lines, and selection marks on each page
for page in result.pages:
print(f"\nLines found on page {page.page_number}")
if page.lines:
for line in page.lines:
print(f"...Line '{line.content}'")
if page.words:
for word in page.words:
print(f"...Word '{word.content}' has a confidence of {word.confidence}")
if page.selection_marks:
print(f"\nSelection marks found on page {page.page_number}")
for selection_mark in page.selection_marks:
print(
f"...Selection mark is '{selection_mark.state}' and has a confidence of {selection_mark.confidence}"
)
# Extract table cell values
SYMBOL_OF_TABLE_TYPE = "array"
KEY_OF_VALUE_OBJECT = "valueObject"
KEY_OF_CELL_CONTENT = "content"

for doc in result.documents:
for field_name, field_value in doc.fields.items():
# "MaintenanceLog" is the name of the table field that you labeled. Table cell information is stored as an array in the document field.
if (
field_name == "MaintenanceLog"
and field_value.type == SYMBOL_OF_TABLE_TYPE
and field_value.value_array
):
col_names = []
sample_obj = field_value.value_array[0]
if KEY_OF_VALUE_OBJECT in sample_obj:
col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys())
print("----Extracting Table Cell Values----")
table_rows = []
for obj in field_value.value_array:
if KEY_OF_VALUE_OBJECT in obj:
value_obj = obj[KEY_OF_VALUE_OBJECT]
extract_value_by_col_name = lambda key: (
value_obj[key].get(KEY_OF_CELL_CONTENT)
if key in value_obj
and KEY_OF_CELL_CONTENT in value_obj[key]
else "None"
)
row_data = list(map(extract_value_by_col_name, col_names))
table_rows.append(row_data)
utils.print_table(col_names, table_rows)

if result.tables:
for i, table in enumerate(result.tables):
print(f"\nTable {i + 1} can be found on page:")
if table.bounding_regions:
for region in table.bounding_regions:
print(f"...{region.page_number}")
for cell in table.cells:
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
print("-----------------------------------")
print("------------------------------------")
```

<!-- END SNIPPET -->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,20 @@

import os
import asyncio

import sys

async def analyze_custom_documents(custom_model_id):
path_of_parents = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")
path_to_sample_documents = os.path.abspath(
os.path.join(os.path.abspath(__file__), "..", "..", "./sample_forms/forms/Form_1.jpg")
os.path.join(path_of_parents, "./sample_forms/forms/Form_1.jpg")
)
sys.path.append(path_of_parents)

# [START analyze_custom_documents]
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult
from helper import utils

endpoint = os.environ["DOCUMENTINTELLIGENCE_ENDPOINT"]
key = os.environ["DOCUMENTINTELLIGENCE_API_KEY"]
Expand All @@ -65,31 +69,39 @@ async def analyze_custom_documents(custom_model_id):
print(
f"......found field of type '{field.type}' with value '{field_value}' and with confidence {field.confidence}"
)

# iterate over tables, lines, and selection marks on each page
for page in result.pages:
print(f"\nLines found on page {page.page_number}")
if page.lines:
for line in page.lines:
print(f"...Line '{line.content}'")
if page.words:
for word in page.words:
print(f"...Word '{word.content}' has a confidence of {word.confidence}")
if page.selection_marks:
print(f"\nSelection marks found on page {page.page_number}")
for selection_mark in page.selection_marks:
print(
f"...Selection mark is '{selection_mark.state}' and has a confidence of {selection_mark.confidence}"
)

if result.tables:
for i, table in enumerate(result.tables):
print(f"\nTable {i + 1} can be found on page:")
if table.bounding_regions:
for region in table.bounding_regions:
print(f"...{region.page_number}")
for cell in table.cells:
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")

# Extract table cell values
SYMBOL_OF_TABLE_TYPE = "array"
KEY_OF_VALUE_OBJECT = "valueObject"
KEY_OF_CELL_CONTENT = "content"

for doc in result.documents:
if not doc.fields is None:
for field_name, field_value in doc.fields.items():
# "MaintenanceLog" is the name of the table field that you labeled. Table cell information is stored as an array in the document field.
if (
field_name == "MaintenanceLog"
and field_value.type == SYMBOL_OF_TABLE_TYPE
and field_value.value_array
):
col_names = []
sample_obj = field_value.value_array[0]
if KEY_OF_VALUE_OBJECT in sample_obj:
col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys())
print("----Extracting Table Cell Values----")
table_rows = []
for obj in field_value.value_array:
if KEY_OF_VALUE_OBJECT in obj:
value_obj = obj[KEY_OF_VALUE_OBJECT]
extract_value_by_col_name = lambda key: (
value_obj[key].get(KEY_OF_CELL_CONTENT)
if key in value_obj
and KEY_OF_CELL_CONTENT in value_obj[key]
else "None"
)
row_data = list(map(extract_value_by_col_name, col_names))
table_rows.append(row_data)
utils.print_table(col_names, table_rows)
print("-----------------------------------")
# [END analyze_custom_documents]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,11 @@ async def analyze_read():
for paragraph in result.paragraphs:
print(f"Found paragraph with role: '{paragraph.role}' within {paragraph.bounding_regions} bounding region")
print(f"...with content: '{paragraph.content}'")

result.paragraphs.sort(key=lambda p: (p.spans.sort(key=lambda s: s.offset), p.spans[0].offset))
print("-----Print sorted paragraphs-----")
for idx, paragraph in enumerate(result.paragraphs):
print(f"...paragraph:{idx} with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}")

print("----------------------------------------")

Expand Down
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# coding: utf-8

# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------

"""
FILE: utils.py
DESCRIPTION:
These utility functions provide an intuitive way to organize data and make the sample code more concise.
"""

def print_table(header_names, table_data):
    """Print column headers and rows as a left-aligned text table.

    Every column is padded to the length of its longest entry (the header
    included) plus four spaces. Cells are rendered with ``str()``, so values
    such as ``None``, numbers, or lists can be printed without raising.

    Args:
        header_names: A sequence of column header names,
            e.g. ["name", "gender", "age"].
        table_data: A sequence of rows, each a sequence with at least as many
            cells as there are headers,
            e.g. [["Mike", "M", 25], ["John", "M", 19], ["Lily", "F", 23]].
            Cells beyond the number of headers are not printed.

    Returns:
        None. The table is written to standard output, e.g.
            name    gender    age
            Mike    M         25
            John    M         19
            Lily    F         23

    Raises:
        IndexError: If a row has fewer cells than there are headers.
    """
    # Width of each column = longest string form among the header and its cells.
    column_widths = [
        max([len(str(header)), *(len(str(row[index])) for row in table_data)])
        for index, header in enumerate(header_names)
    ]

    # "!s" converts each cell with str() before padding, matching how the
    # widths were measured; a bare "{:<N}" fails for None, list, or dict cells.
    row_format = "".join(f"{{!s:<{width + 4}}}" for width in column_widths)

    print(row_format.format(*header_names))
    for row in table_data:
        print(row_format.format(*row))
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def analyze_custom_documents(custom_model_id):
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult
from helper import utils

endpoint = os.environ["DOCUMENTINTELLIGENCE_ENDPOINT"]
key = os.environ["DOCUMENTINTELLIGENCE_API_KEY"]
Expand Down Expand Up @@ -65,31 +66,40 @@ def analyze_custom_documents(custom_model_id):
f"......found field of type '{field.type}' with value '{field_value}' and with confidence {field.confidence}"
)

# iterate over tables, lines, and selection marks on each page
for page in result.pages:
print(f"\nLines found on page {page.page_number}")
if page.lines:
for line in page.lines:
print(f"...Line '{line.content}'")
if page.words:
for word in page.words:
print(f"...Word '{word.content}' has a confidence of {word.confidence}")
if page.selection_marks:
print(f"\nSelection marks found on page {page.page_number}")
for selection_mark in page.selection_marks:
print(
f"...Selection mark is '{selection_mark.state}' and has a confidence of {selection_mark.confidence}"
)

if result.tables:
for i, table in enumerate(result.tables):
print(f"\nTable {i + 1} can be found on page:")
if table.bounding_regions:
for region in table.bounding_regions:
print(f"...{region.page_number}")
for cell in table.cells:
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
print("-----------------------------------")
# Extract table cell values
SYMBOL_OF_TABLE_TYPE = "array"
KEY_OF_VALUE_OBJECT = "valueObject"
KEY_OF_CELL_CONTENT = "content"

for doc in result.documents:
if not doc.fields is None:
for field_name, field_value in doc.fields.items():
# "MaintenanceLog" is the name of the table field that you labeled. Table cell information is stored as an array in the document field.
if (
field_name == "MaintenanceLog"
and field_value.type == SYMBOL_OF_TABLE_TYPE
and field_value.value_array
):
col_names = []
sample_obj = field_value.value_array[0]
if KEY_OF_VALUE_OBJECT in sample_obj:
col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys())
print("----Extracting Table Cell Values----")
table_rows = []
for obj in field_value.value_array:
if KEY_OF_VALUE_OBJECT in obj:
value_obj = obj[KEY_OF_VALUE_OBJECT]
extract_value_by_col_name = lambda key: (
value_obj[key].get(KEY_OF_CELL_CONTENT)
if key in value_obj
and KEY_OF_CELL_CONTENT in value_obj[key]
else "None"
)
row_data = list(map(extract_value_by_col_name, col_names))
table_rows.append(row_data)
utils.print_table(col_names, table_rows)

print("------------------------------------")
# [END analyze_custom_documents]


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,11 @@ def analyze_read():
for paragraph in result.paragraphs:
print(f"Found paragraph with role: '{paragraph.role}' within {paragraph.bounding_regions} bounding region")
print(f"...with content: '{paragraph.content}'")

result.paragraphs.sort(key=lambda p: (p.spans.sort(key=lambda s: s.offset), p.spans[0].offset))
print("-----Print sorted paragraphs-----")
for idx, paragraph in enumerate(result.paragraphs):
print(f"...paragraph:{idx} with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}")

print("----------------------------------------")

Expand Down

0 comments on commit a4e99fa

Please sign in to comment.