Skip to content

Commit

Permalink
add sample code about extracting table cell values
Browse files Browse the repository at this point in the history
  • Loading branch information
TakaValley committed Mar 21, 2024
1 parent 56c346a commit d3e3248
Show file tree
Hide file tree
Showing 5 changed files with 166 additions and 75 deletions.
60 changes: 35 additions & 25 deletions sdk/documentintelligence/azure-ai-documentintelligence/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,7 @@ For best results, you should only analyze documents of the same document type th
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult
from helper import utils

endpoint = os.environ["DOCUMENTINTELLIGENCE_ENDPOINT"]
key = os.environ["DOCUMENTINTELLIGENCE_API_KEY"]
Expand Down Expand Up @@ -435,31 +436,40 @@ if result.documents:
f"......found field of type '{field.type}' with value '{field_value}' and with confidence {field.confidence}"
)

# iterate over tables, lines, and selection marks on each page
for page in result.pages:
print(f"\nLines found on page {page.page_number}")
if page.lines:
for line in page.lines:
print(f"...Line '{line.content}'")
if page.words:
for word in page.words:
print(f"...Word '{word.content}' has a confidence of {word.confidence}")
if page.selection_marks:
print(f"\nSelection marks found on page {page.page_number}")
for selection_mark in page.selection_marks:
print(
f"...Selection mark is '{selection_mark.state}' and has a confidence of {selection_mark.confidence}"
)

if result.tables:
for i, table in enumerate(result.tables):
print(f"\nTable {i + 1} can be found on page:")
if table.bounding_regions:
for region in table.bounding_regions:
print(f"...{region.page_number}")
for cell in table.cells:
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
print("-----------------------------------")
# Extract table cell values
SYMBOL_OF_TABLE_TYPE = "array"
KEY_OF_VALUE_OBJECT = "valueObject"
KEY_OF_CELL_CONTENT = "content"

for doc in result.documents:
if not doc.fields is None:
for field_name, field_value in doc.fields.items():
# "MaintenanceLog" is the name of the table field that you labeled. Table cell information is stored as an array in the document field.
if (
field_name == "MaintenanceLog"
and field_value.type == SYMBOL_OF_TABLE_TYPE
and field_value.value_array
):
col_names = []
sample_obj = field_value.value_array[0]
if KEY_OF_VALUE_OBJECT in sample_obj:
col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys())
print("----Extracting Table Cell Values----")
table_rows = []
for obj in field_value.value_array:
if KEY_OF_VALUE_OBJECT in obj:
value_obj = obj[KEY_OF_VALUE_OBJECT]
extract_value_by_col_name = lambda key: (
value_obj[key].get(KEY_OF_CELL_CONTENT)
if key in value_obj
and KEY_OF_CELL_CONTENT in value_obj[key]
else "None"
)
row_data = list(map(extract_value_by_col_name, col_names))
table_rows.append(row_data)
utils.print_table(col_names, table_rows)

print("------------------------------------")
```

<!-- END SNIPPET -->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,33 @@
import os
import asyncio

def print_table(header_names, table_data):
    """Print a two-dimensional array as a left-aligned text table.

    Every column is padded to the width of its longest entry (the header
    included) plus four spaces, so that the columns line up.

    Args:
        header_names: The column header names, e.g. ["name", "gender", "age"].
        table_data: The table rows, a two-dimensional array, e.g.
            [["Mike", "M", 25], ["John", "M", 19], ["Lily", "F", 23]].
            Cells may be of any type (including None); they are printed using
            their str() form. Cells beyond the number of headers are ignored,
            and a row with fewer cells than headers is padded with empty cells.

    Returns:
        None. The table is written to standard output, e.g.

            name    gender    age
            Mike    M         25
            John    M         19
            Lily    F         23
    """
    column_count = len(header_names)
    headers = [str(name) for name in header_names]

    # Convert every cell to text up front: the column widths are measured on the
    # str() form, and values such as None do not accept an alignment format spec.
    rows = [
        [str(row[i]) if i < len(row) else "" for i in range(column_count)]
        for row in table_data
    ]

    widths = [
        max([len(header)] + [len(row[i]) for row in rows])
        for i, header in enumerate(headers)
    ]

    # One left-aligned field per column, each four characters wider than its content.
    row_format_str = "".join(f"{{:<{width + 4}}}" for width in widths)

    print(row_format_str.format(*headers))
    for row in rows:
        print(row_format_str.format(*row))


async def analyze_custom_documents(custom_model_id):
path_to_sample_documents = os.path.abspath(
Expand Down Expand Up @@ -65,31 +92,39 @@ async def analyze_custom_documents(custom_model_id):
print(
f"......found field of type '{field.type}' with value '{field_value}' and with confidence {field.confidence}"
)

# iterate over tables, lines, and selection marks on each page
for page in result.pages:
print(f"\nLines found on page {page.page_number}")
if page.lines:
for line in page.lines:
print(f"...Line '{line.content}'")
if page.words:
for word in page.words:
print(f"...Word '{word.content}' has a confidence of {word.confidence}")
if page.selection_marks:
print(f"\nSelection marks found on page {page.page_number}")
for selection_mark in page.selection_marks:
print(
f"...Selection mark is '{selection_mark.state}' and has a confidence of {selection_mark.confidence}"
)

if result.tables:
for i, table in enumerate(result.tables):
print(f"\nTable {i + 1} can be found on page:")
if table.bounding_regions:
for region in table.bounding_regions:
print(f"...{region.page_number}")
for cell in table.cells:
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")

# Extract table cell values
SYMBOL_OF_TABLE_TYPE = "array"
KEY_OF_VALUE_OBJECT = "valueObject"
KEY_OF_CELL_CONTENT = "content"

for doc in result.documents:
if not doc.fields is None:
for field_name, field_value in doc.fields.items():
# "MaintenanceLog" is the name of the table field that you labeled. Table cell information is stored as an array in the document field.
if (
field_name == "MaintenanceLog"
and field_value.type == SYMBOL_OF_TABLE_TYPE
and field_value.value_array
):
col_names = []
sample_obj = field_value.value_array[0]
if KEY_OF_VALUE_OBJECT in sample_obj:
col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys())
print("----Extracting Table Cell Values----")
table_rows = []
for obj in field_value.value_array:
if KEY_OF_VALUE_OBJECT in obj:
value_obj = obj[KEY_OF_VALUE_OBJECT]
extract_value_by_col_name = lambda key: (
value_obj[key].get(KEY_OF_CELL_CONTENT)
if key in value_obj
and KEY_OF_CELL_CONTENT in value_obj[key]
else "None"
)
row_data = list(map(extract_value_by_col_name, col_names))
table_rows.append(row_data)
print_table(col_names, table_rows)
print("-----------------------------------")
# [END analyze_custom_documents]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,11 @@ async def analyze_read():
for paragraph in result.paragraphs:
print(f"Found paragraph with role: '{paragraph.role}' within {paragraph.bounding_regions} bounding region")
print(f"...with content: '{paragraph.content}'")

result.paragraphs.sort(key=lambda p: (p.spans.sort(key=lambda s: s.offset), p.spans[0].offset))
print("-----Print sorted paragraphs-----")
for idx, paragraph in enumerate(result.paragraphs):
print(f"...paragraph:{idx} with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}")

print("----------------------------------------")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,33 @@

import os

def print_table(header_names, table_data):
    """Print a two-dimensional array as a left-aligned text table.

    Every column is padded to the width of its longest entry (the header
    included) plus four spaces, so that the columns line up.

    Args:
        header_names: The column header names, e.g. ["name", "gender", "age"].
        table_data: The table rows, a two-dimensional array, e.g.
            [["Mike", "M", 25], ["John", "M", 19], ["Lily", "F", 23]].
            Cells may be of any type (including None); they are printed using
            their str() form. Cells beyond the number of headers are ignored,
            and a row with fewer cells than headers is padded with empty cells.

    Returns:
        None. The table is written to standard output, e.g.

            name    gender    age
            Mike    M         25
            John    M         19
            Lily    F         23
    """
    column_count = len(header_names)
    headers = [str(name) for name in header_names]

    # Convert every cell to text up front: the column widths are measured on the
    # str() form, and values such as None do not accept an alignment format spec.
    rows = [
        [str(row[i]) if i < len(row) else "" for i in range(column_count)]
        for row in table_data
    ]

    widths = [
        max([len(header)] + [len(row[i]) for row in rows])
        for i, header in enumerate(headers)
    ]

    # One left-aligned field per column, each four characters wider than its content.
    row_format_str = "".join(f"{{:<{width + 4}}}" for width in widths)

    print(row_format_str.format(*headers))
    for row in rows:
        print(row_format_str.format(*row))


def analyze_custom_documents(custom_model_id):
path_to_sample_documents = os.path.abspath(
Expand Down Expand Up @@ -65,31 +92,40 @@ def analyze_custom_documents(custom_model_id):
f"......found field of type '{field.type}' with value '{field_value}' and with confidence {field.confidence}"
)

# iterate over tables, lines, and selection marks on each page
for page in result.pages:
print(f"\nLines found on page {page.page_number}")
if page.lines:
for line in page.lines:
print(f"...Line '{line.content}'")
if page.words:
for word in page.words:
print(f"...Word '{word.content}' has a confidence of {word.confidence}")
if page.selection_marks:
print(f"\nSelection marks found on page {page.page_number}")
for selection_mark in page.selection_marks:
print(
f"...Selection mark is '{selection_mark.state}' and has a confidence of {selection_mark.confidence}"
)

if result.tables:
for i, table in enumerate(result.tables):
print(f"\nTable {i + 1} can be found on page:")
if table.bounding_regions:
for region in table.bounding_regions:
print(f"...{region.page_number}")
for cell in table.cells:
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
print("-----------------------------------")
# Extract table cell values
SYMBOL_OF_TABLE_TYPE = "array"
KEY_OF_VALUE_OBJECT = "valueObject"
KEY_OF_CELL_CONTENT = "content"

for doc in result.documents:
if not doc.fields is None:
for field_name, field_value in doc.fields.items():
# "MaintenanceLog" is the name of the table field that you labeled. Table cell information is stored as an array in the document field.
if (
field_name == "MaintenanceLog"
and field_value.type == SYMBOL_OF_TABLE_TYPE
and field_value.value_array
):
col_names = []
sample_obj = field_value.value_array[0]
if KEY_OF_VALUE_OBJECT in sample_obj:
col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys())
print("----Extracting Table Cell Values----")
table_rows = []
for obj in field_value.value_array:
if KEY_OF_VALUE_OBJECT in obj:
value_obj = obj[KEY_OF_VALUE_OBJECT]
extract_value_by_col_name = lambda key: (
value_obj[key].get(KEY_OF_CELL_CONTENT)
if key in value_obj
and KEY_OF_CELL_CONTENT in value_obj[key]
else "None"
)
row_data = list(map(extract_value_by_col_name, col_names))
table_rows.append(row_data)
print_table(col_names, table_rows)

print("------------------------------------")
# [END analyze_custom_documents]


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,11 @@ def analyze_read():
for paragraph in result.paragraphs:
print(f"Found paragraph with role: '{paragraph.role}' within {paragraph.bounding_regions} bounding region")
print(f"...with content: '{paragraph.content}'")

result.paragraphs.sort(key=lambda p: (p.spans.sort(key=lambda s: s.offset), p.spans[0].offset))
print("-----Print sorted paragraphs-----")
for idx, paragraph in enumerate(result.paragraphs):
print(f"...paragraph:{idx} with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}")

print("----------------------------------------")

Expand Down

0 comments on commit d3e3248

Please sign in to comment.