Skip to content

Commit

Permalink
add sample code about extracting table cell values
Browse files Browse the repository at this point in the history
  • Loading branch information
TakaValley committed Mar 21, 2024
1 parent 56c346a commit 391fbdd
Show file tree
Hide file tree
Showing 5 changed files with 165 additions and 75 deletions.
59 changes: 34 additions & 25 deletions sdk/documentintelligence/azure-ai-documentintelligence/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -435,31 +435,40 @@ if result.documents:
f"......found field of type '{field.type}' with value '{field_value}' and with confidence {field.confidence}"
)

# iterate over tables, lines, and selection marks on each page
for page in result.pages:
print(f"\nLines found on page {page.page_number}")
if page.lines:
for line in page.lines:
print(f"...Line '{line.content}'")
if page.words:
for word in page.words:
print(f"...Word '{word.content}' has a confidence of {word.confidence}")
if page.selection_marks:
print(f"\nSelection marks found on page {page.page_number}")
for selection_mark in page.selection_marks:
print(
f"...Selection mark is '{selection_mark.state}' and has a confidence of {selection_mark.confidence}"
)

if result.tables:
for i, table in enumerate(result.tables):
print(f"\nTable {i + 1} can be found on page:")
if table.bounding_regions:
for region in table.bounding_regions:
print(f"...{region.page_number}")
for cell in table.cells:
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
print("-----------------------------------")
# Extract table cell values
SYMBOL_OF_TABLE_TYPE = "array"
KEY_OF_VALUE_OBJECT = "valueObject"
KEY_OF_CELL_CONTENT = "content"

for doc in result.documents:
if not doc.fields is None:
for field_name, field_value in doc.fields.items():
# "MaintenanceLog" is the name of the table field that you labeled. Table cell information is stored as an array in the document field.
if (
field_name == "MaintenanceLog"
and field_value.type == SYMBOL_OF_TABLE_TYPE
and field_value.value_array
):
col_names = []
sample_obj = field_value.value_array[0]
if KEY_OF_VALUE_OBJECT in sample_obj:
col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys())
print("----Extracting Table Cell Values----")
table_rows = []
for obj in field_value.value_array:
if KEY_OF_VALUE_OBJECT in obj:
value_obj = obj[KEY_OF_VALUE_OBJECT]
extract_value_by_col_name = lambda key: (
value_obj[key].get(KEY_OF_CELL_CONTENT)
if key in value_obj
and KEY_OF_CELL_CONTENT in value_obj[key]
else "None"
)
row_data = list(map(extract_value_by_col_name, col_names))
table_rows.append(row_data)
print_table(col_names, table_rows)

print("------------------------------------")
```

<!-- END SNIPPET -->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,33 @@
import os
import asyncio

def print_table(header_names, table_data):
    """Print column headers and rows as a left-aligned text table.

    Every header and cell is converted with ``str()`` before measuring and
    formatting, so values such as ``None`` or numbers are printed as text
    instead of raising a formatting error. Each column is as wide as its
    longest entry (header included) plus four spaces of padding.

    Args:
        header_names: A sequence of column header names,
            e.g. ["Name", "Gender", "Age"].
        table_data: A two-dimensional sequence holding the table rows,
            e.g. [["Mike", "M", 25], ["John", "M", 19], ["Lily", "F", 23]].
            A row shorter than ``header_names`` is padded with empty cells;
            cells beyond the last header are ignored.

    Returns:
        None. The table is written to standard output, e.g.

            Name    Gender    Age
            Mike    M         25
            John    M         19
            Lily    F         23
    """
    column_count = len(header_names)
    headers = [str(name) for name in header_names]

    # Normalize every row to exactly ``column_count`` string cells so that
    # width calculation and formatting can never fail on ragged rows or on
    # values (like None) that reject the "<" alignment format spec.
    rows = []
    for row in table_data:
        cells = [str(cell) for cell in list(row)[:column_count]]
        cells.extend([""] * (column_count - len(cells)))
        rows.append(cells)

    # Column width = longest entry in that column, header included.
    widths = [
        max(len(cells[index]) for cells in [headers, *rows])
        for index in range(column_count)
    ]

    row_format = "".join(f"{{:<{width + 4}}}" for width in widths)

    print(row_format.format(*headers))
    for cells in rows:
        print(row_format.format(*cells))


async def analyze_custom_documents(custom_model_id):
path_to_sample_documents = os.path.abspath(
Expand Down Expand Up @@ -65,31 +92,39 @@ async def analyze_custom_documents(custom_model_id):
print(
f"......found field of type '{field.type}' with value '{field_value}' and with confidence {field.confidence}"
)

# iterate over tables, lines, and selection marks on each page
for page in result.pages:
print(f"\nLines found on page {page.page_number}")
if page.lines:
for line in page.lines:
print(f"...Line '{line.content}'")
if page.words:
for word in page.words:
print(f"...Word '{word.content}' has a confidence of {word.confidence}")
if page.selection_marks:
print(f"\nSelection marks found on page {page.page_number}")
for selection_mark in page.selection_marks:
print(
f"...Selection mark is '{selection_mark.state}' and has a confidence of {selection_mark.confidence}"
)

if result.tables:
for i, table in enumerate(result.tables):
print(f"\nTable {i + 1} can be found on page:")
if table.bounding_regions:
for region in table.bounding_regions:
print(f"...{region.page_number}")
for cell in table.cells:
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")

# Extract table cell values
SYMBOL_OF_TABLE_TYPE = "array"
KEY_OF_VALUE_OBJECT = "valueObject"
KEY_OF_CELL_CONTENT = "content"

for doc in result.documents:
if not doc.fields is None:
for field_name, field_value in doc.fields.items():
# "MaintenanceLog" is the name of the table field that you labeled. Table cell information is stored as an array in the document field.
if (
field_name == "MaintenanceLog"
and field_value.type == SYMBOL_OF_TABLE_TYPE
and field_value.value_array
):
col_names = []
sample_obj = field_value.value_array[0]
if KEY_OF_VALUE_OBJECT in sample_obj:
col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys())
print("----Extracting Table Cell Values----")
table_rows = []
for obj in field_value.value_array:
if KEY_OF_VALUE_OBJECT in obj:
value_obj = obj[KEY_OF_VALUE_OBJECT]
extract_value_by_col_name = lambda key: (
value_obj[key].get(KEY_OF_CELL_CONTENT)
if key in value_obj
and KEY_OF_CELL_CONTENT in value_obj[key]
else "None"
)
row_data = list(map(extract_value_by_col_name, col_names))
table_rows.append(row_data)
print_table(col_names, table_rows)
print("-----------------------------------")
# [END analyze_custom_documents]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,11 @@ async def analyze_read():
for paragraph in result.paragraphs:
print(f"Found paragraph with role: '{paragraph.role}' within {paragraph.bounding_regions} bounding region")
print(f"...with content: '{paragraph.content}'")

result.paragraphs.sort(key=lambda p: (p.spans.sort(key=lambda s: s.offset), p.spans[0].offset))
print("-----Print sorted paragraphs-----")
for idx, paragraph in enumerate(result.paragraphs):
print(f"...paragraph:{idx} with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}")

print("----------------------------------------")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,33 @@

import os

def print_table(header_names, table_data):
    """Print column headers and rows as a left-aligned text table.

    Every header and cell is converted with ``str()`` before measuring and
    formatting, so values such as ``None`` or numbers are printed as text
    instead of raising a formatting error. Each column is as wide as its
    longest entry (header included) plus four spaces of padding.

    Args:
        header_names: A sequence of column header names,
            e.g. ["Name", "Gender", "Age"].
        table_data: A two-dimensional sequence holding the table rows,
            e.g. [["Mike", "M", 25], ["John", "M", 19], ["Lily", "F", 23]].
            A row shorter than ``header_names`` is padded with empty cells;
            cells beyond the last header are ignored.

    Returns:
        None. The table is written to standard output, e.g.

            Name    Gender    Age
            Mike    M         25
            John    M         19
            Lily    F         23
    """
    column_count = len(header_names)
    headers = [str(name) for name in header_names]

    # Normalize every row to exactly ``column_count`` string cells so that
    # width calculation and formatting can never fail on ragged rows or on
    # values (like None) that reject the "<" alignment format spec.
    rows = []
    for row in table_data:
        cells = [str(cell) for cell in list(row)[:column_count]]
        cells.extend([""] * (column_count - len(cells)))
        rows.append(cells)

    # Column width = longest entry in that column, header included.
    widths = [
        max(len(cells[index]) for cells in [headers, *rows])
        for index in range(column_count)
    ]

    row_format = "".join(f"{{:<{width + 4}}}" for width in widths)

    print(row_format.format(*headers))
    for cells in rows:
        print(row_format.format(*cells))


def analyze_custom_documents(custom_model_id):
path_to_sample_documents = os.path.abspath(
Expand Down Expand Up @@ -65,31 +92,40 @@ def analyze_custom_documents(custom_model_id):
f"......found field of type '{field.type}' with value '{field_value}' and with confidence {field.confidence}"
)

# iterate over tables, lines, and selection marks on each page
for page in result.pages:
print(f"\nLines found on page {page.page_number}")
if page.lines:
for line in page.lines:
print(f"...Line '{line.content}'")
if page.words:
for word in page.words:
print(f"...Word '{word.content}' has a confidence of {word.confidence}")
if page.selection_marks:
print(f"\nSelection marks found on page {page.page_number}")
for selection_mark in page.selection_marks:
print(
f"...Selection mark is '{selection_mark.state}' and has a confidence of {selection_mark.confidence}"
)

if result.tables:
for i, table in enumerate(result.tables):
print(f"\nTable {i + 1} can be found on page:")
if table.bounding_regions:
for region in table.bounding_regions:
print(f"...{region.page_number}")
for cell in table.cells:
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
print("-----------------------------------")
# Extract table cell values
SYMBOL_OF_TABLE_TYPE = "array"
KEY_OF_VALUE_OBJECT = "valueObject"
KEY_OF_CELL_CONTENT = "content"

for doc in result.documents:
if not doc.fields is None:
for field_name, field_value in doc.fields.items():
# "MaintenanceLog" is the name of the table field that you labeled. Table cell information is stored as an array in the document field.
if (
field_name == "MaintenanceLog"
and field_value.type == SYMBOL_OF_TABLE_TYPE
and field_value.value_array
):
col_names = []
sample_obj = field_value.value_array[0]
if KEY_OF_VALUE_OBJECT in sample_obj:
col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys())
print("----Extracting Table Cell Values----")
table_rows = []
for obj in field_value.value_array:
if KEY_OF_VALUE_OBJECT in obj:
value_obj = obj[KEY_OF_VALUE_OBJECT]
extract_value_by_col_name = lambda key: (
value_obj[key].get(KEY_OF_CELL_CONTENT)
if key in value_obj
and KEY_OF_CELL_CONTENT in value_obj[key]
else "None"
)
row_data = list(map(extract_value_by_col_name, col_names))
table_rows.append(row_data)
print_table(col_names, table_rows)

print("------------------------------------")
# [END analyze_custom_documents]


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,11 @@ def analyze_read():
for paragraph in result.paragraphs:
print(f"Found paragraph with role: '{paragraph.role}' within {paragraph.bounding_regions} bounding region")
print(f"...with content: '{paragraph.content}'")

result.paragraphs.sort(key=lambda p: (p.spans.sort(key=lambda s: s.offset), p.spans[0].offset))
print("-----Print sorted paragraphs-----")
for idx, paragraph in enumerate(result.paragraphs):
print(f"...paragraph:{idx} with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}")

print("----------------------------------------")

Expand Down

0 comments on commit 391fbdd

Please sign in to comment.