From a4e99faee91c66decee1e2c75627b385795ad724 Mon Sep 17 00:00:00 2001 From: TakaValley Date: Wed, 28 Feb 2024 18:11:15 +0800 Subject: [PATCH] add sample code about extracting table cell values --- .../azure-ai-documentintelligence/README.md | 57 +++++++++------- .../sample_analyze_custom_documents_async.py | 66 +++++++++++-------- .../samples/aio/sample_analyze_read_async.py | 5 ++ .../samples/helper/__init__.py | 0 .../samples/helper/utils.py | 41 ++++++++++++ .../sample_analyze_custom_documents.py | 60 ++++++++++------- .../samples/sample_analyze_read.py | 5 ++ 7 files changed, 158 insertions(+), 76 deletions(-) create mode 100644 sdk/documentintelligence/azure-ai-documentintelligence/samples/helper/__init__.py create mode 100644 sdk/documentintelligence/azure-ai-documentintelligence/samples/helper/utils.py diff --git a/sdk/documentintelligence/azure-ai-documentintelligence/README.md b/sdk/documentintelligence/azure-ai-documentintelligence/README.md index 15c5a1516669b..81f3974cc0b0b 100644 --- a/sdk/documentintelligence/azure-ai-documentintelligence/README.md +++ b/sdk/documentintelligence/azure-ai-documentintelligence/README.md @@ -408,6 +408,7 @@ For best results, you should only analyze documents of the same document type th from azure.core.credentials import AzureKeyCredential from azure.ai.documentintelligence import DocumentIntelligenceClient from azure.ai.documentintelligence.models import AnalyzeResult +from helper import utils endpoint = os.environ["DOCUMENTINTELLIGENCE_ENDPOINT"] key = os.environ["DOCUMENTINTELLIGENCE_API_KEY"] @@ -435,31 +436,39 @@ if result.documents: f"......found field of type '{field.type}' with value '{field_value}' and with confidence {field.confidence}" ) -# iterate over tables, lines, and selection marks on each page -for page in result.pages: - print(f"\nLines found on page {page.page_number}") - if page.lines: - for line in page.lines: - print(f"...Line '{line.content}'") - if page.words: - for word in page.words: - print(f"...Word '{word.content}' has a confidence of {word.confidence}") - if page.selection_marks: - print(f"\nSelection marks found on page {page.page_number}") - for selection_mark in page.selection_marks: - print( - f"...Selection mark is '{selection_mark.state}' and has a confidence of {selection_mark.confidence}" - ) + # Extract table cell values + SYMBOL_OF_TABLE_TYPE = "array" + KEY_OF_VALUE_OBJECT = "valueObject" + KEY_OF_CELL_CONTENT = "content" + + for doc in result.documents: + for field_name, field_value in doc.fields.items(): + # "MaintenanceLog" is the table field name which you labeled. Table cell information store as array in document field. + if ( + field_name == "MaintenanceLog" + and field_value.type == SYMBOL_OF_TABLE_TYPE + and field_value.value_array + ): + col_names = [] + sample_obj = field_value.value_array[0] + if KEY_OF_VALUE_OBJECT in sample_obj: + col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys()) + print("----Extracting Table Cell Values----") + table_rows = [] + for obj in field_value.value_array: + if KEY_OF_VALUE_OBJECT in obj: + value_obj = obj[KEY_OF_VALUE_OBJECT] + extract_value_by_col_name = lambda key: ( + value_obj[key].get(KEY_OF_CELL_CONTENT) + if key in value_obj + and KEY_OF_CELL_CONTENT in value_obj[key] + else "None" + ) + row_data = list(map(extract_value_by_col_name, col_names)) + table_rows.append(row_data) + utils.print_table(col_names, table_rows) -if result.tables: - for i, table in enumerate(result.tables): - print(f"\nTable {i + 1} can be found on page:") - if table.bounding_regions: - for region in table.bounding_regions: - print(f"...{region.page_number}") - for cell in table.cells: - print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'") -print("-----------------------------------") +print("------------------------------------") ``` diff --git a/sdk/documentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_custom_documents_async.py b/sdk/documentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_custom_documents_async.py index 7ae8d9f69950e..327004db2ecf8 100644 --- a/sdk/documentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_custom_documents_async.py +++ b/sdk/documentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_custom_documents_async.py @@ -29,16 +29,20 @@ import os import asyncio - +import sys async def analyze_custom_documents(custom_model_id): + path_of_parents = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") path_to_sample_documents = os.path.abspath( - os.path.join(os.path.abspath(__file__), "..", "..", "./sample_forms/forms/Form_1.jpg") + os.path.join(path_of_parents, "./sample_forms/forms/Form_1.jpg") ) + sys.path.append(path_of_parents) + # [START analyze_custom_documents] from azure.core.credentials import AzureKeyCredential from azure.ai.documentintelligence.aio import DocumentIntelligenceClient from azure.ai.documentintelligence.models import AnalyzeResult + from helper import utils endpoint = os.environ["DOCUMENTINTELLIGENCE_ENDPOINT"] key = os.environ["DOCUMENTINTELLIGENCE_API_KEY"] @@ -65,31 +69,39 @@ async def analyze_custom_documents(custom_model_id): print( f"......found field of type '{field.type}' with value '{field_value}' and with confidence {field.confidence}" ) - - # iterate over tables, lines, and selection marks on each page - for page in result.pages: - print(f"\nLines found on page {page.page_number}") - if page.lines: - for line in page.lines: - print(f"...Line '{line.content}'") - if page.words: - for word in page.words: - print(f"...Word '{word.content}' has a confidence of {word.confidence}") - if page.selection_marks: - print(f"\nSelection marks found on page {page.page_number}") - for selection_mark in page.selection_marks: - print( - f"...Selection mark is '{selection_mark.state}' and has a confidence of {selection_mark.confidence}" - ) - - if result.tables: - for i, table in enumerate(result.tables): - print(f"\nTable {i + 1} can be found on page:") - if table.bounding_regions: - for region in table.bounding_regions: - print(f"...{region.page_number}") - for cell in table.cells: - print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'") + + # Extract table cell values + SYMBOL_OF_TABLE_TYPE = "array" + KEY_OF_VALUE_OBJECT = "valueObject" + KEY_OF_CELL_CONTENT = "content" + + for doc in result.documents: + if not doc.fields is None: + for field_name, field_value in doc.fields.items(): + # "MaintenanceLog" is the table field name which you labeled. Table cell information store as array in document field. + if ( + field_name == "MaintenanceLog" + and field_value.type == SYMBOL_OF_TABLE_TYPE + and field_value.value_array + ): + col_names = [] + sample_obj = field_value.value_array[0] + if KEY_OF_VALUE_OBJECT in sample_obj: + col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys()) + print("----Extracting Table Cell Values----") + table_rows = [] + for obj in field_value.value_array: + if KEY_OF_VALUE_OBJECT in obj: + value_obj = obj[KEY_OF_VALUE_OBJECT] + extract_value_by_col_name = lambda key: ( + value_obj[key].get(KEY_OF_CELL_CONTENT) + if key in value_obj + and KEY_OF_CELL_CONTENT in value_obj[key] + else "None" + ) + row_data = list(map(extract_value_by_col_name, col_names)) + table_rows.append(row_data) + utils.print_table(col_names, table_rows) print("-----------------------------------") # [END analyze_custom_documents] diff --git a/sdk/documentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_read_async.py b/sdk/documentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_read_async.py index ad57230383bfc..c2af2af54e113 100644 --- a/sdk/documentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_read_async.py +++ b/sdk/documentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_read_async.py @@ -109,6 +109,11 @@ async def analyze_read(): for paragraph in result.paragraphs: print(f"Found paragraph with role: '{paragraph.role}' within {paragraph.bounding_regions} bounding region") print(f"...with content: '{paragraph.content}'") + + result.paragraphs.sort(key=lambda p: (p.spans.sort(key=lambda s: s.offset), p.spans[0].offset)) + print("-----Print sorted paragraphs-----") + for idx, paragraph in enumerate(result.paragraphs): + print(f"...paragraph:{idx} with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}") print("----------------------------------------") diff --git a/sdk/documentintelligence/azure-ai-documentintelligence/samples/helper/__init__.py b/sdk/documentintelligence/azure-ai-documentintelligence/samples/helper/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sdk/documentintelligence/azure-ai-documentintelligence/samples/helper/utils.py b/sdk/documentintelligence/azure-ai-documentintelligence/samples/helper/utils.py new file mode 100644 index 0000000000000..f863ef8d5adaa --- /dev/null +++ b/sdk/documentintelligence/azure-ai-documentintelligence/samples/helper/utils.py @@ -0,0 +1,41 @@ +# coding: utf-8 + +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- + +""" +FILE: utils.py + +DESCRIPTION: + These util functions provide an intuitionistic way to organize data. Make sample code more concise. +""" + +def print_table(header_names, table_data): + """Print a two-dimensional array like a table. + + Based on provided column header names and two two-dimensional array data, print the strings like table. + + Args: + header_names: An array of string, it's the column header names. e.g. ["name", "gender", "age"] + table_data: A two-dimensional array, they're the table data. e.g. [["Mike", "M", 25], ["John", "M", 19], ["Lily", "F", 23]] + Return: None + It's will print the string like table in output window. e.g. + Name Gender Age + Mike M 25 + John M 19 + Lily F 23 + """ + max_len_list = [] + for i in range(len(header_names)): + col_values = list(map(lambda row: len(str(row[i])), table_data)) + col_values.append(len(str(header_names[i]))) + max_len_list.append(max(col_values)) + + row_format_str = "".join(map(lambda len: f"{{:<{len + 4}}}", max_len_list)) + + print(row_format_str.format(*header_names)) + for row in table_data: + print(row_format_str.format(*row)) diff --git a/sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_analyze_custom_documents.py b/sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_analyze_custom_documents.py index 0874a663b0978..0982849e0cd09 100644 --- a/sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_analyze_custom_documents.py +++ b/sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_analyze_custom_documents.py @@ -38,6 +38,7 @@ def analyze_custom_documents(custom_model_id): from azure.core.credentials import AzureKeyCredential from azure.ai.documentintelligence import DocumentIntelligenceClient from azure.ai.documentintelligence.models import AnalyzeResult + from helper import utils endpoint = os.environ["DOCUMENTINTELLIGENCE_ENDPOINT"] key = os.environ["DOCUMENTINTELLIGENCE_API_KEY"] @@ -65,31 +66,40 @@ def analyze_custom_documents(custom_model_id): f"......found field of type '{field.type}' with value '{field_value}' and with confidence {field.confidence}" ) - # iterate over tables, lines, and selection marks on each page - for page in result.pages: - print(f"\nLines found on page {page.page_number}") - if page.lines: - for line in page.lines: - print(f"...Line '{line.content}'") - if page.words: - for word in page.words: - print(f"...Word '{word.content}' has a confidence of {word.confidence}") - if page.selection_marks: - print(f"\nSelection marks found on page {page.page_number}") - for selection_mark in page.selection_marks: - print( - f"...Selection mark is '{selection_mark.state}' and has a confidence of {selection_mark.confidence}" - ) - - if result.tables: - for i, table in enumerate(result.tables): - print(f"\nTable {i + 1} can be found on page:") - if table.bounding_regions: - for region in table.bounding_regions: - print(f"...{region.page_number}") - for cell in table.cells: - print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'") - print("-----------------------------------") + # Extract table cell values + SYMBOL_OF_TABLE_TYPE = "array" + KEY_OF_VALUE_OBJECT = "valueObject" + KEY_OF_CELL_CONTENT = "content" + + for doc in result.documents: + if not doc.fields is None: + for field_name, field_value in doc.fields.items(): + # "MaintenanceLog" is the table field name which you labeled. Table cell information store as array in document field. + if ( + field_name == "MaintenanceLog" + and field_value.type == SYMBOL_OF_TABLE_TYPE + and field_value.value_array + ): + col_names = [] + sample_obj = field_value.value_array[0] + if KEY_OF_VALUE_OBJECT in sample_obj: + col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys()) + print("----Extracting Table Cell Values----") + table_rows = [] + for obj in field_value.value_array: + if KEY_OF_VALUE_OBJECT in obj: + value_obj = obj[KEY_OF_VALUE_OBJECT] + extract_value_by_col_name = lambda key: ( + value_obj[key].get(KEY_OF_CELL_CONTENT) + if key in value_obj + and KEY_OF_CELL_CONTENT in value_obj[key] + else "None" + ) + row_data = list(map(extract_value_by_col_name, col_names)) + table_rows.append(row_data) + utils.print_table(col_names, table_rows) + + print("------------------------------------") # [END analyze_custom_documents] diff --git a/sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_analyze_read.py b/sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_analyze_read.py index 2092217223110..8a71549348501 100644 --- a/sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_analyze_read.py +++ b/sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_analyze_read.py @@ -106,6 +106,11 @@ def analyze_read(): for paragraph in result.paragraphs: print(f"Found paragraph with role: '{paragraph.role}' within {paragraph.bounding_regions} bounding region") print(f"...with content: '{paragraph.content}'") + + result.paragraphs.sort(key=lambda p: (p.spans.sort(key=lambda s: s.offset), p.spans[0].offset)) + print("-----Print sorted paragraphs-----") + for idx, paragraph in enumerate(result.paragraphs): + print(f"...paragraph:{idx} with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}") print("----------------------------------------")