From d3e3248f97a1c78f49cc14763b57d99729a89439 Mon Sep 17 00:00:00 2001 From: TakaValley Date: Wed, 28 Feb 2024 18:11:15 +0800 Subject: [PATCH] add sample code about extracting table cell values --- .../azure-ai-documentintelligence/README.md | 60 +++++++------ .../sample_analyze_custom_documents_async.py | 85 ++++++++++++------ .../samples/aio/sample_analyze_read_async.py | 5 ++ .../sample_analyze_custom_documents.py | 86 +++++++++++++------ .../samples/sample_analyze_read.py | 5 ++ 5 files changed, 166 insertions(+), 75 deletions(-) diff --git a/sdk/documentintelligence/azure-ai-documentintelligence/README.md b/sdk/documentintelligence/azure-ai-documentintelligence/README.md index 15c5a1516669b..5614b9419b3de 100644 --- a/sdk/documentintelligence/azure-ai-documentintelligence/README.md +++ b/sdk/documentintelligence/azure-ai-documentintelligence/README.md @@ -408,6 +408,7 @@ For best results, you should only analyze documents of the same document type th from azure.core.credentials import AzureKeyCredential from azure.ai.documentintelligence import DocumentIntelligenceClient from azure.ai.documentintelligence.models import AnalyzeResult +from helper import utils endpoint = os.environ["DOCUMENTINTELLIGENCE_ENDPOINT"] key = os.environ["DOCUMENTINTELLIGENCE_API_KEY"] @@ -435,31 +436,40 @@ if result.documents: f"......found field of type '{field.type}' with value '{field_value}' and with confidence {field.confidence}" ) -# iterate over tables, lines, and selection marks on each page -for page in result.pages: - print(f"\nLines found on page {page.page_number}") - if page.lines: - for line in page.lines: - print(f"...Line '{line.content}'") - if page.words: - for word in page.words: - print(f"...Word '{word.content}' has a confidence of {word.confidence}") - if page.selection_marks: - print(f"\nSelection marks found on page {page.page_number}") - for selection_mark in page.selection_marks: - print( - f"...Selection mark is '{selection_mark.state}' and has a confidence 
of {selection_mark.confidence}" - ) - -if result.tables: - for i, table in enumerate(result.tables): - print(f"\nTable {i + 1} can be found on page:") - if table.bounding_regions: - for region in table.bounding_regions: - print(f"...{region.page_number}") - for cell in table.cells: - print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'") -print("-----------------------------------") + # Extract table cell values + SYMBOL_OF_TABLE_TYPE = "array" + KEY_OF_VALUE_OBJECT = "valueObject" + KEY_OF_CELL_CONTENT = "content" + + for doc in result.documents: + if not doc.fields is None: + for field_name, field_value in doc.fields.items(): + # "MaintenanceLog" is the table field name which you labeled. Table cell information is stored as an array in the document field. + if ( + field_name == "MaintenanceLog" + and field_value.type == SYMBOL_OF_TABLE_TYPE + and field_value.value_array + ): + col_names = [] + sample_obj = field_value.value_array[0] + if KEY_OF_VALUE_OBJECT in sample_obj: + col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys()) + print("----Extracting Table Cell Values----") + table_rows = [] + for obj in field_value.value_array: + if KEY_OF_VALUE_OBJECT in obj: + value_obj = obj[KEY_OF_VALUE_OBJECT] + extract_value_by_col_name = lambda key: ( + value_obj[key].get(KEY_OF_CELL_CONTENT) + if key in value_obj + and KEY_OF_CELL_CONTENT in value_obj[key] + else "None" + ) + row_data = list(map(extract_value_by_col_name, col_names)) + table_rows.append(row_data) + utils.print_table(col_names, table_rows) + +print("------------------------------------") ``` diff --git a/sdk/documentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_custom_documents_async.py b/sdk/documentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_custom_documents_async.py index 7ae8d9f69950e..c394ee1dfe45a 100644 --- a/sdk/documentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_custom_documents_async.py +++ 
b/sdk/documentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_custom_documents_async.py @@ -30,6 +30,33 @@ import os import asyncio +def print_table(header_names, table_data): + """Print a two-dimensional array like a table. + + Based on the provided column header names and two-dimensional array data, print the strings like a table. + + Args: + header_names: An array of strings; these are the column header names. e.g. ["name", "gender", "age"] + table_data: A two-dimensional array; this is the table data. e.g. [["Mike", "M", 25], ["John", "M", 19], ["Lily", "F", 23]] + Return: None + It will print the strings like a table in the output window. e.g. + Name Gender Age + Mike M 25 + John M 19 + Lily F 23 + """ + max_len_list = [] + for i in range(len(header_names)): + col_values = list(map(lambda row: len(str(row[i])), table_data)) + col_values.append(len(str(header_names[i]))) + max_len_list.append(max(col_values)) + + row_format_str = "".join(map(lambda len: f"{{:<{len + 4}}}", max_len_list)) + + print(row_format_str.format(*header_names)) + for row in table_data: + print(row_format_str.format(*row)) + async def analyze_custom_documents(custom_model_id): path_to_sample_documents = os.path.abspath( @@ -65,31 +92,39 @@ async def analyze_custom_documents(custom_model_id): print( f"......found field of type '{field.type}' with value '{field_value}' and with confidence {field.confidence}" ) - - # iterate over tables, lines, and selection marks on each page - for page in result.pages: - print(f"\nLines found on page {page.page_number}") - if page.lines: - for line in page.lines: - print(f"...Line '{line.content}'") - if page.words: - for word in page.words: - print(f"...Word '{word.content}' has a confidence of {word.confidence}") - if page.selection_marks: - print(f"\nSelection marks found on page {page.page_number}") - for selection_mark in page.selection_marks: - print( - f"...Selection mark is '{selection_mark.state}' and has a confidence of 
{selection_mark.confidence}" - ) - - if result.tables: - for i, table in enumerate(result.tables): - print(f"\nTable {i + 1} can be found on page:") - if table.bounding_regions: - for region in table.bounding_regions: - print(f"...{region.page_number}") - for cell in table.cells: - print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'") + + # Extract table cell values + SYMBOL_OF_TABLE_TYPE = "array" + KEY_OF_VALUE_OBJECT = "valueObject" + KEY_OF_CELL_CONTENT = "content" + + for doc in result.documents: + if not doc.fields is None: + for field_name, field_value in doc.fields.items(): + # "MaintenanceLog" is the table field name which you labeled. Table cell information is stored as an array in the document field. + if ( + field_name == "MaintenanceLog" + and field_value.type == SYMBOL_OF_TABLE_TYPE + and field_value.value_array + ): + col_names = [] + sample_obj = field_value.value_array[0] + if KEY_OF_VALUE_OBJECT in sample_obj: + col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys()) + print("----Extracting Table Cell Values----") + table_rows = [] + for obj in field_value.value_array: + if KEY_OF_VALUE_OBJECT in obj: + value_obj = obj[KEY_OF_VALUE_OBJECT] + extract_value_by_col_name = lambda key: ( + value_obj[key].get(KEY_OF_CELL_CONTENT) + if key in value_obj + and KEY_OF_CELL_CONTENT in value_obj[key] + else "None" + ) + row_data = list(map(extract_value_by_col_name, col_names)) + table_rows.append(row_data) + print_table(col_names, table_rows) print("-----------------------------------") # [END analyze_custom_documents] diff --git a/sdk/documentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_read_async.py b/sdk/documentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_read_async.py index ad57230383bfc..c2af2af54e113 100644 --- a/sdk/documentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_read_async.py +++ 
b/sdk/documentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_read_async.py @@ -109,6 +109,11 @@ async def analyze_read(): for paragraph in result.paragraphs: print(f"Found paragraph with role: '{paragraph.role}' within {paragraph.bounding_regions} bounding region") print(f"...with content: '{paragraph.content}'") + + result.paragraphs.sort(key=lambda p: (p.spans.sort(key=lambda s: s.offset), p.spans[0].offset)) + print("-----Print sorted paragraphs-----") + for idx, paragraph in enumerate(result.paragraphs): + print(f"...paragraph:{idx} with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}") print("----------------------------------------") diff --git a/sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_analyze_custom_documents.py b/sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_analyze_custom_documents.py index 0874a663b0978..19c21c42400ff 100644 --- a/sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_analyze_custom_documents.py +++ b/sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_analyze_custom_documents.py @@ -29,6 +29,33 @@ import os +def print_table(header_names, table_data): + """Print a two-dimensional array like a table. + + Based on the provided column header names and two-dimensional array data, print the strings like a table. + + Args: + header_names: An array of strings; these are the column header names. e.g. ["name", "gender", "age"] + table_data: A two-dimensional array; this is the table data. e.g. [["Mike", "M", 25], ["John", "M", 19], ["Lily", "F", 23]] + Return: None + It will print the strings like a table in the output window. e.g. 
+ Name Gender Age + Mike M 25 + John M 19 + Lily F 23 + """ + max_len_list = [] + for i in range(len(header_names)): + col_values = list(map(lambda row: len(str(row[i])), table_data)) + col_values.append(len(str(header_names[i]))) + max_len_list.append(max(col_values)) + + row_format_str = "".join(map(lambda len: f"{{:<{len + 4}}}", max_len_list)) + + print(row_format_str.format(*header_names)) + for row in table_data: + print(row_format_str.format(*row)) + def analyze_custom_documents(custom_model_id): path_to_sample_documents = os.path.abspath( @@ -65,31 +92,40 @@ def analyze_custom_documents(custom_model_id): f"......found field of type '{field.type}' with value '{field_value}' and with confidence {field.confidence}" ) - # iterate over tables, lines, and selection marks on each page - for page in result.pages: - print(f"\nLines found on page {page.page_number}") - if page.lines: - for line in page.lines: - print(f"...Line '{line.content}'") - if page.words: - for word in page.words: - print(f"...Word '{word.content}' has a confidence of {word.confidence}") - if page.selection_marks: - print(f"\nSelection marks found on page {page.page_number}") - for selection_mark in page.selection_marks: - print( - f"...Selection mark is '{selection_mark.state}' and has a confidence of {selection_mark.confidence}" - ) - - if result.tables: - for i, table in enumerate(result.tables): - print(f"\nTable {i + 1} can be found on page:") - if table.bounding_regions: - for region in table.bounding_regions: - print(f"...{region.page_number}") - for cell in table.cells: - print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'") - print("-----------------------------------") + # Extract table cell values + SYMBOL_OF_TABLE_TYPE = "array" + KEY_OF_VALUE_OBJECT = "valueObject" + KEY_OF_CELL_CONTENT = "content" + + for doc in result.documents: + if not doc.fields is None: + for field_name, field_value in doc.fields.items(): + # "MaintenanceLog" is the table field 
name which you labeled. Table cell information is stored as an array in the document field. + if ( + field_name == "MaintenanceLog" + and field_value.type == SYMBOL_OF_TABLE_TYPE + and field_value.value_array + ): + col_names = [] + sample_obj = field_value.value_array[0] + if KEY_OF_VALUE_OBJECT in sample_obj: + col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys()) + print("----Extracting Table Cell Values----") + table_rows = [] + for obj in field_value.value_array: + if KEY_OF_VALUE_OBJECT in obj: + value_obj = obj[KEY_OF_VALUE_OBJECT] + extract_value_by_col_name = lambda key: ( + value_obj[key].get(KEY_OF_CELL_CONTENT) + if key in value_obj + and KEY_OF_CELL_CONTENT in value_obj[key] + else "None" + ) + row_data = list(map(extract_value_by_col_name, col_names)) + table_rows.append(row_data) + print_table(col_names, table_rows) + + print("------------------------------------") # [END analyze_custom_documents] diff --git a/sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_analyze_read.py b/sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_analyze_read.py index 2092217223110..8a71549348501 100644 --- a/sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_analyze_read.py +++ b/sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_analyze_read.py @@ -106,6 +106,11 @@ def analyze_read(): for paragraph in result.paragraphs: print(f"Found paragraph with role: '{paragraph.role}' within {paragraph.bounding_regions} bounding region") print(f"...with content: '{paragraph.content}'") + + result.paragraphs.sort(key=lambda p: (p.spans.sort(key=lambda s: s.offset), p.spans[0].offset)) + print("-----Print sorted paragraphs-----") + for idx, paragraph in enumerate(result.paragraphs): + print(f"...paragraph:{idx} with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}") print("----------------------------------------")