add sample code about extracting table cell values

Azure · Mar 8, 2024 · a4e99fa · a4e99fa
1 parent 56c346a
commit a4e99fa
Show file tree

Hide file tree

Showing 7 changed files with 158 additions and 76 deletions.
diff --git a/sdk/documentintelligence/azure-ai-documentintelligence/README.md b/sdk/documentintelligence/azure-ai-documentintelligence/README.md
@@ -408,6 +408,7 @@ For best results, you should only analyze documents of the same document type th
 from azure.core.credentials import AzureKeyCredential
 from azure.ai.documentintelligence import DocumentIntelligenceClient
 from azure.ai.documentintelligence.models import AnalyzeResult
+from helper import utils
 
 endpoint = os.environ["DOCUMENTINTELLIGENCE_ENDPOINT"]
 key = os.environ["DOCUMENTINTELLIGENCE_API_KEY"]
@@ -435,31 +436,39 @@ if result.documents:
                     f"......found field of type '{field.type}' with value '{field_value}' and with confidence {field.confidence}"
                 )
 
-# iterate over tables, lines, and selection marks on each page
-for page in result.pages:
-    print(f"\nLines found on page {page.page_number}")
-    if page.lines:
-        for line in page.lines:
-            print(f"...Line '{line.content}'")
-    if page.words:
-        for word in page.words:
-            print(f"...Word '{word.content}' has a confidence of {word.confidence}")
-    if page.selection_marks:
-        print(f"\nSelection marks found on page {page.page_number}")
-        for selection_mark in page.selection_marks:
-            print(
-                f"...Selection mark is '{selection_mark.state}' and has a confidence of {selection_mark.confidence}"
-            )
+    # Extract table cell values
+    SYMBOL_OF_TABLE_TYPE = "array"
+    KEY_OF_VALUE_OBJECT = "valueObject"
+    KEY_OF_CELL_CONTENT = "content"
+
+    for doc in result.documents:
+        for field_name, field_value in doc.fields.items():
+            # "MaintenanceLog" is the name of the table field that you labeled. Table cell information is stored as an array in the document field.
+            if (
+                field_name == "MaintenanceLog"
+                and field_value.type == SYMBOL_OF_TABLE_TYPE
+                and field_value.value_array
+            ):
+                col_names = []
+                sample_obj = field_value.value_array[0]
+                if KEY_OF_VALUE_OBJECT in sample_obj:
+                    col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys())
+                print("----Extracting Table Cell Values----")
+                table_rows = []
+                for obj in field_value.value_array:
+                    if KEY_OF_VALUE_OBJECT in obj:
+                        value_obj = obj[KEY_OF_VALUE_OBJECT]
+                        extract_value_by_col_name = lambda key: (
+                            value_obj[key].get(KEY_OF_CELL_CONTENT)
+                            if key in value_obj
+                            and KEY_OF_CELL_CONTENT in value_obj[key]
+                            else "None"
+                        )
+                        row_data = list(map(extract_value_by_col_name, col_names))
+                        table_rows.append(row_data)
+                utils.print_table(col_names, table_rows)
 
-if result.tables:
-    for i, table in enumerate(result.tables):
-        print(f"\nTable {i + 1} can be found on page:")
-        if table.bounding_regions:
-            for region in table.bounding_regions:
-                print(f"...{region.page_number}")
-        for cell in table.cells:
-            print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
-print("-----------------------------------")
+print("------------------------------------")
 ```
 
 <!-- END SNIPPET -->

diff --git a/...igence/azure-ai-documentintelligence/samples/aio/sample_analyze_custom_documents_async.py b/...igence/azure-ai-documentintelligence/samples/aio/sample_analyze_custom_documents_async.py
@@ -29,16 +29,20 @@
 
 import os
 import asyncio
-
+import sys
 
 async def analyze_custom_documents(custom_model_id):
+    path_of_parents = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")
     path_to_sample_documents = os.path.abspath(
-        os.path.join(os.path.abspath(__file__), "..", "..", "./sample_forms/forms/Form_1.jpg")
+        os.path.join(path_of_parents, "./sample_forms/forms/Form_1.jpg")
     )
+    sys.path.append(path_of_parents)
+
     # [START analyze_custom_documents]
     from azure.core.credentials import AzureKeyCredential
     from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
     from azure.ai.documentintelligence.models import AnalyzeResult
+    from helper import utils
 
     endpoint = os.environ["DOCUMENTINTELLIGENCE_ENDPOINT"]
     key = os.environ["DOCUMENTINTELLIGENCE_API_KEY"]
@@ -65,31 +69,39 @@ async def analyze_custom_documents(custom_model_id):
                     print(
                         f"......found field of type '{field.type}' with value '{field_value}' and with confidence {field.confidence}"
                     )
-
-    # iterate over tables, lines, and selection marks on each page
-    for page in result.pages:
-        print(f"\nLines found on page {page.page_number}")
-        if page.lines:
-            for line in page.lines:
-                print(f"...Line '{line.content}'")
-        if page.words:
-            for word in page.words:
-                print(f"...Word '{word.content}' has a confidence of {word.confidence}")
-        if page.selection_marks:
-            print(f"\nSelection marks found on page {page.page_number}")
-            for selection_mark in page.selection_marks:
-                print(
-                    f"...Selection mark is '{selection_mark.state}' and has a confidence of {selection_mark.confidence}"
-                )
-
-    if result.tables:
-        for i, table in enumerate(result.tables):
-            print(f"\nTable {i + 1} can be found on page:")
-            if table.bounding_regions:
-                for region in table.bounding_regions:
-                    print(f"...{region.page_number}")
-            for cell in table.cells:
-                print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
+
+        # Extract table cell values
+        SYMBOL_OF_TABLE_TYPE = "array"
+        KEY_OF_VALUE_OBJECT = "valueObject"
+        KEY_OF_CELL_CONTENT = "content"
+
+        for doc in result.documents:
+            if not doc.fields is None:
+                for field_name, field_value in doc.fields.items():
+                    # "MaintenanceLog" is the name of the table field that you labeled. Table cell information is stored as an array in the document field.
+                    if (
+                        field_name == "MaintenanceLog"
+                        and field_value.type == SYMBOL_OF_TABLE_TYPE
+                        and field_value.value_array
+                    ):
+                        col_names = []
+                        sample_obj = field_value.value_array[0]
+                        if KEY_OF_VALUE_OBJECT in sample_obj:
+                            col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys())
+                        print("----Extracting Table Cell Values----")
+                        table_rows = []
+                        for obj in field_value.value_array:
+                            if KEY_OF_VALUE_OBJECT in obj:
+                                value_obj = obj[KEY_OF_VALUE_OBJECT]
+                                extract_value_by_col_name = lambda key: (
+                                    value_obj[key].get(KEY_OF_CELL_CONTENT)
+                                    if key in value_obj
+                                    and KEY_OF_CELL_CONTENT in value_obj[key]
+                                    else "None"
+                                )
+                                row_data = list(map(extract_value_by_col_name, col_names))
+                                table_rows.append(row_data)
+                        utils.print_table(col_names, table_rows)
     print("-----------------------------------")
     # [END analyze_custom_documents]
 

diff --git a/...cumentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_read_async.py b/...cumentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_read_async.py
@@ -109,6 +109,11 @@ async def analyze_read():
         for paragraph in result.paragraphs:
             print(f"Found paragraph with role: '{paragraph.role}' within {paragraph.bounding_regions} bounding region")
             print(f"...with content: '{paragraph.content}'")
+
+        result.paragraphs.sort(key=lambda p: (p.spans.sort(key=lambda s: s.offset), p.spans[0].offset))
+        print("-----Print sorted paragraphs-----")
+        for idx, paragraph in enumerate(result.paragraphs):
+            print(f"...paragraph:{idx} with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}")
 
     print("----------------------------------------")
 

diff --git a/sdk/documentintelligence/azure-ai-documentintelligence/samples/helper/__init__.py b/sdk/documentintelligence/azure-ai-documentintelligence/samples/helper/__init__.py
diff --git a/sdk/documentintelligence/azure-ai-documentintelligence/samples/helper/utils.py b/sdk/documentintelligence/azure-ai-documentintelligence/samples/helper/utils.py
@@ -0,0 +1,41 @@
+# coding: utf-8
+
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for
+# license information.
+# --------------------------------------------------------------------------
+
+"""
+FILE: utils.py
+
+DESCRIPTION:
+    These utility functions provide an intuitive way to organize data and make the sample code more concise.
+"""
+
+def print_table(header_names, table_data):
+    """Print a two-dimensional array like a table.
+
+    Based on the provided column header names and a two-dimensional array of data, print the values formatted as a table.
+
+    Args:
+        header_names: A list of strings giving the column header names.  e.g. ["Name", "Gender", "Age"]
+        table_data: A two-dimensional list holding the table rows.  e.g. [["Mike", "M", 25], ["John", "M", 19], ["Lily", "F", 23]]
+    Returns:
+        None. The table is printed to standard output, e.g.
+         Name    Gender    Age
+         Mike    M         25
+         John    M         19     
+         Lily    F         23
+    """
+    max_len_list = []
+    for i in range(len(header_names)):
+        col_values = list(map(lambda row: len(str(row[i])), table_data))
+        col_values.append(len(str(header_names[i])))
+        max_len_list.append(max(col_values))
+
+    row_format_str = "".join(map(lambda len: f"{{:<{len + 4}}}", max_len_list))
+
+    print(row_format_str.format(*header_names))
+    for row in table_data:
+        print(row_format_str.format(*row))
diff --git a/...mentintelligence/azure-ai-documentintelligence/samples/sample_analyze_custom_documents.py b/...mentintelligence/azure-ai-documentintelligence/samples/sample_analyze_custom_documents.py
@@ -38,6 +38,7 @@ def analyze_custom_documents(custom_model_id):
     from azure.core.credentials import AzureKeyCredential
     from azure.ai.documentintelligence import DocumentIntelligenceClient
     from azure.ai.documentintelligence.models import AnalyzeResult
+    from helper import utils
 
     endpoint = os.environ["DOCUMENTINTELLIGENCE_ENDPOINT"]
     key = os.environ["DOCUMENTINTELLIGENCE_API_KEY"]
@@ -65,31 +66,40 @@ def analyze_custom_documents(custom_model_id):
                         f"......found field of type '{field.type}' with value '{field_value}' and with confidence {field.confidence}"
                     )
 
-    # iterate over tables, lines, and selection marks on each page
-    for page in result.pages:
-        print(f"\nLines found on page {page.page_number}")
-        if page.lines:
-            for line in page.lines:
-                print(f"...Line '{line.content}'")
-        if page.words:
-            for word in page.words:
-                print(f"...Word '{word.content}' has a confidence of {word.confidence}")
-        if page.selection_marks:
-            print(f"\nSelection marks found on page {page.page_number}")
-            for selection_mark in page.selection_marks:
-                print(
-                    f"...Selection mark is '{selection_mark.state}' and has a confidence of {selection_mark.confidence}"
-                )
-
-    if result.tables:
-        for i, table in enumerate(result.tables):
-            print(f"\nTable {i + 1} can be found on page:")
-            if table.bounding_regions:
-                for region in table.bounding_regions:
-                    print(f"...{region.page_number}")
-            for cell in table.cells:
-                print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
-    print("-----------------------------------")
+        # Extract table cell values
+        SYMBOL_OF_TABLE_TYPE = "array"
+        KEY_OF_VALUE_OBJECT = "valueObject"
+        KEY_OF_CELL_CONTENT = "content"
+
+        for doc in result.documents:
+            if not doc.fields is None:
+                for field_name, field_value in doc.fields.items():
+                    # "MaintenanceLog" is the name of the table field that you labeled. Table cell information is stored as an array in the document field.
+                    if (
+                        field_name == "MaintenanceLog"
+                        and field_value.type == SYMBOL_OF_TABLE_TYPE
+                        and field_value.value_array
+                    ):
+                        col_names = []
+                        sample_obj = field_value.value_array[0]
+                        if KEY_OF_VALUE_OBJECT in sample_obj:
+                            col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys())
+                        print("----Extracting Table Cell Values----")
+                        table_rows = []
+                        for obj in field_value.value_array:
+                            if KEY_OF_VALUE_OBJECT in obj:
+                                value_obj = obj[KEY_OF_VALUE_OBJECT]
+                                extract_value_by_col_name = lambda key: (
+                                    value_obj[key].get(KEY_OF_CELL_CONTENT)
+                                    if key in value_obj
+                                    and KEY_OF_CELL_CONTENT in value_obj[key]
+                                    else "None"
+                                )
+                                row_data = list(map(extract_value_by_col_name, col_names))
+                                table_rows.append(row_data)
+                        utils.print_table(col_names, table_rows)
+
+    print("------------------------------------")
     # [END analyze_custom_documents]
 
 

diff --git a/sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_analyze_read.py b/sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_analyze_read.py
@@ -106,6 +106,11 @@ def analyze_read():
         for paragraph in result.paragraphs:
             print(f"Found paragraph with role: '{paragraph.role}' within {paragraph.bounding_regions} bounding region")
             print(f"...with content: '{paragraph.content}'")
+
+        result.paragraphs.sort(key=lambda p: (p.spans.sort(key=lambda s: s.offset), p.spans[0].offset))
+        print("-----Print sorted paragraphs-----")
+        for idx, paragraph in enumerate(result.paragraphs):
+            print(f"...paragraph:{idx} with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}")
 
     print("----------------------------------------")