add sample code about extracting table cell values

Azure · Mar 21, 2024 · d3e3248 · d3e3248
1 parent 56c346a
commit d3e3248
Show file tree

Hide file tree

Showing 5 changed files with 166 additions and 75 deletions.
diff --git a/sdk/documentintelligence/azure-ai-documentintelligence/README.md b/sdk/documentintelligence/azure-ai-documentintelligence/README.md
@@ -408,6 +408,7 @@ For best results, you should only analyze documents of the same document type th
 from azure.core.credentials import AzureKeyCredential
 from azure.ai.documentintelligence import DocumentIntelligenceClient
 from azure.ai.documentintelligence.models import AnalyzeResult
+from helper import utils
 
 endpoint = os.environ["DOCUMENTINTELLIGENCE_ENDPOINT"]
 key = os.environ["DOCUMENTINTELLIGENCE_API_KEY"]
@@ -435,31 +436,40 @@ if result.documents:
                     f"......found field of type '{field.type}' with value '{field_value}' and with confidence {field.confidence}"
                 )
 
-# iterate over tables, lines, and selection marks on each page
-for page in result.pages:
-    print(f"\nLines found on page {page.page_number}")
-    if page.lines:
-        for line in page.lines:
-            print(f"...Line '{line.content}'")
-    if page.words:
-        for word in page.words:
-            print(f"...Word '{word.content}' has a confidence of {word.confidence}")
-    if page.selection_marks:
-        print(f"\nSelection marks found on page {page.page_number}")
-        for selection_mark in page.selection_marks:
-            print(
-                f"...Selection mark is '{selection_mark.state}' and has a confidence of {selection_mark.confidence}"
-            )
-
-if result.tables:
-    for i, table in enumerate(result.tables):
-        print(f"\nTable {i + 1} can be found on page:")
-        if table.bounding_regions:
-            for region in table.bounding_regions:
-                print(f"...{region.page_number}")
-        for cell in table.cells:
-            print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
-print("-----------------------------------")
+    # Extract table cell values
+    SYMBOL_OF_TABLE_TYPE = "array"
+    KEY_OF_VALUE_OBJECT = "valueObject"
+    KEY_OF_CELL_CONTENT = "content"
+
+    for doc in result.documents:
+        if not doc.fields is None:
+            for field_name, field_value in doc.fields.items():
+                # "MaintenanceLog" is the name of the table field you labeled. Table cell information is stored as an array in the document field.
+                if (
+                    field_name == "MaintenanceLog"
+                    and field_value.type == SYMBOL_OF_TABLE_TYPE
+                    and field_value.value_array
+                ):
+                    col_names = []
+                    sample_obj = field_value.value_array[0]
+                    if KEY_OF_VALUE_OBJECT in sample_obj:
+                        col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys())
+                    print("----Extracting Table Cell Values----")
+                    table_rows = []
+                    for obj in field_value.value_array:
+                        if KEY_OF_VALUE_OBJECT in obj:
+                            value_obj = obj[KEY_OF_VALUE_OBJECT]
+                            extract_value_by_col_name = lambda key: (
+                                value_obj[key].get(KEY_OF_CELL_CONTENT)
+                                if key in value_obj
+                                and KEY_OF_CELL_CONTENT in value_obj[key]
+                                else "None"
+                            )
+                            row_data = list(map(extract_value_by_col_name, col_names))
+                            table_rows.append(row_data)
+                    utils.print_table(col_names, table_rows)
+
+print("------------------------------------")
 ```
 
 <!-- END SNIPPET -->

diff --git a/...igence/azure-ai-documentintelligence/samples/aio/sample_analyze_custom_documents_async.py b/...igence/azure-ai-documentintelligence/samples/aio/sample_analyze_custom_documents_async.py
@@ -30,6 +30,33 @@
 import os
 import asyncio
 
+def print_table(header_names, table_data):
+    """Print a two-dimensional array as a left-aligned text table.
+
+    Each column is padded to the width of its longest entry (the header
+    included) plus four spaces of separation.
+
+    Args:
+        header_names: A list of column header names, e.g. ["Name", "Gender", "Age"].
+        table_data: A list of rows, each a sequence of cell values, e.g.
+            [["Mike", "M", 25], ["John", "M", 19], ["Lily", "F", 23]].
+            Cells may be of any type (including None); they are converted with
+            str() before printing. Rows shorter than the header are padded with
+            empty cells, and cells beyond the last header are ignored.
+
+    Returns:
+        None. The table is written to standard output, e.g.
+         Name    Gender    Age
+         Mike    M         25
+         John    M         19
+         Lily    F         23
+    """
+    column_count = len(header_names)
+    # Convert everything to str up front: str.format() with an alignment spec
+    # raises TypeError for values such as None, and the column widths are
+    # measured on the str() form anyway.
+    headers = [str(name) for name in header_names]
+    rows = [
+        [str(cell) for cell in row[:column_count]] + [""] * (column_count - len(row))
+        for row in table_data
+    ]
+
+    # zip() transposes header + rows into columns so each width is one max().
+    widths = [max(len(cell) for cell in column) for column in zip(headers, *rows)]
+    row_format_str = "".join(f"{{:<{width + 4}}}" for width in widths)
+
+    print(row_format_str.format(*headers))
+    for row in rows:
+        print(row_format_str.format(*row))
+
 
 async def analyze_custom_documents(custom_model_id):
     path_to_sample_documents = os.path.abspath(
@@ -65,31 +92,39 @@ async def analyze_custom_documents(custom_model_id):
                     print(
                         f"......found field of type '{field.type}' with value '{field_value}' and with confidence {field.confidence}"
                     )
-
-    # iterate over tables, lines, and selection marks on each page
-    for page in result.pages:
-        print(f"\nLines found on page {page.page_number}")
-        if page.lines:
-            for line in page.lines:
-                print(f"...Line '{line.content}'")
-        if page.words:
-            for word in page.words:
-                print(f"...Word '{word.content}' has a confidence of {word.confidence}")
-        if page.selection_marks:
-            print(f"\nSelection marks found on page {page.page_number}")
-            for selection_mark in page.selection_marks:
-                print(
-                    f"...Selection mark is '{selection_mark.state}' and has a confidence of {selection_mark.confidence}"
-                )
-
-    if result.tables:
-        for i, table in enumerate(result.tables):
-            print(f"\nTable {i + 1} can be found on page:")
-            if table.bounding_regions:
-                for region in table.bounding_regions:
-                    print(f"...{region.page_number}")
-            for cell in table.cells:
-                print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
+
+        # Extract table cell values
+        SYMBOL_OF_TABLE_TYPE = "array"
+        KEY_OF_VALUE_OBJECT = "valueObject"
+        KEY_OF_CELL_CONTENT = "content"
+
+        for doc in result.documents:
+            if not doc.fields is None:
+                for field_name, field_value in doc.fields.items():
+                    # "MaintenanceLog" is the name of the table field you labeled. Table cell information is stored as an array in the document field.
+                    if (
+                        field_name == "MaintenanceLog"
+                        and field_value.type == SYMBOL_OF_TABLE_TYPE
+                        and field_value.value_array
+                    ):
+                        col_names = []
+                        sample_obj = field_value.value_array[0]
+                        if KEY_OF_VALUE_OBJECT in sample_obj:
+                            col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys())
+                        print("----Extracting Table Cell Values----")
+                        table_rows = []
+                        for obj in field_value.value_array:
+                            if KEY_OF_VALUE_OBJECT in obj:
+                                value_obj = obj[KEY_OF_VALUE_OBJECT]
+                                extract_value_by_col_name = lambda key: (
+                                    value_obj[key].get(KEY_OF_CELL_CONTENT)
+                                    if key in value_obj
+                                    and KEY_OF_CELL_CONTENT in value_obj[key]
+                                    else "None"
+                                )
+                                row_data = list(map(extract_value_by_col_name, col_names))
+                                table_rows.append(row_data)
+                        print_table(col_names, table_rows)
     print("-----------------------------------")
     # [END analyze_custom_documents]
 

diff --git a/...cumentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_read_async.py b/...cumentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_read_async.py
@@ -109,6 +109,11 @@ async def analyze_read():
         for paragraph in result.paragraphs:
             print(f"Found paragraph with role: '{paragraph.role}' within {paragraph.bounding_regions} bounding region")
             print(f"...with content: '{paragraph.content}'")
+
+        result.paragraphs.sort(key=lambda p: (p.spans.sort(key=lambda s: s.offset), p.spans[0].offset))
+        print("-----Print sorted paragraphs-----")
+        for idx, paragraph in enumerate(result.paragraphs):
+            print(f"...paragraph:{idx} with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}")
 
     print("----------------------------------------")
 

diff --git a/...mentintelligence/azure-ai-documentintelligence/samples/sample_analyze_custom_documents.py b/...mentintelligence/azure-ai-documentintelligence/samples/sample_analyze_custom_documents.py
@@ -29,6 +29,33 @@
 
 import os
 
+def print_table(header_names, table_data):
+    """Print a two-dimensional array as a left-aligned text table.
+
+    Each column is padded to the width of its longest entry (the header
+    included) plus four spaces of separation.
+
+    Args:
+        header_names: A list of column header names, e.g. ["Name", "Gender", "Age"].
+        table_data: A list of rows, each a sequence of cell values, e.g.
+            [["Mike", "M", 25], ["John", "M", 19], ["Lily", "F", 23]].
+            Cells may be of any type (including None); they are converted with
+            str() before printing. Rows shorter than the header are padded with
+            empty cells, and cells beyond the last header are ignored.
+
+    Returns:
+        None. The table is written to standard output, e.g.
+         Name    Gender    Age
+         Mike    M         25
+         John    M         19
+         Lily    F         23
+    """
+    column_count = len(header_names)
+    # Convert everything to str up front: str.format() with an alignment spec
+    # raises TypeError for values such as None, and the column widths are
+    # measured on the str() form anyway.
+    headers = [str(name) for name in header_names]
+    rows = [
+        [str(cell) for cell in row[:column_count]] + [""] * (column_count - len(row))
+        for row in table_data
+    ]
+
+    # zip() transposes header + rows into columns so each width is one max().
+    widths = [max(len(cell) for cell in column) for column in zip(headers, *rows)]
+    row_format_str = "".join(f"{{:<{width + 4}}}" for width in widths)
+
+    print(row_format_str.format(*headers))
+    for row in rows:
+        print(row_format_str.format(*row))
+
 
 def analyze_custom_documents(custom_model_id):
     path_to_sample_documents = os.path.abspath(
@@ -65,31 +92,40 @@ def analyze_custom_documents(custom_model_id):
                         f"......found field of type '{field.type}' with value '{field_value}' and with confidence {field.confidence}"
                     )
 
-    # iterate over tables, lines, and selection marks on each page
-    for page in result.pages:
-        print(f"\nLines found on page {page.page_number}")
-        if page.lines:
-            for line in page.lines:
-                print(f"...Line '{line.content}'")
-        if page.words:
-            for word in page.words:
-                print(f"...Word '{word.content}' has a confidence of {word.confidence}")
-        if page.selection_marks:
-            print(f"\nSelection marks found on page {page.page_number}")
-            for selection_mark in page.selection_marks:
-                print(
-                    f"...Selection mark is '{selection_mark.state}' and has a confidence of {selection_mark.confidence}"
-                )
-
-    if result.tables:
-        for i, table in enumerate(result.tables):
-            print(f"\nTable {i + 1} can be found on page:")
-            if table.bounding_regions:
-                for region in table.bounding_regions:
-                    print(f"...{region.page_number}")
-            for cell in table.cells:
-                print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
-    print("-----------------------------------")
+        # Extract table cell values
+        SYMBOL_OF_TABLE_TYPE = "array"
+        KEY_OF_VALUE_OBJECT = "valueObject"
+        KEY_OF_CELL_CONTENT = "content"
+
+        for doc in result.documents:
+            if not doc.fields is None:
+                for field_name, field_value in doc.fields.items():
+                    # "MaintenanceLog" is the name of the table field you labeled. Table cell information is stored as an array in the document field.
+                    if (
+                        field_name == "MaintenanceLog"
+                        and field_value.type == SYMBOL_OF_TABLE_TYPE
+                        and field_value.value_array
+                    ):
+                        col_names = []
+                        sample_obj = field_value.value_array[0]
+                        if KEY_OF_VALUE_OBJECT in sample_obj:
+                            col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys())
+                        print("----Extracting Table Cell Values----")
+                        table_rows = []
+                        for obj in field_value.value_array:
+                            if KEY_OF_VALUE_OBJECT in obj:
+                                value_obj = obj[KEY_OF_VALUE_OBJECT]
+                                extract_value_by_col_name = lambda key: (
+                                    value_obj[key].get(KEY_OF_CELL_CONTENT)
+                                    if key in value_obj
+                                    and KEY_OF_CELL_CONTENT in value_obj[key]
+                                    else "None"
+                                )
+                                row_data = list(map(extract_value_by_col_name, col_names))
+                                table_rows.append(row_data)
+                        print_table(col_names, table_rows)
+
+    print("------------------------------------")
     # [END analyze_custom_documents]
 
 

diff --git a/sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_analyze_read.py b/sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_analyze_read.py
@@ -106,6 +106,11 @@ def analyze_read():
         for paragraph in result.paragraphs:
             print(f"Found paragraph with role: '{paragraph.role}' within {paragraph.bounding_regions} bounding region")
             print(f"...with content: '{paragraph.content}'")
+
+        result.paragraphs.sort(key=lambda p: (p.spans.sort(key=lambda s: s.offset), p.spans[0].offset))
+        print("-----Print sorted paragraphs-----")
+        for idx, paragraph in enumerate(result.paragraphs):
+            print(f"...paragraph:{idx} with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}")
 
     print("----------------------------------------")