Merge branch 'main' into jj/remove-unstrc-client-constraint

Unstructured-IO · Jul 2, 2024 · 565fb4e · 565fb4e
2 parents 4dc1f85 + c28deff
commit 565fb4e
Show file tree

Hide file tree

Showing 43 changed files with 1,305 additions and 687 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,15 +1,15 @@
-## 0.14.10-dev1
+## 0.14.10-dev5
 
 ### Enhancements
 * **Update unstructured-client dependency** Change unstructured-client dependency pin back to
   greater than min version and update tests that were failing given the update.
 
 * **`.doc` files are now supported in the `arm64` image.** `libreoffice24` is added to the `arm64` image, meaning `.doc` files are now supported. We have follow-on work planned to investigate adding `.ppt` support for `arm64` as well.
 
-
 ### Features
 
 ### Fixes
+* **Fix table structure evaluation counts** Fix counting of false negatives and false positives in table structure evaluation.
 
 * **Fix Slack CI test** Change channel that Slack test is pointing to because previous test bot expired
 

diff --git a/test_unstructured/metrics/test_table_alignment.py b/test_unstructured/metrics/test_table_alignment.py
@@ -0,0 +1,14 @@
+from unstructured.metrics.table.table_alignment import TableAlignment
+
+
+def test_get_element_level_alignment_when_no_match():
+    example_table = [{"row_index": 0, "col_index": 0, "content": "a"}]
+    metrics = TableAlignment.get_element_level_alignment(
+        predicted_table_data=[example_table],
+        ground_truth_table_data=[example_table],
+        matched_indices=[-1],
+    )
+    assert metrics["col_index_acc"] == 0
+    assert metrics["row_index_acc"] == 0
+    assert metrics["row_content_acc"] == 0
+    assert metrics["col_content_acc"] == 0
diff --git a/test_unstructured/metrics/test_table_structure.py b/test_unstructured/metrics/test_table_structure.py
@@ -1,5 +1,9 @@
+from unittest import mock
+
+import numpy as np
 import pytest
 
+from unstructured.metrics.table.table_alignment import TableAlignment
 from unstructured.metrics.table.table_eval import TableEvalProcessor
 from unstructured.metrics.table_structure import (
     eval_table_transformer_for_file,
@@ -542,3 +546,154 @@ def test_table_eval_processor_merged_cells():
     assert result.element_col_level_index_acc == 1.0
     assert result.element_row_level_content_acc == 1.0
     assert result.element_col_level_content_acc == 1.0
+
+
+def test_table_eval_processor_when_no_match_with_pred():
+    prediction = [
+        {
+            "type": "Table",
+            "metadata": {"text_as_html": """<table><tr><td>Some cell</td></tr></table>"""},
+        }
+    ]
+
+    ground_truth = [
+        {
+            "type": "Table",
+            "text": [
+                {
+                    "id": "ee862c7a-d27e-4484-92de-4faa42a63f3b",
+                    "x": 0,
+                    "y": 0,
+                    "w": 1,
+                    "h": 1,
+                    "content": "11",
+                },
+                {
+                    "id": "6237ac7b-bfc8-40d2-92f2-d138277205e2",
+                    "x": 0,
+                    "y": 1,
+                    "w": 1,
+                    "h": 1,
+                    "content": "21",
+                },
+                {
+                    "id": "9d0933a9-5984-4cad-80d9-6752bf9bc4df",
+                    "x": 1,
+                    "y": 0,
+                    "w": 1,
+                    "h": 1,
+                    "content": "12",
+                },
+                {
+                    "id": "1152d043-5ead-4ab8-8b88-888d48831ac2",
+                    "x": 1,
+                    "y": 1,
+                    "w": 1,
+                    "h": 1,
+                    "content": "22",
+                },
+            ],
+        }
+    ]
+
+    with mock.patch.object(TableAlignment, "get_table_level_alignment") as align_fn:
+        align_fn.return_value = [-1]
+        te_processor = TableEvalProcessor(prediction, ground_truth)
+        result = te_processor.process_file()
+
+    assert result.total_tables == 1
+    assert result.table_level_acc == 0
+    assert result.element_row_level_index_acc == 0
+    assert result.element_col_level_index_acc == 0
+    assert result.element_row_level_content_acc == 0
+    assert result.element_col_level_content_acc == 0
+
+
+def test_table_eval_processor_when_no_tables():
+    prediction = [{}]
+
+    ground_truth = [{}]
+
+    te_processor = TableEvalProcessor(prediction, ground_truth)
+    result = te_processor.process_file()
+    assert result.total_tables == 0
+    assert result.table_level_acc == 1
+    assert np.isnan(result.element_row_level_index_acc)
+    assert np.isnan(result.element_col_level_index_acc)
+    assert np.isnan(result.element_row_level_content_acc)
+    assert np.isnan(result.element_col_level_content_acc)
+
+
+def test_table_eval_processor_when_only_gt():
+    prediction = []
+
+    ground_truth = [
+        {
+            "type": "Table",
+            "text": [
+                {
+                    "id": "ee862c7a-d27e-4484-92de-4faa42a63f3b",
+                    "x": 0,
+                    "y": 0,
+                    "w": 1,
+                    "h": 1,
+                    "content": "11",
+                },
+                {
+                    "id": "6237ac7b-bfc8-40d2-92f2-d138277205e2",
+                    "x": 0,
+                    "y": 1,
+                    "w": 1,
+                    "h": 1,
+                    "content": "21",
+                },
+                {
+                    "id": "9d0933a9-5984-4cad-80d9-6752bf9bc4df",
+                    "x": 1,
+                    "y": 0,
+                    "w": 1,
+                    "h": 1,
+                    "content": "12",
+                },
+                {
+                    "id": "1152d043-5ead-4ab8-8b88-888d48831ac2",
+                    "x": 1,
+                    "y": 1,
+                    "w": 1,
+                    "h": 1,
+                    "content": "22",
+                },
+            ],
+        }
+    ]
+
+    te_processor = TableEvalProcessor(prediction, ground_truth)
+    result = te_processor.process_file()
+
+    assert result.total_tables == 1
+    assert result.table_level_acc == 0
+    assert result.element_row_level_index_acc == 0
+    assert result.element_col_level_index_acc == 0
+    assert result.element_row_level_content_acc == 0
+    assert result.element_col_level_content_acc == 0
+
+
+def test_table_eval_processor_when_only_pred():
+    prediction = [
+        {
+            "type": "Table",
+            "metadata": {"text_as_html": """<table><tr><td>Some cell</td></tr></table>"""},
+        }
+    ]
+
+    ground_truth = [{}]
+
+    te_processor = TableEvalProcessor(prediction, ground_truth)
+    result = te_processor.process_file()
+
+    assert result.total_tables == 0
+    assert result.table_level_acc == 0
+    assert result.element_row_level_index_acc == 0
+    assert result.element_col_level_index_acc == 0
+    assert result.element_row_level_content_acc == 0
+    assert result.element_col_level_content_acc == 0
diff --git a/test_unstructured_ingest/dest/pinecone.sh b/test_unstructured_ingest/dest/pinecone.sh
@@ -20,6 +20,7 @@ RANDOM_SUFFIX=$((RANDOM % 100000 + 1))
 
 # Set the variables with default values if they're not set in the environment
 PINECONE_INDEX=${PINECONE_INDEX:-"ingest-test-$RANDOM_SUFFIX"}
+PINECONE_HOST_POSTFIX=${PINECONE_HOST_POSTFIX:-"4627-b74a"}
 PINECONE_ENVIRONMENT=${PINECONE_ENVIRONMENT:-"us-east1-gcp"}
 PINECONE_PROJECT_ID=${PINECONE_PROJECT_ID:-"art8iaj"}
 
@@ -96,7 +97,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
   --input-path example-docs/book-war-and-peace-1225p.txt \
   --work-dir "$WORK_DIR" \
   --chunking-strategy by_title \
-  --chunk-combine-text-under-n-chars 200 --chunk-new-after-n-chars 2500 --chunk-max-characters 38000 --chunk-multipage-sections \
+  --chunk-combine-text-under-n-chars 150 --chunk-new-after-n-chars 1500 --chunk-max-characters 2500 --chunk-multipage-sections \
   --embedding-provider "langchain-huggingface" \
   pinecone \
   --api-key "$PINECONE_API_KEY" \
@@ -116,7 +117,7 @@ while [ "$num_of_vectors_remote" -eq 0 ] && [ "$attempt" -lt 4 ]; do
 
   num_of_vectors_remote=$(curl --request POST \
     -s \
-    --url "https://$PINECONE_INDEX-$PINECONE_PROJECT_ID.svc.$PINECONE_ENVIRONMENT.pinecone.io/describe_index_stats" \
+    --url "https://$PINECONE_INDEX-$PINECONE_PROJECT_ID.svc.aped-$PINECONE_HOST_POSTFIX.pinecone.io/describe_index_stats" \
     --header "accept: application/json" \
     --header "content-type: application/json" \
     --header "Api-Key: $PINECONE_API_KEY" | jq -r '.totalVectorCount')
@@ -125,7 +126,7 @@ while [ "$num_of_vectors_remote" -eq 0 ] && [ "$attempt" -lt 4 ]; do
   attempt=$((attempt + 1))
 done
 
-EXPECTED=1404
+EXPECTED=1825
 
 if [ "$num_of_vectors_remote" -ne $EXPECTED ]; then
   echo "Number of vectors in Pinecone are $num_of_vectors_remote when the expected number is $EXPECTED. Test failed."