Skip to content

Commit

Permalink
Merge branch 'main' into jj/remove-unstrc-client-constraint
Browse files Browse the repository at this point in the history
  • Loading branch information
Coniferish authored Jul 2, 2024
2 parents 4dc1f85 + c28deff commit 565fb4e
Show file tree
Hide file tree
Showing 43 changed files with 1,305 additions and 687 deletions.
4 changes: 2 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
## 0.14.10-dev1
## 0.14.10-dev5

### Enhancements
* **Update unstructured-client dependency** Change the unstructured-client dependency pin back to
greater than the minimum version and update the tests that were failing because of the update.

* **`.doc` files are now supported in the `arm64` image.** `libreoffice24` is added to the `arm64` image, meaning `.doc` files are now supported. We have follow-on work planned to investigate adding `.ppt` support for `arm64` as well.


### Features

### Fixes
- Fix counting false negatives and false positives in table structure evaluation

* **Fix Slack CI test** Change channel that Slack test is pointing to because previous test bot expired

Expand Down
14 changes: 14 additions & 0 deletions test_unstructured/metrics/test_table_alignment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from unstructured.metrics.table.table_alignment import TableAlignment


def test_get_element_level_alignment_when_no_match():
    """Check element-level metrics when ``matched_indices`` is ``[-1]``.

    The same single-cell table is passed as both the prediction and the
    ground truth; all four index/content accuracies are expected to be 0.
    """
    single_cell_table = [{"row_index": 0, "col_index": 0, "content": "a"}]
    alignment = TableAlignment.get_element_level_alignment(
        predicted_table_data=[single_cell_table],
        ground_truth_table_data=[single_cell_table],
        matched_indices=[-1],
    )
    # Same assertion order as before: column index, row index, row content, column content.
    for metric_name in (
        "col_index_acc",
        "row_index_acc",
        "row_content_acc",
        "col_content_acc",
    ):
        assert alignment[metric_name] == 0
155 changes: 155 additions & 0 deletions test_unstructured/metrics/test_table_structure.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
from unittest import mock

import numpy as np
import pytest

from unstructured.metrics.table.table_alignment import TableAlignment
from unstructured.metrics.table.table_eval import TableEvalProcessor
from unstructured.metrics.table_structure import (
eval_table_transformer_for_file,
Expand Down Expand Up @@ -542,3 +546,154 @@ def test_table_eval_processor_merged_cells():
assert result.element_col_level_index_acc == 1.0
assert result.element_row_level_content_acc == 1.0
assert result.element_col_level_content_acc == 1.0


def test_table_eval_processor_when_no_match_with_pred():
    """Check the processor output when table-level alignment is patched to ``[-1]``.

    One predicted table and one 2x2 ground-truth table are supplied. With
    ``TableAlignment.get_table_level_alignment`` mocked to return ``[-1]``,
    the result is expected to report one table and 0 for the table-level
    accuracy and every element-level accuracy.
    """
    # (id, x, y, content) for each ground-truth cell; every cell has w == h == 1.
    gt_cells = (
        ("ee862c7a-d27e-4484-92de-4faa42a63f3b", 0, 0, "11"),
        ("6237ac7b-bfc8-40d2-92f2-d138277205e2", 0, 1, "21"),
        ("9d0933a9-5984-4cad-80d9-6752bf9bc4df", 1, 0, "12"),
        ("1152d043-5ead-4ab8-8b88-888d48831ac2", 1, 1, "22"),
    )
    ground_truth = [
        {
            "type": "Table",
            "text": [
                {"id": cell_id, "x": x, "y": y, "w": 1, "h": 1, "content": content}
                for cell_id, x, y, content in gt_cells
            ],
        }
    ]
    prediction = [
        {
            "type": "Table",
            "metadata": {"text_as_html": """<table><tr><td>Some cell</td></tr></table>"""},
        }
    ]

    with mock.patch.object(TableAlignment, "get_table_level_alignment", return_value=[-1]):
        result = TableEvalProcessor(prediction, ground_truth).process_file()

    assert result.total_tables == 1
    for attr_name in (
        "table_level_acc",
        "element_row_level_index_acc",
        "element_col_level_index_acc",
        "element_row_level_content_acc",
        "element_col_level_content_acc",
    ):
        assert getattr(result, attr_name) == 0


def test_table_eval_processor_when_no_tables():
    """Check the processor output when both inputs are ``[{}]``.

    Expected: zero tables counted, a table-level accuracy of 1, and NaN for
    every element-level accuracy.
    """
    empty_prediction = [{}]
    empty_ground_truth = [{}]

    result = TableEvalProcessor(empty_prediction, empty_ground_truth).process_file()

    assert result.total_tables == 0
    assert result.table_level_acc == 1
    for attr_name in (
        "element_row_level_index_acc",
        "element_col_level_index_acc",
        "element_row_level_content_acc",
        "element_col_level_content_acc",
    ):
        assert np.isnan(getattr(result, attr_name))


def test_table_eval_processor_when_only_gt():
    """Check the processor output for an empty prediction list and one ground-truth table.

    Expected: one table counted, and 0 for the table-level accuracy and every
    element-level accuracy.
    """
    # (id, x, y, content) for each ground-truth cell; every cell has w == h == 1.
    gt_cells = (
        ("ee862c7a-d27e-4484-92de-4faa42a63f3b", 0, 0, "11"),
        ("6237ac7b-bfc8-40d2-92f2-d138277205e2", 0, 1, "21"),
        ("9d0933a9-5984-4cad-80d9-6752bf9bc4df", 1, 0, "12"),
        ("1152d043-5ead-4ab8-8b88-888d48831ac2", 1, 1, "22"),
    )
    ground_truth = [
        {
            "type": "Table",
            "text": [
                {"id": cell_id, "x": x, "y": y, "w": 1, "h": 1, "content": content}
                for cell_id, x, y, content in gt_cells
            ],
        }
    ]
    prediction = []

    result = TableEvalProcessor(prediction, ground_truth).process_file()

    assert result.total_tables == 1
    for attr_name in (
        "table_level_acc",
        "element_row_level_index_acc",
        "element_col_level_index_acc",
        "element_row_level_content_acc",
        "element_col_level_content_acc",
    ):
        assert getattr(result, attr_name) == 0


def test_table_eval_processor_when_only_pred():
    """Check the processor output for one predicted table and a ground truth of ``[{}]``.

    Expected: zero tables counted, and 0 for the table-level accuracy and
    every element-level accuracy.
    """
    predicted_table = {
        "type": "Table",
        "metadata": {"text_as_html": """<table><tr><td>Some cell</td></tr></table>"""},
    }
    prediction = [predicted_table]
    ground_truth = [{}]

    result = TableEvalProcessor(prediction, ground_truth).process_file()

    assert result.total_tables == 0
    for attr_name in (
        "table_level_acc",
        "element_row_level_index_acc",
        "element_col_level_index_acc",
        "element_row_level_content_acc",
        "element_col_level_content_acc",
    ):
        assert getattr(result, attr_name) == 0
7 changes: 4 additions & 3 deletions test_unstructured_ingest/dest/pinecone.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ RANDOM_SUFFIX=$((RANDOM % 100000 + 1))

# Set the variables with default values if they're not set in the environment
PINECONE_INDEX=${PINECONE_INDEX:-"ingest-test-$RANDOM_SUFFIX"}
PINECONE_HOST_POSTFIX=${PINECONE_HOST_POSTFIX:-"4627-b74a"}
PINECONE_ENVIRONMENT=${PINECONE_ENVIRONMENT:-"us-east1-gcp"}
PINECONE_PROJECT_ID=${PINECONE_PROJECT_ID:-"art8iaj"}

Expand Down Expand Up @@ -96,7 +97,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--input-path example-docs/book-war-and-peace-1225p.txt \
--work-dir "$WORK_DIR" \
--chunking-strategy by_title \
--chunk-combine-text-under-n-chars 200 --chunk-new-after-n-chars 2500 --chunk-max-characters 38000 --chunk-multipage-sections \
--chunk-combine-text-under-n-chars 150 --chunk-new-after-n-chars 1500 --chunk-max-characters 2500 --chunk-multipage-sections \
--embedding-provider "langchain-huggingface" \
pinecone \
--api-key "$PINECONE_API_KEY" \
Expand All @@ -116,7 +117,7 @@ while [ "$num_of_vectors_remote" -eq 0 ] && [ "$attempt" -lt 4 ]; do

num_of_vectors_remote=$(curl --request POST \
-s \
--url "https://$PINECONE_INDEX-$PINECONE_PROJECT_ID.svc.$PINECONE_ENVIRONMENT.pinecone.io/describe_index_stats" \
--url "https://$PINECONE_INDEX-$PINECONE_PROJECT_ID.svc.aped-$PINECONE_HOST_POSTFIX.pinecone.io/describe_index_stats" \
--header "accept: application/json" \
--header "content-type: application/json" \
--header "Api-Key: $PINECONE_API_KEY" | jq -r '.totalVectorCount')
Expand All @@ -125,7 +126,7 @@ while [ "$num_of_vectors_remote" -eq 0 ] && [ "$attempt" -lt 4 ]; do
attempt=$((attempt + 1))
done

EXPECTED=1404
EXPECTED=1825

if [ "$num_of_vectors_remote" -ne $EXPECTED ]; then
echo "Number of vectors in Pinecone are $num_of_vectors_remote when the expected number is $EXPECTED. Test failed."
Expand Down
Loading

0 comments on commit 565fb4e

Please sign in to comment.