Skip to content

Commit

Permalink
Merge branch 'main' into jj/2011-missing-languages-metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
Coniferish authored Nov 8, 2023
2 parents 6516806 + 67fa7ad commit 8ee0346
Show file tree
Hide file tree
Showing 8 changed files with 118 additions and 26 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.10.30-dev2
## 0.10.30-dev3

### Enhancements

Expand All @@ -12,6 +12,7 @@
### Fixes

* **Include `languages` in metadata when partitioning strategy='hi_res' or 'fast'** User defined `languages` was previously used for text detection, but not included in the resulting element metadata for some strategies. `languages` will now be included in the metadata regardless of partition strategy for pdfs and images.
* **Fix ingest partition parameters not being passed to the api.** When using the --partition-by-api flag via unstructured-ingest, none of the partition arguments are forwarded, meaning that these options are disregarded. With this change, we now pass through all of the relevant partition arguments to the api. This allows a user to specify all of the same partition arguments they would locally and have them respected when specifying --partition-by-api.

## 0.10.29

Expand Down
14 changes: 11 additions & 3 deletions test_unstructured_ingest/src/against-api.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,19 +24,27 @@ function cleanup() {
}
trap cleanup EXIT

TEST_FILE_NAME=layout-parser-paper-with-table.pdf

# including pdf-infer-table-structure to validate partition arguments are passed to the api
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
local \
--api-key "$UNS_API_KEY" \
--metadata-exclude coordinates,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--partition-by-api \
--strategy hi_res \
--pdf-infer-table-structure \
--reprocess \
--output-dir "$OUTPUT_DIR" \
--verbose \
--num-processes "$max_processes" \
--file-glob "*1p.txt" \
--input-path example-docs \
--input-path "example-docs/$TEST_FILE_NAME" \
--work-dir "$WORK_DIR"

"$SCRIPT_DIR"/check-num-files-output.sh 1 $OUTPUT_FOLDER_NAME
RESULT_FILE_PATH="$OUTPUT_DIR/example-docs/$TEST_FILE_NAME.json"
# validate that there is at least one table with text_as_html in the results
if [ "$(jq 'any(.[]; .metadata.text_as_html != null)' "$RESULT_FILE_PATH")" = "false" ]; then
echo "No table with text_as_html found in $RESULT_FILE_PATH but at least one was expected."
exit 1
fi
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--output-dir "$OUTPUT_DIR" \
--skip-infer-table-types "xls,xlsx" \
--pdf-infer-table-structure true \
--pdf-infer-table-structure \
--strategy hi_res \
--verbose \
--reprocess \
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.10.30-dev2" # pragma: no cover
__version__ = "0.10.30-dev3" # pragma: no cover
9 changes: 7 additions & 2 deletions unstructured/ingest/cli/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,9 +262,9 @@ def get_cli_options() -> t.List[click.Option]:
options = [
click.Option(
["--pdf-infer-table-structure"],
is_flag=True,
default=False,
help="If set to True, partition will include the table's text "
"content in the response.",
help="Partition will include the table's text_as_html " "in the response metadata.",
),
click.Option(
["--strategy"],
Expand Down Expand Up @@ -345,6 +345,11 @@ def get_cli_options() -> t.List[click.Option]:
default=None,
help="API Key for partition endpoint.",
),
click.Option(
["--hi-res-model-name"],
default=None,
help="Model name for hi-res strategy.",
),
]
return options

Expand Down
32 changes: 14 additions & 18 deletions unstructured/ingest/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
through Unstructured."""

import functools
import json
import os
import re
import typing as t
Expand All @@ -11,7 +10,6 @@
from datetime import datetime
from pathlib import Path

import requests
from dataclasses_json import DataClassJsonMixin
from dataclasses_json.core import Json, _asdict, _decode_dataclass

Expand All @@ -21,8 +19,9 @@
from unstructured.embed.interfaces import BaseEmbeddingEncoder, Element
from unstructured.ingest.error import PartitionError, SourceConnectionError
from unstructured.ingest.logger import logger
from unstructured.partition.api import partition_via_api
from unstructured.partition.auto import partition
from unstructured.staging.base import convert_to_dict, elements_from_json, flatten_dict
from unstructured.staging.base import convert_to_dict, flatten_dict

A = t.TypeVar("A", bound="DataClassJsonMixin")

Expand Down Expand Up @@ -88,6 +87,7 @@ class PartitionConfig(BaseConfig):
partition_endpoint: t.Optional[str] = "https://api.unstructured.io/general/v0/general"
partition_by_api: bool = False
api_key: t.Optional[str] = None
hi_res_model_name: t.Optional[str] = None


@dataclass
Expand Down Expand Up @@ -453,21 +453,17 @@ def partition_file(

logger.debug(f"Using remote partition ({endpoint})")

with open(self.filename, "rb") as f:
headers_dict = {}
if partition_config.api_key:
headers_dict["UNSTRUCTURED-API-KEY"] = partition_config.api_key
response = requests.post(
f"{endpoint}",
files={"files": (str(self.filename), f)},
headers=headers_dict,
# TODO: add m_data_source_metadata to unstructured-api pipeline_api and then
# pass the stringified json here
)

if response.status_code != 200:
raise RuntimeError(f"Caught {response.status_code} from API: {response.text}")
elements = elements_from_json(text=json.dumps(response.json()))
passthrough_partition_kwargs = {
k: str(v) for k, v in partition_kwargs.items() if v is not None
}
elements = partition_via_api(
filename=str(self.filename),
api_key=partition_config.api_key,
api_url=endpoint,
**passthrough_partition_kwargs,
)
# TODO: add m_data_source_metadata to unstructured-api pipeline_api and then
# pass the stringified json here
return elements

def process_file(
Expand Down
1 change: 1 addition & 0 deletions unstructured/ingest/pipeline/partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def run(self, ingest_doc_dict) -> Optional[str]:
"encoding": self.partition_config.encoding,
"pdf_infer_table_structure": self.partition_config.pdf_infer_table_structure,
"languages": self.partition_config.ocr_languages,
"hi_res_model_name": self.partition_config.hi_res_model_name,
}
if self.partition_config.skip_infer_table_types:
partition_kwargs[
Expand Down
81 changes: 81 additions & 0 deletions unstructured/metrics/doctype_aggregation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import os
from typing import List, Optional, Tuple

import click
import pandas as pd

from unstructured.metrics.evaluate import measure_text_edit_distance


@click.group()
def main():
    """Top-level CLI group; subcommands are registered via ``@main.command()``."""
    pass


def aggregate_cct_data_by_doctype(
    results_dir: str,
    input_filename: str = "all-docs-cct.tsv",
    output_filename: str = "all-doctypes-agg-cct.tsv",
) -> None:
    """Aggregate per-document CCT metrics by document type.

    Reads the per-document results TSV from ``results_dir``, groups rows by
    the ``doctype`` column, computes mean/std/count for the ``cct-accuracy``
    and ``cct-%missing`` columns, and writes the aggregate table back into
    ``results_dir``.

    Args:
        results_dir: Directory containing the per-document TSV; the aggregate
            TSV is written to the same directory.
        input_filename: Name of the per-document TSV to read
            (default preserves the original hard-coded name).
        output_filename: Name of the aggregate TSV to write
            (default preserves the original hard-coded name).
    """
    # Load the per-document scores (tab-separated, first row is the header).
    df = pd.read_csv(os.path.join(results_dir, input_filename), sep="\t", header=0)

    # Mean/std/count per doctype for both CCT metrics; yields a
    # two-level column index (metric, statistic).
    agg_df = df.groupby("doctype").agg(
        {"cct-accuracy": ["mean", "std", "count"], "cct-%missing": ["mean", "std", "count"]}
    )

    # Write the aggregate table next to the per-document results.
    agg_df.to_csv(os.path.join(results_dir, output_filename))


@main.command()
@click.option("--output_dir", type=str, help="Directory to structured output.")
@click.option("--source_dir", type=str, help="Directory to source.")
@click.option(
    "--output_list",
    type=str,
    multiple=True,
    help="Optional: list of selected structured output file names under the \
        directory to be evaluated. If none, all files under directory will be used.",
)
@click.option(
    "--source_list",
    type=str,
    multiple=True,
    help="Optional: list of selected source file names under the directory \
        to be evaluated. If none, all files under directory will be used.",
)
@click.option(
    "--export_dir",
    type=str,
    default="result_doctype_aggregate",
    show_default=True,
    help="Directory to save the output evaluation metrics to.",
)
@click.option(
    "--weights",
    type=(int, int, int),
    default=(2, 1, 1),
    show_default=True,
    help="A tuple of weights to the Levenshtein distance calculation. \
        See text_extraction.py/calculate_edit_distance for more details.",
)
def measure_holistic_eval_cct(
    output_dir: str,
    source_dir: str,
    output_list: Optional[List[str]],
    source_list: Optional[List[str]],
    export_dir: str,
    weights: Tuple[int, int, int],
) -> None:
    """Run CCT text-edit-distance evaluation, then aggregate results by doctype.

    Computes per-document edit-distance metrics into ``export_dir`` and then
    writes a per-doctype aggregate table to the same directory.
    """
    # Fix: the original body hard-coded `export_dir = "result_doctype_aggregate"`,
    # silently discarding the user's --export_dir value (while the option's
    # declared default was the never-used "metrics_results"). The hard-coded
    # value is now the option default instead, so the default behavior is
    # unchanged but --export_dir is respected.
    measure_text_edit_distance(
        output_dir=output_dir,
        source_dir=source_dir,
        output_list=output_list,
        source_list=source_list,
        export_dir=export_dir,
        weights=weights,
    )
    aggregate_cct_data_by_doctype(export_dir)


# Script entry point: dispatch to the click command group.
if __name__ == "__main__":
    main()

0 comments on commit 8ee0346

Please sign in to comment.