Skip to content

Commit

Permalink
Merge branch 'main' into jj/2011-missing-languages-metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
Coniferish authored Nov 8, 2023
2 parents 6516806 + 67fa7ad commit 8ee0346
Show file tree
Hide file tree
Showing 8 changed files with 118 additions and 26 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.10.30-dev2
## 0.10.30-dev3

### Enhancements

Expand All @@ -12,6 +12,7 @@
### Fixes

* **Include `languages` in metadata when partitioning strategy='hi_res' or 'fast'** User defined `languages` was previously used for text detection, but not included in the resulting element metadata for some strategies. `languages` will now be included in the metadata regardless of partition strategy for pdfs and images.
* **Fix ingest partition parameters not being passed to the api.** When using the --partition-by-api flag via unstructured-ingest, none of the partition arguments are forwarded, meaning that these options are disregarded. With this change, we now pass through all of the relevant partition arguments to the api. This allows a user to specify all of the same partition arguments they would locally and have them respected when specifying --partition-by-api.

## 0.10.29

Expand Down
14 changes: 11 additions & 3 deletions test_unstructured_ingest/src/against-api.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,19 +24,27 @@ function cleanup() {
}
trap cleanup EXIT

TEST_FILE_NAME=layout-parser-paper-with-table.pdf

# including pdf-infer-table-structure to validate partition arguments are passed to the api
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
local \
--api-key "$UNS_API_KEY" \
--metadata-exclude coordinates,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--partition-by-api \
--strategy hi_res \
--pdf-infer-table-structure \
--reprocess \
--output-dir "$OUTPUT_DIR" \
--verbose \
--num-processes "$max_processes" \
--file-glob "*1p.txt" \
--input-path example-docs \
--input-path "example-docs/$TEST_FILE_NAME" \
--work-dir "$WORK_DIR"

"$SCRIPT_DIR"/check-num-files-output.sh 1 $OUTPUT_FOLDER_NAME
RESULT_FILE_PATH="$OUTPUT_DIR/example-docs/$TEST_FILE_NAME.json"
# validate that there is at least one table with text_as_html in the results
if [ "$(jq 'any(.[]; .metadata.text_as_html != null)' "$RESULT_FILE_PATH")" = "false" ]; then
echo "No table with text_as_html found in $RESULT_FILE_PATH but at least one was expected."
exit 1
fi
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--output-dir "$OUTPUT_DIR" \
--skip-infer-table-types "xls,xlsx" \
--pdf-infer-table-structure true \
--pdf-infer-table-structure \
--strategy hi_res \
--verbose \
--reprocess \
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.10.30-dev2" # pragma: no cover
__version__ = "0.10.30-dev3" # pragma: no cover
9 changes: 7 additions & 2 deletions unstructured/ingest/cli/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,9 +262,9 @@ def get_cli_options() -> t.List[click.Option]:
options = [
click.Option(
["--pdf-infer-table-structure"],
is_flag=True,
default=False,
help="If set to True, partition will include the table's text "
"content in the response.",
help="Partition will include the table's text_as_html " "in the response metadata.",
),
click.Option(
["--strategy"],
Expand Down Expand Up @@ -345,6 +345,11 @@ def get_cli_options() -> t.List[click.Option]:
default=None,
help="API Key for partition endpoint.",
),
click.Option(
["--hi-res-model-name"],
default=None,
help="Model name for hi-res strategy.",
),
]
return options

Expand Down
32 changes: 14 additions & 18 deletions unstructured/ingest/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
through Unstructured."""

import functools
import json
import os
import re
import typing as t
Expand All @@ -11,7 +10,6 @@
from datetime import datetime
from pathlib import Path

import requests
from dataclasses_json import DataClassJsonMixin
from dataclasses_json.core import Json, _asdict, _decode_dataclass

Expand All @@ -21,8 +19,9 @@
from unstructured.embed.interfaces import BaseEmbeddingEncoder, Element
from unstructured.ingest.error import PartitionError, SourceConnectionError
from unstructured.ingest.logger import logger
from unstructured.partition.api import partition_via_api
from unstructured.partition.auto import partition
from unstructured.staging.base import convert_to_dict, elements_from_json, flatten_dict
from unstructured.staging.base import convert_to_dict, flatten_dict

A = t.TypeVar("A", bound="DataClassJsonMixin")

Expand Down Expand Up @@ -88,6 +87,7 @@ class PartitionConfig(BaseConfig):
partition_endpoint: t.Optional[str] = "https://api.unstructured.io/general/v0/general"
partition_by_api: bool = False
api_key: t.Optional[str] = None
hi_res_model_name: t.Optional[str] = None


@dataclass
Expand Down Expand Up @@ -453,21 +453,17 @@ def partition_file(

logger.debug(f"Using remote partition ({endpoint})")

with open(self.filename, "rb") as f:
headers_dict = {}
if partition_config.api_key:
headers_dict["UNSTRUCTURED-API-KEY"] = partition_config.api_key
response = requests.post(
f"{endpoint}",
files={"files": (str(self.filename), f)},
headers=headers_dict,
# TODO: add m_data_source_metadata to unstructured-api pipeline_api and then
# pass the stringified json here
)

if response.status_code != 200:
raise RuntimeError(f"Caught {response.status_code} from API: {response.text}")
elements = elements_from_json(text=json.dumps(response.json()))
passthrough_partition_kwargs = {
k: str(v) for k, v in partition_kwargs.items() if v is not None
}
elements = partition_via_api(
filename=str(self.filename),
api_key=partition_config.api_key,
api_url=endpoint,
**passthrough_partition_kwargs,
)
# TODO: add m_data_source_metadata to unstructured-api pipeline_api and then
# pass the stringified json here
return elements

def process_file(
Expand Down
1 change: 1 addition & 0 deletions unstructured/ingest/pipeline/partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def run(self, ingest_doc_dict) -> Optional[str]:
"encoding": self.partition_config.encoding,
"pdf_infer_table_structure": self.partition_config.pdf_infer_table_structure,
"languages": self.partition_config.ocr_languages,
"hi_res_model_name": self.partition_config.hi_res_model_name,
}
if self.partition_config.skip_infer_table_types:
partition_kwargs[
Expand Down
81 changes: 81 additions & 0 deletions unstructured/metrics/doctype_aggregation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import os
from typing import List, Optional, Tuple

import click
import pandas as pd

from unstructured.metrics.evaluate import measure_text_edit_distance


@click.group()
def main():
    """Top-level CLI group; subcommands are registered via ``@main.command()``."""
    pass


def aggregate_cct_data_by_doctype(
    results_dir: str,
    input_filename: str = "all-docs-cct.tsv",
    output_filename: str = "all-doctypes-agg-cct.tsv",
) -> None:
    """Aggregate per-document CCT metrics by document type.

    Reads the per-document results TSV from ``results_dir``, groups rows by
    the ``doctype`` column, computes mean/std/count for the ``cct-accuracy``
    and ``cct-%missing`` columns, and writes the aggregate table back into
    ``results_dir``.

    Args:
        results_dir: Directory containing the per-document TSV; the aggregate
            TSV is written to the same directory.
        input_filename: Name of the per-document TSV to read
            (default preserves the original hard-coded name).
        output_filename: Name of the aggregate TSV to write
            (default preserves the original hard-coded name).
    """
    # Load the per-document scores (tab-separated, first row is the header).
    df = pd.read_csv(os.path.join(results_dir, input_filename), sep="\t", header=0)

    # Mean/std/count per doctype for both CCT metrics; yields a
    # two-level column index (metric, statistic).
    agg_df = df.groupby("doctype").agg(
        {"cct-accuracy": ["mean", "std", "count"], "cct-%missing": ["mean", "std", "count"]}
    )

    # Write the aggregate table next to the per-document results.
    agg_df.to_csv(os.path.join(results_dir, output_filename))


@main.command()
@click.option("--output_dir", type=str, help="Directory to structured output.")
@click.option("--source_dir", type=str, help="Directory to source.")
@click.option(
    "--output_list",
    type=str,
    multiple=True,
    help="Optional: list of selected structured output file names under the \
        directory to be evaluated. If none, all files under directory will be used.",
)
@click.option(
    "--source_list",
    type=str,
    multiple=True,
    help="Optional: list of selected source file names under the directory \
        to be evaluated. If none, all files under directory will be used.",
)
@click.option(
    "--export_dir",
    type=str,
    default="result_doctype_aggregate",
    show_default=True,
    help="Directory to save the output evaluation metrics to.",
)
@click.option(
    "--weights",
    type=(int, int, int),
    default=(2, 1, 1),
    show_default=True,
    help="A tuple of weights to the Levenshtein distance calculation. \
        See text_extraction.py/calculate_edit_distance for more details.",
)
def measure_holistic_eval_cct(
    output_dir: str,
    source_dir: str,
    output_list: Optional[List[str]],
    source_list: Optional[List[str]],
    export_dir: str,
    weights: Tuple[int, int, int],
) -> None:
    """Run CCT text-edit-distance evaluation, then aggregate results by doctype.

    Computes per-document edit-distance metrics into ``export_dir`` and then
    writes a per-doctype aggregate table to the same directory.
    """
    # Fix: the original body hard-coded `export_dir = "result_doctype_aggregate"`,
    # silently discarding the user's --export_dir value (while the option's
    # declared default was the never-used "metrics_results"). The hard-coded
    # value is now the option default instead, so the default behavior is
    # unchanged but --export_dir is respected.
    measure_text_edit_distance(
        output_dir=output_dir,
        source_dir=source_dir,
        output_list=output_list,
        source_list=source_list,
        export_dir=export_dir,
        weights=weights,
    )
    aggregate_cct_data_by_doctype(export_dir)


# Script entry point: dispatch to the click command group.
if __name__ == "__main__":
    main()

0 comments on commit 8ee0346

Please sign in to comment.