Skip to content

Commit

Permalink
add element type frequency eval function. add copy script to all conn…
Browse files Browse the repository at this point in the history
…ectors.
  • Loading branch information
Klaijan committed Oct 24, 2023
1 parent 6707cab commit 6d96bb6
Show file tree
Hide file tree
Showing 34 changed files with 154 additions and 6 deletions.
8 changes: 8 additions & 0 deletions test_unstructured_ingest/evaluation-metrics.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,14 @@ trap cleanup EXIT

EXPORT_DIR="$SCRIPT_DIR"/metrics
PYTHONPATH=. ./unstructured/ingest/evaluate.py \
measure-text-edit-distance \
--output_dir "$OUTPUT_DIR" \
--source_dir "$CCT_DIR" \
--export_dir "$EXPORT_DIR"

ELEMENT_TYPE_DIR="$SCRIPT_DIR"/expected-structured-output
PYTHONPATH=. ./unstructured/ingest/evaluate.py \
measure-element-type-accuracy \
--output_dir "$OUTPUT_DIR" \
--source_dir "$ELEMENT_TYPE_DIR" \
--export_dir "$EXPORT_DIR"
Empty file.
4 changes: 2 additions & 2 deletions test_unstructured_ingest/metrics/aggregate-scores-cct.tsv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
strategy average sample_sd population_sd count
cct-accuracy 0.774 0.124 0.087 2
cct-%missing 0.065 0.035 0.025 2
cct-accuracy 0.83 0.204 0.144 2
cct-%missing 0.035 0.007 0.005 2
2 changes: 1 addition & 1 deletion test_unstructured_ingest/metrics/all-docs-cct.tsv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
filename connector cct-accuracy cct-%missing
handbook-1p.docx box 0.974 0.03
example-10k.html local 0.686 0.04
science-exploration-1p.pptx box 0.861 0.09
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-against-api.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"

"$SCRIPT_DIR"/check-num-files-output.sh 1 $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-airtable-diff.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--verbose

"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-azure.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"

"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-biomed-api.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"

"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-biomed-path.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"

"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-confluence-diff.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"

"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-delta-table.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py --table-uri "$DESTINATION_TABLE"

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-discord.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"

"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-elasticsearch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"

"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-embed.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,5 @@ set +e
# once we have an alternative encoder that is deterministic, we test the diff here
# until then just validating the file was created
"$SCRIPT_DIR"/check-num-files-output.sh 1 "$OUTPUT_FOLDER_NAME"

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-gcs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \


"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-github.sh
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
$ACCESS_TOKEN_FLAGS

"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-gitlab.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"

"$SCRIPT_DIR"/check-num-files-output.sh 2 $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-google-drive.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,5 @@ PYTHONPATH=. unstructured/ingest/main.py \


"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-jira.sh
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \


"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
set +e

"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
set +e

"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-local-single-file.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
set +e

"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-notion.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \


"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-onedrive.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"

"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-outlook.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \


"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \


"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-s3-compression.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--uncompress

"$SCRIPT_DIR"/check-num-files-output.sh 12 $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-s3-minio.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,5 @@ AWS_SECRET_ACCESS_KEY=$secret_key AWS_ACCESS_KEY_ID=$access_key PYTHONPATH=. ./u


"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-s3.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \


"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-salesforce.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"

"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-sharepoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"

"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-slack.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"

"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-wikipedia.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"

"$SCRIPT_DIR"/check-num-files-output.sh 3 $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
88 changes: 85 additions & 3 deletions unstructured/ingest/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import click

from unstructured.metrics.text_extraction import calculate_accuracy, calculate_percent_missing_text
from unstructured.metrics.element_type import calculate_element_type_percent_match, get_element_type_frequency
from unstructured.staging.base import elements_from_json, elements_to_text

logger = logging.getLogger("unstructured.ingest")
Expand All @@ -23,8 +24,12 @@

logger.setLevel(logging.DEBUG)

@click.group()
def main():
    """Command group for the evaluation subcommands registered via ``@main.command()``."""
    pass

@click.command()

@main.command()
@click.option("--output_dir", type=click.STRING, help="Directory to a structured output.")
@click.option(
"--output_list",
Expand Down Expand Up @@ -56,7 +61,7 @@
help="A tuple of weights to the Levenshtein distance calculation. \
See text_extraction.py/calculate_edit_distance for more details.",
)
def measure_edit_distance(
def measure_text_edit_distance(
output_dir: str,
output_list: Optional[List[str]],
source_dir: str,
Expand Down Expand Up @@ -123,6 +128,73 @@ def measure_edit_distance(
_display(agg_rows, headers)


@main.command()
@click.option("--output_dir", type=click.STRING, help="Directory to a structured output.")
@click.option(
"--output_list",
type=click.STRING,
multiple=True,
help="Optional: list of selected structured output file names under the \
directory to be evaluate. If none, all files under directory will be use.",
)
@click.option("--source_dir", type=click.STRING, help="Directory to a structured source.")
@click.option(
"--source_list",
type=click.STRING,
multiple=True,
help="Optional: list of selected structured source file names under the directory \
to be evaluate. If none, all files under directory will be use.",
)
@click.option(
"--export_dir",
type=click.STRING,
default="metrics",
help="Directory to save the output evaluation metrics to. Default to \
[your_working_dir]/metrics/",
)
def measure_element_type_accuracy(
    output_dir: str,
    output_list: Optional[List[str]],
    source_dir: str,
    source_list: Optional[List[str]],
    export_dir: str,
):
    """Compare element types of structured outputs against their sources.

    Every name in output_list that also appears in source_list is read from
    both output_dir and source_dir, passed through get_element_type_frequency,
    and scored with calculate_element_type_percent_match (rounded to 3 digits).
    Names that have no counterpart in source_list are skipped.

    When output_list or source_list is empty, the corresponding directory is
    listed recursively instead.

    Per-document scores are written to all-docs-element-type-frequency.tsv and
    the aggregate statistics to aggregate-scores-element-type.tsv, both under
    export_dir. The aggregate row is also displayed.
    """
    if not output_list:
        output_list = _listdir_recursive(output_dir)
    if not source_list:
        source_list = _listdir_recursive(source_dir)

    # Build the lookup once: membership tests against a set are O(1), whereas
    # testing against the original list/tuple rescans it for every document.
    source_names = set(source_list)  # type: ignore

    rows = []
    accuracy_scores: List[float] = []

    for doc in output_list:  # type: ignore
        if doc not in source_names:
            # No reference document to compare against.
            continue

        # File name without its ".json" suffix, and the leading path component
        # (reported in the "connector" column).
        fn = (doc.split("/")[-1]).split(".json")[0]
        connector = doc.split("/")[0]

        output = get_element_type_frequency(_read_json(os.path.join(output_dir, doc)))
        source = get_element_type_frequency(_read_json(os.path.join(source_dir, doc)))
        accuracy = round(calculate_element_type_percent_match(output, source), 3)

        rows.append([fn, connector, accuracy])
        accuracy_scores.append(accuracy)

    headers = ["filename", "connector", "element-type-accuracy"]
    _write_to_file(export_dir, "all-docs-element-type-frequency.tsv", rows, headers)

    headers = ["strategy", "average", "sample_sd", "population_sd", "count"]
    agg_rows = [
        [
            "element-type-accuracy",
            _mean(accuracy_scores),
            _stdev(accuracy_scores),
            _pstdev(accuracy_scores),
            len(accuracy_scores),
        ],
    ]
    _write_to_file(export_dir, "aggregate-scores-element-type.tsv", agg_rows, headers)
    _display(agg_rows, headers)


def _listdir_recursive(dir: str):
listdir = []
for dirpath, _, filenames in os.walk(dir):
Expand All @@ -137,6 +209,8 @@ def _listdir_recursive(dir: str):


def _write_to_file(dir: str, filename: str, rows: List[Any], headers: List[Any]):
click.echo(dir)
click.echo(filename)
if dir and not os.path.exists(dir):
os.makedirs(dir)
with open(os.path.join(os.path.join(dir, filename)), "w", newline="") as tsv:
Expand Down Expand Up @@ -164,6 +238,8 @@ def _display(rows, headers):


def _mean(scores: List[float], rounding: Optional[int] = 3):
    """Return the arithmetic mean of ``scores``, or ``None`` when it is empty.

    The mean is rounded to ``rounding`` digits. A falsy ``rounding`` (``None``
    or ``0``) returns the unrounded mean.
    """
    if len(scores) == 0:
        return None
    average = statistics.mean(scores)
    return round(average, rounding) if rounding else average
Expand All @@ -185,5 +261,11 @@ def _pstdev(scores: List[float], rounding: Optional[int] = 3):
return round(statistics.pstdev(scores), rounding)


def _read_json(path):
    """Return the full text content of the file at ``path``.

    The content is returned as a string and is not parsed here.
    """
    # Decode explicitly as UTF-8 instead of relying on the locale's default
    # encoding, which can mis-decode or reject non-ASCII JSON text on some
    # platforms.
    with open(path, encoding="utf-8") as f:
        return f.read()


if __name__ == "__main__":
measure_edit_distance()
main()

0 comments on commit 6d96bb6

Please sign in to comment.