diff --git a/unstructured/metrics/evaluate.py b/unstructured/metrics/evaluate.py
index 9e43105fd0..f51141353b 100755
--- a/unstructured/metrics/evaluate.py
+++ b/unstructured/metrics/evaluate.py
@@ -432,97 +432,6 @@ def _generate_dataframes(self, rows):
         return df, agg_df
 
 
-def measure_text_extraction_accuracy(
-    output_dir: str,
-    source_dir: str,
-    output_list: Optional[List[str]] = None,
-    source_list: Optional[List[str]] = None,
-    export_dir: str = "metrics",
-    group_by: Optional[str] = None,
-    weights: Tuple[int, int, int] = (1, 1, 1),
-    visualize: bool = False,
-    output_type: str = "json",
-) -> None:
-    """
-    Loops through the list of structured output from all of `output_dir` or selected files from
-    `output_list`, and compare with gold-standard of the same file name under `source_dir` or
-    selected files from `source_list`.
-
-    Calculates text accuracy and percent missing. After looped through the whole list, write to tsv.
-    Also calculates the aggregated accuracy and percent missing.
-    """
-    TextExtractionMetricsCalculator(
-        documents_dir=output_dir,
-        ground_truths_dir=source_dir,
-        group_by=group_by,
-        weights=weights,
-        document_type=output_type,
-    ).on_files(document_paths=output_list, ground_truth_paths=source_list).calculate(
-        export_dir=export_dir, visualize_progress=visualize, display_agg_df=True
-    )
-
-
-def measure_element_type_accuracy(
-    output_dir: str,
-    source_dir: str,
-    output_list: Optional[List[str]] = None,
-    source_list: Optional[List[str]] = None,
-    export_dir: str = "metrics",
-    group_by: Optional[str] = None,
-    visualize: bool = False,
-):
-    """
-    Loops through the list of structured output from all of `output_dir` or selected files from
-    `output_list`, and compare with gold-standard of the same file name under `source_dir` or
-    selected files from `source_list`.
-
-    Calculates element type frequency accuracy and percent missing. After looped through the
-    whole list, write to tsv. Also calculates the aggregated accuracy.
-    """
-    ElementTypeMetricsCalculator(
-        documents_dir=output_dir,
-        ground_truths_dir=source_dir,
-        group_by=group_by,
-    ).on_files(document_paths=output_list, ground_truth_paths=source_list).calculate(
-        export_dir=export_dir, visualize_progress=visualize, display_agg_df=True
-    )
-
-
-def measure_table_structure_accuracy(
-    output_dir: str,
-    source_dir: str,
-    output_list: Optional[List[str]] = None,
-    source_list: Optional[List[str]] = None,
-    export_dir: str = "metrics",
-    visualize: bool = False,
-    cutoff: Optional[float] = None,
-):
-    """
-    Loops through the list of structured output from all of `output_dir` or selected files from
-    `output_list`, and compare with gold-standard of the same file name under `source_dir` or
-    selected files from `source_list`. Supports also a json file with filenames as keys and
-    structured gold-standard output as values.
-
-    Calculates:
-        table found accuracy
-        table level accuracy
-        element in column index accuracy
-        element in row index accuracy
-        element's column content accuracy
-        element's row content accuracy
-
-
-    After looped through the whole list, write to tsv. Also calculates the aggregated accuracy.
-    """
-    TableStructureMetricsCalculator(
-        documents_dir=output_dir,
-        ground_truths_dir=source_dir,
-        cutoff=cutoff,
-    ).on_files(document_paths=output_list, ground_truth_paths=source_list).calculate(
-        export_dir=export_dir, visualize_progress=visualize, display_agg_df=True
-    )
-
-
 def get_mean_grouping(
     group_by: str,
     data_input: Union[pd.DataFrame, str],
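
Note for callers of the removed wrappers: as the deleted bodies above show, each wrapper was a thin shim over a metrics calculator class. Below is a minimal migration sketch for measure_text_extraction_accuracy(); the directory paths are hypothetical placeholders, the import location is assumed to be this same module (where the calculator is referenced), and the constructor keyword arguments plus the on_files(...).calculate(...) chain are taken verbatim from the deleted wrapper body.

    # Migration sketch (hypothetical paths); mirrors the deleted wrapper body above.
    from unstructured.metrics.evaluate import TextExtractionMetricsCalculator

    TextExtractionMetricsCalculator(
        documents_dir="structured-output",   # formerly output_dir
        ground_truths_dir="gold-standard",   # formerly source_dir
        group_by=None,
        weights=(1, 1, 1),
        document_type="json",                # formerly output_type
    ).on_files(
        document_paths=None,                 # formerly output_list (None = all files)
        ground_truth_paths=None,             # formerly source_list (None = all files)
    ).calculate(
        export_dir="metrics",
        visualize_progress=False,            # formerly visualize
        display_agg_df=True,
    )

The element type and table structure wrappers migrate the same way, using ElementTypeMetricsCalculator and TableStructureMetricsCalculator with the keyword arguments shown in their deleted bodies.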