OD metrics for CI (#3269)
OD metrics for CI

---------

Co-authored-by: Paweł Kmiecik <[email protected]>
Co-authored-by: Michał Martyniak <[email protected]>
3 people authored Jul 9, 2024
1 parent 3f96a5a commit 176875b
Showing 5 changed files with 893 additions and 10 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -19,6 +19,8 @@

### Features

* **Add Object Detection Metrics to CI** Adds object detection metric implementations (average precision, precision, recall and f1-score).

### Fixes

* **Fix counting false negatives and false positives in table structure evaluation**
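
For context, the reported per-document metrics follow the standard object-detection definitions: precision = TP / (TP + FP), recall = TP / (TP + FN), and f1-score = 2 * precision * recall / (precision + recall), where TP, FP and FN are counted from matched predicted and ground-truth boxes. Average precision is the area under the precision-recall curve; how it is averaged (over classes and/or IoU thresholds) is determined by the ObjectDetectionEvalProcessor implementation introduced in this commit.
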
52 changes: 52 additions & 0 deletions unstructured/ingest/evaluate.py
@@ -6,6 +6,7 @@

from unstructured.metrics.evaluate import (
ElementTypeMetricsCalculator,
ObjectDetectionMetricsCalculator,
TableStructureMetricsCalculator,
TextExtractionMetricsCalculator,
filter_metrics,
@@ -249,6 +250,57 @@ def measure_table_structure_accuracy_command(
)


@main.command()
@click.option("--output_dir", type=str, help="Directory to structured output.")
@click.option("--source_dir", type=str, help="Directory to structured source.")
@click.option(
"--output_list",
type=str,
multiple=True,
help=(
"Optional: list of selected structured output file names under the "
"directory to be evaluated. If none, all files under directory will be used."
),
)
@click.option(
"--source_list",
type=str,
multiple=True,
help="Optional: list of selected source file names under the directory \
to be evaluate. If none, all files under directory will be used.",
)
@click.option(
"--export_dir",
type=str,
default="metrics",
help="Directory to save the output evaluation metrics to. Default to \
your/working/dir/metrics/",
)
@click.option(
"--visualize",
is_flag=True,
show_default=True,
default=False,
help="Add the flag to show progress bar.",
)
def measure_object_detection_metrics_command(
output_dir: str,
source_dir: str,
export_dir: str,
visualize: bool,
output_list: Optional[List[str]] = None,
source_list: Optional[List[str]] = None,
):
return (
ObjectDetectionMetricsCalculator(
documents_dir=output_dir,
ground_truths_dir=source_dir,
)
.on_files(document_paths=output_list, ground_truth_paths=source_list)
.calculate(export_dir=export_dir, visualize_progress=visualize, display_agg_df=True)
)


@main.command()
@click.option(
"--data_input", type=str, required=True, help="Takes in path to data file as .tsv .csv .txt"
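
For illustration, the new measure_object_detection_metrics_command is a thin CLI wrapper around the calculator added in unstructured/metrics/evaluate.py below. A minimal programmatic sketch of the same call (directory paths here are hypothetical placeholders, not taken from the commit):

from pathlib import Path

from unstructured.metrics.evaluate import ObjectDetectionMetricsCalculator

# Structured outputs are expected to contain analysis/<doc>/layout_dump/object_detection.json
# dumps; the ground-truth directory holds <document_name>.<ext>.json files.
ObjectDetectionMetricsCalculator(
    documents_dir=Path("structured-output"),
    ground_truths_dir=Path("od_gt"),
).on_files().calculate(export_dir="metrics", visualize_progress=False, display_agg_df=True)
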
161 changes: 156 additions & 5 deletions unstructured/metrics/evaluate.py
@@ -18,6 +18,7 @@
calculate_element_type_percent_match,
get_element_type_frequency,
)
from unstructured.metrics.object_detection import ObjectDetectionEvalProcessor
from unstructured.metrics.table.table_eval import TableEvalProcessor
from unstructured.metrics.text_extraction import calculate_accuracy, calculate_percent_missing_text
from unstructured.metrics.utils import (
@@ -73,6 +74,25 @@ def __post_init__(self):
path.relative_to(self.ground_truths_dir) for path in self.ground_truths_dir.rglob("*")
]

@property
@abstractmethod
def default_tsv_name(self):
"""Default name for the per-document metrics TSV file."""

@property
@abstractmethod
def default_agg_tsv_name(self):
"""Default name for the aggregated metrics TSV file."""

@abstractmethod
def _generate_dataframes(self, rows: list) -> tuple[pd.DataFrame, pd.DataFrame]:
"""Generates pandas DataFrames from the list of rows.
The first DF (index 0) is a dataframe containing metrics per file.
The second DF (index 1) is a dataframe containing the aggregated
metrics.
"""

def on_files(
self,
document_paths: Optional[list[str | Path]] = None,
@@ -158,7 +178,7 @@ def _try_process_document(self, doc: Path) -> Optional[list]:
return None

@abstractmethod
def _process_document(self, doc: Path) -> list:
def _process_document(self, doc: Path) -> Optional[list]:
"""Should return all metadata and metrics for a single document."""


@@ -202,7 +222,7 @@ def default_tsv_name(self):
def default_agg_tsv_name(self):
return "aggregate-table-structure-accuracy.tsv"

def _process_document(self, doc: Path) -> list:
def _process_document(self, doc: Path) -> Optional[list]:
doc_path = Path(doc)
out_filename = doc_path.stem
doctype = Path(out_filename).suffix[1:]
@@ -322,7 +342,7 @@ def _validate_inputs(self):
"Please note that some files will be skipped."
)

def _process_document(self, doc: Path) -> list:
def _process_document(self, doc: Path) -> Optional[list]:
filename = doc.stem
doctype = doc.suffixes[0]
connector = doc.parts[0] if len(doc.parts) > 1 else None
@@ -397,7 +417,7 @@ def default_tsv_name(self) -> str:
def default_agg_tsv_name(self) -> str:
return "aggregate-scores-element-type.tsv"

def _process_document(self, doc: Path) -> list:
def _process_document(self, doc: Path) -> Optional[list]:
filename = doc.stem
doctype = doc.suffixes[0]
connector = doc.parts[0] if len(doc.parts) > 1 else None
@@ -453,9 +473,13 @@ def get_mean_grouping(
elif eval_name == "element_type":
agg_fields = ["element-type-accuracy"]
agg_name = "element-type"
elif eval_name == "object_detection":
agg_fields = ["f1_score", "m_ap"]
agg_name = "object-detection"
else:
raise ValueError(
"Unknown metric. Expected `text_extraction` or `element_type` or `table_extraction`."
f"Unknown metric for eval {eval_name}. "
f"Expected `text_extraction` or `element_type` or `table_extraction`."
)

if isinstance(data_input, str):
@@ -571,3 +595,130 @@ def filter_metrics(
raise ValueError("Please provide `export_filename`.")
else:
raise ValueError("Return type must be either `dataframe` or `file`.")


@dataclass
class ObjectDetectionMetricsCalculator(BaseMetricsCalculator):
"""
Calculates object detection metrics for each document:
- f1 score
- precision
- recall
- average precision (mAP)
It also calculates aggregated metrics.
"""

def __post_init__(self):
super().__post_init__()
self._document_paths = [
path.relative_to(self.documents_dir)
for path in self.documents_dir.rglob("analysis/*/layout_dump/object_detection.json")
]

@property
def supported_metric_names(self):
return ["f1_score", "precision", "recall", "m_ap"]

@property
def default_tsv_name(self):
return "all-docs-object-detection-metrics.tsv"

@property
def default_agg_tsv_name(self):
return "aggregate-object-detection-metrics.tsv"

def _find_file_in_ground_truth(self, file_stem: str) -> Optional[Path]:
"""Find the file corresponding to OD model dump file among the set of ground truth files
The files in ground truth paths keep the original extension and have .json suffix added,
e.g.:
some_document.pdf.json
poster.jpg.json
To compare against `file_stem` we need to take the prefix part of the file name, so a
double stem is applied.
"""
for path in self._ground_truth_paths:
if Path(path.stem).stem == file_stem:
return path
return None

def _process_document(self, doc: Path) -> Optional[list]:
"""Calculate metrics for a single document.
As the OD dump directory structure differs from the other, simpler outputs, it needs
specific processing to match the output OD dump file with the corresponding
OD GT file.
The outputs are placed in a directory structure:
analysis
|- document_name
|- layout_dump
|- object_detection.json
|- bboxes # not used in this evaluation
and the GT file is placed in the od_gt directory for a given dataset
dataset_name
|- od_gt
|- document_name.pdf.json
Args:
doc (Path): path to the OD dump file
Returns:
list: a list of metrics (representing a single row) for a single document
"""
od_dump_path = Path(doc)
file_stem = od_dump_path.parts[-3] # we take the `document_name` - so the filename stem

src_gt_filename = self._find_file_in_ground_truth(file_stem)

if src_gt_filename not in self._ground_truth_paths:
return None

doctype = Path(src_gt_filename.stem).suffix[1:]

prediction_file = self.documents_dir / doc
if not prediction_file.exists():
logger.warning(f"Prediction file {prediction_file} does not exist, skipping")
return None

ground_truth_file = self.ground_truths_dir / src_gt_filename
if not ground_truth_file.exists():
logger.warning(f"Ground truth file {ground_truth_file} does not exist, skipping")
return None

processor = ObjectDetectionEvalProcessor.from_json_files(
prediction_file_path=prediction_file,
ground_truth_file_path=ground_truth_file,
)
metrics = processor.get_metrics()

return [
src_gt_filename.stem,
doctype,
None, # connector
] + [getattr(metrics, metric) for metric in self.supported_metric_names]

def _generate_dataframes(self, rows) -> tuple[pd.DataFrame, pd.DataFrame]:
headers = ["filename", "doctype", "connector"] + self.supported_metric_names
df = pd.DataFrame(rows, columns=headers)

if df.empty:
agg_df = pd.DataFrame(columns=AGG_HEADERS)
else:
element_metrics_results = {}
for metric in self.supported_metric_names:
metric_df = df[df[metric].notnull()]
agg_metric = metric_df[metric].agg([_mean, _stdev, _pstdev, _count]).transpose()
if agg_metric.empty:
element_metrics_results[metric] = pd.Series(
data=[None, None, None, 0], index=["_mean", "_stdev", "_pstdev", "_count"]
)
else:
element_metrics_results[metric] = agg_metric
agg_df = pd.DataFrame(element_metrics_results).transpose().reset_index()
agg_df.columns = AGG_HEADERS

return df, agg_df
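
As a usage note, calculate() exports the per-document and aggregate tables under the names returned by default_tsv_name and default_agg_tsv_name. A minimal sketch for inspecting the results afterwards (assuming tab-separated output and the default "metrics" export directory; not part of the commit):

import pandas as pd

# File names come from default_tsv_name / default_agg_tsv_name above; the tab
# separator and the "metrics" directory are assumptions based on the defaults.
per_doc = pd.read_csv("metrics/all-docs-object-detection-metrics.tsv", sep="\t")
agg = pd.read_csv("metrics/aggregate-object-detection-metrics.tsv", sep="\t")
print(per_doc[["filename", "f1_score", "m_ap"]].head())
print(agg)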
