From 176875bf26de6ece33298a27686a096b7bd0fe63 Mon Sep 17 00:00:00 2001
From: Marianna
Date: Tue, 9 Jul 2024 14:41:20 +0200
Subject: [PATCH] OD metrics for CI (#3269)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

OD metrics for CI

---------

Co-authored-by: Paweł Kmiecik
Co-authored-by: Michał Martyniak <64484917+micmarty-deepsense@users.noreply.github.com>
---
 CHANGELOG.md                             |   2 +
 unstructured/ingest/evaluate.py          |  52 ++
 unstructured/metrics/evaluate.py         | 161 +++++-
 unstructured/metrics/object_detection.py | 675 +++++++++++++++++++++++
 unstructured/metrics/utils.py            |  13 +-
 5 files changed, 893 insertions(+), 10 deletions(-)
 create mode 100644 unstructured/metrics/object_detection.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ab1625bbe5..f21706f94d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,6 +19,8 @@

 ### Features

+* **Add Object Detection Metrics to CI** Add implementations of object detection metrics (average precision, precision, recall, and F1 score).
+
 ### Fixes

 * **Fix counting false negatives and false positives in table structure evaluation**

diff --git a/unstructured/ingest/evaluate.py b/unstructured/ingest/evaluate.py
index d61ccb1d6b..ba162b5336 100755
--- a/unstructured/ingest/evaluate.py
+++ b/unstructured/ingest/evaluate.py
@@ -6,6 +6,7 @@

 from unstructured.metrics.evaluate import (
     ElementTypeMetricsCalculator,
+    ObjectDetectionMetricsCalculator,
     TableStructureMetricsCalculator,
     TextExtractionMetricsCalculator,
     filter_metrics,
@@ -249,6 +250,57 @@ def measure_table_structure_accuracy_command(
     )


+@main.command()
+@click.option("--output_dir", type=str, help="Directory to structured output.")
+@click.option("--source_dir", type=str, help="Directory to structured source.")
+@click.option(
+    "--output_list",
+    type=str,
+    multiple=True,
+    help=(
+        "Optional: list of selected structured output file names under the "
+        "directory to be evaluated. If none, all files under directory will be used."
+    ),
+)
+@click.option(
+    "--source_list",
+    type=str,
+    multiple=True,
+    help="Optional: list of selected source file names under the directory \
+        to be evaluated. If none, all files under directory will be used.",
+)
+@click.option(
+    "--export_dir",
+    type=str,
+    default="metrics",
+    help="Directory to save the output evaluation metrics to. Default to \
+        your/working/dir/metrics/",
+)
+@click.option(
+    "--visualize",
+    is_flag=True,
+    show_default=True,
+    default=False,
+    help="Add the flag to show a progress bar.",
+)
+def measure_object_detection_metrics_command(
+    output_dir: str,
+    source_dir: str,
+    export_dir: str,
+    visualize: bool,
+    output_list: Optional[List[str]] = None,
+    source_list: Optional[List[str]] = None,
+):
+    return (
+        ObjectDetectionMetricsCalculator(
+            documents_dir=output_dir,
+            ground_truths_dir=source_dir,
+        )
+        .on_files(document_paths=output_list, ground_truth_paths=source_list)
+        .calculate(export_dir=export_dir, visualize_progress=visualize, display_agg_df=True)
+    )
+
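+# Example invocation (illustrative, not part of the original change; the exact command
+# name depends on Click's default naming, which in recent Click versions replaces
+# underscores in the function name with dashes):
+#   ... measure-object-detection-metrics-command --output_dir structured-output \
+#       --source_dir od_gt --export_dir metrics --visualize
+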
+
 @main.command()
 @click.option(
     "--data_input", type=str, required=True, help="Takes in path to data file as .tsv .csv .txt"

diff --git a/unstructured/metrics/evaluate.py b/unstructured/metrics/evaluate.py
index 566fa23b1a..4eb4f6f01f 100755
--- a/unstructured/metrics/evaluate.py
+++ b/unstructured/metrics/evaluate.py
@@ -18,6 +18,7 @@
     calculate_element_type_percent_match,
     get_element_type_frequency,
 )
+from unstructured.metrics.object_detection import ObjectDetectionEvalProcessor
 from unstructured.metrics.table.table_eval import TableEvalProcessor
 from unstructured.metrics.text_extraction import calculate_accuracy, calculate_percent_missing_text
 from unstructured.metrics.utils import (
@@ -73,6 +74,25 @@ def __post_init__(self):
         path.relative_to(self.ground_truths_dir) for path in self.ground_truths_dir.rglob("*")
     ]

+    @property
+    @abstractmethod
+    def default_tsv_name(self):
+        """Default name for the per-document metrics TSV file."""
+
+    @property
+    @abstractmethod
+    def default_agg_tsv_name(self):
+        """Default name for the aggregated metrics TSV file."""
+
+    @abstractmethod
+    def _generate_dataframes(self, rows: list) -> tuple[pd.DataFrame, pd.DataFrame]:
+        """Generates pandas DataFrames from the list of rows.
+
+        The first DF (index 0) is a dataframe containing metrics per file.
+        The second DF (index 1) is a dataframe containing the aggregated
+        metrics.
+        """
+
     def on_files(
         self,
         document_paths: Optional[list[str | Path]] = None,
@@ -158,7 +178,7 @@ def _try_process_document(self, doc: Path) -> Optional[list]:
             return None

     @abstractmethod
-    def _process_document(self, doc: Path) -> list:
+    def _process_document(self, doc: Path) -> Optional[list]:
         """Should return all metadata and metrics for a single document."""


@@ -202,7 +222,7 @@ def default_tsv_name(self):
     def default_agg_tsv_name(self):
         return "aggregate-table-structure-accuracy.tsv"

-    def _process_document(self, doc: Path) -> list:
+    def _process_document(self, doc: Path) -> Optional[list]:
         doc_path = Path(doc)
         out_filename = doc_path.stem
         doctype = Path(out_filename).suffix[1:]
@@ -322,7 +342,7 @@ def _validate_inputs(self):
                 "Please note that some files will be skipped."
             )

-    def _process_document(self, doc: Path) -> list:
+    def _process_document(self, doc: Path) -> Optional[list]:
         filename = doc.stem
         doctype = doc.suffixes[0]
         connector = doc.parts[0] if len(doc.parts) > 1 else None
@@ -397,7 +417,7 @@ def default_tsv_name(self) -> str:
     def default_agg_tsv_name(self) -> str:
         return "aggregate-scores-element-type.tsv"

-    def _process_document(self, doc: Path) -> list:
+    def _process_document(self, doc: Path) -> Optional[list]:
         filename = doc.stem
         doctype = doc.suffixes[0]
         connector = doc.parts[0] if len(doc.parts) > 1 else None
@@ -453,9 +473,13 @@ def get_mean_grouping(
     elif eval_name == "element_type":
         agg_fields = ["element-type-accuracy"]
         agg_name = "element-type"
+    elif eval_name == "object_detection":
+        agg_fields = ["f1_score", "m_ap"]
+        agg_name = "object-detection"
     else:
         raise ValueError(
-            "Unknown metric. Expected `text_extraction` or `element_type` or `table_extraction`."
+            f"Unknown metric for eval {eval_name}. Expected `text_extraction`, "
+            "`element_type`, `table_extraction` or `object_detection`."
         )

     if isinstance(data_input, str):
@@ -571,3 +595,130 @@ def filter_metrics(
         raise ValueError("Please provide `export_filename`.")
     else:
         raise ValueError("Return type must be either `dataframe` or `file`.")
+
+
+@dataclass
+class ObjectDetectionMetricsCalculator(BaseMetricsCalculator):
+    """
+    Calculates object detection metrics for each document:
+    - f1 score
+    - precision
+    - recall
+    - average precision (mAP)
+    It also calculates aggregated metrics.
+    """
+
+    def __post_init__(self):
+        super().__post_init__()
+        self._document_paths = [
+            path.relative_to(self.documents_dir)
+            for path in self.documents_dir.rglob("analysis/*/layout_dump/object_detection.json")
+        ]
+
+    @property
+    def supported_metric_names(self):
+        return ["f1_score", "precision", "recall", "m_ap"]
+
+    @property
+    def default_tsv_name(self):
+        return "all-docs-object-detection-metrics.tsv"
+
+    @property
+    def default_agg_tsv_name(self):
+        return "aggregate-object-detection-metrics.tsv"
+
+    def _find_file_in_ground_truth(self, file_stem: str) -> Optional[Path]:
+        """Find the file corresponding to an OD model dump file among the ground truth files.
+
+        The files in ground truth paths keep the original extension and have a .json suffix
+        added, e.g.:
+        some_document.pdf.json
+        poster.jpg.json
+
+        To compare to `file_stem` we need to take the prefix part of the file, thus a double
+        stem is applied.
+        """
+        for path in self._ground_truth_paths:
+            if Path(path.stem).stem == file_stem:
+                return path
+        return None
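+
+    # A minimal illustration of the double-stem matching above (values illustrative;
+    # `Path` is `pathlib.Path`, already imported by this module):
+    #
+    #   gt_path = Path("some_document.pdf.json")
+    #   gt_path.stem              # "some_document.pdf"
+    #   Path(gt_path.stem).stem   # "some_document"  -> matches the dump directory name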
+
+    def _process_document(self, doc: Path) -> Optional[list]:
+        """Calculate metrics for a single document.
+
+        As the OD dump directory structure differs from other, simpler outputs, specific
+        processing is needed to match the output OD dump file with the corresponding
+        OD GT file.
+
+        The outputs are placed in a directory structure:
+
+        analysis
+        |- document_name
+            |- layout_dump
+                |- object_detection.json
+            |- bboxes  # not used in this evaluation
+
+        and the GT file is placed in the od_gt directory for a given dataset:
+
+        dataset_name
+        |- od_gt
+            |- document_name.pdf.json
+
+        Args:
+            doc (Path): path to the OD dump file
+
+        Returns:
+            list: a list of metrics (representing a single row) for a single document
+        """
+        od_dump_path = Path(doc)
+        file_stem = od_dump_path.parts[-3]  # we take the `document_name` - so the filename stem
+
+        src_gt_filename = self._find_file_in_ground_truth(file_stem)
+
+        if src_gt_filename is None:
+            return None
+
+        doctype = Path(src_gt_filename.stem).suffix[1:]
+
+        prediction_file = self.documents_dir / doc
+        if not prediction_file.exists():
+            logger.warning(f"Prediction file {prediction_file} does not exist, skipping")
+            return None
+
+        ground_truth_file = self.ground_truths_dir / src_gt_filename
+        if not ground_truth_file.exists():
+            logger.warning(f"Ground truth file {ground_truth_file} does not exist, skipping")
+            return None
+
+        processor = ObjectDetectionEvalProcessor.from_json_files(
+            prediction_file_path=prediction_file,
+            ground_truth_file_path=ground_truth_file,
+        )
+        metrics = processor.get_metrics()
+
+        return [
+            src_gt_filename.stem,
+            doctype,
+            None,  # connector
+        ] + [getattr(metrics, metric) for metric in self.supported_metric_names]
+
+    def _generate_dataframes(self, rows: list) -> tuple[pd.DataFrame, pd.DataFrame]:
+        headers = ["filename", "doctype", "connector"] + self.supported_metric_names
+        df = pd.DataFrame(rows, columns=headers)
+
+        if df.empty:
+            agg_df = pd.DataFrame(columns=AGG_HEADERS)
+        else:
+            element_metrics_results = {}
+            for metric in self.supported_metric_names:
+                metric_df = df[df[metric].notnull()]
+                agg_metric = metric_df[metric].agg([_mean, _stdev, _pstdev, _count]).transpose()
+                if agg_metric.empty:
+                    element_metrics_results[metric] = pd.Series(
+                        data=[None, None, None, 0], index=["_mean", "_stdev", "_pstdev", "_count"]
+                    )
+                else:
+                    element_metrics_results[metric] = agg_metric
+            agg_df = pd.DataFrame(element_metrics_results).transpose().reset_index()
+            agg_df.columns = AGG_HEADERS
+
+        return df, agg_df

diff --git a/unstructured/metrics/object_detection.py b/unstructured/metrics/object_detection.py
new file mode 100644
index 0000000000..0c08ae8fce
--- /dev/null
+++ b/unstructured/metrics/object_detection.py
@@ -0,0 +1,675 @@
+"""
+Implements object detection metrics: average precision, precision, recall, and f1 score.
+"""
+
+import json
+from dataclasses import dataclass
+from pathlib import Path
+
+import numpy as np
+import torch
+
+IOU_THRESHOLDS = torch.tensor(
+    [0.5000, 0.5500, 0.6000, 0.6500, 0.7000, 0.7500, 0.8000, 0.8500, 0.9000, 0.9500]
+)
+SCORE_THRESHOLD = 0.1
+RECALL_THRESHOLDS = torch.arange(0, 1.01, 0.01)
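+
+# Note: the IoU grid above (0.50 to 0.95 in steps of 0.05) mirrors the COCO-style
+# mAP@[0.5:0.95] convention; SCORE_THRESHOLD is the minimum confidence used for
+# precision/recall only (it does not affect mAP, which is sampled over RECALL_THRESHOLDS).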
+
+
+@dataclass
+class ObjectDetectionEvaluation:
+    """Class representing gathered object detection metrics."""
+
+    f1_score: float
+    precision: float
+    recall: float
+    m_ap: float
+
+
+class ObjectDetectionEvalProcessor:
+
+    iou_thresholds = IOU_THRESHOLDS
+    score_threshold = SCORE_THRESHOLD
+    recall_thresholds = RECALL_THRESHOLDS
+
+    def __init__(
+        self,
+        document_preds: list[torch.Tensor],
+        document_targets: list[torch.Tensor],
+        pages_height: list[int],
+        pages_width: list[int],
+        class_labels: list[str],
+        device: str = "cpu",
+    ):
+        """
+        Initializes the ObjectDetection prediction and ground truth.
+
+        Args:
+            document_preds (list): list (of length pages of document) of
+                Tensors of shape (num_predictions, 6)
+                format: (x1, y1, x2, y2, confidence, class_label)
+                where x1,y1,x2,y2 are according to image size
+            document_targets (list): list (of length pages of document) of
+                Tensors of shape (num_targets, 5)
+                format: (label, x1, y1, x2, y2)
+                where x1,y1,x2,y2 are according to image size
+            pages_height (list): list of height of each page in the document
+            pages_width (list): list of width of each page in the document
+            class_labels (list): list of class labels
+        """
+        self.device = device
+        self.document_preds = [pred.to(device) for pred in document_preds]
+        self.document_targets = [target.to(device) for target in document_targets]
+        self.pages_height = pages_height
+        self.pages_width = pages_width
+        self.num_cls = len(class_labels)
+
+    @classmethod
+    def from_json_files(
+        cls,
+        prediction_file_path: Path,
+        ground_truth_file_path: Path,
+    ) -> "ObjectDetectionEvalProcessor":
+        """
+        Initializes the ObjectDetection prediction and ground truth,
+        and converts the data to the required format.
+
+        Args:
+            prediction_file_path (Path): path to json file with predictions dump from OD model
+            ground_truth_file_path (Path): path to json file with OD ground truth data
+        """
+        # TODO: Test after https://unstructured-ai.atlassian.net/browse/ML-92
+        # is done.
+        with open(prediction_file_path) as f:
+            predictions_data = json.load(f)
+        with open(ground_truth_file_path) as f:
+            ground_truth_data = json.load(f)
+
+        assert (
+            predictions_data["object_detection_classes"]
+            == ground_truth_data["object_detection_classes"]
+        ), "Classes in predictions and ground truth do not match."
+        assert len(predictions_data["pages"]) == len(
+            ground_truth_data["pages"]
+        ), "Number of pages in predictions and ground truth does not match."
+        for pred_page, gt_page in zip(predictions_data["pages"], ground_truth_data["pages"]):
+            assert (
+                pred_page["size"] == gt_page["size"]
+            ), "Page sizes in predictions and ground truth do not match."
+
+        class_labels = predictions_data["object_detection_classes"]
+        document_preds = cls._process_data(predictions_data, class_labels, prediction=True)
+        document_targets = cls._process_data(ground_truth_data, class_labels)
+        pages_height, pages_width = cls._parse_page_dimensions(predictions_data)
+
+        return cls(document_preds, document_targets, pages_height, pages_width, class_labels)
+
+    @staticmethod
+    def _parse_page_dimensions(data: dict) -> tuple[list, list]:
+        """
+        Process the page dimensions from the json file to the required format.
+        """
+        pages_height = []
+        pages_width = []
+        for page in data["pages"]:
+            pages_height.append(page["size"]["height"])
+            pages_width.append(page["size"]["width"])
+        return pages_height, pages_width
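+
+    # A sketch of the JSON layout consumed above, inferred from the parsing code
+    # (class names and numbers are illustrative; "prob" appears only in prediction dumps):
+    #
+    # {
+    #     "object_detection_classes": ["Table", "Picture", ...],
+    #     "pages": [
+    #         {
+    #             "size": {"height": 1100, "width": 850},
+    #             "elements": [
+    #                 {"bbox": [x1, y1, x2, y2], "type": "Table", "prob": 0.98},
+    #                 ...
+    #             ]
+    #         },
+    #         ...
+    #     ]
+    # }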
+ """ + pages_list = [] + for page in data["pages"]: + page_elements = [] + for element in page["elements"]: + # Extract coordinates, confidence, and class label from each prediction + class_label = element["type"] + class_idx = class_labels.index(class_label) + x1, y1, x2, y2 = element["bbox"] + if prediction: + confidence = element["prob"] + page_elements.append([x1, y1, x2, y2, confidence, class_idx]) + else: + page_elements.append([class_idx, x1, y1, x2, y2]) + page_tensor = torch.tensor(page_elements) + pages_list.append(page_tensor) + + return pages_list + + @staticmethod + def _get_top_k_idx_per_cls( + preds_scores: torch.Tensor, preds_cls: torch.Tensor, top_k: int + ) -> torch.Tensor: + # From: https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/utils/detection_utils.py # noqa E501 + """ + Get the indexes of all the top k predictions for every class + + Args: + preds_scores: The confidence scores, vector of shape (n_pred) + preds_cls: The predicted class, vector of shape (n_pred) + top_k: Number of predictions to keep per class, ordered by confidence score + + Returns: + top_k_idx: Indexes of the top k predictions. length <= (k * n_unique_class) + """ + n_unique_cls = torch.max(preds_cls) + mask = preds_cls.view(-1, 1) == torch.arange( + n_unique_cls + 1, device=preds_scores.device + ).view(1, -1) + preds_scores_per_cls = preds_scores.view(-1, 1) * mask + + sorted_scores_per_cls, sorting_idx = preds_scores_per_cls.sort(0, descending=True) + idx_with_satisfying_scores = sorted_scores_per_cls[:top_k, :].nonzero(as_tuple=False) + top_k_idx = sorting_idx[idx_with_satisfying_scores.split(1, dim=1)] + return top_k_idx.view(-1) + + @staticmethod + def _change_bbox_bounds_for_image_size( + boxes: np.ndarray, img_shape: tuple[int, int] + ) -> np.ndarray: + # From: https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/utils/detection_utils.py # noqa E501 + """ + Clips bboxes to image boundaries. + + Args: + bboxes: Input bounding boxes in XYXY format of [..., 4] shape + img_shape: Image shape (height, width). + Returns: + clipped_boxes: Clipped bboxes in XYXY format of [..., 4] shape + """ + boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(min=0, max=img_shape[1]) + boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(min=0, max=img_shape[0]) + return boxes + + @staticmethod + def _box_iou(box1: torch.Tensor, box2: torch.Tensor) -> torch.Tensor: + # From: https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/utils/detection_utils.py # noqa E501 + """ + Return intersection-over-union (Jaccard index) of boxes. + Both sets of boxes are expected to be in (x1, y1, x2, y2) format. 
+
+    def _compute_targets(
+        self,
+        preds_box_xyxy: torch.Tensor,
+        preds_cls: torch.Tensor,
+        targets_box_xyxy: torch.Tensor,
+        targets_cls: torch.Tensor,
+        preds_matched: torch.Tensor,
+        targets_matched: torch.Tensor,
+        preds_idx_to_use: torch.Tensor,
+        iou_thresholds: torch.Tensor,
+    ) -> torch.Tensor:
+        # From: https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/utils/detection_utils.py  # noqa E501
+        """
+        Computes the matching targets based on IoU for regular scenarios.
+
+        Args:
+            preds_box_xyxy: (torch.Tensor) Predicted bounding boxes in XYXY format.
+            preds_cls: (torch.Tensor) Predicted classes.
+            targets_box_xyxy: (torch.Tensor) Target bounding boxes in XYXY format.
+            targets_cls: (torch.Tensor) Target classes.
+            preds_matched: (torch.Tensor) Tensor indicating which predictions are matched.
+            targets_matched: (torch.Tensor) Tensor indicating which targets are matched.
+            preds_idx_to_use: (torch.Tensor) Indices of predictions to use.
+            iou_thresholds: (torch.Tensor) IoU thresholds for which the matching is computed.
+
+        Returns:
+            preds_matched: Computed matching targets.
+        """
+        # shape = (n_preds x n_targets)
+        iou = self._box_iou(preds_box_xyxy[preds_idx_to_use], targets_box_xyxy)
+
+        # Fill IoU values at index (i, j) with 0 when the prediction (i) and target (j)
+        # are of different class
+        # Filling with 0 is equivalent to ignoring these values
+        # since we want IoU > iou_threshold > 0
+        cls_mismatch = preds_cls[preds_idx_to_use].view(-1, 1) != targets_cls.view(1, -1)
+        iou[cls_mismatch] = 0
+
+        # The matching priority is first detection confidence and then IoU value.
+        # The detection is already sorted by confidence in NMS,
+        # so here for each prediction we order the targets by iou.
+        sorted_iou, target_sorted = iou.sort(descending=True, stable=True)
+
+        # Only iterate over IoU values higher than min threshold to speed up the process
+        for pred_selected_i, target_sorted_i in (sorted_iou > iou_thresholds[0]).nonzero(
+            as_tuple=False
+        ):
+            # pred_selected_i and target_sorted_i are relative to filters/sorting,
+            # so we extract their absolute indexes
+            pred_i = preds_idx_to_use[pred_selected_i]
+            target_i = target_sorted[pred_selected_i, target_sorted_i]
+
+            # Vector[j], True when IoU(pred_i, target_i) is above the (j)th threshold
+            is_iou_above_threshold = sorted_iou[pred_selected_i, target_sorted_i] > iou_thresholds
+
+            # Vector[j], True when both pred_i and target_i are not matched yet
+            # for the (j)th threshold
+            are_candidates_free = torch.logical_and(
+                ~preds_matched[pred_i, :], ~targets_matched[target_i, :]
+            )
+
+            # Vector[j], True when (pred_i, target_i) can be matched for the (j)th threshold
+            are_candidates_good = torch.logical_and(is_iou_above_threshold, are_candidates_free)
+
+            # For every threshold (j) where target_i and pred_i can be matched together
+            # (are_candidates_good[j] == True)
+            # fill the matching placeholders with True
+            targets_matched[target_i, are_candidates_good] = True
+            preds_matched[pred_i, are_candidates_good] = True
+
+            # When all the targets are matched with a prediction for every IoU threshold, stop.
+            if targets_matched.all():
+                break
+
+        return preds_matched
+
+    def _compute_page_detection_matching(
+        self,
+        preds: torch.Tensor,
+        targets: torch.Tensor,
+        height: int,
+        width: int,
+        top_k: int = 100,
+        return_on_cpu: bool = True,
+    ) -> tuple:
+        # Adapted from: https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/utils/detection_utils.py  # noqa E501
+        """
+        Match predictions (NMS output) and the targets (ground truth) with respect to IoU
+        and confidence score for a given image.
+
+        Args:
+            preds: Tensor of shape (num_img_predictions, 6)
+                format: (x1, y1, x2, y2, confidence, class_label)
+                where x1,y1,x2,y2 are according to image size
+            targets: targets for this image of shape (num_img_targets, 5)
+                format: (label, x1, y1, x2, y2)
+                where x1,y1,x2,y2 are according to image size
+            height: height of the image
+            width: width of the image
+            top_k: Number of predictions to keep per class, ordered by confidence score
+            return_on_cpu: If True, the output will be returned on "CPU", otherwise it will be
+                returned on "device"
+
+        Returns:
+            preds_matched: Tensor of shape (num_img_predictions, n_thresholds)
+                True when prediction (i) is matched with a target with respect to
+                the (j)th threshold
+            preds_to_ignore: Tensor of shape (num_img_predictions, n_thresholds)
+                True when prediction (i) is matched with a crowd target with
+                respect to the (j)th threshold
+            preds_scores: Tensor of shape (num_img_predictions),
+                confidence score for every prediction
+            preds_cls: Tensor of shape (num_img_predictions),
+                predicted class for every prediction
+            targets_cls: Tensor of shape (num_img_targets),
+                ground truth class for every target
+        """
+        thresholds = self.iou_thresholds.to(device=self.device)
+        num_thresholds = len(thresholds)
+
+        if preds is None or len(preds) == 0:
+            preds_matched = torch.zeros((0, num_thresholds), dtype=torch.bool, device=self.device)
+            preds_to_ignore = torch.zeros((0, num_thresholds), dtype=torch.bool, device=self.device)
+            preds_scores = torch.tensor([], dtype=torch.float32, device=self.device)
+            preds_cls = torch.tensor([], dtype=torch.float32, device=self.device)
+            targets_cls = targets[:, 0].to(device=self.device)
+            return preds_matched, preds_to_ignore, preds_scores, preds_cls, targets_cls
+
+        preds_matched = torch.zeros(
+            len(preds), num_thresholds, dtype=torch.bool, device=self.device
+        )
+        targets_matched = torch.zeros(
+            len(targets), num_thresholds, dtype=torch.bool, device=self.device
+        )
+        preds_to_ignore = torch.zeros(
+            len(preds), num_thresholds, dtype=torch.bool, device=self.device
+        )
+
+        preds_cls, preds_box, preds_scores = preds[:, -1], preds[:, 0:4], preds[:, 4]
+        targets_cls, targets_box = targets[:, 0], targets[:, 1:5]
+
+        # Ignore all but the predictions that were top_k for their class
+        preds_idx_to_use = self._get_top_k_idx_per_cls(preds_scores, preds_cls, top_k)
+        preds_to_ignore[:, :] = True
+        preds_to_ignore[preds_idx_to_use] = False
+
+        if len(targets) > 0:  # or len(crowd_targets) > 0:
+            self._change_bbox_bounds_for_image_size(preds, (height, width))
+
+            preds_matched = self._compute_targets(
+                preds_box,
+                preds_cls,
+                targets_box,
+                targets_cls,
+                preds_matched,
+                targets_matched,
+                preds_idx_to_use,
+                thresholds,
+            )
+
+        return preds_matched, preds_to_ignore, preds_scores, preds_cls, targets_cls
+
+    def _compute_detection_metrics(
+        self,
+        preds_matched: torch.Tensor,
+        preds_to_ignore: torch.Tensor,
+        preds_scores: torch.Tensor,
+        preds_cls: torch.Tensor,
+        targets_cls: torch.Tensor,
+    ) -> tuple:
+        # Adapted from: https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/utils/detection_utils.py  # noqa E501
+        """
+        Compute the list of precision, recall, mAP and f1 for every class.
+
+        Args:
+            preds_matched: Tensor of shape (num_predictions, n_iou_thresholds)
+                True when prediction (i) is matched with a target with respect
+                to the (j)th IoU threshold
+            preds_to_ignore: Tensor of shape (num_predictions, n_iou_thresholds)
+                True when prediction (i) is matched with a crowd target with
+                respect to the (j)th IoU threshold
+            preds_scores: Tensor of shape (num_predictions),
+                confidence score for every prediction
+            preds_cls: Tensor of shape (num_predictions),
+                predicted class for every prediction
+            targets_cls: Tensor of shape (num_targets),
+                ground truth class for every target box to be detected
+
+        Returns:
+            ap, precision, recall, f1: Tensors of shape (n_class, nb_iou_thrs)
+            unique_classes: Vector with all unique target classes
+        """

+        preds_matched, preds_to_ignore = preds_matched.to(self.device), preds_to_ignore.to(
+            self.device
+        )
+        preds_scores, preds_cls, targets_cls = (
+            preds_scores.to(self.device),
+            preds_cls.to(self.device),
+            targets_cls.to(self.device),
+        )
+
+        recall_thresholds = self.recall_thresholds.to(self.device)
+        score_threshold = self.score_threshold
+
+        unique_classes = torch.unique(targets_cls).long()
+
+        n_class, nb_iou_thrs = len(unique_classes), preds_matched.shape[-1]
+
+        ap = torch.zeros((n_class, nb_iou_thrs), device=self.device)
+        precision = torch.zeros((n_class, nb_iou_thrs), device=self.device)
+        recall = torch.zeros((n_class, nb_iou_thrs), device=self.device)
+
+        for cls_i, class_value in enumerate(unique_classes):
+            cls_preds_idx, cls_targets_idx = (preds_cls == class_value), (
+                targets_cls == class_value
+            )
+            (
+                cls_ap,
+                cls_precision,
+                cls_recall,
+            ) = self._compute_detection_metrics_per_cls(
+                preds_matched=preds_matched[cls_preds_idx],
+                preds_to_ignore=preds_to_ignore[cls_preds_idx],
+                preds_scores=preds_scores[cls_preds_idx],
+                n_targets=cls_targets_idx.sum(),
+                recall_thresholds=recall_thresholds,
+                score_threshold=score_threshold,
+            )
+            ap[cls_i, :] = cls_ap
+            precision[cls_i, :] = cls_precision
+            recall[cls_i, :] = cls_recall
+
+        f1 = 2 * precision * recall / (precision + recall + 1e-16)
+        return ap, precision, recall, f1, unique_classes
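+
+    # Quick sanity check of the F1 line above (illustrative numbers): with precision 0.8
+    # and recall 0.6 at some IoU threshold, f1 = 2 * 0.8 * 0.6 / (0.8 + 0.6) ≈ 0.686;
+    # the 1e-16 term only guards against division by zero when both are 0.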
+
+    def _compute_detection_metrics_per_cls(
+        self,
+        preds_matched: torch.Tensor,
+        preds_to_ignore: torch.Tensor,
+        preds_scores: torch.Tensor,
+        n_targets: int,
+        recall_thresholds: torch.Tensor,
+        score_threshold: float,
+    ):
+        # Adapted from: https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/utils/detection_utils.py  # noqa E501
+        """
+        Compute the list of precision, recall and mAP of a given class for every recall threshold.
+
+        Args:
+            preds_matched: Tensor of shape (num_predictions, n_thresholds)
+                True when prediction (i) is matched with a target
+                with respect to the (j)th threshold
+            preds_to_ignore: Tensor of shape (num_predictions, n_thresholds)
+                True when prediction (i) is matched with a crowd target
+                with respect to the (j)th threshold
+            preds_scores: Tensor of shape (num_predictions),
+                confidence score for every prediction
+            n_targets: Number of target boxes of this class
+            recall_thresholds: Tensor of shape (max_n_rec_thresh)
+                list of recall thresholds used to compute mAP
+            score_threshold: Minimum confidence score to consider a prediction
+                for the computation of precision and recall (not mAP)
+
+        Returns:
+            ap, precision, recall: Tensors of shape (nb_thrs)
+        """

+        nb_iou_thrs = preds_matched.shape[-1]
+
+        tps = preds_matched
+        fps = torch.logical_and(
+            torch.logical_not(preds_matched), torch.logical_not(preds_to_ignore)
+        )
+
+        if len(tps) == 0:
+            return (
+                torch.zeros(nb_iou_thrs, device=self.device),
+                torch.zeros(nb_iou_thrs, device=self.device),
+                torch.zeros(nb_iou_thrs, device=self.device),
+            )
+
+        # Sort by decreasing score
+        dtype = (
+            torch.uint8
+            if preds_scores.is_cuda and preds_scores.dtype is torch.bool
+            else preds_scores.dtype
+        )
+        sort_ind = torch.argsort(preds_scores.to(dtype), descending=True)
+        tps = tps[sort_ind, :]
+        fps = fps[sort_ind, :]
+        preds_scores = preds_scores[sort_ind].contiguous()
+
+        # Rolling sum over the predictions
+        rolling_tps = torch.cumsum(tps, dim=0, dtype=torch.float)
+        rolling_fps = torch.cumsum(fps, dim=0, dtype=torch.float)
+
+        rolling_recalls = rolling_tps / n_targets
+        rolling_precisions = rolling_tps / (
+            rolling_tps + rolling_fps + torch.finfo(torch.float64).eps
+        )
+
+        # Reversed cummax to only have decreasing values
+        rolling_precisions = rolling_precisions.flip(0).cummax(0).values.flip(0)
+
+        # ==================
+        # RECALL & PRECISION
+
+        # We want the rolling precision/recall at index i so that:
+        # preds_scores[i-1] >= score_threshold > preds_scores[i]
+        # Note: torch.searchsorted works on increasing sequences and preds_scores is decreasing,
+        # so we work with "-"
+        # Note2: right=True due to negation
+        lowest_score_above_threshold = torch.searchsorted(
+            -preds_scores, -score_threshold, right=True
+        )
+
+        if (
+            lowest_score_above_threshold == 0
+        ):  # Here score_threshold > preds_scores[0], so no pred is above the threshold
+            recall = torch.zeros(nb_iou_thrs, device=self.device)
+            precision = torch.zeros(
+                nb_iou_thrs, device=self.device
+            )  # precision is not really defined when there is no pred, but we need a value
+        else:
+            recall = rolling_recalls[lowest_score_above_threshold - 1]
+            precision = rolling_precisions[lowest_score_above_threshold - 1]
+
+        # ==================
+        # AVERAGE PRECISION
+
+        # shape = (nb_iou_thrs, n_recall_thresholds)
+        recall_thresholds = recall_thresholds.view(1, -1).repeat(nb_iou_thrs, 1)
+
+        # We want the index i so that:
+        # rolling_recalls[i-1] < recall_thresholds[k] <= rolling_recalls[i]
+        # Note: when recall_thresholds[k] > max(rolling_recalls), i = len(rolling_recalls)
+        # Note2: we work with transpose (.T) to apply torch.searchsorted on the first dim
+        # instead of the last one
+        recall_threshold_idx = torch.searchsorted(
+            rolling_recalls.T.contiguous(), recall_thresholds, right=False
+        ).T
+
+        # When recall_thresholds[k] > max(rolling_recalls),
+        # rolling_precisions[i] is not defined, and we want precision = 0
+        rolling_precisions = torch.cat(
+            (rolling_precisions, torch.zeros(1, nb_iou_thrs, device=self.device)), dim=0
+        )
+
+        # shape = (n_recall_thresholds, nb_iou_thrs)
+        sampled_precision_points = torch.gather(
+            input=rolling_precisions, index=recall_threshold_idx, dim=0
+        )
+
+        # Average over the recall_thresholds
+        ap = sampled_precision_points.mean(0)
+
+        return ap, precision, recall
+
+    def get_metrics(self) -> ObjectDetectionEvaluation:
+        """Get per document OD metrics.
+
+        Returns:
+            ObjectDetectionEvaluation: dataclass with the document-level OD metrics
+        """
+        document_matchings = []
+        for preds, targets, height, width in zip(
+            self.document_preds, self.document_targets, self.pages_height, self.pages_width
+        ):
+            # iterate over each page
+            page_matching_tensors = self._compute_page_detection_matching(
+                preds=preds,
+                targets=targets,
+                height=height,
+                width=width,
+            )
+            document_matchings.append(page_matching_tensors)
+
+        # compute metrics for all detections and targets
+        mean_ap, mean_precision, mean_recall, mean_f1 = (
+            -1.0,
+            -1.0,
+            -1.0,
+            -1.0,
+        )
+        mean_ap_per_class = np.zeros(self.num_cls)
+
+        mean_precision_per_class = np.zeros(self.num_cls)
+        mean_recall_per_class = np.zeros(self.num_cls)
+        mean_f1_per_class = np.zeros(self.num_cls)
+
+        if len(document_matchings):
+            matching_info_tensors = [torch.cat(x, 0) for x in list(zip(*document_matchings))]
+
+            # shape (n_class, nb_iou_thresh)
+            (
+                ap_per_present_classes,
+                precision_per_present_classes,
+                recall_per_present_classes,
+                f1_per_present_classes,
+                present_classes,
+            ) = self._compute_detection_metrics(
+                *matching_info_tensors,
+            )
+
+            # Precision, recall and f1 are computed over the IoU threshold range and
+            # averaged over classes
+            # results before version 3.0.4 (Dec 11 2022) were computed only for the smallest
+            # value (i.e. IoU 0.5 if the metric is @0.5:0.95)
+            mean_precision, mean_recall, mean_f1 = (
+                precision_per_present_classes.mean(),
+                recall_per_present_classes.mean(),
+                f1_per_present_classes.mean(),
+            )
+
+            # mAP is averaged over IoU thresholds and over classes
+            mean_ap = ap_per_present_classes.mean()
+
+            # Fill array of per-class AP scores with values for classes that were present in the
+            # dataset
+            ap_per_class = ap_per_present_classes.mean(1)
+            precision_per_class = precision_per_present_classes.mean(1)
+            recall_per_class = recall_per_present_classes.mean(1)
+            f1_per_class = f1_per_present_classes.mean(1)
+            for i, class_index in enumerate(present_classes):
+                mean_ap_per_class[class_index] = float(ap_per_class[i])
+
+                mean_precision_per_class[class_index] = float(precision_per_class[i])
+                mean_recall_per_class[class_index] = float(recall_per_class[i])
+                mean_f1_per_class[class_index] = float(f1_per_class[i])
+
+        od_evaluation = ObjectDetectionEvaluation(
+            f1_score=float(mean_f1),
+            precision=float(mean_precision),
+            recall=float(mean_recall),
+            m_ap=float(mean_ap),
+        )
+
+        return od_evaluation
+
+
+if __name__ == "__main__":
+    from dataclasses import asdict
+
+    # Example usage
+    prediction_file_paths = [Path("pths/to/predictions.json"), Path("pths/to/predictions2.json")]
+    ground_truth_file_paths = [
+        Path("pths/to/ground_truth.json"),
+        Path("pths/to/ground_truth2.json"),
+    ]
+
+    for prediction_file_path, ground_truth_file_path in zip(
+        prediction_file_paths, ground_truth_file_paths
+    ):
+        eval_processor = ObjectDetectionEvalProcessor.from_json_files(
+            prediction_file_path, ground_truth_file_path
+        )
+
+        metrics: ObjectDetectionEvaluation = eval_processor.get_metrics()
+        print(f"Metrics for {ground_truth_file_path.name}:\n{asdict(metrics)}")

diff --git a/unstructured/metrics/utils.py b/unstructured/metrics/utils.py
index 317b503fd9..c490aa752b 100644
--- a/unstructured/metrics/utils.py
+++ b/unstructured/metrics/utils.py
@@ -2,6 +2,7 @@
 import os
 import re
 import statistics
+from pathlib import Path
 from typing import List, Optional, Union

 import click
@@ -98,7 +99,7 @@ def _display(df):


 def _write_to_file(
-    dir: str, filename: str, df: pd.DataFrame, mode: str = "w", overwrite: bool = True
+    directory: str, filename: str, df: pd.DataFrame, mode: str = "w", overwrite: bool = True
 ):
     """
     Save the metrics report to tsv file. The function allows an option 1) to choose `mode`
@@ -106,15 +107,17 @@
     """
     if mode not in ["w", "a"]:
         raise ValueError("Mode not supported. Mode must be one of [w, a].")
-    if dir and not os.path.exists(dir):
-        os.makedirs(dir)
+    if directory:
+        Path(directory).mkdir(parents=True, exist_ok=True)
     if "count" in df.columns:
         df["count"] = df["count"].astype(int)
     if "filename" in df.columns and "connector" in df.columns:
         df.sort_values(by=["connector", "filename"], inplace=True)
     if not overwrite:
-        filename = _get_non_duplicated_filename(dir, filename)
-    df.to_csv(os.path.join(dir, filename), sep="\t", mode=mode, index=False, header=(mode == "w"))
+        filename = _get_non_duplicated_filename(directory, filename)
+    df.to_csv(
+        os.path.join(directory, filename), sep="\t", mode=mode, index=False, header=(mode == "w")
+    )


 def _sorting_key(filename):
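
A minimal sketch (illustrative, mirroring the CLI command added above) of driving the new
calculator programmatically; the directory values are placeholders:

    from unstructured.metrics.evaluate import ObjectDetectionMetricsCalculator

    ObjectDetectionMetricsCalculator(
        documents_dir="structured-output",  # holds analysis/*/layout_dump/object_detection.json
        ground_truths_dir="od_gt",          # holds <document_name>.<ext>.json ground truth files
    ).on_files().calculate(export_dir="metrics", visualize_progress=False, display_agg_df=True)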