From df9770f18a3620a3a6b07d126f36c3a7e0788b68 Mon Sep 17 00:00:00 2001
From: Charles Zaloom <38677807+czaloom@users.noreply.github.com>
Date: Thu, 22 Aug 2024 18:18:33 -0400
Subject: [PATCH] Fix PR Curve FP Count (#717)

---
 api/valor_api/backend/metrics/detection.py |  14 +-
 .../client/metrics/test_detection.py       | 166 ++++++++++++++++++
 2 files changed, 174 insertions(+), 6 deletions(-)

diff --git a/api/valor_api/backend/metrics/detection.py b/api/valor_api/backend/metrics/detection.py
index 4d1b62b87..3c6bbefa3 100644
--- a/api/valor_api/backend/metrics/detection.py
+++ b/api/valor_api/backend/metrics/detection.py
@@ -251,15 +251,13 @@ def _compute_curves(
         curves = defaultdict(lambda: defaultdict(dict))
 
         for confidence_threshold in [x / 100 for x in range(5, 100, 5)]:
+
+            tp_cnt, fp_cnt, fn_cnt = 0, 0, 0
+
             if label_id not in sorted_ranked_pairs:
-                tp_cnt = 0
                 if label_id in groundtruths_per_label:
                     fn_cnt = len(groundtruths_per_label[label_id])
-                else:
-                    fn_cnt = 0
-            else:
-                tp_cnt, fn_cnt = 0, 0
 
             seen_gts = set()
             for row in sorted_ranked_pairs[label_id]:
@@ -270,6 +268,11 @@ def _compute_curves(
                 ):
                     tp_cnt += 1
                     seen_gts.add(row.gt_id)
+                elif (
+                    row.score >= confidence_threshold
+                    and row.iou < iou_threshold
+                ):
+                    fp_cnt += 1
 
             for (
                 _,
@@ -279,7 +282,6 @@ def _compute_curves(
                 if gt_id not in seen_gts:
                     fn_cnt += 1
 
-            fp_cnt = 0
             for (
                 _,
                 _,
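The gist of the fix, as a minimal runnable sketch: all three counters now start at zero for every confidence threshold, and a confident prediction that fails the IoU cutoff is counted as a false positive inside the ranking loop instead of being dropped. RankedPair and count_tp_fp_fn below are hypothetical stand-ins for the internal types in detection.py, not the module's actual API; the real function additionally buckets fps into hallucinations and misclassifications for the detailed curve.

    from dataclasses import dataclass

    @dataclass
    class RankedPair:
        gt_id: int | None  # best-matching ground-truth id, None if nothing overlaps
        score: float       # prediction confidence
        iou: float         # IoU with that ground truth (0.0 if no overlap)

    def count_tp_fp_fn(
        pairs: list[RankedPair],
        gt_ids: set[int],
        confidence_threshold: float,
        iou_threshold: float = 0.5,
    ) -> tuple[int, int, int]:
        # all three counters reset for every threshold (the core of the fix)
        tp_cnt, fp_cnt, fn_cnt = 0, 0, 0
        seen_gts: set[int] = set()
        for row in pairs:
            if (
                row.score >= confidence_threshold
                and row.iou >= iou_threshold
                and row.gt_id not in seen_gts
            ):
                tp_cnt += 1
                seen_gts.add(row.gt_id)
            elif row.score >= confidence_threshold and row.iou < iou_threshold:
                # confident but poorly-localized predictions are false
                # positives; before this patch they were never counted
                fp_cnt += 1
        # every ground truth left unmatched is a false negative
        fn_cnt = len(gt_ids - seen_gts)
        return tp_cnt, fp_cnt, fn_cnt
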
diff --git a/integration_tests/client/metrics/test_detection.py b/integration_tests/client/metrics/test_detection.py
index cab317477..780be8616 100644
--- a/integration_tests/client/metrics/test_detection.py
+++ b/integration_tests/client/metrics/test_detection.py
@@ -3495,3 +3495,169 @@ def test_evaluate_mixed_annotations(
     )
     eval_job_raster.wait_for_completion()
     assert eval_job_raster.status == EvaluationStatus.FAILED
+
+
+def test_evaluate_detection_pr_fp(
+    db: Session, model_name, dataset_name, img1, img2
+):
+    gts = [
+        GroundTruth(
+            datum=img1,
+            annotations=[
+                Annotation(
+                    is_instance=True,
+                    labels=[Label(key="k1", value="v1")],
+                    bounding_box=Box.from_extrema(
+                        xmin=0, xmax=5, ymin=0, ymax=5
+                    ),
+                )
+            ],
+        ),
+        GroundTruth(
+            datum=img2,
+            annotations=[
+                Annotation(
+                    is_instance=True,
+                    labels=[Label(key="k1", value="v1")],
+                    bounding_box=Box.from_extrema(
+                        xmin=0, xmax=5, ymin=0, ymax=5
+                    ),
+                )
+            ],
+        ),
+    ]
+    preds = [
+        Prediction(
+            datum=img1,
+            annotations=[
+                Annotation(
+                    is_instance=True,
+                    labels=[Label(key="k1", value="v1", score=0.8)],
+                    bounding_box=Box.from_extrema(
+                        xmin=0, xmax=5, ymin=0, ymax=5
+                    ),
+                )
+            ],
+        ),
+        Prediction(
+            datum=img2,
+            annotations=[
+                Annotation(
+                    is_instance=True,
+                    labels=[Label(key="k1", value="v1", score=0.8)],
+                    bounding_box=Box.from_extrema(
+                        xmin=10, xmax=20, ymin=10, ymax=20
+                    ),
+                )
+            ],
+        ),
+    ]
+
+    dataset = Dataset.create(dataset_name)
+
+    for gt in gts:
+        dataset.add_groundtruth(gt)
+    dataset.finalize()
+
+    model = Model.create(model_name)
+
+    for pred in preds:
+        model.add_prediction(dataset, pred)
+    model.finalize_inferences(dataset)
+
+    eval_job = model.evaluate_detection(
+        dataset,
+        metrics_to_return=[
+            MetricType.PrecisionRecallCurve,
+        ],
+    )
+    eval_job.wait_for_completion(timeout=30)
+    metrics = eval_job.metrics
+    assert metrics[0]["value"]["v1"]["0.5"] == {
+        "fn": 1,  # img2
+        "fp": 1,  # img2
+        "tn": None,
+        "tp": 1,  # img1
+        "recall": 0.5,
+        "accuracy": None,
+        "f1_score": 0.5,
+        "precision": 0.5,
+    }
+
+    # the score threshold is now higher than the prediction scores, so the
+    # predictions drop out and we're left with only 2 fns (one per image)
+    assert metrics[0]["value"]["v1"]["0.85"] == {
+        "fn": 2,
+        "fp": 0,
+        "tn": None,
+        "tp": 0,
+        "recall": 0.0,
+        "accuracy": None,
+        "f1_score": -1,
+        "precision": -1,
+    }
+
+    eval_job = model.evaluate_detection(
+        dataset,
+        metrics_to_return=[
+            MetricType.DetailedPrecisionRecallCurve,
+        ],
+    )
+    eval_job.wait_for_completion(timeout=30)
+    metrics = eval_job.metrics
+
+    score_threshold = "0.5"
+    assert metrics[0]["value"]["v1"][score_threshold]["tp"]["total"] == 1
+    assert "tn" not in metrics[0]["value"]["v1"][score_threshold]
+    assert (
+        metrics[0]["value"]["v1"][score_threshold]["fp"]["observations"][
+            "hallucinations"
+        ]["count"]
+        == 1
+    )
+    assert (
+        metrics[0]["value"]["v1"][score_threshold]["fp"]["observations"][
+            "misclassifications"
+        ]["count"]
+        == 0
+    )
+    assert (
+        metrics[0]["value"]["v1"][score_threshold]["fn"]["observations"][
+            "no_predictions"
+        ]["count"]
+        == 1
+    )
+    assert (
+        metrics[0]["value"]["v1"][score_threshold]["fn"]["observations"][
+            "misclassifications"
+        ]["count"]
+        == 0
+    )
+
+    # the score threshold is now higher than the prediction scores, so the
+    # predictions drop out and we're left with only 2 fns (one per image)
+    score_threshold = "0.85"
+    assert metrics[0]["value"]["v1"][score_threshold]["tp"]["total"] == 0
+    assert "tn" not in metrics[0]["value"]["v1"][score_threshold]
+    assert (
+        metrics[0]["value"]["v1"][score_threshold]["fp"]["observations"][
+            "hallucinations"
+        ]["count"]
+        == 0
+    )
+    assert (
+        metrics[0]["value"]["v1"][score_threshold]["fp"]["observations"][
+            "misclassifications"
+        ]["count"]
+        == 0
+    )
+    assert (
+        metrics[0]["value"]["v1"][score_threshold]["fn"]["observations"][
+            "no_predictions"
+        ]["count"]
+        == 2
+    )
+    assert (
+        metrics[0]["value"]["v1"][score_threshold]["fn"]["observations"][
+            "misclassifications"
+        ]["count"]
+        == 0
+    )
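Run against the scenario in the new test, the sketch above reproduces the asserted counts. The IoU values of 1.0 and 0.0 and the ground-truth ids are illustrative assumptions: img1's prediction matches its ground truth exactly, img2's misses entirely.

    pairs = [
        RankedPair(gt_id=1, score=0.8, iou=1.0),     # img1
        RankedPair(gt_id=None, score=0.8, iou=0.0),  # img2
    ]
    gt_ids = {1, 2}  # one ground truth per image

    print(count_tp_fp_fn(pairs, gt_ids, confidence_threshold=0.5))
    # (1, 1, 1): tp=1, fp=1, fn=1, matching the "0.5" assertions
    print(count_tp_fp_fn(pairs, gt_ids, confidence_threshold=0.85))
    # (0, 0, 2): both predictions drop below the cutoff, leaving two fns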