Commit
Fix PR Curve FP Count (#717)
czaloom authored Aug 22, 2024
1 parent c996f6d commit df9770f
Showing 2 changed files with 174 additions and 6 deletions.
14 changes: 8 additions & 6 deletions api/valor_api/backend/metrics/detection.py
@@ -251,15 +251,13 @@ def _compute_curves(
         curves = defaultdict(lambda: defaultdict(dict))

         for confidence_threshold in [x / 100 for x in range(5, 100, 5)]:
+
+            tp_cnt, fp_cnt, fn_cnt = 0, 0, 0
+
             if label_id not in sorted_ranked_pairs:
-                tp_cnt = 0
                 if label_id in groundtruths_per_label:
                     fn_cnt = len(groundtruths_per_label[label_id])
-                else:
-                    fn_cnt = 0
-
             else:
-                tp_cnt, fn_cnt = 0, 0
                 seen_gts = set()

                 for row in sorted_ranked_pairs[label_id]:
@@ -270,6 +268,11 @@
                     ):
                         tp_cnt += 1
                         seen_gts.add(row.gt_id)
+                    elif (
+                        row.score >= confidence_threshold
+                        and row.iou < iou_threshold
+                    ):
+                        fp_cnt += 1

                 for (
                     _,
@@ -279,7 +282,6 @@
                     if gt_id not in seen_gts:
                         fn_cnt += 1

-            fp_cnt = 0
             for (
                 _,
                 _,
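
For context, here is a minimal standalone sketch of the per-threshold counting that this hunk produces. The helper name and the (score, iou, gt_id) tuple shape are illustrative assumptions, not the exact Valor internals:

# Illustrative sketch (not the exact Valor code) of the fixed per-threshold counting.
# Assumes ranked_pairs is a score-sorted list of (score, iou, gt_id) tuples for one
# label, and groundtruth_ids is the set of all ground-truth ids for that label.
def count_tp_fp_fn(ranked_pairs, groundtruth_ids, iou_threshold, confidence_threshold):
    tp_cnt, fp_cnt = 0, 0
    seen_gts = set()
    for score, iou, gt_id in ranked_pairs:
        if (
            score >= confidence_threshold
            and iou >= iou_threshold
            and gt_id not in seen_gts
        ):
            tp_cnt += 1
            seen_gts.add(gt_id)
        elif score >= confidence_threshold and iou < iou_threshold:
            # the fix: confident predictions that miss on IOU now count as false positives
            fp_cnt += 1
    fn_cnt = len(groundtruth_ids - seen_gts)  # ground truths never matched at this threshold
    return tp_cnt, fp_cnt, fn_cnt

With fp_cnt now initialized once at the top of the confidence-threshold loop, the false positives accumulated from confident low-IOU predictions are no longer wiped out by the later fp_cnt = 0 reset that this commit removes.
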
166 changes: 166 additions & 0 deletions integration_tests/client/metrics/test_detection.py
@@ -3495,3 +3495,169 @@ def test_evaluate_mixed_annotations(
    )
    eval_job_raster.wait_for_completion()
    assert eval_job_raster.status == EvaluationStatus.FAILED


def test_evaluate_detection_pr_fp(
    db: Session, model_name, dataset_name, img1, img2
):
    gts = [
        GroundTruth(
            datum=img1,
            annotations=[
                Annotation(
                    is_instance=True,
                    labels=[Label(key="k1", value="v1")],
                    bounding_box=Box.from_extrema(
                        xmin=0, xmax=5, ymin=0, ymax=5
                    ),
                )
            ],
        ),
        GroundTruth(
            datum=img2,
            annotations=[
                Annotation(
                    is_instance=True,
                    labels=[Label(key="k1", value="v1")],
                    bounding_box=Box.from_extrema(
                        xmin=0, xmax=5, ymin=0, ymax=5
                    ),
                )
            ],
        ),
    ]
    preds = [
        Prediction(
            datum=img1,
            annotations=[
                Annotation(
                    is_instance=True,
                    labels=[Label(key="k1", value="v1", score=0.8)],
                    bounding_box=Box.from_extrema(
                        xmin=0, xmax=5, ymin=0, ymax=5
                    ),
                )
            ],
        ),
        Prediction(
            datum=img2,
            annotations=[
                Annotation(
                    is_instance=True,
                    labels=[Label(key="k1", value="v1", score=0.8)],
                    bounding_box=Box.from_extrema(
                        xmin=10, xmax=20, ymin=10, ymax=20
                    ),
                )
            ],
        ),
    ]

    dataset = Dataset.create(dataset_name)

    for gt in gts:
        dataset.add_groundtruth(gt)
    dataset.finalize()

    model = Model.create(model_name)

    for pred in preds:
        model.add_prediction(dataset, pred)
    model.finalize_inferences(dataset)

    eval_job = model.evaluate_detection(
        dataset,
        metrics_to_return=[
            MetricType.PrecisionRecallCurve,
        ],
    )
    eval_job.wait_for_completion(timeout=30)
    metrics = eval_job.metrics
    assert metrics[0]["value"]["v1"]["0.5"] == {
        "fn": 1,  # img2
        "fp": 1,  # img2
        "tn": None,
        "tp": 1,  # img1
        "recall": 0.5,
        "accuracy": None,
        "f1_score": 0.5,
        "precision": 0.5,
    }

    # the score threshold is now higher than both prediction scores, so the predictions drop out and we're left with only 2 fns (one for each image)
    assert metrics[0]["value"]["v1"]["0.85"] == {
        "fn": 2,
        "fp": 0,
        "tn": None,
        "tp": 0,
        "recall": 0.0,
        "accuracy": None,
        "f1_score": -1,
        "precision": -1,
    }

    eval_job = model.evaluate_detection(
        dataset,
        metrics_to_return=[
            MetricType.DetailedPrecisionRecallCurve,
        ],
    )
    eval_job.wait_for_completion(timeout=30)
    metrics = eval_job.metrics

    score_threshold = "0.5"
    assert metrics[0]["value"]["v1"][score_threshold]["tp"]["total"] == 1
    assert "tn" not in metrics[0]["value"]["v1"][score_threshold]
    assert (
        metrics[0]["value"]["v1"][score_threshold]["fp"]["observations"][
            "hallucinations"
        ]["count"]
        == 1
    )
    assert (
        metrics[0]["value"]["v1"][score_threshold]["fp"]["observations"][
            "misclassifications"
        ]["count"]
        == 0
    )
    assert (
        metrics[0]["value"]["v1"][score_threshold]["fn"]["observations"][
            "no_predictions"
        ]["count"]
        == 1
    )
    assert (
        metrics[0]["value"]["v1"][score_threshold]["fn"]["observations"][
            "misclassifications"
        ]["count"]
        == 0
    )

    # the score threshold is now higher than both prediction scores, so the predictions drop out and we're left with only 2 fns (one for each image)
    score_threshold = "0.85"
    assert metrics[0]["value"]["v1"][score_threshold]["tp"]["total"] == 0
    assert "tn" not in metrics[0]["value"]["v1"][score_threshold]
    assert (
        metrics[0]["value"]["v1"][score_threshold]["fp"]["observations"][
            "hallucinations"
        ]["count"]
        == 0
    )
    assert (
        metrics[0]["value"]["v1"][score_threshold]["fp"]["observations"][
            "misclassifications"
        ]["count"]
        == 0
    )
    assert (
        metrics[0]["value"]["v1"][score_threshold]["fn"]["observations"][
            "no_predictions"
        ]["count"]
        == 2
    )
    assert (
        metrics[0]["value"]["v1"][score_threshold]["fn"]["observations"][
            "misclassifications"
        ]["count"]
        == 0
    )
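
As a quick sanity check on the asserted values: at score threshold 0.5 the prediction on img1 is a true positive, the prediction on img2 is confident but has zero IOU with its ground truth (a false positive), and that ground truth goes unmatched (a false negative). The snippet below just spells out the arithmetic; the -1 values asserted at the 0.85 threshold appear to be Valor's placeholder for precision and F1 being undefined when no predictions clear the score threshold.

# Arithmetic behind the asserted PR-curve entries at score threshold 0.5 (illustrative only).
tp, fp, fn = 1, 1, 1
precision = tp / (tp + fp)                          # 1 / 2 = 0.5
recall = tp / (tp + fn)                             # 1 / 2 = 0.5
f1 = 2 * precision * recall / (precision + recall)  # 0.5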
