diff --git a/lite/tests/object_detection/test_f1.py b/lite/tests/object_detection/test_f1.py
new file mode 100644
index 000000000..078581559
--- /dev/null
+++ b/lite/tests/object_detection/test_f1.py
@@ -0,0 +1,470 @@
+import numpy as np
+from valor_lite.object_detection import DataLoader, Detection, MetricType
+from valor_lite.object_detection.computation import compute_precion_recall
+
+
+def test__compute_f1():
+
+    sorted_pairs = np.array(
+        [
+            # dt, gt, pd, iou, gl, pl, score,
+            [0.0, 0.0, 2.0, 0.25, 0.0, 0.0, 0.95],
+            [0.0, 0.0, 3.0, 0.33333, 0.0, 0.0, 0.9],
+            [0.0, 0.0, 4.0, 0.66667, 0.0, 0.0, 0.65],
+            [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.1],
+            [0.0, 0.0, 1.0, 0.5, 0.0, 0.0, 0.01],
+        ]
+    )
+
+    label_metadata = np.array([[1, 5, 0]])
+    iou_thresholds = np.array([0.1, 0.6])
+    score_thresholds = np.array([0.0])
+
+    (_, _, _, counts, _) = compute_precion_recall(
+        sorted_pairs,
+        label_metadata=label_metadata,
+        iou_thresholds=iou_thresholds,
+        score_thresholds=score_thresholds,
+    )
+
+    f1 = counts[:, :, :, 5]
+
+    # f1
+    expected = np.array(
+        [
+            [[1 / 3]],  # iou = 0.1
+            [[1 / 3]],  # iou = 0.6
+        ]
+    )
+    assert np.isclose(f1, expected).all()
+
+
+def test_f1_metrics_first_class(
+    basic_detections_first_class: list[Detection],
+    basic_rotated_detections_first_class: list[Detection],
+):
+    """
+    Basic object detection test.
+
+    groundtruths
+        datum uid1
+            box 1 - label v1 - tp
+            box 3 - label v2 - fn missing prediction
+        datum uid2
+            box 2 - label v1 - fn missing prediction
+
+    predictions
+        datum uid1
+            box 1 - label v1 - score 0.3 - tp
+        datum uid2
+            box 2 - label v2 - score 0.98 - fp
+    """
+    for input_, method in [
+        (basic_detections_first_class, DataLoader.add_bounding_boxes),
+        (basic_rotated_detections_first_class, DataLoader.add_polygons),
+    ]:
+        loader = DataLoader()
+        method(loader, input_)
+        evaluator = loader.finalize()
+
+        metrics = evaluator.evaluate(
+            iou_thresholds=[0.1, 0.6],
+            score_thresholds=[0.0, 0.5],
+        )
+
+        assert evaluator.ignored_prediction_labels == []
+        assert evaluator.missing_prediction_labels == []
+        assert evaluator.n_datums == 2
+        assert evaluator.n_labels == 1
+        assert evaluator.n_groundtruths == 2
+        assert evaluator.n_predictions == 1
+
+        # test F1
+        actual_metrics = [m.to_dict() for m in metrics[MetricType.F1]]
+        expected_metrics = [
+            {
+                "type": "F1",
+                "value": 2 / 3,
+                "parameters": {
+                    "iou_threshold": 0.1,
+                    "score_threshold": 0.0,
+                    "label": "v1",
+                },
+            },
+            {
+                "type": "F1",
+                "value": 2 / 3,
+                "parameters": {
+                    "iou_threshold": 0.6,
+                    "score_threshold": 0.0,
+                    "label": "v1",
+                },
+            },
+            {
+                "type": "F1",
+                "value": 0.0,
+                "parameters": {
+                    "iou_threshold": 0.1,
+                    "score_threshold": 0.5,
+                    "label": "v1",
+                },
+            },
+            {
+                "type": "F1",
+                "value": 0.0,
+                "parameters": {
+                    "iou_threshold": 0.6,
+                    "score_threshold": 0.5,
+                    "label": "v1",
+                },
+            },
+        ]
+        for m in actual_metrics:
+            assert m in expected_metrics
+        for m in expected_metrics:
+            assert m in actual_metrics
+
+
+def test_f1_metrics_second_class(
+    basic_detections_second_class: list[Detection],
+    basic_rotated_detections_second_class: list[Detection],
+):
+    """
+    Basic object detection test.
+
+    groundtruths
+        datum uid1
+            box 3 - label v2 - fn missing prediction
+        datum uid2
+            none
+    predictions
+        datum uid1
+            none
+        datum uid2
+            box 2 - label v2 - score 0.98 - fp
+    """
+    for input_, method in [
+        (basic_detections_second_class, DataLoader.add_bounding_boxes),
+        (basic_rotated_detections_second_class, DataLoader.add_polygons),
+    ]:
+        loader = DataLoader()
+        method(loader, input_)
+        evaluator = loader.finalize()
+
+        metrics = evaluator.evaluate(
+            iou_thresholds=[0.1, 0.6],
+            score_thresholds=[0.0, 0.5],
+        )
+
+        assert evaluator.ignored_prediction_labels == []
+        assert evaluator.missing_prediction_labels == []
+        assert evaluator.n_datums == 2
+        assert evaluator.n_labels == 1
+        assert evaluator.n_groundtruths == 1
+        assert evaluator.n_predictions == 1
+
+        # test F1
+        actual_metrics = [m.to_dict() for m in metrics[MetricType.F1]]
+        expected_metrics = [
+            {
+                "type": "F1",
+                "value": 0.0,
+                "parameters": {
+                    "iou_threshold": 0.1,
+                    "score_threshold": 0.0,
+                    "label": "v2",
+                },
+            },
+            {
+                "type": "F1",
+                "value": 0.0,
+                "parameters": {
+                    "iou_threshold": 0.6,
+                    "score_threshold": 0.0,
+                    "label": "v2",
+                },
+            },
+            {
+                "type": "F1",
+                "value": 0.0,
+                "parameters": {
+                    "iou_threshold": 0.1,
+                    "score_threshold": 0.5,
+                    "label": "v2",
+                },
+            },
+            {
+                "type": "F1",
+                "value": 0.0,
+                "parameters": {
+                    "iou_threshold": 0.6,
+                    "score_threshold": 0.5,
+                    "label": "v2",
+                },
+            },
+        ]
+        for m in actual_metrics:
+            assert m in expected_metrics
+        for m in expected_metrics:
+            assert m in actual_metrics
+
+
+def test_f1_false_negatives_single_datum_baseline(
+    false_negatives_single_datum_baseline_detections: list[Detection],
+):
+    """This is the baseline for the test below. In this case there are two predictions and
+    one groundtruth, but the highest-confidence prediction overlaps sufficiently with the
+    groundtruth, so the false negative is not penalized and the AP is 1.
+    """
+
+    loader = DataLoader()
+    loader.add_bounding_boxes(false_negatives_single_datum_baseline_detections)
+    evaluator = loader.finalize()
+
+    metrics = evaluator.evaluate(
+        iou_thresholds=[0.5],
+        score_thresholds=[0.0, 0.9],
+    )
+
+    actual_metrics = [m.to_dict() for m in metrics[MetricType.F1]]
+    expected_metrics = [
+        {
+            "type": "F1",
+            "value": 2 / 3,
+            "parameters": {
+                "iou_threshold": 0.5,
+                "score_threshold": 0.0,
+                "label": "value",
+            },
+        },
+        {
+            "type": "F1",
+            "value": 0.0,
+            "parameters": {
+                "iou_threshold": 0.5,
+                "score_threshold": 0.9,
+                "label": "value",
+            },
+        },
+    ]
+    for m in actual_metrics:
+        assert m in expected_metrics
+    for m in expected_metrics:
+        assert m in actual_metrics
+
+
+def test_f1_false_negatives_single_datum(
+    false_negatives_single_datum_detections: list[Detection],
+):
+    """Tests a case where a high-confidence false negative was not being penalized. The
+    difference between this test and the one above is that here the higher-confidence prediction
+    does not sufficiently overlap the groundtruth, so it is penalized and we get an AP of 0.5.
+    """
+
+    loader = DataLoader()
+    loader.add_bounding_boxes(false_negatives_single_datum_detections)
+    evaluator = loader.finalize()
+    metrics = evaluator.evaluate(
+        iou_thresholds=[0.5],
+        score_thresholds=[0.0],
+    )
+
+    actual_metrics = [m.to_dict() for m in metrics[MetricType.F1]]
+    expected_metrics = [
+        {
+            "type": "F1",
+            "value": 2 / 3,
+            "parameters": {
+                "iou_threshold": 0.5,
+                "score_threshold": 0.0,
+                "label": "value",
+            },
+        }
+    ]
+    for m in actual_metrics:
+        assert m in expected_metrics
+    for m in expected_metrics:
+        assert m in actual_metrics
+
+
+def test_f1_false_negatives_two_datums_one_empty_low_confidence_of_fp(
+    false_negatives_two_datums_one_empty_low_confidence_of_fp_detections: list[
+        Detection
+    ],
+):
+    """In this test we have
+    1. An image with a matching groundtruth and prediction (same class and high IOU)
+    2. A second image with an empty groundtruth annotation but a prediction with lower confidence
+    than the prediction on the first image.
+
+    In this case, the AP should be 1.0 since the false positive has lower confidence than the true positive.
+
+    """
+
+    loader = DataLoader()
+    loader.add_bounding_boxes(
+        false_negatives_two_datums_one_empty_low_confidence_of_fp_detections
+    )
+    evaluator = loader.finalize()
+    metrics = evaluator.evaluate(
+        iou_thresholds=[0.5],
+        score_thresholds=[0.0],
+    )
+
+    actual_metrics = [m.to_dict() for m in metrics[MetricType.F1]]
+    expected_metrics = [
+        {
+            "type": "F1",
+            "value": 2 / 3,
+            "parameters": {
+                "iou_threshold": 0.5,
+                "score_threshold": 0.0,
+                "label": "value",
+            },
+        }
+    ]
+    for m in actual_metrics:
+        assert m in expected_metrics
+    for m in expected_metrics:
+        assert m in actual_metrics
+
+
+def test_f1_false_negatives_two_datums_one_empty_high_confidence_of_fp(
+    false_negatives_two_datums_one_empty_high_confidence_of_fp_detections: list[
+        Detection
+    ],
+):
+    """In this test we have
+    1. An image with a matching groundtruth and prediction (same class and high IOU)
+    2. A second image with an empty groundtruth annotation and a prediction with higher confidence
+    than the prediction on the first image.
+
+    In this case, the AP should be 0.5 since the false positive has higher confidence than the true positive.
+    """
+
+    loader = DataLoader()
+    loader.add_bounding_boxes(
+        false_negatives_two_datums_one_empty_high_confidence_of_fp_detections
+    )
+    evaluator = loader.finalize()
+    metrics = evaluator.evaluate(
+        iou_thresholds=[0.5],
+        score_thresholds=[0.0],
+    )
+
+    actual_metrics = [m.to_dict() for m in metrics[MetricType.F1]]
+    expected_metrics = [
+        {
+            "type": "F1",
+            "value": 2 / 3,
+            "parameters": {
+                "iou_threshold": 0.5,
+                "score_threshold": 0.0,
+                "label": "value",
+            },
+        }
+    ]
+    for m in actual_metrics:
+        assert m in expected_metrics
+    for m in expected_metrics:
+        assert m in actual_metrics
+
+
+def test_f1_false_negatives_two_datums_one_only_with_different_class_low_confidence_of_fp(
+    false_negatives_two_datums_one_only_with_different_class_low_confidence_of_fp_detections: list[
+        Detection
+    ],
+):
+    """In this test we have
+    1. An image with a matching groundtruth and prediction (same class, `"value"`, and high IOU)
+    2. A second image with a groundtruth annotation with class `"other value"` and a prediction with lower confidence
+    than the prediction on the first image.
+
+    In this case, the AP for class `"value"` should be 1 since the false positive has lower confidence than the true positive.
+    AP for class `"other value"` should be 0 since there is no prediction for the `"other value"` groundtruth.
+    """
+    loader = DataLoader()
+    loader.add_bounding_boxes(
+        false_negatives_two_datums_one_only_with_different_class_low_confidence_of_fp_detections
+    )
+    evaluator = loader.finalize()
+    metrics = evaluator.evaluate(
+        iou_thresholds=[0.5],
+        score_thresholds=[0.0],
+    )
+
+    actual_metrics = [m.to_dict() for m in metrics[MetricType.F1]]
+    expected_metrics = [
+        {
+            "type": "F1",
+            "value": 2 / 3,
+            "parameters": {
+                "iou_threshold": 0.5,
+                "score_threshold": 0.0,
+                "label": "value",
+            },
+        },
+        {
+            "type": "F1",
+            "value": 0.0,
+            "parameters": {
+                "iou_threshold": 0.5,
+                "score_threshold": 0.0,
+                "label": "other value",
+            },
+        },
+    ]
+    for m in actual_metrics:
+        assert m in expected_metrics
+    for m in expected_metrics:
+        assert m in actual_metrics
+
+
+def test_f1_false_negatives_two_datums_one_only_with_different_class_high_confidence_of_fp(
+    false_negatives_two_images_one_only_with_different_class_high_confidence_of_fp_detections: list[
+        Detection
+    ],
+):
+    """In this test we have
+    1. An image with a matching groundtruth and prediction (same class, `"value"`, and high IOU)
+    2. A second image with a groundtruth annotation with class `"other value"` and a prediction with higher confidence
+    than the prediction on the first image.
+
+    In this case, the AP for class `"value"` should be 0.5 since the false positive has higher confidence than the true positive.
+    AP for class `"other value"` should be 0 since there is no prediction for the `"other value"` groundtruth.
+    """
+    loader = DataLoader()
+    loader.add_bounding_boxes(
+        false_negatives_two_images_one_only_with_different_class_high_confidence_of_fp_detections
+    )
+    evaluator = loader.finalize()
+    metrics = evaluator.evaluate(
+        iou_thresholds=[0.5],
+        score_thresholds=[0.0],
+    )
+
+    actual_metrics = [m.to_dict() for m in metrics[MetricType.F1]]
+    expected_metrics = [
+        {
+            "type": "F1",
+            "value": 2 / 3,
+            "parameters": {
+                "iou_threshold": 0.5,
+                "score_threshold": 0.0,
+                "label": "value",
+            },
+        },
+        {
+            "type": "F1",
+            "value": 0.0,
+            "parameters": {
+                "iou_threshold": 0.5,
+                "score_threshold": 0.0,
+                "label": "other value",
+            },
+        },
+    ]
+    for m in actual_metrics:
+        assert m in expected_metrics
+    for m in expected_metrics:
+        assert m in actual_metrics
diff --git a/lite/tests/object_detection/test_precision.py b/lite/tests/object_detection/test_precision.py
index 170a2b2e6..0a86d4e5e 100644
--- a/lite/tests/object_detection/test_precision.py
+++ b/lite/tests/object_detection/test_precision.py
@@ -1,4 +1,42 @@
+import numpy as np
 from valor_lite.object_detection import DataLoader, Detection, MetricType
+from valor_lite.object_detection.computation import compute_precion_recall
+
+
+def test__compute_precision():
+
+    sorted_pairs = np.array(
+        [
+            # dt, gt, pd, iou, gl, pl, score,
+            [0.0, 0.0, 2.0, 0.25, 0.0, 0.0, 0.95],
+            [0.0, 0.0, 3.0, 0.33333, 0.0, 0.0, 0.9],
+            [0.0, 0.0, 4.0, 0.66667, 0.0, 0.0, 0.65],
+            [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.1],
+            [0.0, 0.0, 1.0, 0.5, 0.0, 0.0, 0.01],
+        ]
+    )
+
+    label_metadata = np.array([[1, 5, 0]])
+    iou_thresholds = np.array([0.1, 0.6])
+    score_thresholds = np.array([0.0])
+
+    (_, _, _, counts, _) = compute_precion_recall(
+        sorted_pairs,
+        label_metadata=label_metadata,
+        iou_thresholds=iou_thresholds,
+        score_thresholds=score_thresholds,
+    )
+
+    precision = counts[:, :, :, 3]
+
+    # precision
+    expected = np.array(
+        [
+            [0.2],  # iou = 0.1
+            [0.2],  # iou = 0.6
+        ]
+    )
+    assert (precision == expected).all()
 
 
 def test_precision_metrics_first_class(
diff --git a/lite/tests/object_detection/test_recall.py b/lite/tests/object_detection/test_recall.py
index ab81f6ccb..662aa00e2 100644
--- a/lite/tests/object_detection/test_recall.py
+++ b/lite/tests/object_detection/test_recall.py
@@ -1,4 +1,42 @@
+import numpy as np
 from valor_lite.object_detection import DataLoader, Detection, MetricType
+from valor_lite.object_detection.computation import compute_precion_recall
+
+
+def test__compute_recall():
+
+    sorted_pairs = np.array(
+        [
+            # dt, gt, pd, iou, gl, pl, score,
+            [0.0, 0.0, 2.0, 0.25, 0.0, 0.0, 0.95],
+            [0.0, 0.0, 3.0, 0.33333, 0.0, 0.0, 0.9],
+            [0.0, 0.0, 4.0, 0.66667, 0.0, 0.0, 0.65],
+            [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.1],
+            [0.0, 0.0, 1.0, 0.5, 0.0, 0.0, 0.01],
+        ]
+    )
+
+    label_metadata = np.array([[1, 5, 0]])
+    iou_thresholds = np.array([0.1, 0.6])
+    score_thresholds = np.array([0.0])
+
+    (_, _, _, counts, _) = compute_precion_recall(
+        sorted_pairs,
+        label_metadata=label_metadata,
+        iou_thresholds=iou_thresholds,
+        score_thresholds=score_thresholds,
+    )
+
+    recall = counts[:, :, :, 4]
+
+    # recall
+    expected = np.array(
+        [
+            [1.0],  # iou = 0.1
+            [1.0],  # iou = 0.6
+        ]
+    )
+    assert (recall == expected).all()
 
 
 def test_recall_metrics_first_class(
diff --git a/lite/valor_lite/object_detection/computation.py b/lite/valor_lite/object_detection/computation.py
index 228c7d81e..3f21c8e1c 100644
--- a/lite/valor_lite/object_detection/computation.py
+++ b/lite/valor_lite/object_detection/computation.py
@@ -408,17 +408,20 @@ def compute_precion_recall(
 
     # calculate component metrics
     recall = np.zeros_like(tp_count)
-    precision = np.zeros_like(tp_count)
     np.divide(tp_count, gt_count, where=gt_count > 1e-9, out=recall)
+
+    precision = np.zeros_like(tp_count)
     np.divide(tp_count, pd_count, where=pd_count > 1e-9, out=precision)
+
     fn_count = gt_count - tp_count
 
     f1_score = np.zeros_like(precision)
     np.divide(
-        np.multiply(precision, recall),
+        2 * np.multiply(precision, recall),
         (precision + recall),
         where=(precision + recall) > 1e-9,
         out=f1_score,
+        dtype=np.float64,
    )
 
     counts[iou_idx][score_idx] = np.concatenate(
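
Note on the computation.py change above: the previous expression omitted the factor of 2 from the harmonic mean, so the corrected form is F1 = 2 * precision * recall / (precision + recall). The snippet below is a minimal standalone sanity check (plain NumPy, not part of the library) against the shared sorted_pairs fixture used by test__compute_precision, test__compute_recall, and test__compute_f1: one groundtruth, five predictions of the same label, score threshold 0.0, with exactly one prediction counted as a true positive at each IOU threshold. The tp/gt_count/pd_count names are illustrative, not the library's internals.

import numpy as np

tp = 1.0        # one prediction matches the single groundtruth at IOU 0.1 and at IOU 0.6
gt_count = 1.0  # groundtruths with this label
pd_count = 5.0  # predictions with this label kept at score_threshold=0.0

recall = tp / gt_count     # 1.0, as asserted in test__compute_recall
precision = tp / pd_count  # 0.2, as asserted in test__compute_precision

# corrected harmonic mean: 2PR / (P + R)
f1 = 2 * precision * recall / (precision + recall)
assert np.isclose(f1, 1 / 3)  # matches the expected value in test__compute_f1

# the old expression, missing the factor of 2, would have produced 1/6 here
assert np.isclose(precision * recall / (precision + recall), 1 / 6)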