diff --git a/.github/workflows/benchmark-evaluations.yml b/.github/workflows/client-api-benchmark-evaluations.yml similarity index 97% rename from .github/workflows/benchmark-evaluations.yml rename to .github/workflows/client-api-benchmark-evaluations.yml index 8d19e0d8b..078a66237 100644 --- a/.github/workflows/benchmark-evaluations.yml +++ b/.github/workflows/client-api-benchmark-evaluations.yml @@ -1,4 +1,4 @@ -name: Run benchmarks on pre-existing data +name: Run API + client benchmarks on: push: diff --git a/.github/workflows/tests-and-coverage.yml b/.github/workflows/client-api-tests-and-coverage.yml similarity index 99% rename from .github/workflows/tests-and-coverage.yml rename to .github/workflows/client-api-tests-and-coverage.yml index e415c899c..c16c39026 100644 --- a/.github/workflows/tests-and-coverage.yml +++ b/.github/workflows/client-api-tests-and-coverage.yml @@ -1,4 +1,4 @@ -name: Unit, functional, integration tests and code coverage +name: Run API + client code coverage report on: push: diff --git a/.github/workflows/core-benchmark-evaluations.yml b/.github/workflows/core-benchmark-evaluations.yml new file mode 100644 index 000000000..e145b6ec6 --- /dev/null +++ b/.github/workflows/core-benchmark-evaluations.yml @@ -0,0 +1,38 @@ +name: Run core benchmarks + +on: + push: + branches: "**" + +permissions: + id-token: write + contents: read + +jobs: + run-benchmarks: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: install core + run: pip install -e . + working-directory: ./core + - name: run classification benchmarks + run: python benchmark_script.py + working-directory: ./core/benchmarks/classification + - name: print classification results + run: | + export BENCHMARK_RESULTS=$(python -c "import os;import json;print(json.dumps(json.load(open('results.json', 'r')), indent=4));") + echo "$BENCHMARK_RESULTS" + working-directory: ./core/benchmarks/classification + - name: run object detection benchmarks + run: python benchmark_script.py + working-directory: ./core/benchmarks/object-detection + - name: print object detection results + run: | + export BENCHMARK_RESULTS=$(python -c "import os;import json;print(json.dumps(json.load(open('results.json', 'r')), indent=4));") + echo "$BENCHMARK_RESULTS" + working-directory: ./core/benchmarks/object-detection + - run: make stop-env diff --git a/.github/workflows/core-tests-and-coverage.yml b/.github/workflows/core-tests-and-coverage.yml new file mode 100644 index 000000000..96762a2d7 --- /dev/null +++ b/.github/workflows/core-tests-and-coverage.yml @@ -0,0 +1,36 @@ +name: Run core code coverage report + +on: + push: + branches: "**" + +permissions: + id-token: write + contents: read + +jobs: + core-tests: + runs-on: ubuntu-latest + defaults: + run: + working-directory: . 
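+    # The step below installs core with its test extras, runs the functional and
+    # unit test suites under coverage, combines the results, and fails the job
+    # if total coverage falls below 90%.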
+ steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: run tests and report coverage + run: | + pip install -e ".[test]" + COVERAGE_FILE=.coverage.functional python -m coverage run --omit "tests/*" -m pytest -v tests/functional-tests + COVERAGE_FILE=.coverage.unit python -m coverage run --omit "tests/*" -m pytest -v tests/unit-tests + python -m coverage combine + python -m coverage report -m + python -m coverage json + export TOTAL=$(python -c "import json;print(json.load(open('coverage.json'))['totals']['percent_covered_display'])") + echo "total=$TOTAL" >> $GITHUB_ENV + if (( $TOTAL < 90 )); then + echo "Coverage is below 90%" + exit 1 + fi + working-directory: ./core diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fa4ade5f7..83b21935e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,11 +32,12 @@ repos: rev: v1.1.376 hooks: - id: pyright - additional_dependencies: - [ + additional_dependencies: [ "requests", "Pillow >= 9.1.0", "numpy", + "pandas>=2.2.2", + "pandas-stubs", # fixes pyright issues with pandas "pytest", "python-dotenv", "SQLAlchemy>=2.0", @@ -44,7 +45,6 @@ repos: "importlib_metadata; python_version < '3.8'", "pydantic-settings", "tqdm", - "pandas", "packaging", "PyJWT[crypto]", "structlog", @@ -57,4 +57,5 @@ repos: "nltk", "rouge_score", "evaluate", + "shapely", ] diff --git a/core/LICENSE b/core/LICENSE new file mode 100644 index 000000000..2965db998 --- /dev/null +++ b/core/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Striveworks + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/core/README.md b/core/README.md new file mode 100644 index 000000000..be1bd6423 --- /dev/null +++ b/core/README.md @@ -0,0 +1,238 @@ +# valor_core: Compute classification, object detection, and segmentation metrics locally. + +Valor is a centralized evaluation store which makes it easy to measure, explore, and rank model performance. Valor empowers data scientists and engineers to evaluate the performance of their machine learning pipelines and use those evaluations to make better modeling decisions in the future. + +`valor_core` is the start of a new backbone for Valor's metric calculations. In the future, the Valor API will import `valor_core`'s evaluation functions in order to efficiently compute its classification, object detection, and segmentation metrics. 
This module offers a few advantages over the existing `valor` evaluation implementations, including: +- The ability to calculate metrics locally, without running separate database and API services +- Faster compute times due to the use of vectors and arrays +- Easier testing, debugging, and benchmarking due to the separation of concerns between evaluation computations and Postgres operations (e.g., filtering, querying) + +Valor is maintained by Striveworks, a cutting-edge MLOps company based out of Austin, Texas. We'd love to learn more about your interest in Valor and answer any questions you may have; please don't hesitate to reach out to us on [Slack](https://striveworks-public.slack.com/join/shared_invite/zt-1a0jx768y-2J1fffN~b4fXYM8GecvOhA#/shared-invite/email) or [GitHub](https://github.com/striveworks/valor). + +For more information, please see our [user docs](https://striveworks.github.io/valor/). + +## Usage + +### Passing Lists of GroundTruth and Prediction Objects + +The first way to use `valor_core` is to pass a list of groundtruth and prediction objects to an `evaluate_...` function, like so: + +```python + +groundtruths = [ + schemas.GroundTruth( + datum=img1, + annotations=... + ), … +] +predictions = [ + schemas.Prediction( + datum=img1, + annotations=... + ), … +] + +evaluation = evaluate_detection( + groundtruths=groundtruths, + predictions=predictions, + metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + enums.MetricType.PrecisionRecallCurve, + enums.MetricType.DetailedPrecisionRecallCurve, + ], + pr_curve_iou_threshold=0.5, + pr_curve_max_examples=1, + ) +``` + +### Passing DataFrames + +The second way to use `valor_core` is to pass in a dataframe of groundtruths and predictions: + +```python + +groundtruth_df = pd.DataFrame( + [ + { + "datum_id": 1, + "datum_uid": "uid1", + "id": 1, + "annotation_id": 1, + "label_id": 1, + "label_key": "k1", + "label_value": "v1", + "is_instance": True, + "polygon": schemas.Polygon.from_dict( + { + "type": "Polygon", + "coordinates": [ + [[10, 10], [60, 10], [60, 40], [10, 40], [10, 10]] + ], + } + ), + "raster": None, + "bounding_box": None, + }, + { + "datum_id": 1, + "datum_uid": "uid1", + "id": 2, + "annotation_id": 2, + "label_id": 2, + "label_key": "k2", + "label_value": "v2", + "is_instance": True, + "polygon": schemas.Polygon.from_dict( + { + "type": "Polygon", + "coordinates": [ + [ + [87, 10], + [158, 10], + [158, 820], + [87, 820], + [87, 10], + ] + ], + } + ), + "raster": None, + "bounding_box": None, + }, + { + "datum_id": 2, + "datum_uid": "uid2", + "id": 3, + "annotation_id": 3, + "label_id": 1, + "label_key": "k1", + "label_value": "v1", + "is_instance": True, + "polygon": schemas.Polygon.from_dict( + { + "type": "Polygon", + "coordinates": [ + [[15, 0], [70, 0], [70, 20], [15, 20], [15, 0]] + ], + } + ), + "raster": None, + "bounding_box": None, + }, + ] +) +prediction_df = pd.DataFrame( + [ + { + "id": 1, + "annotation_id": 4, + "score": 0.3, + "datum_id": 1, + "datum_uid": "uid1", + "label_id": 1, + "label_key": "k1", + "label_value": "v1", + "is_instance": True, + "polygon": schemas.Polygon.from_dict( + { + "type": "Polygon", + "coordinates": [ + [[10, 10], [60, 10], [60, 40], [10, 40], [10, 10]] + ], + } + ), + "raster": None, + "bounding_box": None, + }, + { + "id": 2, + "annotation_id": 5, + "score": 0.98, + "datum_id": 2, + "datum_uid": "uid2", + "label_id": 2, + 
"label_key": "k2", + "label_value": "v2", + "is_instance": True, + "polygon": schemas.Polygon.from_dict( + { + "type": "Polygon", + "coordinates": [ + [[15, 0], [70, 0], [70, 20], [15, 20], [15, 0]] + ], + } + ), + "raster": None, + "bounding_box": None, + }, + ] +) + +evaluation = evaluate_detection( + groundtruths=groundtruth_df, + predictions=prediction_df, + metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + enums.MetricType.PrecisionRecallCurve, + enums.MetricType.DetailedPrecisionRecallCurve, + ], + pr_curve_iou_threshold=0.5, + pr_curve_max_examples=1, + ) +``` + +## Using a Data Manager + +Finally, you can use a manager class (i.e., `ValorDetectionManager`) to run your evaluation. The advantage to using a manager class is a) you won't have to keep all annotation types in memory in a large list and b) we can pre-compute certain columns (i.e., `iou`) in advance of the `.evaluate()` call. + + +```python +manager = valor_core.ValorDetectionManager(...) +img1 = schemas.Datum( + uid="uid1", + metadata={ + "height": image_height, + "width": image_width, + }, + ) +groundtruths = [ + schemas.GroundTruth( + datum=img1, + annotations=... + ), … +] +predictions = [ + schemas.Prediction( + datum=img1, + annotations=... + ), … +] + + +# the user passes a list of all groundtruths and predictions for a list of datums +# this allows us to precompute IOUs at the datum_uid + label_key level +manager.add_data(groundtruths=groundtruths, predictions=predictions) + +# the user calls .evaluate() to compute the evaluation +evaluation = manager.evaluate() + +# the user must pass all groundtruths and predictions for a given datum at once +# this restriction makes it so we can compute IOUs right away and throw away excess info like rasters, saving a significant amount of memory +with pytest.raises(ValueError): + manager.add_data_for_datum(groundtruths=groundtruths, predictions=predictions) # throws error since img1 has already been added to the manager's data + +# the user must also specify the label map, `convert_annotation_to_type`, etc. 
when instantiating the object +# once set, these attributes can't be changed since subsequent IOU calculations will become apples-to-oranges with prior calculations +with pytest.raises(ValueError): + manager.label_map = some_label_map # throws an error since label map can't be changed, only instantiated +``` \ No newline at end of file diff --git a/core/benchmarks/.gitignore b/core/benchmarks/.gitignore new file mode 100644 index 000000000..94a2dd146 --- /dev/null +++ b/core/benchmarks/.gitignore @@ -0,0 +1 @@ +*.json \ No newline at end of file diff --git a/core/benchmarks/classification/benchmark_script.py b/core/benchmarks/classification/benchmark_script.py new file mode 100644 index 000000000..1d13999cf --- /dev/null +++ b/core/benchmarks/classification/benchmark_script.py @@ -0,0 +1,270 @@ +import json +import os +import time +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path + +import requests +from valor_core import ( + Annotation, + Datum, + GroundTruth, + Label, + Prediction, + enums, + evaluate_classification, +) + + +def time_it(fn, *args, **kwargs) -> tuple[float, dict]: + start = time.time() + results = fn(*args, **kwargs) + return (time.time() - start, results) + + +def download_data_if_not_exists(file_path: Path, file_url: str): + """Download the data from a public bucket if it doesn't exist in the repo.""" + if os.path.exists(file_path): + return + + response = json.loads(requests.get(file_url).text) + with open(file_path, "w+") as file: + json.dump(response, file, indent=4) + + +def write_results_to_file(write_path: Path, results: list[dict]): + """Write results to results.json""" + current_datetime = datetime.now().strftime("%d/%m/%Y %H:%M:%S") + if os.path.isfile(write_path): + with open(write_path, "r") as file: + file.seek(0) + data = json.load(file) + else: + data = {} + + data[current_datetime] = results + + with open(write_path, "w+") as file: + json.dump(data, file, indent=4) + + +def ingest_groundtruths(raw: dict, pair_limit: int) -> list[GroundTruth]: + """Ingest the data into Valor.""" + + groundtruths = [] + slice_ = ( + raw["groundtruth_prediction_pairs"][:pair_limit] + if pair_limit != -1 + else raw["groundtruth_prediction_pairs"] + ) + for groundtruth, prediction in slice_: + groundtruths.append( + GroundTruth( + datum=Datum( + uid=groundtruth["value"]["datum"]["uid"], + metadata={"width": 224, "height": 224}, + ), + annotations=[ + Annotation( + labels=[ + Label( + key=label["key"], + value=label["value"], + score=label["score"], + ) + for label in annotation["labels"] + ], + ) + for annotation in groundtruth["value"]["annotations"] + ], + ) + ) + + return groundtruths + + +def ingest_predictions(raw: dict, pair_limit: int) -> list[Prediction]: + """Ingest the data into Valor.""" + + predictions = [] + slice_ = ( + raw["groundtruth_prediction_pairs"][:pair_limit] + if pair_limit != -1 + else raw["groundtruth_prediction_pairs"] + ) + for _, prediction in slice_: + predictions.append( + Prediction( + datum=Datum( + uid=prediction["value"]["datum"]["uid"], + metadata={"width": 224, "height": 224}, + ), + annotations=[ + Annotation( + labels=[ + Label( + key=label["key"], + value=label["value"], + score=label["score"], + ) + for label in annotation["labels"] + ], + ) + for annotation in prediction["value"]["annotations"] + ], + ) + ) + + return predictions + + +def run_base_evaluation(groundtruths, predictions): + """Run a base evaluation (with no PR curves).""" + evaluation = evaluate_classification(groundtruths, 
predictions) + return evaluation + + +def run_pr_curve_evaluation(groundtruths, predictions): + """Run a base evaluation with PrecisionRecallCurve included.""" + evaluation = evaluate_classification( + groundtruths=groundtruths, + predictions=predictions, + metrics_to_return=[ + enums.MetricType.Accuracy, + enums.MetricType.Precision, + enums.MetricType.Recall, + enums.MetricType.F1, + enums.MetricType.ROCAUC, + enums.MetricType.PrecisionRecallCurve, + ], + ) + return evaluation + + +def run_detailed_pr_curve_evaluation(groundtruths, predictions): + """Run a base evaluation with PrecisionRecallCurve and DetailedPrecisionRecallCurve included.""" + evaluation = evaluate_classification( + groundtruths=groundtruths, + predictions=predictions, + metrics_to_return=[ + enums.MetricType.Accuracy, + enums.MetricType.Precision, + enums.MetricType.Recall, + enums.MetricType.F1, + enums.MetricType.ROCAUC, + enums.MetricType.PrecisionRecallCurve, + enums.MetricType.DetailedPrecisionRecallCurve, + ], + ) + return evaluation + + +@dataclass +class DataBenchmark: + ingestion: float + + def result(self) -> dict[str, float | str]: + return { + "ingestion": round(self.ingestion, 2), + } + + +@dataclass +class EvaluationBenchmark: + limit: int + gt_stats: DataBenchmark + pd_stats: DataBenchmark + n_datums: int + n_annotations: int + n_labels: int + eval_base: float + eval_base_pr: float + eval_base_pr_detail: float + + def result(self) -> dict[str, float | str | dict[str, str | float]]: + return { + "limit": self.limit, + "groundtruths": self.gt_stats.result(), + "predictions": self.pd_stats.result(), + "evaluation": { + "number_of_datums": self.n_datums, + "number_of_annotations": self.n_annotations, + "number_of_labels": self.n_labels, + "base": round(self.eval_base, 2), + "base+pr": round(self.eval_base_pr, 2), + "base+pr+detailed": round(self.eval_base_pr_detail, 2), + }, + } + + +def run_benchmarking_analysis( + limits: list[int], + results_file: str = "results.json", + data_file: str = "data.json", +): + """Time various function calls and export the results.""" + current_directory = Path(os.path.dirname(os.path.realpath(__file__))) + write_path = current_directory / Path(results_file) + data_path = current_directory / Path(data_file) + + download_data_if_not_exists( + file_path=data_path, + file_url="https://pub-fae71003f78140bdaedf32a7c8d331d2.r2.dev/classification_data.json", + ) + + with open(data_path) as file: + file.seek(0) + raw_data = json.load(file) + + results = list() + for limit in limits: + + # ingest groundtruths + gt_ingest_time, groundtruths = time_it( + ingest_groundtruths, + raw=raw_data, + pair_limit=limit, + ) + + # ingest predictions + pd_ingest_time, predictions = time_it( + ingest_predictions, + raw=raw_data, + pair_limit=limit, + ) + + # run evaluations + eval_base = run_base_evaluation(groundtruths, predictions) + eval_pr = run_pr_curve_evaluation(groundtruths, predictions) + eval_detail = run_detailed_pr_curve_evaluation( + groundtruths, predictions + ) + + assert eval_base.meta + assert eval_pr.meta + assert eval_detail.meta + + results.append( + EvaluationBenchmark( + limit=limit, + gt_stats=DataBenchmark( + ingestion=gt_ingest_time, + ), + pd_stats=DataBenchmark( + ingestion=pd_ingest_time, + ), + n_datums=eval_base.meta["datums"], + n_annotations=eval_base.meta["annotations"], + n_labels=eval_base.meta["labels"], + eval_base=eval_base.meta["duration"], + eval_base_pr=eval_pr.meta["duration"], + eval_base_pr_detail=eval_detail.meta["duration"], + ).result() + ) + + 
write_results_to_file(write_path=write_path, results=results) + + +if __name__ == "__main__": + run_benchmarking_analysis(limits=[5000, 5000]) diff --git a/core/benchmarks/object-detection/benchmark_script.py b/core/benchmarks/object-detection/benchmark_script.py new file mode 100644 index 000000000..cd794f64e --- /dev/null +++ b/core/benchmarks/object-detection/benchmark_script.py @@ -0,0 +1,432 @@ +import io +import json +import os +import re +from base64 import b64decode +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from time import time + +import numpy as np +import PIL.Image +import requests +from tqdm import tqdm +from valor_core import ( + Annotation, + Box, + Datum, + GroundTruth, + Label, + Polygon, + Prediction, + Raster, + enums, + evaluate_detection, +) +from valor_core.enums import AnnotationType + + +def time_it(fn, *args, **kwargs): + start = time() + results = fn(*args, **kwargs) + return (time() - start, results) + + +def download_data_if_not_exists( + file_name: str, + file_path: Path, + url: str, +): + """Download the data from a public bucket if it doesn't exist locally.""" + + if not os.path.exists(file_path): + response = requests.get(url, stream=True) + if response.status_code == 200: + total_size = int(response.headers.get("content-length", 0)) + with open(file_path, "wb") as f: + with tqdm( + total=total_size, + unit="B", + unit_scale=True, + unit_divisor=1024, + desc=file_name, + ) as pbar: + for chunk in response.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + pbar.update(1024) + else: + raise RuntimeError(response) + else: + print(f"{file_name} already exists locally.") + + +def write_results_to_file(write_path: Path, results: list[dict]): + """Write results to results.json""" + current_datetime = datetime.now().strftime("%d/%m/%Y %H:%M:%S") + if os.path.isfile(write_path): + with open(write_path, "r") as file: + file.seek(0) + data = json.load(file) + else: + data = {} + + data[current_datetime] = results + + with open(write_path, "w+") as file: + json.dump(data, file, indent=4) + + +def ingest_groundtruths( + dtype: AnnotationType, + path: Path, + limit: int, +) -> list[GroundTruth]: + groundtruths = [] + with open(path, "r") as f: + for line in f: + gt_dict = json.loads(line) + gt_dict["datum"].pop("text") + gt_dict["datum"] = Datum(**gt_dict["datum"]) + + annotations = [] + for ann in gt_dict["annotations"]: + ann.pop("text") + ann.pop("context_list") + + labels = [] + for label in ann["labels"]: + labels.append(Label(**label)) + ann["labels"] = labels + + if ann["bounding_box"] and dtype == AnnotationType.BOX: + ann["bounding_box"] = Box(ann["bounding_box"]) + annotations.append(Annotation(**ann)) + + if ann["polygon"] and dtype == AnnotationType.POLYGON: + ann["polygon"] = Polygon(ann["polygon"]) + annotations.append(Annotation(**ann)) + + if ann["raster"] and dtype == AnnotationType.RASTER: + mask_bytes = b64decode(ann["raster"]["mask"]) + with io.BytesIO(mask_bytes) as f: + img = PIL.Image.open(f) + w, h = img.size + if ann["raster"]["geometry"] is not None: + ann["raster"] = Raster.from_geometry( + ann["raster"]["geometry"], + width=w, + height=h, + ) + elif ann["raster"]["geometry"] is None: + # decode raster + ann["raster"] = Raster(mask=np.array(img)) + annotations.append(Annotation(**ann)) + + gt_dict["annotations"] = annotations + gt = GroundTruth(**gt_dict) + groundtruths.append(gt) + if len(groundtruths) >= limit: + return groundtruths + return groundtruths + + +def ingest_predictions( 
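+    # Predictions are streamed from the JSONL file; only lines whose datum uid
+    # appears in `datum_uids` (collected from the already-ingested ground truths)
+    # are parsed, and ingestion stops once `limit` predictions have been kept.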
+ dtype: AnnotationType, + datum_uids: list[str], + path: Path, + limit: int, +) -> list[Prediction]: + + pattern = re.compile(r'"uid":\s*"(\d+)"') + + predictions = [] + with open(path, "r") as f: + count = 0 + for line in f: + match = pattern.search(line) + if not match: + continue + elif match.group(1) not in datum_uids: + continue + pd_dict = json.loads(line) + + pd_dict["datum"].pop("text") + pd_dict["datum"] = Datum(**pd_dict["datum"]) + + annotations = [] + for ann in pd_dict["annotations"]: + ann.pop("text") + ann.pop("context_list") + + labels = [] + for label in ann["labels"]: + labels.append(Label(**label)) + ann["labels"] = labels + + if ann["bounding_box"] and dtype == AnnotationType.BOX: + ann["bounding_box"] = Box(ann["bounding_box"]) + annotations.append(Annotation(**ann)) + + if ann["polygon"] and dtype == AnnotationType.POLYGON: + ann["polygon"] = Polygon(ann["polygon"]) + annotations.append(Annotation(**ann)) + + if ann["raster"] and dtype == AnnotationType.RASTER: + mask_bytes = b64decode(ann["raster"]["mask"]) + with io.BytesIO(mask_bytes) as f: + img = PIL.Image.open(f) + w, h = img.size + if ann["raster"]["geometry"] is not None: + ann["raster"] = Raster.from_geometry( + ann["raster"]["geometry"], + width=w, + height=h, + ) + elif ann["raster"]["geometry"] is None: + # decode raster + ann["raster"] = Raster(mask=np.array(img)) + annotations.append(Annotation(**ann)) + + pd_dict["annotations"] = annotations + pd = Prediction(**pd_dict) + predictions.append(pd) + count += 1 + if count >= limit: + return predictions + return predictions + + +def run_base_evaluation(groundtruths, predictions): + """Run a base evaluation (with no PR curves).""" + evaluation = evaluate_detection(groundtruths, predictions) + return evaluation + + +def run_pr_curve_evaluation(groundtruths, predictions): + """Run a base evaluation with PrecisionRecallCurve included.""" + evaluation = evaluate_detection( + groundtruths=groundtruths, + predictions=predictions, + metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + enums.MetricType.PrecisionRecallCurve, + ], + ) + return evaluation + + +def run_detailed_pr_curve_evaluation(groundtruths, predictions): + """Run a base evaluation with PrecisionRecallCurve and DetailedPrecisionRecallCurve included.""" + evaluation = evaluate_detection( + groundtruths=groundtruths, + predictions=predictions, + metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + enums.MetricType.PrecisionRecallCurve, + ], + ) + return evaluation + + +@dataclass +class DataBenchmark: + dtype: str + ingestion: float + + def result(self) -> dict[str, float | str]: + return { + "dtype": self.dtype, + "ingestion": round(self.ingestion, 2), + } + + +@dataclass +class EvaluationBenchmark: + limit: int + gt_stats: DataBenchmark + pd_stats: DataBenchmark + n_datums: int + n_annotations: int + n_labels: int + eval_base: float + eval_base_pr: float + eval_base_pr_detail: float + + def result(self) -> dict[str, float | str | dict[str, str | float]]: + return { + "limit": self.limit, + "groundtruths": self.gt_stats.result(), + "predictions": self.pd_stats.result(), + "evaluation": { + "number_of_datums": self.n_datums, + "number_of_annotations": self.n_annotations, + "number_of_labels": self.n_labels, + "base": 
round(self.eval_base, 2), + "base+pr": round(self.eval_base_pr, 2), + "base+pr+detailed": round(self.eval_base_pr_detail, 2), + }, + } + + +def run_benchmarking_analysis( + limits_to_test: list[int], + combinations: list[tuple[AnnotationType, AnnotationType]] | None = None, + results_file: str = "results.json", + ingestion_chunk_timeout: int = 30, + evaluation_timeout: int = 30, + compute_pr: bool = True, + compute_detailed: bool = True, +): + """Time various function calls and export the results.""" + current_directory = Path(__file__).parent + write_path = current_directory / Path(results_file) + + gt_box_filename = "gt_objdet_coco_bbox.jsonl" + gt_polygon_filename = "gt_objdet_coco_polygon.jsonl" + # gt_multipolygon_filename = "gt_objdet_coco_raster_multipolygon.jsonl" + gt_raster_filename = "gt_objdet_coco_raster_bitmask.jsonl" + pd_box_filename = "pd_objdet_yolo_bbox.jsonl" + pd_polygon_filename = "pd_objdet_yolo_polygon.jsonl" + # pd_multipolygon_filename = "pd_objdet_yolo_multipolygon.jsonl" + pd_raster_filename = "pd_objdet_yolo_raster.jsonl" + + groundtruth_caches = { + AnnotationType.BOX: gt_box_filename, + AnnotationType.POLYGON: gt_polygon_filename, + # AnnotationType.MULTIPOLYGON: gt_multipolygon_filename, + AnnotationType.RASTER: gt_raster_filename, + } + prediction_caches = { + AnnotationType.BOX: pd_box_filename, + AnnotationType.POLYGON: pd_polygon_filename, + # AnnotationType.MULTIPOLYGON: pd_multipolygon_filename, + AnnotationType.RASTER: pd_raster_filename, + } + + # default is to perform all combinations + if combinations is None: + combinations = [ + (gt_type, pd_type) + for gt_type in groundtruth_caches + for pd_type in prediction_caches + ] + + # cache data locally + filenames = [ + *list(groundtruth_caches.values()), + *list(prediction_caches.values()), + ] + for filename in filenames: + file_path = current_directory / Path(filename) + url = f"https://pub-fae71003f78140bdaedf32a7c8d331d2.r2.dev/{filename}" + download_data_if_not_exists( + file_name=filename, file_path=file_path, url=url + ) + + # iterate through datum limits + results = list() + for limit in limits_to_test: + for gt_type, pd_type in combinations: + + gt_filename = groundtruth_caches[gt_type] + pd_filename = prediction_caches[pd_type] + + # gt ingestion + gt_ingest_time, groundtruths = time_it( + ingest_groundtruths, + dtype=gt_type, + path=current_directory / Path(gt_filename), + limit=limit, + ) + + # pd ingestion + datum_uids = [gt.datum.uid for gt in groundtruths] + pd_ingest_time, predictions = time_it( + ingest_predictions, + dtype=pd_type, + datum_uids=datum_uids, + path=current_directory / Path(pd_filename), + limit=limit, + ) + + # run evaluations + eval_pr = None + eval_detail = None + eval_base = run_base_evaluation(groundtruths, predictions) + if compute_pr: + eval_pr = run_pr_curve_evaluation(groundtruths, predictions) + if compute_detailed: + eval_detail = run_detailed_pr_curve_evaluation( + groundtruths, predictions + ) + + assert eval_base.meta + + results.append( + EvaluationBenchmark( + limit=limit, + gt_stats=DataBenchmark( + dtype=gt_type, + ingestion=gt_ingest_time, + ), + pd_stats=DataBenchmark( + dtype=pd_type, + ingestion=pd_ingest_time, + ), + n_datums=eval_base.meta["datums"], + n_annotations=eval_base.meta["annotations"], + n_labels=eval_base.meta["labels"], + eval_base=eval_base.meta["duration"], + eval_base_pr=( + eval_pr.meta["duration"] + if eval_pr and eval_pr.meta + else -1 + ), + eval_base_pr_detail=( + eval_detail.meta["duration"] + if eval_detail and 
eval_detail.meta + else -1 + ), + ).result() + ) + + write_results_to_file(write_path=write_path, results=results) + + +if __name__ == "__main__": + + # run bounding box benchmark + run_benchmarking_analysis( + combinations=[ + (AnnotationType.BOX, AnnotationType.BOX), + ], + limits_to_test=[5000, 5000], + ) + + # run polygon benchmark + run_benchmarking_analysis( + combinations=[ + (AnnotationType.POLYGON, AnnotationType.POLYGON), + ], + limits_to_test=[5000, 5000], + ) + + # run raster benchmark + run_benchmarking_analysis( + combinations=[ + (AnnotationType.RASTER, AnnotationType.RASTER), + ], + limits_to_test=[500, 500], + ) diff --git a/core/examples/.gitignore b/core/examples/.gitignore new file mode 100644 index 000000000..7bc897f92 --- /dev/null +++ b/core/examples/.gitignore @@ -0,0 +1 @@ +!*.ipynb \ No newline at end of file diff --git a/core/examples/getting_started.ipynb b/core/examples/getting_started.ipynb new file mode 100644 index 000000000..aa1f8eae7 --- /dev/null +++ b/core/examples/getting_started.ipynb @@ -0,0 +1,533 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Getting Started with Valor Core\n", + "\n", + "## Introduction\n", + "\n", + "Valor is a centralized evaluation store which makes it easy to measure, explore, and rank model performance. Valor empowers data scientists and engineers to evaluate the performance of their machine learning pipelines and use those evaluations to make better modeling decisions in the future. For a conceptual introduction to Valor, [check out our project overview](https://striveworks.github.io/valor/).\n", + "\n", + "In this notebook, we'll introduce Valor's high-level abstractions and walk through a computer vision-oriented example of how you can use Valor to evaluate model performance. For task-specific examples, please see our follow-up notebooks below:\n", + "\n", + "- [Tabular classification](https://github.com/Striveworks/valor/blob/main/examples/classification/tabular.ipynb)\n", + "- [Object detection](https://github.com/Striveworks/valor/blob/main/examples/object-detection/coco-yolo.ipynb)\n", + "- [Semantic segmentation](https://github.com/Striveworks/valor/blob/main/examples/semantic-segmentation/coco-yolo.ipynb)\n", + "\n", + "Note that this notebook uses `valor_core`, rather than `valor`, to calculate all metrics locally without utilizing Postgres' filtering and data exploration capabilities." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Defining Our Dataset\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To begin, we import all needed packages from `valor_core`. For instructions on setting up your environment, please see [our docs here](https://striveworks.github.io/valor/getting_started/).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "from valor_core import (\n", + " Datum,\n", + " Annotation,\n", + " GroundTruth,\n", + " Prediction,\n", + " Label,\n", + " Box,\n", + " evaluate_classification, \n", + " evaluate_detection\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Creating Image Classification GroundTruths and Predictions\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To describe the various objects in our data, we'll create lists of `GroundTruth` and `Prediction` objects to pass into our `evaluate..` functions. 
Note that Valor doesn't actually store any images, and that the `Annotations` we use will vary by our task type (i.e., object detection, semantic segmentation, etc.). For demonstrative purposes, we'll create `GroundTruths` for two different learning tasks in this notebook.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[GroundTruth(datum=Datum(uid='img1', metadata={'path': 'a/b/c/img1.png'}), annotations=[Annotation(labels=[Label(key='class_label', value='dog', score=None)], metadata=None, bounding_box=None, polygon=None, raster=None, embedding=None, is_instance=None, implied_task_types=None)]), GroundTruth(datum=Datum(uid='img2', metadata={'path': 'a/b/c/img2.png'}), annotations=[Annotation(labels=[Label(key='class_label', value='cat', score=None)], metadata=None, bounding_box=None, polygon=None, raster=None, embedding=None, is_instance=None, implied_task_types=None)])]\n" + ] + } + ], + "source": [ + "def create_image_classification_data(classification_data):\n", + "\n", + " groundtruths, predictions = [], []\n", + "\n", + " for element in classification_data:\n", + "\n", + " datum = Datum(\n", + " uid=Path(element[\"path\"]).stem, metadata={\"path\": element[\"path\"]}\n", + " )\n", + "\n", + " gt_annotations = [\n", + " Annotation(\n", + " labels=[\n", + " Label(key=key, value=value)\n", + " for label in element[\"gt_annotations\"]\n", + " for key, value in label.items()\n", + " ]\n", + " )\n", + " ]\n", + "\n", + " pd_annotations = [\n", + " Annotation(\n", + " labels=[\n", + " Label(\n", + " key=\"class_label\",\n", + " value=label[\"class_label\"],\n", + " score=label[\"score\"],\n", + " )\n", + " for label in element[\"pd_annotations\"]\n", + " ]\n", + " )\n", + " ]\n", + "\n", + " groundtruths.append(\n", + " GroundTruth(\n", + " datum=datum,\n", + " annotations=gt_annotations,\n", + " )\n", + " )\n", + "\n", + " predictions.append(\n", + " Prediction(\n", + " datum=datum,\n", + " annotations=pd_annotations,\n", + " )\n", + " )\n", + "\n", + " return groundtruths, predictions\n", + "\n", + "\n", + "classification_data = [\n", + " {\n", + " \"path\": \"a/b/c/img1.png\",\n", + " \"gt_annotations\": [{\"class_label\": \"dog\"}],\n", + " \"pd_annotations\": [\n", + " {\"class_label\": \"dog\", \"score\": 0.9},\n", + " {\"class_label\": \"cat\", \"score\": 0.1},\n", + " ],\n", + " },\n", + " {\n", + " \"path\": \"a/b/c/img2.png\",\n", + " \"gt_annotations\": [{\"class_label\": \"cat\"}],\n", + " \"pd_annotations\": [\n", + " {\"class_label\": \"dog\", \"score\": 0.1},\n", + " {\"class_label\": \"cat\", \"score\": 0.9},\n", + " ],\n", + " },\n", + " ]\n", + "\n", + "\n", + "classification_gts, classification_pds = create_image_classification_data(classification_data)\n", + "print(classification_gts)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Creating Object Detection GroundTruths and Predictions\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Prediction(datum=Datum(uid='img3', metadata={'path': 'a/b/c/img3.png'}), annotations=[Annotation(labels=[Label(key='class_label', value='dog', score=0.8), Label(key='class_label', value='cat', score=0.1), Label(key='class_label', value='person', score=0.1)], metadata=None, bounding_box=Box(value=[[(16, 130), (70, 130), (70, 150), (16, 150), (16, 130)]]), polygon=None, raster=None, 
embedding=None, is_instance=True, implied_task_types=None), Annotation(labels=[Label(key='class_label', value='dog', score=0.05), Label(key='class_label', value='cat', score=0.05), Label(key='class_label', value='person', score=0.9)], metadata=None, bounding_box=Box(value=[[(89, 10), (97, 10), (97, 110), (89, 110), (89, 10)]]), polygon=None, raster=None, embedding=None, is_instance=True, implied_task_types=None)]), Prediction(datum=Datum(uid='img4', metadata={'path': 'a/b/c/img4.png'}), annotations=[Annotation(labels=[Label(key='class_label', value='dog', score=0.8), Label(key='class_label', value='cat', score=0.1), Label(key='class_label', value='person', score=0.1)], metadata=None, bounding_box=Box(value=[[(500, 220), (530, 220), (530, 260), (500, 260), (500, 220)]]), polygon=None, raster=None, embedding=None, is_instance=True, implied_task_types=None)]), Prediction(datum=Datum(uid='img5', metadata={'path': 'a/b/c/img5.png'}), annotations=[])]\n" + ] + } + ], + "source": [ + "def create_groundtruth_from_object_detection_dict(detection_data):\n", + " groundtruths, predictions = [], []\n", + "\n", + " for element in detection_data:\n", + "\n", + " datum = Datum(\n", + " uid=Path(element[\"path\"]).stem, metadata={\"path\": element[\"path\"]}\n", + " )\n", + "\n", + " gt_annotations = [\n", + " Annotation(\n", + " labels=[\n", + " Label(key=\"class_label\", value=annotation[\"class_label\"])\n", + " ],\n", + " bounding_box=Box.from_extrema(\n", + " xmin=annotation[\"bbox\"][\"xmin\"],\n", + " xmax=annotation[\"bbox\"][\"xmax\"],\n", + " ymin=annotation[\"bbox\"][\"ymin\"],\n", + " ymax=annotation[\"bbox\"][\"ymax\"],\n", + " ),\n", + " is_instance=True,\n", + " )\n", + " for annotation in element[\"gt_annotations\"]\n", + " if len(annotation) > 0\n", + " ]\n", + "\n", + " pd_annotations = [\n", + " Annotation(\n", + " labels=[\n", + " Label(\n", + " key=\"class_label\",\n", + " value=label[\"class_label\"],\n", + " score=label[\"score\"],\n", + " )\n", + " for label in annotation[\"labels\"]\n", + " ],\n", + " bounding_box=Box.from_extrema(\n", + " xmin=annotation[\"bbox\"][\"xmin\"],\n", + " xmax=annotation[\"bbox\"][\"xmax\"],\n", + " ymin=annotation[\"bbox\"][\"ymin\"],\n", + " ymax=annotation[\"bbox\"][\"ymax\"],\n", + " ),\n", + " is_instance=True,\n", + " )\n", + " for annotation in element[\"pd_annotations\"]\n", + " if len(annotation) > 0\n", + " ]\n", + "\n", + " groundtruths.append(\n", + " GroundTruth(\n", + " datum=datum,\n", + " annotations=gt_annotations,\n", + " )\n", + " )\n", + "\n", + " predictions.append(\n", + " Prediction(\n", + " datum=datum,\n", + " annotations=pd_annotations,\n", + " )\n", + " )\n", + "\n", + " return groundtruths, predictions\n", + "\n", + "\n", + "detection_data = [\n", + " {\n", + " \"path\": \"a/b/c/img3.png\",\n", + " \"gt_annotations\": [\n", + " {\n", + " \"class_label\": \"dog\",\n", + " \"bbox\": {\"xmin\": 16, \"ymin\": 130, \"xmax\": 70, \"ymax\": 150},\n", + " },\n", + " {\n", + " \"class_label\": \"person\",\n", + " \"bbox\": {\"xmin\": 89, \"ymin\": 10, \"xmax\": 97, \"ymax\": 110},\n", + " },\n", + " ],\n", + " \"pd_annotations\": [\n", + " {\n", + " \"labels\": [\n", + " {\"class_label\": \"dog\", \"score\": 0.8},\n", + " {\"class_label\": \"cat\", \"score\": 0.1},\n", + " {\"class_label\": \"person\", \"score\": 0.1},\n", + " ],\n", + " \"bbox\": {\"xmin\": 16, \"ymin\": 130, \"xmax\": 70, \"ymax\": 150},\n", + " },\n", + " {\n", + " \"labels\": [\n", + " {\"class_label\": \"dog\", \"score\": 0.05},\n", + " {\"class_label\": \"cat\", 
\"score\": 0.05},\n", + " {\"class_label\": \"person\", \"score\": 0.9},\n", + " ],\n", + " \"bbox\": {\"xmin\": 89, \"ymin\": 10, \"xmax\": 97, \"ymax\": 110},\n", + " },\n", + " ],\n", + " },\n", + " {\n", + " \"path\": \"a/b/c/img4.png\",\n", + " \"gt_annotations\": [\n", + " {\n", + " \"class_label\": \"cat\",\n", + " \"bbox\": {\"xmin\": 500, \"ymin\": 220, \"xmax\": 530, \"ymax\": 260},\n", + " }\n", + " ],\n", + " \"pd_annotations\": [\n", + " {\n", + " \"labels\": [\n", + " {\"class_label\": \"dog\", \"score\": 0.8},\n", + " {\"class_label\": \"cat\", \"score\": 0.1},\n", + " {\"class_label\": \"person\", \"score\": 0.1},\n", + " ],\n", + " \"bbox\": {\"xmin\": 500, \"ymin\": 220, \"xmax\": 530, \"ymax\": 260},\n", + " }\n", + " ],\n", + " },\n", + " {\"path\": \"a/b/c/img5.png\", \"gt_annotations\": [], \"pd_annotations\": []},\n", + "]\n", + "\n", + "\n", + "detection_gts, detection_pds = create_groundtruth_from_object_detection_dict(\n", + " detection_data=detection_data\n", + ")\n", + "print(detection_pds)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluating Performance\n", + "\n", + "Finally, we'll use our Valor abstractions to evaluate model performance. For more detailed, task-specific examples, see our follow-up notebooks at the links below:\n", + "\n", + "- [Tabular classification](https://github.com/Striveworks/valor/blob/main/examples/classification/tabular.ipynb)\n", + "- [Object detection](https://github.com/Striveworks/valor/blob/main/examples/object-detection/coco-yolo.ipynb)\n", + "- [Semantic segmentation](https://github.com/Striveworks/valor/blob/main/examples/semantic-segmentation/coco-yolo.ipynb)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Evaluating Detections\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'label': {'key': 'class_label', 'value': 'person'},\n", + " 'parameters': {'iou': 0.5},\n", + " 'value': 1.0,\n", + " 'type': 'AP'},\n", + " {'label': {'key': 'class_label', 'value': 'person'},\n", + " 'parameters': {'iou': 0.75},\n", + " 'value': 1.0,\n", + " 'type': 'AP'},\n", + " {'label': {'key': 'class_label', 'value': 'cat'},\n", + " 'parameters': {'iou': 0.5},\n", + " 'value': 1.0,\n", + " 'type': 'AP'},\n", + " {'label': {'key': 'class_label', 'value': 'cat'},\n", + " 'parameters': {'iou': 0.75},\n", + " 'value': 1.0,\n", + " 'type': 'AP'},\n", + " {'label': {'key': 'class_label', 'value': 'dog'},\n", + " 'parameters': {'iou': 0.5},\n", + " 'value': 1.0,\n", + " 'type': 'AP'},\n", + " {'label': {'key': 'class_label', 'value': 'dog'},\n", + " 'parameters': {'iou': 0.75},\n", + " 'value': 1.0,\n", + " 'type': 'AP'},\n", + " {'parameters': {'label_key': 'class_label', 'iou': 0.5},\n", + " 'value': 1.0,\n", + " 'type': 'mAP'},\n", + " {'parameters': {'label_key': 'class_label', 'iou': 0.75},\n", + " 'value': 1.0,\n", + " 'type': 'mAP'},\n", + " {'label': {'key': 'class_label', 'value': 'person'},\n", + " 'parameters': {'ious': [0.5,\n", + " 0.55,\n", + " 0.6,\n", + " 0.65,\n", + " 0.7,\n", + " 0.75,\n", + " 0.8,\n", + " 0.85,\n", + " 0.9,\n", + " 0.95]},\n", + " 'value': 1.0,\n", + " 'type': 'APAveragedOverIOUs'},\n", + " {'label': {'key': 'class_label', 'value': 'cat'},\n", + " 'parameters': {'ious': [0.5,\n", + " 0.55,\n", + " 0.6,\n", + " 0.65,\n", + " 0.7,\n", + " 0.75,\n", + " 0.8,\n", + " 0.85,\n", + " 0.9,\n", + " 0.95]},\n", + " 'value': 1.0,\n", + " 'type': 
'APAveragedOverIOUs'},\n", + " {'label': {'key': 'class_label', 'value': 'dog'},\n", + " 'parameters': {'ious': [0.5,\n", + " 0.55,\n", + " 0.6,\n", + " 0.65,\n", + " 0.7,\n", + " 0.75,\n", + " 0.8,\n", + " 0.85,\n", + " 0.9,\n", + " 0.95]},\n", + " 'value': 1.0,\n", + " 'type': 'APAveragedOverIOUs'},\n", + " {'parameters': {'label_key': 'class_label',\n", + " 'ious': [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]},\n", + " 'value': 1.0,\n", + " 'type': 'mAPAveragedOverIOUs'},\n", + " {'label': {'key': 'class_label', 'value': 'person'},\n", + " 'parameters': {'ious': [0.5,\n", + " 0.55,\n", + " 0.6,\n", + " 0.65,\n", + " 0.7,\n", + " 0.75,\n", + " 0.8,\n", + " 0.85,\n", + " 0.9,\n", + " 0.95]},\n", + " 'value': 1.0,\n", + " 'type': 'AR'},\n", + " {'label': {'key': 'class_label', 'value': 'cat'},\n", + " 'parameters': {'ious': [0.5,\n", + " 0.55,\n", + " 0.6,\n", + " 0.65,\n", + " 0.7,\n", + " 0.75,\n", + " 0.8,\n", + " 0.85,\n", + " 0.9,\n", + " 0.95]},\n", + " 'value': 1.0,\n", + " 'type': 'AR'},\n", + " {'label': {'key': 'class_label', 'value': 'dog'},\n", + " 'parameters': {'ious': [0.5,\n", + " 0.55,\n", + " 0.6,\n", + " 0.65,\n", + " 0.7,\n", + " 0.75,\n", + " 0.8,\n", + " 0.85,\n", + " 0.9,\n", + " 0.95]},\n", + " 'value': 1.0,\n", + " 'type': 'AR'},\n", + " {'parameters': {'label_key': 'class_label',\n", + " 'ious': [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]},\n", + " 'value': 1.0,\n", + " 'type': 'mAR'}]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eval_objdet = evaluate_detection(groundtruths=detection_gts, predictions=detection_pds)\n", + "eval_objdet.metrics" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'label': {'key': 'class_label', 'value': 'cat'},\n", + " 'value': 1.0,\n", + " 'type': 'Precision'},\n", + " {'label': {'key': 'class_label', 'value': 'cat'},\n", + " 'value': 1.0,\n", + " 'type': 'Recall'},\n", + " {'label': {'key': 'class_label', 'value': 'cat'}, 'value': 1.0, 'type': 'F1'},\n", + " {'label': {'key': 'class_label', 'value': 'dog'},\n", + " 'value': 1.0,\n", + " 'type': 'Precision'},\n", + " {'label': {'key': 'class_label', 'value': 'dog'},\n", + " 'value': 1.0,\n", + " 'type': 'Recall'},\n", + " {'label': {'key': 'class_label', 'value': 'dog'}, 'value': 1.0, 'type': 'F1'},\n", + " {'parameters': {'label_key': 'class_label'},\n", + " 'value': 1.0,\n", + " 'type': 'Accuracy'},\n", + " {'parameters': {'label_key': 'class_label'}, 'value': 1.0, 'type': 'ROCAUC'}]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eval_clf = evaluate_classification(groundtruths=classification_gts, predictions=classification_pds)\n", + "eval_clf.metrics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next Steps\n", + "\n", + "For more examples, we'd recommend reviewing our [other sample notebooks on GitHub](https://github.com/Striveworks/valor/blob/main/examples/). 
For more detailed explanations of Valor's technical underpinnings, see our [technical concepts guide](technical_concepts.md).\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".env-valor", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/core/pyproject.toml b/core/pyproject.toml new file mode 100644 index 000000000..103b17354 --- /dev/null +++ b/core/pyproject.toml @@ -0,0 +1,41 @@ +[project] +name = "valor-core" +dynamic = ["version"] +description = "Compute valor metrics directly in your client." +readme = "README.md" +requires-python = ">=3.10" +license = { file = "LICENSE" } +dependencies = [ + "Pillow >= 9.1.0", + "numpy", + "importlib_metadata; python_version < '3.8'", + "pandas>=2.2.2", + "pandas-stubs", + "tqdm", + "requests", + "shapely" +] + +[project.urls] +homepage = "https://www.striveworks.com" + +[build-system] +requires = ["setuptools>=61.0", "setuptools_scm[toml]>=6.2"] +build-backend = "setuptools.build_meta" + +[project.optional-dependencies] +test = ["pytest", "coverage"] + +[tool.black] +line-length = 79 + +[tool.isort] +line_length = 79 +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +use_parentheses = true +ensure_newline_before_comments = true + +[tool.setuptools_scm] +root = ".." diff --git a/core/tests/conftest.py b/core/tests/conftest.py new file mode 100644 index 000000000..2406038b7 --- /dev/null +++ b/core/tests/conftest.py @@ -0,0 +1,2857 @@ +import math + +import numpy as np +import pandas as pd +import pytest +from valor_core import schemas + + +@pytest.fixture +def box_points() -> list[tuple[float, float]]: + return [ + (-5, -5), + (5, -5), + (5, 5), + (-5, 5), + (-5, -5), + ] + + +@pytest.fixture +def rotated_box_points() -> list[tuple[float, float]]: + """Same area and sides as box_points, but rotated 45 degrees.""" + d = 5.0 * math.sqrt(2) + return [ + (0, -d), + (d, 0), + (0, d), + (-d, 0), + (0, -d), + ] + + +@pytest.fixture +def images() -> list[schemas.Datum]: + return [ + schemas.Datum( + uid=f"{i}", + metadata={ + "height": 1000, + "width": 2000, + }, + ) + for i in range(4) + ] + + +@pytest.fixture +def evaluate_detection_functional_test_groundtruths( + images: list[schemas.Datum], +) -> list[schemas.GroundTruth]: + """Creates a dataset called "test_dataset" with some ground truth + detections. 
These detections are taken from a torchmetrics unit test (see test_metrics.py) + """ + + gts_per_img = [ + {"boxes": [[214.1500, 41.2900, 562.4100, 285.0700]], "labels": ["4"]}, + { + "boxes": [ + [13.00, 22.75, 548.98, 632.42], + [1.66, 3.32, 270.26, 275.23], + ], + "labels": ["2", "2"], + }, + { + "boxes": [ + [61.87, 276.25, 358.29, 379.43], + [2.75, 3.66, 162.15, 316.06], + [295.55, 93.96, 313.97, 152.79], + [326.94, 97.05, 340.49, 122.98], + [356.62, 95.47, 372.33, 147.55], + [462.08, 105.09, 493.74, 146.99], + [277.11, 103.84, 292.44, 150.72], + ], + "labels": ["4", "1", "0", "0", "0", "0", "0"], + }, + { + "boxes": [ + [72.92, 45.96, 91.23, 80.57], + [50.17, 45.34, 71.28, 79.83], + [81.28, 47.04, 98.66, 78.50], + [63.96, 46.17, 84.35, 80.48], + [75.29, 23.01, 91.85, 50.85], + [56.39, 21.65, 75.66, 45.54], + [73.14, 1.10, 98.96, 28.33], + [62.34, 55.23, 78.14, 79.57], + [44.17, 45.78, 63.99, 78.48], + [58.18, 44.80, 66.42, 56.25], + ], + "labels": [ + "49", + "49", + "49", + "49", + "49", + "49", + "49", + "49", + "49", + "49", + ], + }, + ] + return [ + schemas.GroundTruth( + datum=image, + annotations=[ + schemas.Annotation( + labels=[schemas.Label(key="class", value=class_label)], + bounding_box=schemas.Box.from_extrema( + xmin=box[0], + ymin=box[1], + xmax=box[2], + ymax=box[3], + ), + is_instance=True, + ) + for box, class_label in zip(gts["boxes"], gts["labels"]) + ], + ) + for gts, image in zip(gts_per_img, images) + ] + + +# predictions to use for testing AP +@pytest.fixture +def evaluate_detection_functional_test_predictions( + images: list[schemas.Datum], +) -> list[schemas.Prediction]: + """Creates a model called "test_model" with some predicted + detections on the dataset "test_dataset". These predictions are taken + from a torchmetrics unit test (see test_metrics.py) + """ + + # predictions for four images taken from + # https://github.com/Lightning-AI/metrics/blob/107dbfd5fb158b7ae6d76281df44bd94c836bfce/tests/unittests/detection/test_map.py#L59 + preds_per_img = [ + { + "boxes": [[258.15, 41.29, 606.41, 285.07]], + "scores": [0.236], + "labels": ["4"], + }, + { + "boxes": [ + [61.00, 22.75, 565.00, 632.42], + [12.66, 3.32, 281.26, 275.23], + ], + "scores": [0.318, 0.726], + "labels": ["3", "2"], + }, + { + "boxes": [ + [87.87, 276.25, 384.29, 379.43], + [0.00, 3.66, 142.15, 316.06], + [296.55, 93.96, 314.97, 152.79], + [328.94, 97.05, 342.49, 122.98], + [356.62, 95.47, 372.33, 147.55], + [464.08, 105.09, 495.74, 146.99], + [276.11, 103.84, 291.44, 150.72], + ], + "scores": [0.546, 0.3, 0.407, 0.611, 0.335, 0.805, 0.953], + "labels": ["4", "1", "0", "0", "0", "0", "0"], + }, + { + "boxes": [ + [72.92, 45.96, 91.23, 80.57], + [45.17, 45.34, 66.28, 79.83], + [82.28, 47.04, 99.66, 78.50], + [59.96, 46.17, 80.35, 80.48], + [75.29, 23.01, 91.85, 50.85], + [71.14, 1.10, 96.96, 28.33], + [61.34, 55.23, 77.14, 79.57], + [41.17, 45.78, 60.99, 78.48], + [56.18, 44.80, 64.42, 56.25], + ], + "scores": [ + 0.532, + 0.204, + 0.782, + 0.202, + 0.883, + 0.271, + 0.561, + 0.204, + 0.349, + ], + "labels": ["49", "49", "49", "49", "49", "49", "49", "49", "49"], + }, + ] + + db_preds_per_img = [ + schemas.Prediction( + datum=image, + annotations=[ + schemas.Annotation( + labels=[ + schemas.Label( + key="class", value=class_label, score=score + ) + ], + bounding_box=schemas.Box.from_extrema( + xmin=box[0], + ymin=box[1], + xmax=box[2], + ymax=box[3], + ), + is_instance=True, + ) + for box, class_label, score in zip( + preds["boxes"], preds["labels"], preds["scores"] + ) + ], + ) + for 
preds, image in zip(preds_per_img, images) + ] + + return db_preds_per_img + + +@pytest.fixture +def evaluate_detection_functional_test_groundtruths_with_rasters( + img1: schemas.Datum, +) -> list[schemas.GroundTruth]: + """Used to test object detection functionality on rasters""" + + gts = { + "rasters": [ + np.ones((80, 32), dtype=bool), + np.ones((80, 32), dtype=bool), + np.ones((80, 32), dtype=bool), + ], + "labels": ["label1", "label2", "label3"], + } + return [ + schemas.GroundTruth( + datum=img1, + annotations=[ + schemas.Annotation( + labels=[schemas.Label(key="class", value=class_label)], + raster=schemas.Raster(raster), + is_instance=True, + ) + for raster, class_label in zip(gts["rasters"], gts["labels"]) + ], + ) + ] + + +@pytest.fixture +def evaluate_detection_functional_test_predictions_with_rasters( + img1: schemas.Datum, +) -> list[schemas.Prediction]: + """Used to test object detection functionality on rasters""" + + preds = { + "rasters": [ + np.ones((80, 32), dtype=bool), + np.ones((80, 32), dtype=bool), + np.zeros((80, 32), dtype=bool), + np.zeros((80, 32), dtype=bool), + ], + "labels": ["label1", "label2", "label3", "label4"], + "scores": [ + 0.3, + 0.93, + 0.92, + 0.94, + ], # we expect our AP and AR metrics to be 1 for label2 since the second prediction has a higher score than the third + } + + return [ + schemas.Prediction( + datum=img1, + annotations=[ + schemas.Annotation( + labels=[ + schemas.Label( + key="class", value=class_label, score=score + ) + ], + raster=schemas.Raster(raster), + is_instance=True, + ) + for raster, class_label, score in zip( + preds["rasters"], preds["labels"], preds["scores"] + ) + ], + ) + ] + + +@pytest.fixture +def rect1() -> list[tuple[float, float]]: + """Box with area = 1500.""" + return [ + (10, 10), + (60, 10), + (60, 40), + (10, 40), + (10, 10), + ] + + +@pytest.fixture +def rect2() -> list[tuple[float, float]]: + """Box with area = 1100.""" + return [ + (15, 0), + (70, 0), + (70, 20), + (15, 20), + (15, 0), + ] + + +@pytest.fixture +def rect3() -> list[tuple[float, float]]: + """Box with area = 57,510.""" + return [ + (87, 10), + (158, 10), + (158, 820), + (87, 820), + (87, 10), + ] + + +@pytest.fixture +def rect4() -> list[tuple[float, float]]: + """Box with area = 90.""" + return [ + (1, 10), + (10, 10), + (10, 20), + (1, 20), + (1, 10), + ] + + +@pytest.fixture +def rect5() -> list[tuple[float, float]]: + """Box with partial overlap to rect3.""" + return [ + (87, 10), + (158, 10), + (158, 400), + (87, 400), + (87, 10), + ] + + +@pytest.fixture +def evaluate_detection_groundtruths( + rect1: list[tuple[float, float]], + rect2: list[tuple[float, float]], + rect3: list[tuple[float, float]], + img1: schemas.Datum, + img2: schemas.Datum, +) -> list[schemas.GroundTruth]: + return [ + schemas.GroundTruth( + datum=img1, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k1", value="v1")], + bounding_box=schemas.Box([rect1]), + ), + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k2", value="v2")], + bounding_box=schemas.Box([rect3]), + ), + ], + ), + schemas.GroundTruth( + datum=img2, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k1", value="v1")], + bounding_box=schemas.Box([rect2]), + ) + ], + ), + ] + + +@pytest.fixture +def evaluate_detection_predictions( + rect1: list[tuple[float, float]], + rect2: list[tuple[float, float]], + img1: schemas.Datum, + img2: schemas.Datum, +) -> list[schemas.Prediction]: + return [ + 
schemas.Prediction( + datum=img1, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k1", value="v1", score=0.3)], + bounding_box=schemas.Box([rect1]), + ) + ], + ), + schemas.Prediction( + datum=img2, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k2", value="v2", score=0.98)], + bounding_box=schemas.Box([rect2]), + ) + ], + ), + ] + + +@pytest.fixture +def evaluate_detection_groundtruths_with_label_maps( + rect1: list[tuple[float, float]], + rect2: list[tuple[float, float]], + rect3: list[tuple[float, float]], + img1: schemas.Datum, + img2: schemas.Datum, +) -> list[schemas.GroundTruth]: + return [ + schemas.GroundTruth( + datum=img1, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[ + schemas.Label(key="class_name", value="maine coon cat") + ], + bounding_box=schemas.Box([rect1]), + ), + schemas.Annotation( + is_instance=True, + labels=[ + schemas.Label(key="class", value="british shorthair") + ], + bounding_box=schemas.Box([rect3]), + ), + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k1", value="v1")], + bounding_box=schemas.Box([rect1]), + ), + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k2", value="v2")], + bounding_box=schemas.Box([rect3]), + ), + ], + ), + schemas.GroundTruth( + datum=img2, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="class", value="siamese cat")], + bounding_box=schemas.Box([rect2]), + ), + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k1", value="v1")], + bounding_box=schemas.Box([rect2]), + ), + ], + ), + ] + + +@pytest.fixture +def evaluate_detection_predictions_with_label_maps( + rect1: list[tuple[float, float]], + rect2: list[tuple[float, float]], + img1: schemas.Datum, + img2: schemas.Datum, +) -> list[schemas.Prediction]: + return [ + schemas.Prediction( + datum=img1, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[ + schemas.Label(key="class", value="cat", score=0.3) + ], + bounding_box=schemas.Box([rect1]), + ), + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k1", value="v1", score=0.3)], + bounding_box=schemas.Box([rect1]), + ), + ], + ), + schemas.Prediction( + datum=img2, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[ + schemas.Label( + key="class_name", value="cat", score=0.98 + ) + ], + bounding_box=schemas.Box([rect2]), + ), + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k2", value="v2", score=0.98)], + bounding_box=schemas.Box([rect2]), + ), + ], + ), + ] + + +@pytest.fixture +def evaluate_detection_detailed_pr_curve_groundtruths( + img1, + img2, + rect1, + rect2, + rect3, + rect4, + rect5, +): + return [ + schemas.GroundTruth( + datum=img1, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k1", value="v1")], + bounding_box=schemas.Box([rect1]), + ), + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k1", value="missed_detection")], + bounding_box=schemas.Box([rect2]), + ), + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k1", value="v2")], + bounding_box=schemas.Box([rect3]), + ), + ], + ), + schemas.GroundTruth( + datum=img2, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k1", value="low_iou")], + bounding_box=schemas.Box([rect1]), + ), + ], + ), + ] + + +@pytest.fixture +def evaluate_detection_detailed_pr_curve_predictions( + img1, + 
img2, + rect1, + rect2, + rect3, + rect4, + rect5, +): + return [ + schemas.Prediction( + datum=img1, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k1", value="v1", score=0.5)], + bounding_box=schemas.Box([rect1]), + ), + schemas.Annotation( + is_instance=True, + labels=[ + schemas.Label(key="k1", value="not_v2", score=0.3) + ], + bounding_box=schemas.Box([rect5]), + ), + schemas.Annotation( + is_instance=True, + labels=[ + schemas.Label( + key="k1", value="hallucination", score=0.1 + ) + ], + bounding_box=schemas.Box([rect4]), + ), + ], + ), + # prediction for img2 has the wrong bounding box, so it should count as a hallucination + schemas.Prediction( + datum=img2, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[ + schemas.Label(key="k1", value="low_iou", score=0.5) + ], + bounding_box=schemas.Box([rect2]), + ), + ], + ), + ] + + +@pytest.fixture +def evaluate_tabular_clf_groundtruths(): + return pd.DataFrame( + [ + { + "id": 9040, + "annotation_id": 11373, + "label_id": 8031, + "created_at": 1722267392923, + "label_key": "class", + "label_value": "1", + "datum_id": 822, + "datum_uid": "uid0", + }, + { + "id": 9041, + "annotation_id": 11374, + "label_id": 8031, + "created_at": 1722267392967, + "label_key": "class", + "label_value": "1", + "datum_id": 823, + "datum_uid": "uid1", + }, + { + "id": 9042, + "annotation_id": 11375, + "label_id": 8033, + "created_at": 1722267393007, + "label_key": "class", + "label_value": "2", + "datum_id": 824, + "datum_uid": "uid2", + }, + { + "id": 9043, + "annotation_id": 11376, + "label_id": 8034, + "created_at": 1722267393047, + "label_key": "class", + "label_value": "0", + "datum_id": 825, + "datum_uid": "uid3", + }, + { + "id": 9044, + "annotation_id": 11377, + "label_id": 8034, + "created_at": 1722267393088, + "label_key": "class", + "label_value": "0", + "datum_id": 826, + "datum_uid": "uid4", + }, + { + "id": 9045, + "annotation_id": 11378, + "label_id": 8034, + "created_at": 1722267393125, + "label_key": "class", + "label_value": "0", + "datum_id": 827, + "datum_uid": "uid5", + }, + { + "id": 9046, + "annotation_id": 11379, + "label_id": 8031, + "created_at": 1722267393166, + "label_key": "class", + "label_value": "1", + "datum_id": 828, + "datum_uid": "uid6", + }, + { + "id": 9047, + "annotation_id": 11380, + "label_id": 8031, + "created_at": 1722267393215, + "label_key": "class", + "label_value": "1", + "datum_id": 829, + "datum_uid": "uid7", + }, + { + "id": 9048, + "annotation_id": 11381, + "label_id": 8031, + "created_at": 1722267393263, + "label_key": "class", + "label_value": "1", + "datum_id": 830, + "datum_uid": "uid8", + }, + { + "id": 9049, + "annotation_id": 11382, + "label_id": 8031, + "created_at": 1722267393306, + "label_key": "class", + "label_value": "1", + "datum_id": 831, + "datum_uid": "uid9", + }, + ] + ) + + +@pytest.fixture +def evaluate_tabular_clf_predictions(): + return pd.DataFrame( + [ + { + "id": 4600, + "annotation_id": 11385, + "label_id": 8033, + "score": 0.09, + "created_at": pd.Timestamp("2024-07-29 15:36:33.502504"), + "label_key": "class", + "label_value": "2", + "datum_id": 824, + "datum_uid": "uid2", + }, + { + "id": 4599, + "annotation_id": 11385, + "label_id": 8031, + "score": 0.88, + "created_at": pd.Timestamp("2024-07-29 15:36:33.502504"), + "label_key": "class", + "label_value": "1", + "datum_id": 824, + "datum_uid": "uid2", + }, + { + "id": 4598, + "annotation_id": 11385, + "label_id": 8034, + "score": 0.03, + "created_at": 
pd.Timestamp("2024-07-29 15:36:33.502504"), + "label_key": "class", + "label_value": "0", + "datum_id": 824, + "datum_uid": "uid2", + }, + { + "id": 4603, + "annotation_id": 11386, + "label_id": 8033, + "score": 0.0, + "created_at": pd.Timestamp("2024-07-29 15:36:33.546293"), + "label_key": "class", + "label_value": "2", + "datum_id": 825, + "datum_uid": "uid3", + }, + { + "id": 4602, + "annotation_id": 11386, + "label_id": 8031, + "score": 0.03, + "created_at": pd.Timestamp("2024-07-29 15:36:33.546293"), + "label_key": "class", + "label_value": "1", + "datum_id": 825, + "datum_uid": "uid3", + }, + { + "id": 4601, + "annotation_id": 11386, + "label_id": 8034, + "score": 0.97, + "created_at": pd.Timestamp("2024-07-29 15:36:33.546293"), + "label_key": "class", + "label_value": "0", + "datum_id": 825, + "datum_uid": "uid3", + }, + { + "id": 4606, + "annotation_id": 11387, + "label_id": 8033, + "score": 0.0, + "created_at": pd.Timestamp("2024-07-29 15:36:33.586264"), + "label_key": "class", + "label_value": "2", + "datum_id": 826, + "datum_uid": "uid4", + }, + { + "id": 4605, + "annotation_id": 11387, + "label_id": 8031, + "score": 0.0, + "created_at": pd.Timestamp("2024-07-29 15:36:33.586264"), + "label_key": "class", + "label_value": "1", + "datum_id": 826, + "datum_uid": "uid4", + }, + { + "id": 4604, + "annotation_id": 11387, + "label_id": 8034, + "score": 1.0, + "created_at": pd.Timestamp("2024-07-29 15:36:33.586264"), + "label_key": "class", + "label_value": "0", + "datum_id": 826, + "datum_uid": "uid4", + }, + { + "id": 4609, + "annotation_id": 11388, + "label_id": 8033, + "score": 0.0, + "created_at": pd.Timestamp("2024-07-29 15:36:33.631094"), + "label_key": "class", + "label_value": "2", + "datum_id": 827, + "datum_uid": "uid5", + }, + { + "id": 4608, + "annotation_id": 11388, + "label_id": 8031, + "score": 0.0, + "created_at": pd.Timestamp("2024-07-29 15:36:33.631094"), + "label_key": "class", + "label_value": "1", + "datum_id": 827, + "datum_uid": "uid5", + }, + { + "id": 4607, + "annotation_id": 11388, + "label_id": 8034, + "score": 1.0, + "created_at": pd.Timestamp("2024-07-29 15:36:33.631094"), + "label_key": "class", + "label_value": "0", + "datum_id": 827, + "datum_uid": "uid5", + }, + { + "id": 4612, + "annotation_id": 11389, + "label_id": 8033, + "score": 0.03, + "created_at": pd.Timestamp("2024-07-29 15:36:33.673800"), + "label_key": "class", + "label_value": "2", + "datum_id": 828, + "datum_uid": "uid6", + }, + { + "id": 4611, + "annotation_id": 11389, + "label_id": 8031, + "score": 0.96, + "created_at": pd.Timestamp("2024-07-29 15:36:33.673800"), + "label_key": "class", + "label_value": "1", + "datum_id": 828, + "datum_uid": "uid6", + }, + { + "id": 4610, + "annotation_id": 11389, + "label_id": 8034, + "score": 0.01, + "created_at": pd.Timestamp("2024-07-29 15:36:33.673800"), + "label_key": "class", + "label_value": "0", + "datum_id": 828, + "datum_uid": "uid6", + }, + { + "id": 4615, + "annotation_id": 11390, + "label_id": 8033, + "score": 0.7, + "created_at": pd.Timestamp("2024-07-29 15:36:33.709818"), + "label_key": "class", + "label_value": "2", + "datum_id": 829, + "datum_uid": "uid7", + }, + { + "id": 4614, + "annotation_id": 11390, + "label_id": 8031, + "score": 0.02, + "created_at": pd.Timestamp("2024-07-29 15:36:33.709818"), + "label_key": "class", + "label_value": "1", + "datum_id": 829, + "datum_uid": "uid7", + }, + { + "id": 4613, + "annotation_id": 11390, + "label_id": 8034, + "score": 0.28, + "created_at": pd.Timestamp("2024-07-29 15:36:33.709818"), + 
"label_key": "class", + "label_value": "0", + "datum_id": 829, + "datum_uid": "uid7", + }, + { + "id": 4618, + "annotation_id": 11391, + "label_id": 8033, + "score": 0.01, + "created_at": pd.Timestamp("2024-07-29 15:36:33.745536"), + "label_key": "class", + "label_value": "2", + "datum_id": 830, + "datum_uid": "uid8", + }, + { + "id": 4617, + "annotation_id": 11391, + "label_id": 8031, + "score": 0.21, + "created_at": pd.Timestamp("2024-07-29 15:36:33.745536"), + "label_key": "class", + "label_value": "1", + "datum_id": 830, + "datum_uid": "uid8", + }, + { + "id": 4616, + "annotation_id": 11391, + "label_id": 8034, + "score": 0.78, + "created_at": pd.Timestamp("2024-07-29 15:36:33.745536"), + "label_key": "class", + "label_value": "0", + "datum_id": 830, + "datum_uid": "uid8", + }, + { + "id": 4621, + "annotation_id": 11392, + "label_id": 8033, + "score": 0.44, + "created_at": pd.Timestamp("2024-07-29 15:36:33.797759"), + "label_key": "class", + "label_value": "2", + "datum_id": 831, + "datum_uid": "uid9", + }, + { + "id": 4620, + "annotation_id": 11392, + "label_id": 8031, + "score": 0.11, + "created_at": pd.Timestamp("2024-07-29 15:36:33.797759"), + "label_key": "class", + "label_value": "1", + "datum_id": 831, + "datum_uid": "uid9", + }, + { + "id": 4619, + "annotation_id": 11392, + "label_id": 8034, + "score": 0.45, + "created_at": pd.Timestamp("2024-07-29 15:36:33.797759"), + "label_key": "class", + "label_value": "0", + "datum_id": 831, + "datum_uid": "uid9", + }, + { + "id": 4594, + "annotation_id": 11383, + "label_id": 8033, + "score": 0.28, + "created_at": pd.Timestamp("2024-07-29 15:36:33.411278"), + "label_key": "class", + "label_value": "2", + "datum_id": 822, + "datum_uid": "uid0", + }, + { + "id": 4593, + "annotation_id": 11383, + "label_id": 8031, + "score": 0.35, + "created_at": pd.Timestamp("2024-07-29 15:36:33.411278"), + "label_key": "class", + "label_value": "1", + "datum_id": 822, + "datum_uid": "uid0", + }, + { + "id": 4592, + "annotation_id": 11383, + "label_id": 8034, + "score": 0.37, + "created_at": pd.Timestamp("2024-07-29 15:36:33.411278"), + "label_key": "class", + "label_value": "0", + "datum_id": 822, + "datum_uid": "uid0", + }, + { + "id": 4597, + "annotation_id": 11384, + "label_id": 8033, + "score": 0.15, + "created_at": pd.Timestamp("2024-07-29 15:36:33.465625"), + "label_key": "class", + "label_value": "2", + "datum_id": 823, + "datum_uid": "uid1", + }, + { + "id": 4596, + "annotation_id": 11384, + "label_id": 8031, + "score": 0.61, + "created_at": pd.Timestamp("2024-07-29 15:36:33.465625"), + "label_key": "class", + "label_value": "1", + "datum_id": 823, + "datum_uid": "uid1", + }, + { + "id": 4595, + "annotation_id": 11384, + "label_id": 8034, + "score": 0.24, + "created_at": pd.Timestamp("2024-07-29 15:36:33.465625"), + "label_key": "class", + "label_value": "0", + "datum_id": 823, + "datum_uid": "uid1", + }, + ] + ) + + +@pytest.fixture +def evaluate_image_clf_groundtruths(): + return [ + schemas.GroundTruth( + datum=schemas.Datum( + uid="uid5", + metadata={ + "height": 900, + "width": 300, + }, + ), + annotations=[ + schemas.Annotation( + labels=[ + schemas.Label(key="k4", value="v4"), + schemas.Label(key="k5", value="v5"), + ], + ), + ], + ), + schemas.GroundTruth( + datum=schemas.Datum( + uid="uid6", + metadata={ + "height": 900, + "width": 300, + }, + ), + annotations=[ + schemas.Annotation( + labels=[schemas.Label(key="k4", value="v4")], + ) + ], + ), + schemas.GroundTruth( + datum=schemas.Datum( + uid="uid8", + metadata={ + "height": 900, + 
"width": 300, + }, + ), + annotations=[ + schemas.Annotation( + labels=[schemas.Label(key="k3", value="v3")], + ) + ], + ), + ] + + +@pytest.fixture +def evaluate_image_clf_predictions(): + return [ + schemas.Prediction( + datum=schemas.Datum( + uid="uid5", + metadata={ + "height": 900, + "width": 300, + }, + ), + annotations=[ + schemas.Annotation( + labels=[ + schemas.Label(key="k4", value="v1", score=0.47), + schemas.Label(key="k4", value="v8", score=0.53), + schemas.Label(key="k5", value="v1", score=1.0), + ], + ) + ], + ), + schemas.Prediction( + datum=schemas.Datum( + uid="uid6", + metadata={ + "height": 900, + "width": 300, + }, + ), + annotations=[ + schemas.Annotation( + labels=[ + schemas.Label(key="k4", value="v4", score=0.71), + schemas.Label(key="k4", value="v5", score=0.29), + ], + ) + ], + ), + schemas.Prediction( + datum=schemas.Datum( + uid="uid8", + metadata={ + "height": 900, + "width": 300, + }, + ), + annotations=[ + schemas.Annotation( + labels=[ + schemas.Label(key="k3", value="v1", score=1.0), + ], + ) + ], + ), + ] + + +@pytest.fixture +def gt_clfs_tabular() -> list[int]: + """ground truth for a tabular classification task""" + return [1, 1, 2, 0, 0, 0, 1, 1, 1, 1] + + +@pytest.fixture +def pred_clfs_tabular() -> list[list[float]]: + """predictions for a tabular classification task""" + return [ + [0.37, 0.35, 0.28], + [0.24, 0.61, 0.15], + [0.03, 0.88, 0.09], + [0.97, 0.03, 0.0], + [1.0, 0.0, 0.0], + [1.0, 0.0, 0.0], + [0.01, 0.96, 0.03], + [0.28, 0.02, 0.7], + [0.78, 0.21, 0.01], + [0.45, 0.11, 0.44], + ] + + +@pytest.fixture +def image_height(): + return 900 + + +@pytest.fixture +def image_width(): + return 300 + + +@pytest.fixture +def img1( + image_height: int, + image_width: int, +) -> schemas.Datum: + coordinates = [ + [ + (125.2750725, 38.760525), + (125.3902365, 38.775069), + (125.5054005, 38.789613), + (125.5051935, 38.71402425), + (125.5049865, 38.6384355), + (125.3902005, 38.6244225), + (125.2754145, 38.6104095), + (125.2752435, 38.68546725), + (125.2750725, 38.760525), + ] + ] + return schemas.Datum( + uid="uid1", + metadata={ + "geospatial": schemas.Polygon(coordinates), + "height": image_height, + "width": image_width, + }, + ) + + +@pytest.fixture +def img2( + image_height: int, + image_width: int, +) -> schemas.Datum: + coordinates = (44.1, 22.4) + return schemas.Datum( + uid="uid2", + metadata={ + "geospatial": schemas.Point(coordinates), + "height": image_height, + "width": image_width, + }, + ) + + +@pytest.fixture +def img5( + image_height: int, + image_width: int, +) -> schemas.Datum: + return schemas.Datum( + uid="uid5", + metadata={ + "height": image_height, + "width": image_width, + }, + ) + + +@pytest.fixture +def img6( + image_height: int, + image_width: int, +) -> schemas.Datum: + return schemas.Datum( + uid="uid6", + metadata={ + "height": image_height, + "width": image_width, + }, + ) + + +@pytest.fixture +def img8( + image_height: int, + image_width: int, +) -> schemas.Datum: + return schemas.Datum( + uid="uid8", + metadata={ + "height": image_height, + "width": image_width, + }, + ) + + +@pytest.fixture +def gt_clfs_with_label_maps( + img5: schemas.Datum, + img6: schemas.Datum, + img8: schemas.Datum, +) -> list[schemas.GroundTruth]: + return [ + schemas.GroundTruth( + datum=img5, + annotations=[ + schemas.Annotation( + labels=[ + schemas.Label(key="k4", value="v4"), + schemas.Label(key="k5", value="v5"), + schemas.Label(key="class", value="siamese cat"), + ], + ), + ], + ), + schemas.GroundTruth( + datum=img6, + annotations=[ + 
schemas.Annotation( + labels=[ + schemas.Label(key="k4", value="v4"), + schemas.Label(key="class", value="british shorthair"), + ], + ) + ], + ), + schemas.GroundTruth( + datum=img8, + annotations=[ + schemas.Annotation( + labels=[ + schemas.Label(key="k3", value="v3"), + schemas.Label(key="class", value="tabby cat"), + ], + ) + ], + ), + ] + + +@pytest.fixture +def pred_clfs_with_label_maps( + img5: schemas.Datum, + img6: schemas.Datum, + img8: schemas.Datum, +) -> list[schemas.Prediction]: + return [ + schemas.Prediction( + datum=img5, + annotations=[ + schemas.Annotation( + labels=[ + schemas.Label(key="k4", value="v1", score=0.47), + schemas.Label(key="k4", value="v8", score=0.53), + schemas.Label(key="k5", value="v1", score=1.0), + schemas.Label(key="class", value="cat", score=1.0), + ], + ) + ], + ), + schemas.Prediction( + datum=img6, + annotations=[ + schemas.Annotation( + labels=[ + schemas.Label(key="k4", value="v4", score=0.71), + schemas.Label(key="k4", value="v5", score=0.29), + schemas.Label( + key="class_name", value="cat", score=1.0 + ), + ], + ) + ], + ), + schemas.Prediction( + datum=img8, + annotations=[ + schemas.Annotation( + labels=[ + schemas.Label(key="k3", value="v1", score=1.0), + schemas.Label(key="class", value="cat", score=1.0), + ], + ) + ], + ), + ] + + +@pytest.fixture +def gt_clfs_label_key_mismatch( + img5: schemas.Datum, + img6: schemas.Datum, + img8: schemas.Datum, +) -> list[schemas.GroundTruth]: + return [ + schemas.GroundTruth( + datum=img5, + annotations=[ + schemas.Annotation( + labels=[ + schemas.Label(key="k4", value="v4"), + schemas.Label(key="k5", value="v5"), + ], + ), + ], + ), + schemas.GroundTruth( + datum=img6, + annotations=[ + schemas.Annotation( + labels=[schemas.Label(key="k4", value="v4")], + ) + ], + ), + schemas.GroundTruth( + datum=img8, + annotations=[ + schemas.Annotation( + labels=[schemas.Label(key="k3", value="v3")], + ) + ], + ), + ] + + +@pytest.fixture +def pred_clfs_label_key_mismatch( + img5: schemas.Datum, img6: schemas.Datum +) -> list[schemas.Prediction]: + return [ + schemas.Prediction( + datum=img5, + annotations=[ + schemas.Annotation( + labels=[ + schemas.Label(key="k12", value="v12", score=0.47), + schemas.Label(key="k12", value="v16", score=0.53), + schemas.Label(key="k13", value="v13", score=1.0), + ], + ) + ], + ), + schemas.Prediction( + datum=img6, + annotations=[ + schemas.Annotation( + labels=[ + schemas.Label(key="k4", value="v4", score=0.71), + schemas.Label(key="k4", value="v5", score=0.29), + ], + ) + ], + ), + ] + + +@pytest.fixture +def gt_clfs( + img5: schemas.Datum, + img6: schemas.Datum, + img8: schemas.Datum, +) -> list[schemas.GroundTruth]: + return [ + schemas.GroundTruth( + datum=img5, + annotations=[ + schemas.Annotation( + labels=[ + schemas.Label(key="k4", value="v4"), + schemas.Label(key="k5", value="v5"), + ], + ), + ], + ), + schemas.GroundTruth( + datum=img6, + annotations=[ + schemas.Annotation( + labels=[schemas.Label(key="k4", value="v4")], + ) + ], + ), + schemas.GroundTruth( + datum=img8, + annotations=[ + schemas.Annotation( + labels=[schemas.Label(key="k3", value="v3")], + ) + ], + ), + ] + + +@pytest.fixture +def classification_functional_test_data(): + animal_gts = ["bird", "dog", "bird", "bird", "cat", "dog"] + animal_preds = [ + {"bird": 0.6, "dog": 0.2, "cat": 0.2}, + {"cat": 0.9, "dog": 0.1, "bird": 0.0}, + {"cat": 0.8, "dog": 0.05, "bird": 0.15}, + {"dog": 0.75, "cat": 0.1, "bird": 0.15}, + {"cat": 1.0, "dog": 0.0, "bird": 0.0}, + {"cat": 0.4, "dog": 0.4, "bird": 0.2}, + ] + 
+ color_gts = ["white", "white", "red", "blue", "black", "red"] + color_preds = [ + {"white": 0.65, "red": 0.1, "blue": 0.2, "black": 0.05}, + {"blue": 0.5, "white": 0.3, "red": 0.0, "black": 0.2}, + {"red": 0.4, "white": 0.2, "blue": 0.1, "black": 0.3}, + {"white": 1.0, "red": 0.0, "blue": 0.0, "black": 0.0}, + {"red": 0.8, "white": 0.0, "blue": 0.2, "black": 0.0}, + {"red": 0.9, "white": 0.06, "blue": 0.01, "black": 0.03}, + ] + + imgs = [ + schemas.Datum( + uid=f"uid{i}", + metadata={ + "height": 128, + "width": 256, + }, + ) + for i in range(6) + ] + + gts = [ + schemas.GroundTruth( + datum=imgs[i], + annotations=[ + schemas.Annotation( + labels=[ + schemas.Label(key="animal", value=animal_gts[i]), + schemas.Label(key="color", value=color_gts[i]), + ], + ) + ], + ) + for i in range(6) + ] + + preds = [ + schemas.Prediction( + datum=imgs[i], + annotations=[ + schemas.Annotation( + labels=[ + schemas.Label(key="animal", value=value, score=score) + for value, score in animal_preds[i].items() + ] + + [ + schemas.Label(key="color", value=value, score=score) + for value, score in color_preds[i].items() + ], + ) + ], + ) + for i in range(6) + ] + + return (gts, preds) + + +@pytest.fixture +def classification_functional_test_groundtruth_df(): + return pd.DataFrame( + [ + { + "datum_uid": "uid0", + "datum_id": -5384017641951508119, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val0", + }, + "annotation_id": -7219056621792402854, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "animal", + "label_value": "bird", + "label_id": 6844413835611710259, + "id": -6147199056584656887, + "mapped_groundtruth_label_keys": "class", + "label": ("animal", "bird"), + }, + { + "datum_uid": "uid0", + "datum_id": -5384017641951508119, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val0", + }, + "annotation_id": -7219056621792402854, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "white", + "label_id": 1137203407882171315, + "id": 8837325099618861823, + "mapped_groundtruth_label_keys": "color", + "label": ("color", "white"), + }, + { + "datum_uid": "uid1", + "datum_id": -8510955155591861879, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val1", + }, + "annotation_id": 8790918715870844863, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "animal", + "label_value": "dog", + "label_id": 8009222289478380372, + "id": -1593123359500601416, + "mapped_groundtruth_label_keys": "class", + "label": ("animal", "dog"), + }, + { + "datum_uid": "uid1", + "datum_id": -8510955155591861879, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val1", + }, + "annotation_id": 8790918715870844863, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "white", + "label_id": 1137203407882171315, + "id": 3582630467549642626, + "mapped_groundtruth_label_keys": "color", + "label": ("color", "white"), + }, + { + "datum_uid": "uid2", + "datum_id": -8411940843701065439, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": 
"md1-val0", + "md2": "md1-val2", + }, + "annotation_id": -3239983991430348508, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "animal", + "label_value": "bird", + "label_id": 6844413835611710259, + "id": -6917823642762098726, + "mapped_groundtruth_label_keys": "class", + "label": ("animal", "bird"), + }, + { + "datum_uid": "uid2", + "datum_id": -8411940843701065439, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val2", + }, + "annotation_id": -3239983991430348508, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "red", + "label_id": -3886640484917084310, + "id": -1339278877785114234, + "mapped_groundtruth_label_keys": "color", + "label": ("color", "red"), + }, + { + "datum_uid": "uid3", + "datum_id": -2265528102457502931, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val0", + }, + "annotation_id": 4382196578706948542, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "animal", + "label_value": "bird", + "label_id": 6844413835611710259, + "id": 1083297721794099590, + "mapped_groundtruth_label_keys": "class", + "label": ("animal", "bird"), + }, + { + "datum_uid": "uid3", + "datum_id": -2265528102457502931, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val0", + }, + "annotation_id": 4382196578706948542, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "blue", + "label_id": -1372075868144138351, + "id": -615284425434206300, + "mapped_groundtruth_label_keys": "color", + "label": ("color", "blue"), + }, + { + "datum_uid": "uid4", + "datum_id": -4389124420839664731, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val1", + "md2": "md1-val1", + }, + "annotation_id": 4962111685767385274, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "animal", + "label_value": "cat", + "label_id": 4524343817500814041, + "id": -7816578330009256692, + "mapped_groundtruth_label_keys": "class", + "label": ("animal", "cat"), + }, + { + "datum_uid": "uid4", + "datum_id": -4389124420839664731, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val1", + "md2": "md1-val1", + }, + "annotation_id": 4962111685767385274, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "black", + "label_id": 1817852877141727993, + "id": -5129897778521880842, + "mapped_groundtruth_label_keys": "color", + "label": ("color", "black"), + }, + { + "datum_uid": "uid5", + "datum_id": 5314927723853009775, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val2", + }, + "annotation_id": -746121109706998955, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "animal", + "label_value": "dog", + "label_id": 8009222289478380372, + "id": -6769946184488850844, + 
"mapped_groundtruth_label_keys": "class", + "label": ("animal", "dog"), + }, + { + "datum_uid": "uid5", + "datum_id": 5314927723853009775, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val2", + }, + "annotation_id": -746121109706998955, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "red", + "label_id": -3886640484917084310, + "id": -503991891998595125, + "mapped_groundtruth_label_keys": "color", + "label": ("color", "red"), + }, + ] + ) + + +@pytest.fixture +def classification_functional_test_prediction_df(): + """Used in test_rocauc_with_label_map so that we can test _calculate_rocauc directly, since this original text violated the matching groundtruth/prediction label keys criteria.""" + return pd.DataFrame( + [ + { + "datum_uid": "uid0", + "datum_id": -5384017641951508119, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val0", + }, + "annotation_id": -6728727181236673047, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "animal", + "label_value": "bird", + "score": 0.6, + "label_id": -5215084239238914495, + "id": -1240527857667701281, + "mapped_prediction_label_keys": "class", + "label": ("animal", "bird"), + }, + { + "datum_uid": "uid0", + "datum_id": -5384017641951508119, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val0", + }, + "annotation_id": -6728727181236673047, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "animal", + "label_value": "dog", + "score": 0.2, + "label_id": -6049586979668957678, + "id": 49317224219915580, + "mapped_prediction_label_keys": "class", + "label": ("animal", "dog"), + }, + { + "datum_uid": "uid0", + "datum_id": -5384017641951508119, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val0", + }, + "annotation_id": -6728727181236673047, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "animal", + "label_value": "cat", + "score": 0.2, + "label_id": 7273800936934963489, + "id": 233173136032973625, + "mapped_prediction_label_keys": "class", + "label": ("animal", "cat"), + }, + { + "datum_uid": "uid0", + "datum_id": -5384017641951508119, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val0", + }, + "annotation_id": -6728727181236673047, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "white", + "score": 0.65, + "label_id": -4826903763707637373, + "id": -6184807819874130814, + "mapped_prediction_label_keys": "color", + "label": ("color", "white"), + }, + { + "datum_uid": "uid0", + "datum_id": -5384017641951508119, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val0", + }, + "annotation_id": -6728727181236673047, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "red", + "score": 0.1, + "label_id": 4216827315928697217, + "id": 
5704534164417962892, + "mapped_prediction_label_keys": "color", + "label": ("color", "red"), + }, + { + "datum_uid": "uid0", + "datum_id": -5384017641951508119, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val0", + }, + "annotation_id": -6728727181236673047, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "blue", + "score": 0.2, + "label_id": -3960395303314501711, + "id": 1511896606515226706, + "mapped_prediction_label_keys": "color", + "label": ("color", "blue"), + }, + { + "datum_uid": "uid0", + "datum_id": -5384017641951508119, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val0", + }, + "annotation_id": -6728727181236673047, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "black", + "score": 0.05, + "label_id": -8589704813442599109, + "id": 3647731253780364946, + "mapped_prediction_label_keys": "color", + "label": ("color", "black"), + }, + { + "datum_uid": "uid1", + "datum_id": -8510955155591861879, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val1", + }, + "annotation_id": 4939978831501967353, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "animal", + "label_value": "cat", + "score": 0.9, + "label_id": 2094222191875474652, + "id": -4753231139294527417, + "mapped_prediction_label_keys": "class", + "label": ("animal", "cat"), + }, + { + "datum_uid": "uid1", + "datum_id": -8510955155591861879, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val1", + }, + "annotation_id": 4939978831501967353, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "animal", + "label_value": "dog", + "score": 0.1, + "label_id": -4878077841794693757, + "id": 8538318431236799830, + "mapped_prediction_label_keys": "class", + "label": ("animal", "dog"), + }, + { + "datum_uid": "uid1", + "datum_id": -8510955155591861879, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val1", + }, + "annotation_id": 4939978831501967353, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "animal", + "label_value": "bird", + "score": 0.0, + "label_id": 8183125692418530608, + "id": 5468044993361705841, + "mapped_prediction_label_keys": "class", + "label": ("animal", "bird"), + }, + { + "datum_uid": "uid1", + "datum_id": -8510955155591861879, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val1", + }, + "annotation_id": 4939978831501967353, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "blue", + "score": 0.5, + "label_id": 5578669252512141405, + "id": 5993876661711494245, + "mapped_prediction_label_keys": "color", + "label": ("color", "blue"), + }, + { + "datum_uid": "uid1", + "datum_id": -8510955155591861879, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": 
"md1-val1", + }, + "annotation_id": 4939978831501967353, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "white", + "score": 0.3, + "label_id": -4200814355896957607, + "id": -1473852835329269153, + "mapped_prediction_label_keys": "color", + "label": ("color", "white"), + }, + { + "datum_uid": "uid1", + "datum_id": -8510955155591861879, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val1", + }, + "annotation_id": 4939978831501967353, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "red", + "score": 0.0, + "label_id": -519495577997781294, + "id": -2806063230919808758, + "mapped_prediction_label_keys": "color", + "label": ("color", "red"), + }, + { + "datum_uid": "uid1", + "datum_id": -8510955155591861879, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val1", + }, + "annotation_id": 4939978831501967353, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "black", + "score": 0.2, + "label_id": -4372451618257326717, + "id": -9192777550609387657, + "mapped_prediction_label_keys": "color", + "label": ("color", "black"), + }, + { + "datum_uid": "uid2", + "datum_id": -8411940843701065439, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val2", + }, + "annotation_id": 7499720668016145718, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "animal", + "label_value": "cat", + "score": 0.8, + "label_id": 3361029567128538938, + "id": -2495225296460022208, + "mapped_prediction_label_keys": "class", + "label": ("animal", "cat"), + }, + { + "datum_uid": "uid2", + "datum_id": -8411940843701065439, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val2", + }, + "annotation_id": 7499720668016145718, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "animal", + "label_value": "dog", + "score": 0.05, + "label_id": 1495879137950468608, + "id": 96491879800885197, + "mapped_prediction_label_keys": "class", + "label": ("animal", "dog"), + }, + { + "datum_uid": "uid2", + "datum_id": -8411940843701065439, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val2", + }, + "annotation_id": 7499720668016145718, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "animal", + "label_value": "bird", + "score": 0.15, + "label_id": -3283720280595522641, + "id": 1354699752396805280, + "mapped_prediction_label_keys": "class", + "label": ("animal", "bird"), + }, + { + "datum_uid": "uid2", + "datum_id": -8411940843701065439, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val2", + }, + "annotation_id": 7499720668016145718, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "red", + "score": 
0.4, + "label_id": -2416149083383886333, + "id": 268130056698580260, + "mapped_prediction_label_keys": "color", + "label": ("color", "red"), + }, + { + "datum_uid": "uid2", + "datum_id": -8411940843701065439, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val2", + }, + "annotation_id": 7499720668016145718, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "white", + "score": 0.2, + "label_id": -1998826250032086593, + "id": -4021126010657534621, + "mapped_prediction_label_keys": "color", + "label": ("color", "white"), + }, + { + "datum_uid": "uid2", + "datum_id": -8411940843701065439, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val2", + }, + "annotation_id": 7499720668016145718, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "blue", + "score": 0.1, + "label_id": -4127427154085111908, + "id": 6376790152767730567, + "mapped_prediction_label_keys": "color", + "label": ("color", "blue"), + }, + { + "datum_uid": "uid2", + "datum_id": -8411940843701065439, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val2", + }, + "annotation_id": 7499720668016145718, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "black", + "score": 0.3, + "label_id": -5292453587279810103, + "id": 7023758392816762513, + "mapped_prediction_label_keys": "color", + "label": ("color", "black"), + }, + { + "datum_uid": "uid3", + "datum_id": -2265528102457502931, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val0", + }, + "annotation_id": 4348440930043552140, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "animal", + "label_value": "dog", + "score": 0.75, + "label_id": -1804361582153801946, + "id": 2109915554097816409, + "mapped_prediction_label_keys": "class", + "label": ("animal", "dog"), + }, + { + "datum_uid": "uid3", + "datum_id": -2265528102457502931, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val0", + }, + "annotation_id": 4348440930043552140, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "animal", + "label_value": "cat", + "score": 0.1, + "label_id": -4720233526095501343, + "id": -7234886842398502296, + "mapped_prediction_label_keys": "class", + "label": ("animal", "cat"), + }, + { + "datum_uid": "uid3", + "datum_id": -2265528102457502931, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val0", + }, + "annotation_id": 4348440930043552140, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "animal", + "label_value": "bird", + "score": 0.15, + "label_id": -3283720280595522641, + "id": 1110595858053279959, + "mapped_prediction_label_keys": "class", + "label": ("animal", "bird"), + }, + { + "datum_uid": "uid3", + "datum_id": -2265528102457502931, + "datum_metadata": { + "height": 128, + 
"width": 256, + "md1": "md1-val0", + "md2": "md1-val0", + }, + "annotation_id": 4348440930043552140, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "white", + "score": 1.0, + "label_id": 5280415162891465313, + "id": 8226781192373612358, + "mapped_prediction_label_keys": "color", + "label": ("color", "white"), + }, + { + "datum_uid": "uid3", + "datum_id": -2265528102457502931, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val0", + }, + "annotation_id": 4348440930043552140, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "red", + "score": 0.0, + "label_id": -519495577997781294, + "id": -1930456292948739198, + "mapped_prediction_label_keys": "color", + "label": ("color", "red"), + }, + { + "datum_uid": "uid3", + "datum_id": -2265528102457502931, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val0", + }, + "annotation_id": 4348440930043552140, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "blue", + "score": 0.0, + "label_id": 6597917751396615534, + "id": 5770081132013712295, + "mapped_prediction_label_keys": "color", + "label": ("color", "blue"), + }, + { + "datum_uid": "uid3", + "datum_id": -2265528102457502931, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val0", + }, + "annotation_id": 4348440930043552140, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "black", + "score": 0.0, + "label_id": 1350538389931074891, + "id": 9216624913651577421, + "mapped_prediction_label_keys": "color", + "label": ("color", "black"), + }, + { + "datum_uid": "uid4", + "datum_id": -4389124420839664731, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val1", + "md2": "md1-val1", + }, + "annotation_id": -3609568981720823102, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "animal", + "label_value": "cat", + "score": 1.0, + "label_id": 7155939162232491288, + "id": 8865373275147915155, + "mapped_prediction_label_keys": "class", + "label": ("animal", "cat"), + }, + { + "datum_uid": "uid4", + "datum_id": -4389124420839664731, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val1", + "md2": "md1-val1", + }, + "annotation_id": -3609568981720823102, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "animal", + "label_value": "dog", + "score": 0.0, + "label_id": -8923497484890863398, + "id": 7811596003484809003, + "mapped_prediction_label_keys": "class", + "label": ("animal", "dog"), + }, + { + "datum_uid": "uid4", + "datum_id": -4389124420839664731, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val1", + "md2": "md1-val1", + }, + "annotation_id": -3609568981720823102, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": 
"animal", + "label_value": "bird", + "score": 0.0, + "label_id": 8183125692418530608, + "id": -603291948951724467, + "mapped_prediction_label_keys": "class", + "label": ("animal", "bird"), + }, + { + "datum_uid": "uid4", + "datum_id": -4389124420839664731, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val1", + "md2": "md1-val1", + }, + "annotation_id": -3609568981720823102, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "red", + "score": 0.8, + "label_id": 1005923488131372002, + "id": 2186370402320236011, + "mapped_prediction_label_keys": "color", + "label": ("color", "red"), + }, + { + "datum_uid": "uid4", + "datum_id": -4389124420839664731, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val1", + "md2": "md1-val1", + }, + "annotation_id": -3609568981720823102, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "white", + "score": 0.0, + "label_id": -6581901677798598125, + "id": 5980951779669100519, + "mapped_prediction_label_keys": "color", + "label": ("color", "white"), + }, + { + "datum_uid": "uid4", + "datum_id": -4389124420839664731, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val1", + "md2": "md1-val1", + }, + "annotation_id": -3609568981720823102, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "blue", + "score": 0.2, + "label_id": -3960395303314501711, + "id": -2623103473497724690, + "mapped_prediction_label_keys": "color", + "label": ("color", "blue"), + }, + { + "datum_uid": "uid4", + "datum_id": -4389124420839664731, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val1", + "md2": "md1-val1", + }, + "annotation_id": -3609568981720823102, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "black", + "score": 0.0, + "label_id": 1350538389931074891, + "id": 1948160906536205683, + "mapped_prediction_label_keys": "color", + "label": ("color", "black"), + }, + { + "datum_uid": "uid5", + "datum_id": 5314927723853009775, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val2", + }, + "annotation_id": 2454836867465092903, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "animal", + "label_value": "cat", + "score": 0.4, + "label_id": -5278394517120365112, + "id": 8196690759347808946, + "mapped_prediction_label_keys": "class", + "label": ("animal", "cat"), + }, + { + "datum_uid": "uid5", + "datum_id": 5314927723853009775, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val2", + }, + "annotation_id": 2454836867465092903, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "animal", + "label_value": "dog", + "score": 0.4, + "label_id": -3672411415008402703, + "id": -1938030899200555758, + "mapped_prediction_label_keys": "class", + "label": ("animal", "dog"), + }, + { + "datum_uid": "uid5", + "datum_id": 5314927723853009775, 
+ "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val2", + }, + "annotation_id": 2454836867465092903, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "animal", + "label_value": "bird", + "score": 0.2, + "label_id": -4720668901151276709, + "id": -375807178484672075, + "mapped_prediction_label_keys": "class", + "label": ("animal", "bird"), + }, + { + "datum_uid": "uid5", + "datum_id": 5314927723853009775, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val2", + }, + "annotation_id": 2454836867465092903, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "red", + "score": 0.9, + "label_id": -2571710428146614475, + "id": 7302285613830353470, + "mapped_prediction_label_keys": "color", + "label": ("color", "red"), + }, + { + "datum_uid": "uid5", + "datum_id": 5314927723853009775, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val2", + }, + "annotation_id": 2454836867465092903, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "white", + "score": 0.06, + "label_id": 6423587877188027700, + "id": -5213005280939427276, + "mapped_prediction_label_keys": "color", + "label": ("color", "white"), + }, + { + "datum_uid": "uid5", + "datum_id": 5314927723853009775, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val2", + }, + "annotation_id": 2454836867465092903, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "blue", + "score": 0.01, + "label_id": -7515229394567381620, + "id": 3837015023039237314, + "mapped_prediction_label_keys": "color", + "label": ("color", "blue"), + }, + { + "datum_uid": "uid5", + "datum_id": 5314927723853009775, + "datum_metadata": { + "height": 128, + "width": 256, + "md1": "md1-val0", + "md2": "md1-val2", + }, + "annotation_id": 2454836867465092903, + "annotation_metadata": None, + "bounding_box": None, + "raster": None, + "embedding": None, + "polygon": None, + "is_instance": None, + "label_key": "color", + "label_value": "black", + "score": 0.03, + "label_id": -824168874021550241, + "id": 551917309394979383, + "mapped_prediction_label_keys": "color", + "label": ("color", "black"), + }, + ] + ) diff --git a/core/tests/functional-tests/test_classification.py b/core/tests/functional-tests/test_classification.py new file mode 100644 index 000000000..a56b92bc6 --- /dev/null +++ b/core/tests/functional-tests/test_classification.py @@ -0,0 +1,1247 @@ +import random + +import pytest +from valor_core import enums, schemas +from valor_core.classification import ( + _calculate_rocauc, + evaluate_classification, +) + + +def test_evaluate_image_clf( + evaluate_image_clf_groundtruths, evaluate_image_clf_predictions +): + + eval_job = evaluate_classification( + groundtruths=evaluate_image_clf_groundtruths, + predictions=evaluate_image_clf_predictions, + ) + + eval_job_metrics = eval_job.metrics + + expected_metrics = [ + {"type": "Accuracy", "parameters": {"label_key": "k4"}, "value": 0.5}, + { + "type": "ROCAUC", + "parameters": {"label_key": "k4"}, + 
"value": 1.0, + }, + { + "type": "Precision", + "value": 1.0, # no false predictions + "label": {"key": "k4", "value": "v4"}, + }, + { + "type": "Recall", + "value": 0.5, # img5 had the correct prediction, but not img6 + "label": {"key": "k4", "value": "v4"}, + }, + { + "type": "F1", + "value": 0.6666666666666666, + "label": {"key": "k4", "value": "v4"}, + }, + { + "type": "Precision", + "value": 0.0, + "label": {"key": "k4", "value": "v8"}, + }, + { + "type": "Recall", + "value": 0.0, + "label": {"key": "k4", "value": "v8"}, + }, + {"type": "F1", "value": 0.0, "label": {"key": "k4", "value": "v8"}}, + { + "type": "Precision", + "value": -1.0, + "label": {"key": "k4", "value": "v5"}, + }, + { + "type": "Recall", + "value": -1.0, + "label": {"key": "k4", "value": "v5"}, + }, + {"type": "F1", "value": -1.0, "label": {"key": "k4", "value": "v5"}}, + { + "type": "Precision", + "value": -1.0, # this value is -1 (not 0) because this label is never used anywhere; (k4, v8) has the higher score for img5, so it's chosen over (k4, v1) + "label": {"key": "k4", "value": "v1"}, + }, + { + "type": "Recall", + "value": -1.0, + "label": {"key": "k4", "value": "v1"}, + }, + {"type": "F1", "value": -1.0, "label": {"key": "k4", "value": "v1"}}, + {"type": "Accuracy", "parameters": {"label_key": "k5"}, "value": 0.0}, + { + "type": "ROCAUC", + "parameters": {"label_key": "k5"}, + "value": 1.0, + }, + { + "type": "Precision", + "value": 0.0, + "label": {"key": "k5", "value": "v1"}, + }, + { + "type": "Recall", + "value": 0.0, + "label": {"key": "k5", "value": "v1"}, + }, + {"type": "F1", "value": 0.0, "label": {"key": "k5", "value": "v1"}}, + { + "type": "Precision", + "value": 0.0, + "label": {"key": "k5", "value": "v5"}, + }, + { + "type": "Recall", + "value": 0.0, + "label": {"key": "k5", "value": "v5"}, + }, + {"type": "F1", "value": 0.0, "label": {"key": "k5", "value": "v5"}}, + {"type": "Accuracy", "parameters": {"label_key": "k3"}, "value": 0.0}, + {"type": "ROCAUC", "parameters": {"label_key": "k3"}, "value": 1.0}, + { + "type": "Precision", + "value": 0.0, + "label": {"key": "k3", "value": "v1"}, + }, + { + "type": "Recall", + "value": 0.0, + "label": {"key": "k3", "value": "v1"}, + }, + {"type": "F1", "value": 0.0, "label": {"key": "k3", "value": "v1"}}, + { + "type": "Precision", + "value": 0.0, + "label": {"key": "k3", "value": "v3"}, + }, + { + "type": "Recall", + "value": 0.0, + "label": {"key": "k3", "value": "v3"}, + }, + {"type": "F1", "value": 0.0, "label": {"key": "k3", "value": "v3"}}, + ] + + expected_confusion_matrices = [ + { + "label_key": "k5", + "entries": [{"prediction": "v1", "groundtruth": "v5", "count": 1}], + }, + { + "label_key": "k4", + "entries": [ + {"prediction": "v4", "groundtruth": "v4", "count": 1}, + {"prediction": "v8", "groundtruth": "v4", "count": 1}, + ], + }, + { + "label_key": "k3", + "entries": [{"prediction": "v1", "groundtruth": "v3", "count": 1}], + }, + ] + + for m in eval_job_metrics: + if m["type"] not in [ + "PrecisionRecallCurve", + "DetailedPrecisionRecallCurve", + ]: + assert m in expected_metrics + for m in expected_metrics: + assert m in eval_job_metrics + + confusion_matrices = eval_job.confusion_matrices + assert confusion_matrices + for m in confusion_matrices: + assert m in expected_confusion_matrices + for m in expected_confusion_matrices: + assert m in confusion_matrices + + # test evaluation metadata + expected_metadata = { + "datums": 3, + "labels": 8, + "annotations": 6, + } + + for key, value in expected_metadata.items(): + assert 
eval_job.meta[key] == value # type: ignore - issue #605 + + # eval should definitely take less than 5 seconds, usually around .4 + assert eval_job.meta["duration"] <= 5 # type: ignore - issue #605 + + # check that metrics arg works correctly + selected_metrics = random.sample( + [ + enums.MetricType.Accuracy, + enums.MetricType.ROCAUC, + enums.MetricType.Precision, + enums.MetricType.F1, + enums.MetricType.Recall, + enums.MetricType.PrecisionRecallCurve, + ], + 2, + ) + + eval_job = evaluate_classification( + groundtruths=evaluate_image_clf_groundtruths, + predictions=evaluate_image_clf_predictions, + metrics_to_return=selected_metrics, + ) + + assert set([metric["type"] for metric in eval_job.metrics]) == set( + selected_metrics + ) + + # check that passing None to metrics returns the assumed list of default metrics + default_metrics = [ + enums.MetricType.Precision, + enums.MetricType.Recall, + enums.MetricType.F1, + enums.MetricType.Accuracy, + enums.MetricType.ROCAUC, + ] + eval_job = evaluate_classification( + groundtruths=evaluate_image_clf_groundtruths, + predictions=evaluate_image_clf_predictions, + metrics_to_return=None, + ) + assert set([metric["type"] for metric in eval_job.metrics]) == set( + default_metrics + ) + + +def test_evaluate_tabular_clf( + evaluate_tabular_clf_groundtruths, evaluate_tabular_clf_predictions +): + eval_job = evaluate_classification( + groundtruths=evaluate_tabular_clf_groundtruths, + predictions=evaluate_tabular_clf_predictions, + ) + + eval_job_metrics = eval_job.metrics + + expected_metrics = [ + { + "type": "Accuracy", + "parameters": {"label_key": "class"}, + "value": 0.5, + }, + { + "type": "ROCAUC", + "parameters": {"label_key": "class"}, + "value": 0.7685185185185185, + }, + { + "type": "Precision", + "value": 0.6666666666666666, + "label": {"key": "class", "value": "1"}, + }, + { + "type": "Recall", + "value": 0.3333333333333333, + "label": {"key": "class", "value": "1"}, + }, + { + "type": "F1", + "value": 0.4444444444444444, + "label": {"key": "class", "value": "1"}, + }, + { + "type": "Precision", + "value": 0.0, + "label": {"key": "class", "value": "2"}, + }, + { + "type": "Recall", + "value": 0.0, + "label": {"key": "class", "value": "2"}, + }, + {"type": "F1", "value": 0.0, "label": {"key": "class", "value": "2"}}, + { + "type": "Precision", + "value": 0.5, + "label": {"key": "class", "value": "0"}, + }, + { + "type": "Recall", + "value": 1.0, + "label": {"key": "class", "value": "0"}, + }, + { + "type": "F1", + "value": 0.6666666666666666, + "label": {"key": "class", "value": "0"}, + }, + ] + for m in eval_job_metrics: + if m["type"] not in [ + "PrecisionRecallCurve", + "DetailedPrecisionRecallCurve", + ]: + assert m in expected_metrics + for m in expected_metrics: + assert m in eval_job_metrics + + confusion_matrices = eval_job.confusion_matrices + + expected_confusion_matrix = { + "label_key": "class", + "entries": [ + {"prediction": "0", "groundtruth": "0", "count": 3}, + {"prediction": "0", "groundtruth": "1", "count": 3}, + {"prediction": "1", "groundtruth": "1", "count": 2}, + {"prediction": "1", "groundtruth": "2", "count": 1}, + {"prediction": "2", "groundtruth": "1", "count": 1}, + ], + } + + # validate return schema + assert confusion_matrices + assert len(confusion_matrices) == 1 + confusion_matrix = confusion_matrices[0] + assert "label_key" in confusion_matrix + assert "entries" in confusion_matrix + + # validate values + assert ( + confusion_matrix["label_key"] == expected_confusion_matrix["label_key"] + ) + for entry in 
confusion_matrix["entries"]: + assert entry in expected_confusion_matrix["entries"] + for entry in expected_confusion_matrix["entries"]: + assert entry in confusion_matrix["entries"] + + # validate return schema + assert len(confusion_matrices) == 1 + confusion_matrix = confusion_matrices[0] + assert "label_key" in confusion_matrix + assert "entries" in confusion_matrix + + # validate values + assert ( + confusion_matrix["label_key"] == expected_confusion_matrix["label_key"] + ) + for entry in confusion_matrix["entries"]: + assert entry in expected_confusion_matrix["entries"] + for entry in expected_confusion_matrix["entries"]: + assert entry in confusion_matrix["entries"] + + +def test_evaluate_classification_with_label_maps( + gt_clfs_with_label_maps: list[schemas.GroundTruth], + pred_clfs_with_label_maps: list[schemas.Prediction], +): + # check baseline case, where we have mismatched ground truth and prediction label keys + with pytest.raises(ValueError) as e: + evaluate_classification( + groundtruths=gt_clfs_with_label_maps, + predictions=pred_clfs_with_label_maps, + ) + assert "label keys must match" in str(e) + + # now try using a label map to connect all the cats + label_mapping = { + # map the ground truths + schemas.Label(key="class", value="tabby cat"): schemas.Label( + key="special_class", value="cat_type1" + ), + schemas.Label(key="class", value="siamese cat"): schemas.Label( + key="special_class", value="cat_type1" + ), + schemas.Label(key="class", value="british shorthair"): schemas.Label( + key="special_class", value="cat_type1" + ), + # map the predictions + schemas.Label(key="class", value="cat"): schemas.Label( + key="special_class", value="cat_type1" + ), + schemas.Label(key="class_name", value="cat"): schemas.Label( + key="special_class", value="cat_type1" + ), + } + + cat_expected_metrics = [ + {"type": "Accuracy", "parameters": {"label_key": "k3"}, "value": 0.0}, + {"type": "ROCAUC", "parameters": {"label_key": "k3"}, "value": 1.0}, + { + "type": "Precision", + "value": 0.0, + "label": {"key": "k3", "value": "v1"}, + }, + { + "type": "Recall", + "value": 0.0, + "label": {"key": "k3", "value": "v1"}, + }, + {"type": "F1", "value": 0.0, "label": {"key": "k3", "value": "v1"}}, + { + "type": "Precision", + "value": 0.0, + "label": {"key": "k3", "value": "v3"}, + }, + { + "type": "Recall", + "value": 0.0, + "label": {"key": "k3", "value": "v3"}, + }, + {"type": "F1", "value": 0.0, "label": {"key": "k3", "value": "v3"}}, + {"type": "Accuracy", "parameters": {"label_key": "k5"}, "value": 0.0}, + {"type": "ROCAUC", "parameters": {"label_key": "k5"}, "value": 1.0}, + { + "type": "Precision", + "value": 0.0, + "label": {"key": "k5", "value": "v5"}, + }, + { + "type": "Recall", + "value": 0.0, + "label": {"key": "k5", "value": "v5"}, + }, + {"type": "F1", "value": 0.0, "label": {"key": "k5", "value": "v5"}}, + { + "type": "Precision", + "value": 0.0, + "label": {"key": "k5", "value": "v1"}, + }, + { + "type": "Recall", + "value": 0.0, + "label": {"key": "k5", "value": "v1"}, + }, + {"type": "F1", "value": 0.0, "label": {"key": "k5", "value": "v1"}}, + { + "type": "Accuracy", + "parameters": {"label_key": "special_class"}, + "value": 1.0, + }, + { + "type": "ROCAUC", + "parameters": {"label_key": "special_class"}, + "value": 1.0, + }, + { + "type": "Precision", + "value": 1.0, + "label": {"key": "special_class", "value": "cat_type1"}, + }, + { + "type": "Recall", + "value": 1.0, + "label": {"key": "special_class", "value": "cat_type1"}, + }, + { + "type": "F1", + "value": 1.0, + 
"label": {"key": "special_class", "value": "cat_type1"}, + }, + {"type": "Accuracy", "parameters": {"label_key": "k4"}, "value": 0.5}, + { + "type": "ROCAUC", + "parameters": { + "label_key": "k4", + }, + "value": 1.0, + }, + { + "type": "Precision", + "value": -1.0, + "label": {"key": "k4", "value": "v5"}, + }, + { + "type": "Recall", + "value": -1.0, + "label": {"key": "k4", "value": "v5"}, + }, + {"type": "F1", "value": -1.0, "label": {"key": "k4", "value": "v5"}}, + { + "type": "Precision", + "value": -1.0, + "label": {"key": "k4", "value": "v1"}, + }, + { + "type": "Recall", + "value": -1.0, + "label": {"key": "k4", "value": "v1"}, + }, + {"type": "F1", "value": -1.0, "label": {"key": "k4", "value": "v1"}}, + { + "type": "Precision", + "value": 1.0, + "label": {"key": "k4", "value": "v4"}, + }, + { + "type": "Recall", + "value": 0.5, + "label": {"key": "k4", "value": "v4"}, + }, + { + "type": "F1", + "value": 0.6666666666666666, + "label": {"key": "k4", "value": "v4"}, + }, + { + "type": "Precision", + "value": 0.0, + "label": {"key": "k4", "value": "v8"}, + }, + { + "type": "Recall", + "value": 0.0, + "label": {"key": "k4", "value": "v8"}, + }, + {"type": "F1", "value": 0.0, "label": {"key": "k4", "value": "v8"}}, + ] + + cat_expected_cm = [ + { + "label_key": "special_class", + "entries": [ + { + "prediction": "cat_type1", + "groundtruth": "cat_type1", + "count": 3, + } + ], + } + # other label keys not included for testing purposes + ] + + eval_job = evaluate_classification( + groundtruths=gt_clfs_with_label_maps, + predictions=pred_clfs_with_label_maps, + label_map=label_mapping, + pr_curve_max_examples=3, + metrics_to_return=[ + enums.MetricType.Precision, + enums.MetricType.Recall, + enums.MetricType.F1, + enums.MetricType.Accuracy, + enums.MetricType.ROCAUC, + enums.MetricType.PrecisionRecallCurve, + enums.MetricType.DetailedPrecisionRecallCurve, + ], + ) + + pr_expected_values = { + # k3 + (0, "k3", "v1", "0.1", "fp"): 1, + (0, "k3", "v1", "0.1", "tn"): 2, + (0, "k3", "v3", "0.1", "fn"): 1, + (0, "k3", "v3", "0.1", "tn"): 2, + (0, "k3", "v3", "0.1", "accuracy"): 2 / 3, + (0, "k3", "v3", "0.1", "precision"): -1, + (0, "k3", "v3", "0.1", "recall"): 0, + (0, "k3", "v3", "0.1", "f1_score"): -1, + # k4 + (1, "k4", "v1", "0.1", "fp"): 1, + (1, "k4", "v1", "0.1", "tn"): 2, + (1, "k4", "v4", "0.1", "fn"): 1, + (1, "k4", "v4", "0.1", "tn"): 1, + (1, "k4", "v4", "0.1", "tp"): 1, + (1, "k4", "v4", "0.9", "tp"): 0, + (1, "k4", "v4", "0.9", "tn"): 1, + (1, "k4", "v4", "0.9", "fn"): 2, + (1, "k4", "v5", "0.1", "fp"): 1, + (1, "k4", "v5", "0.1", "tn"): 2, + (1, "k4", "v5", "0.3", "fp"): 0, + (1, "k4", "v5", "0.3", "tn"): 3, + (1, "k4", "v8", "0.1", "tn"): 2, + (1, "k4", "v8", "0.6", "fp"): 0, + (1, "k4", "v8", "0.6", "tn"): 3, + # k5 + (2, "k5", "v1", "0.1", "fp"): 1, + (2, "k5", "v1", "0.1", "tn"): 2, + (2, "k5", "v5", "0.1", "fn"): 1, + ( + 2, + "k5", + "v5", + "0.1", + "tn", + ): 2, + (2, "k5", "v1", "0.1", "accuracy"): 2 / 3, + (2, "k5", "v1", "0.1", "precision"): 0, + (2, "k5", "v1", "0.1", "recall"): -1, + (2, "k5", "v1", "0.1", "f1_score"): -1, + # special_class + (3, "special_class", "cat_type1", "0.1", "tp"): 3, + (3, "special_class", "cat_type1", "0.1", "tn"): 0, + (3, "special_class", "cat_type1", "0.95", "tp"): 3, + } + + pr_metrics = [] + detailed_pr_metrics = [] + for m in eval_job.metrics: + if m["type"] == "PrecisionRecallCurve": + pr_metrics.append(m) + elif m["type"] == "DetailedPrecisionRecallCurve": + detailed_pr_metrics.append(m) + else: + assert m in 
cat_expected_metrics + + for m in cat_expected_metrics: + assert m in eval_job.metrics + + pr_metrics.sort(key=lambda x: x["parameters"]["label_key"]) + detailed_pr_metrics.sort(key=lambda x: x["parameters"]["label_key"]) + + for ( + index, + key, + value, + threshold, + metric, + ), expected_value in pr_expected_values.items(): + assert ( + pr_metrics[index]["value"][value][float(threshold)][metric] + == expected_value + ) + + # check DetailedPrecisionRecallCurve + detailed_pr_expected_answers = { + # k3 + (0, "v1", "0.1", "tp"): {"all": 0, "total": 0}, + (0, "v1", "0.1", "fp"): { + "misclassifications": 1, + "total": 1, + }, + (0, "v1", "0.1", "tn"): {"all": 2, "total": 2}, + (0, "v1", "0.1", "fn"): { + "no_predictions": 0, + "misclassifications": 0, + "total": 0, + }, + # k4 + (1, "v1", "0.1", "tp"): {"all": 0, "total": 0}, + (1, "v1", "0.1", "fp"): { + "misclassifications": 1, + "total": 1, + }, + (1, "v1", "0.1", "tn"): {"all": 2, "total": 2}, + (1, "v1", "0.1", "fn"): { + "no_predictions": 0, + "misclassifications": 0, + "total": 0, + }, + (1, "v4", "0.1", "fn"): { + "no_predictions": 0, + "misclassifications": 1, + "total": 1, + }, + (1, "v8", "0.1", "tn"): {"all": 2, "total": 2}, + } + + for ( + index, + value, + threshold, + metric, + ), expected_output in detailed_pr_expected_answers.items(): + model_output = detailed_pr_metrics[index]["value"][value][ + float(threshold) + ][metric] + assert isinstance(model_output, dict) + assert model_output["total"] == expected_output["total"] + assert all( + [ + model_output["observations"][key]["count"] # type: ignore - we know this element is a dict + == expected_output[key] + for key in [ + key + for key in expected_output.keys() + if key not in ["total"] + ] + ] + ) + + # check metadata + assert eval_job and eval_job.meta + assert eval_job.meta["datums"] == 3 + assert eval_job.meta["labels"] == 13 + assert eval_job.meta["annotations"] == 6 + assert eval_job.meta["duration"] <= 10 # usually 2 + + # check confusion matrix + confusion_matrix = eval_job.confusion_matrices + + assert confusion_matrix + for row in confusion_matrix: + if row["label_key"] == "special_class": + for entry in cat_expected_cm[0]["entries"]: + assert entry in row["entries"] + for entry in row["entries"]: + assert entry in cat_expected_cm[0]["entries"] + + # finally, check invalid label_map + with pytest.raises(TypeError): + _ = evaluate_classification( + groundtruths=gt_clfs_with_label_maps, + predictions=pred_clfs_with_label_maps, + label_map=[ + [ + [ + schemas.Label(key="class", value="tabby cat"), + schemas.Label(key="class", value="mammals"), + ] + ] + ], # type: ignore - purposefully raising error, + pr_curve_max_examples=3, + metrics_to_return=[ + enums.MetricType.Precision, + enums.MetricType.Recall, + enums.MetricType.F1, + enums.MetricType.Accuracy, + enums.MetricType.ROCAUC, + enums.MetricType.PrecisionRecallCurve, + enums.MetricType.DetailedPrecisionRecallCurve, + ], + ) + + +def test_evaluate_classification_mismatched_label_keys( + gt_clfs_label_key_mismatch: list[schemas.GroundTruth], + pred_clfs_label_key_mismatch: list[schemas.Prediction], +): + """Check that we get an error when trying to evaluate over ground truths and predictions with different sets of label keys.""" + + with pytest.raises(ValueError) as e: + evaluate_classification( + groundtruths=gt_clfs_label_key_mismatch, + predictions=pred_clfs_label_key_mismatch, + ) + assert "label keys must match" in str(e) + + +def test_evaluate_classification_model_with_no_predictions( + gt_clfs: 
list[schemas.GroundTruth], +): + + expected_metrics = [ + {"type": "Accuracy", "parameters": {"label_key": "k5"}, "value": 0.0}, + {"type": "ROCAUC", "parameters": {"label_key": "k5"}, "value": 0.0}, + { + "type": "Precision", + "value": 0.0, + "label": {"key": "k5", "value": "v5"}, + }, + { + "type": "Recall", + "value": 0.0, + "label": {"key": "k5", "value": "v5"}, + }, + {"type": "F1", "value": 0.0, "label": {"key": "k5", "value": "v5"}}, + {"type": "Accuracy", "parameters": {"label_key": "k4"}, "value": 0.0}, + {"type": "ROCAUC", "parameters": {"label_key": "k4"}, "value": 0.0}, + { + "type": "Precision", + "value": 0.0, + "label": {"key": "k4", "value": "v4"}, + }, + { + "type": "Recall", + "value": 0.0, + "label": {"key": "k4", "value": "v4"}, + }, + {"type": "F1", "value": 0.0, "label": {"key": "k4", "value": "v4"}}, + {"type": "Accuracy", "parameters": {"label_key": "k3"}, "value": 0.0}, + {"type": "ROCAUC", "parameters": {"label_key": "k3"}, "value": 0.0}, + { + "type": "Precision", + "value": 0.0, + "label": {"key": "k3", "value": "v3"}, + }, + { + "type": "Recall", + "value": 0.0, + "label": {"key": "k3", "value": "v3"}, + }, + {"type": "F1", "value": 0.0, "label": {"key": "k3", "value": "v3"}}, + ] + + evaluation = evaluate_classification( + groundtruths=gt_clfs, + predictions=[ + schemas.Prediction(datum=gt_clfs[0].datum, annotations=[]) + ], + ) + + computed_metrics = evaluation.metrics + + assert all([metric["value"] == 0 for metric in computed_metrics]) + assert all([metric in computed_metrics for metric in expected_metrics]) + assert all([metric in expected_metrics for metric in computed_metrics]) + + +def test_compute_confusion_matrix_at_label_key_using_label_map( + classification_functional_test_data, +): + """ + Test grouping using the label_map + """ + + groundtruths, predictions = classification_functional_test_data + + eval_job = evaluate_classification( + groundtruths=groundtruths, + predictions=predictions, + label_map={ + schemas.Label(key="animal", value="dog"): schemas.Label( + key="class", value="mammal" + ), + schemas.Label(key="animal", value="cat"): schemas.Label( + key="class", value="mammal" + ), + schemas.Label(key="animal", value="bird"): schemas.Label( + key="class", value="avian" + ), + }, + ) + + cm = eval_job.confusion_matrices + + expected_entries = [ + { + "label_key": "class", + "entries": [ + {"prediction": "avian", "groundtruth": "avian", "count": 1}, + {"prediction": "mammal", "groundtruth": "avian", "count": 2}, + {"prediction": "mammal", "groundtruth": "mammal", "count": 3}, + ], + }, + { + "label_key": "color", + "entries": [ + {"prediction": "blue", "groundtruth": "white", "count": 1}, + {"prediction": "red", "groundtruth": "black", "count": 1}, + {"prediction": "red", "groundtruth": "red", "count": 2}, + {"prediction": "white", "groundtruth": "blue", "count": 1}, + {"prediction": "white", "groundtruth": "white", "count": 1}, + ], + }, + ] + + assert cm + assert len(cm) == len(expected_entries) + for entry in cm: + assert entry in expected_entries + for entry in expected_entries: + assert entry in cm + + +def test_rocauc_with_label_map( + classification_functional_test_prediction_df, + classification_functional_test_groundtruth_df, +): + """Test ROC auc computation using a label_map to group labels together. 
Matches the following output from sklearn: + + import numpy as np + from sklearn.metrics import roc_auc_score + + # for the "animal" label key + y_true = np.array([0, 1, 0, 0, 1, 1]) + y_score = np.array( + [ + [0.6, 0.4], + [0.0, 1], + [0.15, 0.85], + [0.15, 0.85], + [0.0, 1.0], + [0.2, 0.8], + ] + ) + + score = roc_auc_score(y_true, y_score[:, 1], multi_class="ovr") + assert score == 0.7777777777777778 + + Note that the label map is already built into the pandas dataframes used in this test. + + """ + + computed_metrics = [ + m.to_dict() + for m in _calculate_rocauc( + prediction_df=classification_functional_test_prediction_df, + groundtruth_df=classification_functional_test_groundtruth_df, + ) + ] + + expected_metrics = [ + { + "parameters": {"label_key": "animal"}, + "value": 0.8009259259259259, + "type": "ROCAUC", + }, + { + "parameters": {"label_key": "color"}, + "value": 0.43125, + "type": "ROCAUC", + }, + ] + + for entry in computed_metrics: + assert entry in expected_metrics + for entry in expected_metrics: + assert entry in computed_metrics + + +def test_compute_classification( + classification_functional_test_data, +): + """ + Tests the _compute_classification function. + """ + groundtruths, predictions = classification_functional_test_data + + eval_job = evaluate_classification( + groundtruths=groundtruths, + predictions=predictions, + metrics_to_return=[ + enums.MetricType.Precision, + enums.MetricType.Recall, + enums.MetricType.F1, + enums.MetricType.Accuracy, + enums.MetricType.ROCAUC, + enums.MetricType.PrecisionRecallCurve, + enums.MetricType.DetailedPrecisionRecallCurve, + ], + ) + + computed_metrics = [ + m + for m in eval_job.metrics + if m["type"] + not in ["PrecisionRecallCurve", "DetailedPrecisionRecallCurve"] + ] + pr_curves = [ + m for m in eval_job.metrics if m["type"] == "PrecisionRecallCurve" + ] + detailed_pr_curves = [ + m + for m in eval_job.metrics + if m["type"] == "DetailedPrecisionRecallCurve" + ] + confusion_matrices = eval_job.confusion_matrices + + expected_metrics = [ + { + "label": {"key": "animal", "value": "bird"}, + "value": 1.0, + "type": "Precision", + }, + { + "label": {"key": "animal", "value": "bird"}, + "value": 0.3333333333333333, + "type": "Recall", + }, + { + "label": {"key": "animal", "value": "bird"}, + "value": 0.5, + "type": "F1", + }, + { + "label": {"key": "animal", "value": "cat"}, + "value": 0.25, + "type": "Precision", + }, + { + "label": {"key": "animal", "value": "cat"}, + "value": 1.0, + "type": "Recall", + }, + { + "label": {"key": "animal", "value": "cat"}, + "value": 0.4, + "type": "F1", + }, + { + "label": {"key": "animal", "value": "dog"}, + "value": 0.0, + "type": "Precision", + }, + { + "label": {"key": "animal", "value": "dog"}, + "value": 0.0, + "type": "Recall", + }, + { + "label": {"key": "animal", "value": "dog"}, + "value": 0.0, + "type": "F1", + }, + { + "label": {"key": "color", "value": "blue"}, + "value": 0.0, + "type": "Precision", + }, + { + "label": {"key": "color", "value": "blue"}, + "value": 0.0, + "type": "Recall", + }, + { + "label": {"key": "color", "value": "blue"}, + "value": 0.0, + "type": "F1", + }, + { + "label": {"key": "color", "value": "red"}, + "value": 0.6666666666666666, + "type": "Precision", + }, + { + "label": {"key": "color", "value": "red"}, + "value": 1.0, + "type": "Recall", + }, + { + "label": {"key": "color", "value": "red"}, + "value": 0.8, + "type": "F1", + }, + { + "label": {"key": "color", "value": "white"}, + "value": 0.5, + "type": "Precision", + }, + { + "label": {"key": 
"color", "value": "white"}, + "value": 0.5, + "type": "Recall", + }, + { + "label": {"key": "color", "value": "white"}, + "value": 0.5, + "type": "F1", + }, + { + "label": {"key": "color", "value": "black"}, + "value": 0.0, + "type": "Precision", + }, + { + "label": {"key": "color", "value": "black"}, + "value": 0.0, + "type": "Recall", + }, + { + "label": {"key": "color", "value": "black"}, + "value": 0.0, + "type": "F1", + }, + { + "parameters": {"label_key": "animal"}, + "value": 0.3333333333333333, + "type": "Accuracy", + }, + { + "parameters": {"label_key": "color"}, + "value": 0.5, + "type": "Accuracy", + }, + { + "parameters": {"label_key": "animal"}, + "value": 0.8009259259259259, + "type": "ROCAUC", + }, + { + "parameters": {"label_key": "color"}, + "value": 0.43125, + "type": "ROCAUC", + }, + ] + expected_pr_curves = { + # bird + ("bird", 0.05, "tp"): 3, + ("bird", 0.05, "fp"): 1, + ("bird", 0.05, "tn"): 2, + ("bird", 0.05, "fn"): 0, + ("bird", 0.3, "tp"): 1, + ("bird", 0.3, "fn"): 2, + ("bird", 0.3, "fp"): 0, + ("bird", 0.3, "tn"): 3, + ("bird", 0.65, "fn"): 3, + ("bird", 0.65, "tn"): 3, + ("bird", 0.65, "tp"): 0, + ("bird", 0.65, "fp"): 0, + # dog + ("dog", 0.05, "tp"): 2, + ("dog", 0.05, "fp"): 3, + ("dog", 0.05, "tn"): 1, + ("dog", 0.05, "fn"): 0, + ("dog", 0.45, "fn"): 2, + ("dog", 0.45, "fp"): 1, + ("dog", 0.45, "tn"): 3, + ("dog", 0.45, "tp"): 0, + ("dog", 0.8, "fn"): 2, + ("dog", 0.8, "fp"): 0, + ("dog", 0.8, "tn"): 4, + ("dog", 0.8, "tp"): 0, + # cat + ("cat", 0.05, "tp"): 1, + ("cat", 0.05, "tn"): 0, + ("cat", 0.05, "fp"): 5, + ("cat", 0.05, "fn"): 0, + ("cat", 0.95, "tp"): 1, + ("cat", 0.95, "fp"): 0, + ("cat", 0.95, "tn"): 5, + ("cat", 0.95, "fn"): 0, + } + expected_detailed_pr_curves = { + # bird + ("bird", 0.05, "tp"): {"all": 3, "total": 3}, + ("bird", 0.05, "fp"): { + "misclassifications": 1, + "total": 1, + }, + ("bird", 0.05, "tn"): {"all": 2, "total": 2}, + ("bird", 0.05, "fn"): { + "no_predictions": 0, + "misclassifications": 0, + "total": 0, + }, + # dog + ("dog", 0.05, "tp"): {"all": 2, "total": 2}, + ("dog", 0.05, "fp"): { + "misclassifications": 3, + "total": 3, + }, + ("dog", 0.05, "tn"): {"all": 1, "total": 1}, + ("dog", 0.8, "fn"): { + "no_predictions": 1, + "misclassifications": 1, + "total": 2, + }, + # cat + ("cat", 0.05, "tp"): {"all": 1, "total": 1}, + ("cat", 0.05, "fp"): { + "misclassifications": 5, + "total": 5, + }, + ("cat", 0.05, "tn"): {"all": 0, "total": 0}, + ("cat", 0.8, "fn"): { + "no_predictions": 0, + "misclassifications": 0, + "total": 0, + }, + } + expected_cm = [ + { + "label_key": "animal", + "entries": [ + {"prediction": "bird", "groundtruth": "bird", "count": 1}, + {"prediction": "cat", "groundtruth": "bird", "count": 1}, + {"prediction": "cat", "groundtruth": "cat", "count": 1}, + {"prediction": "cat", "groundtruth": "dog", "count": 2}, + {"prediction": "dog", "groundtruth": "bird", "count": 1}, + ], + }, + { + "label_key": "color", + "entries": [ + {"prediction": "blue", "groundtruth": "white", "count": 1}, + {"prediction": "red", "groundtruth": "black", "count": 1}, + {"prediction": "red", "groundtruth": "red", "count": 2}, + {"prediction": "white", "groundtruth": "blue", "count": 1}, + {"prediction": "white", "groundtruth": "white", "count": 1}, + ], + }, + ] + + # assert base metrics + for actual, expected in [ + (computed_metrics, expected_metrics), + (confusion_matrices, expected_cm), + ]: + for entry in actual: + assert entry in expected + for entry in expected: + assert entry in actual + + # assert pr curves + for ( + 
value, + threshold, + metric, + ), expected_length in expected_pr_curves.items(): + classification = pr_curves[0]["value"][value][threshold][metric] + assert classification == expected_length + + # assert DetailedPRCurves + for ( + value, + threshold, + metric, + ), expected_output in expected_detailed_pr_curves.items(): + model_output = detailed_pr_curves[0]["value"][value][threshold][metric] + assert isinstance(model_output, dict) + assert model_output["total"] == expected_output["total"] + assert all( + [ + model_output["observations"][key]["count"] # type: ignore - we know this element is a dict + == expected_output[key] + for key in [ + key + for key in expected_output.keys() + if key not in ["total"] + ] + ] + ) + # test that DetailedPRCurve gives more examples when we adjust pr_curve_max_examples + eval_job = evaluate_classification( + groundtruths=groundtruths, + predictions=predictions, + pr_curve_max_examples=3, + metrics_to_return=[ + enums.MetricType.DetailedPrecisionRecallCurve, + ], + ) + + assert ( + len( + eval_job.metrics[0]["value"]["bird"][0.05]["tp"]["observations"][ + "all" + ]["examples"] + ) + == 3 + ) + assert ( + len( + eval_job.metrics[0]["value"]["bird"][0.05]["tn"]["observations"][ + "all" + ]["examples"] + ) + == 2 + ) # only two examples exist + + # test behavior if pr_curve_max_examples == 0 + eval_job = evaluate_classification( + groundtruths=groundtruths, + predictions=predictions, + pr_curve_max_examples=0, + metrics_to_return=[ + enums.MetricType.DetailedPrecisionRecallCurve, + ], + ) + + assert ( + len( + eval_job.metrics[0]["value"]["bird"][0.05]["tp"]["observations"][ + "all" + ]["examples"] + ) + == 0 + ) + assert ( + len( + eval_job.metrics[0]["value"]["bird"][0.05]["tn"]["observations"][ + "all" + ]["examples"] + ) + == 0 + ) diff --git a/core/tests/functional-tests/test_detection.py b/core/tests/functional-tests/test_detection.py new file mode 100644 index 000000000..7f89fd4cb --- /dev/null +++ b/core/tests/functional-tests/test_detection.py @@ -0,0 +1,4429 @@ +import random + +import numpy as np +import pandas as pd +import pytest +from valor_core import enums, geometry, schemas +from valor_core.detection import _calculate_101_pt_interp, evaluate_detection + + +def test__calculate_101_pt_interp(): + # make sure we get back 0 if we don't pass any precisions + assert _calculate_101_pt_interp([], []) == 0 + + # get back -1 if all recalls and precisions are -1 + assert _calculate_101_pt_interp([-1, -1], [-1, -1]) == -1 + + +def test_evaluate_detection( + evaluate_detection_groundtruths, evaluate_detection_predictions +): + """ + Test detection evaluations with area thresholds. 
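+
+    The expected AP of 0.504950495049505 for (k1, v1) is exactly 51/101, which
+    is consistent with the 101-point interpolation exercised above: only one of
+    the two (k1, v1) groundtruths is matched by a prediction, so interpolated
+    precision is 1.0 for the 51 recall thresholds up to 0.5 and 0.0 beyond
+    that.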
+ + gt_dets1 + datum 1 + - Label (k1, v1) with Annotation area = 1500 + - Label (k2, v2) with Annotation area = 57,510 + datum2 + - Label (k1, v1) with Annotation area = 1100 + + pred_dets + datum 1 + - Label (k1, v1) with Annotation area = 1500 + - Label (k2, v2) with Annotation area = 57,510 + datum2 + - Label (k1, v1) with Annotation area = 1100 + """ + + eval_job = evaluate_detection( + groundtruths=evaluate_detection_groundtruths, + predictions=evaluate_detection_predictions, + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + ], + ) + + metrics = eval_job.metrics + + expected_metrics = [ + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"iou": 0.1}, + "value": 0.0, + "type": "AP", + }, + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"iou": 0.6}, + "value": 0.0, + "type": "AP", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"iou": 0.1}, + "value": 0.504950495049505, + "type": "AP", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"iou": 0.6}, + "value": 0.504950495049505, + "type": "AP", + }, + { + "parameters": {"label_key": "k1", "iou": 0.1}, + "value": 0.504950495049505, + "type": "mAP", + }, + { + "parameters": {"label_key": "k2", "iou": 0.1}, + "value": 0.0, + "type": "mAP", + }, + { + "parameters": {"label_key": "k1", "iou": 0.6}, + "value": 0.504950495049505, + "type": "mAP", + }, + { + "parameters": {"label_key": "k2", "iou": 0.6}, + "value": 0.0, + "type": "mAP", + }, + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "type": "APAveragedOverIOUs", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.504950495049505, + "type": "APAveragedOverIOUs", + }, + { + "parameters": {"label_key": "k1", "ious": [0.1, 0.6]}, + "value": 0.504950495049505, + "type": "mAPAveragedOverIOUs", + }, + { + "parameters": {"label_key": "k2", "ious": [0.1, 0.6]}, + "value": 0.0, + "type": "mAPAveragedOverIOUs", + }, + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "type": "AR", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.5, + "type": "AR", + }, + { + "parameters": {"label_key": "k1", "ious": [0.1, 0.6]}, + "value": 0.5, + "type": "mAR", + }, + { + "parameters": {"label_key": "k2", "ious": [0.1, 0.6]}, + "value": 0.0, + "type": "mAR", + }, + ] + + for m in metrics: + if m["type"] not in [ + "PrecisionRecallCurve", + "DetailedPrecisionRecallCurve", + ]: + assert m in expected_metrics + for m in expected_metrics: + assert m in metrics + + assert eval_job.ignored_pred_labels == [] + assert eval_job.missing_pred_labels == [] + + result = eval_job + result_dict = result.to_dict() + + # duration isn't deterministic, so test meta separately + assert result_dict["meta"]["datums"] == 2 + assert result_dict["meta"]["labels"] == 2 + assert result_dict["meta"]["annotations"] == 5 + assert result_dict["meta"]["duration"] <= 5 + result_dict.pop("meta") + result_dict.pop("metrics") + + assert result_dict == { + "parameters": { + "label_map": {}, + "metrics_to_return": [ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + 
enums.MetricType.mAPAveragedOverIOUs, + ], + "iou_thresholds_to_compute": [0.1, 0.6], + "iou_thresholds_to_return": [0.1, 0.6], + "recall_score_threshold": 0.0, + "pr_curve_iou_threshold": 0.5, + "pr_curve_max_examples": 1, + "convert_annotations_to_type": None, + }, + "confusion_matrices": [], + "ignored_pred_labels": [], + "missing_pred_labels": [], + } + + # check that metrics arg works correctly + selected_metrics = random.sample( + [ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + enums.MetricType.PrecisionRecallCurve, + ], + 2, + ) + eval_job = evaluate_detection( + groundtruths=evaluate_detection_groundtruths, + predictions=evaluate_detection_predictions, + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=selected_metrics, + ) + + metrics = eval_job.metrics + assert set([metric["type"] for metric in eval_job.metrics]) == set( + selected_metrics + ) + + +def test_evaluate_detection_via_pandas_df(): + """ + Test detection evaluations with area thresholds. + + gt_dets1 + datum 1 + - Label (k1, v1) with Annotation area = 1500 + - Label (k2, v2) with Annotation area = 57,510 + datum2 + - Label (k1, v1) with Annotation area = 1100 + + pred_dets + datum 1 + - Label (k1, v1) with Annotation area = 1500 + - Label (k2, v2) with Annotation area = 57,510 + datum2 + - Label (k1, v1) with Annotation area = 1100 + """ + groundtruth_df = pd.DataFrame( + [ + { + "datum_id": 1, + "datum_uid": "uid1", + "id": 1, + "annotation_id": 1, + "label_id": 1, + "label_key": "k1", + "label_value": "v1", + "is_instance": True, + "polygon": schemas.Polygon.from_dict( + { + "type": "Polygon", + "coordinates": [ + [[10, 10], [60, 10], [60, 40], [10, 40], [10, 10]] + ], + } + ), + "raster": None, + "bounding_box": None, + }, + { + "datum_id": 1, + "datum_uid": "uid1", + "id": 2, + "annotation_id": 2, + "label_id": 2, + "label_key": "k2", + "label_value": "v2", + "is_instance": True, + "polygon": schemas.Polygon.from_dict( + { + "type": "Polygon", + "coordinates": [ + [ + [87, 10], + [158, 10], + [158, 820], + [87, 820], + [87, 10], + ] + ], + } + ), + "raster": None, + "bounding_box": None, + }, + { + "datum_id": 2, + "datum_uid": "uid2", + "id": 3, + "annotation_id": 3, + "label_id": 1, + "label_key": "k1", + "label_value": "v1", + "is_instance": True, + "polygon": schemas.Polygon.from_dict( + { + "type": "Polygon", + "coordinates": [ + [[15, 0], [70, 0], [70, 20], [15, 20], [15, 0]] + ], + } + ), + "raster": None, + "bounding_box": None, + }, + ] + ) + prediction_df = pd.DataFrame( + [ + { + "id": 1, + "annotation_id": 4, + "score": 0.3, + "datum_id": 1, + "datum_uid": "uid1", + "label_id": 1, + "label_key": "k1", + "label_value": "v1", + "is_instance": True, + "polygon": schemas.Polygon.from_dict( + { + "type": "Polygon", + "coordinates": [ + [[10, 10], [60, 10], [60, 40], [10, 40], [10, 10]] + ], + } + ), + "raster": None, + "bounding_box": None, + }, + { + "id": 2, + "annotation_id": 5, + "score": 0.98, + "datum_id": 2, + "datum_uid": "uid2", + "label_id": 2, + "label_key": "k2", + "label_value": "v2", + "is_instance": True, + "polygon": schemas.Polygon.from_dict( + { + "type": "Polygon", + "coordinates": [ + [[15, 0], [70, 0], [70, 20], [15, 20], [15, 0]] + ], + } + ), + "raster": None, + "bounding_box": None, + }, + ] + ) + + eval_job = evaluate_detection( + groundtruths=groundtruth_df, + predictions=prediction_df, + 
iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + ], + ) + + metrics = eval_job.metrics + + expected_metrics = [ + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"iou": 0.1}, + "value": 0.0, + "type": "AP", + }, + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"iou": 0.6}, + "value": 0.0, + "type": "AP", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"iou": 0.1}, + "value": 0.504950495049505, + "type": "AP", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"iou": 0.6}, + "value": 0.504950495049505, + "type": "AP", + }, + { + "parameters": {"label_key": "k1", "iou": 0.1}, + "value": 0.504950495049505, + "type": "mAP", + }, + { + "parameters": {"label_key": "k2", "iou": 0.1}, + "value": 0.0, + "type": "mAP", + }, + { + "parameters": {"label_key": "k1", "iou": 0.6}, + "value": 0.504950495049505, + "type": "mAP", + }, + { + "parameters": {"label_key": "k2", "iou": 0.6}, + "value": 0.0, + "type": "mAP", + }, + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "type": "APAveragedOverIOUs", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.504950495049505, + "type": "APAveragedOverIOUs", + }, + { + "parameters": {"label_key": "k1", "ious": [0.1, 0.6]}, + "value": 0.504950495049505, + "type": "mAPAveragedOverIOUs", + }, + { + "parameters": {"label_key": "k2", "ious": [0.1, 0.6]}, + "value": 0.0, + "type": "mAPAveragedOverIOUs", + }, + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "type": "AR", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.5, + "type": "AR", + }, + { + "parameters": {"label_key": "k1", "ious": [0.1, 0.6]}, + "value": 0.5, + "type": "mAR", + }, + { + "parameters": {"label_key": "k2", "ious": [0.1, 0.6]}, + "value": 0.0, + "type": "mAR", + }, + ] + + for m in metrics: + if m["type"] not in [ + "PrecisionRecallCurve", + "DetailedPrecisionRecallCurve", + ]: + assert m in expected_metrics + for m in expected_metrics: + assert m in metrics + + assert eval_job.ignored_pred_labels == [] + assert eval_job.missing_pred_labels == [] + + result = eval_job + result_dict = result.to_dict() + + # duration isn't deterministic, so test meta separately + assert result_dict["meta"]["datums"] == 2 + assert result_dict["meta"]["labels"] == 2 + assert result_dict["meta"]["annotations"] == 5 + assert result_dict["meta"]["duration"] <= 5 + result_dict.pop("meta") + result_dict.pop("metrics") + + assert result_dict == { + "parameters": { + "iou_thresholds_to_compute": [0.1, 0.6], + "iou_thresholds_to_return": [0.1, 0.6], + "label_map": {}, + "recall_score_threshold": 0.0, + "metrics_to_return": [ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + ], + "pr_curve_iou_threshold": 0.5, + "pr_curve_max_examples": 1, + "convert_annotations_to_type": None, + }, + "confusion_matrices": [], + "missing_pred_labels": [], + "ignored_pred_labels": [], + } + + # check that metrics arg works correctly + selected_metrics = random.sample( + [ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + 
enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + enums.MetricType.PrecisionRecallCurve, + ], + 2, + ) + eval_job = evaluate_detection( + groundtruths=groundtruth_df, + predictions=prediction_df, + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=selected_metrics, + ) + + metrics = eval_job.metrics + assert set([metric["type"] for metric in eval_job.metrics]) == set( + selected_metrics + ) + + +def test_evaluate_detection_with_label_maps( + evaluate_detection_groundtruths_with_label_maps, + evaluate_detection_predictions_with_label_maps, +): + # for the first evaluation, don't do anything about the mismatched labels + # we expect the evaluation to return the same expected metrics as for our standard detection tests + + baseline_expected_metrics = [ + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.504950495049505, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.504950495049505, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.0, + "label": {"key": "class_name", "value": "maine coon cat"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.0, + "label": {"key": "class_name", "value": "maine coon cat"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.0, + "label": {"key": "class", "value": "british shorthair"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.0, + "label": {"key": "class", "value": "british shorthair"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.0, + "label": {"key": "class", "value": "siamese cat"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.0, + "label": {"key": "class", "value": "siamese cat"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.5, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": -1.0, + "label": {"key": "class_name", "value": "cat"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "class_name", "value": "maine coon cat"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "class", "value": "british shorthair"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": -1.0, + "label": {"key": "class", "value": "cat"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "class", "value": "siamese cat"}, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "k1"}, + "value": 0.5, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "class_name"}, + "value": 0.0, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "class"}, + "value": 0.0, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "class"}, + "value": 0.0, + }, + { + "type": "mAP", + "parameters": 
{"iou": 0.6, "label_key": "class"}, + "value": 0.0, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "class_name"}, + "value": 0.0, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "class_name"}, + "value": 0.0, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.504950495049505, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "class_name", "value": "maine coon cat"}, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "class", "value": "british shorthair"}, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "class", "value": "siamese cat"}, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "class"}, + "value": 0.0, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "class_name"}, + "value": 0.0, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "k2"}, + "value": 0.0, + }, + ] + + eval_job = evaluate_detection( + groundtruths=evaluate_detection_groundtruths_with_label_maps, + predictions=evaluate_detection_predictions_with_label_maps, + pr_curve_max_examples=1, + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + enums.MetricType.PrecisionRecallCurve, + enums.MetricType.DetailedPrecisionRecallCurve, + ], + ) + + assert ( + eval_job.ignored_pred_labels is not None + and eval_job.missing_pred_labels is not None + ) + assert ( + len(eval_job.ignored_pred_labels) == 2 + ) # we're ignoring the two "cat" model predictions + assert ( + len(eval_job.missing_pred_labels) == 3 + ) # we're missing three gts_det_syn representing different breeds of cats + + metrics = eval_job.metrics + + pr_metrics = [] + pr_metrics = [] + detailed_pr_metrics = [] + for m in metrics: + if m["type"] == "PrecisionRecallCurve": + pr_metrics.append(m) + elif m["type"] == "DetailedPrecisionRecallCurve": + detailed_pr_metrics.append(m) + else: + assert m in baseline_expected_metrics + + pr_metrics.sort(key=lambda x: x["parameters"]["label_key"]) + detailed_pr_metrics.sort(key=lambda x: x["parameters"]["label_key"]) + + pr_expected_answers = { + # class + ( + 0, + "class", + "cat", + "0.1", + "fp", + ): 1, + (0, "class", "cat", "0.4", "fp"): 0, + (0, "class", "siamese cat", "0.1", "fn"): 1, + (0, "class", "british shorthair", "0.1", "fn"): 1, + # class_name + (1, "class_name", "cat", "0.1", "fp"): 1, + (1, "class_name", "maine coon 
cat", "0.1", "fn"): 1, + # k1 + (2, "k1", "v1", "0.1", "fn"): 1, + (2, "k1", "v1", "0.1", "tp"): 1, + (2, "k1", "v1", "0.4", "fn"): 2, + # k2 + (3, "k2", "v2", "0.1", "fn"): 1, + (3, "k2", "v2", "0.1", "fp"): 1, + } + + for ( + index, + key, + value, + threshold, + metric, + ), expected_value in pr_expected_answers.items(): + assert ( + pr_metrics[index]["value"][value][float(threshold)][metric] + == expected_value + ) + + # check DetailedPrecisionRecallCurve + detailed_pr_expected_answers = { + # class + (0, "cat", "0.1", "fp"): { + "hallucinations": 1, + "misclassifications": 0, + "total": 1, + }, + (0, "cat", "0.4", "fp"): { + "hallucinations": 0, + "misclassifications": 0, + "total": 0, + }, + (0, "british shorthair", "0.1", "fn"): { + "no_predictions": 1, + "misclassifications": 0, + "total": 1, + }, + # class_name + (1, "cat", "0.4", "fp"): { + "hallucinations": 1, + "misclassifications": 0, + "total": 1, + }, + (1, "maine coon cat", "0.1", "fn"): { + "no_predictions": 1, + "misclassifications": 0, + "total": 1, + }, + # k1 + (2, "v1", "0.1", "fn"): { + "no_predictions": 1, + "misclassifications": 0, + "total": 1, + }, + (2, "v1", "0.4", "fn"): { + "no_predictions": 2, + "misclassifications": 0, + "total": 2, + }, + (2, "v1", "0.1", "tp"): {"all": 1, "total": 1}, + # k2 + (3, "v2", "0.1", "fn"): { + "no_predictions": 1, + "misclassifications": 0, + "total": 1, + }, + (3, "v2", "0.1", "fp"): { + "hallucinations": 1, + "misclassifications": 0, + "total": 1, + }, + } + + for ( + index, + value, + threshold, + metric, + ), expected_output in detailed_pr_expected_answers.items(): + model_output = detailed_pr_metrics[index]["value"][value][ + float(threshold) + ][metric] + assert isinstance(model_output, dict) + assert model_output["total"] == expected_output["total"] + assert all( + [ + model_output["observations"][key]["count"] # type: ignore - we know this element is a dict + == expected_output[key] + for key in [ + key + for key in expected_output.keys() + if key not in ["total"] + ] + ] + ) + + # check that we get at most 1 example + assert ( + len( + detailed_pr_metrics[0]["value"]["cat"][0.4]["fp"]["observations"]["hallucinations"][ # type: ignore - we know this element is a dict + "examples" + ] + ) + == 0 + ) + assert ( + len( + detailed_pr_metrics[2]["value"]["v1"][0.4]["fn"]["observations"]["no_predictions"][ # type: ignore - we know this element is a dict + "examples" + ] + ) + == 1 + ) + + # now, we correct most of the mismatched labels with a label map + cat_expected_metrics = [ + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.33663366336633666, + "label": {"key": "class", "value": "cat"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.504950495049505, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.33663366336633666, + "label": {"key": "class", "value": "cat"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.504950495049505, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.5, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.3333333333333333, + "label": {"key": "class", "value": "cat"}, + }, + { + "type": 
"AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": -1.0, + "label": {"key": "class_name", "value": "cat"}, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "class"}, + "value": 0.33663366336633666, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "class"}, + "value": 0.33663366336633666, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "class"}, + "value": 0.33663366336633666, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "class"}, + "value": 0.33663366336633666, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "class"}, + "value": 0.3333333333333333, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "k1"}, + "value": 0.5, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "class_name"}, + "value": -1.0, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.33663366336633666, + "label": {"key": "class", "value": "cat"}, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.504950495049505, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "class"}, + "value": 0.33663366336633666, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "k2"}, + "value": 0.0, + }, + ] + + eval_job = evaluate_detection( + groundtruths=evaluate_detection_groundtruths_with_label_maps, + predictions=evaluate_detection_predictions_with_label_maps, + label_map={ + schemas.Label( + key="class_name", value="maine coon cat" + ): schemas.Label(key="class", value="cat"), + schemas.Label(key="class", value="siamese cat"): schemas.Label( + key="class", value="cat" + ), + schemas.Label( + key="class", value="british shorthair" + ): schemas.Label(key="class", value="cat"), + }, + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + enums.MetricType.PrecisionRecallCurve, + 
enums.MetricType.DetailedPrecisionRecallCurve, + ], + ) + + assert eval_job.ignored_pred_labels is not None + assert eval_job.missing_pred_labels is not None + + assert ( + len(eval_job.ignored_pred_labels) == 1 + ) # Label(key='class_name', value='cat', score=None) is still never used + assert len(eval_job.missing_pred_labels) == 0 + + metrics = eval_job.metrics + for m in metrics: + if m["type"] not in [ + "PrecisionRecallCurve", + "DetailedPrecisionRecallCurve", + ]: + assert m in cat_expected_metrics + for m in cat_expected_metrics: + assert m in metrics + + assert eval_job.parameters.label_map == { + schemas.Label( + key="class_name", value="maine coon cat", score=None + ): schemas.Label(key="class", value="cat", score=None), + schemas.Label( + key="class", value="siamese cat", score=None + ): schemas.Label(key="class", value="cat", score=None), + schemas.Label( + key="class", value="british shorthair", score=None + ): schemas.Label(key="class", value="cat", score=None), + } + + # next, we check that the label mapping works when the label is completely foreign + # to both groundtruths and predictions + foo_expected_metrics = [ + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.6633663366336634, + "label": {"key": "foo", "value": "bar"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.504950495049505, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.6666666666666666, + "label": {"key": "foo", "value": "bar"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.5, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "foo"}, + "value": 0.6633663366336634, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.6633663366336634, + "label": {"key": "foo", "value": "bar"}, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.504950495049505, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.6633663366336634, + "label": {"key": "foo", "value": "bar"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.504950495049505, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "foo"}, + "value": 0.6633663366336634, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "foo"}, + "value": 0.6666666666666666, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 
0.6], "label_key": "k1"}, + "value": 0.5, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "foo"}, + "value": 0.6633663366336634, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "k1"}, + "value": 0.504950495049505, + }, + ] + + label_mapping = { + # map the ground truths + schemas.Label(key="class_name", value="maine coon cat"): schemas.Label( + key="foo", value="bar" + ), + schemas.Label(key="class", value="siamese cat"): schemas.Label( + key="foo", value="bar" + ), + schemas.Label(key="class", value="british shorthair"): schemas.Label( + key="foo", value="bar" + ), + # map the predictions + schemas.Label(key="class", value="cat"): schemas.Label( + key="foo", value="bar" + ), + schemas.Label(key="class_name", value="cat"): schemas.Label( + key="foo", value="bar" + ), + } + + eval_job = evaluate_detection( + groundtruths=evaluate_detection_groundtruths_with_label_maps, + predictions=evaluate_detection_predictions_with_label_maps, + label_map=label_mapping, + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + enums.MetricType.PrecisionRecallCurve, + enums.MetricType.DetailedPrecisionRecallCurve, + ], + ) + + assert ( + eval_job.ignored_pred_labels is not None + and eval_job.missing_pred_labels is not None + ) + assert len(eval_job.ignored_pred_labels) == 0 + assert len(eval_job.missing_pred_labels) == 0 + + metrics = eval_job.metrics + for m in metrics: + if m["type"] not in [ + "PrecisionRecallCurve", + "DetailedPrecisionRecallCurve", + ]: + assert m in foo_expected_metrics + for m in foo_expected_metrics: + assert m in metrics + + assert eval_job.parameters.label_map == { + schemas.Label( + key="class_name", value="maine coon cat", score=None + ): schemas.Label(key="foo", value="bar", score=None), + schemas.Label( + key="class", value="siamese cat", score=None + ): schemas.Label(key="foo", value="bar", score=None), + schemas.Label( + key="class", value="british shorthair", score=None + ): schemas.Label(key="foo", value="bar", score=None), + schemas.Label(key="class", value="cat", score=None): schemas.Label( + key="foo", value="bar", score=None + ), + schemas.Label( + key="class_name", value="cat", score=None + ): schemas.Label(key="foo", value="bar", score=None), + } + + # finally, let's test using a higher recall_score_threshold + # this new threshold will disqualify all of our predictions for img1 + + foo_expected_metrics_with_higher_score_threshold = [ + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.6633663366336634, + "label": {"key": "foo", "value": "bar"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.504950495049505, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.3333333333333333, # two missed groundtruth on the first image, and 1 hit for the second image + "label": {"key": "foo", "value": "bar"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + 
"type": "mAP", + "parameters": {"iou": 0.6, "label_key": "foo"}, + "value": 0.6633663366336634, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.6633663366336634, + "label": {"key": "foo", "value": "bar"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.504950495049505, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "foo"}, + "value": 0.6633663366336634, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "foo"}, + "value": 0.3333333333333333, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "k1"}, + "value": 0.0, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.6633663366336634, + "label": {"key": "foo", "value": "bar"}, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.504950495049505, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "foo"}, + "value": 0.6633663366336634, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "k2"}, + "value": 0.0, + }, + ] + + eval_job = evaluate_detection( + groundtruths=evaluate_detection_groundtruths_with_label_maps, + predictions=evaluate_detection_predictions_with_label_maps, + label_map=label_mapping, + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + recall_score_threshold=0.8, + metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + enums.MetricType.PrecisionRecallCurve, + ], + ) + + assert ( + eval_job.ignored_pred_labels is not None + and eval_job.missing_pred_labels is not None + ) + assert len(eval_job.ignored_pred_labels) == 0 + assert len(eval_job.missing_pred_labels) == 0 + + assert eval_job.to_dict()["parameters"] == { + "label_map": { + schemas.Label( + key="class_name", value="maine coon cat", score=None + ): schemas.Label(key="foo", value="bar", score=None), + schemas.Label( + key="class", value="siamese cat", score=None + ): schemas.Label(key="foo", value="bar", score=None), + schemas.Label( + key="class", value="british shorthair", score=None + ): schemas.Label(key="foo", value="bar", score=None), + schemas.Label(key="class", value="cat", score=None): schemas.Label( + key="foo", value="bar", score=None + ), + schemas.Label( + key="class_name", 
value="cat", score=None + ): schemas.Label(key="foo", value="bar", score=None), + }, + "metrics_to_return": [ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + enums.MetricType.PrecisionRecallCurve, + ], + "iou_thresholds_to_compute": [0.1, 0.6], + "iou_thresholds_to_return": [0.1, 0.6], + "recall_score_threshold": 0.8, + "pr_curve_iou_threshold": 0.5, + "pr_curve_max_examples": 1, + "convert_annotations_to_type": None, + } + + metrics = eval_job.metrics + + pr_metrics = [] + for m in metrics: + if m["type"] == "PrecisionRecallCurve": + pr_metrics.append(m) + elif m["type"] == "DetailedPrecisionRecallCurve": + continue + else: + assert m in foo_expected_metrics_with_higher_score_threshold + + for m in foo_expected_metrics_with_higher_score_threshold: + assert m in metrics + + pr_metrics.sort(key=lambda x: x["parameters"]["label_key"]) + + pr_expected_answers = { + # foo + (0, "foo", "bar", "0.1", "fn"): 1, # missed rect3 + (0, "foo", "bar", "0.1", "tp"): 2, + (0, "foo", "bar", "0.4", "fn"): 2, + (0, "foo", "bar", "0.4", "tp"): 1, + # k1 + (1, "k1", "v1", "0.1", "fn"): 1, + (1, "k1", "v1", "0.1", "tp"): 1, + (1, "k1", "v1", "0.4", "fn"): 2, + # k2 + (2, "k2", "v2", "0.1", "fn"): 1, + (2, "k2", "v2", "0.1", "fp"): 1, + } + + for ( + index, + _, + value, + threshold, + metric, + ), expected_value in pr_expected_answers.items(): + assert ( + pr_metrics[index]["value"][value][float(threshold)][metric] + == expected_value + ) + + assert eval_job.parameters.label_map == { + schemas.Label( + key="class_name", value="maine coon cat", score=None + ): schemas.Label(key="foo", value="bar", score=None), + schemas.Label( + key="class", value="siamese cat", score=None + ): schemas.Label(key="foo", value="bar", score=None), + schemas.Label( + key="class", value="british shorthair", score=None + ): schemas.Label(key="foo", value="bar", score=None), + schemas.Label(key="class", value="cat", score=None): schemas.Label( + key="foo", value="bar", score=None + ), + schemas.Label( + key="class_name", value="cat", score=None + ): schemas.Label(key="foo", value="bar", score=None), + } + + +def test_evaluate_detection_false_negatives_single_image_baseline(): + """This is the baseline for the below test. 
In this case there are two predictions and + one groundtruth, but the highest confident prediction overlaps sufficiently with the groundtruth + so there is not a penalty for the false negative so the AP is 1 + """ + groundtruths = [ + schemas.GroundTruth( + datum=schemas.Datum(uid="uid1"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[schemas.Label(key="key", value="value")], + is_instance=True, + ) + ], + ) + ] + + predictions = [ + schemas.Prediction( + datum=schemas.Datum(uid="uid1"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[ + schemas.Label(key="key", value="value", score=0.8) + ], + is_instance=True, + ), + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=100, xmax=110, ymin=100, ymax=200 + ), + labels=[ + schemas.Label(key="key", value="value", score=0.7) + ], + is_instance=True, + ), + ], + ), + ] + + eval_job = evaluate_detection( + groundtruths=groundtruths, + predictions=predictions, + iou_thresholds_to_compute=[0.5], + iou_thresholds_to_return=[0.5], + ) + + ap_metric = [m for m in eval_job.metrics if m["type"] == "AP"][0] + assert ap_metric == { + "type": "AP", + "parameters": {"iou": 0.5}, + "value": 1, + "label": {"key": "key", "value": "value"}, + } + + +def test_evaluate_detection_false_negatives_single_image(): + """Tests fix for a bug where high confidence false negative was not being penalized. The + difference between this test and the above is that here the prediction with higher confidence + does not sufficiently overlap the groundtruth and so is penalized and we get an AP of 0.5 + """ + groundtruths = [ + schemas.GroundTruth( + datum=schemas.Datum(uid="uid1"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[schemas.Label(key="key", value="value")], + is_instance=True, + ) + ], + ) + ] + predictions = [ + schemas.Prediction( + datum=schemas.Datum(uid="uid1"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[ + schemas.Label(key="key", value="value", score=0.8) + ], + is_instance=True, + ), + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=100, xmax=110, ymin=100, ymax=200 + ), + labels=[ + schemas.Label(key="key", value="value", score=0.9) + ], + is_instance=True, + ), + ], + ), + ] + + eval_job = evaluate_detection( + groundtruths=groundtruths, + predictions=predictions, + iou_thresholds_to_compute=[0.5], + iou_thresholds_to_return=[0.5], + ) + + ap_metric = [m for m in eval_job.metrics if m["type"] == "AP"][0] + assert ap_metric == { + "type": "AP", + "parameters": {"iou": 0.5}, + "value": 0.5, + "label": {"key": "key", "value": "value"}, + } + + +def test_evaluate_detection_false_negatives_two_images_one_empty_low_confidence_of_fp(): + """In this test we have + 1. An image with a matching groundtruth and prediction (same class and high IOU) + 2. A second image with empty groundtruth annotation but a prediction with lower confidence + then the prediction on the first image. 
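+    Ranked by confidence, the matching prediction on the first image (score 0.8)
+    comes before the unmatched prediction on the empty image (score 0.7), so
+    precision stays at 1.0 up to full recall.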
+ + In this case, the AP should be 1.0 since the false positive has lower confidence than the true positive + + """ + + groundtruths = [ + schemas.GroundTruth( + datum=schemas.Datum(uid="uid1"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[schemas.Label(key="key", value="value")], + is_instance=True, + ) + ], + ), + schemas.GroundTruth( + datum=schemas.Datum(uid="uid2"), + annotations=[schemas.Annotation(labels=[])], + ), + ] + + predictions = [ + schemas.Prediction( + datum=schemas.Datum(uid="uid1"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[ + schemas.Label(key="key", value="value", score=0.8) + ], + is_instance=True, + ), + ], + ), + schemas.Prediction( + datum=schemas.Datum(uid="uid2"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[ + schemas.Label(key="key", value="value", score=0.7) + ], + is_instance=True, + ), + ], + ), + ] + + eval_job = evaluate_detection( + groundtruths=groundtruths, + predictions=predictions, + iou_thresholds_to_compute=[0.5], + iou_thresholds_to_return=[0.5], + ) + + ap_metric = [m for m in eval_job.metrics if m["type"] == "AP"][0] + assert ap_metric == { + "type": "AP", + "parameters": {"iou": 0.5}, + "value": 1.0, + "label": {"key": "key", "value": "value"}, + } + + +def test_evaluate_detection_false_negatives_two_images_one_empty_high_confidence_of_fp(): + """In this test we have + 1. An image with a matching groundtruth and prediction (same class and high IOU) + 2. A second image with empty groundtruth annotation and a prediction with higher confidence + then the prediction on the first image. + + In this case, the AP should be 0.5 since the false positive has higher confidence than the true positive + """ + groundtruths = [ + schemas.GroundTruth( + datum=schemas.Datum(uid="uid1"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[schemas.Label(key="key", value="value")], + is_instance=True, + ) + ], + ), + schemas.GroundTruth( + datum=schemas.Datum(uid="uid2"), + annotations=[schemas.Annotation(labels=[])], + ), + ] + + predictions = [ + schemas.Prediction( + datum=schemas.Datum(uid="uid1"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[ + schemas.Label(key="key", value="value", score=0.8) + ], + is_instance=True, + ), + ], + ), + schemas.Prediction( + datum=schemas.Datum(uid="uid2"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[ + schemas.Label(key="key", value="value", score=0.9) + ], + is_instance=True, + ), + ], + ), + ] + + eval_job = evaluate_detection( + groundtruths=groundtruths, + predictions=predictions, + iou_thresholds_to_compute=[0.5], + iou_thresholds_to_return=[0.5], + ) + + ap_metric = [m for m in eval_job.metrics if m["type"] == "AP"][0] + assert ap_metric == { + "type": "AP", + "parameters": {"iou": 0.5}, + "value": 0.5, + "label": {"key": "key", "value": "value"}, + } + + +def test_evaluate_detection_false_negatives_two_images_one_only_with_different_class_low_confidence_of_fp(): + """In this test we have + 1. An image with a matching groundtruth and prediction (same class, `"value"`, and high IOU) + 2. 
A second image with a groundtruth annotation with class `"other value"` and a prediction with lower confidence + then the prediction on the first image. + + In this case, the AP for class `"value"` should be 1 since the false positive has lower confidence than the true positive. + AP for class `"other value"` should be 0 since there is no prediction for the `"other value"` groundtruth + """ + groundtruths = [ + schemas.GroundTruth( + datum=schemas.Datum(uid="uid1"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[schemas.Label(key="key", value="value")], + is_instance=True, + ) + ], + ), + schemas.GroundTruth( + datum=schemas.Datum(uid="uid2"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[schemas.Label(key="key", value="other value")], + is_instance=True, + ) + ], + ), + ] + + predictions = [ + schemas.Prediction( + datum=schemas.Datum(uid="uid1"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[ + schemas.Label(key="key", value="value", score=0.8) + ], + is_instance=True, + ), + ], + ), + schemas.Prediction( + datum=schemas.Datum(uid="uid2"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[ + schemas.Label(key="key", value="value", score=0.7) + ], + is_instance=True, + ), + ], + ), + ] + + eval_job = evaluate_detection( + groundtruths=groundtruths, + predictions=predictions, + iou_thresholds_to_compute=[0.5], + iou_thresholds_to_return=[0.5], + ) + + ap_metric1 = [ + m + for m in eval_job.metrics + if m["type"] == "AP" and m["label"] == {"key": "key", "value": "value"} + ][0] + assert ap_metric1 == { + "type": "AP", + "parameters": {"iou": 0.5}, + "value": 1.0, + "label": {"key": "key", "value": "value"}, + } + + # label `"other value"` is not in the predictions so we should get an AP of 0 + ap_metric2 = [ + m + for m in eval_job.metrics + if m["type"] == "AP" + and m["label"] == {"key": "key", "value": "other value"} + ][0] + assert ap_metric2 == { + "type": "AP", + "parameters": {"iou": 0.5}, + "value": 0, + "label": {"key": "key", "value": "other value"}, + } + + +def test_evaluate_detection_false_negatives_two_images_one_only_with_different_class_high_confidence_of_fp(): + """In this test we have + 1. An image with a matching groundtruth and prediction (same class, `"value"`, and high IOU) + 2. A second image with a groundtruth annotation with clas `"other value"` and a prediction with higher confidence + then the prediction on the first image. + + In this case, the AP for class `"value"` should be 0.5 since the false positive has higher confidence than the true positive. 
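+    (Under standard interpolated AP: the score-0.9 false positive is ranked first,
+    giving the points (precision=0.0, recall=0.0) and, once the score-0.8 true
+    positive is counted, (precision=0.5, recall=1.0), which yields an AP of 0.5.)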
+ AP for class `"other value"` should be 0 since there is no prediction for the `"other value"` groundtruth + """ + groundtruths = [ + schemas.GroundTruth( + datum=schemas.Datum(uid="uid1"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[schemas.Label(key="key", value="value")], + is_instance=True, + ) + ], + ), + schemas.GroundTruth( + datum=schemas.Datum(uid="uid2"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[schemas.Label(key="key", value="other value")], + is_instance=True, + ) + ], + ), + ] + + predictions = [ + schemas.Prediction( + datum=schemas.Datum(uid="uid1"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[ + schemas.Label(key="key", value="value", score=0.8) + ], + is_instance=True, + ), + ], + ), + schemas.Prediction( + datum=schemas.Datum(uid="uid2"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[ + schemas.Label(key="key", value="value", score=0.9) + ], + is_instance=True, + ), + ], + ), + ] + + eval_job = evaluate_detection( + groundtruths=groundtruths, + predictions=predictions, + iou_thresholds_to_compute=[0.5], + iou_thresholds_to_return=[0.5], + ) + + ap_metric1 = [ + m + for m in eval_job.metrics + if m["type"] == "AP" and m["label"] == {"key": "key", "value": "value"} + ][0] + assert ap_metric1 == { + "type": "AP", + "parameters": {"iou": 0.5}, + "value": 0.5, + "label": {"key": "key", "value": "value"}, + } + + # label `"other value"` is not in the predictions so we should get an AP of 0 + ap_metric2 = [ + m + for m in eval_job.metrics + if m["type"] == "AP" + and m["label"] == {"key": "key", "value": "other value"} + ][0] + assert ap_metric2 == { + "type": "AP", + "parameters": {"iou": 0.5}, + "value": 0, + "label": {"key": "key", "value": "other value"}, + } + + +@pytest.fixture +def test_detailed_precision_recall_curve( + evaluate_detection_detailed_pr_curve_groundtruths, + evaluate_detection_detailed_pr_curve_predictions, +): + + eval_job = evaluate_detection( + groundtruths=evaluate_detection_detailed_pr_curve_groundtruths, + predictions=evaluate_detection_detailed_pr_curve_predictions, + metrics_to_return=[enums.MetricType.DetailedPrecisionRecallCurve], + ) + + # one true positive that becomes a false negative when score > .5 + assert eval_job.metrics[0]["value"]["v1"]["0.3"]["tp"]["total"] == 1 + assert eval_job.metrics[0]["value"]["v1"]["0.55"]["tp"]["total"] == 0 + assert eval_job.metrics[0]["value"]["v1"]["0.55"]["fn"]["total"] == 1 + assert ( + eval_job.metrics[0]["value"]["v1"]["0.55"]["fn"]["observations"][ + "no_predictions" + ]["count"] + == 1 + ) + assert eval_job.metrics[0]["value"]["v1"]["0.05"]["fn"]["total"] == 0 + assert eval_job.metrics[0]["value"]["v1"]["0.05"]["fp"]["total"] == 0 + + # one missed detection that never changes + assert ( + eval_job.metrics[0]["value"]["missed_detection"]["0.05"]["fn"][ + "observations" + ]["no_predictions"]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["missed_detection"]["0.95"]["fn"][ + "observations" + ]["no_predictions"]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["missed_detection"]["0.05"]["tp"]["total"] + == 0 + ) + assert ( + eval_job.metrics[0]["value"]["missed_detection"]["0.05"]["fp"]["total"] + == 0 + ) + + # one fn missed_dection 
that becomes a misclassification when pr_curve_iou_threshold <= .48 and score threshold <= .3 + assert ( + eval_job.metrics[0]["value"]["v2"]["0.3"]["fn"]["observations"][ + "no_predictions" + ]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["v2"]["0.35"]["fn"]["observations"][ + "no_predictions" + ]["count"] + == 1 + ) + assert eval_job.metrics[0]["value"]["v2"]["0.05"]["tp"]["total"] == 0 + assert eval_job.metrics[0]["value"]["v2"]["0.05"]["fp"]["total"] == 0 + + # one fp hallucination that becomes a misclassification when pr_curve_iou_threshold <= .48 and score threshold <= .3 + assert ( + eval_job.metrics[0]["value"]["not_v2"]["0.05"]["fp"]["observations"][ + "hallucinations" + ]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["not_v2"]["0.05"]["fp"]["observations"][ + "misclassifications" + ]["count"] + == 0 + ) + assert eval_job.metrics[0]["value"]["not_v2"]["0.05"]["tp"]["total"] == 0 + assert eval_job.metrics[0]["value"]["not_v2"]["0.05"]["fn"]["total"] == 0 + + # one fp hallucination that disappears when score threshold >.15 + assert ( + eval_job.metrics[0]["value"]["hallucination"]["0.05"]["fp"][ + "observations" + ]["hallucinations"]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["hallucination"]["0.35"]["fp"][ + "observations" + ]["hallucinations"]["count"] + == 0 + ) + assert ( + eval_job.metrics[0]["value"]["hallucination"]["0.05"]["tp"]["total"] + == 0 + ) + assert ( + eval_job.metrics[0]["value"]["hallucination"]["0.05"]["fn"]["total"] + == 0 + ) + + # one missed detection and one hallucination due to low iou overlap + assert ( + eval_job.metrics[0]["value"]["low_iou"]["0.3"]["fn"]["observations"][ + "no_predictions" + ]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["low_iou"]["0.95"]["fn"]["observations"][ + "no_predictions" + ]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["low_iou"]["0.3"]["fp"]["observations"][ + "hallucinations" + ]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["low_iou"]["0.55"]["fp"]["observations"][ + "hallucinations" + ]["count"] + == 0 + ) + + # repeat tests using a lower IOU threshold + eval_job_low_iou_threshold = evaluate_detection( + groundtruths=evaluate_detection_detailed_pr_curve_groundtruths, + predictions=evaluate_detection_detailed_pr_curve_predictions, + metrics_to_return=[enums.MetricType.DetailedPrecisionRecallCurve], + pr_curve_iou_threshold=0.45, + ) + + # one true positive that becomes a false negative when score > .5 + assert eval_job.metrics[0]["value"]["v1"]["0.3"]["tp"]["total"] == 1 + assert eval_job.metrics[0]["value"]["v1"]["0.55"]["tp"]["total"] == 0 + assert eval_job.metrics[0]["value"]["v1"]["0.55"]["fn"]["total"] == 1 + assert ( + eval_job.metrics[0]["value"]["v1"]["0.55"]["fn"]["observations"][ + "no_predictions" + ]["count"] + == 1 + ) + assert eval_job.metrics[0]["value"]["v1"]["0.05"]["fn"]["total"] == 0 + assert eval_job.metrics[0]["value"]["v1"]["0.05"]["fp"]["total"] == 0 + + # one missed detection that never changes + assert ( + eval_job.metrics[0]["value"]["missed_detection"]["0.05"]["fn"][ + "observations" + ]["no_predictions"]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["missed_detection"]["0.95"]["fn"][ + "observations" + ]["no_predictions"]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["missed_detection"]["0.05"]["tp"]["total"] + == 0 + ) + assert ( + eval_job.metrics[0]["value"]["missed_detection"]["0.05"]["fp"]["total"] + == 0 + ) + + # one fn missed_dection that becomes a 
misclassification when pr_curve_iou_threshold <= .48 and score threshold <= .3 + assert ( + eval_job_low_iou_threshold.metrics[0]["value"]["v2"]["0.3"]["fn"][ + "observations" + ]["misclassifications"]["count"] + == 1 + ) + assert ( + eval_job_low_iou_threshold.metrics[0]["value"]["v2"]["0.3"]["fn"][ + "observations" + ]["no_predictions"]["count"] + == 0 + ) + assert ( + eval_job_low_iou_threshold.metrics[0]["value"]["v2"]["0.35"]["fn"][ + "observations" + ]["misclassifications"]["count"] + == 0 + ) + assert ( + eval_job_low_iou_threshold.metrics[0]["value"]["v2"]["0.35"]["fn"][ + "observations" + ]["no_predictions"]["count"] + == 1 + ) + assert ( + eval_job_low_iou_threshold.metrics[0]["value"]["v2"]["0.05"]["tp"][ + "total" + ] + == 0 + ) + assert ( + eval_job_low_iou_threshold.metrics[0]["value"]["v2"]["0.05"]["fp"][ + "total" + ] + == 0 + ) + + # one fp hallucination that becomes a misclassification when pr_curve_iou_threshold <= .48 and score threshold <= .3 + assert ( + eval_job_low_iou_threshold.metrics[0]["value"]["not_v2"]["0.05"]["fp"][ + "observations" + ]["hallucinations"]["count"] + == 0 + ) + assert ( + eval_job_low_iou_threshold.metrics[0]["value"]["not_v2"]["0.05"]["fp"][ + "observations" + ]["misclassifications"]["count"] + == 1 + ) + assert ( + eval_job_low_iou_threshold.metrics[0]["value"]["not_v2"]["0.05"]["tp"][ + "total" + ] + == 0 + ) + assert ( + eval_job_low_iou_threshold.metrics[0]["value"]["not_v2"]["0.05"]["fn"][ + "total" + ] + == 0 + ) + + # one fp hallucination that disappears when score threshold >.15 + assert ( + eval_job.metrics[0]["value"]["hallucination"]["0.05"]["fp"][ + "observations" + ]["hallucinations"]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["hallucination"]["0.35"]["fp"][ + "observations" + ]["hallucinations"]["count"] + == 0 + ) + assert ( + eval_job.metrics[0]["value"]["hallucination"]["0.05"]["tp"]["total"] + == 0 + ) + assert ( + eval_job.metrics[0]["value"]["hallucination"]["0.05"]["fn"]["total"] + == 0 + ) + + # one missed detection and one hallucination due to low iou overlap + assert ( + eval_job.metrics[0]["value"]["low_iou"]["0.3"]["fn"]["observations"][ + "no_predictions" + ]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["low_iou"]["0.95"]["fn"]["observations"][ + "no_predictions" + ]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["low_iou"]["0.3"]["fp"]["observations"][ + "hallucinations" + ]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["low_iou"]["0.55"]["fp"]["observations"][ + "hallucinations" + ]["count"] + == 0 + ) + + +def test_evaluate_detection_model_with_no_predictions( + evaluate_detection_groundtruths, +): + """ + Test detection evaluations when the model outputs nothing. 
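+    An empty Prediction (annotations=[]) is still supplied for every datum, and
+    every returned metric value is expected to be 0.0.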
+ + gt_dets1 + datum 1 + - Label (k1, v1) with Annotation area = 1500 + - Label (k2, v2) with Annotation area = 57,510 + datum2 + - Label (k1, v1) with Annotation area = 1100 + """ + predictions = [] + for gt in evaluate_detection_groundtruths: + predictions.append( + schemas.Prediction( + datum=gt.datum, + annotations=[], + ) + ) + + expected_metrics = [ + { + "label": { + "key": "k2", + "value": "v2", + }, + "parameters": { + "iou": 0.5, + }, + "type": "AP", + "value": 0.0, + }, + { + "label": { + "key": "k2", + "value": "v2", + }, + "parameters": { + "iou": 0.75, + }, + "type": "AP", + "value": 0.0, + }, + { + "label": { + "key": "k1", + "value": "v1", + }, + "parameters": { + "iou": 0.5, + }, + "type": "AP", + "value": 0.0, + }, + { + "label": { + "key": "k1", + "value": "v1", + }, + "parameters": { + "iou": 0.75, + }, + "type": "AP", + "value": 0.0, + }, + { + "label": { + "key": "k2", + "value": "v2", + }, + "parameters": { + "ious": [ + 0.5, + 0.55, + 0.6, + 0.65, + 0.7, + 0.75, + 0.8, + 0.85, + 0.9, + 0.95, + ], + }, + "type": "AR", + "value": 0.0, + }, + { + "label": { + "key": "k1", + "value": "v1", + }, + "parameters": { + "ious": [ + 0.5, + 0.55, + 0.6, + 0.65, + 0.7, + 0.75, + 0.8, + 0.85, + 0.9, + 0.95, + ], + }, + "type": "AR", + "value": 0.0, + }, + { + "parameters": { + "iou": 0.5, + "label_key": "k2", + }, + "type": "mAP", + "value": 0.0, + }, + { + "parameters": { + "iou": 0.75, + "label_key": "k2", + }, + "type": "mAP", + "value": 0.0, + }, + { + "parameters": { + "iou": 0.5, + "label_key": "k1", + }, + "type": "mAP", + "value": 0.0, + }, + { + "parameters": { + "iou": 0.75, + "label_key": "k1", + }, + "type": "mAP", + "value": 0.0, + }, + { + "parameters": { + "ious": [ + 0.5, + 0.55, + 0.6, + 0.65, + 0.7, + 0.75, + 0.8, + 0.85, + 0.9, + 0.95, + ], + "label_key": "k2", + }, + "type": "mAR", + "value": 0.0, + }, + { + "parameters": { + "ious": [ + 0.5, + 0.55, + 0.6, + 0.65, + 0.7, + 0.75, + 0.8, + 0.85, + 0.9, + 0.95, + ], + "label_key": "k1", + }, + "type": "mAR", + "value": 0.0, + }, + { + "label": { + "key": "k2", + "value": "v2", + }, + "parameters": { + "ious": [ + 0.5, + 0.55, + 0.6, + 0.65, + 0.7, + 0.75, + 0.8, + 0.85, + 0.9, + 0.95, + ], + }, + "type": "APAveragedOverIOUs", + "value": 0.0, + }, + { + "label": { + "key": "k1", + "value": "v1", + }, + "parameters": { + "ious": [ + 0.5, + 0.55, + 0.6, + 0.65, + 0.7, + 0.75, + 0.8, + 0.85, + 0.9, + 0.95, + ], + }, + "type": "APAveragedOverIOUs", + "value": 0.0, + }, + { + "parameters": { + "ious": [ + 0.5, + 0.55, + 0.6, + 0.65, + 0.7, + 0.75, + 0.8, + 0.85, + 0.9, + 0.95, + ], + "label_key": "k2", + }, + "type": "mAPAveragedOverIOUs", + "value": 0.0, + }, + { + "parameters": { + "ious": [ + 0.5, + 0.55, + 0.6, + 0.65, + 0.7, + 0.75, + 0.8, + 0.85, + 0.9, + 0.95, + ], + "label_key": "k1", + }, + "type": "mAPAveragedOverIOUs", + "value": 0.0, + }, + ] + + eval_job = evaluate_detection( + groundtruths=evaluate_detection_groundtruths, + predictions=predictions, + ) + + computed_metrics = eval_job.metrics + + assert all([metric["value"] == 0 for metric in computed_metrics]) + + for m in expected_metrics: + assert m in computed_metrics + + for m in computed_metrics: + assert m in expected_metrics + + +def test_evaluate_detection_functional_test( + evaluate_detection_functional_test_groundtruths, + evaluate_detection_functional_test_predictions, +): + + eval_job = evaluate_detection( + groundtruths=evaluate_detection_functional_test_groundtruths, + predictions=evaluate_detection_functional_test_predictions, + 
metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + enums.MetricType.PrecisionRecallCurve, + enums.MetricType.DetailedPrecisionRecallCurve, + ], + pr_curve_iou_threshold=0.5, + pr_curve_max_examples=1, + ) + + metrics = [ + m + for m in eval_job.metrics + if m["type"] + not in ["PrecisionRecallCurve", "DetailedPrecisionRecallCurve"] + ] + + # round all metrics to the third decimal place + for i, m in enumerate(metrics): + metrics[i]["value"] = round(m["value"], 3) + + pr_metrics = [ + m for m in eval_job.metrics if m["type"] == "PrecisionRecallCurve" + ] + detailed_pr_metrics = [ + m + for m in eval_job.metrics + if m["type"] == "DetailedPrecisionRecallCurve" + ] + + # cf with torch metrics/pycocotools results listed here: + # https://github.com/Lightning-AI/metrics/blob/107dbfd5fb158b7ae6d76281df44bd94c836bfce/tests/unittests/detection/test_map.py#L231 + expected_metrics = [ + { + "label": {"key": "class", "value": "0"}, + "parameters": {"iou": 0.5}, + "value": 1.0, + "type": "AP", + }, + { + "label": {"key": "class", "value": "0"}, + "parameters": {"iou": 0.75}, + "value": 0.723, + "type": "AP", + }, + { + "label": {"key": "class", "value": "2"}, + "parameters": {"iou": 0.5}, + "value": 0.505, + "type": "AP", + }, + { + "label": {"key": "class", "value": "2"}, + "parameters": {"iou": 0.75}, + "value": 0.505, + "type": "AP", + }, + { + "label": {"key": "class", "value": "49"}, + "parameters": {"iou": 0.5}, + "value": 0.791, + "type": "AP", + }, + { + "label": {"key": "class", "value": "49"}, + "parameters": {"iou": 0.75}, + "value": 0.576, + "type": "AP", + }, + { + "label": {"key": "class", "value": "1"}, + "parameters": {"iou": 0.5}, + "value": 1.0, + "type": "AP", + }, + { + "label": {"key": "class", "value": "1"}, + "parameters": {"iou": 0.75}, + "value": 1.0, + "type": "AP", + }, + { + "label": {"key": "class", "value": "4"}, + "parameters": {"iou": 0.5}, + "value": 1.0, + "type": "AP", + }, + { + "label": {"key": "class", "value": "4"}, + "parameters": {"iou": 0.75}, + "value": 1.0, + "type": "AP", + }, + { + "parameters": {"label_key": "class", "iou": 0.5}, + "value": 0.859, + "type": "mAP", + }, + { + "parameters": {"label_key": "class", "iou": 0.75}, + "value": 0.761, + "type": "mAP", + }, + { + "label": {"key": "class", "value": "0"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 0.725, + "type": "APAveragedOverIOUs", + }, + { + "label": {"key": "class", "value": "2"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 0.454, + "type": "APAveragedOverIOUs", + }, + { + "label": {"key": "class", "value": "49"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 0.556, + "type": "APAveragedOverIOUs", + }, + { + "label": {"key": "class", "value": "1"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 0.8, + "type": "APAveragedOverIOUs", + }, + { + "label": {"key": "class", "value": "4"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 0.65, + "type": "APAveragedOverIOUs", + }, + { + "parameters": { + "label_key": "class", + "ious": [ + 0.5, + 0.55, + 0.6, + 0.65, + 0.7, + 0.75, + 0.8, + 0.85, + 0.9, + 0.95, + ], + }, + "value": 0.637, + "type": "mAPAveragedOverIOUs", + }, + { + "label": 
{"key": "class", "value": "0"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 0.78, + "type": "AR", + }, + { + "label": {"key": "class", "value": "2"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 0.45, + "type": "AR", + }, + { + "label": {"key": "class", "value": "49"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 0.58, + "type": "AR", + }, + { + "label": {"key": "class", "value": "3"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": -1.0, + "type": "AR", + }, + { + "label": {"key": "class", "value": "1"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 0.8, + "type": "AR", + }, + { + "label": {"key": "class", "value": "4"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 0.65, + "type": "AR", + }, + { + "parameters": { + "label_key": "class", + "ious": [ + 0.5, + 0.55, + 0.6, + 0.65, + 0.7, + 0.75, + 0.8, + 0.85, + 0.9, + 0.95, + ], + }, + "value": 0.652, + "type": "mAR", + }, + ] + + pr_expected_answers = { + # (class, 4) + ("class", "4", 0.05, "tp"): 2, + ("class", "4", 0.05, "fn"): 0, + ("class", "4", 0.25, "tp"): 1, + ("class", "4", 0.25, "fn"): 1, + ("class", "4", 0.55, "tp"): 0, + ("class", "4", 0.55, "fn"): 2, + # (class, 2) + ("class", "2", 0.05, "tp"): 1, + ("class", "2", 0.05, "fn"): 1, + ("class", "2", 0.75, "tp"): 0, + ("class", "2", 0.75, "fn"): 2, + # (class, 49) + ("class", "49", 0.05, "tp"): 8, + ("class", "49", 0.3, "tp"): 5, + ("class", "49", 0.5, "tp"): 4, + ("class", "49", 0.85, "tp"): 1, + # (class, 3) + ("class", "3", 0.05, "tp"): 0, + ("class", "3", 0.05, "fp"): 1, + # (class, 1) + ("class", "1", 0.05, "tp"): 1, + ("class", "1", 0.35, "tp"): 0, + # (class, 0) + ("class", "0", 0.05, "tp"): 5, + ("class", "0", 0.5, "tp"): 3, + ("class", "0", 0.95, "tp"): 1, + ("class", "0", 0.95, "fn"): 4, + } + + detailed_pr_expected_answers = { + # (class, 4) + ("4", 0.05, "tp"): {"all": 2, "total": 2}, + ("4", 0.05, "fn"): { + "no_predictions": 0, + "misclassifications": 0, + "total": 0, + }, + # (class, 2) + ("2", 0.05, "tp"): {"all": 1, "total": 1}, + ("2", 0.05, "fn"): { + "no_predictions": 0, + "misclassifications": 1, + "total": 1, + }, + ("2", 0.75, "tp"): {"all": 0, "total": 0}, + ("2", 0.75, "fn"): { + "no_predictions": 2, + "misclassifications": 0, + "total": 2, + }, + # (class, 49) + ("49", 0.05, "tp"): {"all": 9, "total": 9}, + # (class, 3) + ("3", 0.05, "tp"): {"all": 0, "total": 0}, + ("3", 0.05, "fp"): { + "hallucinations": 0, + "misclassifications": 1, + "total": 1, + }, + # (class, 1) + ("1", 0.05, "tp"): {"all": 1, "total": 1}, + ("1", 0.8, "fn"): { + "no_predictions": 1, + "misclassifications": 0, + "total": 1, + }, + # (class, 0) + ("0", 0.05, "tp"): {"all": 5, "total": 5}, + ("0", 0.95, "fn"): { + "no_predictions": 4, + "misclassifications": 0, + "total": 4, + }, + } + + for m in metrics: + assert m in expected_metrics + for m in metrics: + assert m in eval_job.metrics + + for ( + _, + value, + threshold, + metric, + ), expected_value in pr_expected_answers.items(): + assert ( + pr_metrics[0]["value"][value][threshold][metric] == expected_value + ) + + for ( + value, + threshold, + metric, + ), expected_output in detailed_pr_expected_answers.items(): + model_output = detailed_pr_metrics[0]["value"][value][threshold][ + metric + ] + assert 
isinstance(model_output, dict) + assert model_output["total"] == expected_output["total"] + assert all( + [ + model_output["observations"][key]["count"] # type: ignore - we know this element is a dict + == expected_output[key] + for key in [ + key + for key in expected_output.keys() + if key not in ["total"] + ] + ] + ) + + # spot check number of examples + assert ( + len( + detailed_pr_metrics[0]["value"]["0"][0.95]["fn"]["observations"]["no_predictions"][ # type: ignore - we know this element is a dict + "examples" + ] + ) + == 1 + ) + assert ( + len( + detailed_pr_metrics[0]["value"]["49"][0.05]["tp"]["observations"]["all"][ # type: ignore - we know this element is a dict + "examples" + ] + ) + == 1 + ) + + # raise the iou threshold + eval_job_higher_threshold = evaluate_detection( + groundtruths=evaluate_detection_functional_test_groundtruths, + predictions=evaluate_detection_functional_test_predictions, + metrics_to_return=[ + enums.MetricType.PrecisionRecallCurve, + enums.MetricType.DetailedPrecisionRecallCurve, + ], + pr_curve_iou_threshold=0.9, + pr_curve_max_examples=1, + ) + + pr_metrics = [ + m + for m in eval_job_higher_threshold.metrics + if m["type"] == "PrecisionRecallCurve" + ] + detailed_pr_metrics = [ + m + for m in eval_job_higher_threshold.metrics + if m["type"] == "DetailedPrecisionRecallCurve" + ] + + pr_expected_answers = { + # (class, 4) + ("class", "4", 0.05, "tp"): 0, + ("class", "4", 0.05, "fn"): 2, + # (class, 2) + ("class", "2", 0.05, "tp"): 1, + ("class", "2", 0.05, "fn"): 1, + ("class", "2", 0.75, "tp"): 0, + ("class", "2", 0.75, "fn"): 2, + # (class, 49) + ("class", "49", 0.05, "tp"): 2, + ("class", "49", 0.3, "tp"): 2, + ("class", "49", 0.5, "tp"): 2, + ("class", "49", 0.85, "tp"): 1, + # (class, 3) + ("class", "3", 0.05, "tp"): 0, + ("class", "3", 0.05, "fp"): 1, + # (class, 1) + ("class", "1", 0.05, "tp"): 0, + ("class", "1", 0.05, "fn"): 1, + # (class, 0) + ("class", "0", 0.05, "tp"): 1, + ("class", "0", 0.5, "tp"): 0, + ("class", "0", 0.95, "fn"): 5, + } + + detailed_pr_expected_answers = { + # (class, 4) + ("4", 0.05, "tp"): {"all": 0, "total": 0}, + ("4", 0.05, "fn"): { + "no_predictions": 2, # below IOU threshold of .9 + "misclassifications": 0, + "total": 2, + }, + # (class, 2) + ("2", 0.05, "tp"): {"all": 1, "total": 1}, + ("2", 0.05, "fn"): { + "no_predictions": 1, + "misclassifications": 0, + "total": 1, + }, + ("2", 0.75, "tp"): {"all": 0, "total": 0}, + ("2", 0.75, "fn"): { + "no_predictions": 2, + "misclassifications": 0, + "total": 2, + }, + # (class, 49) + ("49", 0.05, "tp"): {"all": 2, "total": 2}, + # (class, 3) + ("3", 0.05, "tp"): {"all": 0, "total": 0}, + ("3", 0.05, "fp"): { + "hallucinations": 1, + "misclassifications": 0, + "total": 1, + }, + # (class, 1) + ("1", 0.05, "tp"): {"all": 0, "total": 0}, + ("1", 0.8, "fn"): { + "no_predictions": 1, + "misclassifications": 0, + "total": 1, + }, + # (class, 0) + ("0", 0.05, "tp"): {"all": 1, "total": 1}, + ("0", 0.95, "fn"): { + "no_predictions": 5, + "misclassifications": 0, + "total": 5, + }, + } + + for ( + key, + value, + threshold, + metric, + ), expected_count in pr_expected_answers.items(): + actual_count = pr_metrics[0]["value"][value][threshold][metric] + assert actual_count == expected_count + + for ( + value, + threshold, + metric, + ), expected_output in detailed_pr_expected_answers.items(): + model_output = detailed_pr_metrics[0]["value"][value][threshold][ + metric + ] + assert isinstance(model_output, dict) + assert model_output["total"] == expected_output["total"] + assert 
all( + [ + model_output["observations"][key]["count"] # type: ignore - we know this element is a dict + == expected_output[key] + for key in [ + key + for key in expected_output.keys() + if key not in ["total"] + ] + ] + ) + + assert ( + len( + detailed_pr_metrics[0]["value"]["0"][0.95]["fn"]["observations"]["no_predictions"][ # type: ignore - we know this element is a dict + "examples" + ] + ) + == 1 + ) + assert ( + len( + detailed_pr_metrics[0]["value"]["49"][0.05]["tp"]["observations"]["all"][ # type: ignore - we know this element is a dict + "examples" + ] + ) + == 1 + ) + + # repeat the above, but with a higher pr_max_curves_example + eval_job_higher_threshold = evaluate_detection( + groundtruths=evaluate_detection_functional_test_groundtruths, + predictions=evaluate_detection_functional_test_predictions, + metrics_to_return=[ + enums.MetricType.PrecisionRecallCurve, + enums.MetricType.DetailedPrecisionRecallCurve, + ], + pr_curve_iou_threshold=0.9, + pr_curve_max_examples=3, + ) + + pr_metrics = [ + m + for m in eval_job_higher_threshold.metrics + if m["type"] == "PrecisionRecallCurve" + ] + detailed_pr_metrics = [ + m + for m in eval_job_higher_threshold.metrics + if m["type"] == "DetailedPrecisionRecallCurve" + ] + + for ( + key, + value, + threshold, + metric, + ), expected_count in pr_expected_answers.items(): + actual_count = pr_metrics[0]["value"][value][threshold][metric] + assert actual_count == expected_count + + for ( + value, + threshold, + metric, + ), expected_output in detailed_pr_expected_answers.items(): + model_output = detailed_pr_metrics[0]["value"][value][threshold][ + metric + ] + assert isinstance(model_output, dict) + assert model_output["total"] == expected_output["total"] + assert all( + [ + model_output["observations"][key]["count"] # type: ignore - we know this element is a dict + == expected_output[key] + for key in [ + key + for key in expected_output.keys() + if key not in ["total"] + ] + ] + ) + + assert ( + len( + detailed_pr_metrics[0]["value"]["0"][0.95]["fn"]["observations"]["no_predictions"][ # type: ignore - we know this element is a dict + "examples" + ] + ) + == 3 + ) + assert ( + len( + detailed_pr_metrics[0]["value"]["49"][0.05]["tp"]["observations"]["all"][ # type: ignore - we know this element is a dict + "examples" + ] + ) + == 2 + ) + + # test behavior if pr_curve_max_examples == 0 + eval_job_higher_threshold = evaluate_detection( + groundtruths=evaluate_detection_functional_test_groundtruths, + predictions=evaluate_detection_functional_test_predictions, + metrics_to_return=[ + enums.MetricType.PrecisionRecallCurve, + enums.MetricType.DetailedPrecisionRecallCurve, + ], + pr_curve_iou_threshold=0.9, + pr_curve_max_examples=0, + ) + + pr_metrics = [ + m + for m in eval_job_higher_threshold.metrics + if m["type"] == "PrecisionRecallCurve" + ] + detailed_pr_metrics = [ + m + for m in eval_job_higher_threshold.metrics + if m["type"] == "DetailedPrecisionRecallCurve" + ] + + for ( + key, + value, + threshold, + metric, + ), expected_count in pr_expected_answers.items(): + actual_count = pr_metrics[0]["value"][value][threshold][metric] + assert actual_count == expected_count + + for ( + value, + threshold, + metric, + ), expected_output in detailed_pr_expected_answers.items(): + model_output = detailed_pr_metrics[0]["value"][value][threshold][ + metric + ] + assert isinstance(model_output, dict) + assert model_output["total"] == expected_output["total"] + assert all( + [ + model_output["observations"][key]["count"] # type: ignore - we know this 
element is a dict + == expected_output[key] + for key in [ + key + for key in expected_output.keys() + if key not in ["total"] + ] + ] + ) + + # spot check number of examples + assert ( + len( + detailed_pr_metrics[0]["value"]["0"][0.95]["fn"]["observations"]["no_predictions"][ # type: ignore - we know this element is a dict + "examples" + ] + ) + == 0 + ) + assert ( + len( + detailed_pr_metrics[0]["value"]["49"][0.05]["tp"]["observations"]["all"][ # type: ignore - we know this element is a dict + "examples" + ] + ) + == 0 + ) + + +def test_evaluate_detection_functional_test_with_rasters( + evaluate_detection_functional_test_groundtruths_with_rasters, + evaluate_detection_functional_test_predictions_with_rasters, +): + eval_job = evaluate_detection( + groundtruths=evaluate_detection_functional_test_groundtruths_with_rasters, + predictions=evaluate_detection_functional_test_predictions_with_rasters, + metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + enums.MetricType.PrecisionRecallCurve, + ], + pr_curve_iou_threshold=0.5, + pr_curve_max_examples=1, + ) + + metrics = [ + m + for m in eval_job.metrics + if m["type"] + not in ["PrecisionRecallCurve", "DetailedPrecisionRecallCurve"] + ] + + # round all metrics to the third decimal place + for i, m in enumerate(metrics): + metrics[i]["value"] = round(m["value"], 3) + + pr_metrics = [ + m for m in eval_job.metrics if m["type"] == "PrecisionRecallCurve" + ] + + expected_metrics = [ + { + "label": {"key": "class", "value": "label1"}, + "parameters": {"iou": 0.5}, + "value": 1.0, + "type": "AP", + }, + { + "label": {"key": "class", "value": "label1"}, + "parameters": {"iou": 0.75}, + "value": 1.0, + "type": "AP", + }, + { + "label": {"key": "class", "value": "label2"}, + "parameters": {"iou": 0.5}, + "value": 1.0, + "type": "AP", + }, + { + "label": {"key": "class", "value": "label2"}, + "parameters": {"iou": 0.75}, + "value": 1.0, + "type": "AP", + }, + { + "label": {"key": "class", "value": "label3"}, + "parameters": {"iou": 0.5}, + "value": 0.0, + "type": "AP", + }, + { + "label": {"key": "class", "value": "label3"}, + "parameters": {"iou": 0.75}, + "value": 0.0, + "type": "AP", + }, + { + "parameters": {"label_key": "class", "iou": 0.5}, + "value": 0.667, + "type": "mAP", + }, + { + "parameters": {"label_key": "class", "iou": 0.75}, + "value": 0.667, + "type": "mAP", + }, + { + "label": {"key": "class", "value": "label1"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 1.0, + "type": "APAveragedOverIOUs", + }, + { + "label": {"key": "class", "value": "label2"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 1.0, + "type": "APAveragedOverIOUs", + }, + { + "label": {"key": "class", "value": "label3"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 0.0, + "type": "APAveragedOverIOUs", + }, + { + "parameters": { + "label_key": "class", + "ious": [ + 0.5, + 0.55, + 0.6, + 0.65, + 0.7, + 0.75, + 0.8, + 0.85, + 0.9, + 0.95, + ], + }, + "value": 0.667, + "type": "mAPAveragedOverIOUs", + }, + { + "label": {"key": "class", "value": "label1"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 1.0, + "type": "AR", + }, + { + "label": {"key": "class", "value": "label4"}, + "parameters": { + "ious": [0.5, 0.55, 
0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": -1.0, + "type": "AR", + }, + { + "label": {"key": "class", "value": "label2"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 1.0, + "type": "AR", + }, + { + "label": {"key": "class", "value": "label3"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 0.0, + "type": "AR", + }, + { + "parameters": { + "label_key": "class", + "ious": [ + 0.5, + 0.55, + 0.6, + 0.65, + 0.7, + 0.75, + 0.8, + 0.85, + 0.9, + 0.95, + ], + }, + "value": 0.667, + "type": "mAR", + }, + ] + + for m in metrics: + assert m in expected_metrics + + for m in expected_metrics: + assert m in metrics + + pr_expected_answers = { + ("class", "label1", 0.05, "tp"): 1, + ("class", "label1", 0.35, "tp"): 0, + ("class", "label2", 0.05, "tp"): 1, + ("class", "label2", 0.05, "fp"): 0, + ("class", "label2", 0.95, "fp"): 0, + ("class", "label3", 0.05, "tp"): 0, + ("class", "label3", 0.05, "fn"): 1, + ("class", "label4", 0.05, "tp"): 0, + ("class", "label4", 0.05, "fp"): 1, + } + + for ( + _, + value, + threshold, + metric, + ), expected_value in pr_expected_answers.items(): + assert ( + pr_metrics[0]["value"][value][threshold][metric] == expected_value + ) + + +def test_evaluate_mixed_annotations( + image_height: int, + image_width: int, +): + """Test the automatic conversion to rasters.""" + + datum = schemas.Datum(uid="datum1") + + xmin, xmax, ymin, ymax = 11, 45, 37, 102 + h, w = image_height, image_width + mask = np.zeros((h, w), dtype=bool) + mask[ymin:ymax, xmin:xmax] = True + + pts = [ + (xmin, ymin), + (xmin, ymax), + (xmax, ymax), + (xmax, ymin), + (xmin, ymin), + ] + poly = schemas.Polygon([pts]) + raster = schemas.Raster(mask) + box = schemas.Box.from_extrema(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax) + + gt_annotations = [ + schemas.Annotation( + raster=raster, + labels=[schemas.Label(key="key", value="value")], + is_instance=True, + ), + schemas.Annotation( + raster=raster, + labels=[schemas.Label(key="key1", value="value")], + is_instance=True, + ), + schemas.Annotation( + raster=raster, + labels=[schemas.Label(key="key2", value="value")], + is_instance=True, + ), + ] + + pd_annotations = [ + schemas.Annotation( + raster=raster, + labels=[schemas.Label(key="key", value="value", score=0.90)], + is_instance=True, + ), + schemas.Annotation( + polygon=poly, + labels=[schemas.Label(key="key1", value="value", score=0.89)], + is_instance=True, + ), + schemas.Annotation( + bounding_box=box, + labels=[schemas.Label(key="key2", value="value", score=0.88)], + is_instance=True, + ), + ] + gts = [ + schemas.GroundTruth( + datum=datum, + annotations=[ann for ann in gt_annotations], + ) + ] + + pds = [ + schemas.Prediction( + datum=datum, + annotations=[ann for ann in pd_annotations], + ) + ] + + # by default, valor_core should throw an error if given mixed AnnotationTypes without being explicitely told to convert to a certain type + with pytest.raises(ValueError): + _ = evaluate_detection( + groundtruths=gts, + predictions=pds, + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=[ + enums.MetricType.AP, + ], + ) + + # test conversion to raster. this should throw an error since the user is trying to convert a Box annotation to a polygon. 
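+    # (the requested target type below is RASTER; the box prediction cannot be
+    # converted to it, so this call is likewise expected to raise a ValueError)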
+ with pytest.raises(ValueError): + evaluate_detection( + groundtruths=gts, + predictions=pds, + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=[ + enums.MetricType.AP, + ], + convert_annotations_to_type=enums.AnnotationType.RASTER, + ) + + # test conversion to polygon. this should throw an error since the user is trying to convert a Box annotation to a polygon. + with pytest.raises(ValueError): + evaluate_detection( + groundtruths=gts, + predictions=pds, + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=[ + enums.MetricType.AP, + ], + convert_annotations_to_type=enums.AnnotationType.POLYGON, + ) + + # test conversion to box + eval_job_box = evaluate_detection( + groundtruths=gts, + predictions=pds, + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=[ + enums.MetricType.AP, + ], + convert_annotations_to_type=enums.AnnotationType.BOX, + ) + + expected = [ + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 1.0, + "label": {"key": "key", "value": "value"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 1.0, + "label": {"key": "key", "value": "value"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 1.0, + "label": {"key": "key2", "value": "value"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 1.0, + "label": {"key": "key2", "value": "value"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 1.0, + "label": {"key": "key1", "value": "value"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 1.0, + "label": {"key": "key1", "value": "value"}, + }, + ] + + for m in eval_job_box.metrics: + assert m in expected + for m in expected: + assert m in eval_job_box.metrics + + +def test_evaluate_detection_rotated_bboxes_with_shapely( + rect1: list[tuple[float, float]], + rect2: list[tuple[float, float]], + rect3: list[tuple[float, float]], + img1: schemas.Datum, + img2: schemas.Datum, +): + """ + Run the same test as test_evaluate_detection, but rotate all of the bounding boxes by some random numbewr of degrees to confirm we get the same outputs. 
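+    Applying the same rotation to a groundtruth box and the prediction that
+    matches it leaves their IOU unchanged, so the expected metric values below
+    are identical to the axis-aligned version of this test.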
+ """ + + random_angle = random.uniform(0, 365) + + groundtruths = [ + schemas.GroundTruth( + datum=img1, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k1", value="v1")], + bounding_box=schemas.Box( + [geometry.rotate_bbox(rect1, random_angle)] + ), + ), + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k2", value="v2")], + bounding_box=schemas.Box( + [geometry.rotate_bbox(rect3, random_angle)] + ), + ), + ], + ), + schemas.GroundTruth( + datum=img2, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k1", value="v1")], + bounding_box=schemas.Box( + [geometry.rotate_bbox(rect2, random_angle)] + ), + ) + ], + ), + ] + + predictions = [ + schemas.Prediction( + datum=img1, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k1", value="v1", score=0.3)], + bounding_box=schemas.Box( + [geometry.rotate_bbox(rect1, random_angle)] + ), + ) + ], + ), + schemas.Prediction( + datum=img2, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k2", value="v2", score=0.98)], + bounding_box=schemas.Box( + [geometry.rotate_bbox(rect2, random_angle)] + ), + ) + ], + ), + ] + + eval_job = evaluate_detection( + groundtruths=groundtruths, + predictions=predictions, + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + ], + ) + + metrics = eval_job.metrics + + expected_metrics = [ + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"iou": 0.1}, + "value": 0.0, + "type": "AP", + }, + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"iou": 0.6}, + "value": 0.0, + "type": "AP", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"iou": 0.1}, + "value": 0.504950495049505, + "type": "AP", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"iou": 0.6}, + "value": 0.504950495049505, + "type": "AP", + }, + { + "parameters": {"label_key": "k1", "iou": 0.1}, + "value": 0.504950495049505, + "type": "mAP", + }, + { + "parameters": {"label_key": "k2", "iou": 0.1}, + "value": 0.0, + "type": "mAP", + }, + { + "parameters": {"label_key": "k1", "iou": 0.6}, + "value": 0.504950495049505, + "type": "mAP", + }, + { + "parameters": {"label_key": "k2", "iou": 0.6}, + "value": 0.0, + "type": "mAP", + }, + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "type": "APAveragedOverIOUs", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.504950495049505, + "type": "APAveragedOverIOUs", + }, + { + "parameters": {"label_key": "k1", "ious": [0.1, 0.6]}, + "value": 0.504950495049505, + "type": "mAPAveragedOverIOUs", + }, + { + "parameters": {"label_key": "k2", "ious": [0.1, 0.6]}, + "value": 0.0, + "type": "mAPAveragedOverIOUs", + }, + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "type": "AR", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.5, + "type": "AR", + }, + { + "parameters": {"label_key": "k1", "ious": [0.1, 0.6]}, + "value": 0.5, + "type": "mAR", + }, + { + "parameters": {"label_key": "k2", "ious": [0.1, 0.6]}, + "value": 0.0, + "type": "mAR", + }, + ] + + for m in metrics: + if m["type"] not in [ + 
"PrecisionRecallCurve", + "DetailedPrecisionRecallCurve", + ]: + assert m in expected_metrics + for m in expected_metrics: + assert m in metrics + + assert eval_job.ignored_pred_labels == [] + assert eval_job.missing_pred_labels == [] + + result = eval_job + result_dict = result.to_dict() + + # duration isn't deterministic, so test meta separately + assert result_dict["meta"]["datums"] == 2 + assert result_dict["meta"]["labels"] == 2 + assert result_dict["meta"]["annotations"] == 5 + assert result_dict["meta"]["duration"] <= 5 + result_dict.pop("meta") + result_dict.pop("metrics") + + assert result_dict == { + "parameters": { + "label_map": {}, + "metrics_to_return": [ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + ], + "iou_thresholds_to_compute": [0.1, 0.6], + "iou_thresholds_to_return": [0.1, 0.6], + "recall_score_threshold": 0.0, + "pr_curve_iou_threshold": 0.5, + "pr_curve_max_examples": 1, + "convert_annotations_to_type": None, + }, + "confusion_matrices": [], + "ignored_pred_labels": [], + "missing_pred_labels": [], + } + + # check that metrics arg works correctly + selected_metrics = random.sample( + [ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + enums.MetricType.PrecisionRecallCurve, + ], + 2, + ) + eval_job = evaluate_detection( + groundtruths=groundtruths, + predictions=predictions, + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=selected_metrics, + ) + + metrics = eval_job.metrics + assert set([metric["type"] for metric in eval_job.metrics]) == set( + selected_metrics + ) + + +@pytest.fixture +def rect1_rotated_5_degrees_around_origin() -> list[tuple[float, float]]: + """Box with area = 1500.""" + return [ + (9.090389553440874, 10.833504408394036), + (58.90012445802815, 15.191291545776945), + (56.28545217559841, 45.07713248852931), + (6.475717271011129, 40.7193453511464), + (9.090389553440874, 10.833504408394036), + ] + + +@pytest.fixture +def rect2_rotated_5_degrees_around_origin() -> list[tuple[float, float]]: + """Box with area = 1100.""" + return [ + (14.942920471376183, 1.3073361412148725), + (69.7336288664222, 6.1009019923360714), + (67.99051401146903, 26.024795954170983), + (13.19980561642302, 21.231230103049782), + (14.942920471376183, 1.3073361412148725), + ] + + +@pytest.fixture +def rect3_rotated_5_degrees_around_origin() -> list[tuple[float, float]]: + """Box with area = 57,510.""" + return [ + (85.79738130650527, 17.544496599963715), + (156.52720487101922, 23.732554335047446), + (85.9310532454161, 830.6502597893614), + (15.20122968090216, 824.4622020542777), + (85.79738130650527, 17.544496599963715), + ] + + +def test_evaluate_detection_rotated_bboxes( + rect1_rotated_5_degrees_around_origin: list[tuple[float, float]], + rect2_rotated_5_degrees_around_origin: list[tuple[float, float]], + rect3_rotated_5_degrees_around_origin: list[tuple[float, float]], + img1: schemas.Datum, + img2: schemas.Datum, +): + """ + Run the same test as test_evaluate_detection, but rotate all of the bounding boxes by 5 degrees around the origin to confirm we get the same outputs. 
+ """ + + groundtruths = [ + schemas.GroundTruth( + datum=img1, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k1", value="v1")], + bounding_box=schemas.Box( + [rect1_rotated_5_degrees_around_origin] + ), + ), + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k2", value="v2")], + bounding_box=schemas.Box( + [rect3_rotated_5_degrees_around_origin] + ), + ), + ], + ), + schemas.GroundTruth( + datum=img2, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k1", value="v1")], + bounding_box=schemas.Box( + [rect2_rotated_5_degrees_around_origin] + ), + ) + ], + ), + ] + + predictions = [ + schemas.Prediction( + datum=img1, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k1", value="v1", score=0.3)], + bounding_box=schemas.Box( + [rect1_rotated_5_degrees_around_origin] + ), + ) + ], + ), + schemas.Prediction( + datum=img2, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k2", value="v2", score=0.98)], + bounding_box=schemas.Box( + [rect2_rotated_5_degrees_around_origin] + ), + ) + ], + ), + ] + + eval_job = evaluate_detection( + groundtruths=groundtruths, + predictions=predictions, + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + ], + ) + + metrics = eval_job.metrics + + expected_metrics = [ + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"iou": 0.1}, + "value": 0.0, + "type": "AP", + }, + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"iou": 0.6}, + "value": 0.0, + "type": "AP", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"iou": 0.1}, + "value": 0.504950495049505, + "type": "AP", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"iou": 0.6}, + "value": 0.504950495049505, + "type": "AP", + }, + { + "parameters": {"label_key": "k1", "iou": 0.1}, + "value": 0.504950495049505, + "type": "mAP", + }, + { + "parameters": {"label_key": "k2", "iou": 0.1}, + "value": 0.0, + "type": "mAP", + }, + { + "parameters": {"label_key": "k1", "iou": 0.6}, + "value": 0.504950495049505, + "type": "mAP", + }, + { + "parameters": {"label_key": "k2", "iou": 0.6}, + "value": 0.0, + "type": "mAP", + }, + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "type": "APAveragedOverIOUs", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.504950495049505, + "type": "APAveragedOverIOUs", + }, + { + "parameters": {"label_key": "k1", "ious": [0.1, 0.6]}, + "value": 0.504950495049505, + "type": "mAPAveragedOverIOUs", + }, + { + "parameters": {"label_key": "k2", "ious": [0.1, 0.6]}, + "value": 0.0, + "type": "mAPAveragedOverIOUs", + }, + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "type": "AR", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.5, + "type": "AR", + }, + { + "parameters": {"label_key": "k1", "ious": [0.1, 0.6]}, + "value": 0.5, + "type": "mAR", + }, + { + "parameters": {"label_key": "k2", "ious": [0.1, 0.6]}, + "value": 0.0, + "type": "mAR", + }, + ] + + for m in metrics: + if m["type"] not in [ + "PrecisionRecallCurve", + "DetailedPrecisionRecallCurve", + 
]: + assert m in expected_metrics + for m in expected_metrics: + assert m in metrics + + assert eval_job.ignored_pred_labels == [] + assert eval_job.missing_pred_labels == [] + + result = eval_job + result_dict = result.to_dict() + + # duration isn't deterministic, so test meta separately + assert result_dict["meta"]["datums"] == 2 + assert result_dict["meta"]["labels"] == 2 + assert result_dict["meta"]["annotations"] == 5 + assert result_dict["meta"]["duration"] <= 5 + result_dict.pop("meta") + result_dict.pop("metrics") + + assert result_dict == { + "parameters": { + "label_map": {}, + "metrics_to_return": [ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + ], + "iou_thresholds_to_compute": [0.1, 0.6], + "iou_thresholds_to_return": [0.1, 0.6], + "recall_score_threshold": 0.0, + "pr_curve_iou_threshold": 0.5, + "pr_curve_max_examples": 1, + "convert_annotations_to_type": None, + }, + "confusion_matrices": [], + "ignored_pred_labels": [], + "missing_pred_labels": [], + } + + # check that metrics arg works correctly + selected_metrics = random.sample( + [ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + enums.MetricType.PrecisionRecallCurve, + ], + 2, + ) + eval_job = evaluate_detection( + groundtruths=groundtruths, + predictions=predictions, + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=selected_metrics, + ) + + metrics = eval_job.metrics + assert set([metric["type"] for metric in eval_job.metrics]) == set( + selected_metrics + ) + + +def test_two_groundtruths_one_datum( + evaluate_detection_predictions: list[schemas.Prediction], + rect1: list[tuple[float, float]], + rect2: list[tuple[float, float]], + rect3: list[tuple[float, float]], + img1: schemas.Datum, + img2: schemas.Datum, +): + """Same test as test_evaluate_detection, but we show that we can handle two groundtruths for a single datum""" + + groundtruths = [ + schemas.GroundTruth( + datum=img1, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k1", value="v1")], + bounding_box=schemas.Box([rect1]), + ), + ], + ), + schemas.GroundTruth( + datum=img1, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k2", value="v2")], + bounding_box=schemas.Box([rect3]), + ), + ], + ), + schemas.GroundTruth( + datum=img2, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k1", value="v1")], + bounding_box=schemas.Box([rect2]), + ) + ], + ), + ] + + eval_job = evaluate_detection( + groundtruths=groundtruths, + predictions=evaluate_detection_predictions, + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + ], + ) + + metrics = eval_job.metrics + + expected_metrics = [ + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"iou": 0.1}, + "value": 0.0, + "type": "AP", + }, + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"iou": 0.6}, + "value": 0.0, + "type": "AP", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"iou": 0.1}, + "value": 0.504950495049505, + "type": "AP", + }, + { + "label": 
{"key": "k1", "value": "v1"}, + "parameters": {"iou": 0.6}, + "value": 0.504950495049505, + "type": "AP", + }, + { + "parameters": {"label_key": "k1", "iou": 0.1}, + "value": 0.504950495049505, + "type": "mAP", + }, + { + "parameters": {"label_key": "k2", "iou": 0.1}, + "value": 0.0, + "type": "mAP", + }, + { + "parameters": {"label_key": "k1", "iou": 0.6}, + "value": 0.504950495049505, + "type": "mAP", + }, + { + "parameters": {"label_key": "k2", "iou": 0.6}, + "value": 0.0, + "type": "mAP", + }, + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "type": "APAveragedOverIOUs", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.504950495049505, + "type": "APAveragedOverIOUs", + }, + { + "parameters": {"label_key": "k1", "ious": [0.1, 0.6]}, + "value": 0.504950495049505, + "type": "mAPAveragedOverIOUs", + }, + { + "parameters": {"label_key": "k2", "ious": [0.1, 0.6]}, + "value": 0.0, + "type": "mAPAveragedOverIOUs", + }, + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "type": "AR", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.5, + "type": "AR", + }, + { + "parameters": {"label_key": "k1", "ious": [0.1, 0.6]}, + "value": 0.5, + "type": "mAR", + }, + { + "parameters": {"label_key": "k2", "ious": [0.1, 0.6]}, + "value": 0.0, + "type": "mAR", + }, + ] + + for m in metrics: + if m["type"] not in [ + "PrecisionRecallCurve", + "DetailedPrecisionRecallCurve", + ]: + assert m in expected_metrics + for m in expected_metrics: + assert m in metrics + + assert eval_job.ignored_pred_labels == [] + assert eval_job.missing_pred_labels == [] + + result = eval_job + result_dict = result.to_dict() + + # duration isn't deterministic, so test meta separately + assert result_dict["meta"]["datums"] == 2 + assert result_dict["meta"]["labels"] == 2 + assert result_dict["meta"]["annotations"] == 5 + assert result_dict["meta"]["duration"] <= 5 diff --git a/core/tests/functional-tests/test_detection_manager.py b/core/tests/functional-tests/test_detection_manager.py new file mode 100644 index 000000000..aa539e3ba --- /dev/null +++ b/core/tests/functional-tests/test_detection_manager.py @@ -0,0 +1,4420 @@ +import random + +import numpy as np +import pandas as pd +import pytest +from valor_core import enums, geometry, managers, schemas + + +def test_evaluate_detection_with_ValorDetectionManager( + evaluate_detection_groundtruths, evaluate_detection_predictions +): + """ + Test detection evaluations with area thresholds. 
+ + gt_dets1 + datum 1 + - Label (k1, v1) with Annotation area = 1500 + - Label (k2, v2) with Annotation area = 57,510 + datum2 + - Label (k1, v1) with Annotation area = 1100 + + pred_dets + datum 1 + - Label (k1, v1) with Annotation area = 1500 + - Label (k2, v2) with Annotation area = 57,510 + datum2 + - Label (k1, v1) with Annotation area = 1100 + """ + + manager = managers.ValorDetectionManager( + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + ], + ) + + manager.add_data( + groundtruths=evaluate_detection_groundtruths, + predictions=evaluate_detection_predictions, + ) + + # check that ious have been precomputed + assert "iou_" in manager.joint_df.columns + assert all( + [ + col not in ["raster", "bounding_box"] + for col in manager.joint_df.columns + ] + ) + + eval_job = manager.evaluate() + + metrics = eval_job.metrics + + expected_metrics = [ + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"iou": 0.1}, + "value": 0.0, + "type": "AP", + }, + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"iou": 0.6}, + "value": 0.0, + "type": "AP", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"iou": 0.1}, + "value": 0.504950495049505, + "type": "AP", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"iou": 0.6}, + "value": 0.504950495049505, + "type": "AP", + }, + { + "parameters": {"label_key": "k1", "iou": 0.1}, + "value": 0.504950495049505, + "type": "mAP", + }, + { + "parameters": {"label_key": "k2", "iou": 0.1}, + "value": 0.0, + "type": "mAP", + }, + { + "parameters": {"label_key": "k1", "iou": 0.6}, + "value": 0.504950495049505, + "type": "mAP", + }, + { + "parameters": {"label_key": "k2", "iou": 0.6}, + "value": 0.0, + "type": "mAP", + }, + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "type": "APAveragedOverIOUs", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.504950495049505, + "type": "APAveragedOverIOUs", + }, + { + "parameters": {"label_key": "k1", "ious": [0.1, 0.6]}, + "value": 0.504950495049505, + "type": "mAPAveragedOverIOUs", + }, + { + "parameters": {"label_key": "k2", "ious": [0.1, 0.6]}, + "value": 0.0, + "type": "mAPAveragedOverIOUs", + }, + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "type": "AR", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.5, + "type": "AR", + }, + { + "parameters": {"label_key": "k1", "ious": [0.1, 0.6]}, + "value": 0.5, + "type": "mAR", + }, + { + "parameters": {"label_key": "k2", "ious": [0.1, 0.6]}, + "value": 0.0, + "type": "mAR", + }, + ] + + for m in metrics: + if m["type"] not in [ + "PrecisionRecallCurve", + "DetailedPrecisionRecallCurve", + ]: + assert m in expected_metrics + for m in expected_metrics: + assert m in metrics + + assert eval_job.ignored_pred_labels == [] + assert eval_job.missing_pred_labels == [] + + result = eval_job + result_dict = result.to_dict() + + # duration isn't deterministic, so test meta separately + assert result_dict["meta"]["datums"] == 2 + assert result_dict["meta"]["labels"] == 2 + assert result_dict["meta"]["annotations"] == 5 + assert result_dict["meta"]["duration"] <= 5 + result_dict.pop("meta") + 
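# note: the individual metric values were already asserted above and `meta["duration"]`
+    # is non-deterministic, so both blocks are dropped before the exact comparison of the
+    # remaining evaluation parameters (a descriptive comment; the assertions themselves are unchanged)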
result_dict.pop("metrics") + + assert result_dict == { + "parameters": { + "label_map": {}, + "metrics_to_return": [ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + ], + "iou_thresholds_to_compute": [0.1, 0.6], + "iou_thresholds_to_return": [0.1, 0.6], + "recall_score_threshold": 0.0, + "pr_curve_iou_threshold": 0.5, + "pr_curve_max_examples": 1, + "convert_annotations_to_type": None, + }, + "confusion_matrices": [], + "ignored_pred_labels": [], + "missing_pred_labels": [], + } + + selected_metrics = random.sample( + [ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + enums.MetricType.PrecisionRecallCurve, + ], + 2, + ) + + manager = managers.ValorDetectionManager( + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=selected_metrics, + ) + + manager.add_data( + groundtruths=evaluate_detection_groundtruths, + predictions=evaluate_detection_predictions, + ) + + # check that ious have been precomputed + assert "iou_" in manager.joint_df.columns + assert all( + [ + col not in ["raster", "bounding_box"] + for col in manager.joint_df.columns + ] + ) + + eval_job = manager.evaluate() + + metrics = eval_job.metrics + assert set([metric["type"] for metric in eval_job.metrics]) == set( + selected_metrics + ) + + +def test_evaluate_detection_via_pandas_df_with_ValorDetectionManager(): + """The Manager shouldn't except dataframes, so we just confirm this test throws an error here.""" + groundtruth_df = pd.DataFrame( + [ + { + "datum_id": 1, + "datum_uid": "uid1", + "id": 1, + "annotation_id": 1, + "label_id": 1, + "label_key": "k1", + "label_value": "v1", + "is_instance": True, + "polygon": schemas.Polygon.from_dict( + { + "type": "Polygon", + "coordinates": [ + [[10, 10], [60, 10], [60, 40], [10, 40], [10, 10]] + ], + } + ), + "raster": None, + "bounding_box": None, + }, + { + "datum_id": 1, + "datum_uid": "uid1", + "id": 2, + "annotation_id": 2, + "label_id": 2, + "label_key": "k2", + "label_value": "v2", + "is_instance": True, + "polygon": schemas.Polygon.from_dict( + { + "type": "Polygon", + "coordinates": [ + [ + [87, 10], + [158, 10], + [158, 820], + [87, 820], + [87, 10], + ] + ], + } + ), + "raster": None, + "bounding_box": None, + }, + { + "datum_id": 2, + "datum_uid": "uid2", + "id": 3, + "annotation_id": 3, + "label_id": 1, + "label_key": "k1", + "label_value": "v1", + "is_instance": True, + "polygon": schemas.Polygon.from_dict( + { + "type": "Polygon", + "coordinates": [ + [[15, 0], [70, 0], [70, 20], [15, 20], [15, 0]] + ], + } + ), + "raster": None, + "bounding_box": None, + }, + ] + ) + prediction_df = pd.DataFrame( + [ + { + "id": 1, + "annotation_id": 4, + "score": 0.3, + "datum_id": 1, + "datum_uid": "uid1", + "label_id": 1, + "label_key": "k1", + "label_value": "v1", + "is_instance": True, + "polygon": schemas.Polygon.from_dict( + { + "type": "Polygon", + "coordinates": [ + [[10, 10], [60, 10], [60, 40], [10, 40], [10, 10]] + ], + } + ), + "raster": None, + "bounding_box": None, + }, + { + "id": 2, + "annotation_id": 5, + "score": 0.98, + "datum_id": 2, + "datum_uid": "uid2", + "label_id": 2, + "label_key": "k2", + "label_value": "v2", + "is_instance": True, + "polygon": schemas.Polygon.from_dict( + { + "type": "Polygon", + "coordinates": [ + [[15, 0], [70, 0], [70, 20], [15, 20], [15, 0]] 
+ ], + } + ), + "raster": None, + "bounding_box": None, + }, + ] + ) + + manager = managers.ValorDetectionManager( + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + ], + ) + + with pytest.raises(ValueError): + manager.add_data( + groundtruths=groundtruth_df, # type: ignore - purposefully throwing error + predictions=prediction_df, # type: ignore - purposefully throwing error + ) + + +def test_evaluate_detection_false_negatives_single_image_baseline_with_ValorDetectionManager(): + """This is the baseline for the below test. In this case there are two predictions and + one groundtruth, but the highest confident prediction overlaps sufficiently with the groundtruth + so there is not a penalty for the false negative so the AP is 1 + """ + groundtruths = [ + schemas.GroundTruth( + datum=schemas.Datum(uid="uid1"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[schemas.Label(key="key", value="value")], + is_instance=True, + ) + ], + ) + ] + + predictions = [ + schemas.Prediction( + datum=schemas.Datum(uid="uid1"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[ + schemas.Label(key="key", value="value", score=0.8) + ], + is_instance=True, + ), + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=100, xmax=110, ymin=100, ymax=200 + ), + labels=[ + schemas.Label(key="key", value="value", score=0.7) + ], + is_instance=True, + ), + ], + ), + ] + + manager = managers.ValorDetectionManager( + iou_thresholds_to_compute=[0.5], + iou_thresholds_to_return=[0.5], + ) + + manager.add_data( + groundtruths=groundtruths, + predictions=predictions, + ) + + # check that ious have been precomputed + assert "iou_" in manager.joint_df.columns + assert all( + [ + col not in ["raster", "bounding_box"] + for col in manager.joint_df.columns + ] + ) + + eval_job = manager.evaluate() + + ap_metric = [m for m in eval_job.metrics if m["type"] == "AP"][0] + assert ap_metric == { + "type": "AP", + "parameters": {"iou": 0.5}, + "value": 1, + "label": {"key": "key", "value": "value"}, + } + + +def test_evaluate_detection_false_negatives_single_image_with_ValorDetectionManager(): + """Tests fix for a bug where high confidence false negative was not being penalized. 
The
+    difference between this test and the one above is that here the prediction with the higher confidence
+    does not sufficiently overlap the groundtruth, so it is penalized and the AP is 0.5.
+    """
+    groundtruths = [
+        schemas.GroundTruth(
+            datum=schemas.Datum(uid="uid1"),
+            annotations=[
+                schemas.Annotation(
+                    bounding_box=schemas.Box.from_extrema(
+                        xmin=10, xmax=20, ymin=10, ymax=20
+                    ),
+                    labels=[schemas.Label(key="key", value="value")],
+                    is_instance=True,
+                )
+            ],
+        )
+    ]
+    predictions = [
+        schemas.Prediction(
+            datum=schemas.Datum(uid="uid1"),
+            annotations=[
+                schemas.Annotation(
+                    bounding_box=schemas.Box.from_extrema(
+                        xmin=10, xmax=20, ymin=10, ymax=20
+                    ),
+                    labels=[
+                        schemas.Label(key="key", value="value", score=0.8)
+                    ],
+                    is_instance=True,
+                ),
+                schemas.Annotation(
+                    bounding_box=schemas.Box.from_extrema(
+                        xmin=100, xmax=110, ymin=100, ymax=200
+                    ),
+                    labels=[
+                        schemas.Label(key="key", value="value", score=0.9)
+                    ],
+                    is_instance=True,
+                ),
+            ],
+        ),
+    ]
+
+    manager = managers.ValorDetectionManager(
+        iou_thresholds_to_compute=[0.5],
+        iou_thresholds_to_return=[0.5],
+    )
+
+    manager.add_data(
+        groundtruths=groundtruths,
+        predictions=predictions,
+    )
+
+    # check that ious have been precomputed
+    assert "iou_" in manager.joint_df.columns
+    assert all(
+        [
+            col not in ["raster", "bounding_box"]
+            for col in manager.joint_df.columns
+        ]
+    )
+
+    eval_job = manager.evaluate()
+
+    ap_metric = [m for m in eval_job.metrics if m["type"] == "AP"][0]
+    assert ap_metric == {
+        "type": "AP",
+        "parameters": {"iou": 0.5},
+        "value": 0.5,
+        "label": {"key": "key", "value": "value"},
+    }
+
+
+def test_evaluate_detection_false_negatives_two_images_one_empty_low_confidence_of_fp_with_ValorDetectionManager():
+    """In this test we have
+    1. An image with a matching groundtruth and prediction (same class and high IOU)
+    2. A second image with an empty groundtruth annotation but a prediction with lower confidence
+    than the prediction on the first image.
+ + In this case, the AP should be 1.0 since the false positive has lower confidence than the true positive + + """ + + groundtruths = [ + schemas.GroundTruth( + datum=schemas.Datum(uid="uid1"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[schemas.Label(key="key", value="value")], + is_instance=True, + ) + ], + ), + schemas.GroundTruth( + datum=schemas.Datum(uid="uid2"), + annotations=[schemas.Annotation(labels=[])], + ), + ] + + predictions = [ + schemas.Prediction( + datum=schemas.Datum(uid="uid1"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[ + schemas.Label(key="key", value="value", score=0.8) + ], + is_instance=True, + ), + ], + ), + schemas.Prediction( + datum=schemas.Datum(uid="uid2"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[ + schemas.Label(key="key", value="value", score=0.7) + ], + is_instance=True, + ), + ], + ), + ] + + manager = managers.ValorDetectionManager( + iou_thresholds_to_compute=[0.5], + iou_thresholds_to_return=[0.5], + ) + + manager.add_data( + groundtruths=groundtruths, + predictions=predictions, + ) + + # check that ious have been precomputed + assert "iou_" in manager.joint_df.columns + assert all( + [ + col not in ["raster", "bounding_box"] + for col in manager.joint_df.columns + ] + ) + + eval_job = manager.evaluate() + + ap_metric = [m for m in eval_job.metrics if m["type"] == "AP"][0] + assert ap_metric == { + "type": "AP", + "parameters": {"iou": 0.5}, + "value": 1.0, + "label": {"key": "key", "value": "value"}, + } + + +def test_evaluate_detection_false_negatives_two_images_one_empty_high_confidence_of_fp_with_ValorDetectionManager(): + """In this test we have + 1. An image with a matching groundtruth and prediction (same class and high IOU) + 2. A second image with empty groundtruth annotation and a prediction with higher confidence + then the prediction on the first image. 
+ + In this case, the AP should be 0.5 since the false positive has higher confidence than the true positive + """ + groundtruths = [ + schemas.GroundTruth( + datum=schemas.Datum(uid="uid1"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[schemas.Label(key="key", value="value")], + is_instance=True, + ) + ], + ), + schemas.GroundTruth( + datum=schemas.Datum(uid="uid2"), + annotations=[schemas.Annotation(labels=[])], + ), + ] + + predictions = [ + schemas.Prediction( + datum=schemas.Datum(uid="uid1"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[ + schemas.Label(key="key", value="value", score=0.8) + ], + is_instance=True, + ), + ], + ), + schemas.Prediction( + datum=schemas.Datum(uid="uid2"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[ + schemas.Label(key="key", value="value", score=0.9) + ], + is_instance=True, + ), + ], + ), + ] + + manager = managers.ValorDetectionManager( + iou_thresholds_to_compute=[0.5], + iou_thresholds_to_return=[0.5], + ) + + manager.add_data( + groundtruths=groundtruths, + predictions=predictions, + ) + + # check that ious have been precomputed + assert "iou_" in manager.joint_df.columns + assert all( + [ + col not in ["raster", "bounding_box"] + for col in manager.joint_df.columns + ] + ) + + eval_job = manager.evaluate() + + ap_metric = [m for m in eval_job.metrics if m["type"] == "AP"][0] + assert ap_metric == { + "type": "AP", + "parameters": {"iou": 0.5}, + "value": 0.5, + "label": {"key": "key", "value": "value"}, + } + + +def test_evaluate_detection_false_negatives_two_images_one_only_with_different_class_low_confidence_of_fp_with_ValorDetectionManager(): + """In this test we have + 1. An image with a matching groundtruth and prediction (same class, `"value"`, and high IOU) + 2. A second image with a groundtruth annotation with class `"other value"` and a prediction with lower confidence + then the prediction on the first image. + + In this case, the AP for class `"value"` should be 1 since the false positive has lower confidence than the true positive. 
+ AP for class `"other value"` should be 0 since there is no prediction for the `"other value"` groundtruth + """ + groundtruths = [ + schemas.GroundTruth( + datum=schemas.Datum(uid="uid1"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[schemas.Label(key="key", value="value")], + is_instance=True, + ) + ], + ), + schemas.GroundTruth( + datum=schemas.Datum(uid="uid2"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[schemas.Label(key="key", value="other value")], + is_instance=True, + ) + ], + ), + ] + + predictions = [ + schemas.Prediction( + datum=schemas.Datum(uid="uid1"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[ + schemas.Label(key="key", value="value", score=0.8) + ], + is_instance=True, + ), + ], + ), + schemas.Prediction( + datum=schemas.Datum(uid="uid2"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[ + schemas.Label(key="key", value="value", score=0.7) + ], + is_instance=True, + ), + ], + ), + ] + + manager = managers.ValorDetectionManager( + iou_thresholds_to_compute=[0.5], + iou_thresholds_to_return=[0.5], + ) + + manager.add_data( + groundtruths=groundtruths, + predictions=predictions, + ) + + # check that ious have been precomputed + assert "iou_" in manager.joint_df.columns + assert all( + [ + col not in ["raster", "bounding_box"] + for col in manager.joint_df.columns + ] + ) + + eval_job = manager.evaluate() + + ap_metric1 = [ + m + for m in eval_job.metrics + if m["type"] == "AP" and m["label"] == {"key": "key", "value": "value"} + ][0] + assert ap_metric1 == { + "type": "AP", + "parameters": {"iou": 0.5}, + "value": 1.0, + "label": {"key": "key", "value": "value"}, + } + + # label `"other value"` is not in the predictions so we should get an AP of 0 + ap_metric2 = [ + m + for m in eval_job.metrics + if m["type"] == "AP" + and m["label"] == {"key": "key", "value": "other value"} + ][0] + assert ap_metric2 == { + "type": "AP", + "parameters": {"iou": 0.5}, + "value": 0, + "label": {"key": "key", "value": "other value"}, + } + + +def test_evaluate_detection_false_negatives_two_images_one_only_with_different_class_high_confidence_of_fp_with_ValorDetectionManager(): + """In this test we have + 1. An image with a matching groundtruth and prediction (same class, `"value"`, and high IOU) + 2. A second image with a groundtruth annotation with clas `"other value"` and a prediction with higher confidence + then the prediction on the first image. + + In this case, the AP for class `"value"` should be 0.5 since the false positive has higher confidence than the true positive. 
+ AP for class `"other value"` should be 0 since there is no prediction for the `"other value"` groundtruth + """ + groundtruths = [ + schemas.GroundTruth( + datum=schemas.Datum(uid="uid1"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[schemas.Label(key="key", value="value")], + is_instance=True, + ) + ], + ), + schemas.GroundTruth( + datum=schemas.Datum(uid="uid2"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[schemas.Label(key="key", value="other value")], + is_instance=True, + ) + ], + ), + ] + + predictions = [ + schemas.Prediction( + datum=schemas.Datum(uid="uid1"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[ + schemas.Label(key="key", value="value", score=0.8) + ], + is_instance=True, + ), + ], + ), + schemas.Prediction( + datum=schemas.Datum(uid="uid2"), + annotations=[ + schemas.Annotation( + bounding_box=schemas.Box.from_extrema( + xmin=10, xmax=20, ymin=10, ymax=20 + ), + labels=[ + schemas.Label(key="key", value="value", score=0.9) + ], + is_instance=True, + ), + ], + ), + ] + + manager = managers.ValorDetectionManager( + iou_thresholds_to_compute=[0.5], + iou_thresholds_to_return=[0.5], + ) + + manager.add_data( + groundtruths=groundtruths, + predictions=predictions, + ) + + # check that ious have been precomputed + assert "iou_" in manager.joint_df.columns + assert all( + [ + col not in ["raster", "bounding_box"] + for col in manager.joint_df.columns + ] + ) + + eval_job = manager.evaluate() + + ap_metric1 = [ + m + for m in eval_job.metrics + if m["type"] == "AP" and m["label"] == {"key": "key", "value": "value"} + ][0] + assert ap_metric1 == { + "type": "AP", + "parameters": {"iou": 0.5}, + "value": 0.5, + "label": {"key": "key", "value": "value"}, + } + + # label `"other value"` is not in the predictions so we should get an AP of 0 + ap_metric2 = [ + m + for m in eval_job.metrics + if m["type"] == "AP" + and m["label"] == {"key": "key", "value": "other value"} + ][0] + assert ap_metric2 == { + "type": "AP", + "parameters": {"iou": 0.5}, + "value": 0, + "label": {"key": "key", "value": "other value"}, + } + + +@pytest.fixture +def test_detailed_precision_recall_curve_with_ValorDetectionManager( + evaluate_detection_detailed_pr_curve_groundtruths, + evaluate_detection_detailed_pr_curve_predictions, +): + + manager = managers.ValorDetectionManager( + metrics_to_return=[enums.MetricType.DetailedPrecisionRecallCurve], + ) + + manager.add_data( + groundtruths=evaluate_detection_detailed_pr_curve_groundtruths, + predictions=evaluate_detection_detailed_pr_curve_predictions, + ) + + # check that ious have been precomputed + assert "iou_" in manager.joint_df.columns + assert all( + [ + col not in ["raster", "bounding_box"] + for col in manager.joint_df.columns + ] + ) + + eval_job = manager.evaluate() + + # one true positive that becomes a false negative when score > .5 + assert eval_job.metrics[0]["value"]["v1"]["0.3"]["tp"]["total"] == 1 + assert eval_job.metrics[0]["value"]["v1"]["0.55"]["tp"]["total"] == 0 + assert eval_job.metrics[0]["value"]["v1"]["0.55"]["fn"]["total"] == 1 + assert ( + eval_job.metrics[0]["value"]["v1"]["0.55"]["fn"]["observations"][ + "no_predictions" + ]["count"] + == 1 + ) + assert eval_job.metrics[0]["value"]["v1"]["0.05"]["fn"]["total"] == 0 + assert 
eval_job.metrics[0]["value"]["v1"]["0.05"]["fp"]["total"] == 0 + + # one missed detection that never changes + assert ( + eval_job.metrics[0]["value"]["missed_detection"]["0.05"]["fn"][ + "observations" + ]["no_predictions"]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["missed_detection"]["0.95"]["fn"][ + "observations" + ]["no_predictions"]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["missed_detection"]["0.05"]["tp"]["total"] + == 0 + ) + assert ( + eval_job.metrics[0]["value"]["missed_detection"]["0.05"]["fp"]["total"] + == 0 + ) + + # one fn missed_dection that becomes a misclassification when pr_curve_iou_threshold <= .48 and score threshold <= .3 + assert ( + eval_job.metrics[0]["value"]["v2"]["0.3"]["fn"]["observations"][ + "no_predictions" + ]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["v2"]["0.35"]["fn"]["observations"][ + "no_predictions" + ]["count"] + == 1 + ) + assert eval_job.metrics[0]["value"]["v2"]["0.05"]["tp"]["total"] == 0 + assert eval_job.metrics[0]["value"]["v2"]["0.05"]["fp"]["total"] == 0 + + # one fp hallucination that becomes a misclassification when pr_curve_iou_threshold <= .48 and score threshold <= .3 + assert ( + eval_job.metrics[0]["value"]["not_v2"]["0.05"]["fp"]["observations"][ + "hallucinations" + ]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["not_v2"]["0.05"]["fp"]["observations"][ + "misclassifications" + ]["count"] + == 0 + ) + assert eval_job.metrics[0]["value"]["not_v2"]["0.05"]["tp"]["total"] == 0 + assert eval_job.metrics[0]["value"]["not_v2"]["0.05"]["fn"]["total"] == 0 + + # one fp hallucination that disappears when score threshold >.15 + assert ( + eval_job.metrics[0]["value"]["hallucination"]["0.05"]["fp"][ + "observations" + ]["hallucinations"]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["hallucination"]["0.35"]["fp"][ + "observations" + ]["hallucinations"]["count"] + == 0 + ) + assert ( + eval_job.metrics[0]["value"]["hallucination"]["0.05"]["tp"]["total"] + == 0 + ) + assert ( + eval_job.metrics[0]["value"]["hallucination"]["0.05"]["fn"]["total"] + == 0 + ) + + # one missed detection and one hallucination due to low iou overlap + assert ( + eval_job.metrics[0]["value"]["low_iou"]["0.3"]["fn"]["observations"][ + "no_predictions" + ]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["low_iou"]["0.95"]["fn"]["observations"][ + "no_predictions" + ]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["low_iou"]["0.3"]["fp"]["observations"][ + "hallucinations" + ]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["low_iou"]["0.55"]["fp"]["observations"][ + "hallucinations" + ]["count"] + == 0 + ) + + # repeat tests using a lower IOU threshold + + manager = managers.ValorDetectionManager( + metrics_to_return=[enums.MetricType.DetailedPrecisionRecallCurve], + pr_curve_iou_threshold=0.45, + ) + + manager.add_data( + groundtruths=evaluate_detection_detailed_pr_curve_groundtruths, + predictions=evaluate_detection_detailed_pr_curve_predictions, + ) + + # check that ious have been precomputed + assert "iou_" in manager.joint_df.columns + assert all( + [ + col not in ["raster", "bounding_box"] + for col in manager.joint_df.columns + ] + ) + + eval_job_low_iou_threshold = manager.evaluate() + + # one true positive that becomes a false negative when score > .5 + assert eval_job.metrics[0]["value"]["v1"]["0.3"]["tp"]["total"] == 1 + assert eval_job.metrics[0]["value"]["v1"]["0.55"]["tp"]["total"] == 0 + assert 
eval_job.metrics[0]["value"]["v1"]["0.55"]["fn"]["total"] == 1 + assert ( + eval_job.metrics[0]["value"]["v1"]["0.55"]["fn"]["observations"][ + "no_predictions" + ]["count"] + == 1 + ) + assert eval_job.metrics[0]["value"]["v1"]["0.05"]["fn"]["total"] == 0 + assert eval_job.metrics[0]["value"]["v1"]["0.05"]["fp"]["total"] == 0 + + # one missed detection that never changes + assert ( + eval_job.metrics[0]["value"]["missed_detection"]["0.05"]["fn"][ + "observations" + ]["no_predictions"]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["missed_detection"]["0.95"]["fn"][ + "observations" + ]["no_predictions"]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["missed_detection"]["0.05"]["tp"]["total"] + == 0 + ) + assert ( + eval_job.metrics[0]["value"]["missed_detection"]["0.05"]["fp"]["total"] + == 0 + ) + + # one fn missed_dection that becomes a misclassification when pr_curve_iou_threshold <= .48 and score threshold <= .3 + assert ( + eval_job_low_iou_threshold.metrics[0]["value"]["v2"]["0.3"]["fn"][ + "observations" + ]["misclassifications"]["count"] + == 1 + ) + assert ( + eval_job_low_iou_threshold.metrics[0]["value"]["v2"]["0.3"]["fn"][ + "observations" + ]["no_predictions"]["count"] + == 0 + ) + assert ( + eval_job_low_iou_threshold.metrics[0]["value"]["v2"]["0.35"]["fn"][ + "observations" + ]["misclassifications"]["count"] + == 0 + ) + assert ( + eval_job_low_iou_threshold.metrics[0]["value"]["v2"]["0.35"]["fn"][ + "observations" + ]["no_predictions"]["count"] + == 1 + ) + assert ( + eval_job_low_iou_threshold.metrics[0]["value"]["v2"]["0.05"]["tp"][ + "total" + ] + == 0 + ) + assert ( + eval_job_low_iou_threshold.metrics[0]["value"]["v2"]["0.05"]["fp"][ + "total" + ] + == 0 + ) + + # one fp hallucination that becomes a misclassification when pr_curve_iou_threshold <= .48 and score threshold <= .3 + assert ( + eval_job_low_iou_threshold.metrics[0]["value"]["not_v2"]["0.05"]["fp"][ + "observations" + ]["hallucinations"]["count"] + == 0 + ) + assert ( + eval_job_low_iou_threshold.metrics[0]["value"]["not_v2"]["0.05"]["fp"][ + "observations" + ]["misclassifications"]["count"] + == 1 + ) + assert ( + eval_job_low_iou_threshold.metrics[0]["value"]["not_v2"]["0.05"]["tp"][ + "total" + ] + == 0 + ) + assert ( + eval_job_low_iou_threshold.metrics[0]["value"]["not_v2"]["0.05"]["fn"][ + "total" + ] + == 0 + ) + + # one fp hallucination that disappears when score threshold >.15 + assert ( + eval_job.metrics[0]["value"]["hallucination"]["0.05"]["fp"][ + "observations" + ]["hallucinations"]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["hallucination"]["0.35"]["fp"][ + "observations" + ]["hallucinations"]["count"] + == 0 + ) + assert ( + eval_job.metrics[0]["value"]["hallucination"]["0.05"]["tp"]["total"] + == 0 + ) + assert ( + eval_job.metrics[0]["value"]["hallucination"]["0.05"]["fn"]["total"] + == 0 + ) + + # one missed detection and one hallucination due to low iou overlap + assert ( + eval_job.metrics[0]["value"]["low_iou"]["0.3"]["fn"]["observations"][ + "no_predictions" + ]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["low_iou"]["0.95"]["fn"]["observations"][ + "no_predictions" + ]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["low_iou"]["0.3"]["fp"]["observations"][ + "hallucinations" + ]["count"] + == 1 + ) + assert ( + eval_job.metrics[0]["value"]["low_iou"]["0.55"]["fp"]["observations"][ + "hallucinations" + ]["count"] + == 0 + ) + + +def 
test_evaluate_detection_model_with_no_predictions_with_ValorDetectionManager( + evaluate_detection_groundtruths, +): + """ + Test detection evaluations when the model outputs nothing. + + gt_dets1 + datum 1 + - Label (k1, v1) with Annotation area = 1500 + - Label (k2, v2) with Annotation area = 57,510 + datum2 + - Label (k1, v1) with Annotation area = 1100 + """ + predictions = [] + for gt in evaluate_detection_groundtruths: + predictions.append( + schemas.Prediction( + datum=gt.datum, + annotations=[], + ) + ) + + expected_metrics = [ + { + "label": { + "key": "k2", + "value": "v2", + }, + "parameters": { + "iou": 0.5, + }, + "type": "AP", + "value": 0.0, + }, + { + "label": { + "key": "k2", + "value": "v2", + }, + "parameters": { + "iou": 0.75, + }, + "type": "AP", + "value": 0.0, + }, + { + "label": { + "key": "k1", + "value": "v1", + }, + "parameters": { + "iou": 0.5, + }, + "type": "AP", + "value": 0.0, + }, + { + "label": { + "key": "k1", + "value": "v1", + }, + "parameters": { + "iou": 0.75, + }, + "type": "AP", + "value": 0.0, + }, + { + "label": { + "key": "k2", + "value": "v2", + }, + "parameters": { + "ious": [ + 0.5, + 0.55, + 0.6, + 0.65, + 0.7, + 0.75, + 0.8, + 0.85, + 0.9, + 0.95, + ], + }, + "type": "AR", + "value": 0.0, + }, + { + "label": { + "key": "k1", + "value": "v1", + }, + "parameters": { + "ious": [ + 0.5, + 0.55, + 0.6, + 0.65, + 0.7, + 0.75, + 0.8, + 0.85, + 0.9, + 0.95, + ], + }, + "type": "AR", + "value": 0.0, + }, + { + "parameters": { + "iou": 0.5, + "label_key": "k2", + }, + "type": "mAP", + "value": 0.0, + }, + { + "parameters": { + "iou": 0.75, + "label_key": "k2", + }, + "type": "mAP", + "value": 0.0, + }, + { + "parameters": { + "iou": 0.5, + "label_key": "k1", + }, + "type": "mAP", + "value": 0.0, + }, + { + "parameters": { + "iou": 0.75, + "label_key": "k1", + }, + "type": "mAP", + "value": 0.0, + }, + { + "parameters": { + "ious": [ + 0.5, + 0.55, + 0.6, + 0.65, + 0.7, + 0.75, + 0.8, + 0.85, + 0.9, + 0.95, + ], + "label_key": "k2", + }, + "type": "mAR", + "value": 0.0, + }, + { + "parameters": { + "ious": [ + 0.5, + 0.55, + 0.6, + 0.65, + 0.7, + 0.75, + 0.8, + 0.85, + 0.9, + 0.95, + ], + "label_key": "k1", + }, + "type": "mAR", + "value": 0.0, + }, + { + "label": { + "key": "k2", + "value": "v2", + }, + "parameters": { + "ious": [ + 0.5, + 0.55, + 0.6, + 0.65, + 0.7, + 0.75, + 0.8, + 0.85, + 0.9, + 0.95, + ], + }, + "type": "APAveragedOverIOUs", + "value": 0.0, + }, + { + "label": { + "key": "k1", + "value": "v1", + }, + "parameters": { + "ious": [ + 0.5, + 0.55, + 0.6, + 0.65, + 0.7, + 0.75, + 0.8, + 0.85, + 0.9, + 0.95, + ], + }, + "type": "APAveragedOverIOUs", + "value": 0.0, + }, + { + "parameters": { + "ious": [ + 0.5, + 0.55, + 0.6, + 0.65, + 0.7, + 0.75, + 0.8, + 0.85, + 0.9, + 0.95, + ], + "label_key": "k2", + }, + "type": "mAPAveragedOverIOUs", + "value": 0.0, + }, + { + "parameters": { + "ious": [ + 0.5, + 0.55, + 0.6, + 0.65, + 0.7, + 0.75, + 0.8, + 0.85, + 0.9, + 0.95, + ], + "label_key": "k1", + }, + "type": "mAPAveragedOverIOUs", + "value": 0.0, + }, + ] + + manager = managers.ValorDetectionManager() + + manager.add_data( + groundtruths=evaluate_detection_groundtruths, + predictions=predictions, + ) + + # check that ious have been precomputed + assert "iou_" in manager.joint_df.columns + assert all( + [ + col not in ["raster", "bounding_box"] + for col in manager.joint_df.columns + ] + ) + + eval_job = manager.evaluate() + + computed_metrics = eval_job.metrics + + assert all([metric["value"] == 0 for metric in computed_metrics]) + + for 
m in expected_metrics: + assert m in computed_metrics + + for m in computed_metrics: + assert m in expected_metrics + + +def test_evaluate_detection_functional_test_with_ValorDetectionManager( + evaluate_detection_functional_test_groundtruths, + evaluate_detection_functional_test_predictions, +): + + manager = managers.ValorDetectionManager( + metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + enums.MetricType.PrecisionRecallCurve, + enums.MetricType.DetailedPrecisionRecallCurve, + ], + pr_curve_iou_threshold=0.5, + pr_curve_max_examples=1, + ) + + manager.add_data( + groundtruths=evaluate_detection_functional_test_groundtruths, + predictions=evaluate_detection_functional_test_predictions, + ) + + # check that ious have been precomputed + assert "iou_" in manager.joint_df.columns + assert all( + [ + col not in ["raster", "bounding_box"] + for col in manager.joint_df.columns + ] + ) + + eval_job = manager.evaluate() + + metrics = [ + m + for m in eval_job.metrics + if m["type"] + not in ["PrecisionRecallCurve", "DetailedPrecisionRecallCurve"] + ] + + # round all metrics to the third decimal place + for i, m in enumerate(metrics): + metrics[i]["value"] = round(m["value"], 3) + + pr_metrics = [ + m for m in eval_job.metrics if m["type"] == "PrecisionRecallCurve" + ] + detailed_pr_metrics = [ + m + for m in eval_job.metrics + if m["type"] == "DetailedPrecisionRecallCurve" + ] + + # cf with torch metrics/pycocotools results listed here: + # https://github.com/Lightning-AI/metrics/blob/107dbfd5fb158b7ae6d76281df44bd94c836bfce/tests/unittests/detection/test_map.py#L231 + expected_metrics = [ + { + "label": {"key": "class", "value": "0"}, + "parameters": {"iou": 0.5}, + "value": 1.0, + "type": "AP", + }, + { + "label": {"key": "class", "value": "0"}, + "parameters": {"iou": 0.75}, + "value": 0.723, + "type": "AP", + }, + { + "label": {"key": "class", "value": "2"}, + "parameters": {"iou": 0.5}, + "value": 0.505, + "type": "AP", + }, + { + "label": {"key": "class", "value": "2"}, + "parameters": {"iou": 0.75}, + "value": 0.505, + "type": "AP", + }, + { + "label": {"key": "class", "value": "49"}, + "parameters": {"iou": 0.5}, + "value": 0.791, + "type": "AP", + }, + { + "label": {"key": "class", "value": "49"}, + "parameters": {"iou": 0.75}, + "value": 0.576, + "type": "AP", + }, + { + "label": {"key": "class", "value": "1"}, + "parameters": {"iou": 0.5}, + "value": 1.0, + "type": "AP", + }, + { + "label": {"key": "class", "value": "1"}, + "parameters": {"iou": 0.75}, + "value": 1.0, + "type": "AP", + }, + { + "label": {"key": "class", "value": "4"}, + "parameters": {"iou": 0.5}, + "value": 1.0, + "type": "AP", + }, + { + "label": {"key": "class", "value": "4"}, + "parameters": {"iou": 0.75}, + "value": 1.0, + "type": "AP", + }, + { + "parameters": {"label_key": "class", "iou": 0.5}, + "value": 0.859, + "type": "mAP", + }, + { + "parameters": {"label_key": "class", "iou": 0.75}, + "value": 0.761, + "type": "mAP", + }, + { + "label": {"key": "class", "value": "0"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 0.725, + "type": "APAveragedOverIOUs", + }, + { + "label": {"key": "class", "value": "2"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 0.454, + "type": "APAveragedOverIOUs", + }, + { + "label": {"key": "class", "value": "49"}, + "parameters": { 
+ "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 0.556, + "type": "APAveragedOverIOUs", + }, + { + "label": {"key": "class", "value": "1"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 0.8, + "type": "APAveragedOverIOUs", + }, + { + "label": {"key": "class", "value": "4"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 0.65, + "type": "APAveragedOverIOUs", + }, + { + "parameters": { + "label_key": "class", + "ious": [ + 0.5, + 0.55, + 0.6, + 0.65, + 0.7, + 0.75, + 0.8, + 0.85, + 0.9, + 0.95, + ], + }, + "value": 0.637, + "type": "mAPAveragedOverIOUs", + }, + { + "label": {"key": "class", "value": "0"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 0.78, + "type": "AR", + }, + { + "label": {"key": "class", "value": "2"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 0.45, + "type": "AR", + }, + { + "label": {"key": "class", "value": "49"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 0.58, + "type": "AR", + }, + { + "label": {"key": "class", "value": "3"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": -1.0, + "type": "AR", + }, + { + "label": {"key": "class", "value": "1"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 0.8, + "type": "AR", + }, + { + "label": {"key": "class", "value": "4"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 0.65, + "type": "AR", + }, + { + "parameters": { + "label_key": "class", + "ious": [ + 0.5, + 0.55, + 0.6, + 0.65, + 0.7, + 0.75, + 0.8, + 0.85, + 0.9, + 0.95, + ], + }, + "value": 0.652, + "type": "mAR", + }, + ] + + pr_expected_answers = { + # (class, 4) + ("class", "4", 0.05, "tp"): 2, + ("class", "4", 0.05, "fn"): 0, + ("class", "4", 0.25, "tp"): 1, + ("class", "4", 0.25, "fn"): 1, + ("class", "4", 0.55, "tp"): 0, + ("class", "4", 0.55, "fn"): 2, + # (class, 2) + ("class", "2", 0.05, "tp"): 1, + ("class", "2", 0.05, "fn"): 1, + ("class", "2", 0.75, "tp"): 0, + ("class", "2", 0.75, "fn"): 2, + # (class, 49) + ("class", "49", 0.05, "tp"): 8, + ("class", "49", 0.3, "tp"): 5, + ("class", "49", 0.5, "tp"): 4, + ("class", "49", 0.85, "tp"): 1, + # (class, 3) + ("class", "3", 0.05, "tp"): 0, + ("class", "3", 0.05, "fp"): 1, + # (class, 1) + ("class", "1", 0.05, "tp"): 1, + ("class", "1", 0.35, "tp"): 0, + # (class, 0) + ("class", "0", 0.05, "tp"): 5, + ("class", "0", 0.5, "tp"): 3, + ("class", "0", 0.95, "tp"): 1, + ("class", "0", 0.95, "fn"): 4, + } + + detailed_pr_expected_answers = { + # (class, 4) + ("4", 0.05, "tp"): {"all": 2, "total": 2}, + ("4", 0.05, "fn"): { + "no_predictions": 0, + "misclassifications": 0, + "total": 0, + }, + # (class, 2) + ("2", 0.05, "tp"): {"all": 1, "total": 1}, + ("2", 0.05, "fn"): { + "no_predictions": 0, + "misclassifications": 1, + "total": 1, + }, + ("2", 0.75, "tp"): {"all": 0, "total": 0}, + ("2", 0.75, "fn"): { + "no_predictions": 2, + "misclassifications": 0, + "total": 2, + }, + # (class, 49) + ("49", 0.05, "tp"): {"all": 9, "total": 9}, + # (class, 3) + ("3", 0.05, "tp"): {"all": 0, "total": 0}, + ("3", 0.05, "fp"): { + "hallucinations": 0, + "misclassifications": 1, + "total": 1, + }, + # (class, 1) + ("1", 0.05, "tp"): {"all": 1, "total": 1}, + ("1", 0.8, 
"fn"): { + "no_predictions": 1, + "misclassifications": 0, + "total": 1, + }, + # (class, 0) + ("0", 0.05, "tp"): {"all": 5, "total": 5}, + ("0", 0.95, "fn"): { + "no_predictions": 4, + "misclassifications": 0, + "total": 4, + }, + } + + for m in metrics: + assert m in expected_metrics + for m in metrics: + assert m in eval_job.metrics + + for ( + _, + value, + threshold, + metric, + ), expected_value in pr_expected_answers.items(): + assert ( + pr_metrics[0]["value"][value][threshold][metric] == expected_value + ) + + for ( + value, + threshold, + metric, + ), expected_output in detailed_pr_expected_answers.items(): + model_output = detailed_pr_metrics[0]["value"][value][threshold][ + metric + ] + assert isinstance(model_output, dict) + assert model_output["total"] == expected_output["total"] + assert all( + [ + model_output["observations"][key]["count"] # type: ignore - we know this element is a dict + == expected_output[key] + for key in [ + key + for key in expected_output.keys() + if key not in ["total"] + ] + ] + ) + + # spot check number of examples + assert ( + len( + detailed_pr_metrics[0]["value"]["0"][0.95]["fn"]["observations"]["no_predictions"][ # type: ignore - we know this element is a dict + "examples" + ] + ) + == 1 + ) + assert ( + len( + detailed_pr_metrics[0]["value"]["49"][0.05]["tp"]["observations"]["all"][ # type: ignore - we know this element is a dict + "examples" + ] + ) + == 1 + ) + + # raise the iou threshold + manager = managers.ValorDetectionManager( + metrics_to_return=[ + enums.MetricType.PrecisionRecallCurve, + enums.MetricType.DetailedPrecisionRecallCurve, + ], + pr_curve_iou_threshold=0.9, + pr_curve_max_examples=1, + ) + + manager.add_data( + groundtruths=evaluate_detection_functional_test_groundtruths, + predictions=evaluate_detection_functional_test_predictions, + ) + + # check that ious have been precomputed + assert "iou_" in manager.joint_df.columns + assert all( + [ + col not in ["raster", "bounding_box"] + for col in manager.joint_df.columns + ] + ) + + eval_job_higher_threshold = manager.evaluate() + + pr_metrics = [ + m + for m in eval_job_higher_threshold.metrics + if m["type"] == "PrecisionRecallCurve" + ] + detailed_pr_metrics = [ + m + for m in eval_job_higher_threshold.metrics + if m["type"] == "DetailedPrecisionRecallCurve" + ] + + pr_expected_answers = { + # (class, 4) + ("class", "4", 0.05, "tp"): 0, + ("class", "4", 0.05, "fn"): 2, + # (class, 2) + ("class", "2", 0.05, "tp"): 1, + ("class", "2", 0.05, "fn"): 1, + ("class", "2", 0.75, "tp"): 0, + ("class", "2", 0.75, "fn"): 2, + # (class, 49) + ("class", "49", 0.05, "tp"): 2, + ("class", "49", 0.3, "tp"): 2, + ("class", "49", 0.5, "tp"): 2, + ("class", "49", 0.85, "tp"): 1, + # (class, 3) + ("class", "3", 0.05, "tp"): 0, + ("class", "3", 0.05, "fp"): 1, + # (class, 1) + ("class", "1", 0.05, "tp"): 0, + ("class", "1", 0.05, "fn"): 1, + # (class, 0) + ("class", "0", 0.05, "tp"): 1, + ("class", "0", 0.5, "tp"): 0, + ("class", "0", 0.95, "fn"): 5, + } + + detailed_pr_expected_answers = { + # (class, 4) + ("4", 0.05, "tp"): {"all": 0, "total": 0}, + ("4", 0.05, "fn"): { + "no_predictions": 2, # below IOU threshold of .9 + "misclassifications": 0, + "total": 2, + }, + # (class, 2) + ("2", 0.05, "tp"): {"all": 1, "total": 1}, + ("2", 0.05, "fn"): { + "no_predictions": 1, + "misclassifications": 0, + "total": 1, + }, + ("2", 0.75, "tp"): {"all": 0, "total": 0}, + ("2", 0.75, "fn"): { + "no_predictions": 2, + "misclassifications": 0, + "total": 2, + }, + # (class, 49) + ("49", 0.05, "tp"): 
{"all": 2, "total": 2}, + # (class, 3) + ("3", 0.05, "tp"): {"all": 0, "total": 0}, + ("3", 0.05, "fp"): { + "hallucinations": 1, + "misclassifications": 0, + "total": 1, + }, + # (class, 1) + ("1", 0.05, "tp"): {"all": 0, "total": 0}, + ("1", 0.8, "fn"): { + "no_predictions": 1, + "misclassifications": 0, + "total": 1, + }, + # (class, 0) + ("0", 0.05, "tp"): {"all": 1, "total": 1}, + ("0", 0.95, "fn"): { + "no_predictions": 5, + "misclassifications": 0, + "total": 5, + }, + } + + for ( + key, + value, + threshold, + metric, + ), expected_count in pr_expected_answers.items(): + actual_count = pr_metrics[0]["value"][value][threshold][metric] + assert actual_count == expected_count + + for ( + value, + threshold, + metric, + ), expected_output in detailed_pr_expected_answers.items(): + model_output = detailed_pr_metrics[0]["value"][value][threshold][ + metric + ] + assert isinstance(model_output, dict) + assert model_output["total"] == expected_output["total"] + assert all( + [ + model_output["observations"][key]["count"] # type: ignore - we know this element is a dict + == expected_output[key] + for key in [ + key + for key in expected_output.keys() + if key not in ["total"] + ] + ] + ) + + assert ( + len( + detailed_pr_metrics[0]["value"]["0"][0.95]["fn"]["observations"]["no_predictions"][ # type: ignore - we know this element is a dict + "examples" + ] + ) + == 1 + ) + assert ( + len( + detailed_pr_metrics[0]["value"]["49"][0.05]["tp"]["observations"]["all"][ # type: ignore - we know this element is a dict + "examples" + ] + ) + == 1 + ) + + # repeat the above, but with a higher pr_max_curves_example + manager = managers.ValorDetectionManager( + metrics_to_return=[ + enums.MetricType.PrecisionRecallCurve, + enums.MetricType.DetailedPrecisionRecallCurve, + ], + pr_curve_iou_threshold=0.9, + pr_curve_max_examples=3, + ) + + manager.add_data( + groundtruths=evaluate_detection_functional_test_groundtruths, + predictions=evaluate_detection_functional_test_predictions, + ) + + # check that ious have been precomputed + assert "iou_" in manager.joint_df.columns + assert all( + [ + col not in ["raster", "bounding_box"] + for col in manager.joint_df.columns + ] + ) + + eval_job_higher_threshold = manager.evaluate() + + pr_metrics = [ + m + for m in eval_job_higher_threshold.metrics + if m["type"] == "PrecisionRecallCurve" + ] + detailed_pr_metrics = [ + m + for m in eval_job_higher_threshold.metrics + if m["type"] == "DetailedPrecisionRecallCurve" + ] + + for ( + key, + value, + threshold, + metric, + ), expected_count in pr_expected_answers.items(): + actual_count = pr_metrics[0]["value"][value][threshold][metric] + assert actual_count == expected_count + + for ( + value, + threshold, + metric, + ), expected_output in detailed_pr_expected_answers.items(): + model_output = detailed_pr_metrics[0]["value"][value][threshold][ + metric + ] + assert isinstance(model_output, dict) + assert model_output["total"] == expected_output["total"] + assert all( + [ + model_output["observations"][key]["count"] # type: ignore - we know this element is a dict + == expected_output[key] + for key in [ + key + for key in expected_output.keys() + if key not in ["total"] + ] + ] + ) + + assert ( + len( + detailed_pr_metrics[0]["value"]["0"][0.95]["fn"]["observations"]["no_predictions"][ # type: ignore - we know this element is a dict + "examples" + ] + ) + == 3 + ) + assert ( + len( + detailed_pr_metrics[0]["value"]["49"][0.05]["tp"]["observations"]["all"][ # type: ignore - we know this element is a dict + "examples" + 
] + ) + == 2 + ) + + # test behavior if pr_curve_max_examples == 0 + manager = managers.ValorDetectionManager( + metrics_to_return=[ + enums.MetricType.PrecisionRecallCurve, + enums.MetricType.DetailedPrecisionRecallCurve, + ], + pr_curve_iou_threshold=0.9, + pr_curve_max_examples=0, + ) + + manager.add_data( + groundtruths=evaluate_detection_functional_test_groundtruths, + predictions=evaluate_detection_functional_test_predictions, + ) + + # check that ious have been precomputed + assert "iou_" in manager.joint_df.columns + assert all( + [ + col not in ["raster", "bounding_box"] + for col in manager.joint_df.columns + ] + ) + + eval_job_higher_threshold = manager.evaluate() + + pr_metrics = [ + m + for m in eval_job_higher_threshold.metrics + if m["type"] == "PrecisionRecallCurve" + ] + detailed_pr_metrics = [ + m + for m in eval_job_higher_threshold.metrics + if m["type"] == "DetailedPrecisionRecallCurve" + ] + + for ( + key, + value, + threshold, + metric, + ), expected_count in pr_expected_answers.items(): + actual_count = pr_metrics[0]["value"][value][threshold][metric] + assert actual_count == expected_count + + for ( + value, + threshold, + metric, + ), expected_output in detailed_pr_expected_answers.items(): + model_output = detailed_pr_metrics[0]["value"][value][threshold][ + metric + ] + assert isinstance(model_output, dict) + assert model_output["total"] == expected_output["total"] + assert all( + [ + model_output["observations"][key]["count"] # type: ignore - we know this element is a dict + == expected_output[key] + for key in [ + key + for key in expected_output.keys() + if key not in ["total"] + ] + ] + ) + + # spot check number of examples + assert ( + len( + detailed_pr_metrics[0]["value"]["0"][0.95]["fn"]["observations"]["no_predictions"][ # type: ignore - we know this element is a dict + "examples" + ] + ) + == 0 + ) + assert ( + len( + detailed_pr_metrics[0]["value"]["49"][0.05]["tp"]["observations"]["all"][ # type: ignore - we know this element is a dict + "examples" + ] + ) + == 0 + ) + + +def test_evaluate_detection_functional_test_with_rasters_with_ValorDetectionManager( + evaluate_detection_functional_test_groundtruths_with_rasters, + evaluate_detection_functional_test_predictions_with_rasters, +): + + manager = managers.ValorDetectionManager( + metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + enums.MetricType.PrecisionRecallCurve, + ], + pr_curve_iou_threshold=0.5, + pr_curve_max_examples=1, + ) + + manager.add_data( + groundtruths=evaluate_detection_functional_test_groundtruths_with_rasters, + predictions=evaluate_detection_functional_test_predictions_with_rasters, + ) + + # check that ious have been precomputed + assert "iou_" in manager.joint_df.columns + assert all( + [ + col not in ["raster", "bounding_box"] + for col in manager.joint_df.columns + ] + ) + + eval_job = manager.evaluate() + + metrics = [ + m + for m in eval_job.metrics + if m["type"] + not in ["PrecisionRecallCurve", "DetailedPrecisionRecallCurve"] + ] + + # round all metrics to the third decimal place + for i, m in enumerate(metrics): + metrics[i]["value"] = round(m["value"], 3) + + pr_metrics = [ + m for m in eval_job.metrics if m["type"] == "PrecisionRecallCurve" + ] + + expected_metrics = [ + { + "label": {"key": "class", "value": "label1"}, + "parameters": {"iou": 0.5}, + "value": 1.0, + "type": "AP", + }, + { + "label": {"key": "class", "value": 
"label1"}, + "parameters": {"iou": 0.75}, + "value": 1.0, + "type": "AP", + }, + { + "label": {"key": "class", "value": "label2"}, + "parameters": {"iou": 0.5}, + "value": 1.0, + "type": "AP", + }, + { + "label": {"key": "class", "value": "label2"}, + "parameters": {"iou": 0.75}, + "value": 1.0, + "type": "AP", + }, + { + "label": {"key": "class", "value": "label3"}, + "parameters": {"iou": 0.5}, + "value": 0.0, + "type": "AP", + }, + { + "label": {"key": "class", "value": "label3"}, + "parameters": {"iou": 0.75}, + "value": 0.0, + "type": "AP", + }, + { + "parameters": {"label_key": "class", "iou": 0.5}, + "value": 0.667, + "type": "mAP", + }, + { + "parameters": {"label_key": "class", "iou": 0.75}, + "value": 0.667, + "type": "mAP", + }, + { + "label": {"key": "class", "value": "label1"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 1.0, + "type": "APAveragedOverIOUs", + }, + { + "label": {"key": "class", "value": "label2"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 1.0, + "type": "APAveragedOverIOUs", + }, + { + "label": {"key": "class", "value": "label3"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 0.0, + "type": "APAveragedOverIOUs", + }, + { + "parameters": { + "label_key": "class", + "ious": [ + 0.5, + 0.55, + 0.6, + 0.65, + 0.7, + 0.75, + 0.8, + 0.85, + 0.9, + 0.95, + ], + }, + "value": 0.667, + "type": "mAPAveragedOverIOUs", + }, + { + "label": {"key": "class", "value": "label1"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 1.0, + "type": "AR", + }, + { + "label": {"key": "class", "value": "label4"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": -1.0, + "type": "AR", + }, + { + "label": {"key": "class", "value": "label2"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 1.0, + "type": "AR", + }, + { + "label": {"key": "class", "value": "label3"}, + "parameters": { + "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + }, + "value": 0.0, + "type": "AR", + }, + { + "parameters": { + "label_key": "class", + "ious": [ + 0.5, + 0.55, + 0.6, + 0.65, + 0.7, + 0.75, + 0.8, + 0.85, + 0.9, + 0.95, + ], + }, + "value": 0.667, + "type": "mAR", + }, + ] + + for m in metrics: + assert m in expected_metrics + + for m in expected_metrics: + assert m in metrics + + pr_expected_answers = { + ("class", "label1", 0.05, "tp"): 1, + ("class", "label1", 0.35, "tp"): 0, + ("class", "label2", 0.05, "tp"): 1, + ("class", "label2", 0.05, "fp"): 0, + ("class", "label2", 0.95, "fp"): 0, + ("class", "label3", 0.05, "tp"): 0, + ("class", "label3", 0.05, "fn"): 1, + ("class", "label4", 0.05, "tp"): 0, + ("class", "label4", 0.05, "fp"): 1, + } + + for ( + _, + value, + threshold, + metric, + ), expected_value in pr_expected_answers.items(): + assert ( + pr_metrics[0]["value"][value][threshold][metric] == expected_value + ) + + +def test_evaluate_mixed_annotations_with_ValorDetectionManager( + image_height: int, + image_width: int, +): + """Test the automatic conversion to rasters.""" + + datum = schemas.Datum(uid="datum1") + + xmin, xmax, ymin, ymax = 11, 45, 37, 102 + h, w = image_height, image_width + mask = np.zeros((h, w), dtype=bool) + mask[ymin:ymax, xmin:xmax] = True + + pts = [ + (xmin, ymin), + (xmin, ymax), + (xmax, ymax), + (xmax, ymin), + (xmin, ymin), + ] + poly 
= schemas.Polygon([pts])
+    raster = schemas.Raster(mask)
+    box = schemas.Box.from_extrema(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax)
+
+    gt_annotations = [
+        schemas.Annotation(
+            raster=raster,
+            labels=[schemas.Label(key="key", value="value")],
+            is_instance=True,
+        ),
+        schemas.Annotation(
+            raster=raster,
+            labels=[schemas.Label(key="key1", value="value")],
+            is_instance=True,
+        ),
+        schemas.Annotation(
+            raster=raster,
+            labels=[schemas.Label(key="key2", value="value")],
+            is_instance=True,
+        ),
+    ]
+
+    pd_annotations = [
+        schemas.Annotation(
+            raster=raster,
+            labels=[schemas.Label(key="key", value="value", score=0.90)],
+            is_instance=True,
+        ),
+        schemas.Annotation(
+            polygon=poly,
+            labels=[schemas.Label(key="key1", value="value", score=0.89)],
+            is_instance=True,
+        ),
+        schemas.Annotation(
+            bounding_box=box,
+            labels=[schemas.Label(key="key2", value="value", score=0.88)],
+            is_instance=True,
+        ),
+    ]
+    gts = [
+        schemas.GroundTruth(
+            datum=datum,
+            annotations=[ann for ann in gt_annotations],
+        )
+    ]
+
+    pds = [
+        schemas.Prediction(
+            datum=datum,
+            annotations=[ann for ann in pd_annotations],
+        )
+    ]
+
+    manager = managers.ValorDetectionManager(
+        iou_thresholds_to_compute=[0.1, 0.6],
+        iou_thresholds_to_return=[0.1, 0.6],
+        metrics_to_return=[
+            enums.MetricType.AP,
+        ],
+    )
+
+    # by default, valor_core should throw an error if given mixed AnnotationTypes without being explicitly told to convert to a certain type
+    with pytest.raises(ValueError):
+        manager.add_data(
+            groundtruths=gts,
+            predictions=pds,
+        )
+
+    # test conversion to raster. this should throw an error since the user is trying to convert a Box annotation to a raster.
+    manager = managers.ValorDetectionManager(
+        iou_thresholds_to_compute=[0.1, 0.6],
+        iou_thresholds_to_return=[0.1, 0.6],
+        metrics_to_return=[
+            enums.MetricType.AP,
+        ],
+        convert_annotations_to_type=enums.AnnotationType.RASTER,
+    )
+    with pytest.raises(ValueError):
+        manager.add_data(
+            groundtruths=gts,
+            predictions=pds,
+        )
+
+    # test conversion to polygon. this should throw an error since the user is trying to convert a Box annotation to a polygon.
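+    # note: as these conversion checks suggest, annotations appear to be convertible
+    # only "down" to the simplest geometry present (a bounding box here), which is why
+    # the RASTER and POLYGON targets raise ValueError while the BOX conversion below succeeds.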
+    manager = managers.ValorDetectionManager(
+        iou_thresholds_to_compute=[0.1, 0.6],
+        iou_thresholds_to_return=[0.1, 0.6],
+        metrics_to_return=[
+            enums.MetricType.AP,
+        ],
+        convert_annotations_to_type=enums.AnnotationType.POLYGON,
+    )
+    with pytest.raises(ValueError):
+        manager.add_data(
+            groundtruths=gts,
+            predictions=pds,
+        )
+
+    # test conversion to box
+    manager = managers.ValorDetectionManager(
+        iou_thresholds_to_compute=[0.1, 0.6],
+        iou_thresholds_to_return=[0.1, 0.6],
+        metrics_to_return=[
+            enums.MetricType.AP,
+        ],
+        convert_annotations_to_type=enums.AnnotationType.BOX,
+    )
+    manager.add_data(
+        groundtruths=gts,
+        predictions=pds,
+    )
+
+    eval_job_box = manager.evaluate()
+
+    expected = [
+        {
+            "type": "AP",
+            "parameters": {"iou": 0.1},
+            "value": 1.0,
+            "label": {"key": "key", "value": "value"},
+        },
+        {
+            "type": "AP",
+            "parameters": {"iou": 0.6},
+            "value": 1.0,
+            "label": {"key": "key", "value": "value"},
+        },
+        {
+            "type": "AP",
+            "parameters": {"iou": 0.1},
+            "value": 1.0,
+            "label": {"key": "key2", "value": "value"},
+        },
+        {
+            "type": "AP",
+            "parameters": {"iou": 0.6},
+            "value": 1.0,
+            "label": {"key": "key2", "value": "value"},
+        },
+        {
+            "type": "AP",
+            "parameters": {"iou": 0.1},
+            "value": 1.0,
+            "label": {"key": "key1", "value": "value"},
+        },
+        {
+            "type": "AP",
+            "parameters": {"iou": 0.6},
+            "value": 1.0,
+            "label": {"key": "key1", "value": "value"},
+        },
+    ]
+
+    for m in eval_job_box.metrics:
+        assert m in expected
+    for m in expected:
+        assert m in eval_job_box.metrics
+
+
+def test_evaluate_detection_rotated_bboxes_with_shapely_with_ValorDetectionManager(
+    rect1: list[tuple[float, float]],
+    rect2: list[tuple[float, float]],
+    rect3: list[tuple[float, float]],
+    img1: schemas.Datum,
+    img2: schemas.Datum,
+):
+    """
+    Run the same test as test_evaluate_detection, but rotate all of the bounding boxes by a random number of degrees to confirm we get the same outputs.
+ """ + + random_angle = random.uniform(0, 365) + + groundtruths = [ + schemas.GroundTruth( + datum=img1, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k1", value="v1")], + bounding_box=schemas.Box( + [geometry.rotate_bbox(rect1, random_angle)] + ), + ), + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k2", value="v2")], + bounding_box=schemas.Box( + [geometry.rotate_bbox(rect3, random_angle)] + ), + ), + ], + ), + schemas.GroundTruth( + datum=img2, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k1", value="v1")], + bounding_box=schemas.Box( + [geometry.rotate_bbox(rect2, random_angle)] + ), + ) + ], + ), + ] + + predictions = [ + schemas.Prediction( + datum=img1, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k1", value="v1", score=0.3)], + bounding_box=schemas.Box( + [geometry.rotate_bbox(rect1, random_angle)] + ), + ) + ], + ), + schemas.Prediction( + datum=img2, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k2", value="v2", score=0.98)], + bounding_box=schemas.Box( + [geometry.rotate_bbox(rect2, random_angle)] + ), + ) + ], + ), + ] + + manager = managers.ValorDetectionManager( + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + ], + ) + manager.add_data( + groundtruths=groundtruths, + predictions=predictions, + ) + + eval_job = manager.evaluate() + + metrics = eval_job.metrics + + expected_metrics = [ + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"iou": 0.1}, + "value": 0.0, + "type": "AP", + }, + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"iou": 0.6}, + "value": 0.0, + "type": "AP", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"iou": 0.1}, + "value": 0.504950495049505, + "type": "AP", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"iou": 0.6}, + "value": 0.504950495049505, + "type": "AP", + }, + { + "parameters": {"label_key": "k1", "iou": 0.1}, + "value": 0.504950495049505, + "type": "mAP", + }, + { + "parameters": {"label_key": "k2", "iou": 0.1}, + "value": 0.0, + "type": "mAP", + }, + { + "parameters": {"label_key": "k1", "iou": 0.6}, + "value": 0.504950495049505, + "type": "mAP", + }, + { + "parameters": {"label_key": "k2", "iou": 0.6}, + "value": 0.0, + "type": "mAP", + }, + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "type": "APAveragedOverIOUs", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.504950495049505, + "type": "APAveragedOverIOUs", + }, + { + "parameters": {"label_key": "k1", "ious": [0.1, 0.6]}, + "value": 0.504950495049505, + "type": "mAPAveragedOverIOUs", + }, + { + "parameters": {"label_key": "k2", "ious": [0.1, 0.6]}, + "value": 0.0, + "type": "mAPAveragedOverIOUs", + }, + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "type": "AR", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.5, + "type": "AR", + }, + { + "parameters": {"label_key": "k1", "ious": [0.1, 0.6]}, + "value": 0.5, + "type": "mAR", + }, + { + "parameters": {"label_key": "k2", "ious": [0.1, 0.6]}, + "value": 0.0, + 
"type": "mAR", + }, + ] + + for m in metrics: + if m["type"] not in [ + "PrecisionRecallCurve", + "DetailedPrecisionRecallCurve", + ]: + assert m in expected_metrics + for m in expected_metrics: + assert m in metrics + + assert eval_job.ignored_pred_labels == [] + assert eval_job.missing_pred_labels == [] + + result = eval_job + result_dict = result.to_dict() + + # duration isn't deterministic, so test meta separately + assert result_dict["meta"]["datums"] == 2 + assert result_dict["meta"]["labels"] == 2 + assert result_dict["meta"]["annotations"] == 5 + assert result_dict["meta"]["duration"] <= 5 + result_dict.pop("meta") + result_dict.pop("metrics") + + assert result_dict == { + "parameters": { + "label_map": {}, + "metrics_to_return": [ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + ], + "iou_thresholds_to_compute": [0.1, 0.6], + "iou_thresholds_to_return": [0.1, 0.6], + "recall_score_threshold": 0.0, + "pr_curve_iou_threshold": 0.5, + "pr_curve_max_examples": 1, + "convert_annotations_to_type": None, + }, + "confusion_matrices": [], + "ignored_pred_labels": [], + "missing_pred_labels": [], + } + + # check that metrics arg works correctly + selected_metrics = random.sample( + [ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + enums.MetricType.PrecisionRecallCurve, + ], + 2, + ) + + manager = managers.ValorDetectionManager( + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=selected_metrics, + ) + manager.add_data( + groundtruths=groundtruths, + predictions=predictions, + ) + + eval_job = manager.evaluate() + + metrics = eval_job.metrics + assert set([metric["type"] for metric in eval_job.metrics]) == set( + selected_metrics + ) + + +@pytest.fixture +def rect1_rotated_5_degrees_around_origin() -> list[tuple[float, float]]: + """Box with area = 1500.""" + return [ + (9.090389553440874, 10.833504408394036), + (58.90012445802815, 15.191291545776945), + (56.28545217559841, 45.07713248852931), + (6.475717271011129, 40.7193453511464), + (9.090389553440874, 10.833504408394036), + ] + + +@pytest.fixture +def rect2_rotated_5_degrees_around_origin() -> list[tuple[float, float]]: + """Box with area = 1100.""" + return [ + (14.942920471376183, 1.3073361412148725), + (69.7336288664222, 6.1009019923360714), + (67.99051401146903, 26.024795954170983), + (13.19980561642302, 21.231230103049782), + (14.942920471376183, 1.3073361412148725), + ] + + +@pytest.fixture +def rect3_rotated_5_degrees_around_origin() -> list[tuple[float, float]]: + """Box with area = 57,510.""" + return [ + (85.79738130650527, 17.544496599963715), + (156.52720487101922, 23.732554335047446), + (85.9310532454161, 830.6502597893614), + (15.20122968090216, 824.4622020542777), + (85.79738130650527, 17.544496599963715), + ] + + +def test_evaluate_detection_rotated_bboxes_with_ValorDetectionManager( + rect1_rotated_5_degrees_around_origin: list[tuple[float, float]], + rect2_rotated_5_degrees_around_origin: list[tuple[float, float]], + rect3_rotated_5_degrees_around_origin: list[tuple[float, float]], + img1: schemas.Datum, + img2: schemas.Datum, +): + """ + Run the same test as test_evaluate_detection, but rotate all of the bounding boxes by 5 degrees around the origin to confirm we get the same outputs. 
+ """ + + groundtruths = [ + schemas.GroundTruth( + datum=img1, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k1", value="v1")], + bounding_box=schemas.Box( + [rect1_rotated_5_degrees_around_origin] + ), + ), + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k2", value="v2")], + bounding_box=schemas.Box( + [rect3_rotated_5_degrees_around_origin] + ), + ), + ], + ), + schemas.GroundTruth( + datum=img2, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k1", value="v1")], + bounding_box=schemas.Box( + [rect2_rotated_5_degrees_around_origin] + ), + ) + ], + ), + ] + + predictions = [ + schemas.Prediction( + datum=img1, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k1", value="v1", score=0.3)], + bounding_box=schemas.Box( + [rect1_rotated_5_degrees_around_origin] + ), + ) + ], + ), + schemas.Prediction( + datum=img2, + annotations=[ + schemas.Annotation( + is_instance=True, + labels=[schemas.Label(key="k2", value="v2", score=0.98)], + bounding_box=schemas.Box( + [rect2_rotated_5_degrees_around_origin] + ), + ) + ], + ), + ] + + manager = managers.ValorDetectionManager( + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + ], + ) + manager.add_data( + groundtruths=groundtruths, + predictions=predictions, + ) + + eval_job = manager.evaluate() + + metrics = eval_job.metrics + + expected_metrics = [ + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"iou": 0.1}, + "value": 0.0, + "type": "AP", + }, + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"iou": 0.6}, + "value": 0.0, + "type": "AP", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"iou": 0.1}, + "value": 0.504950495049505, + "type": "AP", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"iou": 0.6}, + "value": 0.504950495049505, + "type": "AP", + }, + { + "parameters": {"label_key": "k1", "iou": 0.1}, + "value": 0.504950495049505, + "type": "mAP", + }, + { + "parameters": {"label_key": "k2", "iou": 0.1}, + "value": 0.0, + "type": "mAP", + }, + { + "parameters": {"label_key": "k1", "iou": 0.6}, + "value": 0.504950495049505, + "type": "mAP", + }, + { + "parameters": {"label_key": "k2", "iou": 0.6}, + "value": 0.0, + "type": "mAP", + }, + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "type": "APAveragedOverIOUs", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.504950495049505, + "type": "APAveragedOverIOUs", + }, + { + "parameters": {"label_key": "k1", "ious": [0.1, 0.6]}, + "value": 0.504950495049505, + "type": "mAPAveragedOverIOUs", + }, + { + "parameters": {"label_key": "k2", "ious": [0.1, 0.6]}, + "value": 0.0, + "type": "mAPAveragedOverIOUs", + }, + { + "label": {"key": "k2", "value": "v2"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "type": "AR", + }, + { + "label": {"key": "k1", "value": "v1"}, + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.5, + "type": "AR", + }, + { + "parameters": {"label_key": "k1", "ious": [0.1, 0.6]}, + "value": 0.5, + "type": "mAR", + }, + { + "parameters": {"label_key": "k2", "ious": [0.1, 0.6]}, + "value": 0.0, + "type": "mAR", + }, + ] + + for m in metrics: + if m["type"] not 
in [ + "PrecisionRecallCurve", + "DetailedPrecisionRecallCurve", + ]: + assert m in expected_metrics + for m in expected_metrics: + assert m in metrics + + assert eval_job.ignored_pred_labels == [] + assert eval_job.missing_pred_labels == [] + + result = eval_job + result_dict = result.to_dict() + + # duration isn't deterministic, so test meta separately + assert result_dict["meta"]["datums"] == 2 + assert result_dict["meta"]["labels"] == 2 + assert result_dict["meta"]["annotations"] == 5 + assert result_dict["meta"]["duration"] <= 5 + result_dict.pop("meta") + result_dict.pop("metrics") + + assert result_dict == { + "parameters": { + "label_map": {}, + "metrics_to_return": [ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + ], + "iou_thresholds_to_compute": [0.1, 0.6], + "iou_thresholds_to_return": [0.1, 0.6], + "recall_score_threshold": 0.0, + "pr_curve_iou_threshold": 0.5, + "pr_curve_max_examples": 1, + "convert_annotations_to_type": None, + }, + "confusion_matrices": [], + "ignored_pred_labels": [], + "missing_pred_labels": [], + } + + # check that metrics arg works correctly + selected_metrics = random.sample( + [ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + enums.MetricType.PrecisionRecallCurve, + ], + 2, + ) + + manager = managers.ValorDetectionManager( + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=selected_metrics, + ) + manager.add_data( + groundtruths=groundtruths, + predictions=predictions, + ) + + eval_job = manager.evaluate() + + metrics = eval_job.metrics + assert set([metric["type"] for metric in eval_job.metrics]) == set( + selected_metrics + ) + + +def test_evaluate_detection_with_label_maps_and_ValorDetectionManager( + evaluate_detection_groundtruths_with_label_maps, + evaluate_detection_predictions_with_label_maps, +): + """This test is the same as test_evaluate_detection_with_label_maps, but we use ValorDetectionManager to pre-compute IOUs in advance""" + manager = managers.ValorDetectionManager( + pr_curve_max_examples=1, + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + enums.MetricType.PrecisionRecallCurve, + enums.MetricType.DetailedPrecisionRecallCurve, + ], + ) + + manager.add_data( + groundtruths=evaluate_detection_groundtruths_with_label_maps[:1], + predictions=evaluate_detection_predictions_with_label_maps[:1], + ) + + # test that both fields are required + with pytest.raises(ValueError): + manager.add_data( + groundtruths=[], + predictions=evaluate_detection_predictions_with_label_maps[:2], + ) + + with pytest.raises(ValueError): + manager.add_data( + groundtruths=evaluate_detection_groundtruths_with_label_maps[:2], + predictions=[], + ) + + manager.add_data( + groundtruths=evaluate_detection_groundtruths_with_label_maps[1:2], + predictions=evaluate_detection_predictions_with_label_maps[1:2], + ) + + # can't add an already existing datum + with pytest.raises(ValueError): + manager.add_data( + groundtruths=evaluate_detection_groundtruths_with_label_maps[1:2], + predictions=evaluate_detection_predictions_with_label_maps[1:2], + ) + + # 
check that ious have been precomputed + assert "iou_" in manager.joint_df.columns + assert all( + [ + col not in ["raster", "bounding_box"] + for col in manager.joint_df.columns + ] + ) + + eval_job = manager.evaluate() + + baseline_expected_metrics = [ + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.504950495049505, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.504950495049505, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.0, + "label": {"key": "class_name", "value": "maine coon cat"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.0, + "label": {"key": "class_name", "value": "maine coon cat"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.0, + "label": {"key": "class", "value": "british shorthair"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.0, + "label": {"key": "class", "value": "british shorthair"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.0, + "label": {"key": "class", "value": "siamese cat"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.0, + "label": {"key": "class", "value": "siamese cat"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.5, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": -1.0, + "label": {"key": "class_name", "value": "cat"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "class_name", "value": "maine coon cat"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "class", "value": "british shorthair"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": -1.0, + "label": {"key": "class", "value": "cat"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "class", "value": "siamese cat"}, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "k1"}, + "value": 0.5, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "class_name"}, + "value": 0.0, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "class"}, + "value": 0.0, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "class"}, + "value": 0.0, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "class"}, + "value": 0.0, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "class_name"}, + "value": 0.0, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "class_name"}, + "value": 0.0, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, 
"label_key": "k2"}, + "value": 0.0, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.504950495049505, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "class_name", "value": "maine coon cat"}, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "class", "value": "british shorthair"}, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "class", "value": "siamese cat"}, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "class"}, + "value": 0.0, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "class_name"}, + "value": 0.0, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "k2"}, + "value": 0.0, + }, + ] + + assert ( + eval_job.ignored_pred_labels is not None + and eval_job.missing_pred_labels is not None + ) + assert ( + len(eval_job.ignored_pred_labels) == 2 + ) # we're ignoring the two "cat" model predictions + assert ( + len(eval_job.missing_pred_labels) == 3 + ) # we're missing three gts_det_syn representing different breeds of cats + + metrics = eval_job.metrics + + pr_metrics = [] + pr_metrics = [] + detailed_pr_metrics = [] + for m in metrics: + if m["type"] == "PrecisionRecallCurve": + pr_metrics.append(m) + elif m["type"] == "DetailedPrecisionRecallCurve": + detailed_pr_metrics.append(m) + else: + assert m in baseline_expected_metrics + + pr_metrics.sort(key=lambda x: x["parameters"]["label_key"]) + detailed_pr_metrics.sort(key=lambda x: x["parameters"]["label_key"]) + + pr_expected_answers = { + # class + ( + 0, + "class", + "cat", + "0.1", + "fp", + ): 1, + (0, "class", "cat", "0.4", "fp"): 0, + (0, "class", "siamese cat", "0.1", "fn"): 1, + (0, "class", "british shorthair", "0.1", "fn"): 1, + # class_name + (1, "class_name", "cat", "0.1", "fp"): 1, + (1, "class_name", "maine coon cat", "0.1", "fn"): 1, + # k1 + (2, "k1", "v1", "0.1", "fn"): 1, + (2, "k1", "v1", "0.1", "tp"): 1, + (2, "k1", "v1", "0.4", "fn"): 2, + # k2 + (3, "k2", "v2", "0.1", "fn"): 1, + (3, "k2", "v2", "0.1", "fp"): 1, + } + + for ( + index, + key, + value, + threshold, + metric, + ), expected_value in pr_expected_answers.items(): + assert ( + pr_metrics[index]["value"][value][float(threshold)][metric] + == expected_value + ) + + # check DetailedPrecisionRecallCurve + detailed_pr_expected_answers = { + # class + (0, "cat", "0.1", "fp"): { + "hallucinations": 1, + "misclassifications": 0, + "total": 1, + }, + (0, "cat", "0.4", "fp"): { + "hallucinations": 0, + "misclassifications": 0, + "total": 0, + }, + (0, "british shorthair", "0.1", "fn"): { + "no_predictions": 1, + "misclassifications": 0, + "total": 1, + }, + # class_name + (1, "cat", "0.4", "fp"): { + "hallucinations": 1, + "misclassifications": 0, + "total": 1, + }, + (1, "maine coon cat", "0.1", "fn"): { + "no_predictions": 1, + "misclassifications": 0, + "total": 1, + }, + # k1 + (2, "v1", "0.1", "fn"): { + "no_predictions": 1, + "misclassifications": 0, + "total": 1, + }, + (2, "v1", "0.4", "fn"): { 
+ "no_predictions": 2, + "misclassifications": 0, + "total": 2, + }, + (2, "v1", "0.1", "tp"): {"all": 1, "total": 1}, + # k2 + (3, "v2", "0.1", "fn"): { + "no_predictions": 1, + "misclassifications": 0, + "total": 1, + }, + (3, "v2", "0.1", "fp"): { + "hallucinations": 1, + "misclassifications": 0, + "total": 1, + }, + } + + for ( + index, + value, + threshold, + metric, + ), expected_output in detailed_pr_expected_answers.items(): + model_output = detailed_pr_metrics[index]["value"][value][ + float(threshold) + ][metric] + assert isinstance(model_output, dict) + assert model_output["total"] == expected_output["total"] + assert all( + [ + model_output["observations"][key]["count"] # type: ignore - we know this element is a dict + == expected_output[key] + for key in [ + key + for key in expected_output.keys() + if key not in ["total"] + ] + ] + ) + + # check that we get at most 1 example + assert ( + len( + detailed_pr_metrics[0]["value"]["cat"][0.4]["fp"]["observations"]["hallucinations"][ # type: ignore - we know this element is a dict + "examples" + ] + ) + == 0 + ) + assert ( + len( + detailed_pr_metrics[2]["value"]["v1"][0.4]["fn"]["observations"]["no_predictions"][ # type: ignore - we know this element is a dict + "examples" + ] + ) + == 1 + ) + + # now, we correct most of the mismatched labels with a label map + + label_map = { + schemas.Label(key="class_name", value="maine coon cat"): schemas.Label( + key="class", value="cat" + ), + schemas.Label(key="class", value="siamese cat"): schemas.Label( + key="class", value="cat" + ), + schemas.Label(key="class", value="british shorthair"): schemas.Label( + key="class", value="cat" + ), + } + + # test that you can't modify an instanciated manager since that will lead to apples-to-oranges iou calculations + with pytest.raises(AttributeError): + manager.label_map = label_map + + manager = managers.ValorDetectionManager( + label_map=label_map, + pr_curve_max_examples=1, + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + enums.MetricType.PrecisionRecallCurve, + enums.MetricType.DetailedPrecisionRecallCurve, + ], + ) + + manager.add_data( + groundtruths=evaluate_detection_groundtruths_with_label_maps[:1], + predictions=evaluate_detection_predictions_with_label_maps[:1], + ) + + manager.add_data( + groundtruths=evaluate_detection_groundtruths_with_label_maps[1:2], + predictions=evaluate_detection_predictions_with_label_maps[1:2], + ) + + eval_job = manager.evaluate() + + cat_expected_metrics = [ + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.33663366336633666, + "label": {"key": "class", "value": "cat"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.504950495049505, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.33663366336633666, + "label": {"key": "class", "value": "cat"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.504950495049505, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.5, + "label": {"key": "k1", 
"value": "v1"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.3333333333333333, + "label": {"key": "class", "value": "cat"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": -1.0, + "label": {"key": "class_name", "value": "cat"}, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "class"}, + "value": 0.33663366336633666, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "class"}, + "value": 0.33663366336633666, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "class"}, + "value": 0.33663366336633666, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "class"}, + "value": 0.33663366336633666, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "class"}, + "value": 0.3333333333333333, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "k1"}, + "value": 0.5, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "class_name"}, + "value": -1.0, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.33663366336633666, + "label": {"key": "class", "value": "cat"}, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.504950495049505, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "class"}, + "value": 0.33663366336633666, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "k2"}, + "value": 0.0, + }, + ] + + assert eval_job.ignored_pred_labels is not None + assert eval_job.missing_pred_labels is not None + + assert ( + len(eval_job.ignored_pred_labels) == 1 + ) # Label(key='class_name', value='cat', score=None) is still never used + assert len(eval_job.missing_pred_labels) == 0 + + metrics = eval_job.metrics + for m in metrics: + if m["type"] not in [ + "PrecisionRecallCurve", + "DetailedPrecisionRecallCurve", + ]: + assert m in cat_expected_metrics + for m in cat_expected_metrics: + assert m in metrics + + assert eval_job.parameters.label_map == { + schemas.Label( + key="class_name", value="maine coon cat", score=None + ): schemas.Label(key="class", value="cat", 
score=None), + schemas.Label( + key="class", value="siamese cat", score=None + ): schemas.Label(key="class", value="cat", score=None), + schemas.Label( + key="class", value="british shorthair", score=None + ): schemas.Label(key="class", value="cat", score=None), + } + + # next, we check that the label mapping works when the label is completely foreign + # to both groundtruths and predictions + label_map = { + # map the ground truths + schemas.Label(key="class_name", value="maine coon cat"): schemas.Label( + key="foo", value="bar" + ), + schemas.Label(key="class", value="siamese cat"): schemas.Label( + key="foo", value="bar" + ), + schemas.Label(key="class", value="british shorthair"): schemas.Label( + key="foo", value="bar" + ), + # map the predictions + schemas.Label(key="class", value="cat"): schemas.Label( + key="foo", value="bar" + ), + schemas.Label(key="class_name", value="cat"): schemas.Label( + key="foo", value="bar" + ), + } + + manager = managers.ValorDetectionManager( + label_map=label_map, + pr_curve_max_examples=1, + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + enums.MetricType.PrecisionRecallCurve, + enums.MetricType.DetailedPrecisionRecallCurve, + ], + ) + + manager.add_data( + groundtruths=evaluate_detection_groundtruths_with_label_maps[:1], + predictions=evaluate_detection_predictions_with_label_maps[:1], + ) + + manager.add_data( + groundtruths=evaluate_detection_groundtruths_with_label_maps[1:2], + predictions=evaluate_detection_predictions_with_label_maps[1:2], + ) + + eval_job = manager.evaluate() + + foo_expected_metrics = [ + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.6633663366336634, + "label": {"key": "foo", "value": "bar"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.504950495049505, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.6666666666666666, + "label": {"key": "foo", "value": "bar"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.5, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "foo"}, + "value": 0.6633663366336634, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.6633663366336634, + "label": {"key": "foo", "value": "bar"}, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.504950495049505, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.6633663366336634, + "label": {"key": "foo", "value": "bar"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 
0.504950495049505, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "foo"}, + "value": 0.6633663366336634, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "foo"}, + "value": 0.6666666666666666, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "k1"}, + "value": 0.5, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "foo"}, + "value": 0.6633663366336634, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "k1"}, + "value": 0.504950495049505, + }, + ] + + assert ( + eval_job.ignored_pred_labels is not None + and eval_job.missing_pred_labels is not None + ) + assert len(eval_job.ignored_pred_labels) == 0 + assert len(eval_job.missing_pred_labels) == 0 + + metrics = eval_job.metrics + for m in metrics: + if m["type"] not in [ + "PrecisionRecallCurve", + "DetailedPrecisionRecallCurve", + ]: + assert m in foo_expected_metrics + for m in foo_expected_metrics: + assert m in metrics + + assert eval_job.parameters.label_map == { + schemas.Label( + key="class_name", value="maine coon cat", score=None + ): schemas.Label(key="foo", value="bar", score=None), + schemas.Label( + key="class", value="siamese cat", score=None + ): schemas.Label(key="foo", value="bar", score=None), + schemas.Label( + key="class", value="british shorthair", score=None + ): schemas.Label(key="foo", value="bar", score=None), + schemas.Label(key="class", value="cat", score=None): schemas.Label( + key="foo", value="bar", score=None + ), + schemas.Label( + key="class_name", value="cat", score=None + ): schemas.Label(key="foo", value="bar", score=None), + } + + # finally, let's test using a higher recall_score_threshold + # this new threshold will disqualify all of our predictions for img1 + label_map = { + # map the ground truths + schemas.Label(key="class_name", value="maine coon cat"): schemas.Label( + key="foo", value="bar" + ), + schemas.Label(key="class", value="siamese cat"): schemas.Label( + key="foo", value="bar" + ), + schemas.Label(key="class", value="british shorthair"): schemas.Label( + key="foo", value="bar" + ), + # map the predictions + schemas.Label(key="class", value="cat"): schemas.Label( + key="foo", value="bar" + ), + schemas.Label(key="class_name", value="cat"): schemas.Label( + key="foo", value="bar" + ), + } + + manager = managers.ValorDetectionManager( + label_map=label_map, + pr_curve_max_examples=1, + iou_thresholds_to_compute=[0.1, 0.6], + iou_thresholds_to_return=[0.1, 0.6], + recall_score_threshold=0.8, + metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + enums.MetricType.PrecisionRecallCurve, + ], + ) + + manager.add_data( + groundtruths=evaluate_detection_groundtruths_with_label_maps[:1], + predictions=evaluate_detection_predictions_with_label_maps[:1], + ) + + manager.add_data( + 
groundtruths=evaluate_detection_groundtruths_with_label_maps[1:2], + predictions=evaluate_detection_predictions_with_label_maps[1:2], + ) + + eval_job = manager.evaluate() + + foo_expected_metrics_with_higher_score_threshold = [ + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.6633663366336634, + "label": {"key": "foo", "value": "bar"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.504950495049505, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.1}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.3333333333333333, # two missed groundtruth on the first image, and 1 hit for the second image + "label": {"key": "foo", "value": "bar"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "foo"}, + "value": 0.6633663366336634, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.6633663366336634, + "label": {"key": "foo", "value": "bar"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.504950495049505, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "AP", + "parameters": {"iou": 0.6}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "AR", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "foo"}, + "value": 0.6633663366336634, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "mAP", + "parameters": {"iou": 0.1, "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "mAP", + "parameters": {"iou": 0.6, "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "foo"}, + "value": 0.3333333333333333, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "k1"}, + "value": 0.0, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.6633663366336634, + "label": {"key": "foo", "value": "bar"}, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "k1"}, + "value": 0.504950495049505, + }, + { + "type": "mAR", + "parameters": {"ious": [0.1, 0.6], "label_key": "k2"}, + "value": 0.0, + }, + { + "type": "APAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6]}, + "value": 0.504950495049505, + "label": {"key": "k1", "value": "v1"}, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "foo"}, + "value": 0.6633663366336634, + }, + { + "type": "mAPAveragedOverIOUs", + "parameters": {"ious": [0.1, 0.6], "label_key": "k2"}, + "value": 0.0, + }, + ] + + assert ( + eval_job.ignored_pred_labels is not None + and eval_job.missing_pred_labels is not None + ) + assert len(eval_job.ignored_pred_labels) == 0 + assert len(eval_job.missing_pred_labels) == 0 + + assert eval_job.to_dict()["parameters"] == { + "label_map": { + schemas.Label( + key="class_name", value="maine coon cat", score=None + ): schemas.Label(key="foo", value="bar", score=None), + 
schemas.Label( + key="class", value="siamese cat", score=None + ): schemas.Label(key="foo", value="bar", score=None), + schemas.Label( + key="class", value="british shorthair", score=None + ): schemas.Label(key="foo", value="bar", score=None), + schemas.Label(key="class", value="cat", score=None): schemas.Label( + key="foo", value="bar", score=None + ), + schemas.Label( + key="class_name", value="cat", score=None + ): schemas.Label(key="foo", value="bar", score=None), + }, + "metrics_to_return": [ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + enums.MetricType.PrecisionRecallCurve, + ], + "iou_thresholds_to_compute": [0.1, 0.6], + "iou_thresholds_to_return": [0.1, 0.6], + "recall_score_threshold": 0.8, + "pr_curve_iou_threshold": 0.5, + "pr_curve_max_examples": 1, + "convert_annotations_to_type": None, + } + + metrics = eval_job.metrics + + pr_metrics = [] + for m in metrics: + if m["type"] == "PrecisionRecallCurve": + pr_metrics.append(m) + elif m["type"] == "DetailedPrecisionRecallCurve": + continue + else: + assert m in foo_expected_metrics_with_higher_score_threshold + + for m in foo_expected_metrics_with_higher_score_threshold: + assert m in metrics + + pr_metrics.sort(key=lambda x: x["parameters"]["label_key"]) + + pr_expected_answers = { + # foo + (0, "foo", "bar", "0.1", "fn"): 1, # missed rect3 + (0, "foo", "bar", "0.1", "tp"): 2, + (0, "foo", "bar", "0.4", "fn"): 2, + (0, "foo", "bar", "0.4", "tp"): 1, + # k1 + (1, "k1", "v1", "0.1", "fn"): 1, + (1, "k1", "v1", "0.1", "tp"): 1, + (1, "k1", "v1", "0.4", "fn"): 2, + # k2 + (2, "k2", "v2", "0.1", "fn"): 1, + (2, "k2", "v2", "0.1", "fp"): 1, + } + + for ( + index, + _, + value, + threshold, + metric, + ), expected_value in pr_expected_answers.items(): + assert ( + pr_metrics[index]["value"][value][float(threshold)][metric] + == expected_value + ) + + assert eval_job.parameters.label_map == { + schemas.Label( + key="class_name", value="maine coon cat", score=None + ): schemas.Label(key="foo", value="bar", score=None), + schemas.Label( + key="class", value="siamese cat", score=None + ): schemas.Label(key="foo", value="bar", score=None), + schemas.Label( + key="class", value="british shorthair", score=None + ): schemas.Label(key="foo", value="bar", score=None), + schemas.Label(key="class", value="cat", score=None): schemas.Label( + key="foo", value="bar", score=None + ), + schemas.Label( + key="class_name", value="cat", score=None + ): schemas.Label(key="foo", value="bar", score=None), + } diff --git a/core/tests/unit-tests/test_geometry.py b/core/tests/unit-tests/test_geometry.py new file mode 100644 index 000000000..558912d19 --- /dev/null +++ b/core/tests/unit-tests/test_geometry.py @@ -0,0 +1,1122 @@ +import numpy as np +import pytest +from valor_core import geometry +from valor_core.schemas import ( + Box, + LineString, + MultiLineString, + MultiPoint, + Point, + Polygon, + Raster, +) + + +@pytest.fixture +def skewed_box_points() -> list[tuple[float, float]]: + """Skewed box_points.""" + return [ + (0, 0), + (10, 0), + (15, 10), + (5, 10), + (0, 0), + ] + + +@pytest.fixture +def raster_raw_mask() -> np.ndarray: + """ + Creates a 2d numpy of bools of shape: + | T F | + | F T | + """ + ones = np.ones((10, 10)) + zeros = np.zeros((10, 10)) + top = np.concatenate((ones, zeros), axis=1) + bottom = np.concatenate((zeros, ones), axis=1) + return np.concatenate((top, bottom), axis=0) == 1 + + +def test_point(): + 
# valid + p1 = Point((1, 1)) + p2 = Point((1.0, 1.0)) + p3 = Point((1.0, 0.99)) + + # test member fn `__hash__` + assert p1.__hash__() == p2.__hash__() + assert p1.__hash__() != p3.__hash__() + + # test member fn `resize` + p11 = p1.resize( + og_img_h=10, + og_img_w=10, + new_img_h=100, + new_img_w=100, + ) + assert p11.x == p1.x * 10 + assert p11.y == p1.y * 10 + + # valid + p1 = Point(value=(3.14, -3.14)) + assert Point(value=(3.14, -3.14)) + assert Point(value=(-3.14, 3.14)) + + # test type validation + with pytest.raises(TypeError): + Point(value=("test", 0)) # type: ignore - purposefully throwing error + with pytest.raises(TypeError): + Point(value=(0, "test")) # type: ignore - purposefully throwing error + + # test geojson conversion + geojson = {"type": "Point", "coordinates": [3.14, -3.14]} + assert p1.to_dict() == geojson + assert Point.from_dict(geojson).value == [3.14, -3.14] + + # test wkt conversion + wkt = "POINT (3.14 -3.14)" + assert p1.to_wkt() == wkt + + +def test_polygon(box_points, skewed_box_points, rotated_box_points): + p1 = (-1, 0) + p2 = (-5, 2) + p3 = (-2, 5) + coords = [p1, p2, p3, p1] + + # valid + poly = Polygon([coords]) + poly_w_hole = Polygon([coords, coords]) # defines a hole + + assert poly.to_wkt() == "POLYGON ((-1 0, -5 2, -2 5, -1 0))" + assert ( + poly.to_array() == np.array([[-1, 0], [-5, 2], [-2, 5], [-1, 0]]) + ).all() + assert poly.to_coordinates() == [ + [ + {"x": -1, "y": 0}, + {"x": -5, "y": 2}, + {"x": -2, "y": 5}, + {"x": -1, "y": 0}, + ] + ] + + # test validation + with pytest.raises(ValueError): + assert Polygon([[p1, p2, p3]]) + with pytest.raises(TypeError): + Polygon(123) # type: ignore - testing + with pytest.raises(TypeError): + Polygon([poly, 123]) # type: ignore - testing + with pytest.raises(TypeError): + Polygon([poly, [123]]) # type: ignore - testing + + # test property 'boundary' + assert poly.boundary == coords + assert poly_w_hole.boundary == coords + + # test property 'holes' + assert poly.holes == [] + assert poly_w_hole.holes == [coords] + + # test property 'xmin' + assert poly.xmin == -5 + + # test property 'xmax' + assert poly.xmax == -1 + + # test property 'ymin' + assert poly.ymin == 0 + + # test property 'ymax' + assert poly.ymax == 5 + + # valid + p1 = Polygon(value=[box_points]) + p2 = Polygon(value=[skewed_box_points, box_points]) + p3 = Polygon(value=[skewed_box_points, box_points, rotated_box_points]) + + # test type validation + with pytest.raises(TypeError): + Polygon(value=[]) + with pytest.raises(TypeError): + Polygon(value=box_points) # type: ignore - purposefully throwing error + with pytest.raises(TypeError): + Polygon( + value=["skewed_box_points"] # type: ignore - purposefully throwing error + ) + with pytest.raises(TypeError): + Polygon(value=[box_points, []]) + with pytest.raises(TypeError): + Polygon( + value=[box_points, 123] # type: ignore - purposefully throwing error + ) + + # test geojson conversion + geojson = { + "type": "Polygon", + "coordinates": [ + [[point[0], point[1]] for point in box_points], + [[point[0], point[1]] for point in skewed_box_points], + ], + } + assert Polygon(value=[box_points, skewed_box_points]).to_dict() == geojson + assert Polygon.from_dict(geojson).value == [ + [[-5, -5], [5, -5], [5, 5], [-5, 5], [-5, -5]], + [[0, 0], [10, 0], [15, 10], [5, 10], [0, 0]], + ] + + # test wkt conversion + assert p1.to_wkt() == "POLYGON ((-5 -5, 5 -5, 5 5, -5 5, -5 -5))" + assert ( + p2.to_wkt() + == "POLYGON ((0 0, 10 0, 15 10, 5 10, 0 0),(-5 -5, 5 -5, 5 5, -5 5, -5 -5))" + ) + assert ( + 
p3.to_wkt() + == "POLYGON ((0 0, 10 0, 15 10, 5 10, 0 0),(-5 -5, 5 -5, 5 5, -5 5, -5 -5),(0 -7.0710678118654755, 7.0710678118654755 0, 0 7.0710678118654755, -7.0710678118654755 0, 0 -7.0710678118654755))" + ) + + +def test_box(box_points, skewed_box_points, rotated_box_points): + p1 = (-1, -2) + p2 = (10, -2) + p3 = (10, 11) + p4 = (-1, 11) + coords = [[p1, p2, p3, p4, p1]] + + obj = Box(coords) + assert obj.to_wkt() == "POLYGON ((-1 -2, 10 -2, 10 11, -1 11, -1 -2))" + assert ( + obj.to_array() + == np.array([[-1, -2], [10, -2], [10, 11], [-1, 11], [-1, -2]]) + ).all() + assert obj.to_coordinates() == [ + [ + {"x": -1, "y": -2}, + {"x": 10, "y": -2}, + {"x": 10, "y": 11}, + {"x": -1, "y": 11}, + {"x": -1, "y": -2}, + ] + ] + + with pytest.raises(TypeError): + Box(polygon=p1) # type: ignore - testing + with pytest.raises(ValueError): + Box([[p1, p2, p3, p4]]) + + # test classmethod `from_extrema` + assert Box.from_extrema(xmin=-1, xmax=10, ymin=-2, ymax=11).value == coords + + assert Box(value=[box_points]) + + assert Box(value=[rotated_box_points]) + + with pytest.raises(NotImplementedError): + assert Box(value=[skewed_box_points]) + + # test type validation + with pytest.raises(ValueError): + Box(value=[]) # type: ignore - purposefully throwing error + with pytest.raises(ValueError): + Box(value=[box_points, box_points]) # box does not have holes + with pytest.raises(TypeError): # type checking + Box(value=1234) # type: ignore - purposefully throwing error + with pytest.raises(TypeError): + Box(value=box_points[0]) # type: ignore - purposefully throwing error + with pytest.raises(ValueError): + box_plus_one = [[*box_points[0:-1], (10, 10), box_points[0]]] + Box(value=box_plus_one) + with pytest.raises(ValueError): + box_minus_one = [[*box_points[0:-2], box_points[0]]] + Box(value=box_minus_one) + + box_points_xmin = min([point[0] for point in box_points]) + box_points_xmax = max([point[0] for point in box_points]) + box_points_ymin = min([point[1] for point in box_points]) + box_points_ymax = max([point[1] for point in box_points]) + assert Box.from_extrema( + xmin=box_points_xmin, + ymin=box_points_ymin, + xmax=box_points_xmax, + ymax=box_points_ymax, + ).value == [box_points] + + # test geojson conversion + geojson = { + "type": "Polygon", + "coordinates": [[[point[0], point[1]] for point in box_points]], + } + assert Box(value=[box_points]).to_dict() == geojson + assert Box.from_dict(geojson).value == [ + [[-5, -5], [5, -5], [5, 5], [-5, 5], [-5, -5]] + ] + + # test wkt conversion + assert ( + Box(value=[box_points]).to_wkt() + == "POLYGON ((-5 -5, 5 -5, 5 5, -5 5, -5 -5))" + ) + + assert ( + Box(value=[rotated_box_points]).to_wkt() + == "POLYGON ((0 -7.0710678118654755, 7.0710678118654755 0, 0 7.0710678118654755, -7.0710678118654755 0, 0 -7.0710678118654755))" + ) + + +def test_raster( + raster_raw_mask, box_points, skewed_box_points, rotated_box_points +): + mask1 = np.ones((10, 10)) == 1 + + # valid + assert ( + Raster(mask=mask1).to_array() + == np.array( + [ + [True, True, True, True, True, True, True, True, True, True], + [True, True, True, True, True, True, True, True, True, True], + [True, True, True, True, True, True, True, True, True, True], + [True, True, True, True, True, True, True, True, True, True], + [True, True, True, True, True, True, True, True, True, True], + [True, True, True, True, True, True, True, True, True, True], + [True, True, True, True, True, True, True, True, True, True], + [True, True, True, True, True, True, True, True, True, True], + [True, True, True, 
True, True, True, True, True, True, True], + [True, True, True, True, True, True, True, True, True, True], + ] + ) + ).all() + + assert ( + Raster(mask=mask1).to_array() + == np.array( + [ + [True, True, True, True, True, True, True, True, True, True], + [True, True, True, True, True, True, True, True, True, True], + [True, True, True, True, True, True, True, True, True, True], + [True, True, True, True, True, True, True, True, True, True], + [True, True, True, True, True, True, True, True, True, True], + [True, True, True, True, True, True, True, True, True, True], + [True, True, True, True, True, True, True, True, True, True], + [True, True, True, True, True, True, True, True, True, True], + [True, True, True, True, True, True, True, True, True, True], + [True, True, True, True, True, True, True, True, True, True], + ] + ) + ).all() + + # test validation + with pytest.raises(TypeError): + assert Raster({"mask": "test", "geometry": None}) # type: ignore - testing + with pytest.raises(TypeError): + assert Raster(123) # type: ignore - testing + + mask2 = np.ones((10, 10, 10)) == 1 + mask3 = np.ones((10, 10)) + with pytest.raises(ValueError): + Raster(mask2) + with pytest.raises(ValueError): + Raster(mask3) + + # test member fn `to_numpy` + r = Raster(raster_raw_mask) + value = r.encode_value() + assert value + assert ( + value["mask"] + == "iVBORw0KGgoAAAANSUhEUgAAABQAAAAUAQAAAACl8iCgAAAAF0lEQVR4nGP4f4CBiYGBIGZgsP9AjDoAuysDE0GVDN8AAAAASUVORK5CYII=" + ) + assert (r.to_array() == raster_raw_mask).all() + + # test non-2D arrays + with pytest.raises(ValueError): + Raster(mask=np.array([False])) + + # test non-boolean arrays + with pytest.raises(ValueError): + Raster(mask=np.array([[1, 1]])) + + +def test_multipoint(box_points): + # valid + assert MultiPoint(value=[box_points[0]]) + assert MultiPoint( + value=[ + box_points[0], + box_points[1], + ] + ) + assert MultiPoint( + value=box_points, + ) + + # test type validation + with pytest.raises(TypeError): + LineString(value=[]) # type: ignore - purposefully throwing error + with pytest.raises(TypeError): + LineString(value="points") # type: ignore - purposefully throwing error + with pytest.raises(TypeError): + LineString(value=box_points[0]) # type: ignore - purposefully throwing error + with pytest.raises(TypeError): + LineString(value=[1, 2]) # type: ignore - purposefully throwing error + + # test geojson conversion + geojson = { + "type": "MultiPoint", + "coordinates": [[point[0], point[1]] for point in box_points], + } + assert MultiPoint(value=box_points).to_dict() == geojson + assert MultiPoint.from_dict(geojson).value == [ + [-5, -5], + [5, -5], + [5, 5], + [-5, 5], + [-5, -5], + ] + + # test wkt conversion + wkt = "MULTIPOINT ((-5 -5), (5 -5), (5 5), (-5 5), (-5 -5))" + assert MultiPoint(value=box_points).to_wkt() == wkt + + +def test_linestring(box_points): + # valid + assert LineString(value=box_points[0:2]) + assert LineString( + value=box_points, + ) + + # test that linestring requires at least two points + with pytest.raises(TypeError): + LineString(value=[]) + with pytest.raises(TypeError): + LineString(value=[box_points[0]]) + + # test type validation + with pytest.raises(TypeError): + LineString(value="points") # type: ignore - purposefully throwing error + with pytest.raises(TypeError): + LineString(value=[1, 2]) # type: ignore - purposefully throwing error + + # test geojson conversion + geojson = { + "type": "LineString", + "coordinates": [[point[0], point[1]] for point in box_points], + } + assert 
LineString(value=box_points).to_dict() == geojson + assert LineString.from_dict(geojson).value == [ + [-5, -5], + [5, -5], + [5, 5], + [-5, 5], + [-5, -5], + ] + + # test wkt conversion + wkt = "LINESTRING (-5 -5, 5 -5, 5 5, -5 5, -5 -5)" + assert LineString(value=box_points).to_wkt() == wkt + + +def test_multilinestring( + box_points, + skewed_box_points, +): + assert MultiLineString(value=[box_points]) + assert MultiLineString(value=[box_points, box_points]) + + # test type validation + with pytest.raises(ValueError): + MultiLineString(value=[]) + with pytest.raises(TypeError): + MultiLineString( + value=[ + box_points[0], + box_points[1], + ] + ) + with pytest.raises(TypeError): + MultiLineString( + value=[ + box_points[0], + box_points[1], + (1, 3), # type: ignore - purposefully throwing error + ] + ) + + # test geojson conversion + geojson = { + "type": "MultiLineString", + "coordinates": [ + [[point[0], point[1]] for point in box_points], + [[point[0], point[1]] for point in skewed_box_points], + ], + } + assert ( + MultiLineString(value=[box_points, skewed_box_points]).to_dict() + == geojson + ) + assert MultiLineString.from_dict(geojson).value == [ + [[-5, -5], [5, -5], [5, 5], [-5, 5], [-5, -5]], + [[0, 0], [10, 0], [15, 10], [5, 10], [0, 0]], + ] + + # test wkt conversion + wkt = "MULTILINESTRING ((-5 -5, 5 -5, 5 5, -5 5, -5 -5),(0 0, 10 0, 15 10, 5 10, 0 0))" + assert ( + MultiLineString(value=[box_points, skewed_box_points]).to_wkt() == wkt + ) + + +def test_convert_coordinates_to_raster(): + coordinates = [ + [ + {"x": 1, "y": 1}, + {"x": 3, "y": 1}, + {"x": 3, "y": 3}, + {"x": 1, "y": 3}, + ] + ] + height = 5 + width = 5 + expected_output = np.array( + [ + [0, 0, 0, 0, 0], + [0, 1, 1, 1, 0], + [0, 1, 1, 1, 0], + [0, 1, 1, 1, 0], + [0, 0, 0, 0, 0], + ] + ) + + raster = Raster.from_coordinates(coordinates, height, width).to_array() + assert np.array_equal(raster, expected_output) # type: ignore - numpy typing error + + # test empty coordinates + coordinates = [] + height = 5 + width = 5 + expected_output = np.zeros((5, 5), dtype=np.uint8) + + raster = Raster.from_coordinates(coordinates, height, width).to_array() + assert np.array_equal(raster, expected_output) # type: ignore - numpy typing error + + # test invalid contours + coordinates = [[{"x": 1, "y": 1}]] # Invalid contour (only 1 point) + height = 5 + width = 5 + expected_output = np.zeros((5, 5), dtype=np.uint8) + + raster = Raster.from_coordinates(coordinates, height, width).to_array() + assert np.array_equal(raster, expected_output) # type: ignore - numpy typing error + + # test multiple contours + coordinates = [ + [ + {"x": 1, "y": 1}, + {"x": 3, "y": 1}, + {"x": 3, "y": 3}, + {"x": 1, "y": 3}, + ], + [ + {"x": 0, "y": 0}, + {"x": 1, "y": 0}, + {"x": 1, "y": 2}, + {"x": 0, "y": 2}, + ], + ] + height = 5 + width = 5 + expected_output = np.array( + [ + [1, 1, 0, 0, 0], + [1, 1, 1, 1, 0], + [1, 1, 1, 1, 0], + [0, 1, 1, 1, 0], + [0, 0, 0, 0, 0], + ] + ) + + raster = Raster.from_coordinates(coordinates, height, width).to_array() + assert np.array_equal(raster, expected_output) # type: ignore - numpy typing error + + # test errors + with pytest.raises(TypeError): + Raster.from_coordinates( + [ + [[1, 1], [1, 2], [3, 1], [4, 1]], + ], # type: ignore + height, + width, + ) + + with pytest.raises(TypeError): + Raster.from_coordinates( + [ + [ + {"x": 1, "y": 1}, + {"x": 3, "y": 1}, + {"bad_key": 3, "y": 3}, + {"x": 1, "y": 3}, + ], + ], + height, + width, + ) + + +def test_convert_geometry_to_raster(): + # test box + p1 = (1, 
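+ # convention encoded by the expected outputs below: rows index y and columns
+ # index x, so a box with x in [1, 3] and y in [2, 5] fills columns 1..3 and
+ # rows 2..4 (row 5 falls outside this 5-row grid)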
2) + p2 = (3, 2) + p3 = (3, 5) + p4 = (1, 5) + coords = [[p1, p2, p3, p4, p1]] + box = Box(coords) + expected_output = np.zeros((5, 5), dtype=bool) + expected_output[2:5, 1:4] = True + output = Raster.from_geometry(box, height=5, width=5).to_array() + assert np.array_equal( + output, + expected_output, + ) + + p1 = (1, 2) + p2 = (5, 2) + p3 = (5, 7) + p4 = (1, 7) + coords = [[p1, p2, p3, p4, p1]] + box = Box(coords) + expected_output = np.zeros((8, 9), dtype=bool) + expected_output[2:8, 1:6] = True + output = Raster.from_geometry(box, width=9, height=8).to_array() + assert output.shape == (8, 9) # 8 rows, 9 cols + assert np.array_equal( + output, + expected_output, + ) + + p1 = (1, 2) + p2 = (10, 2) + p3 = (10, 11) + p4 = (1, 11) + coords = [[p1, p2, p3, p4, p1]] + box = Box(coords) + expected_output = np.zeros((15, 15), dtype=bool) + expected_output[2:12, 1:11] = True + output = Raster.from_geometry(box, height=15, width=15).to_array() + assert np.array_equal( + output, + expected_output, + ) + + # test incorrect box (can't use negative coordinates) + p1 = (-1, -2) + p2 = (10, -2) + p3 = (10, 11) + p4 = (-1, 11) + coords = [[p1, p2, p3, p4, p1]] + box = Box(coords) + + with pytest.raises(ValueError): + Raster.from_geometry(box, height=15, width=15).to_array() + + # test case where the height and width is less than the implied height and width from the contours + p1 = (1, 2) + p2 = (10, 2) + p3 = (10, 11) + p4 = (1, 11) + coords = [[p1, p2, p3, p4, p1]] + box = Box(coords) + expected_output = np.zeros((6, 7), dtype=bool) + expected_output[2:6, 1:7] = True + output = Raster.from_geometry(box, height=6, width=7).to_array() + assert np.array_equal( + output, + expected_output, + ) + + # test polygons + # triangle + polygon = Polygon([[(2.0, 1.0), (6.0, 1.0), (4.0, 5.0), (2.0, 1.0)]]) + output = Raster.from_geometry(polygon, height=9, width=9).to_array() + expected_output = np.array( + [ + [False, False, False, False, False, False, False, False, False], + [False, False, True, True, True, True, True, False, False], + [False, False, False, True, True, True, False, False, False], + [False, False, False, True, True, True, False, False, False], + [False, False, False, False, True, False, False, False, False], + [False, False, False, False, True, False, False, False, False], + [False, False, False, False, False, False, False, False, False], + [False, False, False, False, False, False, False, False, False], + [False, False, False, False, False, False, False, False, False], + ] + ) + assert np.array_equal(output, expected_output) + + polygon = Polygon([[(0, 0), (2, 0), (1, 2), (0, 0)]]) + output = Raster.from_geometry(polygon, height=3, width=3).to_array() + expected_output = np.array( + [[True, True, True], [False, True, False], [False, True, False]] + ) + assert np.array_equal(output, expected_output) + + # random five-pointed shape + polygon = Polygon([[(5, 7), (2, 3), (8, 1), (9, 6), (4, 5), (5, 7)]]) + output = Raster.from_geometry(polygon, height=9, width=9).to_array() + expected_output = np.array( + [ + [False, False, False, False, False, False, False, False, False], + [False, False, False, False, False, False, False, False, True], + [False, False, False, False, False, True, True, True, True], + [False, False, True, True, True, True, True, True, True], + [False, False, False, True, True, True, True, True, True], + [False, False, False, False, True, True, True, True, True], + [False, False, False, False, True, False, False, False, False], + [False, False, False, False, False, True, False, False, 
False], + [False, False, False, False, False, False, False, False, False], + ] + ) + assert np.array_equal(output, expected_output) + + # test multiple shapes + polygon = Polygon([[(0, 0), (2, 0), (1, 2), (0, 0)]]).to_coordinates() + box = Box([[(4, 4), (4, 5), (5, 5), (5, 4), (4, 4)]]).to_coordinates() + output = Raster.from_coordinates( + polygon + box, height=6, width=6 + ).to_array() + expected_output = np.array( + [ + [True, True, True, False, False, False], + [False, True, False, False, False, False], + [False, True, False, False, False, False], + [False, False, False, False, False, False], + [False, False, False, False, True, True], + [False, False, False, False, True, True], + ] + ) + assert np.array_equal(output, expected_output) + + # test if we don't have the right number of points + with pytest.raises(ValueError): + polygon = Polygon([[(0, 0), (0, 2), (2, 1)]]) + + +def test_calculate_iou(): + """Test ability to calculate IOU for axis-aligend and rotated bounding boxes.""" + + # first, we test that we get the same IOU when we rotate polygon around the origin by the same number of degrees + # these tests were created by taking the original bboxes and rotating them by using: + # list(shapely.affinity.rotate(shapely.Polygon(bbox), angle=angle, origin="centroid").exterior.coords) + tests = [ + { + "original_bbox1": [(1, 1), (6, 1), (6, 6), (1, 6)], + "original_bbox2": [(3, 3), (8, 3), (8, 8), (3, 8)], + "angles": [0, 45, 90], + "bbox1": [ + [(1.0, 1.0), (6.0, 1.0), (6.0, 6.0), (1.0, 6.0), (1.0, 1.0)], + [ + (1.1102230246251565e-16, 1.414213562373095), + (3.535533905932738, 4.949747468305833), + (8.881784197001252e-16, 8.485281374238571), + (-3.5355339059327373, 4.949747468305834), + (1.1102230246251565e-16, 1.414213562373095), + ], + [ + (-1.0, 1.0), + (-1.0, 6.0), + (-6.0, 6.0), + (-6.0, 1.0), + (-1.0, 1.0), + ], + ], + "bbox2": [ + [(3.0, 3.0), (8.0, 3.0), (8.0, 8.0), (3.0, 8.0), (3.0, 3.0)], + [ + (4.440892098500626e-16, 4.242640687119286), + (3.535533905932738, 7.7781745930520225), + (8.881784197001252e-16, 11.31370849898476), + (-3.535533905932737, 7.778174593052023), + (4.440892098500626e-16, 4.242640687119286), + ], + [ + (-3.0, 3.0), + (-3.0, 8.0), + (-8.0, 8.0), + (-8.0, 3.0), + (-3.0, 3.0), + ], + ], + # expected values come from shapely using the following function + # def shapely_calc(bbox1, bbox2): + # poly1 = Pgon(bbox1) + # poly2 = Pgon(bbox2) + # intersection_area = poly1.intersection(poly2).area + # union_area = poly1.area + poly2.area - intersection_area + # return intersection_area / union_area if union_area != 0 else 0 + "expected": 0.2195, + } + ] + + for test in tests: + for bbox1, bbox2 in zip(test["bbox1"], test["bbox2"]): + + expected = test["expected"] + + iou = geometry.calculate_iou(bbox1=bbox1, bbox2=bbox2) + assert expected == round(iou, 4) + + # next we rotate shapes around their centroids to check that we get the same IOUs as shapely + tests = [ + { + "original_bbox1": [(1, 1), (6, 1), (6, 6), (1, 6)], + "original_bbox2": [(3, 3), (8, 3), (8, 8), (3, 8)], + "angles": [30, 60, 90, 112, 157, 249, 312], + "bbox1": [ + [ + (2.584936490538903, 0.08493649053890318), + (6.915063509461096, 2.5849364905389027), + (4.415063509461097, 6.915063509461096), + (0.08493649053890362, 4.415063509461096), + (2.584936490538903, 0.08493649053890318), + ], + [ + (4.415063509461096, 0.08493649053890318), + (6.915063509461097, 4.415063509461096), + (2.5849364905389036, 6.915063509461096), + (0.08493649053890273, 2.5849364905389036), + (4.415063509461096, 
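+ # hand-worked check of the axis-aligned pair used above: the boxes
+ # [1, 6] x [1, 6] and [3, 8] x [3, 8] overlap in a 3 x 3 square, so
+ # IOU = 9 / (25 + 25 - 9) = 9 / 41 ~= 0.2195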
0.08493649053890318), + ], + [(6.0, 1.0), (6.0, 6.0), (1.0, 6.0), (1.0, 1.0), (6.0, 1.0)], + [ + (6.754476119956748, 2.118556847122812), + (4.881443152877187, 6.754476119956749), + (0.2455238800432502, 4.881443152877188), + (2.118556847122811, 0.2455238800432511), + (6.754476119956748, 2.118556847122812), + ], + [ + (6.778089954854287, 4.824434312407915), + (2.175565687592086, 6.778089954854286), + (0.221910045145715, 2.1755656875920844), + (4.8244343124079165, 0.22191004514571322), + (6.778089954854287, 4.824434312407915), + ], + [ + (2.061968807620248, 6.729870940106256), + (0.27012905989374447, 2.0619688076202483), + (4.938031192379752, 0.27012905989374403), + (6.729870940106256, 4.938031192379752), + (2.061968807620248, 6.729870940106256), + ], + [ + (-0.030688579590631093, 3.6850355477963417), + (3.3149644522036583, -0.030688579590631537), + (7.030688579590632, 3.3149644522036574), + (3.6850355477963417, 7.030688579590631), + (-0.030688579590631093, 3.6850355477963417), + ], + ], + "bbox2": [ + [ + (4.584936490538903, 2.084936490538903), + (8.915063509461095, 4.584936490538903), + (6.415063509461096, 8.915063509461095), + (2.0849364905389027, 6.4150635094610955), + (4.584936490538903, 2.084936490538903), + ], + [ + (6.4150635094610955, 2.084936490538903), + (8.915063509461095, 6.4150635094610955), + (4.584936490538904, 8.915063509461097), + (2.0849364905389027, 4.5849364905389045), + (6.4150635094610955, 2.084936490538903), + ], + [(8.0, 3.0), (8.0, 8.0), (3.0, 8.0), (3.0, 3.0), (8.0, 3.0)], + [ + (8.754476119956747, 4.118556847122812), + (6.881443152877187, 8.754476119956749), + (2.245523880043251, 6.881443152877189), + (4.118556847122811, 2.2455238800432515), + (8.754476119956747, 4.118556847122812), + ], + [ + (8.778089954854286, 6.824434312407915), + (4.175565687592085, 8.778089954854286), + (2.221910045145714, 4.175565687592085), + (6.824434312407915, 2.221910045145714), + (8.778089954854286, 6.824434312407915), + ], + [ + (4.061968807620248, 8.729870940106256), + (2.270129059893745, 4.061968807620248), + (6.938031192379753, 2.270129059893746), + (8.729870940106256, 6.938031192379753), + (4.061968807620248, 8.729870940106256), + ], + [ + (1.9693114204093698, 5.685035547796343), + (5.314964452203658, 1.9693114204093694), + (9.030688579590631, 5.314964452203658), + (5.685035547796342, 9.030688579590631), + (1.9693114204093698, 5.685035547796343), + ], + ], + "expected": [ + 0.2401, + 0.2401, + 0.2195, + 0.2295, + 0.2306, + 0.2285, + 0.2676, + ], + }, + { + "original_bbox1": [(12, 15), (45, 15), (45, 48), (12, 48)], + "original_bbox2": [(22, 25), (55, 25), (55, 58), (22, 58)], + "angles": [ + 7, + 24, + 40, + 65, + 84, + 107, + 120, + 143, + 167, + ], + "bbox1": [ + [ + (14.13383266410312, 13.112144331733255), + (46.88785566826675, 17.13383266410312), + (42.866167335896876, 49.88785566826675), + (10.112144331733253, 45.866167335896876), + (14.13383266410312, 13.112144331733255), + ], + [ + (20.13765455964779, 9.715345338146385), + (50.28465466185362, 23.13765455964779), + (36.862345440352215, 53.28465466185362), + (6.715345338146383, 39.862345440352215), + (20.13765455964779, 9.715345338146385), + ], + [ + (26.466262248364764, 8.254271128708965), + (51.745728871291035, 29.46626224836476), + (30.53373775163524, 54.745728871291035), + (5.254271128708968, 33.53373775163524), + (26.466262248364764, 8.254271128708965), + ], + [ + (36.480877167383184, 9.572720195173734), + (50.42727980482626, 39.480877167383184), + (20.519122832616816, 53.427279804826256), + (6.572720195173737, 
23.519122832616812), + (36.480877167383184, 9.572720195173734), + ], + [ + (43.18489162966023, 13.365669082507209), + (46.63433091749279, 46.18489162966023), + (13.815108370339779, 49.63433091749279), + (10.36566908250721, 16.81510837033977), + (43.18489162966023, 13.365669082507209), + ], + [ + (49.10316160131525, 20.54510465453507), + (39.45489534546494, 52.10316160131524), + (7.896838398684764, 42.454895345464934), + (17.545104654535074, 10.89683839868476), + (49.10316160131525, 20.54510465453507), + ], + [ + (51.03941916244324, 25.46058083755676), + (34.539419162443245, 54.03941916244324), + (5.960580837556762, 37.53941916244324), + (22.460580837556762, 8.960580837556762), + (51.03941916244324, 25.46058083755676), + ], + [ + (51.60743379778914, 34.74753803377154), + (25.252461966228466, 54.60743379778913), + (5.3925662022108725, 28.252461966228463), + (31.747538033771548, 8.392566202210872), + (51.60743379778914, 34.74753803377154), + ], + [ + (48.288798465630144, 43.865413672282614), + (16.13458632771738, 51.28879846563015), + (8.711201534369835, 19.134586327717386), + (40.86541367228261, 11.711201534369849), + (48.288798465630144, 43.865413672282614), + ], + ], + "bbox2": [ + [ + (24.133832664103117, 23.11214433173326), + (56.887855668266745, 27.133832664103124), + (52.866167335896876, 59.88785566826675), + (20.11214433173325, 55.86616733589688), + (24.133832664103117, 23.11214433173326), + ], + [ + (30.137654559647785, 19.71534533814638), + (60.28465466185362, 33.137654559647785), + (46.862345440352215, 63.28465466185361), + (16.71534533814638, 49.862345440352215), + (30.137654559647785, 19.71534533814638), + ], + [ + (36.46626224836476, 18.254271128708965), + (61.745728871291035, 39.46626224836476), + (40.53373775163524, 64.74572887129104), + (15.254271128708968, 43.53373775163524), + (36.46626224836476, 18.254271128708965), + ], + [ + (46.480877167383184, 19.572720195173737), + (60.42727980482627, 49.480877167383184), + (30.51912283261682, 63.42727980482626), + (16.572720195173737, 33.519122832616816), + (46.480877167383184, 19.572720195173737), + ], + [ + (53.18489162966023, 23.36566908250721), + (56.63433091749279, 56.18489162966023), + (23.81510837033978, 59.63433091749279), + (20.365669082507218, 26.81510837033977), + (53.18489162966023, 23.36566908250721), + ], + [ + (59.10316160131525, 30.54510465453507), + (49.454895345464934, 62.10316160131524), + (17.896838398684764, 52.454895345464934), + (27.545104654535074, 20.89683839868476), + (59.10316160131525, 30.54510465453507), + ], + [ + (61.03941916244324, 35.46058083755676), + (44.53941916244324, 64.03941916244324), + (15.960580837556762, 47.53941916244324), + (32.46058083755676, 18.960580837556765), + (61.03941916244324, 35.46058083755676), + ], + [ + (61.60743379778913, 44.74753803377154), + (35.25246196622846, 64.60743379778913), + (15.392566202210872, 38.252461966228466), + (41.74753803377154, 18.392566202210872), + (61.60743379778913, 44.74753803377154), + ], + [ + (58.28879846563015, 53.865413672282614), + (26.134586327717386, 61.28879846563015), + (18.71120153436985, 29.134586327717386), + (50.865413672282614, 21.71120153436984), + (58.28879846563015, 53.865413672282614), + ], + ], + "expected": [ + 0.3224, + 0.3403, + 0.3809, + 0.3421, + 0.3219, + 0.3303, + 0.3523, + 0.3711, + 0.3263, + ], + }, + ] + + for test in tests: + for bbox1, bbox2, expected in zip( + test["bbox1"], test["bbox2"], test["expected"] + ): + iou = geometry.calculate_iou(bbox1=bbox1, bbox2=bbox2) + assert expected == round(iou, 4) + + +def 
test_is_axis_aligned(box_points, skewed_box_points, rotated_box_points): + tests = [ + { + "bbox": [(1, 1), (6, 1), (6, 6), (1, 6)], + "expected": True, + }, + # rotated box + { + "bbox": [ + (2.584936490538903, 0.08493649053890318), + (6.915063509461096, 2.5849364905389027), + (4.415063509461097, 6.915063509461096), + (0.08493649053890362, 4.415063509461096), + (2.584936490538903, 0.08493649053890318), + ], + "expected": False, + }, + ] + + for test in tests: + assert geometry.is_axis_aligned(bbox=test["bbox"]) == test["expected"] + + assert geometry.is_axis_aligned(bbox=box_points) + assert not geometry.is_axis_aligned(bbox=skewed_box_points) + assert not geometry.is_axis_aligned(bbox=rotated_box_points) + + +def test_is_skewed(box_points, skewed_box_points, rotated_box_points): + tests = [ + { + "bbox": [(1, 1), (6, 1), (6, 6), (1, 6)], + "expected": False, + }, + # rotated box + { + "bbox": [ + (2.584936490538903, 0.08493649053890318), + (6.915063509461096, 2.5849364905389027), + (4.415063509461097, 6.915063509461096), + (0.08493649053890362, 4.415063509461096), + (2.584936490538903, 0.08493649053890318), + ], + "expected": False, + }, + ] + + for test in tests: + assert geometry.is_skewed(bbox=test["bbox"]) == test["expected"] + + assert not geometry.is_skewed(bbox=box_points) + assert geometry.is_skewed(bbox=skewed_box_points) + assert not geometry.is_skewed(bbox=rotated_box_points) + + +def test_is_rotated(box_points, skewed_box_points, rotated_box_points): + tests = [ + { + "bbox": [(1, 1), (6, 1), (6, 6), (1, 6)], + "expected": False, + }, + # rotated box + { + "bbox": [ + (2.584936490538903, 0.08493649053890318), + (6.915063509461096, 2.5849364905389027), + (4.415063509461097, 6.915063509461096), + (0.08493649053890362, 4.415063509461096), + (2.584936490538903, 0.08493649053890318), + ], + "expected": True, + }, + ] + + for test in tests: + assert geometry.is_rotated(bbox=test["bbox"]) == test["expected"] + + assert not geometry.is_rotated(bbox=box_points) + assert not geometry.is_rotated(bbox=skewed_box_points) + assert geometry.is_rotated(bbox=rotated_box_points) diff --git a/core/tests/unit-tests/test_metrics.py b/core/tests/unit-tests/test_metrics.py new file mode 100644 index 000000000..525c1cc4e --- /dev/null +++ b/core/tests/unit-tests/test_metrics.py @@ -0,0 +1,287 @@ +import pytest +from valor_core import metrics, schemas + + +def test_APMetric(): + ap_metric = metrics.APMetric( + iou=0.2, value=0.5, label=schemas.Label(key="k1", value="v1") + ) + + with pytest.raises(TypeError): + metrics.APMetric( + iou=None, value=0.5, label=schemas.Label(key="k1", value="v1") # type: ignore - purposefully throwing error + ) + + with pytest.raises(TypeError): + metrics.APMetric(iou=0.1, value=0.5, label="k1") # type: ignore - purposefully throwing error + + assert all( + [ + key in ["label", "parameters", "value", "type"] + for key in ap_metric.to_dict().keys() + ] + ) + + +def test_APMetricAveragedOverIOUs(): + ap_averaged_metric = metrics.APMetricAveragedOverIOUs( + ious=set([0.1, 0.2]), + value=0.5, + label=schemas.Label(key="k1", value="v1"), + ) + + with pytest.raises(TypeError): + metrics.APMetricAveragedOverIOUs( + ious=None, value=0.5, label=schemas.Label(key="k1", value="v1") # type: ignore - purposefully throwing error + ) + + with pytest.raises(TypeError): + metrics.APMetricAveragedOverIOUs( + ious=set([0.1, 0.2]), value=0.5, label="k1" # type: ignore - purposefully throwing error + ) + + assert all( + [ + key in ["label", "parameters", "value", "type"] + for key in 
ap_averaged_metric.to_dict().keys() + ] + ) + + +def test_mAPMetric(): + map_metric = metrics.mAPMetric(iou=0.2, value=0.5, label_key="key") + + with pytest.raises(TypeError): + metrics.mAPMetric(iou=None, value=0.5, label_key="key") # type: ignore - purposefully throwing error + + with pytest.raises(TypeError): + metrics.mAPMetric(iou=0.1, value="value", label_key="key") # type: ignore - purposefully throwing error + + with pytest.raises(TypeError): + metrics.mAPMetric(iou=0.1, value=0.5, label_key=None) # type: ignore - purposefully throwing error + + assert all( + [ + key in ["label", "parameters", "value", "type"] + for key in map_metric.to_dict() + ] + ) + + +def test_mAPMetricAveragedOverIOUs(): + map_averaged_metric = metrics.mAPMetricAveragedOverIOUs( + ious=set([0.1, 0.2]), value=0.5, label_key="key" + ) + + with pytest.raises(TypeError): + metrics.mAPMetricAveragedOverIOUs(ious=None, value=0.5, label_key="key") # type: ignore - purposefully throwing error + + with pytest.raises(TypeError): + metrics.mAPMetricAveragedOverIOUs(ious=set([0.1, 0.2]), value="value", label_key="key") # type: ignore - purposefully throwing error + + with pytest.raises(TypeError): + map_averaged_metric = metrics.mAPMetricAveragedOverIOUs( + ious=set([0.1, 0.2]), value=0.5, label_key=None # type: ignore - purposefully throwing error + ) + + assert all( + [ + key in ["label", "parameters", "value", "type"] + for key in map_averaged_metric.to_dict() + ] + ) + + +def test_ConfusionMatrixEntry(): + metrics.ConfusionMatrixEntry( + prediction="pred", groundtruth="gt", count=123 + ) + + with pytest.raises(TypeError): + metrics.ConfusionMatrixEntry( + prediction=None, groundtruth="gt", count=123 # type: ignore - purposefully throwing error + ) + + with pytest.raises(TypeError): + metrics.ConfusionMatrixEntry( + prediction="pred", groundtruth=123, count=123 # type: ignore - purposefully throwing error + ) + + with pytest.raises(TypeError): + metrics.ConfusionMatrixEntry( + prediction="pred", groundtruth="gt", count="not an int" # type: ignore - purposefully throwing error + ) + + +def test__BaseConfusionMatrix(): + metrics._BaseConfusionMatrix( + label_key="label", + entries=[ + metrics.ConfusionMatrixEntry( + prediction="pred1", groundtruth="gt1", count=123 + ), + metrics.ConfusionMatrixEntry( + prediction="pred2", groundtruth="gt2", count=234 + ), + ], + ) + + with pytest.raises(TypeError): + metrics._BaseConfusionMatrix( + label_key=123, # type: ignore - purposefully throwing error + entries=[ + metrics.ConfusionMatrixEntry( + prediction="pred1", groundtruth="gt1", count=123 + ), + metrics.ConfusionMatrixEntry( + prediction="pred2", groundtruth="gt2", count=234 + ), + ], + ) + + with pytest.raises(TypeError): + metrics._BaseConfusionMatrix(label_key="label", entries=None) # type: ignore - purposefully throwing error + + with pytest.raises(TypeError): + metrics._BaseConfusionMatrix( + label_key="label", entries=["not an entry"] # type: ignore - purposefully throwing error + ) + + +def test_ConfusionMatrix(): + confusion_matrix = metrics.ConfusionMatrix( + label_key="label", + entries=[ + metrics.ConfusionMatrixEntry( + prediction="pred1", groundtruth="gt1", count=123 + ), + metrics.ConfusionMatrixEntry( + prediction="pred2", groundtruth="gt2", count=234 + ), + ], + ) + + with pytest.raises(TypeError): + metrics.ConfusionMatrix( + label_key=123, + entries=[ + metrics.ConfusionMatrixEntry( + prediction="pred1", groundtruth="gt1", count=123 + ), + metrics.ConfusionMatrixEntry( + prediction="pred2", 
groundtruth="gt2", count=234 + ), + ], + ) + + with pytest.raises(TypeError): + metrics.ConfusionMatrix(label_key="label", entries=None) + + with pytest.raises(TypeError): + metrics.ConfusionMatrix(label_key="label", entries=["not an entry"]) + + assert all( + [key in ["label_key", "entries"] for key in confusion_matrix.to_dict()] + ) + + +def test_AccuracyMetric(): + acc_metric = metrics.AccuracyMetric(label_key="key", value=0.5) + + with pytest.raises(TypeError): + metrics.AccuracyMetric(label_key=None, value=0.5) # type: ignore - purposefully throwing error + + with pytest.raises(TypeError): + metrics.AccuracyMetric(label_key="key", value="value") # type: ignore - purposefully throwing error + + assert all( + [ + key in ["label", "parameters", "value", "type"] + for key in acc_metric.to_dict() + ] + ) + + +def test_PrecisionMetric(): + precision_recall_metric = metrics.PrecisionMetric( + label=schemas.Label(key="key", value="value"), value=0.5 + ) + mapping = precision_recall_metric.to_dict() + + assert all([key in ["value", "type", "label"] for key in mapping]) + + assert mapping["type"] == "Precision" + + +def test_RecallMetric(): + precision_recall_metric = metrics.RecallMetric( + label=schemas.Label(key="key", value="value"), value=0.5 + ) + mapping = precision_recall_metric.to_dict() + + assert all( + [key in ["label", "parameters", "value", "type"] for key in mapping] + ) + + assert mapping["type"] == "Recall" + + +def test_F1Metric(): + precision_recall_metric = metrics.F1Metric( + label=schemas.Label(key="key", value="value"), value=0.5 + ) + mapping = precision_recall_metric.to_dict() + + assert all( + [key in ["label", "parameters", "value", "type"] for key in mapping] + ) + + assert mapping["type"] == "F1" + + +def test_ROCAUCMetric(): + roc_auc_metric = metrics.ROCAUCMetric(label_key="key", value=0.2) + + with pytest.raises(TypeError): + metrics.ROCAUCMetric(label_key=None, value=0.2) # type: ignore - purposefully throwing error + + with pytest.raises(TypeError): + metrics.ROCAUCMetric(label_key=123, value=0.2) # type: ignore - purposefully throwing error + + with pytest.raises(TypeError): + metrics.ROCAUCMetric(label_key="key", value="not a number") # type: ignore - purposefully throwing error + + assert all( + [ + key in ["value", "type", "evaluation_id", "parameters"] + for key in roc_auc_metric.to_dict() + ] + ) + + +def test_PrecisionRecallCurve(): + + m = metrics.PrecisionRecallCurve( + label_key="k1", + pr_curve_iou_threshold=0.5, + value={"v1": {0.25: {"tp": 1}}}, + ) + assert m.to_dict() == { + "parameters": {"label_key": "k1"}, + "value": {"v1": {0.25: {"tp": 1}}}, + "type": "PrecisionRecallCurve", + } + + +def test_DetailedPrecisionRecallCurve(): + + m = metrics.DetailedPrecisionRecallCurve( + label_key="k1", + pr_curve_iou_threshold=0.5, + value={"v1": {0.25: {"tp": {"total": 3}}}}, + ) + assert m.to_dict() == { + "parameters": {"label_key": "k1"}, + "value": {"v1": {0.25: {"tp": {"total": 3}}}}, + "type": "DetailedPrecisionRecallCurve", + } diff --git a/core/tests/unit-tests/test_schemas.py b/core/tests/unit-tests/test_schemas.py new file mode 100644 index 000000000..701d076f7 --- /dev/null +++ b/core/tests/unit-tests/test_schemas.py @@ -0,0 +1,382 @@ +import copy + +import pytest +from valor_core import enums, schemas + + +@pytest.fixture +def metadata() -> dict[str, dict[str, str | float]]: + return { + "m1": {"type": "string", "value": "v1"}, + "m2": {"type": "float", "value": 0.1}, + } + + +@pytest.fixture +def labels() -> list[schemas.Label]: + return [ + 
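+ # four labels spanning three keys ("k1" carries two values); consumed by the
+ # annotation tests below via this `labels` fixture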
schemas.Label(key="k1", value="v1"), + schemas.Label(key="k1", value="v2"), + schemas.Label(key="k2", value="v3"), + schemas.Label(key="k3", value="v4"), + ] + + +@pytest.fixture +def box_points() -> list[tuple[float, float]]: + return [ + (-5, -5), + (5, -5), + (5, 5), + (-5, 5), + (-5, -5), + ] + + +@pytest.fixture +def bbox(box_points: list[tuple[float, float]]) -> schemas.Box: + return schemas.Box(value=[box_points]) + + +@pytest.fixture +def polygon(box_points: list[tuple[float, float]]) -> schemas.Polygon: + return schemas.Polygon(value=[box_points]) + + +@pytest.fixture +def raster() -> schemas.Raster: + """ + Creates a 2d numpy of bools of shape: + | T F | + | F T | + """ + mask = "iVBORw0KGgoAAAANSUhEUgAAABQAAAAUAQAAAACl8iCgAAAAF0lEQVR4nGP4f4CBiYGBIGZgsP9AjDoAuysDE0GVDN8AAAAASUVORK5CYII=" + return schemas.Raster.decode_value(mask) + + +def test_label(): + # valid + l1 = schemas.Label(key="test", value="value") + + # test validation + with pytest.raises(TypeError): + assert schemas.Label(key=123, value="123") # type: ignore - testing + with pytest.raises(TypeError): + assert schemas.Label(key="123", value=123) # type: ignore - testing + + # test member fn `__eq__` + l2 = schemas.Label(key="test", value="value") + assert l1 == l2 + + # test member fn `__ne__` + l3 = schemas.Label(key="test", value="other") + assert l1 != l3 + + # test member fn `__hash__` + assert l1.__hash__() == l2.__hash__() + + +def test_scored_label(): + l1 = schemas.Label(key="test", value="value") + + # valid + s1 = schemas.Label(key="test", value="value", score=0.5) + s2 = schemas.Label(key="test", value="value", score=0.5) + s3 = schemas.Label(key="test", value="value", score=0.1) + s4 = schemas.Label(key="test", value="other", score=0.5) + s5 = schemas.Label(key="other", value="value", score=0.5) + + # test validation + with pytest.raises(TypeError): + assert schemas.Label(key="k", value="v", score="boo") # type: ignore - testing + + # test property `key` + assert l1.key == "test" + + # test property `value` + assert l1.value == "value" + + # test member fn `__eq__` + assert s1 == s2 + assert not (s1 == s3) + assert not (s1 == s4) + assert not (s1 == s5) + + # test member fn `__ne__` + assert not (s1 != s2) + assert s1 != s3 + assert s1 != s4 + assert s1 != s5 + + # test member fn `__hash__` + assert s1.__hash__() == s2.__hash__() + assert s1.__hash__() != s3.__hash__() + assert s1.__hash__() != s4.__hash__() + assert s1.__hash__() != s5.__hash__() + + +def test_label_equality(): + label1 = schemas.Label(key="test", value="value") + label2 = schemas.Label(key="test", value="value") + label3 = schemas.Label(key="test", value="other") + label4 = schemas.Label(key="other", value="value") + + eq1 = label1 == label2 + assert eq1 + + eq2 = label1 == label3 + assert not eq2 + + eq3 = label1 == label4 + assert not eq3 + + +def test_label_score(): + label1 = schemas.Label(key="test", value="value", score=0.5) + label2 = schemas.Label(key="test", value="value", score=0.5) + label3 = schemas.Label(key="test", value="value", score=0.1) + assert label1.score + assert label2.score + assert label3.score + + b1 = label1.score == label2.score + assert b1 + + b2 = label1.score > label3.score + assert b2 + + b3 = label1.score < label3.score + assert not b3 + + b4 = label1.score >= label2.score + assert b4 + + b5 = label1.score != label3.score + assert b5 + + b6 = label1.score != label2.score + assert not b6 + + +def test_datum(): + schemas.Datum(uid="123") + schemas.Datum(uid="123", metadata={}) + schemas.Datum(uid="123", 
metadata={"name": 1}) + + with pytest.raises(TypeError): + schemas.Datum(uid=123) # type: ignore + with pytest.raises(TypeError): + schemas.Datum(uid="123", metadata=1) # type: ignore + with pytest.raises(TypeError): + schemas.Datum(uid="123", metadata=[1]) # type: ignore + + +def test_annotation( + bbox: schemas.Box, + polygon: schemas.Polygon, + raster: schemas.Raster, + labels: list[schemas.Label], + metadata: dict[str, dict[str, str | float]], +): + # valid + schemas.Annotation( + bounding_box=bbox, + labels=labels, + ) + schemas.Annotation( + polygon=polygon, + labels=labels, + ) + schemas.Annotation(raster=raster, labels=labels) + schemas.Annotation( + raster=raster, + labels=labels, + ) + schemas.Annotation( + labels=labels, + bounding_box=bbox, + polygon=polygon, + raster=raster, + ) + schemas.Annotation(labels=labels) + schemas.Annotation(labels=labels, metadata={}) + schemas.Annotation( + labels=labels, + metadata=metadata, + ) + schemas.Annotation( + labels=labels, + polygon=bbox, # bbox is a constrained polygon so this is valid usage + ) + + # test `__post_init__` + with pytest.raises(TypeError): + schemas.Annotation( + labels=labels, + bounding_box=schemas.Polygon(value=[[(0, 0), (1, 0), (1, 1), (0, 0)]]), # type: ignore - testing + ) + with pytest.raises(TypeError): + schemas.Annotation( + labels=labels, + raster=bbox, # type: ignore + ) + with pytest.raises(TypeError): + schemas.Annotation( + labels=labels, + metadata=[1234], # type: ignore - testing + ) + + +def test_groundtruth_annotation(): + l1 = schemas.Label(key="test", value="value") + l2 = schemas.Label(key="test", value="other") + l3 = schemas.Label(key="other", value="value") + + # valid + schemas.Annotation( + labels=[l1, l2, l3], + ) + + # test `__post_init__` + with pytest.raises(TypeError): + schemas.Annotation(labels=l1) # type: ignore - testing + with pytest.raises(TypeError): + schemas.Annotation(labels=[l1, l2, "label"]) # type: ignore - testing + + +def test_prediction_annotation(): + l1 = schemas.Label(key="test", value="value") + l2 = schemas.Label(key="test", value="other") + l3 = schemas.Label(key="other", value="value") + + s1 = copy.deepcopy(l1) + s1.score = 0.5 + s2 = copy.deepcopy(l2) + s2.score = 0.5 + s3 = copy.deepcopy(l3) + s3.score = 1 + + # valid + schemas.Annotation(labels=[s1, s2, s3]) + + # test `__post_init__` + with pytest.raises(TypeError): + schemas.Annotation(labels=s1) # type: ignore - testing + with pytest.raises(TypeError): + schemas.Annotation(labels=[s1, s2, "label"]) # type: ignore - testing + + +def test_groundtruth(): + label = schemas.Label(key="test", value="value") + datum = schemas.Datum(uid="somefile") + gts = [ + schemas.Annotation(labels=[label]), + schemas.Annotation(labels=[label]), + ] + + # valid + schemas.GroundTruth( + datum=datum, + annotations=gts, + ) + + # test `__post_init__` + with pytest.raises(TypeError): + schemas.GroundTruth( + datum="schemas.Datum", # type: ignore - testing + annotations=gts, + ) + with pytest.raises(TypeError): + schemas.GroundTruth( + datum=datum, + annotations=gts[0], # type: ignore - testing + ) + + with pytest.raises(TypeError): + schemas.GroundTruth( + datum=datum, + annotations=[gts[0], gts[1], "schemas.Annotation"], # type: ignore - testing + ) + + assert schemas.GroundTruth( + datum=datum, + annotations=gts, + ) == schemas.GroundTruth( + datum=datum, + annotations=gts, + ) + + +def test_prediction(): + scored_label = schemas.Label(key="test", value="value", score=1.0) + datum = schemas.Datum(uid="somefile") + pds = [ + 
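+ # two annotations carrying the same scored label; a Prediction bundles a list
+ # of annotations for a single datum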
schemas.Annotation( + labels=[scored_label], + ), + schemas.Annotation( + labels=[scored_label], + ), + ] + + schemas.Prediction(datum=datum, annotations=pds) + + # test `__post_init__` + with pytest.raises(TypeError): + schemas.Prediction(datum="schemas.Datum", annotations=pds) # type: ignore - testing + with pytest.raises(TypeError): + schemas.Prediction( + datum=datum, + annotations=pds[0], # type: ignore - testing + ) + + with pytest.raises(TypeError): + schemas.Prediction( + datum=datum, + annotations=[pds[0], pds[1], "schemas.Annotation"], # type: ignore - testing + ) + + assert schemas.Prediction( + datum=datum, annotations=pds + ) == schemas.Prediction(datum=datum, annotations=pds) + + +def test_EvaluationParameters(): + schemas.EvaluationParameters() + + schemas.EvaluationParameters( + iou_thresholds_to_compute=[0.2, 0.6], + iou_thresholds_to_return=[], + ) + + schemas.EvaluationParameters( + iou_thresholds_to_compute=[], + iou_thresholds_to_return=[], + ) + + # If no llm-guided metrics are requested, then llm_api_params is not required. + schemas.EvaluationParameters( + metrics_to_return=[ + enums.MetricType.AP, + enums.MetricType.AR, + ], + ) + + schemas.EvaluationParameters( + convert_annotations_to_type=enums.AnnotationType.BOX, + ) + + with pytest.raises(TypeError): + schemas.EvaluationParameters( + label_map=[ + [["class_name", "maine coon cat"], ["class", "cat"]], + [["class", "siamese cat"], ["class", "cat"]], + [["class", "british shorthair"], ["class", "cat"]], + ], # type: ignore + ) + + with pytest.raises(TypeError): + schemas.EvaluationParameters(label_map={"bad": "inputs"}) # type: ignore + + with pytest.raises(TypeError): + schemas.EvaluationParameters(metrics_to_return={"bad": "inputs"}) # type: ignore diff --git a/core/tests/unit-tests/test_utilities.py b/core/tests/unit-tests/test_utilities.py new file mode 100644 index 000000000..758ec1d94 --- /dev/null +++ b/core/tests/unit-tests/test_utilities.py @@ -0,0 +1,283 @@ +import pandas as pd +import pytest +from valor_core import enums +from valor_core.utilities import ( + create_validated_groundtruth_df, + create_validated_prediction_df, +) + + +def test_create_validated_groundtruth_df(): + + # test that the dataframe has the right columns + df = pd.DataFrame( + [ + { + "datum_uid": "uid0", + "datum_id": "img0", + "id": "gt0", + "label_key": "class_label", + "label_value": "dog", + }, + { + "datum_uid": "uid1", + "datum_id": "img1", + "id": "gt1", + "label_key": "class_label", + "label_value": "dog", + }, + { + "datum_uid": "uid2", + "datum_id": "img2", + "id": "gt2", + "label_key": "class_label", + "label_value": "dog", + }, + { + "datum_uid": "uid3", + "datum_id": "img3", + "id": "gt3", + "label_key": "class_label", + "label_value": "dog", + }, + { + "datum_uid": "uid4", + "datum_id": "img4", + "id": "gt4", + "label_key": "class_label", + "label_value": "dog", + }, + ] + ) + + with pytest.raises(ValueError): + create_validated_groundtruth_df( + df, task_type=enums.TaskType.CLASSIFICATION + ) + + # test that we get an error if we don't pass non-unique IDs + df = pd.DataFrame( + [ + { + "datum_uid": "uid0", + "datum_id": "img0", + "id": "gt0", + "label_key": "class_label", + "label_value": "dog", + "annotation_id": 1, + "label_id": 0, + }, + { + "datum_uid": "uid1", + "datum_id": "img1", + "id": "gt0", + "label_key": "class_label", + "annotation_id": 2, + "label_value": "dog", + "label_id": 0, + }, + ] + ) + + with pytest.raises(ValueError): + create_validated_groundtruth_df( + df, 
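+ # both rows in the frame above reuse id "gt0", which is what should trip the
+ # uniqueness check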
task_type=enums.TaskType.CLASSIFICATION + ) + + # test that groundtruth dataframes can't have scores + df = pd.DataFrame( + [ + { + "datum_uid": "uid0", + "datum_id": "img0", + "id": "gt0", + "label_key": "class_label", + "label_value": "dog", + "annotation_id": 1, + "label_id": 0, + "score": 0.99, + }, + { + "datum_uid": "uid1", + "datum_id": "img1", + "id": "gt1", + "label_key": "class_label", + "annotation_id": 2, + "label_value": "dog", + "label_id": 0, + "score": 0.01, + }, + ] + ) + + with pytest.raises(ValueError): + create_validated_groundtruth_df( + df, task_type=enums.TaskType.CLASSIFICATION + ) + + # test correct example + df = pd.DataFrame( + [ + { + "datum_uid": "uid0", + "datum_id": "img0", + "id": "gt0", + "label_key": "class_label", + "label_value": "dog", + "annotation_id": 1, + "label_id": 0, + }, + { + "datum_uid": "uid1", + "datum_id": "img1", + "id": "gt1", + "label_key": "class_label", + "annotation_id": 2, + "label_value": "dog", + "label_id": 0, + }, + ] + ) + + create_validated_groundtruth_df( + df, task_type=enums.TaskType.CLASSIFICATION + ) + + +def test_create_validated_prediction_df(): + + # test that the dataframe has the right columns + df = pd.DataFrame( + [ + { + "datum_uid": "uid0", + "datum_id": "img0", + "id": "pd0", + "label_key": "class_label", + "label_value": "dog", + }, + { + "datum_uid": "uid1", + "datum_id": "img1", + "id": "pd1", + "label_key": "class_label", + "label_value": "dog", + }, + { + "datum_uid": "uid2", + "datum_id": "img2", + "id": "pd2", + "label_key": "class_label", + "label_value": "dog", + }, + { + "datum_uid": "uid3", + "datum_id": "img3", + "id": "pd3", + "label_key": "class_label", + "label_value": "dog", + }, + { + "datum_uid": "uid4", + "datum_id": "img4", + "id": "pd4", + "label_key": "class_label", + "label_value": "dog", + }, + ] + ) + + with pytest.raises(ValueError): + create_validated_prediction_df( + df, task_type=enums.TaskType.CLASSIFICATION + ) + + # test that we get an error if we don't pass non-unique IDs + df = pd.DataFrame( + [ + { + "datum_uid": "uid0", + "datum_id": "img0", + "id": "pd0", + "label_key": "class_label", + "label_value": "dog", + "annotation_id": 1, + "label_id": 0, + "score": 0.08, + }, + { + "datum_uid": "uid1", + "datum_id": "img1", + "id": "pd0", + "label_key": "class_label", + "annotation_id": 2, + "label_value": "cat", + "label_id": 0, + "score": 0.92, + }, + ] + ) + + with pytest.raises(ValueError): + create_validated_prediction_df( + df, task_type=enums.TaskType.CLASSIFICATION + ) + + # test that we get an error if the prediction scores for a given label key and datum don't add up to 1 + df = pd.DataFrame( + [ + { + "datum_uid": "uid0", + "datum_id": "img0", + "id": "pd0", + "label_key": "class_label", + "label_value": "dog", + "annotation_id": 1, + "label_id": 0, + "score": 0.04, + }, + { + "datum_uid": "uid0", + "datum_id": "img0", + "id": "pd1", + "label_key": "class_label", + "annotation_id": 2, + "label_value": "cat", + "label_id": 0, + "score": 0.92, + }, + ] + ) + + with pytest.raises(ValueError): + create_validated_prediction_df( + df, task_type=enums.TaskType.CLASSIFICATION + ) + + # test correct example + df = pd.DataFrame( + [ + { + "datum_uid": "uid0", + "datum_id": "img0", + "id": "pd0", + "label_key": "class_label", + "label_value": "dog", + "annotation_id": 1, + "label_id": 0, + "score": 0.08, + }, + { + "datum_uid": "uid0", + "datum_id": "img0", + "id": "pd1", + "label_key": "class_label", + "annotation_id": 2, + "label_value": "cat", + "label_id": 0, + "score": 0.92, + }, 
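+ # scores for datum "uid0" under key "class_label" sum to 1.0 (0.08 + 0.92),
+ # so this frame should validate without raising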
+ ] + ) + + create_validated_prediction_df(df, task_type=enums.TaskType.CLASSIFICATION) diff --git a/core/valor_core/__init__.py b/core/valor_core/__init__.py new file mode 100644 index 000000000..21d64ce96 --- /dev/null +++ b/core/valor_core/__init__.py @@ -0,0 +1,69 @@ +from .classification import evaluate_classification +from .detection import evaluate_detection +from .managers import ValorDetectionManager +from .metrics import ( + AccuracyMetric, + APMetric, + APMetricAveragedOverIOUs, + ARMetric, + ConfusionMatrix, + ConfusionMatrixEntry, + DetailedPrecisionRecallCurve, + F1Metric, + PrecisionMetric, + PrecisionRecallCurve, + RecallMetric, + ROCAUCMetric, + mAPMetric, + mAPMetricAveragedOverIOUs, + mARMetric, +) +from .schemas import ( + Annotation, + Box, + Datum, + Evaluation, + GroundTruth, + Label, + LineString, + MultiLineString, + MultiPoint, + Point, + Polygon, + Prediction, + Raster, +) + +__all__ = [ + "ValorDetectionManager", + "evaluate_classification", + "evaluate_detection", + "Annotation", + "Datum", + "GroundTruth", + "Prediction", + "Label", + "Point", + "Polygon", + "Evaluation", + "Raster", + "AccuracyMetric", + "ConfusionMatrix", + "F1Metric", + "PrecisionMetric", + "RecallMetric", + "ROCAUCMetric", + "PrecisionRecallCurve", + "DetailedPrecisionRecallCurve", + "APMetric", + "ARMetric", + "mARMetric", + "APMetricAveragedOverIOUs", + "MultiPoint", + "LineString", + "MultiLineString", + "Box", + "mAPMetric", + "mAPMetricAveragedOverIOUs", + "ConfusionMatrixEntry", +] diff --git a/core/valor_core/classification.py b/core/valor_core/classification.py new file mode 100644 index 000000000..377a5b305 --- /dev/null +++ b/core/valor_core/classification.py @@ -0,0 +1,1324 @@ +import gc +import time +from collections import defaultdict + +import numpy as np +import pandas as pd +from valor_core import enums, metrics, schemas, utilities + + +def _calculate_confusion_matrix_df( + merged_groundtruths_and_predictions_df: pd.DataFrame, +) -> tuple[pd.DataFrame, list[metrics.ConfusionMatrix]]: + """Calculate our confusion matrix dataframe.""" + + cm_counts_df = ( + merged_groundtruths_and_predictions_df[ + ["label_key", "pd_label_value", "gt_label_value"] + ] + .groupby( + ["label_key", "pd_label_value", "gt_label_value"], + as_index=False, + dropna=False, + ) + .size() + ) + + cm_counts_df["true_positive_flag"] = ( + cm_counts_df["pd_label_value"] == cm_counts_df["gt_label_value"] + ) + + # resolve pandas typing error + if not isinstance(cm_counts_df, pd.DataFrame): + raise TypeError( + f"Expected a pd.DataFrame, but got {type(cm_counts_df)}" + ) + + # count of predictions per grouper key + cm_counts_df = cm_counts_df.merge( + cm_counts_df.groupby( + ["label_key", "pd_label_value"], + as_index=False, + dropna=False, + ) + .size() + .rename({"size": "number_of_predictions"}, axis=1), + on=["label_key", "pd_label_value"], + ) + + # count of groundtruths per grouper key + cm_counts_df = cm_counts_df.merge( + cm_counts_df.groupby( + ["label_key", "gt_label_value"], + as_index=False, + dropna=False, + ) + .size() + .rename({"size": "number_of_groundtruths"}, axis=1), + ) + + cm_counts_df = cm_counts_df.merge( + cm_counts_df[ + [ + "label_key", + "pd_label_value", + "true_positive_flag", + ] + ] + .groupby( + ["label_key", "pd_label_value"], + as_index=False, + dropna=False, + ) + .sum() + .rename( + columns={"true_positive_flag": "true_positives_per_pd_label_value"} + ), + on=["label_key", "pd_label_value"], + ) + + cm_counts_df = cm_counts_df.merge( + cm_counts_df[["label_key", 
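+ # mirror of the merge above: per (label_key, gt_label_value) this sums the
+ # true-positive flags into true_positives_per_gt_label_value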
"gt_label_value", "true_positive_flag"]] + .groupby( + ["label_key", "gt_label_value"], + as_index=False, + dropna=False, + ) + .sum() + .rename( + columns={"true_positive_flag": "true_positives_per_gt_label_value"} + ), + on=["label_key", "gt_label_value"], + ) + + cm_counts_df = cm_counts_df.merge( + cm_counts_df[["label_key", "true_positive_flag"]] + .groupby("label_key", as_index=False, dropna=False) + .sum() + .rename( + columns={"true_positive_flag": "true_positives_per_label_key"} + ), + on="label_key", + ) + + # create ConfusionMatrix objects + confusion_matrices = [] + for label_key in cm_counts_df.loc[:, "label_key"].unique(): + revelant_rows = cm_counts_df.loc[ + (cm_counts_df["label_key"] == label_key) + & cm_counts_df["gt_label_value"].notnull() + ] + relevant_confusion_matrices = metrics.ConfusionMatrix( + label_key=label_key, + entries=[ + metrics.ConfusionMatrixEntry( + prediction=row["pd_label_value"], + groundtruth=row["gt_label_value"], + count=row["size"], + ) + for row in revelant_rows.to_dict(orient="records") + if isinstance(row["pd_label_value"], str) + and isinstance(row["gt_label_value"], str) + ], + ) + confusion_matrices.append(relevant_confusion_matrices) + + return cm_counts_df, confusion_matrices + + +def _calculate_metrics_at_label_value_level( + cm_counts_df: pd.DataFrame, +) -> pd.DataFrame: + """Calculate metrics using the confusion matix dataframe.""" + + # create base dataframe that's unique at the (grouper key, grouper value level) + unique_label_values_per_label_key_df = pd.DataFrame( + np.concatenate( + [ + cm_counts_df[["label_key", "pd_label_value"]].values, + cm_counts_df.loc[ + cm_counts_df["gt_label_value"].notnull(), + ["label_key", "gt_label_value"], + ].values, + ] + ), + columns=[ + "label_key", + "label_value", + ], + ).drop_duplicates() + + # compute metrics using confusion matrices + metrics_per_label_key_and_label_value_df = ( + unique_label_values_per_label_key_df.assign( + number_true_positives=lambda df: df.apply( + lambda chain_df: ( + cm_counts_df[ + ( + cm_counts_df["gt_label_value"] + == chain_df["label_value"] + ) + & (cm_counts_df["label_key"] == chain_df["label_key"]) + & (cm_counts_df["true_positive_flag"]) + ]["size"].sum() + ), + axis=1, + ) + ) + .assign( + number_of_groundtruths=unique_label_values_per_label_key_df.apply( + lambda chain_df: ( + cm_counts_df[ + ( + cm_counts_df["gt_label_value"] + == chain_df["label_value"] + ) + & (cm_counts_df["label_key"] == chain_df["label_key"]) + ]["size"].sum() + ), + axis=1, + ) + ) + .assign( + number_of_predictions=unique_label_values_per_label_key_df.apply( + lambda chain_df: ( + cm_counts_df[ + ( + cm_counts_df["pd_label_value"] + == chain_df["label_value"] + ) + & (cm_counts_df["label_key"] == chain_df["label_key"]) + ]["size"].sum() + ), + axis=1, + ) + ) + .assign( + precision=lambda chain_df: chain_df["number_true_positives"] + / chain_df["number_of_predictions"] + ) + .assign( + recall=lambda chain_df: chain_df["number_true_positives"] + / chain_df["number_of_groundtruths"] + ) + .assign( + f1=lambda chain_df: ( + 2 * chain_df["precision"] * chain_df["recall"] + ) + / (chain_df["precision"] + chain_df["recall"]) + ) + ) + + # replace nulls and infinities + metrics_per_label_key_and_label_value_df[ + ["precision", "recall", "f1"] + ] = metrics_per_label_key_and_label_value_df.loc[ + :, ["precision", "recall", "f1"] + ].replace( + [np.inf, -np.inf, np.nan], 0 + ) + + # replace values of labels that only exist in predictions (not groundtruths) with -1 + labels_to_replace = 
cm_counts_df.loc[ + cm_counts_df["gt_label_value"].isnull(), + ["label_key", "pd_label_value"], + ].values.tolist() + + for key, value in labels_to_replace: + metrics_per_label_key_and_label_value_df.loc[ + (metrics_per_label_key_and_label_value_df["label_key"] == key) + & ( + metrics_per_label_key_and_label_value_df["label_value"] + == value + ), + ["precision", "recall", "f1"], + ] = -1 + + return metrics_per_label_key_and_label_value_df + + +def _calculate_precision_recall_f1_metrics( + metrics_per_label_key_and_label_value_df: pd.DataFrame, +) -> list[metrics.PrecisionMetric | metrics.RecallMetric | metrics.F1Metric]: + """Calculate Precision, Recall, and F1 metrics.""" + # create metric objects + output = [] + + for row in metrics_per_label_key_and_label_value_df.loc[ + ~metrics_per_label_key_and_label_value_df["label_value"].isnull(), + ["label_key", "label_value", "precision", "recall", "f1"], + ].to_dict(orient="records"): + pydantic_label = schemas.Label( + key=row["label_key"], value=row["label_value"] + ) + + output += [ + metrics.PrecisionMetric( + label=pydantic_label, + value=row["precision"], + ), + metrics.RecallMetric( + label=pydantic_label, + value=row["recall"], + ), + metrics.F1Metric( + label=pydantic_label, + value=row["f1"], + ), + ] + return output + + +def _calculate_accuracy_metrics( + cm_counts_df: pd.DataFrame, +) -> list[metrics.AccuracyMetric]: + """Calculate Accuracy metrics.""" + accuracy_calculations = ( + cm_counts_df.loc[ + ( + cm_counts_df["gt_label_value"].notnull() + & cm_counts_df["true_positive_flag"] + ), + ["label_key", "size"], + ] + .groupby(["label_key"], as_index=False) + .sum() + .rename({"size": "true_positives_per_label_key"}, axis=1) + ).merge( + cm_counts_df.loc[ + (cm_counts_df["gt_label_value"].notnull()), + ["label_key", "size"], + ] + .groupby(["label_key"], as_index=False) + .sum() + .rename({"size": "observations_per_label_key"}, axis=1), + on="label_key", + how="outer", + ) + + accuracy_calculations["accuracy"] = ( + accuracy_calculations["true_positives_per_label_key"] + / accuracy_calculations["observations_per_label_key"] + ) + + # some elements may be np.nan if a given grouper key has no true positives + # replace those accuracy scores with 0 + accuracy_calculations["accuracy"] = accuracy_calculations[ + "accuracy" + ].fillna(value=0) + + return [ + metrics.AccuracyMetric( + label_key=values["label_key"], value=values["accuracy"] + ) + for _, values in accuracy_calculations.iterrows() + ] + + +def _get_joint_df( + prediction_df: pd.DataFrame, groundtruth_df: pd.DataFrame +) -> pd.DataFrame: + """Merge the ground truth and prediction dataframes into one, joint dataframe.""" + max_scores_by_label_key_and_datum_id = ( + prediction_df[["label_key", "datum_id", "score"]] + .groupby( + [ + "label_key", + "datum_id", + ], + as_index=False, + ) + .max() + ) + + # catch pandas typing error + if not isinstance(prediction_df, pd.DataFrame) or not isinstance( + max_scores_by_label_key_and_datum_id, pd.DataFrame + ): + raise ValueError( + "prediction_df and max_scores_by_label_key_and_datum_id must be pandas Dataframes." 
+ ) + + best_prediction_id_per_label_key_and_datum_id = ( + pd.merge( + prediction_df, + max_scores_by_label_key_and_datum_id, + on=["label_key", "datum_id", "score"], + how="inner", + )[["label_key", "datum_id", "id", "score"]] + .groupby(["label_key", "datum_id"], as_index=False) + .min() + .rename(columns={"score": "best_score"}) + ) + + best_prediction_label_for_each_label_key_and_datum = pd.merge( + prediction_df[["label_key", "label_value", "datum_id", "id"]], + best_prediction_id_per_label_key_and_datum_id, + on=["label_key", "datum_id", "id"], + how="inner", + )[["label_key", "datum_id", "label_value", "best_score"]] + + # count the number of matches for each (pd_label_value, gt_label_value) for each label_key + merged_groundtruths_and_predictions_df = pd.merge( + groundtruth_df[["datum_id", "label_key", "label_value"]].rename( + columns={"label_value": "gt_label_value"} + ), + best_prediction_label_for_each_label_key_and_datum.rename( + columns={"label_value": "pd_label_value"} + ), + on=["datum_id", "label_key"], + how="left", + ) + + # add back any labels that appear in predictions but not groundtruths + missing_labels_from_predictions = list( + set( + zip( + [None] * len(prediction_df), + prediction_df["label_key"], + [None] * len(prediction_df), + prediction_df["label_value"], + [None] * len(prediction_df), + ) + ).difference( + set( + zip( + [None] * len(merged_groundtruths_and_predictions_df), + merged_groundtruths_and_predictions_df["label_key"], + [None] * len(merged_groundtruths_and_predictions_df), + merged_groundtruths_and_predictions_df["pd_label_value"], + [None] * len(prediction_df), + ) + ).union( + set( + zip( + [None] * len(merged_groundtruths_and_predictions_df), + merged_groundtruths_and_predictions_df["label_key"], + [None] * len(merged_groundtruths_and_predictions_df), + merged_groundtruths_and_predictions_df[ + "gt_label_value" + ], + [None] * len(prediction_df), + ) + ) + ) + ) + ) + + missing_label_df = pd.DataFrame( + missing_labels_from_predictions, + columns=merged_groundtruths_and_predictions_df.columns, + ) + + merged_groundtruths_and_predictions_df = ( + merged_groundtruths_and_predictions_df.copy() + if missing_label_df.empty + else ( + missing_label_df.copy() + if merged_groundtruths_and_predictions_df.empty + else pd.concat( + [ + merged_groundtruths_and_predictions_df, + missing_label_df, + ], + ignore_index=True, + ) + ) + ) + + return merged_groundtruths_and_predictions_df + + +def _calculate_rocauc( + prediction_df: pd.DataFrame, groundtruth_df: pd.DataFrame +) -> list[metrics.ROCAUCMetric]: + """Calculate ROC AUC metrics.""" + # if there are no predictions, then ROCAUC should be 0 for all groundtruth grouper keys + if prediction_df.empty: + return [ + metrics.ROCAUCMetric(label_key=label_key, value=float(0)) + for label_key in groundtruth_df["label_key"].unique() + ] + + merged_predictions_and_groundtruths = ( + prediction_df[["datum_id", "label_key", "label_value", "score"]] + .merge( + groundtruth_df[["datum_id", "label_key", "label_value"]].rename( + columns={ + "label_value": "gt_label_value", + } + ), + on=["datum_id", "label_key"], + how="left", + ) + .assign( + is_true_positive=lambda chain_df: chain_df["label_value"] + == chain_df["gt_label_value"], + ) + .assign( + is_false_positive=lambda chain_df: chain_df["label_value"] + != chain_df["gt_label_value"], + ) + ).sort_values( + by=["score", "label_key", "gt_label_value"], + ascending=[False, False, True], + ) + + # count the number of observations (i.e., predictions) and true 
positives for each grouper key + total_observations_per_label_key_and_label_value = ( + merged_predictions_and_groundtruths.groupby( + ["label_key", "label_value"], as_index=False + )["gt_label_value"] + .size() + .rename({"size": "n"}, axis=1) + ) + + total_true_positves_per_label_key_and_label_value = ( + merged_predictions_and_groundtruths.loc[ + merged_predictions_and_groundtruths["is_true_positive"], : + ] + .groupby(["label_key", "label_value"], as_index=False)[ + "gt_label_value" + ] + .size() + .rename({"size": "n_true_positives"}, axis=1) + ) + + merged_counts = merged_predictions_and_groundtruths.merge( + total_observations_per_label_key_and_label_value, + on=["label_key", "label_value"], + how="left", + ).merge( + total_true_positves_per_label_key_and_label_value, + on=["label_key", "label_value"], + how="left", + ) + + cumulative_sums = ( + merged_counts[ + [ + "label_key", + "label_value", + "is_true_positive", + "is_false_positive", + ] + ] + .groupby(["label_key", "label_value"], as_index=False) + .cumsum() + ).rename( + columns={ + "is_true_positive": "cum_true_positive_cnt", + "is_false_positive": "cum_false_positive_cnt", + } + ) + + rates = pd.concat([merged_counts, cumulative_sums], axis=1) + + # correct cumulative sums to be the max value for a given datum_id / label_key / label_value (this logic brings pandas' cumsum logic in line with psql's sum().over()) + max_cum_sums = ( + rates.groupby(["label_key", "label_value", "score"], as_index=False)[ + ["cum_true_positive_cnt", "cum_false_positive_cnt"] + ] + .max() + .rename( + columns={ + "cum_true_positive_cnt": "max_cum_true_positive_cnt", + "cum_false_positive_cnt": "max_cum_false_positive_cnt", + } + ) + ) + rates = rates.merge(max_cum_sums, on=["label_key", "label_value", "score"]) + rates["cum_true_positive_cnt"] = rates[ + ["cum_true_positive_cnt", "max_cum_true_positive_cnt"] + ].max(axis=1) + rates["cum_false_positive_cnt"] = rates[ + ["cum_false_positive_cnt", "max_cum_false_positive_cnt"] + ].max(axis=1) + + # calculate tpr and fpr + rates = rates.assign( + tpr=lambda chain_df: chain_df["cum_true_positive_cnt"] + / chain_df["n_true_positives"] + ).assign( + fpr=lambda chain_df: chain_df["cum_false_positive_cnt"] + / (chain_df["n"] - chain_df["n_true_positives"]) + ) + + # sum trapezoidal areas by grouper key and grouper value + trap_areas_per_label_value = pd.concat( + [ + rates[ + [ + "label_key", + "label_value", + "n", + "n_true_positives", + "tpr", + "fpr", + ] + ], + rates.groupby(["label_key", "label_value"], as_index=False)[ + ["tpr", "fpr"] + ] + .shift(1) + .rename(columns={"tpr": "lagged_tpr", "fpr": "lagged_fpr"}), + ], + axis=1, + ).assign( + trap_area=lambda chain_df: 0.5 + * ( + (chain_df["tpr"] + chain_df["lagged_tpr"]) + * (chain_df["fpr"] - chain_df["lagged_fpr"]) + ) + ) + + summed_trap_areas_per_label_value = trap_areas_per_label_value.groupby( + ["label_key", "label_value"], as_index=False + )[["n", "n_true_positives", "trap_area"]].sum(min_count=1) + + # replace values if specific conditions are met + summed_trap_areas_per_label_value = ( + summed_trap_areas_per_label_value.assign( + trap_area=lambda chain_df: np.select( + [ + chain_df["n_true_positives"].isnull(), + ((chain_df["n"] - chain_df["n_true_positives"]) == 0), + ], + [1, 1], + default=chain_df["trap_area"], + ) + ) + ) + + # take the average across grouper keys + average_across_label_keys = summed_trap_areas_per_label_value.groupby( + "label_key", as_index=False + )["trap_area"].mean() + + return [ + metrics.ROCAUCMetric( + 
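+ # quick sanity check of the trapezoid sum above (illustrative numbers only):
+ # for ROC points (fpr, tpr) of (0, 0) -> (0, 1) -> (1, 1) the summed area is
+ # 0.5 * (0 + 1) * (0 - 0) + 0.5 * (1 + 1) * (1 - 0) = 1.0, i.e. AUC = 1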
label_key=values["label_key"], value=values["trap_area"] + ) + for _, values in average_across_label_keys.iterrows() + ] + + +def _add_samples_to_dataframe( + pr_curve_counts_df: pd.DataFrame, + pr_calc_df: pd.DataFrame, + max_examples: int, + flag_column: str, +) -> pd.DataFrame: + """Efficienctly gather samples for a given flag.""" + + sample_df = pd.concat( + [ + pr_calc_df[pr_calc_df[flag_column]] + .groupby( + [ + "label_key", + "label_value_gt", + "confidence_threshold", + ], + as_index=False, + )[["datum_uid"]] + .agg(lambda x: tuple(x.head(max_examples))) + .rename(columns={"label_value_gt": "label_value"}), + pr_calc_df[pr_calc_df[flag_column]] + .groupby( + [ + "label_key", + "label_value_pd", + "confidence_threshold", + ], + as_index=False, + )[["datum_uid"]] + .agg(lambda x: tuple(x.head(max_examples))) + .rename(columns={"label_value_pd": "label_value"}), + ], + axis=0, + ).drop_duplicates() + + if not sample_df.empty: + sample_df[f"{flag_column}_samples"] = sample_df.apply( + lambda row: set(zip(*row[["datum_uid"]])), # type: ignore - pandas typing error + axis=1, + ) + + pr_curve_counts_df = pr_curve_counts_df.merge( + sample_df[ + [ + "label_key", + "label_value", + "confidence_threshold", + f"{flag_column}_samples", + ] + ], + on=["label_key", "label_value", "confidence_threshold"], + how="outer", + ) + pr_curve_counts_df[f"{flag_column}_samples"] = pr_curve_counts_df[ + f"{flag_column}_samples" + ].apply(lambda x: list(x) if isinstance(x, set) else list()) + + else: + pr_curve_counts_df[f"{flag_column}_samples"] = [ + list() for _ in range(len(pr_curve_counts_df)) + ] + + return pr_curve_counts_df + + +def _calculate_pr_curves( + prediction_df: pd.DataFrame, + groundtruth_df: pd.DataFrame, + metrics_to_return: list, + pr_curve_max_examples: int, +) -> list[metrics.PrecisionRecallCurve]: + """Calculate PrecisionRecallCurve metrics.""" + + if not ( + enums.MetricType.PrecisionRecallCurve in metrics_to_return + or enums.MetricType.DetailedPrecisionRecallCurve in metrics_to_return + ): + return [] + + joint_df = ( + pd.merge( + groundtruth_df, + prediction_df, + on=["datum_id", "datum_uid", "label_key"], + how="inner", + suffixes=("_gt", "_pd"), + ).assign( + is_label_match=lambda chain_df: ( + (chain_df["label_value_pd"] == chain_df["label_value_gt"]) + ) + ) + # only keep the columns we need + .loc[ + :, + [ + "datum_uid", + "datum_id", + "label_key", + "label_value_gt", + "id_gt", + "label_value_pd", + "score", + "id_pd", + "is_label_match", + ], + ] + ) + + # free up memory + del groundtruth_df + del prediction_df + gc.collect() + + # add confidence_threshold to the dataframe and sort + pr_calc_df = pd.concat( + [ + joint_df.assign(confidence_threshold=threshold) + for threshold in [x / 100 for x in range(5, 100, 5)] + ], + ignore_index=True, + ).sort_values( + by=[ + "label_key", + "label_value_pd", + "confidence_threshold", + "score", + ], + ascending=False, + ) + + # create flags where the predictions meet criteria + pr_calc_df["true_positive_flag"] = ( + pr_calc_df["score"] >= pr_calc_df["confidence_threshold"] + ) & pr_calc_df["is_label_match"] + + # for all the false positives, we consider them to be a misclassification if they share a key but not a value with a gt + pr_calc_df["misclassification_false_positive_flag"] = ( + pr_calc_df["score"] >= pr_calc_df["confidence_threshold"] + ) & ~pr_calc_df["is_label_match"] + + # next, we flag false negatives by declaring any groundtruth that isn't associated with a true positive to be a false negative + 
groundtruths_associated_with_true_positives = ( + pr_calc_df[pr_calc_df["true_positive_flag"]] + .groupby(["confidence_threshold"], as_index=False)["id_gt"] + .unique() + ) + + if not groundtruths_associated_with_true_positives.empty: + confidence_interval_to_true_positive_groundtruth_ids_dict = ( + groundtruths_associated_with_true_positives.set_index( + "confidence_threshold" + )["id_gt"] + .apply(set) + .to_dict() + ) + + mask = pd.Series(False, index=pr_calc_df.index) + + for ( + threshold, + elements, + ) in confidence_interval_to_true_positive_groundtruth_ids_dict.items(): + threshold_mask = pr_calc_df["confidence_threshold"] == threshold + membership_mask = pr_calc_df["id_gt"].isin(elements) + mask |= threshold_mask & membership_mask + + pr_calc_df["false_negative_flag"] = ~mask + + else: + pr_calc_df["false_negative_flag"] = False + + # it's a misclassification if there is a corresponding misclassification false positive + pr_calc_df["misclassification_false_negative_flag"] = ( + pr_calc_df["misclassification_false_positive_flag"] + & pr_calc_df["false_negative_flag"] + ) + + # assign all id_gts that aren't misclassifications but are false negatives to be no_predictions + groundtruths_associated_with_misclassification_false_negatives = ( + pr_calc_df[pr_calc_df["misclassification_false_negative_flag"]] + .groupby(["confidence_threshold"], as_index=False)["id_gt"] + .unique() + ) + + if ( + not groundtruths_associated_with_misclassification_false_negatives.empty + ): + confidence_interval_to_misclassification_fn_groundtruth_ids_dict = ( + groundtruths_associated_with_misclassification_false_negatives.set_index( + "confidence_threshold" + )[ + "id_gt" + ] + .apply(set) + .to_dict() + ) + + mask = pd.Series(False, index=pr_calc_df.index) + + for ( + threshold, + elements, + ) in ( + confidence_interval_to_misclassification_fn_groundtruth_ids_dict.items() + ): + threshold_mask = pr_calc_df["confidence_threshold"] == threshold + membership_mask = ~pr_calc_df["id_gt"].isin(elements) + mask |= threshold_mask & membership_mask + + pr_calc_df["no_predictions_false_negative_flag"] = ( + mask & pr_calc_df["false_negative_flag"] + ) + + else: + pr_calc_df["no_predictions_false_negative_flag"] = pr_calc_df[ + "false_negative_flag" + ] + + # true negatives are any rows which don't have another flag + pr_calc_df["true_negative_flag"] = ( + ~pr_calc_df["true_positive_flag"] + & ~pr_calc_df["false_negative_flag"] + & ~pr_calc_df["misclassification_false_positive_flag"] + ) + + # next, we sum up the occurences of each classification and merge them together into one dataframe + true_positives = ( + pr_calc_df[pr_calc_df["true_positive_flag"]] + .groupby(["label_key", "label_value_pd", "confidence_threshold"])[ + "id_pd" + ] + .nunique() + ) + true_positives.name = "true_positives" + + misclassification_false_positives = ( + pr_calc_df[pr_calc_df["misclassification_false_positive_flag"]] + .groupby(["label_key", "label_value_pd", "confidence_threshold"])[ + "id_pd" + ] + .nunique() + ) + misclassification_false_positives.name = ( + "misclassification_false_positives" + ) + + misclassification_false_negatives = ( + pr_calc_df[pr_calc_df["misclassification_false_negative_flag"]] + .groupby(["label_key", "label_value_gt", "confidence_threshold"])[ + "id_gt" + ] + .nunique() + ) + misclassification_false_negatives.name = ( + "misclassification_false_negatives" + ) + + no_predictions_false_negatives = ( + pr_calc_df[pr_calc_df["no_predictions_false_negative_flag"]] + .groupby(["label_key", 
"label_value_gt", "confidence_threshold"])[ + "id_gt" + ] + .nunique() + ) + no_predictions_false_negatives.name = "no_predictions_false_negatives" + + # combine these outputs + pr_curve_counts_df = ( + pd.concat( + [ + pr_calc_df.loc[ + ~pr_calc_df["label_value_pd"].isnull(), + [ + "label_key", + "label_value_pd", + "confidence_threshold", + ], + ].rename(columns={"label_value_pd": "label_value"}), + pr_calc_df.loc[ + ~pr_calc_df["label_value_gt"].isnull(), + [ + "label_key", + "label_value_gt", + "confidence_threshold", + ], + ].rename(columns={"label_value_gt": "label_value"}), + ], + axis=0, + ) + .drop_duplicates() + .merge( + true_positives, + left_on=[ + "label_key", + "label_value", + "confidence_threshold", + ], + right_index=True, + how="outer", + ) + .merge( + misclassification_false_positives, + left_on=[ + "label_key", + "label_value", + "confidence_threshold", + ], + right_index=True, + how="outer", + ) + .merge( + misclassification_false_negatives, + left_on=[ + "label_key", + "label_value", + "confidence_threshold", + ], + right_index=True, + how="outer", + ) + .merge( + no_predictions_false_negatives, + left_on=[ + "label_key", + "label_value", + "confidence_threshold", + ], + right_index=True, + how="outer", + ) + ) + + # we're doing an outer join, so any nulls should be zeroes + pr_curve_counts_df.fillna(0, inplace=True) + + # find all unique datums for use when identifying true negatives + unique_datum_ids = set(pr_calc_df["datum_id"].unique()) + + # calculate additional metrics + pr_curve_counts_df["false_positives"] = pr_curve_counts_df[ + "misclassification_false_positives" + ] # we don't have any hallucinations for classification + pr_curve_counts_df["false_negatives"] = ( + pr_curve_counts_df["misclassification_false_negatives"] + + pr_curve_counts_df["no_predictions_false_negatives"] + ) + pr_curve_counts_df["true_negatives"] = len(unique_datum_ids) - ( + pr_curve_counts_df["true_positives"] + + pr_curve_counts_df["false_positives"] + + pr_curve_counts_df["false_negatives"] + ) + pr_curve_counts_df["precision"] = pr_curve_counts_df["true_positives"] / ( + pr_curve_counts_df["true_positives"] + + pr_curve_counts_df["false_positives"] + ) + pr_curve_counts_df["recall"] = pr_curve_counts_df["true_positives"] / ( + pr_curve_counts_df["true_positives"] + + pr_curve_counts_df["false_negatives"] + ) + pr_curve_counts_df["accuracy"] = ( + pr_curve_counts_df["true_positives"] + + pr_curve_counts_df["true_negatives"] + ) / len(unique_datum_ids) + pr_curve_counts_df["f1_score"] = ( + 2 * pr_curve_counts_df["precision"] * pr_curve_counts_df["recall"] + ) / (pr_curve_counts_df["precision"] + pr_curve_counts_df["recall"]) + + # any NaNs that are left are from division by zero errors + pr_curve_counts_df.fillna(-1, inplace=True) + + pr_output = defaultdict(lambda: defaultdict(dict)) + detailed_pr_output = defaultdict(lambda: defaultdict(dict)) + + # add samples to the dataframe for DetailedPrecisionRecallCurves + if enums.MetricType.DetailedPrecisionRecallCurve in metrics_to_return: + for flag in [ + "true_positive_flag", + "true_negative_flag", + "misclassification_false_negative_flag", + "no_predictions_false_negative_flag", + "misclassification_false_positive_flag", + ]: + pr_curve_counts_df = _add_samples_to_dataframe( + pr_curve_counts_df=pr_curve_counts_df, + pr_calc_df=pr_calc_df, + max_examples=pr_curve_max_examples, + flag_column=flag, + ) + + for _, row in pr_curve_counts_df.iterrows(): + pr_output[row["label_key"]][row["label_value"]][ + row["confidence_threshold"] + 
] = { + "tp": row["true_positives"], + "fp": row["false_positives"], + "fn": row["false_negatives"], + "tn": row["true_negatives"], + "accuracy": row["accuracy"], + "precision": row["precision"], + "recall": row["recall"], + "f1_score": row["f1_score"], + } + + if enums.MetricType.DetailedPrecisionRecallCurve in metrics_to_return: + detailed_pr_output[row["label_key"]][row["label_value"]][ + row["confidence_threshold"] + ] = { + "tp": { + "total": row["true_positives"], + "observations": { + "all": { + "count": row["true_positives"], + "examples": row["true_positive_flag_samples"], + } + }, + }, + "tn": { + "total": row["true_negatives"], + "observations": { + "all": { + "count": row["true_negatives"], + "examples": row["true_negative_flag_samples"], + } + }, + }, + "fn": { + "total": row["false_negatives"], + "observations": { + "misclassifications": { + "count": row["misclassification_false_negatives"], + "examples": row[ + "misclassification_false_negative_flag_samples" + ], + }, + "no_predictions": { + "count": row["no_predictions_false_negatives"], + "examples": row[ + "no_predictions_false_negative_flag_samples" + ], + }, + }, + }, + "fp": { + "total": row["false_positives"], + "observations": { + "misclassifications": { + "count": row["misclassification_false_positives"], + "examples": row[ + "misclassification_false_positive_flag_samples" + ], + }, + }, + }, + } + + output = [] + + if enums.MetricType.PrecisionRecallCurve in metrics_to_return: + output += [ + metrics.PrecisionRecallCurve( + label_key=key, value=dict(value), pr_curve_iou_threshold=None + ) + for key, value in pr_output.items() + ] + + if enums.MetricType.DetailedPrecisionRecallCurve in metrics_to_return: + output += [ + metrics.DetailedPrecisionRecallCurve( + label_key=key, value=dict(value), pr_curve_iou_threshold=None + ) + for key, value in detailed_pr_output.items() + ] + + return output + + +def _compute_clf_metrics( + groundtruth_df: pd.DataFrame, + prediction_df: pd.DataFrame, + metrics_to_return: list[enums.MetricType] | None = None, + pr_curve_max_examples: int = 1, +) -> tuple[list[dict], list[dict]]: + """ + Compute classification metrics including confusion matrices and various performance metrics. + + Parameters + ---------- + groundtruth_df : pd.DataFrame + DataFrame containing ground truth annotations with necessary columns. + prediction_df : pd.DataFrame + DataFrame containing predictions with necessary columns. + metrics_to_return : list[enums.MetricType], optional + list of metric types to return. If None, default metrics are used. + pr_curve_max_examples : int + Maximum number of examples to use for Precision-Recall curve calculations. + + Returns + ------- + tuple[list[dict], list[dict]] + A tuple where: + - The first element is a list of dictionaries representing confusion matrices. + - The second element is a list of dictionaries representing the requested classification metrics. 
+    """
+
+    # add label as a column
+    for df in (groundtruth_df, prediction_df):
+        df.loc[:, "label"] = df.apply(
+            lambda chain_df: (chain_df["label_key"], chain_df["label_value"]),
+            axis=1,
+        )
+
+    confusion_matrices, metrics_to_output = [], []
+
+    merged_groundtruths_and_predictions_df = _get_joint_df(
+        prediction_df=prediction_df, groundtruth_df=groundtruth_df
+    )
+
+    cm_counts_df, confusion_matrices = _calculate_confusion_matrix_df(
+        merged_groundtruths_and_predictions_df=merged_groundtruths_and_predictions_df
+    )
+
+    metrics_per_label_key_and_label_value_df = (
+        _calculate_metrics_at_label_value_level(cm_counts_df=cm_counts_df)
+    )
+
+    metrics_to_output += _calculate_precision_recall_f1_metrics(
+        metrics_per_label_key_and_label_value_df=metrics_per_label_key_and_label_value_df
+    )
+
+    metrics_to_output += _calculate_accuracy_metrics(cm_counts_df=cm_counts_df)
+
+    metrics_to_output += _calculate_rocauc(
+        prediction_df=prediction_df, groundtruth_df=groundtruth_df
+    )
+
+    # guard against a null or empty metrics_to_return before computing PR curves
+    if not metrics_to_return:
+        raise ValueError("metrics_to_return must be defined.")
+
+    metrics_to_output += _calculate_pr_curves(
+        prediction_df=prediction_df,
+        groundtruth_df=groundtruth_df,
+        metrics_to_return=metrics_to_return,
+        pr_curve_max_examples=pr_curve_max_examples,
+    )
+
+    # convert objects to dictionaries and only return what was asked for
+    metrics_to_output = [
+        m.to_dict()
+        for m in metrics_to_output
+        if m.to_dict()["type"] in metrics_to_return
+    ]
+    confusion_matrices = [cm.to_dict() for cm in confusion_matrices]
+
+    return confusion_matrices, metrics_to_output
+
+
+def evaluate_classification(
+    groundtruths: pd.DataFrame | list[schemas.GroundTruth],
+    predictions: pd.DataFrame | list[schemas.Prediction],
+    label_map: dict[schemas.Label, schemas.Label] | None = None,
+    metrics_to_return: list[enums.MetricType] | None = None,
+    pr_curve_max_examples: int = 1,
+) -> schemas.Evaluation:
+    """
+    Evaluate a classification task using some set of groundtruths and predictions.
+
+    The groundtruths and predictions can be passed as a pandas DataFrame or as a list of GroundTruth/Prediction objects. When passing a dataframe of groundtruths / predictions, the dataframe should contain the following columns:
+    - datum_uid (str): The unique identifier for the datum.
+    - datum_id (int): A hashed identifier that's unique to each datum.
+    - datum_metadata (dict): Metadata associated with the datum.
+    - annotation_id (int): A hashed identifier for each unique (datum_uid, annotation) combination.
+    - annotation_metadata (dict): Metadata associated with the annotation.
+    - is_instance (bool): A boolean indicating whether the annotation is an instance segmentation (True) or not (False).
+    - label_key (str): The key associated with the label.
+    - label_value (str): The value associated with the label.
+    - score (float): The confidence score of the prediction. Should be bound between 0 and 1. Should only be included for prediction dataframes.
+    - label_id (int): A hashed identifier for each unique label.
+    - id (str): A unique identifier for the combination of datum, annotation, and label, created by concatenating the indices of these components.
+
+    Parameters
+    ----------
+    groundtruths : pd.DataFrame | list[schemas.GroundTruth]
+        Ground truth annotations as either a DataFrame or a list of GroundTruth objects.
+    predictions : pd.DataFrame | list[schemas.Prediction]
+        Predictions as either a DataFrame or a list of Prediction objects.
+ label_map : dict[schemas.Label, schemas.Label], optional + Optional dictionary mapping ground truth labels to prediction labels. + metrics_to_return : list[enums.MetricType], optional + List of metric types to return. Defaults to Precision, Recall, F1, Accuracy, ROCAUC if None. + pr_curve_max_examples : int, default=1 + Maximum number of examples to use for Precision-Recall curve calculations. + + Returns + ------- + schemas.Evaluation + An Evaluation object containing: + - parameters: EvaluationParameters used for the calculation. + - metrics: List of dictionaries representing the calculated classification metrics. + - confusion_matrices: List of dictionaries representing the confusion matrices. + - meta: Dictionary with metadata including the count of labels, datums, annotations, and duration of the evaluation. + - ignored_pred_labels: List of ignored prediction labels (empty in this context). + - missing_pred_labels: List of missing prediction labels (empty in this context). + """ + start_time = time.time() + + if not label_map: + label_map = {} + + if metrics_to_return is None: + metrics_to_return = [ + enums.MetricType.Precision, + enums.MetricType.Recall, + enums.MetricType.F1, + enums.MetricType.Accuracy, + enums.MetricType.ROCAUC, + ] + + utilities.validate_label_map(label_map=label_map) + utilities.validate_metrics_to_return( + metrics_to_return=metrics_to_return, + task_type=enums.TaskType.CLASSIFICATION, + ) + utilities.validate_parameters(pr_curve_max_examples=pr_curve_max_examples) + + groundtruth_df = utilities.create_validated_groundtruth_df( + groundtruths, task_type=enums.TaskType.CLASSIFICATION + ) + prediction_df = utilities.create_validated_prediction_df( + predictions, task_type=enums.TaskType.CLASSIFICATION + ) + + # filter dataframes based on task type + groundtruth_df = utilities.filter_dataframe_by_task_type( + df=groundtruth_df, task_type=enums.TaskType.CLASSIFICATION + ) + + if not prediction_df.empty: + prediction_df = utilities.filter_dataframe_by_task_type( + df=prediction_df, task_type=enums.TaskType.CLASSIFICATION + ) + + # drop intermediary columns that are no longer needed + groundtruth_df = groundtruth_df.loc[ + :, + [ + "datum_uid", + "datum_id", + "annotation_id", + "label_key", + "label_value", + "label_id", + "id", + ], + ] + + prediction_df = prediction_df.loc[ + :, + [ + "datum_uid", + "datum_id", + "annotation_id", + "label_key", + "label_value", + "score", + "label_id", + "id", + ], + ] + + utilities.validate_matching_label_keys( + groundtruths=groundtruth_df, + predictions=prediction_df, + label_map=label_map, + ) + + unique_labels = list( + set(zip(groundtruth_df["label_key"], groundtruth_df["label_value"])) + | set(zip(prediction_df["label_key"], prediction_df["label_value"])) + ) + unique_datums_cnt = len( + set(groundtruth_df["datum_uid"]) | set(prediction_df["datum_uid"]) + ) + unique_annotations_cnt = len( + set(groundtruth_df["annotation_id"]) + | set(prediction_df["annotation_id"]) + ) + + groundtruth_df, prediction_df = utilities.replace_labels_using_label_map( + groundtruth_df=groundtruth_df, + prediction_df=prediction_df, + label_map=label_map, + ) + + confusion_matrices, metrics = _compute_clf_metrics( + groundtruth_df=groundtruth_df, + prediction_df=prediction_df, + metrics_to_return=metrics_to_return, + pr_curve_max_examples=pr_curve_max_examples, + ) + + return schemas.Evaluation( + parameters=schemas.EvaluationParameters( + metrics_to_return=metrics_to_return, + label_map=label_map, + pr_curve_max_examples=pr_curve_max_examples, 
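+            # note: the IoU-related fields of EvaluationParameters are left unset here;
+            # they apply to object detection evaluations (see evaluate_detection),
+            # not classification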
+ ), + metrics=metrics, + confusion_matrices=confusion_matrices, + meta={ + "labels": len(unique_labels), + "datums": unique_datums_cnt, + "annotations": unique_annotations_cnt, + "duration": time.time() - start_time, + }, + ignored_pred_labels=[], + missing_pred_labels=[], + ) diff --git a/core/valor_core/detection.py b/core/valor_core/detection.py new file mode 100644 index 000000000..33440bdff --- /dev/null +++ b/core/valor_core/detection.py @@ -0,0 +1,1484 @@ +import heapq +import math +import time +from collections import defaultdict + +import numpy as np +import pandas as pd +from valor_core import enums, geometry, metrics, schemas, utilities + +pd.set_option("display.max_columns", None) + + +def _get_joint_df( + groundtruth_df: pd.DataFrame, + prediction_df: pd.DataFrame, +) -> pd.DataFrame: + """Create a joint dataframe of groundtruths and predictions for calculating AR/AP metrics.""" + + joint_df = pd.merge( + groundtruth_df, + prediction_df, + on=["datum_id", "label_id", "label"], + how="outer", + suffixes=("_gt", "_pd"), + ) + + return joint_df + + +def _get_dtypes_in_series_of_arrays(series: pd.Series): + """Get the data type inside of a 2D numpy array. Used to check if a np.array contains coordinates or a mask.""" + if not isinstance(series, pd.Series) or not all( + series.apply(lambda x: x.ndim == 2) + ): + raise ValueError( + "series must be a pandas Series filled with two-dimensional arrays." + ) + + unique_primitives = series.map(lambda x: x.dtype).unique() + + if len(unique_primitives) > 1: + raise ValueError("series contains more than one type of primitive.") + + return unique_primitives[0] + + +def _check_if_series_contains_masks(series: pd.Series) -> bool: + """Check if any element in a pandas.Series is a mask.""" + if series.empty: + return False + + primitive = _get_dtypes_in_series_of_arrays(series=series) + + if np.issubdtype(primitive, np.bool_): + return True + + return False + + +def _calculate_iou( + joint_df: pd.DataFrame, +) -> pd.DataFrame: + """Calculate the IOUs between predictions and groundtruths in a joint dataframe.""" + if _check_if_series_contains_masks( + joint_df.loc[ + joint_df["converted_geometry_pd"].notnull(), + "converted_geometry_pd", + ] + ): + iou_calculation_df = ( + joint_df.assign( + intersection=lambda chain_df: chain_df.apply( + lambda row: ( + 0 + if row["converted_geometry_pd"] is None + or row["converted_geometry_gt"] is None + else np.logical_and( + row["converted_geometry_pd"], + row["converted_geometry_gt"], + ).sum() + ), + axis=1, + ) + ) + .assign( + union_=lambda chain_df: chain_df.apply( + lambda row: ( + 0 + if row["converted_geometry_pd"] is None + or row["converted_geometry_gt"] is None + else np.sum(row["converted_geometry_gt"]) + + np.sum(row["converted_geometry_pd"]) + - row["intersection"] + ), + axis=1, + ) + ) + .assign( + iou_=lambda chain_df: chain_df["intersection"] + / chain_df["union_"] + ) + ) + + joint_df = joint_df.join(iou_calculation_df["iou_"]) + + else: + iou_calculation_df = joint_df.loc[ + ~joint_df["converted_geometry_gt"].isnull() + & ~joint_df["converted_geometry_pd"].isnull(), + ["converted_geometry_gt", "converted_geometry_pd"], + ].apply( + lambda row: geometry.calculate_iou( + row["converted_geometry_gt"], row["converted_geometry_pd"] + ), + axis=1, + ) + + if not iou_calculation_df.empty: + iou_calculation_df = iou_calculation_df.rename("iou_") + joint_df = joint_df.join(iou_calculation_df) + else: + joint_df["iou_"] = 0 + + return joint_df + + +def _calculate_label_id_level_metrics( + 
calculation_df: pd.DataFrame, recall_score_threshold: float +) -> pd.DataFrame: + """Calculate the flags and metrics needed to compute AP, AR, and PR curves.""" + + # create flags where predictions meet the score and IOU criteria + calculation_df["recall_true_positive_flag"] = ( + calculation_df["iou_"] >= calculation_df["iou_threshold"] + ) & (calculation_df["score"] >= recall_score_threshold) + # only consider the highest scoring true positive as an actual true positive + calculation_df["recall_true_positive_flag"] = calculation_df[ + "recall_true_positive_flag" + ] & ( + ~calculation_df.groupby( + ["label_id", "label", "iou_threshold", "id_gt"], as_index=False + )["recall_true_positive_flag"].shift(1, fill_value=False) + ) + + calculation_df["precision_true_positive_flag"] = ( + calculation_df["iou_"] >= calculation_df["iou_threshold"] + ) & (calculation_df["score"] > 0) + calculation_df["precision_true_positive_flag"] = calculation_df[ + "precision_true_positive_flag" + ] & ( + ~calculation_df.groupby( + ["label_id", "iou_threshold", "id_gt"], as_index=False + )["precision_true_positive_flag"].shift(1, fill_value=False) + ) + + calculation_df["recall_false_positive_flag"] = ~calculation_df[ + "recall_true_positive_flag" + ] & (calculation_df["score"] >= recall_score_threshold) + calculation_df["precision_false_positive_flag"] = ~calculation_df[ + "precision_true_positive_flag" + ] & (calculation_df["score"] > 0) + + # calculate true and false positives + calculation_df = ( + calculation_df.join( + calculation_df.groupby( + ["label_id", "label", "iou_threshold"], as_index=False + )["recall_true_positive_flag"] + .cumsum() + .rename("rolling_recall_tp") + ) + .join( + calculation_df.groupby( + ["label_id", "label", "iou_threshold"], as_index=False + )["recall_false_positive_flag"] + .cumsum() + .rename("rolling_recall_fp") + ) + .join( + calculation_df.groupby( + ["label_id", "label", "iou_threshold"], as_index=False + )["precision_true_positive_flag"] + .cumsum() + .rename("rolling_precision_tp") + ) + .join( + calculation_df.groupby( + ["label_id", "label", "iou_threshold"], as_index=False + )["precision_false_positive_flag"] + .cumsum() + .rename("rolling_precision_fp") + ) + ) + + # calculate false negatives, then precision / recall + calculation_df = ( + calculation_df.assign( + rolling_recall_fn=lambda chain_df: chain_df["gts_per_grouper"] + - chain_df["rolling_recall_tp"] + ) + .assign( + rolling_precision_fn=lambda chain_df: chain_df["gts_per_grouper"] + - chain_df["rolling_precision_tp"] + ) + .assign( + precision=lambda chain_df: chain_df["rolling_precision_tp"] + / ( + chain_df["rolling_precision_tp"] + + chain_df["rolling_precision_fp"] + ) + ) + .assign( + recall_for_AP=lambda chain_df: chain_df["rolling_precision_tp"] + / ( + chain_df["rolling_precision_tp"] + + chain_df["rolling_precision_fn"] + ) + ) + .assign( + recall_for_AR=lambda chain_df: chain_df["rolling_recall_tp"] + / (chain_df["rolling_recall_tp"] + chain_df["rolling_recall_fn"]) + ) + ) + + # fill any predictions that are missing groundtruths with -1 + # leave any groundtruths that are missing predictions with 0 + calculation_df.loc[ + calculation_df["id_gt"].isnull(), + ["precision", "recall_for_AP", "recall_for_AR"], + ] = -1 + + calculation_df.loc[ + calculation_df["id_pd"].isnull(), + ["precision", "recall_for_AP", "recall_for_AR"], + ] = 0 + + return calculation_df + + +def _calculate_101_pt_interp(precisions, recalls) -> float: + """Use the 101 point interpolation method (following torchmetrics).""" + 
assert len(precisions) == len(recalls) + + if len(precisions) == 0: + return 0 + + if all([x == -1 for x in precisions + recalls]): + return -1 + + data = list(zip(precisions, recalls)) + data.sort(key=lambda x: x[1]) + # negative is because we want a max heap + prec_heap = [[-precision, i] for i, (precision, _) in enumerate(data)] + heapq.heapify(prec_heap) + + cutoff_idx = 0 + ret = 0 + for r in [0.01 * i for i in range(101)]: + while ( + cutoff_idx < len(data) + and data[cutoff_idx][1] < r + and not math.isclose(data[cutoff_idx][1], r) + ): + cutoff_idx += 1 + while prec_heap and prec_heap[0][1] < cutoff_idx: + heapq.heappop(prec_heap) + if cutoff_idx >= len(data): + continue + ret -= prec_heap[0][0] + + return ret / 101 + + +def _calculate_mean_ignoring_negative_one(series: pd.Series) -> float: + """Calculate the mean of a series, ignoring any values that are -1.""" + filtered = series[series != -1] + return filtered.mean() if not filtered.empty else -1.0 + + +def _calculate_ap_metrics( + calculation_df: pd.DataFrame, + iou_thresholds_to_compute: list[float], + iou_thresholds_to_return: list[float], +) -> list[ + metrics.APMetric + | metrics.APMetricAveragedOverIOUs + | metrics.mAPMetric + | metrics.mAPMetricAveragedOverIOUs +]: + """Calculates all AP metrics, including aggregated metrics like mAP.""" + ap_metrics_df = ( + calculation_df.loc[ + ~calculation_df[ + "id_gt" + ].isnull(), # for AP, we don't include any predictions without groundtruths + [ + "label_id", + "label", + "iou_threshold", + "precision", + "recall_for_AP", + ], + ] + .groupby(["label_id", "label", "iou_threshold"], as_index=False) + .apply( + lambda x: pd.Series( + { + "calculated_precision": _calculate_101_pt_interp( + x["precision"].tolist(), + x["recall_for_AP"].tolist(), + ) + } + ), + include_groups=False, + ) + ) + + ap_metrics = [ + metrics.APMetric( + iou=row["iou_threshold"], + value=row["calculated_precision"], + label=schemas.Label(key=row["label"][0], value=row["label"][1]), + ) + for row in ap_metrics_df.to_dict(orient="records") + ] + + # calculate mean AP metrics + ap_metrics_df["label_key"] = ap_metrics_df["label"].apply(lambda x: x[0]) + + ap_over_ious_df = ap_metrics_df.groupby( + ["label_id", "label"], as_index=False + )["calculated_precision"].apply(_calculate_mean_ignoring_negative_one) + + ap_over_ious = [ + metrics.APMetricAveragedOverIOUs( + ious=set(iou_thresholds_to_compute), + value=row["calculated_precision"], + label=schemas.Label(key=row["label"][0], value=row["label"][1]), + ) + for row in ap_over_ious_df.to_dict( + orient="records" + ) # pyright: ignore - pandas .to_dict() typing error + ] + + map_metrics_df = ap_metrics_df.groupby( + ["iou_threshold", "label_key"], as_index=False + )["calculated_precision"].apply(_calculate_mean_ignoring_negative_one) + + map_metrics = [ + metrics.mAPMetric( + iou=row["iou_threshold"], + value=row["calculated_precision"], + label_key=row["label_key"], + ) + for row in map_metrics_df.to_dict( + orient="records" + ) # pyright: ignore - pandas .to_dict() typing error + ] + + map_over_ious_df = ap_metrics_df.groupby(["label_key"], as_index=False)[ + "calculated_precision" + ].apply(_calculate_mean_ignoring_negative_one) + + map_over_ious = [ + metrics.mAPMetricAveragedOverIOUs( + ious=set(iou_thresholds_to_compute), + value=row["calculated_precision"], + label_key=row["label_key"], + ) + for row in map_over_ious_df.to_dict( + orient="records" + ) # pyright: ignore - pandas .to_dict() typing error + ] + + return ( + [m for m in ap_metrics if m.iou in 
iou_thresholds_to_return] + + [m for m in map_metrics if m.iou in iou_thresholds_to_return] + + ap_over_ious + + map_over_ious + ) + + +def _calculate_ar_metrics( + calculation_df: pd.DataFrame, + iou_thresholds_to_compute: list[float], +) -> list[metrics.ARMetric | metrics.mARMetric]: + """Calculates all AR metrics, including aggregated metrics like mAR.""" + + # get the max recall_for_AR for each threshold, then take the mean across thresholds + ar_metrics_df = ( + calculation_df.groupby( + ["label_id", "label", "iou_threshold"], as_index=False + )["recall_for_AR"] + .max() + .groupby(["label_id", "label"], as_index=False)["recall_for_AR"] + .mean() + ) + + ious_ = set(iou_thresholds_to_compute) + ar_metrics = [ + metrics.ARMetric( + ious=ious_, + value=row["recall_for_AR"], + label=schemas.Label(key=row["label"][0], value=row["label"][1]), + ) + for row in ar_metrics_df.to_dict(orient="records") + ] + + # calculate mAR + ar_metrics_df["label_key"] = ar_metrics_df["label"].apply(lambda x: x[0]) + mar_metrics_df = ar_metrics_df.groupby(["label_key"], as_index=False)[ + "recall_for_AR" + ].apply(_calculate_mean_ignoring_negative_one) + + mar_metrics = [ + metrics.mARMetric( + ious=ious_, + value=row["recall_for_AR"], + label_key=row["label_key"], + ) + for row in mar_metrics_df.to_dict(orient="records") + ] + + return ar_metrics + mar_metrics + + +def _calculate_pr_metrics( + joint_df: pd.DataFrame, + metrics_to_return: list[enums.MetricType], + pr_curve_iou_threshold: float, +) -> list[metrics.PrecisionRecallCurve]: + """Calculates all PrecisionRecallCurve metrics.""" + + if not ( + metrics_to_return + and enums.MetricType.PrecisionRecallCurve in metrics_to_return + ): + return [] + + confidence_thresholds = [x / 100 for x in range(5, 100, 5)] + pr_calculation_df = pd.concat( + [ + joint_df.assign(confidence_threshold=threshold) + for threshold in confidence_thresholds + ], + ignore_index=True, + ).sort_values( + by=[ + "label_id", + "confidence_threshold", + "score", + "iou_", + ], + ascending=False, + ) + + pr_calculation_df["true_positive_flag"] = ( + (pr_calculation_df["iou_"] >= pr_curve_iou_threshold) + & ( + pr_calculation_df["score"] + >= pr_calculation_df["confidence_threshold"] + ) + & ( + pr_calculation_df.groupby( + ["label_id", "confidence_threshold", "id_gt"] + ).cumcount() + == 0 + ) # only the first gt_id in this sorted list should be considered a true positive + ) + + pr_calculation_df["false_positive_flag"] = ~pr_calculation_df[ + "true_positive_flag" + ] & ( + pr_calculation_df["score"] >= pr_calculation_df["confidence_threshold"] + ) + + pr_metrics_df = ( + pr_calculation_df.groupby( + [ + "label_id", + "label", + "confidence_threshold", + "gts_per_grouper", + ], + as_index=False, + )["true_positive_flag"] + .sum() + .merge( + pr_calculation_df.groupby( + ["label_id", "label", "confidence_threshold"], + as_index=False, + )["false_positive_flag"].sum(), + on=["label_id", "label", "confidence_threshold"], + how="outer", + ) + .rename( + columns={ + "true_positive_flag": "true_positives", + "false_positive_flag": "false_positives", + } + ) + .assign( + false_negatives=lambda chain_df: chain_df["gts_per_grouper"] + - chain_df["true_positives"] + ) + .assign( + precision=lambda chain_df: chain_df["true_positives"] + / (chain_df["true_positives"] + chain_df["false_positives"]) + ) + .assign( + recall=lambda chain_df: chain_df["true_positives"] + / (chain_df["true_positives"] + chain_df["false_negatives"]) + ) + .assign( + f1_score=lambda chain_df: ( + 2 * 
chain_df["precision"] * chain_df["recall"] + ) + / (chain_df["precision"] + chain_df["recall"]) + ) + ) + + pr_metrics_df.fillna(0, inplace=True) + + curves = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) + + for row in pr_metrics_df.to_dict(orient="records"): + curves[row["label"][0]][row["label"][1]][ + row["confidence_threshold"] + ] = { + "tp": row["true_positives"], + "fp": row["false_positives"], + "fn": row["false_negatives"], + "tn": None, # tn and accuracy aren't applicable to detection tasks because there's an infinite number of true negatives + "precision": row["precision"], + "recall": row["recall"], + "accuracy": None, + "f1_score": row["f1_score"], + } + + return [ + metrics.PrecisionRecallCurve( + label_key=key, + value=value, # type: ignore - defaultdict doesn't have strict typing + pr_curve_iou_threshold=pr_curve_iou_threshold, + ) + for key, value in curves.items() + ] + + +def _add_samples_to_dataframe( + detailed_pr_curve_counts_df: pd.DataFrame, + detailed_pr_calc_df: pd.DataFrame, + max_examples: int, + flag_column: str, +) -> pd.DataFrame: + """Efficienctly gather samples for a given flag.""" + + sample_df = pd.concat( + [ + detailed_pr_calc_df[detailed_pr_calc_df[flag_column]] + .groupby( + [ + "label_key", + "label_value_gt", + "confidence_threshold", + ], + as_index=False, + )[["datum_uid_gt", "converted_geometry_gt"]] + .agg(tuple) + .rename( + columns={ + "datum_uid_gt": "datum_uid", + "label_value_gt": "label_value", + "converted_geometry_gt": "converted_geometry", + } + ), + detailed_pr_calc_df[detailed_pr_calc_df[flag_column]] + .groupby( + [ + "label_key", + "label_value_pd", + "confidence_threshold", + ], + as_index=False, + )[["datum_uid_pd", "converted_geometry_pd"]] + .agg(tuple) + .rename( + columns={ + "datum_uid_pd": "datum_uid", + "label_value_pd": "label_value", + "converted_geometry_pd": "converted_geometry", + } + ), + ], + axis=0, + ) + + sample_df["converted_geometry"] = sample_df["converted_geometry"].apply( + lambda row: tuple(str(x.tolist()) for x in row) + ) + + sample_df.drop_duplicates(inplace=True) + + if not sample_df.empty: + sample_df[f"{flag_column}_samples"] = sample_df.apply( + lambda row: set(zip(*row[["datum_uid", "converted_geometry"]])), # type: ignore - pd typing error + axis=1, + ) + + detailed_pr_curve_counts_df = detailed_pr_curve_counts_df.merge( + sample_df[ + [ + "label_key", + "label_value", + "confidence_threshold", + f"{flag_column}_samples", + ] + ], + on=["label_key", "label_value", "confidence_threshold"], + how="outer", + ) + detailed_pr_curve_counts_df[ + f"{flag_column}_samples" + ] = detailed_pr_curve_counts_df[f"{flag_column}_samples"].apply( + lambda x: list(x)[:max_examples] if isinstance(x, set) else list() + ) + else: + detailed_pr_curve_counts_df[f"{flag_column}_samples"] = [ + list() for _ in range(len(detailed_pr_curve_counts_df)) + ] + + return detailed_pr_curve_counts_df + + +def _calculate_detailed_pr_metrics( + detailed_pr_joint_df: pd.DataFrame | None, + metrics_to_return: list[enums.MetricType], + pr_curve_iou_threshold: float, + pr_curve_max_examples: int, +) -> list[metrics.DetailedPrecisionRecallCurve]: + """Calculates all DetailedPrecisionRecallCurve metrics.""" + + if not ( + metrics_to_return + and enums.MetricType.DetailedPrecisionRecallCurve in metrics_to_return + ) or (detailed_pr_joint_df is None): + return [] + + # add confidence_threshold to the dataframe and sort + detailed_pr_calc_df = pd.concat( + [ + detailed_pr_joint_df.assign(confidence_threshold=threshold) + for 
threshold in [x / 100 for x in range(5, 100, 5)] + ], + ignore_index=True, + ).sort_values( + by=[ + "label_id_pd", + "confidence_threshold", + "score", + "iou_", + ], + ascending=False, + ) + + # create flags where predictions meet the score and IOU criteria + detailed_pr_calc_df["true_positive_flag"] = ( + (detailed_pr_calc_df["iou_"] >= pr_curve_iou_threshold) + & ( + detailed_pr_calc_df["score"] + >= detailed_pr_calc_df["confidence_threshold"] + ) + & detailed_pr_calc_df["is_label_match"] + ) + + # for all the false positives, we consider them to be a misclassification if they overlap with a groundtruth of the same label key + detailed_pr_calc_df["misclassification_false_positive_flag"] = ( + (detailed_pr_calc_df["iou_"] >= pr_curve_iou_threshold) + & ( + detailed_pr_calc_df["score"] + >= detailed_pr_calc_df["confidence_threshold"] + ) + & ~detailed_pr_calc_df["is_label_match"] + ) + + # if they aren't a true positive nor a misclassification FP but they meet the iou and score conditions, then they are a hallucination + detailed_pr_calc_df["hallucination_false_positive_flag"] = ( + (detailed_pr_calc_df["iou_"] < pr_curve_iou_threshold) + | (detailed_pr_calc_df["iou_"].isnull()) + ) & ( + detailed_pr_calc_df["score"] + >= detailed_pr_calc_df["confidence_threshold"] + ) + + # any prediction that is considered a misclassification shouldn't be counted as a hallucination, so we go back and remove these flags + predictions_associated_with_tps_or_misclassification_fps = ( + detailed_pr_calc_df[ + detailed_pr_calc_df["true_positive_flag"] + | detailed_pr_calc_df["misclassification_false_positive_flag"] + ] + .groupby(["confidence_threshold"], as_index=False)["id_pd"] + .unique() + ) + + if not predictions_associated_with_tps_or_misclassification_fps.empty: + confidence_interval_to_predictions_associated_with_tps_or_misclassification_fps_dict = ( + predictions_associated_with_tps_or_misclassification_fps.set_index( + "confidence_threshold" + )["id_pd"] + .apply(set) + .to_dict() + ) + + mask = pd.Series(False, index=detailed_pr_calc_df.index) + + for ( + threshold, + elements, + ) in ( + confidence_interval_to_predictions_associated_with_tps_or_misclassification_fps_dict.items() + ): + threshold_mask = ( + detailed_pr_calc_df["confidence_threshold"] == threshold + ) + membership_mask = detailed_pr_calc_df["id_pd"].isin(elements) + mask |= ( + threshold_mask + & membership_mask + & detailed_pr_calc_df["hallucination_false_positive_flag"] + ) + + detailed_pr_calc_df.loc[ + mask, + "hallucination_false_positive_flag", + ] = False + + # next, we flag false negatives by declaring any groundtruth that isn't associated with a true positive to be a false negative + groundtruths_associated_with_true_positives = ( + detailed_pr_calc_df[detailed_pr_calc_df["true_positive_flag"]] + .groupby(["confidence_threshold"], as_index=False)["id_gt"] + .unique() + ) + + if not groundtruths_associated_with_true_positives.empty: + confidence_interval_to_groundtruths_associated_with_true_positives_dict = ( + groundtruths_associated_with_true_positives.set_index( + "confidence_threshold" + )["id_gt"] + .apply(set) + .to_dict() + ) + + mask = pd.Series(False, index=detailed_pr_calc_df.index) + + for ( + threshold, + elements, + ) in ( + confidence_interval_to_groundtruths_associated_with_true_positives_dict.items() + ): + threshold_mask = ( + detailed_pr_calc_df["confidence_threshold"] == threshold + ) + membership_mask = detailed_pr_calc_df["id_gt"].isin(elements) + mask |= threshold_mask & membership_mask + + 
detailed_pr_calc_df["false_negative_flag"] = ~mask + + else: + detailed_pr_calc_df["false_negative_flag"] = False + + # it's a misclassification if there is a corresponding misclassification false positive + detailed_pr_calc_df["misclassification_false_negative_flag"] = ( + detailed_pr_calc_df["misclassification_false_positive_flag"] + & detailed_pr_calc_df["false_negative_flag"] + ) + + # assign all id_gts that aren't misclassifications but are false negatives to be no_predictions + groundtruths_associated_with_misclassification_false_negatives = ( + detailed_pr_calc_df[ + detailed_pr_calc_df["misclassification_false_negative_flag"] + ] + .groupby(["confidence_threshold"], as_index=False)["id_gt"] + .unique() + ) + + if ( + not groundtruths_associated_with_misclassification_false_negatives.empty + ): + confidence_interval_to_groundtruths_associated_with_misclassification_fn_dict = ( + groundtruths_associated_with_misclassification_false_negatives.set_index( + "confidence_threshold" + )[ + "id_gt" + ] + .apply(set) + .to_dict() + ) + + mask = pd.Series(False, index=detailed_pr_calc_df.index) + + for ( + threshold, + elements, + ) in ( + confidence_interval_to_groundtruths_associated_with_misclassification_fn_dict.items() + ): + threshold_mask = ( + detailed_pr_calc_df["confidence_threshold"] == threshold + ) + membership_mask = detailed_pr_calc_df["id_gt"].isin(elements) + mask |= threshold_mask & membership_mask + + detailed_pr_calc_df["no_predictions_false_negative_flag"] = ( + ~mask & detailed_pr_calc_df["false_negative_flag"] + ) + else: + detailed_pr_calc_df[ + "no_predictions_false_negative_flag" + ] = detailed_pr_calc_df["false_negative_flag"] + + # next, we sum up the occurences of each classification and merge them together into one dataframe + true_positives = ( + detailed_pr_calc_df[detailed_pr_calc_df["true_positive_flag"]] + .groupby(["label_key", "label_value_pd", "confidence_threshold"])[ + "id_pd" + ] + .nunique() + ) + true_positives.name = "true_positives" + + hallucination_false_positives = ( + detailed_pr_calc_df[ + detailed_pr_calc_df["hallucination_false_positive_flag"] + ] + .groupby(["label_key", "label_value_pd", "confidence_threshold"])[ + "id_pd" + ] + .nunique() + ) + hallucination_false_positives.name = "hallucinations_false_positives" + + misclassification_false_positives = ( + detailed_pr_calc_df[ + detailed_pr_calc_df["misclassification_false_positive_flag"] + ] + .groupby(["label_key", "label_value_pd", "confidence_threshold"])[ + "id_pd" + ] + .nunique() + ) + misclassification_false_positives.name = ( + "misclassification_false_positives" + ) + + misclassification_false_negatives = ( + detailed_pr_calc_df[ + detailed_pr_calc_df["misclassification_false_negative_flag"] + ] + .groupby(["label_key", "label_value_gt", "confidence_threshold"])[ + "id_gt" + ] + .nunique() + ) + misclassification_false_negatives.name = ( + "misclassification_false_negatives" + ) + + no_predictions_false_negatives = ( + detailed_pr_calc_df[ + detailed_pr_calc_df["no_predictions_false_negative_flag"] + ] + .groupby(["label_key", "label_value_gt", "confidence_threshold"])[ + "id_gt" + ] + .nunique() + ) + no_predictions_false_negatives.name = "no_predictions_false_negatives" + + # combine these outputs + detailed_pr_curve_counts_df = ( + pd.concat( + [ + detailed_pr_calc_df.loc[ + ~detailed_pr_calc_df["label_value_pd"].isnull(), + [ + "label_key", + "label_value_pd", + "confidence_threshold", + ], + ].rename(columns={"label_value_pd": "label_value"}), + detailed_pr_calc_df.loc[ + 
~detailed_pr_calc_df["label_value_gt"].isnull(), + [ + "label_key", + "label_value_gt", + "confidence_threshold", + ], + ].rename(columns={"label_value_gt": "label_value"}), + ], + axis=0, + ) + .drop_duplicates() + .merge( + true_positives, + left_on=[ + "label_key", + "label_value", + "confidence_threshold", + ], + right_index=True, + how="outer", + ) + .merge( + hallucination_false_positives, + left_on=[ + "label_key", + "label_value", + "confidence_threshold", + ], + right_index=True, + how="outer", + ) + .merge( + misclassification_false_positives, + left_on=[ + "label_key", + "label_value", + "confidence_threshold", + ], + right_index=True, + how="outer", + ) + .merge( + misclassification_false_negatives, + left_on=[ + "label_key", + "label_value", + "confidence_threshold", + ], + right_index=True, + how="outer", + ) + .merge( + no_predictions_false_negatives, + left_on=[ + "label_key", + "label_value", + "confidence_threshold", + ], + right_index=True, + how="outer", + ) + ) + + # we're doing an outer join, so any nulls should be zeroes + detailed_pr_curve_counts_df.fillna(0, inplace=True) + + # add samples to the dataframe for DetailedPrecisionRecallCurves + for flag in [ + "true_positive_flag", + "misclassification_false_negative_flag", + "no_predictions_false_negative_flag", + "misclassification_false_positive_flag", + "hallucination_false_positive_flag", + ]: + detailed_pr_curve_counts_df = _add_samples_to_dataframe( + detailed_pr_calc_df=detailed_pr_calc_df, + detailed_pr_curve_counts_df=detailed_pr_curve_counts_df, + max_examples=pr_curve_max_examples, + flag_column=flag, + ) + + # create output + detailed_pr_curves = defaultdict(lambda: defaultdict(dict)) + for _, row in detailed_pr_curve_counts_df.iterrows(): + label_key = row["label_key"] + label_value = row["label_value"] + confidence_threshold = row["confidence_threshold"] + + detailed_pr_curves[label_key][label_value][confidence_threshold] = { + "tp": { + "total": row["true_positives"], + "observations": { + "all": { + "count": row["true_positives"], + "examples": row["true_positive_flag_samples"], + } + }, + }, + "fn": { + "total": row["misclassification_false_negatives"] + + row["no_predictions_false_negatives"], + "observations": { + "misclassifications": { + "count": row["misclassification_false_negatives"], + "examples": row[ + "misclassification_false_negative_flag_samples" + ], + }, + "no_predictions": { + "count": row["no_predictions_false_negatives"], + "examples": row[ + "no_predictions_false_negative_flag_samples" + ], + }, + }, + }, + "fp": { + "total": row["misclassification_false_positives"] + + row["hallucinations_false_positives"], + "observations": { + "misclassifications": { + "count": row["misclassification_false_positives"], + "examples": row[ + "misclassification_false_positive_flag_samples" + ], + }, + "hallucinations": { + "count": row["hallucinations_false_positives"], + "examples": row[ + "hallucination_false_positive_flag_samples" + ], + }, + }, + }, + } + + detailed_pr_metrics = [ + metrics.DetailedPrecisionRecallCurve( + label_key=key, + value=dict(value), + pr_curve_iou_threshold=pr_curve_iou_threshold, + ) + for key, value in detailed_pr_curves.items() + ] + + return detailed_pr_metrics + + +def _create_detailed_joint_df( + groundtruth_df: pd.DataFrame, prediction_df: pd.DataFrame +): + """Create the dataframe needed to calculate DetailedPRCurves from a groundtruth and prediction dataframe.""" + detailed_joint_df = pd.merge( + groundtruth_df, + prediction_df, + on=["datum_id", "label_key"], 
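+        # note: this join matches on label_key only (not the full label), so a prediction
+        # can be paired with a same-key groundtruth of a different label value; those
+        # pairs are what the detailed PR curve later separates into misclassifications
+        # versus hallucinations and missed detections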
+ how="outer", + suffixes=("_gt", "_pd"), + ).assign( + is_label_match=lambda chain_df: ( + chain_df["label_id_pd"] == chain_df["label_id_gt"] + ) + ) + detailed_joint_df = _calculate_iou(joint_df=detailed_joint_df) + return detailed_joint_df + + +def create_detection_evaluation_inputs( + groundtruths, + predictions, + metrics_to_return, + label_map, + convert_annotations_to_type, +): + + groundtruth_df = utilities.create_validated_groundtruth_df( + groundtruths, task_type=enums.TaskType.OBJECT_DETECTION + ) + prediction_df = utilities.create_validated_prediction_df( + predictions, task_type=enums.TaskType.OBJECT_DETECTION + ) + + # filter dataframes based on task type + groundtruth_df = utilities.filter_dataframe_by_task_type( + df=groundtruth_df, task_type=enums.TaskType.OBJECT_DETECTION + ) + + if not prediction_df.empty: + prediction_df = utilities.filter_dataframe_by_task_type( + df=prediction_df, task_type=enums.TaskType.OBJECT_DETECTION + ) + + # ensure that all annotations have a common type to operate over + ( + groundtruth_df, + prediction_df, + ) = utilities.convert_annotations_to_common_type( + groundtruth_df=groundtruth_df, + prediction_df=prediction_df, + target_type=convert_annotations_to_type, + ) + + groundtruth_df, prediction_df = utilities.replace_labels_using_label_map( + groundtruth_df=groundtruth_df, + prediction_df=prediction_df, + label_map=label_map, + ) + # add label as a column + for df in (groundtruth_df, prediction_df): + df.loc[:, "label"] = df.apply( + lambda chain_df: (chain_df["label_key"], chain_df["label_value"]), + axis=1, + ) + + joint_df = _get_joint_df( + groundtruth_df=groundtruth_df, + prediction_df=prediction_df, + ) + + # store solo groundtruths and predictions such that we can add them back after we calculate IOU + predictions_missing_groundtruths = joint_df[ + joint_df["id_gt"].isnull() + ].assign(iou_=0) + groundtruths_missing_predictions = joint_df[ + joint_df["id_pd"].isnull() + ].assign(iou_=0) + + joint_df = _calculate_iou(joint_df=joint_df) + + # filter out null groundtruths and sort by score and iou so that idxmax returns the best row for each prediction + joint_df = joint_df[~joint_df["id_gt"].isnull()].sort_values( + by=["score", "iou_"], ascending=[False, False] + ) + + # get the best prediction (in terms of score and iou) for each groundtruth + prediction_has_best_score = joint_df.groupby(["id_pd"])["score"].idxmax() + + joint_df = joint_df.loc[prediction_has_best_score] + + # add back missing predictions and groundtruths + joint_df = pd.concat( + [ + joint_df, + predictions_missing_groundtruths, + groundtruths_missing_predictions, + ], + axis=0, + ) + + if ( + metrics_to_return + and enums.MetricType.DetailedPrecisionRecallCurve in metrics_to_return + ): + detailed_joint_df = _create_detailed_joint_df( + groundtruth_df=groundtruth_df, prediction_df=prediction_df + ) + else: + detailed_joint_df = None + + # remove unnecessary columns to save memory + groundtruth_df = groundtruth_df.loc[ + :, + [ + "datum_uid", + "label_key", + "annotation_id", + "label_value", + "id", + "label", + ], + ] + + prediction_df = prediction_df.loc[ + :, + [ + "datum_uid", + "annotation_id", + "label_key", + "label_value", + ], + ] + + joint_df = joint_df.loc[ + :, + [ + "label_id", + "id_gt", + "label", + "score", + "id_pd", + "iou_", + ], + ] + + if detailed_joint_df is not None: + detailed_joint_df = detailed_joint_df.loc[ + :, + [ + "datum_uid_gt", + "label_key", + "label_value_gt", + "id_gt", + "converted_geometry_gt", + "datum_uid_pd", + 
"label_value_pd", + "score", + "label_id_pd", + "id_pd", + "converted_geometry_pd", + "is_label_match", + "iou_", + ], + ] + + return groundtruth_df, prediction_df, joint_df, detailed_joint_df + + +def compute_detection_metrics( + joint_df: pd.DataFrame, + detailed_joint_df: pd.DataFrame | None, + metrics_to_return: list[enums.MetricType], + iou_thresholds_to_compute: list[float], + iou_thresholds_to_return: list[float], + recall_score_threshold: float, + pr_curve_iou_threshold: float, + pr_curve_max_examples: int, +) -> list[dict]: + """ + Compute detection metrics for evaluating object detection models. This function calculates Intersection over Union (IoU) for each ground truth-prediction pair that shares a common grouper id, and computes metrics such as Average Precision (AP), Average Recall (AR), and Precision-Recall (PR) curves. + + Parameters + ---------- + joint_df : pd.DataFrame + Dataframe containing merged groundtruths and predictions, joined by label. + detailed_joint_df : pd.DataFrame + Dataframe containing merged groundtruths and predictions, joined by label key. + metrics_to_return : list[enums.MetricType] + List of metric types to calculate and return, such as AP, AR, or PR curves. + iou_thresholds_to_compute : list[float] + List of IoU thresholds for which metrics should be computed. + iou_thresholds_to_return : list[float] + List of IoU thresholds for which metrics should be returned. + recall_score_threshold : float + Threshold for the recall score to consider in metric calculations. + pr_curve_iou_threshold : float + IoU threshold for computing Precision-Recall curves. + pr_curve_max_examples : int + Maximum number of examples to use for Precision-Recall curve calculations. + + Returns + ------- + list[dict] + A list of dictionaries containing computed metrics, including AP, AR, and PR curves, filtered according to `metrics_to_return`. + + Raises + ------ + ValueError + If there is an issue with the data or parameters provided. 
+    """
+
+    metrics_to_output = []
+
+    # add iou_threshold to the dataframe and sort
+    calculation_df = pd.concat(
+        [
+            joint_df.assign(iou_threshold=threshold)
+            for threshold in iou_thresholds_to_compute
+        ],
+        ignore_index=True,
+    ).sort_values(
+        by=["label_id", "label", "iou_threshold", "score", "iou_"],
+        ascending=False,
+    )
+
+    # calculate metrics
+    calculation_df = _calculate_label_id_level_metrics(
+        calculation_df=calculation_df,
+        recall_score_threshold=recall_score_threshold,
+    )
+
+    metrics_to_output += _calculate_ap_metrics(
+        calculation_df=calculation_df,
+        iou_thresholds_to_compute=iou_thresholds_to_compute,
+        iou_thresholds_to_return=iou_thresholds_to_return,
+    )
+
+    metrics_to_output += _calculate_ar_metrics(
+        calculation_df=calculation_df,
+        iou_thresholds_to_compute=iou_thresholds_to_compute,
+    )
+
+    metrics_to_output += _calculate_pr_metrics(
+        joint_df=joint_df,
+        metrics_to_return=metrics_to_return,
+        pr_curve_iou_threshold=pr_curve_iou_threshold,
+    )
+
+    metrics_to_output += _calculate_detailed_pr_metrics(
+        detailed_pr_joint_df=detailed_joint_df,
+        metrics_to_return=metrics_to_return,
+        pr_curve_iou_threshold=pr_curve_iou_threshold,
+        pr_curve_max_examples=pr_curve_max_examples,
+    )
+
+    # convert objects to dictionaries and only return what was asked for
+    metrics_to_output = [
+        m.to_dict()
+        for m in metrics_to_output
+        if m.to_dict()["type"] in metrics_to_return
+    ]
+
+    return metrics_to_output
+
+
+def evaluate_detection(
+    groundtruths: pd.DataFrame | list[schemas.GroundTruth],
+    predictions: pd.DataFrame | list[schemas.Prediction],
+    label_map: dict[schemas.Label, schemas.Label] | None = None,
+    metrics_to_return: list[enums.MetricType] | None = None,
+    convert_annotations_to_type: enums.AnnotationType | None = None,
+    iou_thresholds_to_compute: list[float] | None = None,
+    iou_thresholds_to_return: list[float] | None = None,
+    recall_score_threshold: float = 0.0,
+    pr_curve_iou_threshold: float = 0.5,
+    pr_curve_max_examples: int = 1,
+) -> schemas.Evaluation:
+    """
+    Evaluate an object detection task using some set of groundtruths and predictions.
+
+    The groundtruths and predictions can be passed as a pandas DataFrame or as a list of GroundTruth/Prediction objects. A dataframe of groundtruths / predictions should contain the following columns:
+    - datum_uid (str): The unique identifier for the datum.
+    - datum_id (int): A hashed identifier that's unique to each datum.
+    - datum_metadata (dict): Metadata associated with the datum.
+    - annotation_id (int): A hashed identifier for each unique (datum_uid, annotation) combination.
+    - annotation_metadata (dict): Metadata associated with the annotation.
+    - bounding_box (tuple): The bounding box coordinates of the annotation, if available.
+    - raster (schemas.Raster): The raster representation of the annotation, if available.
+    - polygon (schemas.Polygon): The polygon coordinates of the annotation, if available.
+    - embedding (schemas.Embedding): The embedding vector associated with the annotation, if available.
+    - is_instance (bool): A boolean indicating whether the annotation is an instance segmentation (True) or not (False).
+    - label_key (str): The key associated with the label.
+    - label_value (str): The value associated with the label.
+    - score (float): The confidence score of the prediction. Should be bound between 0 and 1. Should only be included for prediction dataframes.
+    - label_id (int): A hashed identifier for each unique label.
+ - id (str): A unique identifier for the combination of datum, annotation, and label, created by concatenating the indices of these components. + + + Parameters + ---------- + groundtruths : pd.DataFrame | list[schemas.GroundTruth] + A list of GroundTruth objects or a pandas DataFrame describing your ground truths. + predictions : pd.DataFrame | list[schemas.Prediction] + A list of Prediction objects or a pandas DataFrame describing your predictions. + label_map : dict[schemas.Label, schemas.Label], optional + Mapping of ground truth labels to prediction labels. + metrics_to_return : list[enums.MetricType], optional + List of metric types to calculate and return. + convert_annotations_to_type : enums.AnnotationType, optional + Annotation type to convert all annotations to. + iou_thresholds_to_compute : list[float], optional + IoU thresholds for which metrics should be computed. + iou_thresholds_to_return : list[float], optional + IoU thresholds for which metrics should be returned. + recall_score_threshold : float, default=0.0 + Threshold for recall score to consider in metric calculations. + pr_curve_iou_threshold : float, default=0.5 + IoU threshold for computing Precision-Recall curves. + pr_curve_max_examples : int, default=1 + Maximum number of examples for Precision-Recall curve calculations. + + Returns + ------- + schemas.Evaluation + An Evaluation object containing the calculated metrics and other details. + + Raises + ------ + ValueError + If there is an issue with the provided parameters or data. + """ + start_time = time.time() + + if not label_map: + label_map = {} + + if metrics_to_return is None: + metrics_to_return = [ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + ] + + if iou_thresholds_to_compute is None: + iou_thresholds_to_compute = [ + round(0.5 + 0.05 * i, 2) for i in range(10) + ] + if iou_thresholds_to_return is None: + iou_thresholds_to_return = [0.5, 0.75] + + utilities.validate_label_map(label_map=label_map) + utilities.validate_metrics_to_return( + metrics_to_return=metrics_to_return, + task_type=enums.TaskType.OBJECT_DETECTION, + ) + utilities.validate_parameters( + pr_curve_iou_threshold=pr_curve_iou_threshold, + pr_curve_max_examples=pr_curve_max_examples, + recall_score_threshold=recall_score_threshold, + ) + + ( + groundtruth_df, + prediction_df, + joint_df, + detailed_joint_df, + ) = create_detection_evaluation_inputs( + groundtruths=groundtruths, + predictions=predictions, + metrics_to_return=metrics_to_return, + label_map=label_map, + convert_annotations_to_type=convert_annotations_to_type, + ) + + # add the number of groundtruth observations per grouper + number_of_groundtruths_per_label_df = ( + groundtruth_df.groupby(["label"], as_index=False)["id"] + .nunique() + .rename({"id": "gts_per_grouper"}, axis=1) + ) + joint_df = pd.merge( + joint_df, + number_of_groundtruths_per_label_df, + on=["label"], + how="outer", + ) + + ( + missing_pred_labels, + ignored_pred_labels, + ) = utilities.get_disjoint_labels( + groundtruth_df=groundtruth_df, + prediction_df=prediction_df, + label_map=label_map, + ) + + unique_labels = list( + set(zip(groundtruth_df["label_key"], groundtruth_df["label_value"])) + | set(zip(prediction_df["label_key"], prediction_df["label_value"])) + ) + unique_datums_cnt = len( + set(groundtruth_df["datum_uid"]) | set(prediction_df["datum_uid"]) + ) + unique_annotations_cnt = len( + 
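When no thresholds are supplied, `evaluate_detection` falls back to the comprehension shown above, which produces the familiar 0.50 to 0.95 range in steps of 0.05; a quick check of what it evaluates to:

```python
# Default thresholds used when iou_thresholds_to_compute is None
# (same comprehension as in evaluate_detection).
iou_thresholds_to_compute = [round(0.5 + 0.05 * i, 2) for i in range(10)]
print(iou_thresholds_to_compute)
# [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]

iou_thresholds_to_return = [0.5, 0.75]  # default subset reported back
```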
set(groundtruth_df["annotation_id"]) + | set(prediction_df["annotation_id"]) + ) + + metrics = compute_detection_metrics( + joint_df=joint_df, + detailed_joint_df=detailed_joint_df, + metrics_to_return=metrics_to_return, + iou_thresholds_to_compute=iou_thresholds_to_compute, + iou_thresholds_to_return=iou_thresholds_to_return, + recall_score_threshold=recall_score_threshold, + pr_curve_iou_threshold=pr_curve_iou_threshold, + pr_curve_max_examples=pr_curve_max_examples, + ) + + return schemas.Evaluation( + parameters=schemas.EvaluationParameters( + label_map=label_map, + metrics_to_return=metrics_to_return, + iou_thresholds_to_compute=iou_thresholds_to_compute, + iou_thresholds_to_return=iou_thresholds_to_return, + recall_score_threshold=recall_score_threshold, + pr_curve_iou_threshold=pr_curve_iou_threshold, + pr_curve_max_examples=pr_curve_max_examples, + ), + metrics=metrics, + confusion_matrices=[], + ignored_pred_labels=ignored_pred_labels, + missing_pred_labels=missing_pred_labels, + meta={ + "labels": len(unique_labels), + "datums": unique_datums_cnt, + "annotations": unique_annotations_cnt, + "duration": time.time() - start_time, + }, + ) diff --git a/core/valor_core/enums.py b/core/valor_core/enums.py new file mode 100644 index 000000000..e804f5aa8 --- /dev/null +++ b/core/valor_core/enums.py @@ -0,0 +1,77 @@ +from enum import Enum + + +class AnnotationType(str, Enum): + NONE = "none" + BOX = "box" + POLYGON = "polygon" + RASTER = "raster" + + +class TaskType(str, Enum): + SKIP = "skip" + EMPTY = "empty" + CLASSIFICATION = "classification" + OBJECT_DETECTION = "object-detection" + SEMANTIC_SEGMENTATION = "semantic-segmentation" + EMBEDDING = "embedding" + + +class MetricType(str, Enum): + + Accuracy = ("Accuracy",) + Precision = ("Precision",) + Recall = ("Recall",) + F1 = ("F1",) + ROCAUC = ("ROCAUC",) + AP = "AP" + AR = "AR" + mAP = "mAP" + mAR = "mAR" + APAveragedOverIOUs = "APAveragedOverIOUs" + mAPAveragedOverIOUs = "mAPAveragedOverIOUs" + IOU = "IOU" + mIOU = "mIOU" + PrecisionRecallCurve = "PrecisionRecallCurve" + DetailedPrecisionRecallCurve = "DetailedPrecisionRecallCurve" + + @classmethod + def classification(cls) -> set["MetricType"]: + """ + MetricTypes for classification tasks. + """ + return { + cls.Accuracy, + cls.Precision, + cls.Recall, + cls.F1, + cls.ROCAUC, + cls.PrecisionRecallCurve, + cls.DetailedPrecisionRecallCurve, + } + + @classmethod + def object_detection(cls) -> set["MetricType"]: + """ + MetricTypes for object-detection tasks. + """ + return { + cls.AP, + cls.AR, + cls.mAP, + cls.mAR, + cls.APAveragedOverIOUs, + cls.mAPAveragedOverIOUs, + cls.PrecisionRecallCurve, + cls.DetailedPrecisionRecallCurve, + } + + @classmethod + def semantic_segmentation(cls) -> set["MetricType"]: + """ + MetricTypes for semantic-segmentation tasks. + """ + return { + cls.IOU, + cls.mIOU, + } diff --git a/core/valor_core/geometry.py b/core/valor_core/geometry.py new file mode 100644 index 000000000..a1942643f --- /dev/null +++ b/core/valor_core/geometry.py @@ -0,0 +1,139 @@ +import numpy as np +import shapely.affinity +from shapely.geometry import Polygon as ShapelyPolygon + +# turn off "invalid value encountered in scalar divide" warning +# when dividing by 0 or NaN, the returned value will be NaN. 
we'll then handle those NaNs later in the evaluation code +np.seterr(divide="ignore", invalid="ignore") + + +def calculate_iou( + bbox1: list[tuple[float, float]], bbox2: list[tuple[float, float]] +) -> float: + """ + Calculate the Intersection over Union (IOU) for two bounding boxes. + + Parameters + ---------- + bbox1 : list[tuple[float, float]] + Coordinates of the first bounding box. + bbox2 : list[tuple[float, float]] + Coordinates of the second bounding box. + + Returns + ---------- + float + The IOU value between 0 and 1. + """ + poly1 = ShapelyPolygon(bbox1) + poly2 = ShapelyPolygon(bbox2) + intersection_area = poly1.intersection(poly2).area + union_area = poly1.area + poly2.area - intersection_area + return intersection_area / union_area if union_area != 0 else 0 + + +def rotate_bbox( + bbox: list[tuple[float, float]], + angle: float, + origin: str | tuple[float, float] = "centroid", +) -> list[tuple[float, float]]: + """ + Rotate a bounding box by a given angle around the centroid of a polygon. + + Parameters + ---------- + bbox : list[tuple[float, float]] + Coordinates of the bounding box. + angle : float + The rotation angle in degrees. + origin : str | tuple[float, float] + The point around which to rotate. Default is "centroid". + + Returns + ---------- + list[tuple[float, float]] + Coordinates of the rotated bounding box. + """ + return list( + shapely.affinity.rotate( + ShapelyPolygon(bbox), angle=angle, origin=origin # type: ignore - shapely type error. can be a string ("centroid", "center") or a tuple of coordinates + ).exterior.coords + ) + + +def is_axis_aligned(bbox: list[tuple[float, float]]) -> bool: + """ + Check if the bounding box is axis-aligned. + + Parameters + ---------- + bbox : list[tuple[float, float]] + Coordinates of the bounding box. + + Returns + ---------- + bool + True if the bounding box is axis-aligned, otherwise False. + """ + return all( + x1 == x2 or y1 == y2 + for (x1, y1), (x2, y2) in zip(bbox, bbox[1:] + bbox[:1]) + ) + + +def is_skewed(bbox: list[tuple[float, float]]) -> bool: + """ + Check if the bounding box is skewed. + + Parameters + ---------- + bbox : list[tuple[float, float]] + Coordinates of the bounding box. + + Returns + ---------- + bool + True if the bounding box is skewed, otherwise False. + """ + + def _calculate_angle_between_arrays( + v1: np.ndarray, v2: np.ndarray + ) -> float: + dot_product = np.dot(v1, v2) + norm_product = np.linalg.norm(v1) * np.linalg.norm(v2) + cos_angle = dot_product / norm_product + return np.arccos(np.clip(cos_angle, -1.0, 1.0)) + + vectors = [] + for (x1, y1), (x2, y2) in zip(bbox, bbox[1:] + bbox[:1]): + vectors.append(np.array([x2 - x1, y2 - y1])) + + angles = [ + _calculate_angle_between_arrays( + vectors[i], vectors[(i + 1) % len(vectors)] + ) + for i in range(len(vectors)) + ] + + return not all( + np.isclose(angle, np.pi / 2, atol=1e-2) # if close to 90 degrees + for angle in angles + if not np.isnan(angle) + ) + + +def is_rotated(bbox: list[tuple[float, float]]) -> bool: + """ + Check if the bounding box is rotated (not axis-aligned and not skewed). + + Parameters + ---------- + bbox : list[tuple[float, float]] + Coordinates of the bounding box. + + Returns + ---------- + bool + True if the bounding box is rotated, otherwise False. 
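These helpers can be exercised directly; a short usage sketch with made-up boxes (corner lists follow the `list[tuple[float, float]]` convention documented above, and the import path assumes the package layout shown in this diff):

```python
from valor_core import geometry

box_a = [(0.0, 0.0), (2.0, 0.0), (2.0, 2.0), (0.0, 2.0)]
box_b = [(1.0, 1.0), (3.0, 1.0), (3.0, 3.0), (1.0, 3.0)]

# Overlap area is 1, union is 4 + 4 - 1 = 7, so IOU is 1/7 (~0.143).
print(geometry.calculate_iou(box_a, box_b))

# Every edge of box_a is horizontal or vertical.
print(geometry.is_axis_aligned(box_a))  # True

# Rotating 45 degrees about the centroid breaks axis alignment.
rotated = geometry.rotate_bbox(box_a, angle=45)
print(geometry.is_axis_aligned(rotated))  # False
```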
+ """ + return not is_axis_aligned(bbox) and not is_skewed(bbox) diff --git a/core/valor_core/managers.py b/core/valor_core/managers.py new file mode 100644 index 000000000..eefed359f --- /dev/null +++ b/core/valor_core/managers.py @@ -0,0 +1,322 @@ +import time +from dataclasses import dataclass, field + +import pandas as pd +from valor_core import enums, schemas +from valor_core.detection import ( + compute_detection_metrics, + create_detection_evaluation_inputs, +) + + +def _concatenate_df_if_not_empty( + df1: pd.DataFrame, df2: pd.DataFrame | None +) -> pd.DataFrame: + """Checks to see if a dataframe is None before attempting a concatenation. Handles pandas warning about not using pd.concat on empty dataframes.""" + + if not df1.empty and (df2 is not None): + df1 = pd.concat( + [df1, df2], + ignore_index=True, + ) + elif df1.empty and (df2 is not None): + df1 = df2 + + return df1 + + +@dataclass +class ValorDetectionManager: + """ + Manages the evaluation of object detection predictions against groundtruths. + + Attributes + ---------- + datum_uids : set[str] + A set of unique identifiers for the data samples. + label_map : dict[schemas.Label, schemas.Label] + A mapping from one label schema to another. + convert_annotations_to_type : AnnotationType, optional + The target annotation type to convert the data to. + metrics_to_return : list[enums.MetricType] + A list of metrics to calculate during the evaluation. + iou_thresholds_to_compute : list[float] + A list of IoU thresholds to compute metrics for. + iou_thresholds_to_return : list[float] + A list of IoU thresholds to return metrics for. + recall_score_threshold : float + The score threshold for recall calculations. + pr_curve_iou_threshold : float + The IoU threshold used for precision-recall curve calculation. + pr_curve_max_examples : int + The maximum number of examples to include in the precision-recall curve. + joint_df : pd.DataFrame + A DataFrame containing merged groundtruth and prediction data with calculated IoU. + detailed_joint_df : pd.DataFrame + A DataFrame containing detailed data for precision-recall curves. + unique_groundtruth_labels : dict[set[tuple[str, str]], set[str]] + A dictionary mapping labels to unique groundtruth annotation IDs. + unique_prediction_labels : set[tuple[str, str]] + A set of unique labels present in the predictions. + unique_annotation_ids : set[int] + A set of unique annotation IDs across groundtruth and prediction data. 
+ """ + + datum_uids: set = field(default_factory=set) + label_map: dict[schemas.Label, schemas.Label] = field(default_factory=dict) + convert_annotations_to_type: enums.AnnotationType | None = None + metrics_to_return: list[enums.MetricType] = field( + default_factory=lambda: [ + enums.MetricType.AP, + enums.MetricType.AR, + enums.MetricType.mAP, + enums.MetricType.APAveragedOverIOUs, + enums.MetricType.mAR, + enums.MetricType.mAPAveragedOverIOUs, + ] + ) + iou_thresholds_to_compute: list[float] = field( + default_factory=lambda: [round(0.5 + 0.05 * i, 2) for i in range(10)] + ) + iou_thresholds_to_return: list[float] = field( + default_factory=lambda: [0.5, 0.75] + ) + recall_score_threshold: float = field(default=0.0) + pr_curve_iou_threshold: float = field(default=0.5) + pr_curve_max_examples: int = field(default=1) + joint_df: pd.DataFrame = field( + default_factory=lambda: pd.DataFrame( + [], + columns=[ + "label_id", + "id_gt", + "label", + "score", + "id_pd", + "iou_", + ], + ) + ) + detailed_joint_df: pd.DataFrame = field( + default_factory=lambda: pd.DataFrame( + [], + columns=[ + "datum_uid_gt", + "label_key", + "label_value_gt", + "id_gt", + "converted_geometry_gt", + "datum_uid_pd", + "label_value_pd", + "score", + "label_id_pd", + "id_pd", + "converted_geometry_pd", + "is_label_match", + "iou_", + ], + ) + ) + unique_groundtruth_labels: dict[set[tuple[str, str]], set[str]] = field( + default_factory=dict + ) + unique_prediction_labels: set[tuple[str, str]] = field(default_factory=set) + unique_annotation_ids: set[int] = field(default_factory=set) + _locked = False + + def __post_init__(self): + """Locks the class attributes to prevent modification after initialization.""" + self._locked = True + + def __setattr__(self, key, value): + """Overrides attribute setting to enforce immutability after initialization.""" + + if ( + key + in [ + "label_map", + "convert_annotations_to_type", + "metrics_to_return", + "iou_thresholds_to_compute", + "iou_thresholds_to_return", + "recall_score_threshold", + "pr_curve_iou_threshold", + "pr_curve_max_examples", + ] + ) and self._locked: + raise AttributeError( + f"Cannot manually modify '{key}' after instantiation." + ) + super().__setattr__(key, value) + + def add_data( + self, + groundtruths: list[schemas.GroundTruth], + predictions: list[schemas.Prediction], + ) -> None: + """ + Adds groundtruth and prediction data to the manager. + + Parameters + ---------- + groundtruths : list[schemas.GroundTruth] + A list of GroundTruth objects. + predictions : list[schemas.Prediction] + A list of Prediction objects. + + Raises + ------ + ValueError + If the groundtruths or predictions are not valid lists, or if duplicate + datum_uids are detected. + """ + if not ( + isinstance(groundtruths, list) + and (len(groundtruths) > 0) + and all([isinstance(x, schemas.GroundTruth) for x in groundtruths]) + ): + raise ValueError( + "groundtruths should be a non-empty list of schemas.GroundTruth objects." + ) + + if not (isinstance(predictions, list)): + raise ValueError( + "predictions should be a non-empty list of schemas.Prediction objects." + ) + + # check that datum_uids don't exist in the data yet + unique_datum_uids = set([x.datum.uid for x in groundtruths]).union( + set([x.datum.uid for x in predictions]) + ) + + if not unique_datum_uids.isdisjoint(self.datum_uids): + raise ValueError( + "Attempted to add data for a datum_uid which already exists in this instantiated class." 
+ ) + else: + self.datum_uids = self.datum_uids.union(unique_datum_uids) + + ( + groundtruth_df, + prediction_df, + joint_df, + detailed_joint_df, + ) = create_detection_evaluation_inputs( + groundtruths=groundtruths, + predictions=predictions, + metrics_to_return=self.metrics_to_return, + label_map=self.label_map, + convert_annotations_to_type=self.convert_annotations_to_type, + ) + + # append these dataframes to self + self.joint_df = _concatenate_df_if_not_empty( + df1=self.joint_df, df2=joint_df + ) + self.detailed_joint_df = _concatenate_df_if_not_empty( + df1=self.detailed_joint_df, df2=detailed_joint_df + ) + + # store unique labels (split by gt and pd) and unique annotations + ids_per_label = ( + groundtruth_df.groupby(["label"])["id"].apply(set).to_dict() + ) + + for label, value in ids_per_label.items(): + if label in self.unique_groundtruth_labels.keys(): + self.unique_groundtruth_labels[ + label + ] = self.unique_groundtruth_labels[label].union(value) + else: + self.unique_groundtruth_labels[label] = value + + self.unique_prediction_labels.update( + set(zip(prediction_df["label_key"], prediction_df["label_value"])) + ) + self.unique_annotation_ids.update( + set(groundtruth_df["annotation_id"]) + | set(prediction_df["annotation_id"]) + ) + + def evaluate(self): + """ + Evaluates the added data to compute detection metrics. + + Returns + ------- + schemas.Evaluation + An evaluation object containing metrics, confusion matrices, and metadata. + + Raises + ------ + ValueError + If the method is called before any data has been added. + """ + if self.joint_df.empty: + raise ValueError( + "Attempted to call .evaluate() without adding any data first. Please use add_data to add data to this class." + ) + + start_time = time.time() + + # add the number of groundtruth observations per grouper to joint_df + count_of_unique_ids_per_label = { + key: len(value) + for key, value in self.unique_groundtruth_labels.items() + } + + self.joint_df["gts_per_grouper"] = self.joint_df["label"].map( + count_of_unique_ids_per_label + ) + + metrics = compute_detection_metrics( + joint_df=self.joint_df, + detailed_joint_df=self.detailed_joint_df, + metrics_to_return=self.metrics_to_return, + iou_thresholds_to_compute=self.iou_thresholds_to_compute, + iou_thresholds_to_return=self.iou_thresholds_to_return, + recall_score_threshold=self.recall_score_threshold, + pr_curve_iou_threshold=self.pr_curve_iou_threshold, + pr_curve_max_examples=self.pr_curve_max_examples, + ) + + missing_pred_labels = [ + (key, value) + for key, value in ( + self.unique_groundtruth_labels.keys() + - self.unique_prediction_labels + ) + ] + + ignored_pred_labels = [ + (key, value) + for key, value in ( + self.unique_prediction_labels + - self.unique_groundtruth_labels.keys() + ) + ] + + return schemas.Evaluation( + parameters=schemas.EvaluationParameters( + label_map=self.label_map, + metrics_to_return=self.metrics_to_return, + iou_thresholds_to_compute=self.iou_thresholds_to_compute, + iou_thresholds_to_return=self.iou_thresholds_to_return, + recall_score_threshold=self.recall_score_threshold, + pr_curve_iou_threshold=self.pr_curve_iou_threshold, + pr_curve_max_examples=self.pr_curve_max_examples, + ), + metrics=metrics, + confusion_matrices=[], + ignored_pred_labels=ignored_pred_labels, + missing_pred_labels=missing_pred_labels, # type: ignore - confirmed that this object is list[tuple[str, str]], but it isn't registerring as such + meta={ + "labels": len( + self.unique_groundtruth_labels.keys() + | self.unique_prediction_labels + 
), + "datums": len(self.datum_uids), + "annotations": len(self.unique_annotation_ids), + "duration": time.time() - start_time, + }, + ) diff --git a/core/valor_core/metrics.py b/core/valor_core/metrics.py new file mode 100644 index 000000000..fca9a1972 --- /dev/null +++ b/core/valor_core/metrics.py @@ -0,0 +1,661 @@ +from dataclasses import dataclass + +import numpy as np +from valor_core import schemas + + +@dataclass +class _LabelMetricBase: + """ + Defines a base class for label-level metrics. + + Attributes + ---------- + label : label + A label for the metric. + value : float + The metric value. + """ + + label: schemas.Label + value: float | None + __type__ = "BaseClass" + + def __post_init__(self): + """Validate instantiated class.""" + + if not isinstance(self.label, schemas.Label): + raise TypeError( + f"Expected label to be an instance of schemas.Label, got {type(self.label).__name__}" + ) + if self.value is not None and not isinstance(self.value, float): + raise TypeError( + f"Expected value to be a float or None, got {type(self.value).__name__}" + ) + + def to_dict(self): + """Converts a metric object into a dictionary.""" + return { + "label": {"key": self.label.key, "value": self.label.value}, + "value": self.value, + "type": self.__type__, + } + + +@dataclass +class _LabelKeyMetricBase: + """ + Defines a base class for label key-level metrics. + + Attributes + ---------- + label_key : str + The label key associated with the metric. + value : float + The metric value. + """ + + label_key: str + value: float | None + __type__ = "BaseClass" + + def __post_init__(self): + """Validate instantiated class.""" + + if not isinstance(self.label_key, str): + raise TypeError( + f"Expected label_key to be a string, got {type(self.label_key).__name__}" + ) + if self.value is not None and not isinstance(self.value, float): + raise TypeError( + f"Expected value to be a float or None, got {type(self.value).__name__}" + ) + + def to_dict(self): + """Converts a metric object into a dictionary.""" + return { + "parameters": {"label_key": self.label_key}, + "value": self.value, + "type": self.__type__, + } + + +@dataclass +class ARMetric(_LabelMetricBase): + """ + Defines an AR metric. + + Attributes + ---------- + ious : set[float] + A set of intersect-over-union (IOU) values. + value : float + The value of the metric. + label : Label + The `Label` for the metric. + """ + + ious: set[float] + __type__ = "AR" + + def __post_init__(self): + """Validate instantiated class.""" + + super().__post_init__() + + if not isinstance(self.ious, set): + raise TypeError( + f"Expected ious to be a set, got {type(self.ious).__name__}" + ) + + def to_dict(self): + """Converts a metric object into a dictionary.""" + return { + "label": {"key": self.label.key, "value": self.label.value}, + "parameters": {"ious": sorted(list(self.ious))}, + "value": self.value, + "type": self.__type__, + } + + +@dataclass +class APMetric(_LabelMetricBase): + """ + Defines an AP metric. + + Attributes + ---------- + ious : set[float] + A set of intersect-over-union (IOU) values. + value : float + The value of the metric. + label : Label + The `Label` for the metric. 
+ """ + + iou: float + __type__ = "AP" + + def __post_init__(self): + """Validate instantiated class.""" + + super().__post_init__() + + if not isinstance(self.iou, float): + raise TypeError( + f"Expected iou to be a float, got {type(self.iou).__name__}" + ) + + def to_dict(self): + """Converts a metric object into a dictionary.""" + return { + "label": {"key": self.label.key, "value": self.label.value}, + "parameters": {"iou": self.iou}, + "value": self.value, + "type": self.__type__, + } + + +@dataclass +class APMetricAveragedOverIOUs(_LabelMetricBase): + """ + Defines an APMetricAveragedOverIOUs metric. + + Attributes + ---------- + ious : set[float] + A set of intersect-over-union (IOU) values. + value : float + The value of the metric. + label : Label + The `Label` for the metric. + """ + + ious: set[float] + __type__ = "APAveragedOverIOUs" + + def __post_init__(self): + """Validate instantiated class.""" + + super().__post_init__() + + if not isinstance(self.ious, set): + raise TypeError( + f"Expected ious to be a set, got {type(self.ious).__name__}" + ) + + def to_dict(self): + """Converts a metric object into a dictionary.""" + return { + "label": {"key": self.label.key, "value": self.label.value}, + "parameters": {"ious": sorted(list(self.ious))}, + "value": self.value, + "type": self.__type__, + } + + +@dataclass +class mARMetric(_LabelKeyMetricBase): + """ + Defines a mAR metric. + + Attributes + ---------- + ious : set[float] + A set of intersect-over-union (IOU) values. + value : float + The value of the metric. + label_key : str + The label key associated with the metric. + """ + + ious: set[float] + __type__ = "mAR" + + def __post_init__(self): + """Validate instantiated class.""" + + super().__post_init__() + + if not isinstance(self.ious, set): + raise TypeError( + f"Expected ious to be a set, got {type(self.ious).__name__}" + ) + + def to_dict(self): + """Converts a metric object into a dictionary.""" + return { + "parameters": { + "label_key": self.label_key, + "ious": sorted(list(self.ious)), + }, + "value": self.value, + "type": self.__type__, + } + + +@dataclass +class mAPMetric(_LabelKeyMetricBase): + """ + Defines a mAP metric. + + Attributes + ---------- + iou: float + An intersect-over-union (IOU) value. + value : float + The value of the metric. + label_key : str + The label key associated with the metric. + """ + + iou: float + __type__ = "mAP" + + def __post_init__(self): + """Validate instantiated class.""" + + super().__post_init__() + + if not isinstance(self.iou, float): + raise TypeError( + f"Expected iou to be a float, got {type(self.iou).__name__}" + ) + + def to_dict(self): + """Converts a metric object into a dictionary.""" + return { + "parameters": {"label_key": self.label_key, "iou": self.iou}, + "value": self.value, + "type": self.__type__, + } + + +@dataclass +class mAPMetricAveragedOverIOUs(_LabelKeyMetricBase): + """ + Defines a mAR metric. + + Attributes + ---------- + ious : set[float] + A set of intersect-over-union (IOU) values. + value : float + The value of the metric. + label_key : str + The label key associated with the metric. 
+ """ + + ious: set[float] + __type__ = "mAPAveragedOverIOUs" + + def __post_init__(self): + """Validate instantiated class.""" + + super().__post_init__() + + if not isinstance(self.ious, set): + raise TypeError( + f"Expected ious to be a set, got {type(self.ious).__name__}" + ) + + def to_dict(self): + """Converts a metric object into a dictionary.""" + return { + "parameters": { + "label_key": self.label_key, + "ious": sorted(list(self.ious)), + }, + "value": self.value, + "type": self.__type__, + } + + +class PrecisionMetric(_LabelMetricBase): + """ + Defines a Precision metric. + + Attributes + ---------- + label : Label + A key-value pair. + value : float, optional + The metric value. + """ + + __type__ = "Precision" + + def __post_init__(self): + """Validate instantiated class.""" + + super().__post_init__() + + +class RecallMetric(_LabelMetricBase): + """ + Defines a Recall metric. + + Attributes + ---------- + label : Label + A key-value pair. + value : float, optional + The metric value. + """ + + __type__ = "Recall" + + def __post_init__(self): + """Validate instantiated class.""" + + super().__post_init__() + + +class F1Metric(_LabelMetricBase): + """ + Defines a F1 metric. + + Attributes + ---------- + label : Label + A key-value pair. + value : float, optional + The metric value. + """ + + __type__ = "F1" + + def __post_init__(self): + super().__post_init__() + + +class ROCAUCMetric(_LabelKeyMetricBase): + """ + Defines a ROC AUC metric. + + Attributes + ---------- + label_key : str + The label key associated with the metric. + value : float + The metric value. + """ + + __type__ = "ROCAUC" + + def __post_init__(self): + """Validate instantiated class.""" + + super().__post_init__() + + +class AccuracyMetric(_LabelKeyMetricBase): + """ + Defines a accuracy metric. + + Attributes + ---------- + label_key : str + The label key associated with the metric. + value : float + The metric value. + """ + + __type__ = "Accuracy" + + def __post_init__(self): + """Validate instantiated class.""" + + super().__post_init__() + + +@dataclass +class _BasePrecisionRecallCurve: + """ + Describes the parent class of our precision-recall curve metrics. + + Attributes + ---------- + label_key: str + The label key associated with the metric. + pr_curve_iou_threshold: float, optional + The IOU threshold to use when calculating precision-recall curves. Defaults to 0.5. + """ + + label_key: str + value: dict + pr_curve_iou_threshold: float | None + __type__ = "BaseClass" + + def __post_init__(self): + """Validate instantiated class.""" + + if not isinstance(self.label_key, str): + raise TypeError( + f"Expected label_key to be a string, but got {type(self.label_key).__name__}." + ) + + if not isinstance(self.value, dict): + raise TypeError( + f"Expected value to be a dictionary, but got {type(self.value).__name__}." + ) + + if self.pr_curve_iou_threshold is not None and not isinstance( + self.pr_curve_iou_threshold, float + ): + raise TypeError( + f"Expected pr_curve_iou_threshold to be a float or None, but got {type(self.pr_curve_iou_threshold).__name__}." + ) + + def to_dict(self): + """Converts a metric object into a dictionary.""" + return { + "parameters": {"label_key": self.label_key}, + "value": self.value, + "type": self.__type__, + } + + +class PrecisionRecallCurve(_BasePrecisionRecallCurve): + """ + Describes a precision-recall curve. + + Attributes + ---------- + label_key: str + The label key associated with the metric. 
+ value: dict + A nested dictionary where the first key is the class label, the second key is the confidence threshold (e.g., 0.05), the third key is the metric name (e.g., "precision"), and the final key is either the value itself (for precision, recall, etc.) or a list of tuples containing data for each observation. + pr_curve_iou_threshold: float, optional + The IOU threshold to use when calculating precision-recall curves. Defaults to 0.5. + """ + + __type__ = "PrecisionRecallCurve" + value: dict[ + str, # the label value + dict[ + float, # the score threshold + dict[ + str, # the metric (e.g., "tp" for true positive) + int | float | None, + ], # the count or metric value + ], + ] + + def __post_init__(self): + """Validate instantiated class.""" + + super().__post_init__() + + +class DetailedPrecisionRecallCurve(_BasePrecisionRecallCurve): + """ + Describes a detailed precision-recall curve, which includes datum examples for each classification (e.g., true positive, false negative, etc.). + + Attributes + ---------- + label_key: str + The label key associated with the metric. + value: dict + A nested dictionary where the first key is the class label, the second key is the confidence threshold (e.g., 0.05), the third key is the metric name (e.g., "precision"), and the final key is either the value itself (for precision, recall, etc.) or a list of tuples containing data for each observation. + pr_curve_iou_threshold: float, optional + The IOU threshold to use when calculating precision-recall curves. Defaults to 0.5. + """ + + __type__ = "DetailedPrecisionRecallCurve" + value: dict[ + str, # the label value + dict[ + float, # the score threshold + dict[ + str, # the metric (e.g., "tp" for true positive) + dict[ + str, # the label for the next level of the dictionary (e.g., "observations" or "total") + int # the count of classifications + | dict[ + str, # the subclassification for the label (e.g., "misclassifications") + dict[ + str, # the label for the next level of the dictionary (e.g., "count" or "examples") + int # the count of subclassifications + | list[ + tuple[str, str] | tuple[str, str, str], + ], + ], # a list containing examples + ], + ], + ], + ], + ] + + def __post_init__(self): + """Validate instantiated class.""" + + super().__post_init__() + + +@dataclass +class ConfusionMatrixEntry: + """ + Describes one element in a confusion matrix. + + Attributes + ---------- + prediction : str + The prediction. + groundtruth : str + The ground truth. + count : int + The value of the element in the matrix. + """ + + prediction: str + groundtruth: str + count: int + + def __post_init__(self): + """Validate instantiated class.""" + + if not isinstance(self.prediction, str): + raise TypeError( + f"Expected prediction to be a string, but got {type(self.prediction).__name__}." + ) + + if not isinstance(self.groundtruth, str): + raise TypeError( + f"Expected groundtruth to be a string, but got {type(self.groundtruth).__name__}." + ) + + if not isinstance(self.count, int): + raise TypeError( + f"Expected count to be an integer, but got {type(self.count).__name__}." + ) + + def to_dict(self): + """Converts a ConfusionMatrixEntry object into a dictionary.""" + return { + "prediction": self.prediction, + "groundtruth": self.groundtruth, + "count": self.count, + } + + +@dataclass +class _BaseConfusionMatrix: + """ + Describes a base confusion matrix. + + Attributes + ---------- + label_ley : str + A label for the matrix. + entries : list[ConfusionMatrixEntry] + A list of entries for the matrix. 
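The nested `value` dictionary described above is easiest to read from a literal. A hypothetical, hand-written example of the shape a `PrecisionRecallCurve` carries (the metric-name keys and counts are illustrative only; `DetailedPrecisionRecallCurve` nests two further levels for examples):

```python
from valor_core.metrics import PrecisionRecallCurve

pr_curve = PrecisionRecallCurve(
    label_key="class",
    value={
        # label value -> score threshold -> metric name -> count / value
        "dog": {
            0.5: {"tp": 3, "fp": 1, "fn": 2, "precision": 0.75, "recall": 0.6},
            0.75: {"tp": 2, "fp": 1, "fn": 3, "precision": 2 / 3, "recall": 0.4},
        }
    },
    pr_curve_iou_threshold=0.5,
)
print(pr_curve.to_dict()["type"])  # PrecisionRecallCurve
```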
+ """ + + label_key: str + entries: list[ConfusionMatrixEntry] + + def __post_init__(self): + """Validate instantiated class.""" + + if not isinstance(self.label_key, str): + raise TypeError( + f"Expected label_key to be a string, but got {type(self.label_key).__name__}." + ) + + if not isinstance(self.entries, list): + raise TypeError( + f"Expected entries to be a list, but got {type(self.entries).__name__}." + ) + + for entry in self.entries: + if not isinstance(entry, ConfusionMatrixEntry): + raise TypeError( + f"Expected entry to be of type ConfusionMatrixEntry, but got {type(entry).__name__}." + ) + + def to_dict(self): + """Converts a ConfusionMatrix object into a dictionary.""" + return { + "label_key": self.label_key, + "entries": [entry.to_dict() for entry in self.entries], + } + + +class ConfusionMatrix(_BaseConfusionMatrix): + """ + Describes a confusion matrix. + + Attributes + ---------- + label_key : str + A label for the matrix. + entries : list[ConfusionMatrixEntry] + A list of entries for the matrix. + + Attributes + ---------- + matrix : np.ndarray + A sparse matrix representing the confusion matrix. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + label_values = set( + [entry.prediction for entry in self.entries] + + [entry.groundtruth for entry in self.entries] + ) + self.label_map = { + label_value: i + for i, label_value in enumerate(sorted(label_values)) + } + n_label_values = len(self.label_map) + + matrix = np.zeros((n_label_values, n_label_values), dtype=int) + for entry in self.entries: + matrix[ + self.label_map[entry.groundtruth], + self.label_map[entry.prediction], + ] = entry.count + + self.matrix = matrix diff --git a/core/valor_core/schemas.py b/core/valor_core/schemas.py new file mode 100644 index 000000000..d1dfe023a --- /dev/null +++ b/core/valor_core/schemas.py @@ -0,0 +1,1739 @@ +import io +import json +import math +from base64 import b64decode, b64encode +from dataclasses import dataclass +from typing import Any + +import numpy as np +import PIL.ImageDraw as ImageDraw +from PIL import Image +from valor_core import enums, geometry + + +def _generate_type_error(received_value: Any, expected_type: str): + """Raise a TypeError with a specific error string format.""" + raise TypeError( + f"Expected value of type '{expected_type}', received value '{received_value}' with type '{type(received_value).__name__}'." + ) + + +def _validate_type_point(v: Any) -> None: + """ + Validates geometric point values. + + Parameters + ---------- + v : Any + The value to validate. + + Raises + ------ + TypeError + If the value is not of type 'tuple' or 'list'. + ValueError + If the point is not an (x,y) position. + """ + if not isinstance(v, (tuple, list)): + _generate_type_error(v, "tuple[float, float] or list[float]") + elif not ( + len(v) == 2 + and isinstance(v[0], (int, float, np.number)) + and isinstance(v[1], (int, float, np.number)) + ): + raise TypeError( + f"Expected point to have two numeric values representing an (x, y) pair. Received '{v}'." + ) + + +def _validate_type_multipoint(v: Any) -> None: + """ + Validates geometric multipoint values. + + Parameters + ---------- + v : Any + The value to validate. + + Raises + ------ + TypeError + If the value is not of type 'list'. + ValueError + If there are no points or they are not (x,y) positions. 
+ """ + if not isinstance(v, list): + _generate_type_error( + v, "list[tuple[float, float]] or list[list[float]]" + ) + elif not v: + raise TypeError("List cannot be empty.") + for point in v: + _validate_type_point(point) + + +def _validate_type_linestring(v: Any) -> None: + """ + Validates geometric linestring values. + + Parameters + ---------- + v : Any + The value to validate. + + Raises + ------ + TypeError + If the value is not of type 'list'. + ValueError + If the value does not conform to the linestring requirements. + """ + _validate_type_multipoint(v) + if len(v) < 2: + raise TypeError(f"A line requires two or more points. Received '{v}'.") + + +def _validate_type_multilinestring(v: Any) -> None: + """ + Validates geometric multilinestring values. + + Parameters + ---------- + v : Any + The value to validate. + + Raises + ------ + TypeError + If the value is not of type 'list'. + ValueError + If the value does not conform to the multilinestring requirements. + """ + if not isinstance(v, list): + return _generate_type_error( + v, "list[list[tuple[float, float]]] or list[list[list[float]]]" + ) + elif not v: + raise ValueError("List cannot be empty.") + for line in v: + _validate_type_linestring(line) + + +def _validate_type_polygon(v: Any) -> None: + """ + Validates geometric polygon values. + + Parameters + ---------- + v : Any + The value to validate. + + Raises + ------ + TypeError + If the value is not of type 'list'. + ValueError + If the value does not conform to the polygon requirements. + """ + if not isinstance(v, list): + raise TypeError("Expected value to be a list.") + + _validate_type_multilinestring(v) + for line in v: + if not (len(line) >= 4 and line[0] == line[-1]): + raise ValueError( + "A polygon is defined by a list of at least four points with the first and last points being equal." + ) + + +def _validate_type_box(v: Any) -> None: + """ + Validates geometric box values. + + Parameters + ---------- + v : Any + The value to validate. + + Raises + ------ + TypeError + If the value is not of type 'list'. + ValueError + If the value does not conform to the box requirements. + """ + _validate_type_polygon(v) + if not (len(v) == 1 and len(v[0]) == 5 and v[0][0] == v[0][-1]): + raise ValueError( + "Boxes are defined by five points with the first and last being equal." + ) + + if geometry.is_skewed(v[0]): + raise NotImplementedError("Skewed boxes are not implemented yet.") + + +def _validate_geojson(geojson: dict) -> None: + """ + Validates that a dictionary conforms to the GeoJSON geometry specification. + + Parameters + ---------- + geojson: dict + The dictionary to validate. + + Raises + ------ + TypeError + If the passed in value is not a dictionary. + If the GeoJSON 'type' attribute is not supported. + ValueError + If the dictionary does not conform to the GeoJSON format. + """ + map_str_to_geojson_validator = { + "point": _validate_type_point, + "multipoint": _validate_type_multipoint, + "linestring": _validate_type_linestring, + "multilinestring": _validate_type_multilinestring, + "polygon": _validate_type_polygon, + } + # validate geojson + if not isinstance(geojson, dict): + raise TypeError( + f"Expected a GeoJSON dictionary as input, received '{geojson}'." + ) + elif not ( + set(geojson.keys()) == {"type", "coordinates"} + and (geometry_type := geojson.get("type")) + and (geometry_value := geojson.get("coordinates")) + ): + raise ValueError( + f"Expected geojson to be a dictionary with keys 'type' and 'coordinates'. Received value '{geojson}'." 
+ ) + + # validate type + geometry_type = geometry_type.lower() + if geometry_type not in map_str_to_geojson_validator: + raise TypeError( + f"Class '{geometry_type}' is not a supported GeoJSON geometry type." + ) + + # validate coordinates + try: + map_str_to_geojson_validator[geometry_type](geometry_value) + except (ValueError, ValueError) as e: + raise ValueError( + f"Value does not conform to '{geometry_type}'. Validation error: {str(e)}" + ) + + +@dataclass +class Point: + """ + Describes a Point in (x,y) coordinates. + + Attributes + ---------- + value : tuple[int | float, int | float] + A list of coordinates describing the Point. + + Raises + ------ + ValueError + If the value doesn't conform to the type. + """ + + value: tuple[int | float, int | float] + + def __post_init__(self): + """Validate instantiated class.""" + + _validate_type_point(self.value) + + @classmethod + def from_dict(cls, geojson: dict) -> "Point": + """ + Create a Point from a GeoJSON in dictionary format. + + Parameters + ---------- + geojson: dict[str, str | list[int | float]] + A Point value in GeoJSON format. + """ + geometry = GeoJSON(**geojson).geometry + if not isinstance(geometry, Point): + raise TypeError(f"GeoJSON is for a different type '{geojson}'.") + return geometry + + def to_dict(self) -> dict[str, str | list[int | float]]: + """ + Create a dictionary that represents the Point in GeoJSON format. + + Returns + ---------- + dict[str, str | list[int | float]] + A Point value in GeoJSON format. + """ + return {"type": "Point", "coordinates": list(self.value)} + + @classmethod + def from_json(cls, geojson: str) -> "Point": + """ + Create a Point from a GeoJSON in json format. + + Parameters + ---------- + geojson: str + A Point value in GeoJSON format. + """ + return cls.from_dict(json.loads(geojson)) + + def to_json(self) -> str: + """ + Create a json string that represents the Point in GeoJSON format. + + Returns + ---------- + str + A Point value in GeoJSON format. + """ + return json.dumps(self.to_dict()) + + def to_wkt(self) -> str: + """ + Casts the geometric object into a string using Well-Known-Text (WKT) Format. + + Returns + ------- + str + The WKT formatted string. + """ + return f"POINT ({self.value[0]} {self.value[1]})" + + def resize( + self, + og_img_h=10, + og_img_w=10, + new_img_h=100, + new_img_w=100, + ): + h_ratio = new_img_h / og_img_h + w_ratio = new_img_w / og_img_w + return Point((self.value[0] * h_ratio, self.value[1] * w_ratio)) + + @property + def x(self): + return self.value[0] + + @property + def y(self): + return self.value[1] + + def __hash__(self): + return hash(str([float(x) for x in self.value])) + + +@dataclass +class MultiPoint: + """ + Describes a MultiPoint in (x,y) coordinates. + + Attributes + ---------- + value : list[tuple[int | float, int | float]] + A list of coordinates describing the MultiPoint. + + Raises + ------ + ValueError + If the value doesn't conform to the type. + """ + + value: list[tuple[int | float, int | float]] + + def __post_init__(self): + """Validate instantiated class.""" + + _validate_type_multipoint(self.value) + + @classmethod + def from_dict(cls, geojson: dict) -> "MultiPoint": + """ + Create a MultiPoint from a GeoJSON in dictionary format. + + Parameters + ---------- + geojson: dict[str, str | list[list[int | float]]] + A MultiPoint value in GeoJSON format. 
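The geometry dataclasses round-trip between GeoJSON and WKT; a brief sketch with the `Point` class defined above (import path assumes the package layout shown in this diff):

```python
from valor_core.schemas import Point

pt = Point(value=(1.0, 2.0))
print(pt.to_wkt())   # POINT (1.0 2.0)
print(pt.to_json())  # {"type": "Point", "coordinates": [1.0, 2.0]}

# Parse the GeoJSON back into a Point.
same_pt = Point.from_json('{"type": "Point", "coordinates": [1.0, 2.0]}')
print(same_pt.x, same_pt.y)  # 1.0 2.0
```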
+ """ + geometry = GeoJSON(**geojson).geometry + if not isinstance(geometry, MultiPoint): + raise TypeError(f"GeoJSON is for a different type '{geojson}'.") + return geometry + + def to_dict(self) -> dict[str, str | list[list[int | float]]]: + """ + Create a dictionary that represents the MultiPoint in GeoJSON format. + + Returns + ---------- + dict[str, str | list[list[int | float]]] + A MultiPoint value in GeoJSON format. + """ + return { + "type": "MultiPoint", + "coordinates": [list(point) for point in self.value], + } + + @classmethod + def from_json(cls, geojson: str) -> "MultiPoint": + """ + Create a MultiPoint from a GeoJSON in json format. + + Parameters + ---------- + geojson: str + A MultiPoint value in GeoJSON format. + """ + return cls.from_dict(json.loads(geojson)) + + def to_json(self) -> str: + """ + Create a json string that represents the MultiPoint in GeoJSON format. + + Returns + ---------- + str + A MultiPoint value in GeoJSON format. + """ + return json.dumps(self.to_dict()) + + def to_wkt(self) -> str: + """ + Casts the geometric object into a string using Well-Known-Text (WKT) Format. + + Returns + ------- + str + The WKT formatted string. + """ + points = ", ".join( + [f"({point[0]} {point[1]})" for point in self.value] + ) + return f"MULTIPOINT ({points})" + + +@dataclass +class LineString: + """ + Describes a LineString in (x,y) coordinates. + + Attributes + ---------- + value : list[tuple[int | float, int | float]] + A list of coordinates describing the LineString. + + Raises + ------ + ValueError + If the value doesn't conform to the type. + """ + + value: list[tuple[int | float, int | float]] + + def __post_init__(self): + """Validate instantiated class.""" + + _validate_type_linestring(self.value) + + @classmethod + def from_dict(cls, geojson: dict) -> "LineString": + """ + Create a LineString from a GeoJSON in dictionary format. + + Parameters + ---------- + geojson: dict[str, str | list[list[int | float]]] + A LineString value in GeoJSON format. + """ + geometry = GeoJSON(**geojson).geometry + if not isinstance(geometry, LineString): + raise TypeError(f"GeoJSON is for a different type '{geojson}'.") + return geometry + + def to_dict(self) -> dict[str, str | list[list[int | float]]]: + """ + Create a dictionary that represents the LineString in GeoJSON format. + + Returns + ---------- + dict[str, str | list[list[int | float]]] + A LineString value in GeoJSON format. + """ + return { + "type": "LineString", + "coordinates": [list(point) for point in self.value], + } + + @classmethod + def from_json(cls, geojson: str) -> "LineString": + """ + Create a LineString from a GeoJSON in json format. + + Parameters + ---------- + geojson: str + A LineString value in GeoJSON format. + """ + return cls.from_dict(json.loads(geojson)) + + def to_json(self) -> str: + """ + Create a json string that represents the LineString in GeoJSON format. + + Returns + ---------- + str + A LineString value in GeoJSON format. + """ + return json.dumps(self.to_dict()) + + def to_wkt(self) -> str: + """ + Casts the geometric object into a string using Well-Known-Text (WKT) Format. + + Returns + ------- + str + The WKT formatted string. + """ + points = ", ".join([f"{point[0]} {point[1]}" for point in self.value]) + return f"LINESTRING ({points})" + + +@dataclass +class MultiLineString: + """ + Describes a MultiLineString in (x,y) coordinates. + + Attributes + ---------- + value : list[list[tuple[int | float, int | float]]] + A list of coordinates describing the MultiLineString. 
+ + Raises + ------ + ValueError + If the value doesn't conform to the type. + """ + + value: list[list[tuple[int | float, int | float]]] + + def __post_init__(self): + """Validate instantiated class.""" + + _validate_type_multilinestring(self.value) + + @classmethod + def from_dict(cls, geojson: dict) -> "MultiLineString": + """ + Create a MultiLineString from a GeoJSON in dictionary format. + + Parameters + ---------- + geojson: dict[str, str | list[list[list[int | float]]]] + A MultiLineString value in GeoJSON format. + """ + geometry = GeoJSON(**geojson).geometry + if not isinstance(geometry, MultiLineString): + raise TypeError(f"GeoJSON is for a different type '{geojson}'.") + return geometry + + def to_dict(self) -> dict[str, str | list[list[list[int | float]]]]: + """ + Create a dictionary that represents the MultiLineString in GeoJSON format. + + Returns + ---------- + dict[str, str | list[list[list[int | float]]]] + A MultiLineString value in GeoJSON format. + """ + return { + "type": "MultiLineString", + "coordinates": [ + [list(point) for point in line] for line in self.value + ], + } + + @classmethod + def from_json(cls, geojson: str) -> "MultiLineString": + """ + Create a MultiLineString from a GeoJSON in json format. + + Parameters + ---------- + geojson: str + A MultiLineString value in GeoJSON format. + """ + return cls.from_dict(json.loads(geojson)) + + def to_json(self) -> str: + """ + Create a json string that represents the MultiLineString in GeoJSON format. + + Returns + ---------- + str + A MultiLineString value in GeoJSON format. + """ + return json.dumps(self.to_dict()) + + def to_wkt(self) -> str: + """ + Casts the geometric object into a string using Well-Known-Text (WKT) Format. + + Returns + ------- + str + The WKT formatted string. + """ + points = "),(".join( + [ + ", ".join([f"{point[0]} {point[1]}" for point in line]) + for line in self.value + ] + ) + return f"MULTILINESTRING (({points}))" + + +@dataclass +class Polygon: + """ + Describes a Polygon in (x,y) coordinates. + + Attributes + ---------- + value : list[list[tuple[int | float, int | float]]] + A list of coordinates describing the Box. + + Raises + ------ + ValueError + If the value doesn't conform to the type. + """ + + value: list[list[tuple[int, int]]] | list[list[tuple[float, float]]] + + def __post_init__(self): + """Validate instantiated class.""" + + if not ( + isinstance(self.value, list) + and len(self.value) > 0 + and isinstance(self.value[0], list) + ): + raise TypeError("Expected list of lists.") + _validate_type_polygon(self.value) + + @classmethod + def from_dict(cls, geojson: dict) -> "Polygon": + """ + Create a Polygon from a GeoJSON in dictionary format. + + Parameters + ---------- + geojson: dict[str, str | list[list[list[int | float]]]] + A Polygon value in GeoJSON format. + """ + geometry = GeoJSON(**geojson).geometry + if not isinstance(geometry, Polygon): + raise TypeError(f"GeoJSON is for a different type '{geojson}'.") + return geometry + + def to_dict(self) -> dict[str, str | list[list[list[int | float]]]]: + """ + Create a dictionary that represents the Polygon in GeoJSON format. + + Returns + ---------- + dict[str, str | list[list[list[int | float]]]] + A Polygon value in GeoJSON format. + """ + return { + "type": "Polygon", + "coordinates": [ + [list(point) for point in subpolygon] + for subpolygon in self.value + ], + } + + @classmethod + def from_json(cls, geojson: str) -> "Polygon": + """ + Create a Polygon from a GeoJSON in json format. 
+ + Parameters + ---------- + geojson: str + A Polygon value in GeoJSON format. + """ + return cls.from_dict(json.loads(geojson)) + + def to_json(self) -> str: + """ + Create a json string that represents the Polygon in GeoJSON format. + + Returns + ---------- + str + A Polygon value in GeoJSON format. + """ + return json.dumps(self.to_dict()) + + def to_wkt(self) -> str: + """ + Casts the geometric object into a string using Well-Known-Text (WKT) Format. + + Returns + ------- + str + The WKT formatted string. + """ + coords = "),(".join( + [ + ", ".join([f"{point[0]} {point[1]}" for point in subpolygon]) + for subpolygon in self.value + ] + ) + return f"POLYGON (({coords}))" + + @property + def boundary(self): + """ + The boundary of the polygon. + + Returns + ------- + list[Tuple(float, float)] + A list of points. + """ + value = self.value + if value is None: + raise ValueError("Polygon is 'None'") + return value[0] + + @property + def holes(self): + """ + Any holes in the polygon. + + Returns + ------- + list[list[Tuple(float, float)]] + A list of holes. + """ + value = self.value + if value is None: + raise ValueError("Polygon is 'None'") + return value[1:] + + @property + def xmin(self) -> float: + """ + Minimum x-value. + + Returns + ------- + float + """ + return min([p[0] for p in self.boundary]) + + @property + def xmax(self) -> float: + """ + Maximum x-value. + + Returns + ------- + float + """ + return max([p[0] for p in self.boundary]) + + @property + def ymin(self) -> float: + """ + Minimum y-value. + + Returns + ------- + float + """ + return min([p[1] for p in self.boundary]) + + @property + def ymax(self) -> float: + """ + Maximum y-value. + + Returns + ------- + float + """ + return max([p[1] for p in self.boundary]) + + def to_array(self) -> np.ndarray: + """ + Convert Polygon to an array. + + Returns + ------- + np.ndarray + """ + return np.array(self.value[0]) + + def to_coordinates(self) -> list[list[dict[str, int | float]]]: + """ + Convert Polygon to a nested list of coordinates. + + Returns + ------- + np.ndarray + """ + return [[{"x": points[0], "y": points[1]} for points in self.value[0]]] + + +@dataclass +class Box: + """ + Describes a Box in (x,y) coordinates. + + Attributes + ---------- + value : list[list[tuple[int | float, int | float]]] + A list of coordinates describing the Box. + + Raises + ------ + ValueError + If the value doesn't conform to the type. + """ + + value: list[list[tuple[int, int]]] | list[list[tuple[float, float]]] + + def __post_init__(self): + """Validate instantiated class.""" + + _validate_type_box(self.value) + + @classmethod + def from_extrema( + cls, + xmin: float, + xmax: float, + ymin: float, + ymax: float, + ): + """ + Create a box from extrema. + + Parameters + ---------- + xmin: float + The minimum x-coordinate. + xmax: float + The maximum x-coordinate. + ymin: float + The minimum y-coordinate. + ymax: float + The maximum y-coordinate. + """ + if xmin >= xmax or ymin >= ymax: + raise ValueError( + "Minimums cannot be greater-than or equal to maximums." + ) + return cls( + value=[ + [ + (xmin, ymin), + (xmax, ymin), + (xmax, ymax), + (xmin, ymax), + (xmin, ymin), + ] + ] + ) + + @classmethod + def from_dict(cls, geojson: dict) -> "Box": + """ + Create a Box from a GeoJSON Polygon in dictionary format. + + Parameters + ---------- + geojson: dict[str, str | list[list[list[int | float]]]] + A Polygon value in GeoJSON format. 
+ """ + return cls(value=Polygon.from_dict(geojson).value) + + def to_dict(self) -> dict[str, str | list[list[list[int | float]]]]: + """ + Create a dictionary that represents the Box using a GeoJSON Polygon. + + Returns + ---------- + dict[str, str | list[list[list[int | float]]]] + A Polygon value in GeoJSON format. + """ + return Polygon(value=self.value).to_dict() + + @classmethod + def from_json(cls, geojson: str) -> "Box": + """ + Create a Box from a GeoJSON Polygon in json format. + + Parameters + ---------- + geojson: str + A Polygon value in GeoJSON format. + """ + return cls.from_dict(json.loads(geojson)) + + def to_json(self) -> str: + """ + Create a json string that represents the Box using a GeoJSON Polygon. + + Returns + ---------- + str + A Polygon value in GeoJSON format. + """ + return json.dumps(self.to_dict()) + + def to_wkt(self) -> str: + """ + Casts the geometric object into a string using Well-Known-Text (WKT) Format. + + Note that 'Box' is not a supported geometry so the output will use the format for 'Polygon'. + + Returns + ------- + str + The WKT formatted string. + """ + return Polygon(value=self.value).to_wkt() + + def to_array( + self, + ) -> np.ndarray: + """ + Convert Box to an array. + + Returns + ------- + np.ndarray + """ + return np.array(self.value[0]) + + def to_coordinates(self) -> list[list[dict[str, int | float]]]: + """ + Convert Polygon to a nested list of coordinates. + + Returns + ------- + np.ndarray + """ + return [[{"x": points[0], "y": points[1]} for points in self.value[0]]] + + @property + def xmin(self): + return min([point[0] for point in self.value[0]]) + + @property + def xmax(self): + return max([point[0] for point in self.value[0]]) + + @property + def ymin(self): + return min([point[1] for point in self.value[0]]) + + @property + def ymax(self): + return max([point[1] for point in self.value[0]]) + + +@dataclass +class GeoJSON: + type: str + coordinates: ( + list[float] + | list[list[float]] + | list[list[list[float]]] + | list[list[list[list[float]]]] + ) + + def __post_init__(self): + """Validate instantiated class.""" + + _validate_geojson({"type": self.type, "coordinates": self.coordinates}) + + @property + def geometry( + self, + ) -> Point | MultiPoint | LineString | MultiLineString | Polygon: + map_str_to_type = { + "Point": Point, + "MultiPoint": MultiPoint, + "LineString": LineString, + "MultiLineString": MultiLineString, + "Polygon": Polygon, + } + return map_str_to_type[self.type](value=self.coordinates) + + def to_wkt(self) -> str: + """ + Converts the GeoJSON to a string in Well-Known-Text (WKT) formatting. + + Returns + ------- + str + The geometry in WKT format. + """ + return self.geometry.to_wkt() + + +@dataclass +class Raster: + """ + Represents a binary mask. + + Parameters + ---------- + value : dict[str, np.ndarray | str | None], optional + An raster value. + + Attributes + ---------- + area + array + geometry + height + width + + Raises + ------ + TypeError + If `encoding` is not a string. + + Examples + -------- + Generate a random mask. + >>> import numpy.random + >>> height = 640 + >>> width = 480 + >>> array = numpy.random.rand(height, width) + + Convert to binary mask. + >>> mask = (array > 0.5) + + Create Raster. + >>> Raster(mask) + """ + + mask: np.ndarray + + def __post_init__(self): + """Validate instantiated class.""" + + if not isinstance(self.mask, np.ndarray): + raise TypeError( + "Raster should contain a numpy array describing the Raster mask." 
+ ) + if len(self.mask.shape) != 2: + raise ValueError("raster only supports 2d arrays") + + if self.mask is not None and self.mask.dtype != bool: + raise ValueError( + f"Expecting a binary mask (i.e. of dtype bool) but got dtype {self.mask.dtype}" + ) + + def encode_value(self) -> Any: + """Encode object to JSON compatible dictionary.""" + value = self.mask + if value is None: + return None + + if self.mask is not None: + f = io.BytesIO() + Image.fromarray(self.mask).save(f, format="PNG") + f.seek(0) + mask_bytes = f.read() + f.close() + decoded_mask_bytes = b64encode(mask_bytes).decode() + else: + decoded_mask_bytes = None + return { + "mask": decoded_mask_bytes, + } + + @classmethod + def decode_value(cls, mask: Any): + """Decode object from JSON compatible dictionary.""" + mask_bytes = b64decode(mask) + with io.BytesIO(mask_bytes) as f: + img = Image.open(f) + value = np.array(img) + + return cls(mask=value) + + def to_array(self) -> np.ndarray: + """ + Convert Raster to a numpy array. + + Returns + ------- + np.ndarray | None + A 2D binary array representing the mask if it exists. + """ + return self.mask + + @classmethod + def from_coordinates( + cls, + coordinates: list[list[dict[str, int]]] | list[list[dict[str, float]]], + height: int, + width: int, + ): + """ + Create a Raster object from coordinates. + + Parameters + ---------- + coordinates : list[list[dict[str, int]]] + Defines the bitmask as a nested list of coordinates. + height : int + The intended height of the binary mask. + width : int + The intended width of the binary mask. + + Returns + ------- + schemas.Raster + """ + if not (isinstance(coordinates, list)): + raise TypeError( + "coordinates should either be an empty list, or it should be a list of lists containing dictionaries with 'x' and 'y' keys." + ) + + if len(coordinates) > 0 and not ( + isinstance(coordinates[0], list) + and len(coordinates[0]) > 0 + and isinstance(coordinates[0][0], dict) + and all( + all(set(pt.keys()) == {"x", "y"} for pt in contour) + for contour in coordinates + ) + ): + raise TypeError( + "Coordinates should either be an empty list, or it should be a list of lists containing dictionaries with 'x' and 'y' keys." + ) + + if not ( + all( + all(pt["x"] >= 0 and pt["y"] >= 0 for pt in contour) + for contour in coordinates + ) + ): + raise ValueError( + "Coordinates cannot be negative when converting to a raster." + ) + + contours = [ + [(min(pt["x"], width), min(pt["y"], height)) for pt in contour] + for contour in coordinates + ] + + img = Image.new("1", (width, height), 0) + + for contour in contours: + if len(contour) >= 2: + ImageDraw.Draw(img).polygon(contour, outline=1, fill=1) + + return cls(np.array(img)) + + @classmethod + def from_geometry(cls, geometry: Box | Polygon, height: int, width: int): + """ + Create a Raster object from a geometry. + + Parameters + ---------- + coordinates : list[list[dict[str, int]]] + Defines the bitmask as a nested list of coordinates. + height : int + The intended height of the binary mask. + width : int + The intended width of the binary mask. + + Returns + ------- + schemas.Raster + """ + if not (isinstance(geometry, Box) or isinstance(geometry, Polygon)): + raise TypeError("Geometry should be a Box or Polygon.") + + return cls.from_coordinates( + geometry.to_coordinates(), height=height, width=width + ) + + +@dataclass +class Embedding: + """ + Represents a model embedding. + + Parameters + ---------- + value : list[float], optional + An embedding value. 
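+
+    Examples
+    --------
+    >>> # Illustrative sketch: a short embedding vector.
+    >>> Embedding(value=[0.1, 0.2, 0.3])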
+ """ + + value: list[int] | list[float] | None = None + + def __post_init__(self): + """Validate instantiated class.""" + + if not isinstance(self.value, list): + raise TypeError( + f"Expected type 'list[float] | None' received type '{type(self.value)}'" + ) + elif len(self.value) < 1: + raise ValueError("embedding should have at least one dimension") + + +@dataclass +class Datum: + """ + A class used to store information about a datum for either a 'GroundTruth' or a 'Prediction'. + + Attributes + ---------- + uid : str + The UID of the datum. + metadata : dict[str, Any] + A dictionary of metadata that describes the datum. + + Examples + -------- + >>> Datum(uid="uid1") + >>> Datum(uid="uid1", metadata={}) + >>> Datum(uid="uid1", metadata={"foo": "bar", "pi": 3.14}) + """ + + uid: str | None = None + metadata: dict | None = None + + def __post_init__( + self, + ): + """Validate instantiated class.""" + + if not isinstance(self.uid, (str, type(None))): + raise TypeError( + f"Expected 'uid' to be of type 'str' or 'None', got {type(self.uid).__name__}" + ) + if not isinstance(self.metadata, (dict, type(None))): + raise TypeError( + f"Expected 'metadata' to be of type 'dict' or 'None', got {type(self.metadata).__name__}" + ) + + +@dataclass +class Label: + """ + An object for labeling datasets, models, and annotations. + + Attributes + ---------- + key : str + The label key. (e.g. 'class', 'category') + value : str + The label's value. (e.g. 'dog', 'cat') + score : float, optional + A score assigned to the label in the case of a prediction. + """ + + key: str + value: str + score: float | None = None + + def __post_init__(self): + """Validate instantiated class.""" + + if not isinstance(self.key, str): + raise TypeError( + f"Expected 'key' to be of type 'str', got {type(self.key).__name__}" + ) + + if not isinstance(self.value, str): + raise TypeError( + f"Expected 'value' to be of type 'str', got {type(self.value).__name__}" + ) + + if self.score is not None and not isinstance( + self.score, + ( + float, + int, + ), + ): + raise TypeError( + f"Expected 'score' to be of type 'float' or 'int' or 'None', got {type(self.score).__name__}" + ) + + # Ensure score is a float if provided as int + if isinstance(self.score, int): + self.score = float(self.score) + + def __eq__(self, other): + """ + Defines how labels are compared to one another. + + Parameters + ---------- + other : Label + The object to compare with the label. + + Returns + ---------- + bool + A boolean describing whether the two objects are equal. + """ + if ( + not hasattr(other, "key") + or not hasattr(other, "key") + or not hasattr(other, "score") + ): + return False + + # if the scores aren't the same type return False + if (other.score is None) != (self.score is None): + return False + + if self.score is None or other.score is None: + scores_equal = other.score is None and self.score is None + else: + scores_equal = math.isclose(self.score, other.score) + + return ( + scores_equal + and self.key == other.key + and self.value == other.value + ) + + def __hash__(self) -> int: + """ + Defines how a 'Label' is hashed. + + Returns + ---------- + int + The hashed 'Label'. + """ + return hash(f"key:{self.key},value:{self.value},score:{self.score}") + + +@dataclass +class Annotation: + """ + A class used to annotate `GroundTruths` and `Predictions`. + + Attributes + ---------- + metadata: dict[str, Any] + A dictionary of metadata that describes the `Annotation`. 
+ labels: list[Label], optional + A list of labels to use for the `Annotation`. + bounding_box: schemas.Box + A bounding box to assign to the `Annotation`. + polygon: BoundingPolygon + A polygon to assign to the `Annotation`. + raster: Raster + A raster to assign to the `Annotation`. + embedding: list[float] + An embedding, described by a list of values with type float and a maximum length of 16,000. + is_instance: bool, optional + A boolean describing whether we should treat the Raster attached to an annotation as an instance segmentation or not. If set to true, then the Annotation will be validated for use in object detection tasks. If set to false, then the Annotation will be validated for use in semantic segmentation tasks. + implied_task_types: list[str], optional + The validated task types that are applicable to each Annotation. Doesn't need to bet set by the user. + + Examples + -------- + + Classification + >>> Annotation.create( + ... labels=[ + ... Label(key="class", value="dog"), + ... Label(key="category", value="animal"), + ... ] + ... ) + + Object-Detection schemas.Box + >>> annotation = Annotation( + ... labels=[Label(key="k1", value="v1")], + ... bounding_box=box2, + ... ) + + Object-Detection schemas.Polygon + >>> annotation = Annotation( + ... labels=[Label(key="k1", value="v1")], + ... polygon=BoundingPolygon(...), + ... ) + + Raster + >>> annotation = Annotation( + ... labels=[Label(key="k1", value="v1")], + ... raster=Raster(...), + ... is_instance=True + ... ) + + Object-Detection with all supported Geometries defined. + >>> Annotation( + ... labels=[Label(key="k1", value="v1")], + ... bounding_box=schemas.Box(...), + ... polygon=BoundingPolygon(...), + ... raster=Raster(...), + ... is_instance=True, + ... ) + + Semantic-Segmentation Raster + >>> annotation = Annotation( + ... labels=[Label(key="k1", value="v1")], + ... raster=Raster(...), + ... is_instance=False # or None + ... 
) + """ + + labels: list[Label] + metadata: dict | None = None + bounding_box: Box | None = None + polygon: Polygon | Box | None = None + raster: Raster | None = None + embedding: Embedding | None = None + is_instance: bool | None = None + implied_task_types: list[str] | None = None + + def __post_init__(self): + """Validate instantiated class.""" + + if not isinstance(self.labels, list): + raise TypeError( + f"Expected 'labels' to be of type 'list', got {type(self.labels).__name__}" + ) + if not all(isinstance(label, Label) for label in self.labels): + raise TypeError("All items in 'labels' must be of type 'Label'") + + if not isinstance(self.metadata, (dict, type(None))): + raise TypeError( + f"Expected 'metadata' to be of type 'dict' or 'None', got {type(self.metadata).__name__}" + ) + + if not isinstance(self.bounding_box, (Box, type(None))): + raise TypeError( + f"Expected 'bounding_box' to be of type 'schemas.Box' or 'None', got {type(self.bounding_box).__name__}" + ) + + if not isinstance(self.polygon, (Polygon, Box, type(None))): + raise TypeError( + f"Expected 'polygon' to be of type 'schemas.Polygon' or 'None', got {type(self.polygon).__name__}" + ) + + if not isinstance(self.raster, (Raster, type(None))): + raise TypeError( + f"Expected 'raster' to be of type 'schemas.Raster' or 'None', got {type(self.raster).__name__}" + ) + + if not isinstance(self.embedding, (Embedding, type(None))): + raise TypeError( + f"Expected 'embedding' to be of type 'Embedding' or 'None', got {type(self.embedding).__name__}" + ) + + if not isinstance(self.is_instance, (bool, type(None))): + raise TypeError( + f"Expected 'is_instance' to be of type 'bool' or 'None', got {type(self.is_instance).__name__}" + ) + + if not isinstance(self.implied_task_types, (list, type(None))): + raise TypeError( + f"Expected 'implied_task_types' to be of type 'list' or 'None', got {type(self.implied_task_types).__name__}" + ) + if self.implied_task_types is not None and not all( + isinstance(task_type, str) for task_type in self.implied_task_types + ): + raise TypeError( + "All items in 'implied_task_types' must be of type 'str'" + ) + + +@dataclass +class EvaluationParameters: + """ + Defines optional parameters for evaluation methods. + + Attributes + ---------- + label_map: list[list[list[str]]], optional + Optional mapping of individual labels to a grouper label. Useful when you need to evaluate performance using labels that differ across datasets and models. + metrics: list[str], optional + The list of metrics to compute, store, and return to the user. + iou_thresholds_to_compute: list[float], optional + A list of floats describing which Intersection over Unions (IoUs) to use when calculating metrics (i.e., mAP). + iou_thresholds_to_return: list[float], optional + A list of floats describing which Intersection over Union (IoUs) thresholds to calculate a metric for. Must be a subset of `iou_thresholds_to_compute`. + recall_score_threshold: float, default=0 + The confidence score threshold for use when determining whether to count a prediction as a true positive or not while calculating Average Recall. + pr_curve_iou_threshold: float, optional + The IOU threshold to use when calculating precision-recall curves for object detection tasks. Defaults to 0.5. + pr_curve_max_examples: int + The maximum number of datum examples to store when calculating PR curves. 
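+
+    Examples
+    --------
+    >>> # Illustrative sketch: evaluate at two IoU thresholds while mapping two
+    >>> # dataset-specific labels onto a shared grouper label.
+    >>> params = EvaluationParameters(
+    ...     label_map={
+    ...         Label(key="class", value="dog"): Label(key="class", value="animal"),
+    ...         Label(key="class", value="cat"): Label(key="class", value="animal"),
+    ...     },
+    ...     iou_thresholds_to_compute=[0.5, 0.75],
+    ...     iou_thresholds_to_return=[0.5, 0.75],
+    ... )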
+ """ + + label_map: dict[Label, Label] | None = None + metrics_to_return: list[enums.MetricType] | None = None + iou_thresholds_to_compute: list[float] | None = None + iou_thresholds_to_return: list[float] | None = None + convert_annotations_to_type: enums.AnnotationType | None = None + recall_score_threshold: float = 0.0 + pr_curve_iou_threshold: float = 0.5 + pr_curve_max_examples: int = 1 + + def __post_init__(self): + """Validate instantiated class.""" + + if not isinstance(self.label_map, (dict, type(None))): + raise TypeError( + f"Expected 'label_map' to be of type 'dict' or 'None', got {type(self.label_map).__name__}" + ) + if self.label_map and not isinstance(self.label_map, dict): + raise TypeError("label_map should be a dictionary of Labels.") + + if self.label_map is not None and not all( + isinstance(k, Label) and isinstance(v, Label) + for k, v in self.label_map.items() + ): + raise TypeError( + "All keys and values in 'label_map' must be of type 'Label'" + ) + + if not isinstance(self.metrics_to_return, (list, type(None))): + raise TypeError( + f"Expected 'metrics_to_return' to be of type 'list' or 'None', got {type(self.metrics_to_return).__name__}" + ) + if self.metrics_to_return is not None and not all( + isinstance(metric, enums.MetricType) + for metric in self.metrics_to_return + ): + raise TypeError( + "All items in 'metrics_to_return' must be of type 'enums.MetricType'" + ) + + if not isinstance(self.iou_thresholds_to_compute, (list, type(None))): + raise TypeError( + f"Expected 'iou_thresholds_to_compute' to be of type 'list' or 'None', got {type(self.iou_thresholds_to_compute).__name__}" + ) + if self.iou_thresholds_to_compute is not None and not all( + isinstance(threshold, float) + for threshold in self.iou_thresholds_to_compute + ): + raise TypeError( + "All items in 'iou_thresholds_to_compute' must be of type 'float'" + ) + + if not isinstance(self.iou_thresholds_to_return, (list, type(None))): + raise TypeError( + f"Expected 'iou_thresholds_to_return' to be of type 'list' or 'None', got {type(self.iou_thresholds_to_return).__name__}" + ) + if self.iou_thresholds_to_return is not None and not all( + isinstance(threshold, float) + for threshold in self.iou_thresholds_to_return + ): + raise TypeError( + "All items in 'iou_thresholds_to_return' must be of type 'float'" + ) + + if not isinstance(self.recall_score_threshold, float): + raise TypeError( + f"Expected 'recall_score_threshold' to be of type 'float', got {type(self.recall_score_threshold).__name__}" + ) + + if not isinstance(self.pr_curve_iou_threshold, float): + raise TypeError( + f"Expected 'pr_curve_iou_threshold' to be of type 'float', got {type(self.pr_curve_iou_threshold).__name__}" + ) + + if not isinstance(self.pr_curve_max_examples, int): + raise TypeError( + f"Expected 'pr_curve_max_examples' to be of type 'int', got {type(self.pr_curve_max_examples).__name__}" + ) + + +@dataclass +class Evaluation: + parameters: EvaluationParameters + metrics: list[dict] + confusion_matrices: list[dict] | None + ignored_pred_labels: list[tuple[str, str]] | None + missing_pred_labels: list[tuple[str, str]] | None + meta: dict | None = None + + def __str__(self) -> str: + """Dumps the object into a JSON formatted string.""" + return json.dumps(self.__dict__, indent=4) + + def __post_init__(self): + """Validate instantiated class.""" + + if not isinstance(self.parameters, EvaluationParameters): + raise TypeError( + f"Expected 'parameters' to be of type 'EvaluationParameters', got {type(self.parameters).__name__}" + 
) + + if not isinstance(self.metrics, list): + raise TypeError( + f"Expected 'metrics' to be of type 'list', got {type(self.metrics).__name__}" + ) + if not all(isinstance(metric, dict) for metric in self.metrics): + raise TypeError("All items in 'metrics' must be of type 'dict'") + + if not isinstance(self.confusion_matrices, (list, type(None))): + raise TypeError( + f"Expected 'confusion_matrices' to be of type 'list' or 'None', got {type(self.confusion_matrices).__name__}" + ) + if self.confusion_matrices is not None and not all( + isinstance(cm, dict) for cm in self.confusion_matrices + ): + raise TypeError( + "All items in 'confusion_matrices' must be of type 'dict'" + ) + + if not isinstance(self.meta, (dict, type(None))): + raise TypeError( + f"Expected 'meta' to be of type 'dict' or 'None', got {type(self.meta).__name__}" + ) + + def to_dict(self) -> dict: + """ + Defines how a `valor.Evaluation` object is serialized into a dictionary. + + Returns + ---------- + dict + A dictionary describing an evaluation. + """ + return { + "parameters": self.parameters.__dict__, + "metrics": self.metrics, + "confusion_matrices": self.confusion_matrices, + "ignored_pred_labels": self.ignored_pred_labels, + "missing_pred_labels": self.missing_pred_labels, + "meta": self.meta, + } + + +@dataclass +class GroundTruth: + """ + An object describing a ground truth (e.g., a human-drawn bounding box on an image). + + Attributes + ---------- + datum : Datum + The datum associated with the groundtruth. + annotations : list[Annotation] + The list of annotations associated with the groundtruth. + + Examples + -------- + >>> GroundTruth( + ... datum=Datum(uid="uid1"), + ... annotations=[ + ... Annotation( + ... labels=[Label(key="k1", value="v1")], + ... ) + ... ] + ... ) + """ + + datum: Datum + annotations: list[Annotation] + + def __post_init__( + self, + ): + """Validate instantiated class.""" + + if not isinstance(self.datum, Datum): + raise TypeError( + f"Expected 'datum' to be of type 'Datum', got {type(self.datum).__name__}" + ) + + if not isinstance(self.annotations, list): + raise TypeError( + f"Expected 'annotations' to be of type 'list', got {type(self.annotations).__name__}" + ) + if not all( + isinstance(annotation, Annotation) + for annotation in self.annotations + ): + raise TypeError( + "All items in 'annotations' must be of type 'Annotation'" + ) + + +@dataclass +class Prediction: + """ + An object describing a prediction (e.g., a machine-drawn bounding box on an image). + + Attributes + ---------- + datum : Datum + The datum associated with the prediction. + annotations : list[Annotation] + The list of annotations associated with the prediction. + + Examples + -------- + >>> Prediction( + ... datum=Datum(uid="uid1"), + ... annotations=[ + ... Annotation( + ... labels=[ + ... Label(key="k1", value="v1", score=0.9), + ... Label(key="k1", value="v1", score=0.1) + ... ], + ... ) + ... ] + ... 
) + """ + + datum: Datum + annotations: list[Annotation] + + def __post_init__(self): + """Validate instantiated class.""" + + if not isinstance(self.datum, Datum): + raise TypeError( + f"Expected 'datum' to be of type 'Datum', got {type(self.datum).__name__}" + ) + + if not isinstance(self.annotations, list): + raise TypeError( + f"Expected 'annotations' to be of type 'list', got {type(self.annotations).__name__}" + ) + if not all( + isinstance(annotation, Annotation) + for annotation in self.annotations + ): + raise TypeError( + "All items in 'annotations' must be of type 'Annotation'" + ) diff --git a/core/valor_core/utilities.py b/core/valor_core/utilities.py new file mode 100644 index 000000000..2cfbe7944 --- /dev/null +++ b/core/valor_core/utilities.py @@ -0,0 +1,1052 @@ +import numpy as np +import pandas as pd +from valor_core import enums, schemas + + +def replace_labels_using_label_map( + groundtruth_df: pd.DataFrame, + prediction_df: pd.DataFrame, + label_map: dict[schemas.Label, schemas.Label] | None, +): + """ + Replace label keys, values, and IDs in the groundtruth and prediction DataFrames using a given label map. + + This function updates the `label_key`, `label_value`, and `label_id` columns in both the groundtruth and prediction + DataFrames based on the provided label map. If the `label_map` is not provided, the function returns the original DataFrames + without modification. + + Parameters + ---------- + groundtruth_df : pd.DataFrame + DataFrame containing groundtruth data with columns `label_key`, `label_value`, and `label_id`. + prediction_df : pd.DataFrame + DataFrame containing prediction data with columns `label_key`, `label_value`, and `label_id`. + label_map : dict[schemas.Label, schemas.Label], optional + Dictionary mapping tuples of (label_key, label_value) to (grouper_key, grouper_value). Used to replace the labels in the DataFrames. + + Returns + ------- + Tuple[pd.DataFrame, pd.DataFrame] + Updated groundtruth and prediction DataFrames with replaced labels and IDs based on the provided label map. 
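+
+    Examples
+    --------
+    >>> # Illustrative sketch: map the label ('class', 'dog') onto the grouper
+    >>> # label ('class', 'animal') in both dataframes.
+    >>> gts = pd.DataFrame(
+    ...     {"label_key": ["class"], "label_value": ["dog"], "label_id": [1]}
+    ... )
+    >>> preds = pd.DataFrame(
+    ...     {"label_key": ["class"], "label_value": ["dog"], "label_id": [1]}
+    ... )
+    >>> label_map = {
+    ...     schemas.Label(key="class", value="dog"): schemas.Label(
+    ...         key="class", value="animal"
+    ...     )
+    ... }
+    >>> gts, preds = replace_labels_using_label_map(gts, preds, label_map)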
+ """ + if not label_map: + return (groundtruth_df, prediction_df) + + # create a mapping dictionary to map each label to its grouper label + mapping_dict = dict() + unique_grouper_labels = set() + if label_map: + for label, grouper in label_map.items(): + mapping_dict[(label.key, label.value)] = ( + grouper.key, + grouper.value, + ) + unique_grouper_labels.add( + ( + grouper.key, + grouper.value, + ) + ) + + # get a dictionary mapping all current labels to their ids + label_id_lookup_df = pd.concat( + [ + groundtruth_df[["label_key", "label_value", "label_id"]], + prediction_df[["label_key", "label_value", "label_id"]], + ] + ) + label_id_lookup_df = label_id_lookup_df[~label_id_lookup_df.duplicated()] + + label_to_label_id_dict = dict( + zip( + zip( + label_id_lookup_df["label_key"], + label_id_lookup_df["label_value"], + ), + label_id_lookup_df["label_id"], + ) + ) + + # create unique ids for any new labels that will be created by the label_map + new_labels = unique_grouper_labels - set(label_to_label_id_dict.keys()) + for label_key, label_value in new_labels: + label_id = hash(label_key + label_value) + label_to_label_id_dict[(label_key, label_value)] = label_id + + # replace the labels both dataframes with the correct values + for df in (groundtruth_df, prediction_df): + df.loc[:, ["label_key", "label_value"]] = ( + df.apply( + lambda row: mapping_dict.get( + (row["label_key"], row["label_value"]), + (row["label_key"], row["label_value"]), + ), + axis=1, + ) + .apply(pd.Series) + .values + ) + + df.loc[:, ["label_id"]] = df.apply( + lambda row: label_to_label_id_dict.get( + (row["label_key"], row["label_value"]), + row["label_id"], + ), + axis=1, + ).values + + return groundtruth_df, prediction_df + + +def validate_label_map( + label_map: dict[schemas.Label, schemas.Label] | None, +) -> None: + """ + Validate the label mapping if necessary. + + This function checks if the provided label_map is a dictionary with both + keys and values being instances of schemas.Label. If the label_map is + invalid, a TypeError is raised. + + Parameters + ---------- + label_map : dict[schemas.Label, schemas.Label], optional + A dictionary mapping labels to other labels, or None if no mapping + is provided. + + Raises + ------ + TypeError + If label_map is not a dictionary or if its keys and values are not + instances of schemas.Label. + """ + if label_map and ( + not isinstance(label_map, dict) + or not all( + [ + isinstance(key, schemas.Label) + and isinstance(value, schemas.Label) + for key, value in label_map.items() + ] + ) + ): + raise TypeError( + "label_map should be a dictionary with valid Labels for both the key and value." + ) + + +def validate_metrics_to_return( + task_type: enums.TaskType, metrics_to_return: list[enums.MetricType] +) -> None: + """ + Validate that the provided metrics are appropriate for the specified task type. + + This function checks if the provided metrics_to_return are valid for the given + task_type. It raises a ValueError if any of the metrics are not supported for + the specified task type. + + Parameters + ---------- + task_type : enums.TaskType + The type of task for which the metrics are being validated. This can be + either `enums.TaskType.CLASSIFICATION` or `enums.TaskType.OBJECT_DETECTION`. + metrics_to_return : List[enums.MetricType] + A list of metrics that need to be validated against the task type. + + Raises + ------ + ValueError + If any of the provided metrics are not supported for the specified task type. 
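+
+    Examples
+    --------
+    >>> # Illustrative sketch (assumes Accuracy and F1 are members of the
+    >>> # classification metric set); passes silently when the metrics are valid.
+    >>> validate_metrics_to_return(
+    ...     task_type=enums.TaskType.CLASSIFICATION,
+    ...     metrics_to_return=[enums.MetricType.Accuracy, enums.MetricType.F1],
+    ... )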
+ """ + + if task_type == enums.TaskType.CLASSIFICATION: + if not set(metrics_to_return).issubset( + enums.MetricType.classification() + ): + raise ValueError( + f"The following metrics are not supported for classification: '{set(metrics_to_return) - enums.MetricType.classification()}'" + ) + + if task_type == enums.TaskType.OBJECT_DETECTION: + if not set(metrics_to_return).issubset( + enums.MetricType.object_detection() + ): + raise ValueError( + f"The following metrics are not supported for object detection: '{set(metrics_to_return) - enums.MetricType.object_detection()}'" + ) + + +def validate_parameters( + recall_score_threshold: float | None = None, + pr_curve_iou_threshold: float | None = None, + pr_curve_max_examples: int | None = None, +) -> None: + """ + Validate parameters for scoring and PR curves. + + Parameters + ---------- + recall_score_threshold : float, optional + The threshold for recall score. + pr_curve_iou_threshold : float, optional + The IOU threshold for PR curve. + pr_curve_max_examples : int, optional + The maximum number of examples for PR curve. + + Raises + ------ + ValueError + If any of the parameters are out of their valid ranges. + """ + + if recall_score_threshold and ( + recall_score_threshold > 1 or recall_score_threshold < 0 + ): + raise ValueError( + "recall_score_threshold should exist in the range 0 <= threshold <= 1." + ) + + if pr_curve_iou_threshold and ( + pr_curve_iou_threshold <= 0 or pr_curve_iou_threshold > 1.0 + ): + raise ValueError( + "IOU thresholds should exist in the range 0 < threshold <= 1." + ) + + if pr_curve_max_examples and (pr_curve_max_examples < 0): + raise ValueError( + "pr_curve_max_examples should be an integer greater than or equal to zero." + ) + + +def validate_matching_label_keys( + groundtruths: pd.DataFrame, + predictions: pd.DataFrame, + label_map: dict[schemas.Label, schemas.Label] | None, +) -> None: + """ + Validates that every datum has the same set of label keys for both ground truths and predictions. This check is only needed for classification tasks. + + Parameters + ---------- + groundtruths : pd.DataFrame + The DataFrame containing ground truth data. + predictions : pd.DataFrame + The DataFrame containing prediction data. + label_map : dict[schemas.Label, schemas.Label], optional + Optional mapping of individual labels to a grouper label. + + Raises + ------ + ValueError + If the distinct ground truth label keys don't match the distinct prediction label keys for any datum. 
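+
+    Examples
+    --------
+    >>> # Illustrative sketch: a single datum whose ground truth and prediction
+    >>> # share the label key 'class', so no error is raised.
+    >>> gts = pd.DataFrame(
+    ...     {"datum_id": [0], "label_key": ["class"], "label_value": ["dog"]}
+    ... )
+    >>> preds = pd.DataFrame(
+    ...     {"datum_id": [0], "label_key": ["class"], "label_value": ["cat"]}
+    ... )
+    >>> validate_matching_label_keys(gts, preds, label_map=None)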
+ """ + # allow for case where our predictions don't have any labels + if len(predictions) == 0: + return + + if not label_map: + label_map = dict() + + # get the label keys per datum + groundtruths["mapped_groundtruth_label_keys"] = groundtruths.apply( + lambda row: label_map.get( + schemas.Label(key=row["label_key"], value=row["label_value"]), + schemas.Label(key=row["label_key"], value=row["label_value"]), + ).key, + axis=1, + ) + + predictions["mapped_prediction_label_keys"] = predictions.apply( + lambda row: label_map.get( + schemas.Label(key=row["label_key"], value=row["label_value"]), + schemas.Label(key=row["label_key"], value=row["label_value"]), + ).key, + axis=1, + ) + + gt_label_keys_per_datum = groundtruths.groupby( + ["datum_id"], as_index=False + )["mapped_groundtruth_label_keys"].unique() + + pd_label_keys_per_datum = predictions.groupby( + ["datum_id"], as_index=False + )["mapped_prediction_label_keys"].unique() + + joined = gt_label_keys_per_datum.merge( + pd_label_keys_per_datum, + on=["datum_id"], + ) + + if not joined["mapped_groundtruth_label_keys"].equals( + joined["mapped_prediction_label_keys"] + ): + raise ValueError( + "Ground truth label keys must match prediction label keys for classification tasks." + ) + + # delete interediary columns + del groundtruths["mapped_groundtruth_label_keys"] + del predictions["mapped_prediction_label_keys"] + + +def _validate_groundtruth_dataframe( + df: pd.DataFrame, task_type: enums.TaskType +) -> None: + """Validate the details of a ground truth dataframe.""" + null_placeholder_column = pd.Series([None] * len(df)) + + required_columns = [ + "datum_uid", + "datum_id", + "id", + "label_key", + "label_value", + "annotation_id", + "label_id", + ] + + if not all(col in df.columns for col in required_columns): + raise ValueError( + f"DataFrame must contain columns: {', '.join(required_columns)}" + ) + + if not df["id"].is_unique: + raise ValueError("The column 'id' contains duplicate values.") + + if df.get("score", null_placeholder_column).notnull().any(): + raise ValueError("GroundTruth labels should not have scores.") + + if task_type == enums.TaskType.SEMANTIC_SEGMENTATION: + if not (df.groupby("label")["annotation_id"].nunique() == 1).all(): + raise ValueError( + "For semantic segmentation tasks, each label can only be associated with a single annotation id." + ) + + +def _validate_prediction_dataframe( + df: pd.DataFrame, task_type: enums.TaskType +) -> None: + """Validate the details of a prediction dataframe.""" + + required_columns = [ + "datum_uid", + "datum_id", + "id", + "label_key", + "label_value", + "annotation_id", + "label_id", + "score", + ] + + if not all(col in df.columns for col in required_columns): + raise ValueError( + f"DataFrame must contain columns: {', '.join(required_columns)}" + ) + + if not df["id"].is_unique: + raise ValueError("The column 'id' contains duplicate values.") + + if task_type == enums.TaskType.CLASSIFICATION: + if df["score"].isnull().any(): + raise ValueError( + "All classification predictions must have an associated score." + ) + + if not ( + abs(df.groupby(["datum_id", "label_key"])["score"].sum() - 1.0) + <= 1e-6 + ).all(): + raise ValueError( + "All classification scores must sum to one for each label key." + ) + if task_type == enums.TaskType.OBJECT_DETECTION: + if df["score"].isnull().any(): + raise ValueError( + "All object detection predictions must have an associated score." 
+ ) + if task_type == enums.TaskType.SEMANTIC_SEGMENTATION: + if df["score"].notnull().any(): + raise ValueError( + "All classification predictions must have an associated score." + ) + + if not (df.groupby("label")["annotation_id"].nunique() == 1).all(): + raise ValueError( + "For semantic segmentation tasks, each label can only be associated with a single annotation id." + ) + + +def create_validated_groundtruth_df( + obj: pd.DataFrame | list[schemas.GroundTruth], + task_type: enums.TaskType, +) -> pd.DataFrame: + """ + Create a validated DataFrame of groundtruth data. + + Parameters + ---------- + obj : pd.DataFrame | list[schemas.GroundTruth] + The groundtruth data to be processed. This can be either a pandas DataFrame + or a list of GroundTruth objects. + task_type : enums.TaskType + The task type for which the prediction data is being validated. + + Returns + ------- + pd.DataFrame + A DataFrame containing the validated prediction data. + + Raises + ------ + ValueError + If the input object is neither a DataFrame nor a list of GroundTruth objects. + """ + if not ( + isinstance(obj, pd.DataFrame) + or ( + obj + and isinstance(obj, list) + and isinstance(obj[0], schemas.GroundTruth) + ) + ): + raise ValueError( + f"Could not validate object as it's neither a dataframe nor a list of Valor objects. Object is of type {type(obj)}." + ) + if isinstance(obj, pd.DataFrame): + df = obj + else: + df = _convert_groundtruth_or_prediction_to_dataframe(obj) + + _validate_groundtruth_dataframe(df=df, task_type=task_type) + + return df + + +def create_validated_prediction_df( + obj: pd.DataFrame | list[schemas.Prediction], + task_type: enums.TaskType, +) -> pd.DataFrame: + """ + Create a validated DataFrame of prediction data. + + Parameters + ---------- + obj : pd.DataFrame | list[schemas.Prediction] + The prediction data to be processed. This can be either a pandas DataFrame + or a list of Prediction objects. + task_type : enums.TaskType + The task type for which the prediction data is being validated. + + Returns + ------- + pd.DataFrame + A DataFrame containing the validated prediction data. + + Raises + ------ + ValueError + If the input object is neither a DataFrame nor a list of Prediction objects. + """ + if not ( + isinstance(obj, pd.DataFrame) + or ( + obj + and isinstance(obj, list) + and isinstance(obj[0], schemas.Prediction) + ) + ): + raise ValueError( + f"Could not validate object as it's neither a dataframe nor a list of Valor objects. Object is of type {type(obj)}." + ) + if isinstance(obj, pd.DataFrame): + df = obj + else: + df = _convert_groundtruth_or_prediction_to_dataframe(obj) + + if df.empty: + return df + + _validate_prediction_dataframe(df=df, task_type=task_type) + + return df + + +def filter_dataframe_by_task_type(df: pd.DataFrame, task_type: enums.TaskType): + """ + Filter a DataFrame by task type. + + This function identifies the task type implied by the data and filters the DataFrame to include only rows + that match the specified task type. + + Parameters + ---------- + df : pd.DataFrame + The DataFrame containing the data to be filtered. + + task_type : enums.TaskType + The task type to filter the DataFrame by (e.g., classification, detection). + + Returns + ------- + pd.DataFrame + A DataFrame filtered to contain only rows that match the specified task type. 
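+
+    Examples
+    --------
+    >>> # Illustrative sketch: rows that only carry labels are implied to be
+    >>> # classification rows, so both rows survive the filter.
+    >>> df = pd.DataFrame(
+    ...     {"label_key": ["class", "class"], "label_value": ["dog", "cat"]}
+    ... )
+    >>> filtered = filter_dataframe_by_task_type(df, enums.TaskType.CLASSIFICATION)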
+ """ + + df = _identify_implied_task_types(df=df) + + filtered_df = df[df["implied_task_type"] == task_type] + + return filtered_df + + +def _convert_groundtruth_or_prediction_to_dataframe( + list_of_objects: list[schemas.GroundTruth] | list[schemas.Prediction], +) -> pd.DataFrame: + """ + Convert a list of GroundTruth or Prediction objects to a DataFrame. + + Parameters + ---------- + list_of_objects : list[schemas.GroundTruth] | list[schemas.Prediction] + List of GroundTruth or Prediction objects. + + Returns + ------- + pd.DataFrame + DataFrame representation of the input list. + """ + + output = [] + + for i, obj in enumerate(list_of_objects): + datum_uid = obj.datum.uid + datum_id = hash(obj.datum.uid) + datum_metadata = obj.datum.metadata + + for j, ann in enumerate(obj.annotations): + ann_id = hash(str(datum_uid) + str(ann)) + ann_metadata = ann.metadata + ann_bbox = ann.bounding_box + ann_raster = ann.raster + ann_embeding = ann.embedding + ann_polygon = ann.polygon + ann_is_instance = ann.is_instance + + for k, label in enumerate(ann.labels): + id_ = ( + str(ann_id) + str(i) + str(j) + str(k) + ) # we use indices here, rather than a hash() so that the IDs are sequential. this prevents randomness when two predictions share the same score + label_key = label.key + label_value = label.value + label_score = label.score + label_id = hash(label_key + label_value) + + # only include scores for predictions + if isinstance(obj, schemas.Prediction): + output.append( + { + "datum_uid": datum_uid, + "datum_id": datum_id, + "datum_metadata": datum_metadata, + "annotation_id": ann_id, + "annotation_metadata": ann_metadata, + "bounding_box": ann_bbox, + "raster": ann_raster, + "embedding": ann_embeding, + "polygon": ann_polygon, + "is_instance": ann_is_instance, + "label_key": label_key, + "label_value": label_value, + "score": label_score, + "label_id": label_id, + "id": id_, + } + ) + else: + output.append( + { + "datum_uid": datum_uid, + "datum_id": datum_id, + "datum_metadata": datum_metadata, + "annotation_id": ann_id, + "annotation_metadata": ann_metadata, + "bounding_box": ann_bbox, + "raster": ann_raster, + "embedding": ann_embeding, + "polygon": ann_polygon, + "is_instance": ann_is_instance, + "label_key": label_key, + "label_value": label_value, + "label_id": label_id, + "id": id_, + } + ) + + return ( + pd.DataFrame(output) + if output + else pd.DataFrame( + [], + columns=[ + "datum_uid", + "datum_id", + "datum_metadata", + "annotation_id", + "annotation_metadata", + "bounding_box", + "raster", + "embedding", + "polygon", + "is_instance", + "label_key", + "label_value", + "score", + "label_id", + "id", + ], + ) + ) + + +def get_disjoint_labels( + groundtruth_df: pd.DataFrame, + prediction_df: pd.DataFrame, + label_map: dict[schemas.Label, schemas.Label] | None, +) -> tuple[list[tuple[str, str]], list[tuple[str, str]]]: + """ + Returns all unique labels that are not shared between two dataframes. + + Parameters + ---------- + groundtruth_df : pd.DataFrame + The dataframe representing ground truth objects. + prediction_df : pd.DataFrame + The dataframe representing prediction objects. + label_map : dict[schemas.Label, schemas.Label], optional + Dictionary mapping tuples of (label_key, label_value) to (grouper_key, grouper_value). Used to replace the labels in the DataFrames. + + Returns + ---------- + tuple[list[tuple[str, str]], list[tuple[str, str]]] + A tuple of disjoint labels, where the first element is those labels which are present in lhs label set but absent in rhs label set. 
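+
+    Examples
+    --------
+    >>> # Illustrative sketch: 'dog' appears only in the ground truths and 'cat'
+    >>> # appears only in the predictions, so both are reported as disjoint.
+    >>> gts = pd.DataFrame({"label_key": ["class"], "label_value": ["dog"]})
+    >>> preds = pd.DataFrame({"label_key": ["class"], "label_value": ["cat"]})
+    >>> get_disjoint_labels(gts, preds, label_map=None)
+    ([('class', 'dog')], [('class', 'cat')])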
+ """ + if not label_map: + label_map = {} + + groundtruth_labels = set( + groundtruth_df.apply( + lambda row: (row["label_key"], row["label_value"]), + axis=1, + ).values # type: ignore - pandas typing errors + ) + + prediction_labels = set( + prediction_df.apply( + lambda row: (row["label_key"], row["label_value"]), + axis=1, + ).values # type: ignore - pandas typing errors + ) + + # don't count user-mapped labels as disjoint + mapped_labels = set() + if label_map: + for map_from, map_to in label_map.items(): + mapped_labels.add((map_from.key, map_from.value)) + mapped_labels.add((map_to.key, map_to.value)) + + groundtruth_unique = list( + groundtruth_labels - prediction_labels - mapped_labels + ) + prediction_unique = list( + prediction_labels - groundtruth_labels - mapped_labels + ) + + return (groundtruth_unique, prediction_unique) + + +def _identify_implied_task_types( + df: pd.DataFrame, +) -> pd.DataFrame: + """Match an annotation to an implied task type.""" + + # null series for use if the column doesn't exist + null_placeholder_column = pd.Series([None] * len(df)) + + # classification rows only have labels + classification_rows = df[ + df.get("label_key", null_placeholder_column).notnull() + & df.get("label_value", null_placeholder_column).notnull() + & df.get("bounding_box", null_placeholder_column).isnull() + & df.get("polygon", null_placeholder_column).isnull() + & df.get("raster", null_placeholder_column).isnull() + & df.get("embedding", null_placeholder_column).isnull() + ].index + + # object detection tasks have is_instance=True & one of (bounding_box, polygon, raster) + object_detection_rows = df[ + df.get("label_key", null_placeholder_column).notnull() + & df.get("label_value", null_placeholder_column).notnull() + & ( + df[ + [ + col + for col in ["bounding_box", "polygon", "raster"] + if col in df.columns + ] + ] + .notna() + .sum(axis=1) + == 1 + ) + & df.get("is_instance", null_placeholder_column).isin([True]) + & df.get("embedding", null_placeholder_column).isnull() + ].index + + # semantic segmentation tasks only support rasters + semantic_segmentation_rows = df[ + df.get("label_key", null_placeholder_column).notnull() + & df.get("label_value", null_placeholder_column).notnull() + & df.get("bounding_box", null_placeholder_column).isnull() + & df.get("polygon", null_placeholder_column).isnull() + & df.get("raster", null_placeholder_column).notnull() + & df.get("embedding", null_placeholder_column).isnull() + & df.get("is_instance", null_placeholder_column).isin([None, False]) + ].index + + # empty annotations shouldn't contain anything + empty_rows = df[ + df.get("label_key", null_placeholder_column).isnull() + & df.get("label_value", null_placeholder_column).isnull() + & df.get("bounding_box", null_placeholder_column).isnull() + & df.get("polygon", null_placeholder_column).isnull() + & df.get("raster", null_placeholder_column).isnull() + & df.get("embedding", null_placeholder_column).isnull() + ].index + + if not classification_rows.empty: + df.loc[ + classification_rows, "implied_task_type" + ] = enums.TaskType.CLASSIFICATION + + if not object_detection_rows.empty: + df.loc[ + object_detection_rows, "implied_task_type" + ] = enums.TaskType.OBJECT_DETECTION + + if not semantic_segmentation_rows.empty: + df.loc[ + semantic_segmentation_rows, "implied_task_type" + ] = enums.TaskType.SEMANTIC_SEGMENTATION + + if not empty_rows.empty: + df.loc[empty_rows, "implied_task_type"] = enums.TaskType.EMPTY + + if df["implied_task_type"].isnull().any(): + raise ValueError( + 
"Input didn't match any known patterns. Classification tasks should only contain labels. Object detection tasks should contain labels and polygons, bounding boxes, or rasters with is_instance == True. Segmentation tasks should contain labels and rasters with is_instance != True. Text generation tasks should only contain text and optionally context." + ) + + return df + + +def _convert_raster_to_box(raster: np.ndarray) -> schemas.Box: + """Convert a raster mask to a Box.""" + rows = np.any(raster, axis=1) + cols = np.any(raster, axis=0) + if not np.any(rows) or not np.any(cols): + raise ValueError("Raster is empty, cannot create bounding box.") + + ymin, ymax = np.where(rows)[0][[0, -1]] + xmin, xmax = np.where(cols)[0][[0, -1]] + + return schemas.Box.from_extrema(xmin, xmax + 1, ymin, ymax + 1) + + +def _convert_raster_to_polygon(raster: np.ndarray) -> schemas.Polygon: + """Convert a raster mask to a Polygon.""" + if raster.ndim != 2: + raise ValueError("Raster must be a 2D array.") + + mask = (raster > 0).astype(np.uint8) + rows, cols = np.where(mask > 0) + + if len(rows) == 0 or len(cols) == 0: + raise ValueError("Raster is empty, cannot create a polygon.") + + contours = [] + for r, c in zip(rows, cols): + if ( + (r > 0 and mask[r - 1, c] == 0) + or (r < mask.shape[0] - 1 and mask[r + 1, c] == 0) + or (c > 0 and mask[r, c - 1] == 0) + or (c < mask.shape[1] - 1 and mask[r, c + 1] == 0) + ): + contours.append((c, r)) + + if not contours: + raise ValueError("No contours found in raster.") + + contours = sorted(contours, key=lambda p: (p[1], p[0])) + + polygon = [[(x, y) for x, y in contours] + [contours[0]]] + + return schemas.Polygon.from_dict( + {"type": "Polygon", "coordinates": polygon} + ) + + +def _convert_polygon_to_box(polygon: schemas.Polygon) -> schemas.Box: + """Convert a Polygon to a Box.""" + + boundary = polygon.boundary + + xmin = min(point[0] for point in boundary) + xmax = max(point[0] for point in boundary) + ymin = min(point[1] for point in boundary) + ymax = max(point[1] for point in boundary) + + return schemas.Box.from_extrema(xmin, xmax, ymin, ymax) + + +def _identify_most_detailed_annotation_type( + df: pd.DataFrame, +) -> enums.AnnotationType: + """ + Identify the most detailed annotation type present in the DataFrame. + + Parameters + ---------- + df : pd.DataFrame + DataFrame containing the annotations. + + Returns + ------- + enums.AnnotationType + The most detailed annotation type present in the DataFrame. + """ + + if df["raster"].notnull().any(): + return enums.AnnotationType.RASTER + + elif df["polygon"].notnull().any(): + return enums.AnnotationType.POLYGON + + elif df["bounding_box"].notnull().any(): + return enums.AnnotationType.BOX + + else: + return enums.AnnotationType.NONE + + +def _identify_least_detailed_annotation_type( + df: pd.DataFrame, +) -> enums.AnnotationType: + """ + Identify the least detailed annotation type present in the DataFrame. + + Parameters + ---------- + df : pd.DataFrame + DataFrame containing the annotations. + + Returns + ------- + enums.AnnotationType + The least detailed annotation type present in the DataFrame. 
+ """ + + if df["bounding_box"].notnull().any(): + return enums.AnnotationType.BOX + + elif df["polygon"].notnull().any(): + return enums.AnnotationType.POLYGON + + elif df["raster"].notnull().any(): + return enums.AnnotationType.RASTER + + else: + return enums.AnnotationType.NONE + + +def _add_converted_geometry_column( + df: pd.DataFrame, + target_type: enums.AnnotationType, +) -> pd.DataFrame: + """ + Add a column with converted geometries to the DataFrame. + + The function checks that each annotation contains only one type of geometry + (bounding_box, polygon, or raster) and then converts these geometries to the + specified target type. The resulting geometries are stored in a new column + called 'converted_geometry'. + + Parameters + ---------- + df : pd.DataFrame + DataFrame containing the annotations with geometry columns. + target_type : enums.AnnotationType + The target annotation type to convert the geometries to. + + Returns + ------- + pd.DataFrame + DataFrame with an added column 'converted_geometry' containing the converted geometries. + + Raises + ------ + ValueError + If an annotation contains more than one type of geometry. + """ + if not ( + df[["bounding_box", "polygon", "raster"]].notna().sum(axis=1) == 1 + ).all(): + raise ValueError( + "Each Annotation must contain either a bounding_box, polygon, raster, or an embedding. One Annotation cannot have multiple of these attributes (for example, one Annotation can't contain both a raster and a bounding box)." + ) + + # converted_geometry will be an array representing the original geometry + df["converted_geometry"] = ( + df[["raster", "bounding_box", "polygon"]].bfill(axis=1).iloc[:, 0] + ) + + if target_type == enums.AnnotationType.RASTER: + df["converted_geometry"] = df["converted_geometry"].map( + lambda x: ( + x.to_array() + if isinstance(x, schemas.Raster) + else None # pyright: ignore - pandas .to_dict() typing error + ) + ) + elif target_type == enums.AnnotationType.POLYGON: + df["converted_geometry"] = df["converted_geometry"].map( + lambda x: ( + _convert_raster_to_polygon( + x.to_array() # pyright: ignore - pandas .to_dict() typing error + ).to_array() + if isinstance(x, schemas.Raster) + else x.to_array() + if isinstance(x, schemas.Polygon) + else None + ) + ) + + elif target_type == enums.AnnotationType.BOX: + df["converted_geometry"] = df["converted_geometry"].map( + lambda x: ( + _convert_raster_to_box( + x.to_array() # pyright: ignore - pandas .to_dict() typing error + ).to_array() + if isinstance(x, schemas.Raster) + else ( + _convert_polygon_to_box(x).to_array() + if isinstance(x, schemas.Polygon) + else x.to_array() + if isinstance(x, schemas.Box) + else None + ) + ) + ) + + return df + + +def convert_annotations_to_common_type( + groundtruth_df: pd.DataFrame, + prediction_df: pd.DataFrame, + target_type: enums.AnnotationType | None = None, +) -> tuple[pd.DataFrame, pd.DataFrame]: + """ + Convert all annotations to a common type. + + This function converts the geometries in the provided groundtruth and prediction + DataFrames to a common target type. If no target type is specified, it determines + the most detailed annotation type present in the data and uses that as the target type. + + Parameters + ---------- + groundtruth_df : pd.DataFrame + DataFrame containing the groundtruth annotations. + prediction_df : pd.DataFrame + DataFrame containing the prediction annotations. + target_type : enums.AnnotationType, optional + The target annotation type to convert the geometries to. 
If None, the most + detailed type present in the data is used. + + Returns + ------- + tuple[pd.DataFrame, pd.DataFrame, enums.AnnotationType] + A tuple containing the converted groundtruth DataFrame, the converted prediction + DataFrame, and the target annotation type used for conversion. + + Raises + ------ + ValueError + If the target annotation type is not supported. + """ + least_detailed_groundtruth_type = _identify_least_detailed_annotation_type( + df=groundtruth_df, + ) + + least_detailed_prediction_type = _identify_least_detailed_annotation_type( + df=prediction_df, + ) + + # throw an error if the user tries to convert from a lower detailed type to a higher detailed type + if target_type and ( + (target_type > least_detailed_groundtruth_type) + or (target_type > least_detailed_prediction_type) + ): + raise ValueError( + f"Cannot convert from a lower-dimensional type {min([least_detailed_groundtruth_type, least_detailed_prediction_type])} to a higher-dimensional type {target_type}" + ) + + if target_type is None: + most_detailed_groundtruth_type = ( + _identify_most_detailed_annotation_type( + df=groundtruth_df, + ) + ) + + most_detailed_prediction_type = ( + _identify_most_detailed_annotation_type( + df=prediction_df, + ) + ) + + if not ( + most_detailed_groundtruth_type + == most_detailed_prediction_type + == least_detailed_groundtruth_type + == least_detailed_prediction_type + ) and (most_detailed_prediction_type != enums.AnnotationType.NONE): + raise ValueError( + "valor_core doesn't support auto-conversion of mixed AnnotationTypes. Please make sure to pass a convert_annotation_to_type argument to the evaluation function to tell valor_core how to handle mixed annotation types." + ) + + target_type = min( + [most_detailed_groundtruth_type, most_detailed_prediction_type] + ) + + # Check typing + valid_geometric_types = [ + enums.AnnotationType.BOX, + enums.AnnotationType.POLYGON, + enums.AnnotationType.RASTER, + ] + + # validate that we can convert geometries successfully + if target_type not in valid_geometric_types: + raise ValueError( + f"Annotation target with type `{target_type}` not supported." + ) + + groundtruth_df = _add_converted_geometry_column( + df=groundtruth_df, target_type=target_type + ) + prediction_df = _add_converted_geometry_column( + df=prediction_df, target_type=target_type + ) + + return (groundtruth_df, prediction_df)
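To make the conversion pipeline above concrete, here is a minimal end-to-end sketch (illustrative only, not part of this diff): it builds one ground truth and one prediction with bounding boxes, validates them into DataFrames, and converts both to a common `BOX` representation. The `valor_core.utilities` import path is assumed from the file layout in this diff; the classes and enum members come from the code above.

```python
# Minimal usage sketch for the validation + conversion helpers defined above.
from valor_core import enums, schemas
from valor_core.utilities import (  # path assumed from core/valor_core/utilities.py
    convert_annotations_to_common_type,
    create_validated_groundtruth_df,
    create_validated_prediction_df,
)

# One image with a ground truth box and an overlapping predicted box.
groundtruths = [
    schemas.GroundTruth(
        datum=schemas.Datum(uid="img1"),
        annotations=[
            schemas.Annotation(
                labels=[schemas.Label(key="class", value="dog")],
                bounding_box=schemas.Box.from_extrema(0, 10, 0, 10),
                is_instance=True,
            )
        ],
    )
]
predictions = [
    schemas.Prediction(
        datum=schemas.Datum(uid="img1"),
        annotations=[
            schemas.Annotation(
                labels=[schemas.Label(key="class", value="dog", score=0.9)],
                bounding_box=schemas.Box.from_extrema(1, 11, 1, 11),
                is_instance=True,
            )
        ],
    )
]

# Validate the inputs and flatten them into dataframes.
gt_df = create_validated_groundtruth_df(
    groundtruths, task_type=enums.TaskType.OBJECT_DETECTION
)
pd_df = create_validated_prediction_df(
    predictions, task_type=enums.TaskType.OBJECT_DETECTION
)

# Convert every annotation to a common BOX representation; this adds a
# 'converted_geometry' column holding numpy arrays of box coordinates.
gt_df, pd_df = convert_annotations_to_common_type(
    gt_df, pd_df, target_type=enums.AnnotationType.BOX
)
```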