diff --git a/.github/workflows/benchmark-evaluations.yml b/.github/workflows/benchmark-evaluations.yml
new file mode 100644
index 000000000..8d19e0d8b
--- /dev/null
+++ b/.github/workflows/benchmark-evaluations.yml
@@ -0,0 +1,46 @@
+name: Run benchmarks on pre-existing data
+
+on:
+  push:
+    branches: "**"
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  run-benchmarks:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: build postgres
+        run: |
+          docker build ./database -t pgvalor
+      - name: setup back end test env
+        run: docker compose -p valor -f docker-compose.yml -f docker-compose.cicd-override.yml --env-file ./api/.env.testing up --build -d
+      - uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+      - name: install api
+        run: pip install -e ".[test]"
+        working-directory: ./api
+      - name: install client
+        run: pip install -e ".[test]"
+        working-directory: ./client
+      - name: run classification benchmarks
+        run: python benchmark_script.py
+        working-directory: ./integration_tests/benchmarks/classification
+      - name: print classification results
+        run: |
+          export BENCHMARK_RESULTS=$(python -c "import os;import json;print(json.dumps(json.load(open('results.json', 'r')), indent=4));")
+          echo "$BENCHMARK_RESULTS"
+        working-directory: ./integration_tests/benchmarks/classification
+      - name: run object detection benchmarks
+        run: python benchmark_script.py
+        working-directory: ./integration_tests/benchmarks/object-detection
+      - name: print object detection results
+        run: |
+          export BENCHMARK_RESULTS=$(python -c "import os;import json;print(json.dumps(json.load(open('results.json', 'r')), indent=4));")
+          echo "$BENCHMARK_RESULTS"
+        working-directory: ./integration_tests/benchmarks/object-detection
+      - run: make stop-env
diff --git a/.github/workflows/stress-test.yml b/.github/workflows/stress-test.yml
deleted file mode 100644
index 46ed2ab27..000000000
--- a/.github/workflows/stress-test.yml
+++ /dev/null
@@ -1,81 +0,0 @@
-name: Run stress tests manually via the GitHub Actions UI
-
-on:
-  workflow_dispatch:
-
-permissions:
-  id-token: write
-  contents: read
-
-env:
-  AWS_ROLE: arn:aws:iam::724664234782:role/Striveworks-Role-github_runner_npe
-  AWS_REGION: us-east-1
-
-jobs:
-  integration-stress-tests:
-    env:
-      COVERAGE_FILE: .coverage.integration-stress-tests
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
-      - name: build postgres
-        run: |
-          docker build ./database -t pgvalor
-      - name: setup back end test env
-        run: docker compose -p valor -f docker-compose.yml -f docker-compose.cicd-override.yml --env-file ./api/.env.testing up --build -d
-      - uses: actions/setup-python@v4
-        with:
-          python-version: "3.10"
-      - name: install api
-        run: pip install -e ".[test]"
-        working-directory: ./api
-      - name: install client
-        run: pip install -e ".[test]"
-        working-directory: ./client
-      - run: coverage run --source="api/valor_api,client/valor" -m pytest -v integration_tests/stress_test.py
-      - run: coverage report
-      - name: upload coverage report as artifact
-        uses: actions/upload-artifact@v3
-        with:
-          name: ${{ env.COVERAGE_FILE }}
-          path: ${{ env.COVERAGE_FILE }}
-      - run: make stop-env
-      - run: docker compose -p valor -f docker-compose.yml -f docker-compose.cicd-override.yml --env-file ./api/.env.testing up --build -d
-        env:
-          VALOR_SECRET_KEY: ${{ vars.SECRET_KEY }}
-          VALOR_USERNAME: ${{ vars.USERNAME }}
-          VALOR_PASSWORD: ${{ vars.PASSWORD }}
-      - name: sleep to give back end time to spin up
-        run: sleep 15
-
-  combine-coverage-report:
-    needs: [integration-stress-tests]
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v4
-        with:
-          python-version: "3.10"
-      - run: pip install coverage
-      - uses: actions/download-artifact@v3
-        with:
-          name: .coverage.integration-stress-tests
-      - run: coverage combine
-      - run: coverage report
-      # https://nedbatchelder.com/blog/202209/making_a_coverage_badge.html
-      - run: |
-          coverage json
-          export TOTAL=$(python -c "import json;print(json.load(open('coverage.json'))['totals']['percent_covered_display'])")
-          echo "total=$TOTAL" >> $GITHUB_ENV
-      - name: "Make badge"
-        if: github.ref == 'refs/heads/main'
-        uses: schneegans/dynamic-badges-action@v1.4.0
-        with:
-          auth: ${{ secrets.GIST_SECRET }}
-          gistID: 501428c92df8d0de6805f40fb78b1363
-          filename: valor-coverage.json
-          label: Coverage
-          message: ${{ env.total }}%
-          minColorRange: 50
-          maxColorRange: 90
-          valColorRange: ${{ env.total }}
diff --git a/integration_tests/benchmarks/.gitignore b/integration_tests/benchmarks/.gitignore
new file mode 100644
index 000000000..94a2dd146
--- /dev/null
+++ b/integration_tests/benchmarks/.gitignore
@@ -0,0 +1 @@
+*.json
\ No newline at end of file
diff --git a/integration_tests/benchmarks/classification/benchmark_script.py b/integration_tests/benchmarks/classification/benchmark_script.py
new file mode 100644
index 000000000..5a0b8cfbe
--- /dev/null
+++ b/integration_tests/benchmarks/classification/benchmark_script.py
@@ -0,0 +1,212 @@
+import json
+import os
+from datetime import datetime
+from time import time
+
+import requests
+
+from valor import (
+    Annotation,
+    Client,
+    Dataset,
+    Datum,
+    GroundTruth,
+    Label,
+    Model,
+    Prediction,
+    connect,
+)
+
+connect("http://0.0.0.0:8000")
+client = Client()
+
+
+def download_data_if_not_exists(file_path: str, file_url: str):
+    """Download the data from a public bucket if it doesn't exist in the repo."""
+    if os.path.exists(file_path):
+        return
+
+    response = json.loads(requests.get(file_url).text)
+    with open(file_path, "w+") as file:
+        json.dump(response, file, indent=4)
+
+
+def write_results_to_file(write_path: str, result_dict: dict):
+    """Write results to results.json"""
+    current_datetime = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
+
+    if os.path.isfile(write_path):
+        with open(write_path, "r") as file:
+            file.seek(0)
+            data = json.load(file)
+    else:
+        data = {}
+
+    data[current_datetime] = result_dict
+
+    with open(write_path, "w+") as file:
+        json.dump(data, file, indent=4)
+
+
+def ingest_groundtruths_and_predictions(
+    dset: Dataset, model: Model, raw: dict, pair_limit: int
+):
+    """Ingest the data into Valor."""
+
+    groundtruths = []
+    predictions = []
+    slice_ = (
+        raw["groundtruth_prediction_pairs"][:pair_limit]
+        if pair_limit != -1
+        else raw["groundtruth_prediction_pairs"]
+    )
+    for groundtruth, prediction in slice_:
+        groundtruths.append(
+            GroundTruth(
+                datum=Datum(
+                    uid=groundtruth["value"]["datum"]["uid"],
+                    metadata={"width": 224, "height": 224},
+                ),
+                annotations=[
+                    Annotation(
+                        labels=[
+                            Label(
+                                key=label["key"],
+                                value=label["value"],
+                                score=label["score"],
+                            )
+                            for label in annotation["labels"]
+                        ],
+                    )
+                    for annotation in groundtruth["value"]["annotations"]
+                ],
+            )
+        )
+
+        predictions.append(
+            Prediction(
+                datum=Datum(
+                    uid=prediction["value"]["datum"]["uid"],
+                    metadata={"width": 224, "height": 224},
+                ),
+                annotations=[
+                    Annotation(
+                        labels=[
+                            Label(
+                                key=label["key"],
+                                value=label["value"],
+                                score=label["score"],
+                            )
+                            for label in annotation["labels"]
+                        ],
+                    )
+                    for annotation in prediction["value"]["annotations"]
+                ],
+            )
+        )
+
+    for gt in groundtruths:
+        dset.add_groundtruth(gt)
+
+    for pred in predictions:
+        model.add_prediction(dset, pred)
+
+    dset.finalize()
+    model.finalize_inferences(dataset=dset)
+
+
+def run_base_evaluation(dset: Dataset, model: Model):
+    """Run a base evaluation (with no PR curves)."""
+    evaluation = model.evaluate_classification(dset)
+    evaluation.wait_for_completion(timeout=30)
+    return evaluation
+
+
+def run_pr_curve_evaluation(dset: Dataset, model: Model):
+    """Run a base evaluation with PrecisionRecallCurve included."""
+    evaluation = model.evaluate_classification(
+        dset,
+        metrics_to_return=[
+            "Accuracy",
+            "Precision",
+            "Recall",
+            "F1",
+            "ROCAUC",
+            "PrecisionRecallCurve",
+        ],
+    )
+    evaluation.wait_for_completion()
+    return evaluation
+
+
+def run_detailed_pr_curve_evaluation(dset: Dataset, model: Model):
+    """Run a base evaluation with PrecisionRecallCurve and DetailedPrecisionRecallCurve included."""
+
+    evaluation = model.evaluate_classification(
+        dset,
+        metrics_to_return=[
+            "Accuracy",
+            "Precision",
+            "Recall",
+            "F1",
+            "ROCAUC",
+            "PrecisionRecallCurve",
+            "DetailedPrecisionRecallCurve",
+        ],
+    )
+    evaluation.wait_for_completion()
+    return evaluation
+
+
+def run_benchmarking_analysis(
+    limits_to_test: list[int] = [1000, 1000],
+    results_file: str = "results.json",
+    data_file: str = "data.json",
+):
+    """Time various function calls and export the results."""
+    current_directory = os.path.dirname(os.path.realpath(__file__))
+    write_path = f"{current_directory}/{results_file}"
+    data_path = f"{current_directory}/{data_file}"
+
+    download_data_if_not_exists(
+        file_path=data_path,
+        file_url="https://pub-fae71003f78140bdaedf32a7c8d331d2.r2.dev/classification_data.json",
+    )
+
+    with open(data_path) as file:
+        file.seek(0)
+        raw_data = json.load(file)
+
+    for limit in limits_to_test:
+        dset = Dataset.create(name="bird-identification")
+        model = Model.create(name="some_model")
+
+        start_time = time()
+
+        ingest_groundtruths_and_predictions(
+            dset=dset, model=model, raw=raw_data, pair_limit=limit
+        )
+        ingest_time = time() - start_time
+
+        try:
+            eval_ = run_base_evaluation(dset=dset, model=model)
+        except TimeoutError:
+            raise TimeoutError(
+                f"Evaluation timed out when processing {limit} datums."
+            )
+
+        results = {
+            "number_of_datums": limit,
+            "number_of_unique_labels": eval_.meta["labels"],
+            "number_of_annotations": eval_.meta["annotations"],
+            "ingest_runtime": f"{(ingest_time):.1f} seconds",
+            "eval_runtime": f"{(eval_.meta['duration']):.1f} seconds",
+        }
+        write_results_to_file(write_path=write_path, result_dict=results)
+
+        client.delete_dataset(dset.name, timeout=30)
+        client.delete_model(model.name, timeout=30)
+
+
+if __name__ == "__main__":
+    run_benchmarking_analysis()
diff --git a/integration_tests/benchmarks/object-detection/benchmark_script.py b/integration_tests/benchmarks/object-detection/benchmark_script.py
new file mode 100644
index 000000000..556cd5141
--- /dev/null
+++ b/integration_tests/benchmarks/object-detection/benchmark_script.py
@@ -0,0 +1,290 @@
+import json
+import os
+from datetime import datetime
+from time import time
+
+import requests
+
+from valor import (
+    Annotation,
+    Client,
+    Dataset,
+    Datum,
+    GroundTruth,
+    Label,
+    Model,
+    Prediction,
+    connect,
+)
+from valor.schemas import MultiPolygon, Polygon, Raster
+
+connect("http://0.0.0.0:8000")
+client = Client()
+
+
+def download_data_if_not_exists(file_path: str, file_url: str):
+    """Download the data from a public bucket if it doesn't exist in the repo."""
+    if os.path.exists(file_path):
+        return
+
+    response = json.loads(requests.get(file_url).text)
+    with open(file_path, "w+") as file:
+        json.dump(response, file, indent=4)
+
+
+def _convert_wkt_to_coordinates(wkt: str) -> list[list[tuple]]:
+    """Convert a WKT string into a nested list of coordinates."""
+    return [
+        [tuple(float(y) for y in x) for x in json.loads(wkt)["coordinates"][0]]
+    ]
+
+
+def write_results_to_file(write_path: str, result_dict: dict):
+    """Write results to results.json"""
+    current_datetime = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
+
+    if os.path.isfile(write_path):
+        with open(write_path, "r") as file:
+            file.seek(0)
+            data = json.load(file)
+    else:
+        data = {}
+
+    data[current_datetime] = result_dict
+
+    with open(write_path, "w+") as file:
+        json.dump(data, file, indent=4)
+
+
+def ingest_groundtruths_and_predictions(
+    dset: Dataset, model: Model, raw: list, pair_limit: int
+):
+    """Ingest the data into Valor."""
+    groundtruths = []
+    predictions = []
+
+    for datum_id, data in raw[:pair_limit]:
+        datum = Datum(
+            uid=str(datum_id),
+            metadata=data["datum_metadata"],
+        )
+        groundtruths.append(
+            GroundTruth(
+                datum=datum,
+                annotations=list(
+                    [
+                        Annotation(
+                            is_instance=ann["is_instance"],
+                            labels=list(
+                                [
+                                    Label(
+                                        key=label["key"],
+                                        value=label["value"],
+                                    )
+                                    for label in ann["labels"]
+                                ]
+                            ),
+                            bounding_box=(
+                                _convert_wkt_to_coordinates(ann["box"])
+                                if ann["box"]
+                                else None
+                            ),
+                            raster=(
+                                Raster.from_geometry(
+                                    geometry=MultiPolygon(
+                                        [
+                                            _convert_wkt_to_coordinates(
+                                                ann["raster"]
+                                            )
+                                        ]
+                                    ),
+                                    height=data["datum_metadata"]["height"],
+                                    width=data["datum_metadata"]["width"],
+                                )
+                                if ann["raster"]
+                                else None
+                            ),
+                            polygon=(
+                                (
+                                    Polygon(
+                                        _convert_wkt_to_coordinates(
+                                            ann["polygon"]
+                                        )
+                                    )
+                                )
+                                if ann["polygon"]
+                                else None
+                            ),
+                        )
+                        for ann in data["groundtruth_annotations"]
+                    ]
+                ),
+            )
+        )
+
+        predictions.append(
+            Prediction(
+                datum=datum,
+                annotations=list(
+                    [
+                        Annotation(
+                            is_instance=ann["is_instance"],
+                            labels=list(
+                                [
+                                    Label(
+                                        key=label["key"],
+                                        value=label["value"],
+                                        score=label["score"],
+                                    )
+                                    for label in ann["labels"]
+                                ]
+                            ),
+                            bounding_box=(
+                                _convert_wkt_to_coordinates(ann["box"])
+                                if ann["box"]
+                                else None
+                            ),
+                            raster=(
+                                Raster.from_geometry(
+                                    geometry=MultiPolygon(
+                                        [
+                                            _convert_wkt_to_coordinates(
+                                                ann["raster"]
+                                            )
+                                        ]
+                                    ),
+                                    height=data["datum_metadata"]["height"],
+                                    width=data["datum_metadata"]["width"],
+                                )
+                                if ann["raster"]
+                                else None
+                            ),
+                            polygon=(
+                                (
+                                    Polygon(
+                                        _convert_wkt_to_coordinates(
+                                            ann["polygon"]
+                                        )
+                                    )
+                                )
+                                if ann["polygon"]
+                                else None
+                            ),
+                        )
+                        for ann in data["prediction_annotations"]
+                    ]
+                ),
+            )
+        )
+
+    for gt in groundtruths:
+        dset.add_groundtruth(gt)
+
+    for pred in predictions:
+        model.add_prediction(dset, pred)
+
+    dset.finalize()
+    model.finalize_inferences(dataset=dset)
+
+
+def run_base_evaluation(dset: Dataset, model: Model):
+    """Run a base evaluation (with no PR curves)."""
+    evaluation = model.evaluate_detection(dset)
+    evaluation.wait_for_completion(timeout=30)
+    return evaluation
+
+
+def run_pr_curve_evaluation(dset: Dataset, model: Model):
+    """Run a base evaluation with PrecisionRecallCurve included."""
+    evaluation = model.evaluate_detection(
+        dset,
+        metrics_to_return=[
+            "AP",
+            "AR",
+            "mAP",
+            "APAveragedOverIOUs",
+            "mAR",
+            "mAPAveragedOverIOUs",
+            "PrecisionRecallCurve",
+        ],
+    )
+    evaluation.wait_for_completion()
+    return evaluation
+
+
+def run_detailed_pr_curve_evaluation(dset: Dataset, model: Model):
+    """Run a base evaluation with PrecisionRecallCurve and DetailedPrecisionRecallCurve included."""
+
+    evaluation = model.evaluate_detection(
+        dset,
+        metrics_to_return=[
+            "AP",
+            "AR",
+            "mAP",
+            "APAveragedOverIOUs",
+            "mAR",
+            "mAPAveragedOverIOUs",
+            "PrecisionRecallCurve",
+            "DetailedPrecisionRecallCurve",
+        ],
+    )
+    evaluation.wait_for_completion()
+    return evaluation
+
+
+def run_benchmarking_analysis(
+    limits_to_test: list[int] = [3, 3],
+    results_file: str = "results.json",
+    data_file: str = "data.json",
+):
+    """Time various function calls and export the results."""
+    current_directory = os.path.dirname(os.path.realpath(__file__))
+    write_path = f"{current_directory}/{results_file}"
+    data_path = f"{current_directory}/{data_file}"
+
+    download_data_if_not_exists(
+        file_path=data_path,
+        file_url="https://pub-fae71003f78140bdaedf32a7c8d331d2.r2.dev/detection_data.json",
+    )
+
+    with open(data_path) as file:
+        file.seek(0)
+        raw_data = json.load(file)
+
+    for limit in limits_to_test:
+        dset = Dataset.create(name="coco-dataset")
+        model = Model.create(name="coco-model")
+
+        # convert dict into list of tuples so we can slice it
+        raw_data_tuple = [(key, value) for key, value in raw_data.items()]
+
+        start_time = time()
+
+        ingest_groundtruths_and_predictions(
+            dset=dset, model=model, raw=raw_data_tuple, pair_limit=limit
+        )
+        ingest_time = time() - start_time
+
+        try:
+            eval_ = run_base_evaluation(dset=dset, model=model)
+        except TimeoutError:
+            raise TimeoutError(
+                f"Evaluation timed out when processing {limit} datums."
+            )
+
+        results = {
+            "number_of_datums": limit,
+            "number_of_unique_labels": eval_.meta["labels"],
+            "number_of_annotations": eval_.meta["annotations"],
+            "ingest_runtime": f"{(ingest_time):.1f} seconds",
+            "eval_runtime": f"{(eval_.meta['duration']):.1f} seconds",
+        }
+
+        write_results_to_file(write_path=write_path, result_dict=results)
+
+        client.delete_dataset(dset.name, timeout=30)
+        client.delete_model(model.name, timeout=30)
+
+
+if __name__ == "__main__":
+    run_benchmarking_analysis()
diff --git a/integration_tests/stress_test.py b/integration_tests/stress_test.py
deleted file mode 100644
index 0dcb49b40..000000000
--- a/integration_tests/stress_test.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# NOTE: These tests aren't run automatically on each commit. They are intended to be manually kicked-off using the [GitHub UI](https://leonardomontini.dev/github-action-manual-trigger/)
-from integration_tests.client.datatype.test_data_generation import (
-    test_generate_segmentation_data,
-)
-
-from valor import Client
-
-
-def test_large_dataset_upload(client: Client):
-    """Tests the upload of a large dataset to valor (runtime: ~20 minutes)"""
-    test_generate_segmentation_data(
-        client=client,
-        dataset_name="stress_test_dataset",
-        n_images=1000,
-        n_annotations=10,
-        n_labels=2,
-    )
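A minimal local-run sketch for the new benchmark scripts, assuming the back-end test environment from docker-compose is already up so the Valor API is reachable at http://0.0.0.0:8000 (the address both scripts pass to connect()); the pair limit below is illustrative, not the CI default:

    # Run from integration_tests/benchmarks/classification (or object-detection).
    # Importing benchmark_script calls connect("http://0.0.0.0:8000"), so the API must already be running.
    from benchmark_script import run_benchmarking_analysis

    # Downloads data.json from the public bucket on first use, then appends timings to results.json.
    run_benchmarking_analysis(limits_to_test=[100])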