diff --git a/.github/workflows/lite-benchmark-evaluations.yml b/.github/workflows/lite-benchmark-evaluations.yml index 8afe7ed21..82d80b7bf 100644 --- a/.github/workflows/lite-benchmark-evaluations.yml +++ b/.github/workflows/lite-benchmark-evaluations.yml @@ -35,4 +35,3 @@ jobs: export BENCHMARK_RESULTS=$(python -c "import os;import json;print(json.dumps(json.load(open('objdet_results.json', 'r')), indent=4));") echo "$BENCHMARK_RESULTS" working-directory: ./lite/benchmarks/ - - run: make stop-env diff --git a/.github/workflows/lite-synthetic-benchmarks.yml b/.github/workflows/lite-synthetic-benchmarks.yml new file mode 100644 index 000000000..daaefe5dc --- /dev/null +++ b/.github/workflows/lite-synthetic-benchmarks.yml @@ -0,0 +1,24 @@ +name: "[valor-lite] synthetic benchmarks" + +on: + push: + branches: "**" + +permissions: + id-token: write + contents: read + +jobs: + run-benchmarks: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: install lite + run: pip install -e . + working-directory: ./lite + - name: benchmark semantic segmentation + run: python benchmark_semantic_segmentation.py + working-directory: ./lite/benchmarks/synthetic/ diff --git a/lite/benchmarks/synthetic/benchmark_semantic_segmentation.py b/lite/benchmarks/synthetic/benchmark_semantic_segmentation.py new file mode 100644 index 000000000..737da165c --- /dev/null +++ b/lite/benchmarks/synthetic/benchmark_semantic_segmentation.py @@ -0,0 +1,94 @@ +from valor_lite.profiling import Benchmark, BenchmarkError +from valor_lite.semantic_segmentation.benchmark import ( + benchmark_add_data, + benchmark_evaluate, + benchmark_finalize, +) + + +def benchmark( + bitmask_shape: tuple[int, int], + number_of_unique_labels: int, + number_of_images: int, + *_, + memory_limit: float = 4.0, + time_limit: float = 10.0, + repeat: int = 1, + verbose: bool = False, +): + """ + Runs a single benchmark. + + Parameters + ---------- + bitmask_shape : tuple[int, int] + The size (h, w) of the bitmask array. + number_of_unique_labels : int + The number of unique labels used in the synthetic example. + number_of_images : int + The number of distinct datums that are created. + memory_limit : float + The maximum amount of system memory allowed in gigabytes (GB). + time_limit : float + The maximum amount of time permitted before killing the benchmark. + repeat : int + The number of times to run a benchmark to produce an average runtime. + verbose : bool, default=False + Toggles terminal output of benchmark results. 
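+
+    Raises
+    ------
+    BenchmarkError
+        If any stage of the benchmark fails or exceeds the configured limits.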
+ """ + + b = Benchmark( + time_limit=time_limit, + memory_limit=int(memory_limit * (1024**3)), + repeat=repeat, + verbose=verbose, + ) + + _, failed, details = b.run( + benchmark=benchmark_add_data, + n_labels=[number_of_unique_labels], + shape=[bitmask_shape], + ) + if failed: + raise BenchmarkError( + benchmark=details["benchmark"], + error_type=failed[0]["error"], + error_message=failed[0]["msg"], + ) + + _, failed, details = b.run( + benchmark=benchmark_finalize, + n_datums=[number_of_images], + n_labels=[number_of_unique_labels], + ) + if failed: + raise BenchmarkError( + benchmark=details["benchmark"], + error_type=failed[0]["error"], + error_message=failed[0]["msg"], + ) + + _, failed, details = b.run( + benchmark=benchmark_evaluate, + n_datums=[number_of_images], + n_labels=[number_of_unique_labels], + ) + if failed: + raise BenchmarkError( + benchmark=details["benchmark"], + error_type=failed[0]["error"], + error_message=failed[0]["msg"], + ) + + +if __name__ == "__main__": + + benchmark( + bitmask_shape=(4000, 4000), + number_of_images=1000, + number_of_unique_labels=10, + memory_limit=4.0, + time_limit=10.0, + repeat=1, + verbose=True, + ) diff --git a/lite/examples/benchmarking.ipynb b/lite/examples/benchmarking.ipynb new file mode 100644 index 000000000..7774a759e --- /dev/null +++ b/lite/examples/benchmarking.ipynb @@ -0,0 +1,279 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from valor_lite.profiling import Benchmark\n", + "\n", + "b = Benchmark(\n", + " time_limit=5.0, # 5s\n", + " memory_limit=8 * (1024 ** 3), # 8 GB\n", + " repeat=1,\n", + " verbose=True,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Semantic Segmentation" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from valor_lite.semantic_segmentation.benchmark import (\n", + " benchmark_add_data as semseg_add_data,\n", + " benchmark_finalize as semseg_finalize,\n", + " benchmark_evaluate as semseg_evaluate,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "n_datums = [\n", + " 10000,\n", + " 1000,\n", + " 100,\n", + " 10,\n", + " 1,\n", + "]\n", + "\n", + "n_labels = [\n", + " 1000,\n", + " 100,\n", + " 10,\n", + " 3,\n", + "]\n", + "\n", + "shapes = [\n", + " (10000, 10000),\n", + " (2500, 2500),\n", + " (1000, 1000),\n", + " (100, 100),\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 69%|██████▉ | 11/16 [00:46<00:21, 4.26s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=====================================================================\n", + "Details\n", + "{\n", + " \"benchmark\": \"benchmark_add_data\",\n", + " \"limits\": {\n", + " \"memory_limit\": \"8.0 GB\",\n", + " \"time_limit\": \"5.0 seconds\",\n", + " \"repeat\": 1\n", + " },\n", + " \"passed\": 8,\n", + " \"failed\": 8,\n", + " \"total\": 16\n", + "}\n", + "\n", + "Passed\n", + " complexity | runtime | n_labels | shape \n", + "---------------------------------------------------------------------\n", + " 300000000 | 1.5151 | 3 | (10000, 10000) \n", + " 62500000 | 0.5952 | 10 | (2500, 2500) \n", + " 10000000 | 0.0911 | 10 | (1000, 1000) \n", + " 1000000 | 0.0582 | 100 | (100, 100) \n", + "\n", + "Failed\n", + " complexity | error 
| n_labels | shape | msg \n", + "---------------------------------------------------------------------------------------\n", + " 100000000000 | MemoryError | 1000 | (10000, 10000) | Unable to allocate 186. GiB for an array with shape (1001, 20000, 10000) and data type bool\n", + " 10000000000 | MemoryError | 100 | (10000, 10000) | Unable to allocate 18.8 GiB for an array with shape (101, 20000, 10000) and data type bool\n", + " 6250000000 | MemoryError | 1000 | (2500, 2500) | Unable to allocate 11.7 GiB for an array with shape (1001, 5000, 2500) and data type bool\n", + " 1000000000 | MemoryError | 10 | (10000, 10000) | Unable to allocate 9.31 GiB for an array with shape (10, 10, 100000000) and data type bool\n", + " 1000000000 | MemoryError | 1000 | (1000, 1000) | Unable to allocate 931. GiB for an array with shape (1000, 1000, 1000000) and data type bool\n", + " 625000000 | MemoryError | 100 | (2500, 2500) | Unable to allocate 58.2 GiB for an array with shape (100, 100, 6250000) and data type bool\n", + " 100000000 | MemoryError | 100 | (1000, 1000) | Unable to allocate 9.31 GiB for an array with shape (100, 100, 1000000) and data type bool\n", + " 10000000 | MemoryError | 1000 | (100, 100) | Unable to allocate 9.31 GiB for an array with shape (1000, 1000, 10000) and data type bool\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "_ = b.run(\n", + " benchmark=semseg_add_data,\n", + " n_labels=n_labels,\n", + " shape=shapes,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 20%|██ | 4/20 [02:35<10:22, 38.92s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=====================================================================\n", + "Details\n", + "{\n", + " \"benchmark\": \"benchmark_finalize\",\n", + " \"limits\": {\n", + " \"memory_limit\": \"8.0 GB\",\n", + " \"time_limit\": \"5.0 seconds\",\n", + " \"repeat\": 1\n", + " },\n", + " \"passed\": 18,\n", + " \"failed\": 2,\n", + " \"total\": 20\n", + "}\n", + "\n", + "Passed\n", + " complexity | runtime | n_datums | n_labels \n", + "---------------------------------------------------------------------\n", + " 1000000 | 1.1142 | 10000 | 100 \n", + " 100000 | 0.1748 | 100 | 1000 \n", + " 100000 | 0.1086 | 1000 | 100 \n", + "\n", + "Failed\n", + " complexity | error | n_datums | n_labels | msg \n", + "---------------------------------------------------------------------------------------\n", + " 10000000 | MemoryError | 10000 | 1000 | Unable to allocate 7.63 MiB for an array with shape (1000, 1000) and data type int64\n", + " 1000000 | MemoryError | 1000 | 1000 | \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "_ = b.run(\n", + " benchmark=semseg_finalize,\n", + " n_datums=n_datums,\n", + " n_labels=n_labels,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 20%|██ | 4/20 [02:25<09:40, 36.28s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=====================================================================\n", + "Details\n", + "{\n", + " \"benchmark\": \"benchmark_evaluate\",\n", + " \"limits\": {\n", + " \"memory_limit\": \"8.0 GB\",\n", + " \"time_limit\": \"5.0 seconds\",\n", + " \"repeat\": 1\n", + " },\n", + " 
\"passed\": 18,\n", + " \"failed\": 2,\n", + " \"total\": 20\n", + "}\n", + "\n", + "Passed\n", + " complexity | runtime | n_datums | n_labels \n", + "---------------------------------------------------------------------\n", + " 1000000 | 0.0537 | 10000 | 100 \n", + " 100000 | 0.0815 | 100 | 1000 \n", + " 100000 | 0.0137 | 1000 | 100 \n", + "\n", + "Failed\n", + " complexity | error | n_datums | n_labels | msg \n", + "---------------------------------------------------------------------------------------\n", + " 10000000 | MemoryError | 10000 | 1000 | Unable to allocate 23.8 MiB for an array with shape (1000, 1000, 25) and data type bool\n", + " 1000000 | MemoryError | 1000 | 1000 | Unable to allocate 3.73 GiB for an array with shape (1000, 1001, 1001) and data type int32\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "_ = b.run(\n", + " benchmark=semseg_evaluate,\n", + " n_datums=n_datums,\n", + " n_labels=n_labels,\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".env-valor", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/lite/tests/semantic_segmentation/test_annotation.py b/lite/tests/semantic_segmentation/test_annotation.py index 999dd5240..89b0ba7a4 100644 --- a/lite/tests/semantic_segmentation/test_annotation.py +++ b/lite/tests/semantic_segmentation/test_annotation.py @@ -1,6 +1,10 @@ import numpy as np import pytest -from valor_lite.semantic_segmentation import Bitmask, Segmentation +from valor_lite.semantic_segmentation import ( + Bitmask, + Segmentation, + generate_segmentation, +) def test_bitmask(): @@ -78,3 +82,55 @@ def test_segmentation(): predictions=[], ) assert "missing predictions" in str(e) + + +def test_generate_segmentation(): + + # N labels > 1 + segmentation = generate_segmentation( + datum_uid="uid1", + number_of_unique_labels=3, + mask_height=2, + mask_width=3, + ) + + assert segmentation.uid == "uid1" + assert segmentation.shape == (2, 3) + assert segmentation.size == 6 + + assert len(segmentation.groundtruths) == 3 + assert all(gt.mask.dtype == np.bool_ for gt in segmentation.groundtruths) + assert all(gt.mask.shape == (2, 3) for gt in segmentation.groundtruths) + + assert len(segmentation.predictions) == 3 + assert all(pd.mask.dtype == np.bool_ for pd in segmentation.predictions) + assert all(pd.mask.shape == (2, 3) for pd in segmentation.predictions) + + # N labels = 1 + segmentation = generate_segmentation( + datum_uid="uid1", + number_of_unique_labels=1, + mask_height=2, + mask_width=3, + ) + + assert segmentation.uid == "uid1" + assert segmentation.shape == (2, 3) + assert segmentation.size == 6 + + assert len(segmentation.groundtruths) == 1 + assert all(gt.mask.dtype == np.bool_ for gt in segmentation.groundtruths) + assert all(gt.mask.shape == (2, 3) for gt in segmentation.groundtruths) + + assert len(segmentation.predictions) == 1 + assert all(pd.mask.dtype == np.bool_ for pd in segmentation.predictions) + assert all(pd.mask.shape == (2, 3) for pd in segmentation.predictions) + + # N labels = 0 + with pytest.raises(ValueError): + generate_segmentation( + datum_uid="uid1", + number_of_unique_labels=0, + mask_height=2, + 
mask_width=3, + ) diff --git a/lite/valor_lite/object_detection/manager.py b/lite/valor_lite/object_detection/manager.py index bd7663107..bfaacf5ab 100644 --- a/lite/valor_lite/object_detection/manager.py +++ b/lite/valor_lite/object_detection/manager.py @@ -334,6 +334,10 @@ def evaluate( return metrics +def defaultdict_int(): + return defaultdict(int) + + class DataLoader: """ Object Detection DataLoader @@ -342,8 +346,8 @@ class DataLoader: def __init__(self): self._evaluator = Evaluator() self.pairs: list[NDArray[np.float64]] = list() - self.groundtruth_count = defaultdict(lambda: defaultdict(int)) - self.prediction_count = defaultdict(lambda: defaultdict(int)) + self.groundtruth_count = defaultdict(defaultdict_int) + self.prediction_count = defaultdict(defaultdict_int) def _add_datum(self, uid: str) -> int: """ diff --git a/lite/valor_lite/profiling.py b/lite/valor_lite/profiling.py new file mode 100644 index 000000000..be275ce3f --- /dev/null +++ b/lite/valor_lite/profiling.py @@ -0,0 +1,374 @@ +import json +import math +import multiprocessing as mp +import resource +import time +from collections import deque +from multiprocessing import Queue +from typing import Any + +from tqdm import tqdm + + +class BenchmarkError(Exception): + def __init__( + self, benchmark: str, error_type: str, error_message: str + ) -> None: + super().__init__( + f"'{benchmark}' raised '{error_type}' with the following message: {error_message}" + ) + + +def _timeit_subprocess(*args, __fn, __queue: Queue, **kwargs): + """ + Multiprocessing subprocess that reports either runtime or errors. + + This is handled within a subprocess to protect the benchmark against OOM errors. + """ + try: + timer_start = time.perf_counter() + __fn(*args, **kwargs) + timer_end = time.perf_counter() + __queue.put(timer_end - timer_start) + except Exception as e: + __queue.put(e) + + +def create_runtime_profiler( + time_limit: float | None, + repeat: int = 1, +): + """ + Creates a runtime profiler as a decorating function. + + The profiler reports runtime of the wrapped function from a subprocess to protect against OOM errors. + + Parameters + ---------- + time_limit : float, optional + An optional time limit to constrain the benchmark. + repeat : int, default=1 + The number of times to repeat the benchmark to produce an average runtime. + """ + ctx = mp.get_context("spawn") + + def decorator(fn): + def wrapper(*args, **kwargs): + # Record average runtime over repeated runs. + elapsed = 0 + for _ in range(repeat): + q = ctx.Queue() + p = ctx.Process( + target=_timeit_subprocess, + args=args, + kwargs={"__fn": fn, "__queue": q, **kwargs}, + ) + p.start() + p.join(timeout=time_limit) + + # Check if computation finishes within the timeout + if p.is_alive(): + p.terminate() + p.join() + q.close() + q.join_thread() + raise TimeoutError( + f"Function '{fn.__name__}' did not complete within {time_limit} seconds." 
+ ) + + # Retrieve the result + result = q.get(timeout=1) + if isinstance(result, Exception): + raise result + elif isinstance(result, float): + elapsed += result + else: + raise TypeError(type(result).__name__) + + return elapsed / repeat + + return wrapper + + return decorator + + +def pretty_print_results(results: tuple): + valid, invalid, permutations = results + + print( + "=====================================================================" + ) + print("Details") + print(json.dumps(permutations, indent=4)) + + if len(valid) > 0: + print() + print("Passed") + keys = ["complexity", "runtime", *valid[0]["details"].keys()] + header = " | ".join(f"{header:^15}" for header in keys) + print(header) + print("-" * len(header)) + for entry in valid: + values = [ + entry["complexity"], + round(entry["runtime"], 4), + *entry["details"].values(), + ] + row = " | ".join(f"{str(value):^15}" for value in values) + print(row) + + if len(invalid) > 0: + print() + print("Failed") + keys = ["complexity", "error", *invalid[0]["details"].keys(), "msg"] + header = " | ".join(f"{header:^15}" for header in keys) + print(header) + print("-" * len(header)) + for entry in invalid: + values = [ + entry["complexity"], + entry["error"], + *entry["details"].values(), + entry["msg"], + ] + row = " | ".join(f"{str(value):^15}" for value in values) + print(row) + + +def _calculate_complexity(params: list[int | tuple[int]]) -> int: + """ + Basic metric of benchmark complexity. + """ + flattened_params = [ + math.prod(p) if isinstance(p, tuple) else p for p in params + ] + return math.prod(flattened_params) + + +class Benchmark: + def __init__( + self, + time_limit: float | None, + memory_limit: int | None, + *_, + repeat: int | None = 1, + verbose: bool = False, + ): + self.time_limit = time_limit + self.memory_limit = memory_limit + self.repeat = repeat + self.verbose = verbose + + def get_limits( + self, + *_, + readable: bool = True, + memory_unit: str = "GB", + time_unit: str = "seconds", + ) -> dict[str, str | int | float | None]: + """ + Returns a dictionary of benchmark limits. + + Parameters + ---------- + readable : bool, default=True + Toggles whether the output should be human readable. + memory_unit : str, default="GB" + Toggles what unit to display the memory limit with when 'readable=True'. + time_unit : str, default="seconds" + Toggles what unit to display the time limit with when 'readable=True'. + + Returns + ------- + dict[str, str | int | float | None] + The benchmark limits. + """ + + memory_value = self.memory_limit + if readable and memory_value is not None: + match memory_unit: + case "TB": + memory_value /= 1024**4 + case "GB": + memory_value /= 1024**3 + case "MB": + memory_value /= 1024**2 + case "KB": + memory_value /= 1024 + case "B": + pass + case _: + valid_set = {"TB", "GB", "MB", "KB", "B"} + raise ValueError( + f"Expected memory unit to be in the set {valid_set}, received '{memory_unit}'." + ) + memory_value = f"{memory_value} {memory_unit}" + + time_value = self.time_limit + if readable and time_value is not None: + match time_unit: + case "minutes": + time_value /= 60 + case "seconds": + pass + case "milliseconds": + time_value *= 1000 + case _: + valid_set = {"minutes", "seconds", "milliseconds"} + raise ValueError( + f"Expected time unit to be in the set {valid_set}, received '{time_unit}'." 
+ ) + time_value = f"{time_value} {time_unit}" + + return { + "memory_limit": memory_value, + "time_limit": time_value, + "repeat": self.repeat, + } + + @property + def memory_limit(self) -> int | None: + """ + The memory limit in bytes (B). + """ + return self._memory_limit + + @memory_limit.setter + def memory_limit(self, limit: int | None): + """ + Stores the memory limit and restricts resources. + """ + self._memory_limit = limit + if limit is not None: + _, hard = resource.getrlimit(resource.RLIMIT_AS) + resource.setrlimit(resource.RLIMIT_AS, (limit, hard)) + + def run( + self, + benchmark, + **kwargs: list[Any], + ): + """ + Runs a benchmark with ranges of parameters. + + Parameters + ---------- + benchmark : Callable + The benchmark function. + **kwargs : list[Any] + Keyword arguments passing lists of parameters to benchmark. The values should be sorted in + decreasing complexity. For example, if the number of labels is a parameter then a higher + number of unique labels would be considered "more" complex. + + Example + ------- + >>> b = Benchmark( + ... time_limit=10.0, + ... memory_limit=8 * (1024**3), + ... repeat=1, + ... verbose=False, + ... ) + >>> results = b.run( + ... benchmark=semseg_add_data, + ... n_labels=[ + ... 100, + ... 10, + ... ], + ... shape=[ + ... (1000, 1000), + ... (100, 100), + ... ], + ... ) + """ + + nvars = len(kwargs) + keys = tuple(kwargs.keys()) + vars = tuple(kwargs[key] for key in keys) + + initial_indices = tuple(0 for _ in range(nvars)) + max_indices = tuple(len(v) for v in vars) + permutations = math.prod(max_indices) + + # Initialize queue with the starting index (0, ...) + queue = deque() + queue.append(initial_indices) + + # Keep track of explored combinations to avoid duplicates + explored = set() + explored.add(initial_indices) + + # Store valid combinations that finish within the time limit + valid_combinations = [] + invalid_combinations = [] + + pbar = tqdm(total=math.prod(max_indices), disable=(not self.verbose)) + prev_count = 0 + while queue: + + current_indices = queue.popleft() + parameters = { + k: v[current_indices[idx]] + for idx, (k, v) in enumerate(zip(keys, vars)) + } + complexity = _calculate_complexity(list(parameters.values())) + + details: dict = {k: str(v) for k, v in parameters.items()} + + # update terminal with status + count = len(valid_combinations) + len(invalid_combinations) + pbar.update(count - prev_count) + prev_count = count + + try: + runtime = benchmark( + time_limit=self.time_limit, + repeat=self.repeat, + **parameters, + ) + valid_combinations.append( + { + "complexity": complexity, + "runtime": runtime, + "details": details, + } + ) + continue + except Exception as e: + invalid_combinations.append( + { + "complexity": complexity, + "error": type(e).__name__, + "msg": str(e), + "details": details, + } + ) + + for idx in range(nvars): + new_indices = list(current_indices) + if new_indices[idx] + 1 < max_indices[idx]: + new_indices[idx] += 1 + new_indices_tuple = tuple(new_indices) + if new_indices_tuple not in explored: + queue.append(new_indices_tuple) + explored.add(new_indices_tuple) + + valid_combinations.sort(key=lambda x: -x["complexity"]) + invalid_combinations.sort(key=lambda x: -x["complexity"]) + + # clear terminal and display results + results = ( + valid_combinations, + invalid_combinations, + { + "benchmark": benchmark.__name__, + "limits": self.get_limits(readable=True), + "passed": permutations - len(invalid_combinations), + "failed": len(invalid_combinations), + "total": permutations, + }, + ) 
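+
+        # NOTE: "passed" in the summary above counts every permutation that
+        # did not explicitly fail, including less complex permutations that
+        # the search never needed to run.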
+ pbar.close() + if self.verbose: + pretty_print_results(results) + + return results diff --git a/lite/valor_lite/semantic_segmentation/__init__.py b/lite/valor_lite/semantic_segmentation/__init__.py index dfa0e2380..51bd54d02 100644 --- a/lite/valor_lite/semantic_segmentation/__init__.py +++ b/lite/valor_lite/semantic_segmentation/__init__.py @@ -1,4 +1,4 @@ -from .annotation import Bitmask, Segmentation +from .annotation import Bitmask, Segmentation, generate_segmentation from .manager import DataLoader, Evaluator from .metric import Metric, MetricType @@ -9,4 +9,5 @@ "Bitmask", "Metric", "MetricType", + "generate_segmentation", ] diff --git a/lite/valor_lite/semantic_segmentation/annotation.py b/lite/valor_lite/semantic_segmentation/annotation.py index acd99f8f7..7e96fe926 100644 --- a/lite/valor_lite/semantic_segmentation/annotation.py +++ b/lite/valor_lite/semantic_segmentation/annotation.py @@ -29,7 +29,7 @@ class Bitmask: def __post_init__(self): if self.mask.dtype != np.bool_: raise ValueError( - f"Bitmask recieved mask with dtype `{self.mask.dtype}`." + f"Bitmask recieved mask with dtype '{self.mask.dtype}'." ) @@ -94,3 +94,86 @@ def __post_init__(self): self.shape = groundtruth_shape.pop() self.size = int(np.prod(np.array(self.shape))) + + +def generate_segmentation( + datum_uid: str, + number_of_unique_labels: int, + mask_height: int, + mask_width: int, +) -> Segmentation: + """ + Generates a semantic segmentation annotation. + + Parameters + ---------- + datum_uid : str + The datum UID for the generated segmentation. + number_of_unique_labels : int + The number of unique labels. + mask_height : int + The height of the mask in pixels. + mask_width : int + The width of the mask in pixels. + + Returns + ------- + Segmentation + A generated semantic segmenatation annotation. + """ + + if number_of_unique_labels > 1: + common_proba = 0.4 / (number_of_unique_labels - 1) + min_proba = min(common_proba, 0.1) + labels = [str(i) for i in range(number_of_unique_labels)] + [None] + proba = ( + [0.5] + + [common_proba for _ in range(number_of_unique_labels - 1)] + + [0.1] + ) + elif number_of_unique_labels == 1: + labels = ["0", None] + proba = [0.9, 0.1] + min_proba = 0.1 + else: + raise ValueError( + "The number of unique labels should be greater than zero." + ) + + probabilities = np.array(proba, dtype=np.float64) + weights = (probabilities / min_proba).astype(np.int32) + + indices = np.random.choice( + np.arange(len(weights)), + size=(mask_height * 2, mask_width), + p=probabilities, + ) + + N = len(labels) + + masks = np.arange(N)[:, None, None] == indices + + gts = [] + pds = [] + for lidx in range(N): + label = labels[lidx] + if label is None: + continue + gts.append( + Bitmask( + mask=masks[lidx, :mask_height, :], + label=label, + ) + ) + pds.append( + Bitmask( + mask=masks[lidx, mask_height:, :], + label=label, + ) + ) + + return Segmentation( + uid=datum_uid, + groundtruths=gts, + predictions=pds, + ) diff --git a/lite/valor_lite/semantic_segmentation/benchmark.py b/lite/valor_lite/semantic_segmentation/benchmark.py new file mode 100644 index 000000000..b4950eac1 --- /dev/null +++ b/lite/valor_lite/semantic_segmentation/benchmark.py @@ -0,0 +1,151 @@ +from valor_lite.profiling import create_runtime_profiler +from valor_lite.semantic_segmentation import DataLoader, generate_segmentation + + +def benchmark_add_data( + n_labels: int, + shape: tuple[int, int], + time_limit: float | None, + repeat: int = 1, +) -> float: + """ + Benchmarks 'Dataloader.add_data' for semantic segmentation. 
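+
+    Synthetic data generation happens outside the profiled call; only the
+    'add_data' call itself is timed, in a spawned subprocess.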
+ + Parameters + ---------- + n_labels : int + The number of unique labels to generate. + shape : tuple[int, int] + The size (h,w) of the mask to generate. + time_limit : float, optional + An optional time limit to constrain the benchmark. + repeat : int + The number of times to run the benchmark to produce a runtime average. + + Returns + ------- + float + The average runtime. + """ + + profile = create_runtime_profiler( + time_limit=time_limit, + repeat=repeat, + ) + + elapsed = 0 + for _ in range(repeat): + data = generate_segmentation( + datum_uid="uid", + number_of_unique_labels=n_labels, + mask_height=shape[0], + mask_width=shape[1], + ) + loader = DataLoader() + elapsed += profile(loader.add_data)([data]) + return elapsed / repeat + + +def benchmark_finalize( + n_datums: int, + n_labels: int, + time_limit: float | None, + repeat: int = 1, +): + """ + Benchmarks 'Dataloader.finalize' for semantic segmentation. + + Parameters + ---------- + n_datums : int + The number of datums to generate. + n_labels : int + The number of unique labels to generate. + time_limit : float, optional + An optional time limit to constrain the benchmark. + repeat : int + The number of times to run the benchmark to produce a runtime average. + + Returns + ------- + float + The average runtime. + """ + + profile = create_runtime_profiler( + time_limit=time_limit, + repeat=repeat, + ) + + elapsed = 0 + for _ in range(repeat): + + data = [ + generate_segmentation( + datum_uid=str(i), + number_of_unique_labels=n_labels, + mask_height=5, + mask_width=5, + ) + for i in range(10) + ] + loader = DataLoader() + for datum_idx in range(n_datums): + segmentation = data[datum_idx % 10] + segmentation.uid = str(datum_idx) + loader.add_data([segmentation]) + elapsed += profile(loader.finalize)() + return elapsed / repeat + + +def benchmark_evaluate( + n_datums: int, + n_labels: int, + time_limit: float | None, + repeat: int = 1, +): + """ + Benchmarks 'Evaluator.evaluate' for semantic segmentation. + + Parameters + ---------- + n_datums : int + The number of datums to generate. + n_labels : int + The number of unique labels to generate. + time_limit : float, optional + An optional time limit to constrain the benchmark. + repeat : int + The number of times to run the benchmark to produce a runtime average. + + Returns + ------- + float + The average runtime. + """ + + profile = create_runtime_profiler( + time_limit=time_limit, + repeat=repeat, + ) + + elapsed = 0 + for _ in range(repeat): + + data = [ + generate_segmentation( + datum_uid=str(i), + number_of_unique_labels=n_labels, + mask_height=5, + mask_width=5, + ) + for i in range(10) + ] + loader = DataLoader() + for datum_idx in range(n_datums): + segmentation = data[datum_idx % 10] + segmentation.uid = str(datum_idx) + loader.add_data([segmentation]) + evaluator = loader.finalize() + elapsed += profile(evaluator.evaluate)() + return elapsed / repeat diff --git a/lite/valor_lite/semantic_segmentation/manager.py b/lite/valor_lite/semantic_segmentation/manager.py index 8506b4e9b..50ddd283f 100644 --- a/lite/valor_lite/semantic_segmentation/manager.py +++ b/lite/valor_lite/semantic_segmentation/manager.py @@ -243,6 +243,10 @@ def evaluate( return self.compute_precision_recall_iou(filter_=filter_) +def defaultdict_int(): + return defaultdict(int) + + class DataLoader: """ Segmentation DataLoader. 
@@ -250,8 +254,8 @@ class DataLoader: def __init__(self): self._evaluator = Evaluator() - self.groundtruth_count = defaultdict(lambda: defaultdict(int)) - self.prediction_count = defaultdict(lambda: defaultdict(int)) + self.groundtruth_count = defaultdict(defaultdict_int) + self.prediction_count = defaultdict(defaultdict_int) self.matrices = list() self.pixel_count = list()
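For reference, the pieces introduced above compose as follows. This is a minimal sketch using only the APIs added in this diff (generate_segmentation, DataLoader, Benchmark, and the semantic segmentation benchmark helpers); the parameter values are illustrative, not recommendations.

from valor_lite.profiling import Benchmark
from valor_lite.semantic_segmentation import DataLoader, generate_segmentation
from valor_lite.semantic_segmentation.benchmark import benchmark_add_data

# Generate one synthetic datum with 3 labels on a 100x100 bitmask and run a
# regular evaluation over it.
segmentation = generate_segmentation(
    datum_uid="uid0",
    number_of_unique_labels=3,
    mask_height=100,
    mask_width=100,
)
loader = DataLoader()
loader.add_data([segmentation])
evaluator = loader.finalize()
metrics = evaluator.evaluate()

# Profile the ingestion stage over a small parameter sweep; 'run' returns the
# passing permutations, the failing permutations, and a summary dictionary.
b = Benchmark(
    time_limit=5.0,              # seconds per profiled call
    memory_limit=4 * (1024**3),  # bytes; enforced via RLIMIT_AS
    repeat=1,
    verbose=True,
)
passed, failed, details = b.run(
    benchmark=benchmark_add_data,
    n_labels=[10, 3],
    shape=[(1000, 1000), (100, 100)],
)

Note that setting memory_limit restricts the process address space via resource.setrlimit, so oversized permutations surface as MemoryError entries in the failed list rather than aborting the sweep.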