From c3bb2ddf37bef33b8ef8c919752cc933b100de71 Mon Sep 17 00:00:00 2001
From: Charles Zaloom
Date: Mon, 4 Nov 2024 15:47:27 -0600
Subject: [PATCH 01/29] wip - found semseg bug

---
 .../benchmark/__init__.py                     |  0
 .../benchmark/generate.py                     | 93 +++++++++++++++++++
 .../semantic_segmentation/computation.py      |  4 +-
 3 files changed, 95 insertions(+), 2 deletions(-)
 create mode 100644 lite/valor_lite/semantic_segmentation/benchmark/__init__.py
 create mode 100644 lite/valor_lite/semantic_segmentation/benchmark/generate.py

diff --git a/lite/valor_lite/semantic_segmentation/benchmark/__init__.py b/lite/valor_lite/semantic_segmentation/benchmark/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/lite/valor_lite/semantic_segmentation/benchmark/generate.py b/lite/valor_lite/semantic_segmentation/benchmark/generate.py
new file mode 100644
index 000000000..4cbc882b9
--- /dev/null
+++ b/lite/valor_lite/semantic_segmentation/benchmark/generate.py
@@ -0,0 +1,93 @@
+import numpy as np
+from numpy.typing import NDArray
+from valor_lite.semantic_segmentation import Bitmask, DataLoader, Segmentation
+
+
+def generate_segmentation(
+    uid: str,
+    height: int,
+    width: int,
+    labels: list[str],
+    proba: list[float],
+) -> Segmentation:
+    """
+    Generates a list of segmentation annotations.
+
+    Parameters
+    ----------
+    height : int
+        The height of the bitmask.
+    width : int
+        The width of the bitmask.
+    labels : list[str]
+        A list of labels.
+    proba : list[float]
+        A list of probabilities for each label that sum to 1.0. Should be given in increments of 0.01.
+    Returns
+    -------
+    Segmenation
+        A generated semantic segmenatation annotation.
+    """
+    if len(labels) != len(proba):
+        raise ValueError("Labels and probabilities should be the same length.")
+
+    probabilities = np.array(proba, dtype=np.float64)
+    if not np.isclose(probabilities.sum(), 1.0).all():
+        raise ValueError("Probabilities should sum to 1.0.")
+
+    weights = (probabilities * 100.0).astype(np.int32)
+
+    indices = np.random.choice(
+        np.arange(len(weights)), size=(height * 2, width), p=probabilities
+    )
+
+    N = len(labels)
+
+    masks = np.arange(N)[:, None, None] == indices
+
+    gts = []
+    pds = []
+    for lidx in range(N):
+        print(masks[lidx, :, :])
+        gts.append(
+            Bitmask(
+                mask=masks[lidx, :height, :],
+                label=labels[lidx],
+            )
+        )
+        pds.append(
+            Bitmask(
+                mask=masks[lidx, height:, :],
+                label=labels[lidx],
+            )
+        )
+
+    return Segmentation(
+        uid=uid,
+        groundtruths=gts,
+        predictions=pds,
+    )
+
+
+def generate_cache():
+    pass
+
+
+if __name__ == "__main__":
+
+    seg = generate_segmentation(
+        uid="uid",
+        height=2,
+        width=2,
+        labels=["a", "b", "c", "d"],
+        proba=[0.25, 0.25, 0.25, 0.25],
+    )
+
+    loader = DataLoader()
+    loader.add_data([seg])
+
+    print(loader.matrices)
+
+    evaluator = loader.finalize()
+
+    print(evaluator._confusion_matrices)
diff --git a/lite/valor_lite/semantic_segmentation/computation.py b/lite/valor_lite/semantic_segmentation/computation.py
index 6c52f80e1..556a0ef9b 100644
--- a/lite/valor_lite/semantic_segmentation/computation.py
+++ b/lite/valor_lite/semantic_segmentation/computation.py
@@ -46,8 +46,8 @@ def compute_intermediate_confusion_matrices(
         predictions.reshape(1, n_pd_labels, -1),
     ).sum(axis=2)

-    intersected_groundtruth_counts = intersection_counts.sum(axis=0)
-    intersected_prediction_counts = intersection_counts.sum(axis=1)
+    intersected_groundtruth_counts = intersection_counts.sum(axis=1)
+    intersected_prediction_counts = intersection_counts.sum(axis=0)

     confusion_matrix = np.zeros((n_labels + 1,
n_labels + 1), dtype=np.int32) confusion_matrix[0, 0] = background_counts From 9a00cfa8f8d556300e5d35b4594b8da224f45ba8 Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Mon, 4 Nov 2024 16:07:05 -0600 Subject: [PATCH 02/29] fixed bug --- .../test_confusion_matrix.py | 62 +++++++++++++++++++ .../semantic_segmentation/computation.py | 4 +- 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/lite/tests/semantic_segmentation/test_confusion_matrix.py b/lite/tests/semantic_segmentation/test_confusion_matrix.py index 4ec0710c5..2ad2afdd5 100644 --- a/lite/tests/semantic_segmentation/test_confusion_matrix.py +++ b/lite/tests/semantic_segmentation/test_confusion_matrix.py @@ -1,4 +1,6 @@ +import numpy as np from valor_lite.semantic_segmentation import ( + Bitmask, DataLoader, MetricType, Segmentation, @@ -89,3 +91,63 @@ def test_confusion_matrix_segmentations_from_boxes( assert m in expected_metrics for m in expected_metrics: assert m in actual_metrics + + +def test_confusion_matrix_intermediate_counting(): + + segmentation = Segmentation( + uid="uid1", + groundtruths=[ + Bitmask( + mask=np.array([[False, False], [True, False]]), + label="a", + ), + Bitmask( + mask=np.array([[False, False], [False, True]]), + label="b", + ), + Bitmask( + mask=np.array([[True, False], [False, False]]), + label="c", + ), + Bitmask( + mask=np.array([[False, True], [False, False]]), + label="d", + ), + ], + predictions=[ + Bitmask( + mask=np.array([[False, False], [False, False]]), + label="a", + ), + Bitmask( + mask=np.array([[False, False], [False, False]]), + label="b", + ), + Bitmask( + mask=np.array([[True, True], [True, True]]), + label="c", + ), + Bitmask( + mask=np.array([[False, False], [False, False]]), + label="d", + ), + ], + ) + + loader = DataLoader() + loader.add_data([segmentation]) + + assert len(loader.matrices) == 1 + assert ( + loader.matrices[0] + == np.array( + [ + [0, 0, 0, 0, 0], + [0, 0, 0, 1, 0], + [0, 0, 0, 1, 0], + [0, 0, 0, 1, 0], + [0, 0, 0, 1, 0], + ] + ) + ).all() diff --git a/lite/valor_lite/semantic_segmentation/computation.py b/lite/valor_lite/semantic_segmentation/computation.py index 6c52f80e1..556a0ef9b 100644 --- a/lite/valor_lite/semantic_segmentation/computation.py +++ b/lite/valor_lite/semantic_segmentation/computation.py @@ -46,8 +46,8 @@ def compute_intermediate_confusion_matrices( predictions.reshape(1, n_pd_labels, -1), ).sum(axis=2) - intersected_groundtruth_counts = intersection_counts.sum(axis=0) - intersected_prediction_counts = intersection_counts.sum(axis=1) + intersected_groundtruth_counts = intersection_counts.sum(axis=1) + intersected_prediction_counts = intersection_counts.sum(axis=0) confusion_matrix = np.zeros((n_labels + 1, n_labels + 1), dtype=np.int32) confusion_matrix[0, 0] = background_counts From 24af843934877ba57eb5904f8ed810c4224416b7 Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Mon, 4 Nov 2024 16:18:03 -0600 Subject: [PATCH 03/29] add background as option to generate --- .../semantic_segmentation/benchmark/generate.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/lite/valor_lite/semantic_segmentation/benchmark/generate.py b/lite/valor_lite/semantic_segmentation/benchmark/generate.py index 4cbc882b9..9f6587b9e 100644 --- a/lite/valor_lite/semantic_segmentation/benchmark/generate.py +++ b/lite/valor_lite/semantic_segmentation/benchmark/generate.py @@ -1,5 +1,4 @@ import numpy as np -from numpy.typing import NDArray from valor_lite.semantic_segmentation import Bitmask, DataLoader, Segmentation @@ 
-7,7 +6,7 @@ def generate_segmentation( uid: str, height: int, width: int, - labels: list[str], + labels: list[str | None], proba: list[float], ) -> Segmentation: """ @@ -19,8 +18,8 @@ def generate_segmentation( The height of the bitmask. width : int The width of the bitmask. - labels : list[str] - A list of labels. + labels : list[str | None] + A list of labels with None representing background. proba : list[float] A list of probabilities for each label that sum to 1.0. Should be given in increments of 0.01. Returns @@ -48,17 +47,19 @@ def generate_segmentation( gts = [] pds = [] for lidx in range(N): - print(masks[lidx, :, :]) + label = labels[lidx] + if label is None: + continue gts.append( Bitmask( mask=masks[lidx, :height, :], - label=labels[lidx], + label=label, ) ) pds.append( Bitmask( mask=masks[lidx, height:, :], - label=labels[lidx], + label=label, ) ) @@ -79,7 +80,7 @@ def generate_cache(): uid="uid", height=2, width=2, - labels=["a", "b", "c", "d"], + labels=["a", "b", "c", None], proba=[0.25, 0.25, 0.25, 0.25], ) From 21101ec81b6ecf0a52eb79bc5d055d85fce897e2 Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Thu, 7 Nov 2024 17:03:29 -0600 Subject: [PATCH 04/29] adding synthetic benchmarks --- lite/valor_lite/object_detection/benchmark.py | 284 ++++++++++++++++ lite/valor_lite/object_detection/manager.py | 8 +- lite/valor_lite/profiling.py | 319 ++++++++++++++++++ .../semantic_segmentation/benchmark.py | 202 +++++++++++ .../benchmark/__init__.py | 0 .../benchmark/generate.py | 94 ------ .../semantic_segmentation/manager.py | 8 +- 7 files changed, 817 insertions(+), 98 deletions(-) create mode 100644 lite/valor_lite/object_detection/benchmark.py create mode 100644 lite/valor_lite/profiling.py create mode 100644 lite/valor_lite/semantic_segmentation/benchmark.py delete mode 100644 lite/valor_lite/semantic_segmentation/benchmark/__init__.py delete mode 100644 lite/valor_lite/semantic_segmentation/benchmark/generate.py diff --git a/lite/valor_lite/object_detection/benchmark.py b/lite/valor_lite/object_detection/benchmark.py new file mode 100644 index 000000000..8ba8d4971 --- /dev/null +++ b/lite/valor_lite/object_detection/benchmark.py @@ -0,0 +1,284 @@ +import math +import random + +import numpy as np +from valor_lite.object_detection import ( + Bitmask, + BoundingBox, + DataLoader, + Detection, + Polygon, +) +from valor_lite.profiling import Benchmark, create_runtime_profiler + + +def generate_random_bbox( + n_labels: int, + is_prediction: bool, +) -> BoundingBox: + + scale = random.uniform(25, 100) + offset_x = random.uniform(0, 10000) + offset_y = random.uniform(0, 10000) + + side_length = random.uniform(0.1, 0.5) + + xmax = max(1 - side_length, 0) + ymax = max(1 - side_length, 0) + x = random.uniform(0, xmax) + y = random.uniform(0, ymax) + + xmin0 = x * scale + offset_x + xmax0 = (x + side_length) * scale + offset_x + ymin0 = y * scale + offset_y + ymax0 = (y + side_length) * scale + offset_y + + if n_labels > 1: + if not is_prediction: + gt_label = str(random.randint(0, n_labels - 1)) + return BoundingBox( + xmin=xmin0, + xmax=xmax0, + ymin=ymin0, + ymax=ymax0, + labels=[gt_label], + ) + else: + labels = [str(i) for i in range(n_labels)] + common_proba = 0.4 / (n_labels - 1) + scores = [0.5] + [common_proba for _ in range(n_labels - 1)] + return BoundingBox( + xmin=xmin0, + xmax=xmax0, + ymin=ymin0, + ymax=ymax0, + labels=labels, + scores=scores, + ) + elif n_labels == 1: + if not is_prediction: + return BoundingBox( + xmin=xmin0, + xmax=xmax0, + ymin=ymin0, + ymax=ymax0, + 
labels=["0"], + ) + else: + pd_score = random.uniform(0.1, 0.9) + return BoundingBox( + xmin=xmin0, + xmax=xmax0, + ymin=ymin0, + ymax=ymax0, + labels=["0"], + scores=[pd_score], + ) + else: + raise ValueError + + +def generate_random_bbox_pair( + n_labels: int, +) -> tuple[BoundingBox, BoundingBox]: + + scale = random.uniform(25, 100) + offset_x = random.uniform(0, 10000) + offset_y = random.uniform(0, 10000) + + iou = random.uniform(0.1, 0.9) + side_length = random.uniform(0.1, 0.5) + intersection_area = (2 * iou * side_length * side_length) / (1 + iou) + delta = side_length - math.sqrt(intersection_area) + + xmax = max(1 - side_length - delta, 0) + ymax = max(1 - side_length - delta, 0) + x = random.uniform(0, xmax) + y = random.uniform(0, ymax) + + xmin0 = x * scale + offset_x + xmax0 = (x + side_length) * scale + offset_x + ymin0 = y * scale + offset_y + ymax0 = (y + side_length) * scale + offset_y + + xmin1 = (x + delta) * scale + offset_x + xmax1 = (x + delta + side_length) * scale + offset_x + ymin1 = (y + delta) * scale + offset_y + ymax1 = (y + delta + side_length) * scale + offset_y + + if n_labels > 1: + common_proba = 0.4 / (n_labels - 1) + labels = [str(i) for i in range(n_labels)] + scores = [0.5] + [common_proba for _ in range(n_labels - 1)] + gt_label = str(random.randint(0, n_labels - 1)) + gt = BoundingBox( + xmin=xmin0, + xmax=xmax0, + ymin=ymin0, + ymax=ymax0, + labels=[gt_label], + ) + pd = BoundingBox( + xmin=xmin1, + xmax=xmax1, + ymin=ymin1, + ymax=ymax1, + labels=labels, + scores=scores, + ) + elif n_labels == 1: + gt_label = str(random.randint(0, 1)) + pd_score = random.uniform(0.1, 0.9) + gt = BoundingBox( + xmin=xmin0, + xmax=xmax0, + ymin=ymin0, + ymax=ymax0, + labels=[gt_label], + ) + pd = BoundingBox( + xmin=xmin1, + xmax=xmax1, + ymin=ymin1, + ymax=ymax1, + labels=["0"], + scores=[pd_score], + ) + else: + raise ValueError + + return (gt, pd) + + +def benchmark_add_bounding_boxes( + n_labels: int, + n_boxes_per_datum: tuple[int, int], + time_limit: float | None, + repeat: int = 1, +): + + profile = create_runtime_profiler( + time_limit=time_limit, + repeat=repeat, + ) + + n_matched, n_unmatched = n_boxes_per_datum + + elapsed = 0 + for _ in range(repeat): + + gts = [] + pds = [] + for _ in range(n_matched): + gt, pd = generate_random_bbox_pair(n_labels) + gts.append(gt) + pds.append(pd) + for _ in range(n_unmatched): + gt = generate_random_bbox(n_labels, is_prediction=False) + gts.append(gt) + pd = generate_random_bbox(n_labels, is_prediction=True) + pds.append(pd) + + detection = Detection( + uid="uid", + groundtruths=gts, + predictions=pds, + ) + loader = DataLoader() + elapsed += profile(loader.add_bounding_boxes)([detection]) + return elapsed / repeat + + +def benchmark_finalize( + n_datums: int, + n_labels: int, + n_boxes_per_datum: tuple[int, int], + time_limit: float | None, + repeat: int = 1, +): + + profile = create_runtime_profiler( + time_limit=time_limit, + repeat=repeat, + ) + + n_matched, n_unmatched = n_boxes_per_datum + + gts = [] + pds = [] + for _ in range(n_matched): + gt, pd = generate_random_bbox_pair(n_labels) + gts.append(gt) + pds.append(pd) + for _ in range(n_unmatched): + gt = generate_random_bbox(n_labels, is_prediction=False) + gts.append(gt) + pd = generate_random_bbox(n_labels, is_prediction=True) + pds.append(pd) + + elapsed = 0 + for _ in range(repeat): + loader = DataLoader() + for i in range(n_datums): + detection = Detection( + uid=f"uid{i}", + groundtruths=gts, + predictions=pds, + ) + 
loader.add_bounding_boxes([detection]) + elapsed += profile(loader.finalize)() + return elapsed / repeat + + +if __name__ == "__main__": + + n_datums = [ + 100, + 10, + 1, + ] + + n_labels = [ + # 1000, + 100, + 10, + 1, + ] + + n_boxes_per_datum = [ + (100, 1), + (10, 10), + (1, 100), + ] + + b = Benchmark( + time_limit=10.0, + memory_limit=8 * (1024**3), + repeat=1, + verbose=True, + ) + + # b.run( + # benchmark=benchmark_add_bounding_boxes, + # n_labels=n_labels, + # n_boxes_per_datum=n_boxes_per_datum, + # ) + + b.run( + benchmark=benchmark_finalize, + n_datums=n_datums, + n_labels=n_labels, + n_boxes_per_datum=n_boxes_per_datum, + ) + + # b.run( + # benchmark=benchmark_finalize, + # n_datums=n_datums, + # n_labels=n_labels, + # ) + + # b.run( + # benchmark=benchmark_evaluate, + # n_datums=n_datums, + # n_labels=n_labels, + # ) diff --git a/lite/valor_lite/object_detection/manager.py b/lite/valor_lite/object_detection/manager.py index f2b0b54b6..0e00b97a7 100644 --- a/lite/valor_lite/object_detection/manager.py +++ b/lite/valor_lite/object_detection/manager.py @@ -341,6 +341,10 @@ def evaluate( return metrics +def defaultdict_int(): + return defaultdict(int) + + class DataLoader: """ Object Detection DataLoader @@ -349,8 +353,8 @@ class DataLoader: def __init__(self): self._evaluator = Evaluator() self.pairs: list[NDArray[np.float64]] = list() - self.groundtruth_count = defaultdict(lambda: defaultdict(int)) - self.prediction_count = defaultdict(lambda: defaultdict(int)) + self.groundtruth_count = defaultdict(defaultdict_int) + self.prediction_count = defaultdict(defaultdict_int) def _add_datum(self, uid: str) -> int: """ diff --git a/lite/valor_lite/profiling.py b/lite/valor_lite/profiling.py new file mode 100644 index 000000000..9e4990161 --- /dev/null +++ b/lite/valor_lite/profiling.py @@ -0,0 +1,319 @@ +import json +import math +import multiprocessing as mp +import resource +import sys +import time +from collections import deque +from multiprocessing import Queue + + +def _timeit_subprocess(*args, __fn, __queue: Queue, **kwargs): + try: + timer_start = time.perf_counter() + __fn(*args, **kwargs) + timer_end = time.perf_counter() + __queue.put(timer_end - timer_start) + except Exception as e: + __queue.put(e) + + +def create_runtime_profiler( + time_limit: float | None, + repeat: int = 1, +): + """ + This profiles the runtime of the wrapped function in a subprocess. + """ + ctx = mp.get_context("spawn") + + def decorator(fn): + def wrapper(*args, **kwargs): + # Record average runtime over repeated runs. + elapsed = 0 + for _ in range(repeat): + q = ctx.Queue() + p = ctx.Process( + target=_timeit_subprocess, + args=args, + kwargs={"__fn": fn, "__queue": q, **kwargs}, + ) + p.start() + p.join(timeout=time_limit) + + # Check if computation finishes within the timeout + if p.is_alive(): + p.terminate() + p.join() + q.close() + q.join_thread() + raise TimeoutError( + f"Function '{fn.__name__}' did not complete within {time_limit} seconds." 
+ ) + + # Retrieve the result + result = q.get(timeout=1) + if isinstance(result, Exception): + raise result + elif isinstance(result, float): + elapsed += result + else: + raise TypeError(type(result).__name__) + + return elapsed / repeat + + return wrapper + + return decorator + + +def calculate_complexity(params: list[int | tuple[int]]) -> int: + flattened_params = [ + math.prod(p) if isinstance(p, tuple) else p for p in params + ] + return math.prod(flattened_params) + + +def pretty_print_results(results: tuple): + valid, invalid, permutations = results + + print( + "=====================================================================" + ) + print("Details") + print(json.dumps(permutations, indent=4)) + + print() + print("Passed") + if len(valid) > 0: + keys = ["complexity", "runtime", *valid[0]["details"].keys()] + header = " | ".join(f"{header:^15}" for header in keys) + print(header) + print("-" * len(header)) + for entry in valid: + values = [ + entry["complexity"], + round(entry["runtime"], 4), + *entry["details"].values(), + ] + row = " | ".join(f"{str(value):^15}" for value in values) + print(row) + + print() + print("Failed") + if len(invalid) > 0: + keys = ["error", *invalid[0]["details"].keys(), "msg"] + header = " | ".join(f"{header:^15}" for header in keys) + print(header) + print("-" * len(header)) + for entry in invalid: + values = [ + entry["error"], + *entry["details"].values(), + entry["msg"], + ] + row = " | ".join(f"{str(value):^15}" for value in values) + print(row) + + +class Benchmark: + def __init__( + self, + time_limit: float | None, + memory_limit: int | None, + *_, + repeat: int | None = 1, + verbose: bool = False, + ): + self.time_limit = time_limit + self.memory_limit = memory_limit + self.repeat = repeat + self.verbose = verbose + + # printing + self.line_count = 0 + + def get_limits( + self, + *_, + readable: bool = True, + memory_unit: str = "GB", + time_unit: str = "seconds", + ) -> dict[str, str | int | float | None]: + + memory_value = self.memory_limit + if readable and memory_value is not None: + match memory_unit: + case "TB": + memory_value /= 1024**4 + case "GB": + memory_value /= 1024**3 + case "MB": + memory_value /= 1024**2 + case "KB": + memory_value /= 1024 + case "B": + pass + case _: + valid_set = {"TB", "GB", "MB", "KB", "B"} + raise ValueError( + f"Expected memory unit to be in the set {valid_set}, received '{memory_unit}'." + ) + memory_value = f"{memory_value} {memory_unit}" + + time_value = self.time_limit + if readable and time_value is not None: + match time_unit: + case "minutes": + time_value /= 60 + case "seconds": + pass + case "milliseconds": + time_value *= 1000 + case _: + valid_set = {"minutes", "seconds", "milliseconds"} + raise ValueError( + f"Expected time unit to be in the set {valid_set}, received '{time_unit}'." + ) + time_value = f"{time_value} {time_unit}" + + return { + "memory_limit": memory_value, + "time_limit": time_value, + "repeat": self.repeat, + } + + @property + def memory_limit(self) -> int | None: + return self._memory_limit + + @memory_limit.setter + def memory_limit(self, limit: int | None): + """ + Stores the memory limit and restricts resources. 
+ """ + self._memory_limit = limit + if limit is not None: + _, hard = resource.getrlimit(resource.RLIMIT_AS) + resource.setrlimit(resource.RLIMIT_AS, (limit, hard)) + + def clear_status(self): + if not self.verbose: + return + for _ in range(self.line_count): + sys.stdout.write("\033[F") + sys.stdout.write("\033[K") + self.line_count = 0 + + def write_status(self, text: str): + if not self.verbose: + return + self.clear_status() + self.line_count = text.count("\n") + 1 + sys.stdout.write(text + "\n") + sys.stdout.flush() + + def run( + self, + benchmark, + **kwargs, + ): + nvars = len(kwargs) + keys = tuple(kwargs.keys()) + vars = tuple(kwargs[key] for key in keys) + + initial_indices = tuple(0 for _ in range(nvars)) + max_indices = tuple(len(v) for v in vars) + permutations = math.prod(max_indices) + + # Initialize queue with the starting index (0, ...) + queue = deque() + queue.append(initial_indices) + + # Keep track of explored combinations to avoid duplicates + explored = set() + explored.add(initial_indices) + + # Store valid combinations that finish within the time limit + valid_combinations = [] + invalid_combinations = [] + + while queue: + + current_indices = queue.popleft() + parameters = { + k: v[current_indices[idx]] + for idx, (k, v) in enumerate(zip(keys, vars)) + } + + details: dict = {k: str(v) for k, v in parameters.items()} + + # update terminal with status + self.write_status( + f"Running '{benchmark.__name__}'\n" + + json.dumps( + { + **details, + **self.get_limits( + readable=True, + memory_unit="GB", + time_unit="seconds", + ), + }, + indent=4, + ) + ) + + try: + runtime = benchmark( + time_limit=self.time_limit, + repeat=self.repeat, + **parameters, + ) + valid_combinations.append( + { + "complexity": calculate_complexity( + tuple(parameters.values()) + ), + "runtime": runtime, + "details": details, + } + ) + continue + except Exception as e: + invalid_combinations.append( + { + "error": type(e).__name__, + "msg": str(e), + "details": details, + } + ) + + for idx in range(nvars): + new_indices = list(current_indices) + if new_indices[idx] + 1 < max_indices[idx]: + new_indices[idx] += 1 + new_indices_tuple = tuple(new_indices) + if new_indices_tuple not in explored: + queue.append(new_indices_tuple) + explored.add(new_indices_tuple) + + valid_combinations.sort(key=lambda x: -x["complexity"]) + + # clear terminal and display results + self.clear_status() + results = ( + valid_combinations, + invalid_combinations, + { + "benchmark": benchmark.__name__, + "limits": self.get_limits(readable=True), + "passed": permutations - len(invalid_combinations), + "failed": len(invalid_combinations), + "total": permutations, + }, + ) + if self.verbose: + pretty_print_results(results) + + return results diff --git a/lite/valor_lite/semantic_segmentation/benchmark.py b/lite/valor_lite/semantic_segmentation/benchmark.py new file mode 100644 index 000000000..bb1934acf --- /dev/null +++ b/lite/valor_lite/semantic_segmentation/benchmark.py @@ -0,0 +1,202 @@ +import numpy as np +from valor_lite.profiling import Benchmark, create_runtime_profiler +from valor_lite.semantic_segmentation import Bitmask, DataLoader, Segmentation + + +def generate_segmentation( + uid: str, + n_labels: int, + height: int, + width: int, +) -> Segmentation: + """ + Generates a list of segmentation annotations. + + Parameters + ---------- + uid : str + The datum UID for the generated segmentation. + + Returns + ------- + Segmenation + A generated semantic segmenatation annotation. 
+ """ + + if n_labels > 1: + common_proba = 0.4 / (n_labels - 1) + min_proba = min(common_proba, 0.1) + labels = [str(i) for i in range(n_labels)] + [None] + proba = [0.5] + [common_proba for _ in range(n_labels - 1)] + [0.1] + elif n_labels == 1: + labels = ["0", None] + proba = [0.9, 0.1] + min_proba = 0.1 + else: + labels = [None] + proba = [1.0] + min_proba = 1.0 + + probabilities = np.array(proba, dtype=np.float64) + weights = (probabilities / min_proba).astype(np.int32) + + indices = np.random.choice( + np.arange(len(weights)), size=(height * 2, width), p=probabilities + ) + + N = len(labels) + + masks = np.arange(N)[:, None, None] == indices + + gts = [] + pds = [] + for lidx in range(N): + label = labels[lidx] + if label is None: + continue + gts.append( + Bitmask( + mask=masks[lidx, :height, :], + label=label, + ) + ) + pds.append( + Bitmask( + mask=masks[lidx, height:, :], + label=label, + ) + ) + + return Segmentation( + uid=uid, + groundtruths=gts, + predictions=pds, + ) + + +def benchmark_add_data( + n_labels: int, + shape: tuple[int, int], + time_limit: float | None, + repeat: int = 1, +): + + profile = create_runtime_profiler( + time_limit=time_limit, + repeat=repeat, + ) + + elapsed = 0 + for _ in range(repeat): + data = generate_segmentation( + uid="uid", + n_labels=n_labels, + height=shape[0], + width=shape[1], + ) + loader = DataLoader() + elapsed += profile(loader.add_data)([data]) + return elapsed / repeat + + +def benchmark_finalize( + n_datums: int, + n_labels: int, + time_limit: float | None, + repeat: int = 1, +): + + profile = create_runtime_profiler( + time_limit=time_limit, + repeat=repeat, + ) + + elapsed = 0 + for _ in range(repeat): + loader = DataLoader() + for datum_idx in range(n_datums): + data = generate_segmentation( + uid=str(datum_idx), + n_labels=n_labels, + height=100, + width=100, + ) + loader.add_data([data]) + elapsed += profile(loader.finalize)() + return elapsed / repeat + + +def benchmark_evaluate( + n_datums: int, + n_labels: int, + time_limit: float | None, + repeat: int = 1, +): + + profile = create_runtime_profiler( + time_limit=time_limit, + repeat=repeat, + ) + + elapsed = 0 + for _ in range(repeat): + loader = DataLoader() + for datum_idx in range(n_datums): + data = generate_segmentation( + uid=str(datum_idx), + n_labels=n_labels, + height=100, + width=100, + ) + loader.add_data([data]) + evaluator = loader.finalize() + elapsed += profile(evaluator.evaluate)() + return elapsed / repeat + + +if __name__ == "__main__": + + n_datums = [ + 100, + 10, + 1, + ] + + n_labels = [ + 1000, + 100, + 10, + 1, + ] + + shapes = [ + (10000, 10000), + (2500, 2500), + (1000, 1000), + (100, 100), + ] + + b = Benchmark( + time_limit=10.0, + memory_limit=8 * (1024**3), + repeat=1, + verbose=True, + ) + + b.run( + benchmark=benchmark_add_data, + n_labels=n_labels, + shape=shapes, + ) + + b.run( + benchmark=benchmark_finalize, + n_datums=n_datums, + n_labels=n_labels, + ) + + b.run( + benchmark=benchmark_evaluate, + n_datums=n_datums, + n_labels=n_labels, + ) diff --git a/lite/valor_lite/semantic_segmentation/benchmark/__init__.py b/lite/valor_lite/semantic_segmentation/benchmark/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/lite/valor_lite/semantic_segmentation/benchmark/generate.py b/lite/valor_lite/semantic_segmentation/benchmark/generate.py deleted file mode 100644 index 9f6587b9e..000000000 --- a/lite/valor_lite/semantic_segmentation/benchmark/generate.py +++ /dev/null @@ -1,94 +0,0 @@ -import numpy as np -from 
valor_lite.semantic_segmentation import Bitmask, DataLoader, Segmentation - - -def generate_segmentation( - uid: str, - height: int, - width: int, - labels: list[str | None], - proba: list[float], -) -> Segmentation: - """ - Generates a list of segmentation annotations. - - Parameters - ---------- - height : int - The height of the bitmask. - width : int - The width of the bitmask. - labels : list[str | None] - A list of labels with None representing background. - proba : list[float] - A list of probabilities for each label that sum to 1.0. Should be given in increments of 0.01. - Returns - ------- - Segmenation - A generated semantic segmenatation annotation. - """ - if len(labels) != len(proba): - raise ValueError("Labels and probabilities should be the same length.") - - probabilities = np.array(proba, dtype=np.float64) - if not np.isclose(probabilities.sum(), 1.0).all(): - raise ValueError("Probabilities should sum to 1.0.") - - weights = (probabilities * 100.0).astype(np.int32) - - indices = np.random.choice( - np.arange(len(weights)), size=(height * 2, width), p=probabilities - ) - - N = len(labels) - - masks = np.arange(N)[:, None, None] == indices - - gts = [] - pds = [] - for lidx in range(N): - label = labels[lidx] - if label is None: - continue - gts.append( - Bitmask( - mask=masks[lidx, :height, :], - label=label, - ) - ) - pds.append( - Bitmask( - mask=masks[lidx, height:, :], - label=label, - ) - ) - - return Segmentation( - uid=uid, - groundtruths=gts, - predictions=pds, - ) - - -def generate_cache(): - pass - - -if __name__ == "__main__": - - seg = generate_segmentation( - uid="uid", - height=2, - width=2, - labels=["a", "b", "c", None], - proba=[0.25, 0.25, 0.25, 0.25], - ) - - loader = DataLoader() - loader.add_data([seg]) - - print(loader.matrices) - - evaluator = loader.finalize() - - print(evaluator._confusion_matrices) diff --git a/lite/valor_lite/semantic_segmentation/manager.py b/lite/valor_lite/semantic_segmentation/manager.py index 8506b4e9b..50ddd283f 100644 --- a/lite/valor_lite/semantic_segmentation/manager.py +++ b/lite/valor_lite/semantic_segmentation/manager.py @@ -243,6 +243,10 @@ def evaluate( return self.compute_precision_recall_iou(filter_=filter_) +def defaultdict_int(): + return defaultdict(int) + + class DataLoader: """ Segmentation DataLoader. @@ -250,8 +254,8 @@ class DataLoader: def __init__(self): self._evaluator = Evaluator() - self.groundtruth_count = defaultdict(lambda: defaultdict(int)) - self.prediction_count = defaultdict(lambda: defaultdict(int)) + self.groundtruth_count = defaultdict(defaultdict_int) + self.prediction_count = defaultdict(defaultdict_int) self.matrices = list() self.pixel_count = list() From 609dfaf22e75a19cb61b45fbac71966c9a31b28d Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Fri, 8 Nov 2024 11:37:51 -0600 Subject: [PATCH 05/29] found bug in obj det iou computation --- lite/valor_lite/object_detection/benchmark.py | 148 +++++++++++++----- lite/valor_lite/object_detection/manager.py | 107 ++++++------- 2 files changed, 156 insertions(+), 99 deletions(-) diff --git a/lite/valor_lite/object_detection/benchmark.py b/lite/valor_lite/object_detection/benchmark.py index 8ba8d4971..d4cb2e603 100644 --- a/lite/valor_lite/object_detection/benchmark.py +++ b/lite/valor_lite/object_detection/benchmark.py @@ -150,6 +150,78 @@ def generate_random_bbox_pair( return (gt, pd) +def generate_cache( + n_datums: int, + n_labels: int, + n_boxes_per_datum: tuple[int, int], +) -> DataLoader: + """ + This skips the IOU computation. 
+ + Not ideal since we are dealing directly with internals. + """ + + gts = [] + pds = [] + n_matched, n_unmatched = n_boxes_per_datum + for _ in range(n_matched): + gt, pd = generate_random_bbox_pair(n_labels) + gts.append(gt) + pds.append(pd) + for _ in range(n_unmatched): + gt = generate_random_bbox(n_labels, is_prediction=False) + pd = generate_random_bbox(n_labels, is_prediction=True) + gts.append(gt) + pds.append(pd) + + detection = Detection( + uid="0", + groundtruths=gts, + predictions=pds, + ) + + loader = DataLoader() + loader.add_bounding_boxes([detection]) + + # loader cache duplication + assert len(loader.pairs) == 1 + + # duplicate all iou pairs + master_pair = loader.pairs[0] + duplicated_pairs = list() + for i in range(n_datums): + duplicate_pair = master_pair.copy() + duplicate_pair[:, 0] = i + duplicated_pairs.append(duplicate_pair) + loader.pairs = duplicated_pairs + + loader.groundtruth_count = { + label_idx: { + datum_idx: count * n_datums for datum_idx, count in values.items() + } + for label_idx, values in loader.groundtruth_count.items() + } + loader.prediction_count = { + label_idx: { + datum_idx: count * n_datums for datum_idx, count in values.items() + } + for label_idx, values in loader.prediction_count.items() + } + + # evaluator cache duplication + assert loader._evaluator.n_datums == 1 + loader._evaluator.n_datums = n_datums + loader._evaluator.n_groundtruths = n_matched + n_unmatched + loader._evaluator.n_predictions = n_matched + n_unmatched + loader._evaluator.n_labels = n_labels + loader._evaluator.uid_to_index = {str(i): i for i in range(n_datums)} + loader._evaluator.index_to_uid = {i: str(i) for i in range(n_datums)} + loader._evaluator.label_to_index = {str(i): i for i in range(n_labels)} + loader._evaluator.index_to_label = {i: str(i) for i in range(n_labels)} + + return loader + + def benchmark_add_bounding_boxes( n_labels: int, n_boxes_per_datum: tuple[int, int], @@ -162,11 +234,9 @@ def benchmark_add_bounding_boxes( repeat=repeat, ) - n_matched, n_unmatched = n_boxes_per_datum - elapsed = 0 + n_matched, n_unmatched = n_boxes_per_datum for _ in range(repeat): - gts = [] pds = [] for _ in range(n_matched): @@ -202,30 +272,13 @@ def benchmark_finalize( repeat=repeat, ) - n_matched, n_unmatched = n_boxes_per_datum - - gts = [] - pds = [] - for _ in range(n_matched): - gt, pd = generate_random_bbox_pair(n_labels) - gts.append(gt) - pds.append(pd) - for _ in range(n_unmatched): - gt = generate_random_bbox(n_labels, is_prediction=False) - gts.append(gt) - pd = generate_random_bbox(n_labels, is_prediction=True) - pds.append(pd) - elapsed = 0 for _ in range(repeat): - loader = DataLoader() - for i in range(n_datums): - detection = Detection( - uid=f"uid{i}", - groundtruths=gts, - predictions=pds, - ) - loader.add_bounding_boxes([detection]) + loader = generate_cache( + n_datums=n_datums, + n_labels=n_labels, + n_boxes_per_datum=n_boxes_per_datum, + ) elapsed += profile(loader.finalize)() return elapsed / repeat @@ -233,22 +286,27 @@ def benchmark_finalize( if __name__ == "__main__": n_datums = [ + 1000000, + 100000, + 10000, + 1000, 100, 10, - 1, ] n_labels = [ - # 1000, + 1000, 100, - 10, - 1, + 20, + 5, ] n_boxes_per_datum = [ - (100, 1), - (10, 10), - (1, 100), + (1000, 1), + (100, 10), + (10, 2), + (10, 100), + (1, 1000), ] b = Benchmark( @@ -258,11 +316,11 @@ def benchmark_finalize( verbose=True, ) - # b.run( - # benchmark=benchmark_add_bounding_boxes, - # n_labels=n_labels, - # n_boxes_per_datum=n_boxes_per_datum, - # ) + b.run( + 
benchmark=benchmark_add_bounding_boxes, + n_labels=n_labels, + n_boxes_per_datum=n_boxes_per_datum, + ) b.run( benchmark=benchmark_finalize, @@ -271,14 +329,18 @@ def benchmark_finalize( n_boxes_per_datum=n_boxes_per_datum, ) - # b.run( - # benchmark=benchmark_finalize, - # n_datums=n_datums, - # n_labels=n_labels, - # ) - # b.run( # benchmark=benchmark_evaluate, # n_datums=n_datums, # n_labels=n_labels, # ) + + loader = generate_cache( + n_datums=1, + n_labels=10, + n_boxes_per_datum=(2, 0), + ) + evaluator = loader.finalize() + + for pair in evaluator._detailed_pairs: + print(pair.tolist()) diff --git a/lite/valor_lite/object_detection/manager.py b/lite/valor_lite/object_detection/manager.py index 0e00b97a7..350c1ac9a 100644 --- a/lite/valor_lite/object_detection/manager.py +++ b/lite/valor_lite/object_detection/manager.py @@ -3,15 +3,9 @@ from typing import Type import numpy as np -import valor_lite.object_detection.annotation as annotation from numpy.typing import NDArray from tqdm import tqdm -from valor_lite.object_detection.annotation import ( - Bitmask, - BoundingBox, - Detection, - Polygon, -) +from valor_lite.object_detection.annotation import Detection from valor_lite.object_detection.computation import ( compute_bbox_iou, compute_bitmask_iou, @@ -400,12 +394,12 @@ def _add_label(self, label: str) -> int: return self._evaluator.label_to_index[label] - def _compute_ious_and_cache_pairs( + def _cache_pairs( self, uid_index: int, groundtruths: list, predictions: list, - annotation_type: Type[BoundingBox] | Type[Polygon] | Type[Bitmask], + ious: NDArray[np.float64], ) -> None: """ Compute IOUs between groundtruths and preditions before storing as pairs. @@ -422,34 +416,10 @@ def _compute_ious_and_cache_pairs( The type of annotation to compute IOUs for. """ - pairs = list() - n_predictions = len(predictions) - n_groundtruths = len(groundtruths) - - all_pairs = np.array( - [ - np.array([gann, pann]) - for _, _, _, pann in predictions - for _, _, gann in groundtruths - ] - ) - - match annotation_type: - case annotation.BoundingBox: - ious = compute_bbox_iou(all_pairs) - case annotation.Polygon: - ious = compute_polygon_iou(all_pairs) - case annotation.Bitmask: - ious = compute_bitmask_iou(all_pairs) - case _: - raise ValueError( - f"Invalid annotation type `{annotation_type}`." - ) - - ious = ious.reshape(n_predictions, n_groundtruths) predictions_with_iou_of_zero = np.where((ious < 1e-9).all(axis=1))[0] groundtruths_with_iou_of_zero = np.where((ious < 1e-9).all(axis=0))[0] + pairs = list() pairs.extend( [ np.array( @@ -463,8 +433,8 @@ def _compute_ious_and_cache_pairs( float(score), ] ) - for pidx, plabel, score, _ in predictions - for gidx, glabel, _ in groundtruths + for pidx, plabel, score in predictions + for gidx, glabel in groundtruths if ious[pidx, gidx] >= 1e-9 ] ) @@ -500,13 +470,12 @@ def _compute_ious_and_cache_pairs( for index in groundtruths_with_iou_of_zero ] ) - self.pairs.append(np.array(pairs)) def _add_data( self, detections: list[Detection], - annotation_type: type[Bitmask] | type[BoundingBox] | type[Polygon], + detection_ious: list[NDArray[np.float64]], show_progress: bool = False, ): """ @@ -522,7 +491,9 @@ def _add_data( Toggle for tqdm progress bar. 
""" disable_tqdm = not show_progress - for detection in tqdm(detections, disable=disable_tqdm): + for detection, ious in tqdm( + zip(detections, detection_ious), disable=disable_tqdm + ): # update metadata self._evaluator.n_datums += 1 @@ -545,11 +516,6 @@ def _add_data( predictions = list() for gidx, gann in enumerate(detection.groundtruths): - if not isinstance(gann, annotation_type): - raise ValueError( - f"Expected {annotation_type}, but annotation is of type {type(gann)}." - ) - self._evaluator.groundtruth_examples[uid_index][ gidx ] = gann.extrema @@ -560,16 +526,10 @@ def _add_data( ( gidx, label_idx, - gann.annotation, ) ) for pidx, pann in enumerate(detection.predictions): - if not isinstance(pann, annotation_type): - raise ValueError( - f"Expected {annotation_type}, but annotation is of type {type(pann)}." - ) - self._evaluator.prediction_examples[uid_index][ pidx ] = pann.extrema @@ -581,15 +541,14 @@ def _add_data( pidx, label_idx, pscore, - pann.annotation, ) ) - self._compute_ious_and_cache_pairs( + self._cache_pairs( uid_index=uid_index, groundtruths=groundtruths, predictions=predictions, - annotation_type=annotation_type, + ious=ious, ) def add_bounding_boxes( @@ -607,10 +566,22 @@ def add_bounding_boxes( show_progress : bool, default=False Toggle for tqdm progress bar. """ + ious = [ + compute_bbox_iou( + np.array( + [ + [gt.extrema, pd.extrema] + for pd in detection.predictions + for gt in detection.groundtruths + ] + ) + ).reshape(len(detection.predictions), len(detection.groundtruths)) + for detection in detections + ] return self._add_data( detections=detections, + detection_ious=ious, show_progress=show_progress, - annotation_type=BoundingBox, ) def add_polygons( @@ -628,10 +599,22 @@ def add_polygons( show_progress : bool, default=False Toggle for tqdm progress bar. """ + ious = [ + compute_polygon_iou( + np.array( + [ + [gt.annotation, pd.annotation] + for pd in detection.predictions + for gt in detection.groundtruths + ] + ) + ).reshape(len(detection.predictions), len(detection.groundtruths)) + for detection in detections + ] return self._add_data( detections=detections, + detection_ious=ious, show_progress=show_progress, - annotation_type=Polygon, ) def add_bitmasks( @@ -649,10 +632,22 @@ def add_bitmasks( show_progress : bool, default=False Toggle for tqdm progress bar. 
""" + ious = [ + compute_bitmask_iou( + np.array( + [ + [gt.annotation, pd.annotation] + for pd in detection.predictions + for gt in detection.groundtruths + ] + ) + ).reshape(len(detection.predictions), len(detection.groundtruths)) + for detection in detections + ] return self._add_data( detections=detections, + detection_ious=ious, show_progress=show_progress, - annotation_type=Bitmask, ) def finalize(self) -> Evaluator: From 0f5602eee01602111274738141adcd911d55a8aa Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Fri, 8 Nov 2024 12:00:22 -0600 Subject: [PATCH 06/29] fixed iou computation in object detection --- .../tests/object_detection/test_dataloader.py | 59 ++++++++- .../valor_lite/object_detection/annotation.py | 24 ---- lite/valor_lite/object_detection/manager.py | 122 +++++++++--------- 3 files changed, 113 insertions(+), 92 deletions(-) diff --git a/lite/tests/object_detection/test_dataloader.py b/lite/tests/object_detection/test_dataloader.py index 101579717..7178cab8c 100644 --- a/lite/tests/object_detection/test_dataloader.py +++ b/lite/tests/object_detection/test_dataloader.py @@ -16,6 +16,49 @@ def test_no_data(): loader.finalize() +def test_iou_computation(): + + detection = Detection( + uid="uid", + groundtruths=[ + BoundingBox(xmin=0, xmax=10, ymin=0, ymax=10, labels=["0"]), + BoundingBox(xmin=100, xmax=110, ymin=100, ymax=110, labels=["0"]), + BoundingBox( + xmin=1000, xmax=1100, ymin=1000, ymax=1100, labels=["0"] + ), + ], + predictions=[ + BoundingBox( + xmin=1, + xmax=11, + ymin=1, + ymax=11, + labels=["0", "1", "2"], + scores=[0.5, 0.25, 0.25], + ), + BoundingBox( + xmin=105, + xmax=116, + ymin=105, + ymax=116, + labels=["0", "1", "2"], + scores=[0.5, 0.25, 0.25], + ), + ], + ) + + loader = DataLoader() + loader.add_bounding_boxes([detection]) + + assert len(loader.pairs) == 1 + + # show that three unique IOUs exist + unique_ious = np.unique(loader.pairs[0][:, 3]) + assert np.isclose( + unique_ious, np.array([0.0, 0.12755102, 0.68067227]) + ).all() + + def test_mixed_annotations( rect1: tuple[float, float, float, float], rect1_rotated_5_degrees_around_origin: tuple[float, float, float, float], @@ -87,7 +130,15 @@ def test_mixed_annotations( loader = DataLoader() - for input_ in mixed_detections: - with pytest.raises(ValueError) as e: - loader.add_bounding_boxes([input_]) - assert "but annotation is of type" in str(e) + for detection in mixed_detections: + + # anything can be converted to a bbox + loader.add_bounding_boxes([detection]) + + with pytest.raises(AttributeError) as e: + loader.add_polygons([detection]) + assert "no attribute 'shape'" in str(e) + + with pytest.raises(AttributeError) as e: + loader.add_bitmasks([detection]) + assert "no attribute 'mask'" in str(e) diff --git a/lite/valor_lite/object_detection/annotation.py b/lite/valor_lite/object_detection/annotation.py index 7eb2be28d..5b94b9062 100644 --- a/lite/valor_lite/object_detection/annotation.py +++ b/lite/valor_lite/object_detection/annotation.py @@ -142,18 +142,6 @@ def extrema(self) -> tuple[float, float, float, float]: xmin, ymin, xmax, ymax = self.shape.bounds return (xmin, xmax, ymin, ymax) - @property - def annotation(self) -> ShapelyPolygon: - """ - Returns the annotation's data representation. - - Returns - ------- - shapely.geometry.Polygon - The polygon shape. 
- """ - return self.shape - @dataclass class Bitmask: @@ -222,18 +210,6 @@ def extrema(self) -> tuple[float, float, float, float]: rows, cols = np.nonzero(self.mask) return (cols.min(), cols.max(), rows.min(), rows.max()) - @property - def annotation(self) -> NDArray[np.bool_]: - """ - Returns the annotation's data representation. - - Returns - ------- - NDArray[np.bool_] - The binary mask array. - """ - return self.mask - @dataclass class Detection: diff --git a/lite/valor_lite/object_detection/manager.py b/lite/valor_lite/object_detection/manager.py index f2b0b54b6..c9ca5cb4e 100644 --- a/lite/valor_lite/object_detection/manager.py +++ b/lite/valor_lite/object_detection/manager.py @@ -1,17 +1,10 @@ from collections import defaultdict from dataclasses import dataclass -from typing import Type import numpy as np -import valor_lite.object_detection.annotation as annotation from numpy.typing import NDArray from tqdm import tqdm -from valor_lite.object_detection.annotation import ( - Bitmask, - BoundingBox, - Detection, - Polygon, -) +from valor_lite.object_detection.annotation import Detection from valor_lite.object_detection.computation import ( compute_bbox_iou, compute_bitmask_iou, @@ -396,56 +389,32 @@ def _add_label(self, label: str) -> int: return self._evaluator.label_to_index[label] - def _compute_ious_and_cache_pairs( + def _cache_pairs( self, uid_index: int, groundtruths: list, predictions: list, - annotation_type: Type[BoundingBox] | Type[Polygon] | Type[Bitmask], + ious: NDArray[np.float64], ) -> None: """ Compute IOUs between groundtruths and preditions before storing as pairs. Parameters ---------- - uid_index: int + uid_index : int The index of the detection. - groundtruths: list + groundtruths : list A list of groundtruths. - predictions: list + predictions : list A list of predictions. - annotation_type: type[BoundingBox] | type[Polygon] | type[Bitmask] - The type of annotation to compute IOUs for. + ious : NDArray[np.float64] + An array with shape (n_preds, n_gts) containing IOUs. """ - pairs = list() - n_predictions = len(predictions) - n_groundtruths = len(groundtruths) - - all_pairs = np.array( - [ - np.array([gann, pann]) - for _, _, _, pann in predictions - for _, _, gann in groundtruths - ] - ) - - match annotation_type: - case annotation.BoundingBox: - ious = compute_bbox_iou(all_pairs) - case annotation.Polygon: - ious = compute_polygon_iou(all_pairs) - case annotation.Bitmask: - ious = compute_bitmask_iou(all_pairs) - case _: - raise ValueError( - f"Invalid annotation type `{annotation_type}`." - ) - - ious = ious.reshape(n_predictions, n_groundtruths) predictions_with_iou_of_zero = np.where((ious < 1e-9).all(axis=1))[0] groundtruths_with_iou_of_zero = np.where((ious < 1e-9).all(axis=0))[0] + pairs = list() pairs.extend( [ np.array( @@ -459,8 +428,8 @@ def _compute_ious_and_cache_pairs( float(score), ] ) - for pidx, plabel, score, _ in predictions - for gidx, glabel, _ in groundtruths + for pidx, plabel, score in predictions + for gidx, glabel in groundtruths if ious[pidx, gidx] >= 1e-9 ] ) @@ -496,13 +465,12 @@ def _compute_ious_and_cache_pairs( for index in groundtruths_with_iou_of_zero ] ) - self.pairs.append(np.array(pairs)) def _add_data( self, detections: list[Detection], - annotation_type: type[Bitmask] | type[BoundingBox] | type[Polygon], + detection_ious: list[NDArray[np.float64]], show_progress: bool = False, ): """ @@ -512,13 +480,15 @@ def _add_data( ---------- detections : list[Detection] A list of Detection objects. 
- annotation_type : type[Bitmask] | type[BoundingBox] | type[Polygon] - The annotation type to process. + detection_ious : list[NDArray[np.float64]] + A list of arrays containing IOUs per detection. show_progress : bool, default=False Toggle for tqdm progress bar. """ disable_tqdm = not show_progress - for detection in tqdm(detections, disable=disable_tqdm): + for detection, ious in tqdm( + zip(detections, detection_ious), disable=disable_tqdm + ): # update metadata self._evaluator.n_datums += 1 @@ -541,11 +511,6 @@ def _add_data( predictions = list() for gidx, gann in enumerate(detection.groundtruths): - if not isinstance(gann, annotation_type): - raise ValueError( - f"Expected {annotation_type}, but annotation is of type {type(gann)}." - ) - self._evaluator.groundtruth_examples[uid_index][ gidx ] = gann.extrema @@ -556,16 +521,10 @@ def _add_data( ( gidx, label_idx, - gann.annotation, ) ) for pidx, pann in enumerate(detection.predictions): - if not isinstance(pann, annotation_type): - raise ValueError( - f"Expected {annotation_type}, but annotation is of type {type(pann)}." - ) - self._evaluator.prediction_examples[uid_index][ pidx ] = pann.extrema @@ -577,15 +536,14 @@ def _add_data( pidx, label_idx, pscore, - pann.annotation, ) ) - self._compute_ious_and_cache_pairs( + self._cache_pairs( uid_index=uid_index, groundtruths=groundtruths, predictions=predictions, - annotation_type=annotation_type, + ious=ious, ) def add_bounding_boxes( @@ -603,10 +561,22 @@ def add_bounding_boxes( show_progress : bool, default=False Toggle for tqdm progress bar. """ + ious = [ + compute_bbox_iou( + np.array( + [ + [gt.extrema, pd.extrema] + for pd in detection.predictions + for gt in detection.groundtruths + ] + ) + ).reshape(len(detection.predictions), len(detection.groundtruths)) + for detection in detections + ] return self._add_data( detections=detections, + detection_ious=ious, show_progress=show_progress, - annotation_type=BoundingBox, ) def add_polygons( @@ -624,10 +594,22 @@ def add_polygons( show_progress : bool, default=False Toggle for tqdm progress bar. """ + ious = [ + compute_polygon_iou( + np.array( + [ + [gt.shape, pd.shape] # type: ignore - using the AttributeError as a validator + for pd in detection.predictions + for gt in detection.groundtruths + ] + ) + ).reshape(len(detection.predictions), len(detection.groundtruths)) + for detection in detections + ] return self._add_data( detections=detections, + detection_ious=ious, show_progress=show_progress, - annotation_type=Polygon, ) def add_bitmasks( @@ -645,10 +627,22 @@ def add_bitmasks( show_progress : bool, default=False Toggle for tqdm progress bar. 
""" + ious = [ + compute_bitmask_iou( + np.array( + [ + [gt.mask, pd.mask] # type: ignore - using the AttributeError as a validator + for pd in detection.predictions + for gt in detection.groundtruths + ] + ) + ).reshape(len(detection.predictions), len(detection.groundtruths)) + for detection in detections + ] return self._add_data( detections=detections, + detection_ious=ious, show_progress=show_progress, - annotation_type=Bitmask, ) def finalize(self) -> Evaluator: From 3f4ddb0c0d9e033ccc9867f706463202005abdc5 Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Fri, 8 Nov 2024 16:11:06 -0600 Subject: [PATCH 07/29] wip --- lite/valor_lite/object_detection/benchmark.py | 99 ++++++++++++++----- lite/valor_lite/profiling.py | 10 +- 2 files changed, 80 insertions(+), 29 deletions(-) diff --git a/lite/valor_lite/object_detection/benchmark.py b/lite/valor_lite/object_detection/benchmark.py index d4cb2e603..c32c10218 100644 --- a/lite/valor_lite/object_detection/benchmark.py +++ b/lite/valor_lite/object_detection/benchmark.py @@ -153,7 +153,7 @@ def generate_random_bbox_pair( def generate_cache( n_datums: int, n_labels: int, - n_boxes_per_datum: tuple[int, int], + n_annotations_per_datum: tuple[int, int], ) -> DataLoader: """ This skips the IOU computation. @@ -163,7 +163,7 @@ def generate_cache( gts = [] pds = [] - n_matched, n_unmatched = n_boxes_per_datum + n_matched, n_unmatched = n_annotations_per_datum for _ in range(n_matched): gt, pd = generate_random_bbox_pair(n_labels) gts.append(gt) @@ -224,7 +224,7 @@ def generate_cache( def benchmark_add_bounding_boxes( n_labels: int, - n_boxes_per_datum: tuple[int, int], + n_annotations_per_datum: tuple[int, int], time_limit: float | None, repeat: int = 1, ): @@ -235,7 +235,7 @@ def benchmark_add_bounding_boxes( ) elapsed = 0 - n_matched, n_unmatched = n_boxes_per_datum + n_matched, n_unmatched = n_annotations_per_datum for _ in range(repeat): gts = [] pds = [] @@ -262,7 +262,7 @@ def benchmark_add_bounding_boxes( def benchmark_finalize( n_datums: int, n_labels: int, - n_boxes_per_datum: tuple[int, int], + n_annotations_per_datum: tuple[int, int], time_limit: float | None, repeat: int = 1, ): @@ -277,12 +277,65 @@ def benchmark_finalize( loader = generate_cache( n_datums=n_datums, n_labels=n_labels, - n_boxes_per_datum=n_boxes_per_datum, + n_annotations_per_datum=n_annotations_per_datum, ) elapsed += profile(loader.finalize)() return elapsed / repeat +def benchmark_compute_precision_recall( + n_datums: int, + n_labels: int, + n_annotations_per_datum: tuple[int, int], + time_limit: float | None, + repeat: int = 1, +): + + profile = create_runtime_profiler( + time_limit=time_limit, + repeat=repeat, + ) + + elapsed = 0 + for _ in range(repeat): + loader = generate_cache( + n_datums=n_datums, + n_labels=n_labels, + n_annotations_per_datum=n_annotations_per_datum, + ) + evaluator = loader.finalize() + elapsed += profile(evaluator.compute_precision_recall)() + return elapsed / repeat + + +def benchmark_compute_confusion_matrix( + n_datums: int, + n_labels: int, + n_annotations_per_datum: tuple[int, int], + n_examples: int, + time_limit: float | None, + repeat: int = 1, +): + + profile = create_runtime_profiler( + time_limit=time_limit, + repeat=repeat, + ) + + elapsed = 0 + for _ in range(repeat): + loader = generate_cache( + n_datums=n_datums, + n_labels=n_labels, + n_annotations_per_datum=n_annotations_per_datum, + ) + evaluator = loader.finalize() + elapsed += profile(evaluator.compute_confusion_matrix)( + number_of_examples=n_examples + ) + return 
elapsed / repeat + + if __name__ == "__main__": n_datums = [ @@ -301,12 +354,17 @@ def benchmark_finalize( 5, ] - n_boxes_per_datum = [ + n_annotations_per_datum = [ (1000, 1), (100, 10), (10, 2), - (10, 100), - (1, 1000), + ] + + n_examples = [ + 10, + 5, + 1, + 0, ] b = Benchmark( @@ -319,28 +377,19 @@ def benchmark_finalize( b.run( benchmark=benchmark_add_bounding_boxes, n_labels=n_labels, - n_boxes_per_datum=n_boxes_per_datum, + n_annotations_per_datum=n_annotations_per_datum, ) b.run( benchmark=benchmark_finalize, n_datums=n_datums, n_labels=n_labels, - n_boxes_per_datum=n_boxes_per_datum, + n_annotations_per_datum=n_annotations_per_datum, ) - # b.run( - # benchmark=benchmark_evaluate, - # n_datums=n_datums, - # n_labels=n_labels, - # ) - - loader = generate_cache( - n_datums=1, - n_labels=10, - n_boxes_per_datum=(2, 0), + b.run( + benchmark=benchmark_compute_precision_recall, + n_datums=n_datums, + n_labels=n_labels, + n_annotations_per_datum=n_annotations_per_datum, ) - evaluator = loader.finalize() - - for pair in evaluator._detailed_pairs: - print(pair.tolist()) diff --git a/lite/valor_lite/profiling.py b/lite/valor_lite/profiling.py index 9e4990161..484d9937f 100644 --- a/lite/valor_lite/profiling.py +++ b/lite/valor_lite/profiling.py @@ -102,12 +102,13 @@ def pretty_print_results(results: tuple): print() print("Failed") if len(invalid) > 0: - keys = ["error", *invalid[0]["details"].keys(), "msg"] + keys = ["complexity", "error", *invalid[0]["details"].keys(), "msg"] header = " | ".join(f"{header:^15}" for header in keys) print(header) print("-" * len(header)) for entry in invalid: values = [ + entry["complexity"], entry["error"], *entry["details"].values(), entry["msg"], @@ -245,6 +246,7 @@ def run( k: v[current_indices[idx]] for idx, (k, v) in enumerate(zip(keys, vars)) } + complexity = calculate_complexity(tuple(parameters.values())) details: dict = {k: str(v) for k, v in parameters.items()} @@ -272,9 +274,7 @@ def run( ) valid_combinations.append( { - "complexity": calculate_complexity( - tuple(parameters.values()) - ), + "complexity": complexity, "runtime": runtime, "details": details, } @@ -283,6 +283,7 @@ def run( except Exception as e: invalid_combinations.append( { + "complexity": complexity, "error": type(e).__name__, "msg": str(e), "details": details, @@ -299,6 +300,7 @@ def run( explored.add(new_indices_tuple) valid_combinations.sort(key=lambda x: -x["complexity"]) + invalid_combinations.sort(key=lambda x: -x["complexity"]) # clear terminal and display results self.clear_status() From 15bd4f3c0774020bfbb48417325ec6ff4f2bca6b Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Fri, 8 Nov 2024 16:12:22 -0600 Subject: [PATCH 08/29] suggested change --- lite/valor_lite/object_detection/manager.py | 35 ++++++++++----------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/lite/valor_lite/object_detection/manager.py b/lite/valor_lite/object_detection/manager.py index c9ca5cb4e..bd7663107 100644 --- a/lite/valor_lite/object_detection/manager.py +++ b/lite/valor_lite/object_detection/manager.py @@ -414,25 +414,22 @@ def _cache_pairs( predictions_with_iou_of_zero = np.where((ious < 1e-9).all(axis=1))[0] groundtruths_with_iou_of_zero = np.where((ious < 1e-9).all(axis=0))[0] - pairs = list() - pairs.extend( - [ - np.array( - [ - float(uid_index), - float(gidx), - float(pidx), - ious[pidx, gidx], - float(glabel), - float(plabel), - float(score), - ] - ) - for pidx, plabel, score in predictions - for gidx, glabel in groundtruths - if ious[pidx, gidx] >= 1e-9 - ] - ) 
+ pairs = [ + np.array( + [ + float(uid_index), + float(gidx), + float(pidx), + ious[pidx, gidx], + float(glabel), + float(plabel), + float(score), + ] + ) + for pidx, plabel, score in predictions + for gidx, glabel in groundtruths + if ious[pidx, gidx] >= 1e-9 + ] pairs.extend( [ np.array( From 4914664c4ccb5cd44d28e83afc3e0a0e355eaec6 Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Tue, 12 Nov 2024 10:37:41 -0600 Subject: [PATCH 09/29] added notebook --- lite/benchmarks/syn_semantic_segmentation.py | 13 + lite/examples/benchmarking.ipynb | 433 ++++++++++++++++++ lite/valor_lite/object_detection/benchmark.py | 4 +- lite/valor_lite/profiling.py | 51 +-- .../semantic_segmentation/benchmark.py | 99 ++-- 5 files changed, 530 insertions(+), 70 deletions(-) create mode 100644 lite/benchmarks/syn_semantic_segmentation.py create mode 100644 lite/examples/benchmarking.ipynb diff --git a/lite/benchmarks/syn_semantic_segmentation.py b/lite/benchmarks/syn_semantic_segmentation.py new file mode 100644 index 000000000..6629b344f --- /dev/null +++ b/lite/benchmarks/syn_semantic_segmentation.py @@ -0,0 +1,13 @@ +from valor_lite.semantic_segmentation.benchmark import benchmark + +if __name__ == "__main__": + + benchmark( + bitmask_shape=(10000, 10000), + number_of_images=10, + number_of_unique_labels=10, + memory_limit=4.0, + time_limit=10.0, + repeat=1, + verbose=False, + ) diff --git a/lite/examples/benchmarking.ipynb b/lite/examples/benchmarking.ipynb new file mode 100644 index 000000000..78a6319d4 --- /dev/null +++ b/lite/examples/benchmarking.ipynb @@ -0,0 +1,433 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from valor_lite.profiling import Benchmark\n", + "\n", + "b = Benchmark(\n", + " time_limit=5.0, # 5s\n", + " memory_limit=8 * (1024 ** 3), # 8 GB\n", + " repeat=1,\n", + " verbose=True,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Object Detection" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from valor_lite.object_detection.benchmark import (\n", + " benchmark_add_bounding_boxes as objdet_add_bboxes,\n", + " benchmark_finalize as objdet_finalize,\n", + " # benchmark_evaluate as objdet_evaluate,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "n_datums = [\n", + " 100,\n", + " 10,\n", + " 1,\n", + "]\n", + "\n", + "n_labels = [\n", + " 1000,\n", + " 100,\n", + " 10,\n", + " 3,\n", + "]\n", + "\n", + "n_annotations_per_datum = [\n", + " (100, 10), # 100 pairs, 10 w/ no overlap\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/4 [00:01 None: + super().__init__( + f"'{benchmark}' raised '{error_type}' with the following message: {error_message}" + ) + def _timeit_subprocess(*args, __fn, __queue: Queue, **kwargs): try: @@ -131,9 +142,6 @@ def __init__( self.repeat = repeat self.verbose = verbose - # printing - self.line_count = 0 - def get_limits( self, *_, @@ -198,22 +206,6 @@ def memory_limit(self, limit: int | None): _, hard = resource.getrlimit(resource.RLIMIT_AS) resource.setrlimit(resource.RLIMIT_AS, (limit, hard)) - def clear_status(self): - if not self.verbose: - return - for _ in range(self.line_count): - sys.stdout.write("\033[F") - sys.stdout.write("\033[K") - self.line_count = 0 - - 
def write_status(self, text: str): - if not self.verbose: - return - self.clear_status() - self.line_count = text.count("\n") + 1 - sys.stdout.write(text + "\n") - sys.stdout.flush() - def run( self, benchmark, @@ -239,6 +231,8 @@ def run( valid_combinations = [] invalid_combinations = [] + pbar = tqdm(total=math.prod(max_indices), disable=(not self.verbose)) + prev_count = 0 while queue: current_indices = queue.popleft() @@ -251,20 +245,9 @@ def run( details: dict = {k: str(v) for k, v in parameters.items()} # update terminal with status - self.write_status( - f"Running '{benchmark.__name__}'\n" - + json.dumps( - { - **details, - **self.get_limits( - readable=True, - memory_unit="GB", - time_unit="seconds", - ), - }, - indent=4, - ) - ) + count = len(valid_combinations) + len(invalid_combinations) + pbar.update(count - prev_count) + prev_count = count try: runtime = benchmark( @@ -303,7 +286,6 @@ def run( invalid_combinations.sort(key=lambda x: -x["complexity"]) # clear terminal and display results - self.clear_status() results = ( valid_combinations, invalid_combinations, @@ -315,6 +297,7 @@ def run( "total": permutations, }, ) + pbar.close() if self.verbose: pretty_print_results(results) diff --git a/lite/valor_lite/semantic_segmentation/benchmark.py b/lite/valor_lite/semantic_segmentation/benchmark.py index bb1934acf..a713fa675 100644 --- a/lite/valor_lite/semantic_segmentation/benchmark.py +++ b/lite/valor_lite/semantic_segmentation/benchmark.py @@ -1,5 +1,9 @@ import numpy as np -from valor_lite.profiling import Benchmark, create_runtime_profiler +from valor_lite.profiling import ( + Benchmark, + BenchmarkError, + create_runtime_profiler, +) from valor_lite.semantic_segmentation import Bitmask, DataLoader, Segmentation @@ -154,49 +158,76 @@ def benchmark_evaluate( return elapsed / repeat -if __name__ == "__main__": - - n_datums = [ - 100, - 10, - 1, - ] - - n_labels = [ - 1000, - 100, - 10, - 1, - ] +def benchmark( + bitmask_shape: tuple[int, int], + number_of_unique_labels: int, + number_of_images: int, + *_, + memory_limit: float = 4.0, + time_limit: float = 10.0, + repeat: int = 1, + verbose: bool = False, +): + """ + Runs a single benchmark. - shapes = [ - (10000, 10000), - (2500, 2500), - (1000, 1000), - (100, 100), - ] + Parameters + ---------- + bitmask_shape : tuple[int, int] + The size (h, w) of the bitmask array. + number_of_unique_labels : int + The number of unique labels used in the synthetic example. + number_of_images : int + The number of distinct datums that are created. + memory_limit : float + The maximum amount of system memory allowed in gigabytes (GB). + time_limit : float + The maximum amount of time permitted before killing the benchmark. + repeat : int + The number of times to run a benchmark to produce an average runtime. + verbose : bool, default=False + Toggles terminal output of benchmark results. 
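+
+    Example
+    -------
+    A minimal usage sketch; the argument values below are illustrative only.
+
+    >>> benchmark(
+    ...     bitmask_shape=(1000, 1000),
+    ...     number_of_unique_labels=10,
+    ...     number_of_images=10,
+    ...     memory_limit=4.0,
+    ...     time_limit=10.0,
+    ...     repeat=1,
+    ...     verbose=True,
+    ... )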
+ """ b = Benchmark( - time_limit=10.0, - memory_limit=8 * (1024**3), - repeat=1, - verbose=True, + time_limit=time_limit, + memory_limit=int(memory_limit * (1024**3)), + repeat=repeat, + verbose=verbose, ) - b.run( + _, failed, details = b.run( benchmark=benchmark_add_data, - n_labels=n_labels, - shape=shapes, + n_labels=[number_of_unique_labels], + shape=[bitmask_shape], ) + if failed: + raise BenchmarkError( + benchmark=details["benchmark"], + error_type=failed[0]["error"], + error_message=failed[0]["msg"], + ) - b.run( + _, failed, details = b.run( benchmark=benchmark_finalize, - n_datums=n_datums, - n_labels=n_labels, + n_datums=[number_of_images], + n_labels=[number_of_unique_labels], ) + if failed: + raise BenchmarkError( + benchmark=details["benchmark"], + error_type=failed[0]["error"], + error_message=failed[0]["msg"], + ) - b.run( + _, failed, details = b.run( benchmark=benchmark_evaluate, - n_datums=n_datums, - n_labels=n_labels, + n_datums=[number_of_images], + n_labels=[number_of_unique_labels], ) + if failed: + raise BenchmarkError( + benchmark=details["benchmark"], + error_type=failed[0]["error"], + error_message=failed[0]["msg"], + ) From 566fe297a3ad43bf8e13e1aadfbb64332654696d Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Tue, 12 Nov 2024 11:48:35 -0600 Subject: [PATCH 10/29] added new benchmark workflow --- .../workflows/lite-synthetic-benchmarks.yml | 28 ++ .../benchmark_semantic_segmentation.py | 13 + .../benchmark_that_fails.py} | 2 +- lite/valor_lite/object_detection/benchmark.py | 395 ------------------ 4 files changed, 42 insertions(+), 396 deletions(-) create mode 100644 .github/workflows/lite-synthetic-benchmarks.yml create mode 100644 lite/benchmarks/synthetic/benchmark_semantic_segmentation.py rename lite/benchmarks/{syn_semantic_segmentation.py => synthetic/benchmark_that_fails.py} (92%) delete mode 100644 lite/valor_lite/object_detection/benchmark.py diff --git a/.github/workflows/lite-synthetic-benchmarks.yml b/.github/workflows/lite-synthetic-benchmarks.yml new file mode 100644 index 000000000..d21a8d7e1 --- /dev/null +++ b/.github/workflows/lite-synthetic-benchmarks.yml @@ -0,0 +1,28 @@ +name: "[valor-lite] synthetic benchmarks" + +on: + push: + branches: "**" + +permissions: + id-token: write + contents: read + +jobs: + run-benchmarks: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: install lite + run: pip install -e . 
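+        # editable install from ./lite so the synthetic benchmark scripts import the local valor_lite package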
+ working-directory: ./lite + - name: benchmark semantic segmentation + run: python benchmark_semantic_segmentation.py + working-directory: ./lite/benchmarks/synthetic/ + - name: benchmark that fails + run: python benchmark_that_fails.py + working-directory: ./lite/benchmarks/synthetic/ + - run: make stop-env diff --git a/lite/benchmarks/synthetic/benchmark_semantic_segmentation.py b/lite/benchmarks/synthetic/benchmark_semantic_segmentation.py new file mode 100644 index 000000000..27a2d0313 --- /dev/null +++ b/lite/benchmarks/synthetic/benchmark_semantic_segmentation.py @@ -0,0 +1,13 @@ +from valor_lite.semantic_segmentation.benchmark import benchmark + +if __name__ == "__main__": + + benchmark( + bitmask_shape=(4000, 4000), + number_of_images=10, + number_of_unique_labels=10, + memory_limit=4.0, + time_limit=10.0, + repeat=1, + verbose=True, + ) diff --git a/lite/benchmarks/syn_semantic_segmentation.py b/lite/benchmarks/synthetic/benchmark_that_fails.py similarity index 92% rename from lite/benchmarks/syn_semantic_segmentation.py rename to lite/benchmarks/synthetic/benchmark_that_fails.py index 6629b344f..2aa7aa64f 100644 --- a/lite/benchmarks/syn_semantic_segmentation.py +++ b/lite/benchmarks/synthetic/benchmark_that_fails.py @@ -9,5 +9,5 @@ memory_limit=4.0, time_limit=10.0, repeat=1, - verbose=False, + verbose=True, ) diff --git a/lite/valor_lite/object_detection/benchmark.py b/lite/valor_lite/object_detection/benchmark.py deleted file mode 100644 index 288d51ba9..000000000 --- a/lite/valor_lite/object_detection/benchmark.py +++ /dev/null @@ -1,395 +0,0 @@ -import math -import random - -import numpy as np -from valor_lite.object_detection import ( - Bitmask, - BoundingBox, - DataLoader, - Detection, - Polygon, -) -from valor_lite.profiling import Benchmark, create_runtime_profiler - - -def generate_random_bbox( - n_labels: int, - is_prediction: bool, -) -> BoundingBox: - - scale = random.uniform(25, 100) - offset_x = random.uniform(0, 10000) - offset_y = random.uniform(0, 10000) - - side_length = random.uniform(0.1, 0.5) - - xmax = max(1 - side_length, 0) - ymax = max(1 - side_length, 0) - x = random.uniform(0, xmax) - y = random.uniform(0, ymax) - - xmin0 = x * scale + offset_x - xmax0 = (x + side_length) * scale + offset_x - ymin0 = y * scale + offset_y - ymax0 = (y + side_length) * scale + offset_y - - if n_labels > 1: - if not is_prediction: - gt_label = str(random.randint(0, n_labels - 1)) - return BoundingBox( - xmin=xmin0, - xmax=xmax0, - ymin=ymin0, - ymax=ymax0, - labels=[gt_label], - ) - else: - labels = [str(i) for i in range(n_labels)] - common_proba = 0.4 / (n_labels - 1) - scores = [0.5] + [common_proba for _ in range(n_labels - 1)] - return BoundingBox( - xmin=xmin0, - xmax=xmax0, - ymin=ymin0, - ymax=ymax0, - labels=labels, - scores=scores, - ) - elif n_labels == 1: - if not is_prediction: - return BoundingBox( - xmin=xmin0, - xmax=xmax0, - ymin=ymin0, - ymax=ymax0, - labels=["0"], - ) - else: - pd_score = random.uniform(0.1, 0.9) - return BoundingBox( - xmin=xmin0, - xmax=xmax0, - ymin=ymin0, - ymax=ymax0, - labels=["0"], - scores=[pd_score], - ) - else: - raise ValueError - - -def generate_random_bbox_pair( - n_labels: int, -) -> tuple[BoundingBox, BoundingBox]: - - scale = random.uniform(25, 100) - offset_x = random.uniform(0, 10000) - offset_y = random.uniform(0, 10000) - - iou = random.uniform(0.1, 0.9) - side_length = random.uniform(0.1, 0.5) - intersection_area = (2 * iou * side_length * side_length) / (1 + iou) - delta = side_length - 
math.sqrt(intersection_area) - - xmax = max(1 - side_length - delta, 0) - ymax = max(1 - side_length - delta, 0) - x = random.uniform(0, xmax) - y = random.uniform(0, ymax) - - xmin0 = x * scale + offset_x - xmax0 = (x + side_length) * scale + offset_x - ymin0 = y * scale + offset_y - ymax0 = (y + side_length) * scale + offset_y - - xmin1 = (x + delta) * scale + offset_x - xmax1 = (x + delta + side_length) * scale + offset_x - ymin1 = (y + delta) * scale + offset_y - ymax1 = (y + delta + side_length) * scale + offset_y - - if n_labels > 1: - common_proba = 0.4 / (n_labels - 1) - labels = [str(i) for i in range(n_labels)] - scores = [0.5] + [common_proba for _ in range(n_labels - 1)] - gt_label = str(random.randint(0, n_labels - 1)) - gt = BoundingBox( - xmin=xmin0, - xmax=xmax0, - ymin=ymin0, - ymax=ymax0, - labels=[gt_label], - ) - pd = BoundingBox( - xmin=xmin1, - xmax=xmax1, - ymin=ymin1, - ymax=ymax1, - labels=labels, - scores=scores, - ) - elif n_labels == 1: - gt_label = str(random.randint(0, 1)) - pd_score = random.uniform(0.1, 0.9) - gt = BoundingBox( - xmin=xmin0, - xmax=xmax0, - ymin=ymin0, - ymax=ymax0, - labels=[gt_label], - ) - pd = BoundingBox( - xmin=xmin1, - xmax=xmax1, - ymin=ymin1, - ymax=ymax1, - labels=["0"], - scores=[pd_score], - ) - else: - raise ValueError - - return (gt, pd) - - -def generate_cache( - n_datums: int, - n_labels: int, - n_annotations_per_datum: tuple[int, int], -) -> DataLoader: - """ - This skips the IOU computation. - - Not ideal since we are dealing directly with internals. - """ - - gts = [] - pds = [] - n_matched, n_unmatched = n_annotations_per_datum - for _ in range(n_matched): - gt, pd = generate_random_bbox_pair(n_labels) - gts.append(gt) - pds.append(pd) - for _ in range(n_unmatched): - gt = generate_random_bbox(n_labels, is_prediction=False) - pd = generate_random_bbox(n_labels, is_prediction=True) - gts.append(gt) - pds.append(pd) - - detection = Detection( - uid="0", - groundtruths=gts, - predictions=pds, - ) - - loader = DataLoader() - loader.add_bounding_boxes([detection]) - - # loader cache duplication - assert len(loader.pairs) == 1 - - # duplicate all iou pairs - master_pair = loader.pairs[0] - duplicated_pairs = list() - for i in range(n_datums): - duplicate_pair = master_pair.copy() - duplicate_pair[:, 0] = i - duplicated_pairs.append(duplicate_pair) - loader.pairs = duplicated_pairs - - loader.groundtruth_count = { - label_idx: { - datum_idx: count * n_datums for datum_idx, count in values.items() - } - for label_idx, values in loader.groundtruth_count.items() - } - loader.prediction_count = { - label_idx: { - datum_idx: count * n_datums for datum_idx, count in values.items() - } - for label_idx, values in loader.prediction_count.items() - } - - # evaluator cache duplication - assert loader._evaluator.n_datums == 1 - loader._evaluator.n_datums = n_datums - loader._evaluator.n_groundtruths = n_matched + n_unmatched - loader._evaluator.n_predictions = n_matched + n_unmatched - loader._evaluator.n_labels = n_labels - loader._evaluator.uid_to_index = {str(i): i for i in range(n_datums)} - loader._evaluator.index_to_uid = {i: str(i) for i in range(n_datums)} - loader._evaluator.label_to_index = {str(i): i for i in range(n_labels)} - loader._evaluator.index_to_label = {i: str(i) for i in range(n_labels)} - - return loader - - -def benchmark_add_bounding_boxes( - n_labels: int, - n_annotations_per_datum: tuple[int, int], - time_limit: float | None, - repeat: int = 1, -): - - profile = create_runtime_profiler( - time_limit=time_limit, - 
repeat=repeat, - ) - - elapsed = 0 - n_matched, n_unmatched = n_annotations_per_datum - for _ in range(repeat): - gts = [] - pds = [] - for _ in range(n_matched): - gt, pd = generate_random_bbox_pair(n_labels) - gts.append(gt) - pds.append(pd) - for _ in range(n_unmatched): - gt = generate_random_bbox(n_labels, is_prediction=False) - gts.append(gt) - pd = generate_random_bbox(n_labels, is_prediction=True) - pds.append(pd) - - detection = Detection( - uid="uid", - groundtruths=gts, - predictions=pds, - ) - loader = DataLoader() - elapsed += profile(loader.add_bounding_boxes)([detection]) - return elapsed / repeat - - -def benchmark_finalize( - n_datums: int, - n_labels: int, - n_annotations_per_datum: tuple[int, int], - time_limit: float | None, - repeat: int = 1, -): - - profile = create_runtime_profiler( - time_limit=time_limit, - repeat=repeat, - ) - - elapsed = 0 - for _ in range(repeat): - loader = generate_cache( - n_datums=n_datums, - n_labels=n_labels, - n_annotations_per_datum=n_annotations_per_datum, - ) - elapsed += profile(loader.finalize)() - return elapsed / repeat - - -def benchmark_compute_precision_recall( - n_datums: int, - n_labels: int, - n_annotations_per_datum: tuple[int, int], - time_limit: float | None, - repeat: int = 1, -): - - profile = create_runtime_profiler( - time_limit=time_limit, - repeat=repeat, - ) - - elapsed = 0 - for _ in range(repeat): - loader = generate_cache( - n_datums=n_datums, - n_labels=n_labels, - n_annotations_per_datum=n_annotations_per_datum, - ) - evaluator = loader.finalize() - elapsed += profile(evaluator.compute_precision_recall)() - return elapsed / repeat - - -def benchmark_compute_confusion_matrix( - n_datums: int, - n_labels: int, - n_annotations_per_datum: tuple[int, int], - n_examples: int, - time_limit: float | None, - repeat: int = 1, -): - - profile = create_runtime_profiler( - time_limit=time_limit, - repeat=repeat, - ) - - elapsed = 0 - for _ in range(repeat): - loader = generate_cache( - n_datums=n_datums, - n_labels=n_labels, - n_annotations_per_datum=n_annotations_per_datum, - ) - evaluator = loader.finalize() - elapsed += profile(evaluator.compute_confusion_matrix)( - number_of_examples=n_examples - ) - return elapsed / repeat - - -if __name__ == "__main__": - - n_datums = [ - 1000000, - 100000, - 10000, - 1000, - 100, - 10, - ] - - n_labels = [ - 1000, - 100, - 20, - 5, - ] - - n_annotations_per_datum = [ - (1000, 1), - (100, 10), - (10, 2), - ] - - n_examples = [ - 10, - 5, - 1, - 0, - ] - - b = Benchmark( - time_limit=5.0, - memory_limit=4 * (1024**3), - repeat=1, - verbose=True, - ) - - b.run( - benchmark=benchmark_add_bounding_boxes, - n_labels=n_labels, - n_annotations_per_datum=n_annotations_per_datum, - ) - - b.run( - benchmark=benchmark_finalize, - n_datums=n_datums, - n_labels=n_labels, - n_annotations_per_datum=n_annotations_per_datum, - ) - - b.run( - benchmark=benchmark_compute_precision_recall, - n_datums=n_datums, - n_labels=n_labels, - n_annotations_per_datum=n_annotations_per_datum, - ) From e47600810ca35fa5c6875029821249872e4f3454 Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Tue, 12 Nov 2024 11:51:12 -0600 Subject: [PATCH 11/29] remove intentional fail --- .github/workflows/lite-benchmark-evaluations.yml | 2 +- .github/workflows/lite-synthetic-benchmarks.yml | 3 --- .github/workflows/lite-tests-and-coverage.yml | 2 +- lite/benchmarks/synthetic/benchmark_that_fails.py | 13 ------------- 4 files changed, 2 insertions(+), 18 deletions(-) delete mode 100644 
lite/benchmarks/synthetic/benchmark_that_fails.py diff --git a/.github/workflows/lite-benchmark-evaluations.yml b/.github/workflows/lite-benchmark-evaluations.yml index 8dead7503..8afe7ed21 100644 --- a/.github/workflows/lite-benchmark-evaluations.yml +++ b/.github/workflows/lite-benchmark-evaluations.yml @@ -1,4 +1,4 @@ -name: Run valor-lite benchmarks +name: "[valor-lite] benchmarks" on: push: diff --git a/.github/workflows/lite-synthetic-benchmarks.yml b/.github/workflows/lite-synthetic-benchmarks.yml index d21a8d7e1..0b18af2ee 100644 --- a/.github/workflows/lite-synthetic-benchmarks.yml +++ b/.github/workflows/lite-synthetic-benchmarks.yml @@ -22,7 +22,4 @@ jobs: - name: benchmark semantic segmentation run: python benchmark_semantic_segmentation.py working-directory: ./lite/benchmarks/synthetic/ - - name: benchmark that fails - run: python benchmark_that_fails.py - working-directory: ./lite/benchmarks/synthetic/ - run: make stop-env diff --git a/.github/workflows/lite-tests-and-coverage.yml b/.github/workflows/lite-tests-and-coverage.yml index 5bac96f9a..5628a2959 100644 --- a/.github/workflows/lite-tests-and-coverage.yml +++ b/.github/workflows/lite-tests-and-coverage.yml @@ -1,4 +1,4 @@ -name: Run valor-lite code coverage report +name: "[valor-lite] code coverage report" on: push: diff --git a/lite/benchmarks/synthetic/benchmark_that_fails.py b/lite/benchmarks/synthetic/benchmark_that_fails.py deleted file mode 100644 index 2aa7aa64f..000000000 --- a/lite/benchmarks/synthetic/benchmark_that_fails.py +++ /dev/null @@ -1,13 +0,0 @@ -from valor_lite.semantic_segmentation.benchmark import benchmark - -if __name__ == "__main__": - - benchmark( - bitmask_shape=(10000, 10000), - number_of_images=10, - number_of_unique_labels=10, - memory_limit=4.0, - time_limit=10.0, - repeat=1, - verbose=True, - ) From 03343915efc02661da3faf3d9193078b15906dbc Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Tue, 12 Nov 2024 11:54:08 -0600 Subject: [PATCH 12/29] renamed api-client tests to valor service tests --- .github/workflows/client-api-tests-and-coverage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/client-api-tests-and-coverage.yml b/.github/workflows/client-api-tests-and-coverage.yml index cc49ad4ab..a5504a279 100644 --- a/.github/workflows/client-api-tests-and-coverage.yml +++ b/.github/workflows/client-api-tests-and-coverage.yml @@ -1,4 +1,4 @@ -name: Run API + client code coverage report +name: "[valor-service] code coverage report" on: push: From 22785fc1495c944a4f92e9af6324cac0b9013697 Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Tue, 12 Nov 2024 11:54:32 -0600 Subject: [PATCH 13/29] renamed api-client tests to valor service tests --- .github/workflows/client-api-benchmark-evaluations.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/client-api-benchmark-evaluations.yml b/.github/workflows/client-api-benchmark-evaluations.yml index 078a66237..a8314303f 100644 --- a/.github/workflows/client-api-benchmark-evaluations.yml +++ b/.github/workflows/client-api-benchmark-evaluations.yml @@ -1,4 +1,4 @@ -name: Run API + client benchmarks +name: "[valor-service] benchmarks" on: push: From af47d9c8335edd584c06e026d73bf74828561685 Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Tue, 12 Nov 2024 13:30:33 -0600 Subject: [PATCH 14/29] precommit --- .../benchmark_semantic_segmentation.py | 83 +++++++++- lite/valor_lite/profiling.py | 70 ++++++-- .../semantic_segmentation/benchmark.py | 150 ++++++++---------- 3 files 
changed, 208 insertions(+), 95 deletions(-) diff --git a/lite/benchmarks/synthetic/benchmark_semantic_segmentation.py b/lite/benchmarks/synthetic/benchmark_semantic_segmentation.py index 27a2d0313..997cd3a9a 100644 --- a/lite/benchmarks/synthetic/benchmark_semantic_segmentation.py +++ b/lite/benchmarks/synthetic/benchmark_semantic_segmentation.py @@ -1,4 +1,85 @@ -from valor_lite.semantic_segmentation.benchmark import benchmark +from valor_lite.profiling import Benchmark, BenchmarkError +from valor_lite.semantic_segmentation.benchmark import ( + benchmark_add_data, + benchmark_evaluate, + benchmark_finalize, +) + + +def benchmark( + bitmask_shape: tuple[int, int], + number_of_unique_labels: int, + number_of_images: int, + *_, + memory_limit: float = 4.0, + time_limit: float = 10.0, + repeat: int = 1, + verbose: bool = False, +): + """ + Runs a single benchmark. + + Parameters + ---------- + bitmask_shape : tuple[int, int] + The size (h, w) of the bitmask array. + number_of_unique_labels : int + The number of unique labels used in the synthetic example. + number_of_images : int + The number of distinct datums that are created. + memory_limit : float + The maximum amount of system memory allowed in gigabytes (GB). + time_limit : float + The maximum amount of time permitted before killing the benchmark. + repeat : int + The number of times to run a benchmark to produce an average runtime. + verbose : bool, default=False + Toggles terminal output of benchmark results. + """ + + b = Benchmark( + time_limit=time_limit, + memory_limit=int(memory_limit * (1024**3)), + repeat=repeat, + verbose=verbose, + ) + + _, failed, details = b.run( + benchmark=benchmark_add_data, + n_labels=[number_of_unique_labels], + shape=[bitmask_shape], + ) + if failed: + raise BenchmarkError( + benchmark=details["benchmark"], + error_type=failed[0]["error"], + error_message=failed[0]["msg"], + ) + + _, failed, details = b.run( + benchmark=benchmark_finalize, + n_datums=[number_of_images], + n_labels=[number_of_unique_labels], + ) + if failed: + raise BenchmarkError( + benchmark=details["benchmark"], + error_type=failed[0]["error"], + error_message=failed[0]["msg"], + ) + + _, failed, details = b.run( + benchmark=benchmark_evaluate, + n_datums=[number_of_images], + n_labels=[number_of_unique_labels], + ) + if failed: + raise BenchmarkError( + benchmark=details["benchmark"], + error_type=failed[0]["error"], + error_message=failed[0]["msg"], + ) + if __name__ == "__main__": diff --git a/lite/valor_lite/profiling.py b/lite/valor_lite/profiling.py index 02e315824..9ad1e6009 100644 --- a/lite/valor_lite/profiling.py +++ b/lite/valor_lite/profiling.py @@ -2,10 +2,10 @@ import math import multiprocessing as mp import resource -import sys import time from collections import deque from multiprocessing import Queue +from typing import Any from tqdm import tqdm @@ -20,6 +20,11 @@ def __init__( def _timeit_subprocess(*args, __fn, __queue: Queue, **kwargs): + """ + Multiprocessing subprocess that reports either runtime or errors. + + This is handled within a subprocess to protect the benchmark against OOM errors. + """ try: timer_start = time.perf_counter() __fn(*args, **kwargs) @@ -34,7 +39,16 @@ def create_runtime_profiler( repeat: int = 1, ): """ - This profiles the runtime of the wrapped function in a subprocess. + Creates a runtime profiler as a decorating function. + + The profiler reports runtime of the wrapped function from a subprocess to protect against OOM errors. 
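+    Each profiled call returns the measured wall-clock runtime in seconds.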
+ + Parameters + ---------- + time_limit : float, optional + An optional time limit to constrain the benchmark. + repeat : int, default=1 + The number of times to repeat the benchmark to produce an average runtime. """ ctx = mp.get_context("spawn") @@ -78,13 +92,6 @@ def wrapper(*args, **kwargs): return decorator -def calculate_complexity(params: list[int | tuple[int]]) -> int: - flattened_params = [ - math.prod(p) if isinstance(p, tuple) else p for p in params - ] - return math.prod(flattened_params) - - def pretty_print_results(results: tuple): valid, invalid, permutations = results @@ -128,6 +135,16 @@ def pretty_print_results(results: tuple): print(row) +def _calculate_complexity(params: list[int | tuple[int]]) -> int: + """ + Basic metric of benchmark complexity. + """ + flattened_params = [ + math.prod(p) if isinstance(p, tuple) else p for p in params + ] + return math.prod(flattened_params) + + class Benchmark: def __init__( self, @@ -149,6 +166,23 @@ def get_limits( memory_unit: str = "GB", time_unit: str = "seconds", ) -> dict[str, str | int | float | None]: + """ + Returns a dictionary of benchmark limits. + + Parameters + ---------- + readable : bool, default=True + Toggles whether the output should be human readable. + memory_unit : str, default="GB" + Toggles what unit to display the memory limit with when 'readable=True'. + time_unit : str, default="seconds" + Toggles what unit to display the time limit with when 'readable=True'. + + Returns + ------- + dict[str, str | int | float | None] + The benchmark limits. + """ memory_value = self.memory_limit if readable and memory_value is not None: @@ -194,6 +228,9 @@ def get_limits( @property def memory_limit(self) -> int | None: + """ + The memory limit in bytes (B). + """ return self._memory_limit @memory_limit.setter @@ -209,8 +246,19 @@ def memory_limit(self, limit: int | None): def run( self, benchmark, - **kwargs, + **kwargs: list[Any], ): + """ + Runs a benchmark. + + Parameters + ---------- + benchmark : Callable + The benchmark function. + kwargs : dict[str, list] + Lists of arguments to + """ + nvars = len(kwargs) keys = tuple(kwargs.keys()) vars = tuple(kwargs[key] for key in keys) @@ -240,7 +288,7 @@ def run( k: v[current_indices[idx]] for idx, (k, v) in enumerate(zip(keys, vars)) } - complexity = calculate_complexity(tuple(parameters.values())) + complexity = _calculate_complexity(list(parameters.values())) details: dict = {k: str(v) for k, v in parameters.items()} diff --git a/lite/valor_lite/semantic_segmentation/benchmark.py b/lite/valor_lite/semantic_segmentation/benchmark.py index a713fa675..5b7a5f4b3 100644 --- a/lite/valor_lite/semantic_segmentation/benchmark.py +++ b/lite/valor_lite/semantic_segmentation/benchmark.py @@ -1,9 +1,5 @@ import numpy as np -from valor_lite.profiling import ( - Benchmark, - BenchmarkError, - create_runtime_profiler, -) +from valor_lite.profiling import create_runtime_profiler from valor_lite.semantic_segmentation import Bitmask, DataLoader, Segmentation @@ -14,16 +10,22 @@ def generate_segmentation( width: int, ) -> Segmentation: """ - Generates a list of segmentation annotations. + Generates a semantic segmentation annotation. Parameters ---------- uid : str The datum UID for the generated segmentation. + n_labels : int + The number of unique labels. + height : int + The height of the mask in pixels. + width : int + The width of the mask in pixels. Returns ------- - Segmenation + Segmentation A generated semantic segmenatation annotation. 
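+
+    Example
+    -------
+    A minimal usage sketch; argument values are illustrative only.
+
+    >>> seg = generate_segmentation(
+    ...     uid="img0",
+    ...     n_labels=3,
+    ...     height=64,
+    ...     width=64,
+    ... )
+    >>> len(seg.groundtruths)
+    3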
""" @@ -83,7 +85,26 @@ def benchmark_add_data( shape: tuple[int, int], time_limit: float | None, repeat: int = 1, -): +) -> float: + """ + Benchmarks 'Dataloader.add_data' for semantic segmentation. + + Parameters + ---------- + n_labels : int + The number of unique labels to generate. + shape : tuple[int, int] + The size (h,w) of the mask to generate. + time_limit : float, optional + An optional time limit to constrain the benchmark. + repeat : int + The number of times to run the benchmark to produce a runtime average. + + Returns + ------- + float + The average runtime. + """ profile = create_runtime_profiler( time_limit=time_limit, @@ -109,6 +130,25 @@ def benchmark_finalize( time_limit: float | None, repeat: int = 1, ): + """ + Benchmarks 'Dataloader.finalize' for semantic segmentation. + + Parameters + ---------- + n_datums : int + The number of datums to generate. + n_labels : int + The number of unique labels to generate. + time_limit : float, optional + An optional time limit to constrain the benchmark. + repeat : int + The number of times to run the benchmark to produce a runtime average. + + Returns + ------- + float + The average runtime. + """ profile = create_runtime_profiler( time_limit=time_limit, @@ -136,6 +176,25 @@ def benchmark_evaluate( time_limit: float | None, repeat: int = 1, ): + """ + Benchmarks 'Evaluator.evaluate' for semantic segmentation. + + Parameters + ---------- + n_datums : int + The number of datums to generate. + n_labels : int + The number of unique labels to generate. + time_limit : float, optional + An optional time limit to constrain the benchmark. + repeat : int + The number of times to run the benchmark to produce a runtime average. + + Returns + ------- + float + The average runtime. + """ profile = create_runtime_profiler( time_limit=time_limit, @@ -156,78 +215,3 @@ def benchmark_evaluate( evaluator = loader.finalize() elapsed += profile(evaluator.evaluate)() return elapsed / repeat - - -def benchmark( - bitmask_shape: tuple[int, int], - number_of_unique_labels: int, - number_of_images: int, - *_, - memory_limit: float = 4.0, - time_limit: float = 10.0, - repeat: int = 1, - verbose: bool = False, -): - """ - Runs a single benchmark. - - Parameters - ---------- - bitmask_shape : tuple[int, int] - The size (h, w) of the bitmask array. - number_of_unique_labels : int - The number of unique labels used in the synthetic example. - number_of_images : int - The number of distinct datums that are created. - memory_limit : float - The maximum amount of system memory allowed in gigabytes (GB). - time_limit : float - The maximum amount of time permitted before killing the benchmark. - repeat : int - The number of times to run a benchmark to produce an average runtime. - verbose : bool, default=False - Toggles terminal output of benchmark results. 
- """ - - b = Benchmark( - time_limit=time_limit, - memory_limit=int(memory_limit * (1024**3)), - repeat=repeat, - verbose=verbose, - ) - - _, failed, details = b.run( - benchmark=benchmark_add_data, - n_labels=[number_of_unique_labels], - shape=[bitmask_shape], - ) - if failed: - raise BenchmarkError( - benchmark=details["benchmark"], - error_type=failed[0]["error"], - error_message=failed[0]["msg"], - ) - - _, failed, details = b.run( - benchmark=benchmark_finalize, - n_datums=[number_of_images], - n_labels=[number_of_unique_labels], - ) - if failed: - raise BenchmarkError( - benchmark=details["benchmark"], - error_type=failed[0]["error"], - error_message=failed[0]["msg"], - ) - - _, failed, details = b.run( - benchmark=benchmark_evaluate, - n_datums=[number_of_images], - n_labels=[number_of_unique_labels], - ) - if failed: - raise BenchmarkError( - benchmark=details["benchmark"], - error_type=failed[0]["error"], - error_message=failed[0]["msg"], - ) From 00aeaa7a18868da83d31b45e6e1514a5e58ceb75 Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Tue, 12 Nov 2024 14:25:34 -0600 Subject: [PATCH 15/29] weird bitnami bug --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2d25c227c..80ec2904d 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,7 @@ unit-tests: start-postgres-docker: docker build -t pgvalor ./database - docker run -p 5432:5432 -e POSTGRES_PASSWORD=password -e POSTGRES_DB=valor -d pgvalor + docker run -p 5432:5432 -e POSTGRES_PASSWORD=password -e POSTGRESQL_REPLICATION_USE_PASSFILE=false -e POSTGRES_DB=valor -d pgvalor run-migrations: ifeq ($(shell uname -s),Darwin) From 2e505e84a87bc198dd6901147e0a60a77547a64f Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Tue, 12 Nov 2024 14:29:53 -0600 Subject: [PATCH 16/29] weird bitnami bug --- .github/workflows/client-api-tests-and-coverage.yml | 2 +- docker-compose.yml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/client-api-tests-and-coverage.yml b/.github/workflows/client-api-tests-and-coverage.yml index a5504a279..3285c80c3 100644 --- a/.github/workflows/client-api-tests-and-coverage.yml +++ b/.github/workflows/client-api-tests-and-coverage.yml @@ -27,7 +27,7 @@ jobs: docker run -p 5432:5432 -e POSTGRES_PASSWORD=password -e POSTGRES_DB=valor -d pgvalor sleep 3 docker build ./migrations -t migrations - docker run -e POSTGRES_PASSWORD=password -e POSTGRES_HOST=localhost -e POSTGRES_DB=valor -e POSTGRES_USERNAME=postgres -e POSTGRES_PORT=5432 --network "host" migrations + docker run -e POSTGRES_PASSWORD=password -e POSTGRES_HOST=localhost -e POSTGRES_DB=valor -e POSTGRES_USERNAME=postgres -e POSTGRES_PORT=5432 -e POSTGRESQL_REPLICATION_USE_PASSFILE=false --network "host" migrations - name: run functional tests run: | cd api && pip install ".[test]" diff --git a/docker-compose.yml b/docker-compose.yml index 040eed70e..a94bce9f7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -19,6 +19,7 @@ services: POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} POSTGRES_DB: ${POSTGRES_DB} POSTGRES_PORT: ${POSTGRES_PORT} + POSTGRESQL_REPLICATION_USE_PASSFILE: false VALOR_SECRET_KEY: ${VALOR_SECRET_KEY} VALOR_USERNAME: ${VALOR_USERNAME} VALOR_PASSWORD: ${VALOR_PASSWORD} From 7824270ab2f41f64fbaccc8b5d4a20f81ca48846 Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Tue, 12 Nov 2024 14:42:29 -0600 Subject: [PATCH 17/29] weird bitnami stuff --- .github/workflows/client-api-tests-and-coverage.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/.github/workflows/client-api-tests-and-coverage.yml b/.github/workflows/client-api-tests-and-coverage.yml index 3285c80c3..e01f82319 100644 --- a/.github/workflows/client-api-tests-and-coverage.yml +++ b/.github/workflows/client-api-tests-and-coverage.yml @@ -24,10 +24,10 @@ jobs: - name: set up postgres run: | docker build ./database -t pgvalor - docker run -p 5432:5432 -e POSTGRES_PASSWORD=password -e POSTGRES_DB=valor -d pgvalor + docker run -p 5432:5432 -e POSTGRES_PASSWORD=password -e POSTGRES_DB=valor -e POSTGRESQL_REPLICATION_USE_PASSFILE=false -d pgvalor sleep 3 docker build ./migrations -t migrations - docker run -e POSTGRES_PASSWORD=password -e POSTGRES_HOST=localhost -e POSTGRES_DB=valor -e POSTGRES_USERNAME=postgres -e POSTGRES_PORT=5432 -e POSTGRESQL_REPLICATION_USE_PASSFILE=false --network "host" migrations + docker run -e POSTGRES_PASSWORD=password -e POSTGRES_HOST=localhost -e POSTGRES_DB=valor -e POSTGRES_USERNAME=postgres -e POSTGRES_PORT=5432 --network "host" migrations - name: run functional tests run: | cd api && pip install ".[test]" From e5945f1dd5331154fd03ecaebd2c3619b4bd7613 Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Tue, 12 Nov 2024 14:48:25 -0600 Subject: [PATCH 18/29] docstring --- lite/valor_lite/profiling.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/lite/valor_lite/profiling.py b/lite/valor_lite/profiling.py index 9ad1e6009..a75774fc6 100644 --- a/lite/valor_lite/profiling.py +++ b/lite/valor_lite/profiling.py @@ -249,14 +249,36 @@ def run( **kwargs: list[Any], ): """ - Runs a benchmark. + Runs a benchmark with ranges of parameters. Parameters ---------- benchmark : Callable The benchmark function. - kwargs : dict[str, list] - Lists of arguments to + **kwargs : list[Any] + Keyword arguments passing lists of parameters to benchmark. The values should be sorted in + decreasing complexity. For example, if the number of labels is a parameter then a higher + number of unique labels would be considered "more" complex. + + Example + ------- + >>> b = Benchmark( + ... time_limit=10.0, + ... memory_limit=8 * (1024**3), + ... repeat=1, + ... verbose=False, + ... ) + >>> results = b.run( + ... benchmark=semseg_add_data, + ... n_labels=[ + ... 100, + ... 10, + ... ], + ... shape=[ + ... (1000, 1000), + ... (100, 100), + ... ], + ... 
) """ nvars = len(kwargs) From d7cdc1e8e8e3ff30694cd83dad174ff16c6700cf Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Tue, 12 Nov 2024 14:50:07 -0600 Subject: [PATCH 19/29] more bitnami stuff --- api/.env.testing | 1 + 1 file changed, 1 insertion(+) diff --git a/api/.env.testing b/api/.env.testing index acbd765fb..1e3b41462 100644 --- a/api/.env.testing +++ b/api/.env.testing @@ -3,3 +3,4 @@ POSTGRES_PASSWORD=password POSTGRES_HOST=db POSTGRES_DB=valor POSTGRES_PORT=5432 +POSTGRESQL_REPLICATION_USE_PASSFILE=false From 0890f3ca7936ddbf67653dd3812ceb3d696c0ffc Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Tue, 12 Nov 2024 15:09:30 -0600 Subject: [PATCH 20/29] remove bitnami fix --- .github/workflows/client-api-benchmark-evaluations.yml | 2 +- .github/workflows/client-api-tests-and-coverage.yml | 4 ++-- Makefile | 2 +- api/.env.testing | 1 - docker-compose.yml | 1 - 5 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/client-api-benchmark-evaluations.yml b/.github/workflows/client-api-benchmark-evaluations.yml index a8314303f..078a66237 100644 --- a/.github/workflows/client-api-benchmark-evaluations.yml +++ b/.github/workflows/client-api-benchmark-evaluations.yml @@ -1,4 +1,4 @@ -name: "[valor-service] benchmarks" +name: Run API + client benchmarks on: push: diff --git a/.github/workflows/client-api-tests-and-coverage.yml b/.github/workflows/client-api-tests-and-coverage.yml index e01f82319..cc49ad4ab 100644 --- a/.github/workflows/client-api-tests-and-coverage.yml +++ b/.github/workflows/client-api-tests-and-coverage.yml @@ -1,4 +1,4 @@ -name: "[valor-service] code coverage report" +name: Run API + client code coverage report on: push: @@ -24,7 +24,7 @@ jobs: - name: set up postgres run: | docker build ./database -t pgvalor - docker run -p 5432:5432 -e POSTGRES_PASSWORD=password -e POSTGRES_DB=valor -e POSTGRESQL_REPLICATION_USE_PASSFILE=false -d pgvalor + docker run -p 5432:5432 -e POSTGRES_PASSWORD=password -e POSTGRES_DB=valor -d pgvalor sleep 3 docker build ./migrations -t migrations docker run -e POSTGRES_PASSWORD=password -e POSTGRES_HOST=localhost -e POSTGRES_DB=valor -e POSTGRES_USERNAME=postgres -e POSTGRES_PORT=5432 --network "host" migrations diff --git a/Makefile b/Makefile index 80ec2904d..2d25c227c 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,7 @@ unit-tests: start-postgres-docker: docker build -t pgvalor ./database - docker run -p 5432:5432 -e POSTGRES_PASSWORD=password -e POSTGRESQL_REPLICATION_USE_PASSFILE=false -e POSTGRES_DB=valor -d pgvalor + docker run -p 5432:5432 -e POSTGRES_PASSWORD=password -e POSTGRES_DB=valor -d pgvalor run-migrations: ifeq ($(shell uname -s),Darwin) diff --git a/api/.env.testing b/api/.env.testing index 1e3b41462..acbd765fb 100644 --- a/api/.env.testing +++ b/api/.env.testing @@ -3,4 +3,3 @@ POSTGRES_PASSWORD=password POSTGRES_HOST=db POSTGRES_DB=valor POSTGRES_PORT=5432 -POSTGRESQL_REPLICATION_USE_PASSFILE=false diff --git a/docker-compose.yml b/docker-compose.yml index a94bce9f7..040eed70e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -19,7 +19,6 @@ services: POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} POSTGRES_DB: ${POSTGRES_DB} POSTGRES_PORT: ${POSTGRES_PORT} - POSTGRESQL_REPLICATION_USE_PASSFILE: false VALOR_SECRET_KEY: ${VALOR_SECRET_KEY} VALOR_USERNAME: ${VALOR_USERNAME} VALOR_PASSWORD: ${VALOR_PASSWORD} From 2b497b8458de043674b5947935548b7d3f6e6536 Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Tue, 12 Nov 2024 16:46:11 -0600 Subject: [PATCH 21/29] remove obj det example from 
notebook --- lite/examples/benchmarking.ipynb | 184 +++---------------------------- 1 file changed, 15 insertions(+), 169 deletions(-) diff --git a/lite/examples/benchmarking.ipynb b/lite/examples/benchmarking.ipynb index 78a6319d4..64cf300a8 100644 --- a/lite/examples/benchmarking.ipynb +++ b/lite/examples/benchmarking.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -16,160 +16,6 @@ ")" ] }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Object Detection" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from valor_lite.object_detection.benchmark import (\n", - " benchmark_add_bounding_boxes as objdet_add_bboxes,\n", - " benchmark_finalize as objdet_finalize,\n", - " # benchmark_evaluate as objdet_evaluate,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "n_datums = [\n", - " 100,\n", - " 10,\n", - " 1,\n", - "]\n", - "\n", - "n_labels = [\n", - " 1000,\n", - " 100,\n", - " 10,\n", - " 3,\n", - "]\n", - "\n", - "n_annotations_per_datum = [\n", - " (100, 10), # 100 pairs, 10 w/ no overlap\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 0%| | 0/4 [00:01 Date: Wed, 13 Nov 2024 09:30:26 -0600 Subject: [PATCH 22/29] remove make stop-env from lite benchmarks --- .github/workflows/lite-benchmark-evaluations.yml | 1 - .github/workflows/lite-synthetic-benchmarks.yml | 1 - 2 files changed, 2 deletions(-) diff --git a/.github/workflows/lite-benchmark-evaluations.yml b/.github/workflows/lite-benchmark-evaluations.yml index 8afe7ed21..82d80b7bf 100644 --- a/.github/workflows/lite-benchmark-evaluations.yml +++ b/.github/workflows/lite-benchmark-evaluations.yml @@ -35,4 +35,3 @@ jobs: export BENCHMARK_RESULTS=$(python -c "import os;import json;print(json.dumps(json.load(open('objdet_results.json', 'r')), indent=4));") echo "$BENCHMARK_RESULTS" working-directory: ./lite/benchmarks/ - - run: make stop-env diff --git a/.github/workflows/lite-synthetic-benchmarks.yml b/.github/workflows/lite-synthetic-benchmarks.yml index 0b18af2ee..daaefe5dc 100644 --- a/.github/workflows/lite-synthetic-benchmarks.yml +++ b/.github/workflows/lite-synthetic-benchmarks.yml @@ -22,4 +22,3 @@ jobs: - name: benchmark semantic segmentation run: python benchmark_semantic_segmentation.py working-directory: ./lite/benchmarks/synthetic/ - - run: make stop-env From f201e3de06a725cc39af5a4a5e63ed2db0197a6b Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Wed, 13 Nov 2024 10:53:18 -0600 Subject: [PATCH 23/29] intermediate compuation 10x speedup --- lite/examples/benchmarking.ipynb | 170 ++---------------- .../semantic_segmentation/__init__.py | 3 +- .../semantic_segmentation/annotation.py | 85 ++++++++- .../semantic_segmentation/benchmark.py | 144 +++++---------- .../semantic_segmentation/computation.py | 33 +--- 5 files changed, 152 insertions(+), 283 deletions(-) diff --git a/lite/examples/benchmarking.ipynb b/lite/examples/benchmarking.ipynb index 64cf300a8..e52a9ed61 100644 --- a/lite/examples/benchmarking.ipynb +++ b/lite/examples/benchmarking.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -26,7 +26,7 @@ }, { "cell_type": "code", 
- "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -39,11 +39,13 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "n_datums = [\n", + " 10000,\n", + " 1000,\n", " 100,\n", " 10,\n", " 1,\n", @@ -66,63 +68,9 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 69%|██████▉ | 11/16 [00:51<00:23, 4.66s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "=====================================================================\n", - "Details\n", - "{\n", - " \"benchmark\": \"benchmark_add_data\",\n", - " \"limits\": {\n", - " \"memory_limit\": \"8.0 GB\",\n", - " \"time_limit\": \"5.0 seconds\",\n", - " \"repeat\": 1\n", - " },\n", - " \"passed\": 8,\n", - " \"failed\": 8,\n", - " \"total\": 16\n", - "}\n", - "\n", - "Passed\n", - " complexity | runtime | n_labels | shape \n", - "---------------------------------------------------------------------\n", - " 300000000 | 1.4995 | 3 | (10000, 10000) \n", - " 62500000 | 0.496 | 10 | (2500, 2500) \n", - " 10000000 | 0.0909 | 10 | (1000, 1000) \n", - " 1000000 | 0.0602 | 100 | (100, 100) \n", - "\n", - "Failed\n", - " complexity | error | n_labels | shape | msg \n", - "---------------------------------------------------------------------------------------\n", - " 100000000000 | MemoryError | 1000 | (10000, 10000) | Unable to allocate 186. GiB for an array with shape (1001, 20000, 10000) and data type bool\n", - " 10000000000 | MemoryError | 100 | (10000, 10000) | Unable to allocate 18.8 GiB for an array with shape (101, 20000, 10000) and data type bool\n", - " 6250000000 | MemoryError | 1000 | (2500, 2500) | Unable to allocate 11.7 GiB for an array with shape (1001, 5000, 2500) and data type bool\n", - " 1000000000 | MemoryError | 10 | (10000, 10000) | Unable to allocate 9.31 GiB for an array with shape (10, 10, 100000000) and data type bool\n", - " 1000000000 | MemoryError | 1000 | (1000, 1000) | Unable to allocate 931. 
GiB for an array with shape (1000, 1000, 1000000) and data type bool\n", - " 625000000 | MemoryError | 100 | (2500, 2500) | Unable to allocate 58.2 GiB for an array with shape (100, 100, 6250000) and data type bool\n", - " 100000000 | MemoryError | 100 | (1000, 1000) | Unable to allocate 9.31 GiB for an array with shape (100, 100, 1000000) and data type bool\n", - " 10000000 | MemoryError | 1000 | (100, 100) | Unable to allocate 9.31 GiB for an array with shape (1000, 1000, 10000) and data type bool\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], + "outputs": [], "source": [ "_ = b.run(\n", " benchmark=semseg_add_data,\n", @@ -133,57 +81,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 42%|████▏ | 5/12 [00:07<00:09, 1.42s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "=====================================================================\n", - "Details\n", - "{\n", - " \"benchmark\": \"benchmark_finalize\",\n", - " \"limits\": {\n", - " \"memory_limit\": \"8.0 GB\",\n", - " \"time_limit\": \"5.0 seconds\",\n", - " \"repeat\": 1\n", - " },\n", - " \"passed\": 9,\n", - " \"failed\": 3,\n", - " \"total\": 12\n", - "}\n", - "\n", - "Passed\n", - " complexity | runtime | n_datums | n_labels \n", - "---------------------------------------------------------------------\n", - " 10000 | 0.0114 | 100 | 100 \n", - " 1000 | 0.004 | 10 | 100 \n", - " 100 | 0.0021 | 1 | 100 \n", - "\n", - "Failed\n", - " complexity | error | n_datums | n_labels | msg \n", - "---------------------------------------------------------------------------------------\n", - " 100000 | MemoryError | 100 | 1000 | Unable to allocate 9.31 GiB for an array with shape (1000, 1000, 10000) and data type bool\n", - " 10000 | MemoryError | 10 | 1000 | Unable to allocate 9.31 GiB for an array with shape (1000, 1000, 10000) and data type bool\n", - " 1000 | MemoryError | 1 | 1000 | Unable to allocate 9.31 GiB for an array with shape (1000, 1000, 10000) and data type bool\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], + "outputs": [], "source": [ "_ = b.run(\n", " benchmark=semseg_finalize,\n", @@ -194,57 +94,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 42%|████▏ | 5/12 [00:07<00:09, 1.40s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "=====================================================================\n", - "Details\n", - "{\n", - " \"benchmark\": \"benchmark_evaluate\",\n", - " \"limits\": {\n", - " \"memory_limit\": \"8.0 GB\",\n", - " \"time_limit\": \"5.0 seconds\",\n", - " \"repeat\": 1\n", - " },\n", - " \"passed\": 9,\n", - " \"failed\": 3,\n", - " \"total\": 12\n", - "}\n", - "\n", - "Passed\n", - " complexity | runtime | n_datums | n_labels \n", - "---------------------------------------------------------------------\n", - " 10000 | 0.0177 | 100 | 100 \n", - " 1000 | 0.0122 | 10 | 100 \n", - " 100 | 0.0205 | 1 | 100 \n", - "\n", - "Failed\n", - " complexity | error | n_datums | n_labels | msg \n", - "---------------------------------------------------------------------------------------\n", - " 100000 | MemoryError | 100 | 1000 | Unable to allocate 9.31 GiB for an array with shape (1000, 1000, 10000) and data 
type bool\n", - " 10000 | MemoryError | 10 | 1000 | Unable to allocate 9.31 GiB for an array with shape (1000, 1000, 10000) and data type bool\n", - " 1000 | MemoryError | 1 | 1000 | Unable to allocate 9.31 GiB for an array with shape (1000, 1000, 10000) and data type bool\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], + "outputs": [], "source": [ "_ = b.run(\n", " benchmark=semseg_evaluate,\n", diff --git a/lite/valor_lite/semantic_segmentation/__init__.py b/lite/valor_lite/semantic_segmentation/__init__.py index dfa0e2380..51bd54d02 100644 --- a/lite/valor_lite/semantic_segmentation/__init__.py +++ b/lite/valor_lite/semantic_segmentation/__init__.py @@ -1,4 +1,4 @@ -from .annotation import Bitmask, Segmentation +from .annotation import Bitmask, Segmentation, generate_segmentation from .manager import DataLoader, Evaluator from .metric import Metric, MetricType @@ -9,4 +9,5 @@ "Bitmask", "Metric", "MetricType", + "generate_segmentation", ] diff --git a/lite/valor_lite/semantic_segmentation/annotation.py b/lite/valor_lite/semantic_segmentation/annotation.py index acd99f8f7..7b19ecf8b 100644 --- a/lite/valor_lite/semantic_segmentation/annotation.py +++ b/lite/valor_lite/semantic_segmentation/annotation.py @@ -29,7 +29,7 @@ class Bitmask: def __post_init__(self): if self.mask.dtype != np.bool_: raise ValueError( - f"Bitmask recieved mask with dtype `{self.mask.dtype}`." + f"Bitmask recieved mask with dtype '{self.mask.dtype}'." ) @@ -94,3 +94,86 @@ def __post_init__(self): self.shape = groundtruth_shape.pop() self.size = int(np.prod(np.array(self.shape))) + + +def generate_segmentation( + datum_uid: str, + number_of_unique_labels: int, + mask_height: int, + mask_width: int, +) -> Segmentation: + """ + Generates a semantic segmentation annotation. + + Parameters + ---------- + datum_uid : str + The datum UID for the generated segmentation. + number_of_unique_labels : int + The number of unique labels. + mask_height : int + The height of the mask in pixels. + mask_width : int + The width of the mask in pixels. + + Returns + ------- + Segmentation + A generated semantic segmenatation annotation. 
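+
+    Note
+    ----
+    A background class is sampled alongside the requested labels, so the
+    generated ground truth and prediction bitmasks do not necessarily cover
+    every pixel.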
+ """ + + if number_of_unique_labels > 1: + common_proba = 0.4 / (number_of_unique_labels - 1) + min_proba = min(common_proba, 0.1) + labels = [str(i) for i in range(number_of_unique_labels)] + [None] + proba = ( + [0.5] + + [common_proba for _ in range(number_of_unique_labels - 1)] + + [0.1] + ) + elif number_of_unique_labels == 1: + labels = ["0", None] + proba = [0.9, 0.1] + min_proba = 0.1 + else: + labels = [None] + proba = [1.0] + min_proba = 1.0 + + probabilities = np.array(proba, dtype=np.float64) + weights = (probabilities / min_proba).astype(np.int32) + + indices = np.random.choice( + np.arange(len(weights)), + size=(mask_height * 2, mask_width), + p=probabilities, + ) + + N = len(labels) + + masks = np.arange(N)[:, None, None] == indices + + gts = [] + pds = [] + for lidx in range(N): + label = labels[lidx] + if label is None: + continue + gts.append( + Bitmask( + mask=masks[lidx, :mask_height, :], + label=label, + ) + ) + pds.append( + Bitmask( + mask=masks[lidx, mask_height:, :], + label=label, + ) + ) + + return Segmentation( + uid=datum_uid, + groundtruths=gts, + predictions=pds, + ) diff --git a/lite/valor_lite/semantic_segmentation/benchmark.py b/lite/valor_lite/semantic_segmentation/benchmark.py index 5b7a5f4b3..4fcaa46d6 100644 --- a/lite/valor_lite/semantic_segmentation/benchmark.py +++ b/lite/valor_lite/semantic_segmentation/benchmark.py @@ -1,83 +1,5 @@ -import numpy as np from valor_lite.profiling import create_runtime_profiler -from valor_lite.semantic_segmentation import Bitmask, DataLoader, Segmentation - - -def generate_segmentation( - uid: str, - n_labels: int, - height: int, - width: int, -) -> Segmentation: - """ - Generates a semantic segmentation annotation. - - Parameters - ---------- - uid : str - The datum UID for the generated segmentation. - n_labels : int - The number of unique labels. - height : int - The height of the mask in pixels. - width : int - The width of the mask in pixels. - - Returns - ------- - Segmentation - A generated semantic segmenatation annotation. 
- """ - - if n_labels > 1: - common_proba = 0.4 / (n_labels - 1) - min_proba = min(common_proba, 0.1) - labels = [str(i) for i in range(n_labels)] + [None] - proba = [0.5] + [common_proba for _ in range(n_labels - 1)] + [0.1] - elif n_labels == 1: - labels = ["0", None] - proba = [0.9, 0.1] - min_proba = 0.1 - else: - labels = [None] - proba = [1.0] - min_proba = 1.0 - - probabilities = np.array(proba, dtype=np.float64) - weights = (probabilities / min_proba).astype(np.int32) - - indices = np.random.choice( - np.arange(len(weights)), size=(height * 2, width), p=probabilities - ) - - N = len(labels) - - masks = np.arange(N)[:, None, None] == indices - - gts = [] - pds = [] - for lidx in range(N): - label = labels[lidx] - if label is None: - continue - gts.append( - Bitmask( - mask=masks[lidx, :height, :], - label=label, - ) - ) - pds.append( - Bitmask( - mask=masks[lidx, height:, :], - label=label, - ) - ) - - return Segmentation( - uid=uid, - groundtruths=gts, - predictions=pds, - ) +from valor_lite.semantic_segmentation import DataLoader, generate_segmentation def benchmark_add_data( @@ -114,10 +36,10 @@ def benchmark_add_data( elapsed = 0 for _ in range(repeat): data = generate_segmentation( - uid="uid", - n_labels=n_labels, - height=shape[0], - width=shape[1], + datum_uid="uid", + number_of_unique_labels=n_labels, + mask_height=shape[0], + mask_width=shape[1], ) loader = DataLoader() elapsed += profile(loader.add_data)([data]) @@ -155,17 +77,25 @@ def benchmark_finalize( repeat=repeat, ) + from tqdm import tqdm + elapsed = 0 for _ in range(repeat): - loader = DataLoader() - for datum_idx in range(n_datums): - data = generate_segmentation( - uid=str(datum_idx), - n_labels=n_labels, - height=100, - width=100, + + data = [ + generate_segmentation( + datum_uid=str(i), + number_of_unique_labels=n_labels, + mask_height=5, + mask_width=5, ) - loader.add_data([data]) + for i in range(10) + ] + loader = DataLoader() + for datum_idx in tqdm(range(n_datums)): + segmentation = data[datum_idx % 10] + segmentation.uid = str(datum_idx) + loader.add_data([segmentation]) elapsed += profile(loader.finalize)() return elapsed / repeat @@ -203,15 +133,33 @@ def benchmark_evaluate( elapsed = 0 for _ in range(repeat): + + data = [ + generate_segmentation( + datum_uid=str(i), + number_of_unique_labels=n_labels, + mask_height=10, + mask_width=10, + ) + for i in range(10) + ] loader = DataLoader() for datum_idx in range(n_datums): - data = generate_segmentation( - uid=str(datum_idx), - n_labels=n_labels, - height=100, - width=100, - ) - loader.add_data([data]) + segmentation = data[datum_idx % 10] + segmentation.uid = str(datum_idx) + loader.add_data([segmentation]) evaluator = loader.finalize() elapsed += profile(evaluator.evaluate)() return elapsed / repeat + + +if __name__ == "__main__": + + print( + benchmark_finalize( + n_datums=10000, + n_labels=1000, + time_limit=10, + repeat=1, + ) + ) diff --git a/lite/valor_lite/semantic_segmentation/computation.py b/lite/valor_lite/semantic_segmentation/computation.py index 556a0ef9b..fe396629f 100644 --- a/lite/valor_lite/semantic_segmentation/computation.py +++ b/lite/valor_lite/semantic_segmentation/computation.py @@ -31,9 +31,6 @@ def compute_intermediate_confusion_matrices( A 2-D confusion matrix with shape (n_labels + 1, n_labels + 1). 
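+
+        Row and column 0 hold unmatched pixel counts: entry [i + 1, j + 1]
+        counts pixels with ground truth label i and prediction label j,
+        [i + 1, 0] counts ground truth pixels of label i with no overlapping
+        prediction, [0, j + 1] counts prediction pixels of label j with no
+        overlapping ground truth, and [0, 0] counts pixels that are
+        background in both.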
""" - n_gt_labels = groundtruth_labels.size - n_pd_labels = prediction_labels.size - groundtruth_counts = groundtruths.sum(axis=1) prediction_counts = predictions.sum(axis=1) @@ -42,33 +39,21 @@ def compute_intermediate_confusion_matrices( ).sum() intersection_counts = np.logical_and( - groundtruths.reshape(n_gt_labels, 1, -1), - predictions.reshape(1, n_pd_labels, -1), + groundtruths[:, None, :], + predictions[None, :, :], ).sum(axis=2) - intersected_groundtruth_counts = intersection_counts.sum(axis=1) intersected_prediction_counts = intersection_counts.sum(axis=0) confusion_matrix = np.zeros((n_labels + 1, n_labels + 1), dtype=np.int32) confusion_matrix[0, 0] = background_counts - for gidx in range(n_gt_labels): - gt_label_idx = groundtruth_labels[gidx] - for pidx in range(n_pd_labels): - pd_label_idx = prediction_labels[pidx] - confusion_matrix[ - gt_label_idx + 1, - pd_label_idx + 1, - ] = intersection_counts[gidx, pidx] - - if gidx == 0: - confusion_matrix[0, pd_label_idx + 1] = ( - prediction_counts[pidx] - - intersected_prediction_counts[pidx] - ) - - confusion_matrix[gt_label_idx + 1, 0] = ( - groundtruth_counts[gidx] - intersected_groundtruth_counts[gidx] - ) + confusion_matrix[ + np.ix_(groundtruth_labels + 1, prediction_labels + 1) + ] = intersection_counts + confusion_matrix[0, 1:] = prediction_counts - intersected_prediction_counts + confusion_matrix[1:, 0] = ( + groundtruth_counts - intersected_groundtruth_counts + ) return confusion_matrix From e44484c0c213078940bac0ef61df7090a8257aaf Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Wed, 13 Nov 2024 11:00:50 -0600 Subject: [PATCH 24/29] bugfix --- lite/valor_lite/semantic_segmentation/benchmark.py | 4 ++-- lite/valor_lite/semantic_segmentation/computation.py | 11 +++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/lite/valor_lite/semantic_segmentation/benchmark.py b/lite/valor_lite/semantic_segmentation/benchmark.py index 4fcaa46d6..ebc97a748 100644 --- a/lite/valor_lite/semantic_segmentation/benchmark.py +++ b/lite/valor_lite/semantic_segmentation/benchmark.py @@ -138,8 +138,8 @@ def benchmark_evaluate( generate_segmentation( datum_uid=str(i), number_of_unique_labels=n_labels, - mask_height=10, - mask_width=10, + mask_height=100, + mask_width=100, ) for i in range(10) ] diff --git a/lite/valor_lite/semantic_segmentation/computation.py b/lite/valor_lite/semantic_segmentation/computation.py index fe396629f..cfda3de27 100644 --- a/lite/valor_lite/semantic_segmentation/computation.py +++ b/lite/valor_lite/semantic_segmentation/computation.py @@ -50,8 +50,15 @@ def compute_intermediate_confusion_matrices( confusion_matrix[ np.ix_(groundtruth_labels + 1, prediction_labels + 1) ] = intersection_counts - confusion_matrix[0, 1:] = prediction_counts - intersected_prediction_counts - confusion_matrix[1:, 0] = ( + print( + confusion_matrix[0, 1:].shape, + prediction_counts.shape, + intersected_prediction_counts.shape, + ) + confusion_matrix[0, prediction_labels + 1] = ( + prediction_counts - intersected_prediction_counts + ) + confusion_matrix[groundtruth_labels + 1, 0] = ( groundtruth_counts - intersected_groundtruth_counts ) From 4d8ff837ac91603fd53de648f6df25be6236bf61 Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Wed, 13 Nov 2024 11:06:38 -0600 Subject: [PATCH 25/29] revert computation improvement as its handled in a separate commit --- .../semantic_segmentation/computation.py | 40 +++++++++++-------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git 
a/lite/valor_lite/semantic_segmentation/computation.py b/lite/valor_lite/semantic_segmentation/computation.py index cfda3de27..556a0ef9b 100644 --- a/lite/valor_lite/semantic_segmentation/computation.py +++ b/lite/valor_lite/semantic_segmentation/computation.py @@ -31,6 +31,9 @@ def compute_intermediate_confusion_matrices( A 2-D confusion matrix with shape (n_labels + 1, n_labels + 1). """ + n_gt_labels = groundtruth_labels.size + n_pd_labels = prediction_labels.size + groundtruth_counts = groundtruths.sum(axis=1) prediction_counts = predictions.sum(axis=1) @@ -39,28 +42,33 @@ def compute_intermediate_confusion_matrices( ).sum() intersection_counts = np.logical_and( - groundtruths[:, None, :], - predictions[None, :, :], + groundtruths.reshape(n_gt_labels, 1, -1), + predictions.reshape(1, n_pd_labels, -1), ).sum(axis=2) + intersected_groundtruth_counts = intersection_counts.sum(axis=1) intersected_prediction_counts = intersection_counts.sum(axis=0) confusion_matrix = np.zeros((n_labels + 1, n_labels + 1), dtype=np.int32) confusion_matrix[0, 0] = background_counts - confusion_matrix[ - np.ix_(groundtruth_labels + 1, prediction_labels + 1) - ] = intersection_counts - print( - confusion_matrix[0, 1:].shape, - prediction_counts.shape, - intersected_prediction_counts.shape, - ) - confusion_matrix[0, prediction_labels + 1] = ( - prediction_counts - intersected_prediction_counts - ) - confusion_matrix[groundtruth_labels + 1, 0] = ( - groundtruth_counts - intersected_groundtruth_counts - ) + for gidx in range(n_gt_labels): + gt_label_idx = groundtruth_labels[gidx] + for pidx in range(n_pd_labels): + pd_label_idx = prediction_labels[pidx] + confusion_matrix[ + gt_label_idx + 1, + pd_label_idx + 1, + ] = intersection_counts[gidx, pidx] + + if gidx == 0: + confusion_matrix[0, pd_label_idx + 1] = ( + prediction_counts[pidx] + - intersected_prediction_counts[pidx] + ) + + confusion_matrix[gt_label_idx + 1, 0] = ( + groundtruth_counts[gidx] - intersected_groundtruth_counts[gidx] + ) return confusion_matrix From 5108d94d27865c72f4f9e12de5ee82437bfef38d Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Wed, 13 Nov 2024 13:37:20 -0600 Subject: [PATCH 26/29] moved generate function --- lite/examples/benchmarking.ipynb | 166 +++++++++++++++++- .../semantic_segmentation/benchmark.py | 20 +-- 2 files changed, 160 insertions(+), 26 deletions(-) diff --git a/lite/examples/benchmarking.ipynb b/lite/examples/benchmarking.ipynb index e52a9ed61..7774a759e 100644 --- a/lite/examples/benchmarking.ipynb +++ b/lite/examples/benchmarking.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -26,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -39,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -68,9 +68,63 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 69%|██████▉ | 11/16 [00:46<00:21, 4.26s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=====================================================================\n", + "Details\n", + "{\n", + " \"benchmark\": \"benchmark_add_data\",\n", + " \"limits\": {\n", + " \"memory_limit\": \"8.0 GB\",\n", + " \"time_limit\": \"5.0 
seconds\",\n", + " \"repeat\": 1\n", + " },\n", + " \"passed\": 8,\n", + " \"failed\": 8,\n", + " \"total\": 16\n", + "}\n", + "\n", + "Passed\n", + " complexity | runtime | n_labels | shape \n", + "---------------------------------------------------------------------\n", + " 300000000 | 1.5151 | 3 | (10000, 10000) \n", + " 62500000 | 0.5952 | 10 | (2500, 2500) \n", + " 10000000 | 0.0911 | 10 | (1000, 1000) \n", + " 1000000 | 0.0582 | 100 | (100, 100) \n", + "\n", + "Failed\n", + " complexity | error | n_labels | shape | msg \n", + "---------------------------------------------------------------------------------------\n", + " 100000000000 | MemoryError | 1000 | (10000, 10000) | Unable to allocate 186. GiB for an array with shape (1001, 20000, 10000) and data type bool\n", + " 10000000000 | MemoryError | 100 | (10000, 10000) | Unable to allocate 18.8 GiB for an array with shape (101, 20000, 10000) and data type bool\n", + " 6250000000 | MemoryError | 1000 | (2500, 2500) | Unable to allocate 11.7 GiB for an array with shape (1001, 5000, 2500) and data type bool\n", + " 1000000000 | MemoryError | 10 | (10000, 10000) | Unable to allocate 9.31 GiB for an array with shape (10, 10, 100000000) and data type bool\n", + " 1000000000 | MemoryError | 1000 | (1000, 1000) | Unable to allocate 931. GiB for an array with shape (1000, 1000, 1000000) and data type bool\n", + " 625000000 | MemoryError | 100 | (2500, 2500) | Unable to allocate 58.2 GiB for an array with shape (100, 100, 6250000) and data type bool\n", + " 100000000 | MemoryError | 100 | (1000, 1000) | Unable to allocate 9.31 GiB for an array with shape (100, 100, 1000000) and data type bool\n", + " 10000000 | MemoryError | 1000 | (100, 100) | Unable to allocate 9.31 GiB for an array with shape (1000, 1000, 10000) and data type bool\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], "source": [ "_ = b.run(\n", " benchmark=semseg_add_data,\n", @@ -81,9 +135,56 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 20%|██ | 4/20 [02:35<10:22, 38.92s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=====================================================================\n", + "Details\n", + "{\n", + " \"benchmark\": \"benchmark_finalize\",\n", + " \"limits\": {\n", + " \"memory_limit\": \"8.0 GB\",\n", + " \"time_limit\": \"5.0 seconds\",\n", + " \"repeat\": 1\n", + " },\n", + " \"passed\": 18,\n", + " \"failed\": 2,\n", + " \"total\": 20\n", + "}\n", + "\n", + "Passed\n", + " complexity | runtime | n_datums | n_labels \n", + "---------------------------------------------------------------------\n", + " 1000000 | 1.1142 | 10000 | 100 \n", + " 100000 | 0.1748 | 100 | 1000 \n", + " 100000 | 0.1086 | 1000 | 100 \n", + "\n", + "Failed\n", + " complexity | error | n_datums | n_labels | msg \n", + "---------------------------------------------------------------------------------------\n", + " 10000000 | MemoryError | 10000 | 1000 | Unable to allocate 7.63 MiB for an array with shape (1000, 1000) and data type int64\n", + " 1000000 | MemoryError | 1000 | 1000 | \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], "source": [ "_ = b.run(\n", " benchmark=semseg_finalize,\n", @@ -94,9 +195,56 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + 
"outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 20%|██ | 4/20 [02:25<09:40, 36.28s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=====================================================================\n", + "Details\n", + "{\n", + " \"benchmark\": \"benchmark_evaluate\",\n", + " \"limits\": {\n", + " \"memory_limit\": \"8.0 GB\",\n", + " \"time_limit\": \"5.0 seconds\",\n", + " \"repeat\": 1\n", + " },\n", + " \"passed\": 18,\n", + " \"failed\": 2,\n", + " \"total\": 20\n", + "}\n", + "\n", + "Passed\n", + " complexity | runtime | n_datums | n_labels \n", + "---------------------------------------------------------------------\n", + " 1000000 | 0.0537 | 10000 | 100 \n", + " 100000 | 0.0815 | 100 | 1000 \n", + " 100000 | 0.0137 | 1000 | 100 \n", + "\n", + "Failed\n", + " complexity | error | n_datums | n_labels | msg \n", + "---------------------------------------------------------------------------------------\n", + " 10000000 | MemoryError | 10000 | 1000 | Unable to allocate 23.8 MiB for an array with shape (1000, 1000, 25) and data type bool\n", + " 1000000 | MemoryError | 1000 | 1000 | Unable to allocate 3.73 GiB for an array with shape (1000, 1001, 1001) and data type int32\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], "source": [ "_ = b.run(\n", " benchmark=semseg_evaluate,\n", diff --git a/lite/valor_lite/semantic_segmentation/benchmark.py b/lite/valor_lite/semantic_segmentation/benchmark.py index ebc97a748..b4950eac1 100644 --- a/lite/valor_lite/semantic_segmentation/benchmark.py +++ b/lite/valor_lite/semantic_segmentation/benchmark.py @@ -77,8 +77,6 @@ def benchmark_finalize( repeat=repeat, ) - from tqdm import tqdm - elapsed = 0 for _ in range(repeat): @@ -92,7 +90,7 @@ def benchmark_finalize( for i in range(10) ] loader = DataLoader() - for datum_idx in tqdm(range(n_datums)): + for datum_idx in range(n_datums): segmentation = data[datum_idx % 10] segmentation.uid = str(datum_idx) loader.add_data([segmentation]) @@ -138,8 +136,8 @@ def benchmark_evaluate( generate_segmentation( datum_uid=str(i), number_of_unique_labels=n_labels, - mask_height=100, - mask_width=100, + mask_height=5, + mask_width=5, ) for i in range(10) ] @@ -151,15 +149,3 @@ def benchmark_evaluate( evaluator = loader.finalize() elapsed += profile(evaluator.evaluate)() return elapsed / repeat - - -if __name__ == "__main__": - - print( - benchmark_finalize( - n_datums=10000, - n_labels=1000, - time_limit=10, - repeat=1, - ) - ) From 829a44ba4a9cb04bd541747aa8000f8c5ff24161 Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Wed, 13 Nov 2024 14:06:08 -0600 Subject: [PATCH 27/29] added generator test --- .../semantic_segmentation/test_annotation.py | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/lite/tests/semantic_segmentation/test_annotation.py b/lite/tests/semantic_segmentation/test_annotation.py index 999dd5240..b6efb2c64 100644 --- a/lite/tests/semantic_segmentation/test_annotation.py +++ b/lite/tests/semantic_segmentation/test_annotation.py @@ -1,6 +1,10 @@ import numpy as np import pytest -from valor_lite.semantic_segmentation import Bitmask, Segmentation +from valor_lite.semantic_segmentation import ( + Bitmask, + Segmentation, + generate_segmentation, +) def test_bitmask(): @@ -78,3 +82,25 @@ def test_segmentation(): predictions=[], ) assert "missing predictions" in str(e) + + +def test_generate_segmentation(): + + segmentation = generate_segmentation( + 
datum_uid="uid1", + number_of_unique_labels=3, + mask_height=2, + mask_width=3, + ) + + assert segmentation.uid == "uid1" + assert segmentation.shape == (2, 3) + assert segmentation.size == 6 + + assert len(segmentation.groundtruths) == 3 + assert all(gt.mask.dtype == np.bool_ for gt in segmentation.groundtruths) + assert all(gt.mask.shape == (2, 3) for gt in segmentation.groundtruths) + + assert len(segmentation.predictions) + assert all(pd.mask.dtype == np.bool_ for pd in segmentation.predictions) + assert all(pd.mask.shape == (2, 3) for pd in segmentation.predictions) From 5251970f11273a8eb5a1f9c2f6b87600476751ca Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Wed, 13 Nov 2024 14:09:46 -0600 Subject: [PATCH 28/29] adjusted benchmark --- .../synthetic/benchmark_semantic_segmentation.py | 2 +- lite/valor_lite/profiling.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lite/benchmarks/synthetic/benchmark_semantic_segmentation.py b/lite/benchmarks/synthetic/benchmark_semantic_segmentation.py index 997cd3a9a..737da165c 100644 --- a/lite/benchmarks/synthetic/benchmark_semantic_segmentation.py +++ b/lite/benchmarks/synthetic/benchmark_semantic_segmentation.py @@ -85,7 +85,7 @@ def benchmark( benchmark( bitmask_shape=(4000, 4000), - number_of_images=10, + number_of_images=1000, number_of_unique_labels=10, memory_limit=4.0, time_limit=10.0, diff --git a/lite/valor_lite/profiling.py b/lite/valor_lite/profiling.py index a75774fc6..be275ce3f 100644 --- a/lite/valor_lite/profiling.py +++ b/lite/valor_lite/profiling.py @@ -101,9 +101,9 @@ def pretty_print_results(results: tuple): print("Details") print(json.dumps(permutations, indent=4)) - print() - print("Passed") if len(valid) > 0: + print() + print("Passed") keys = ["complexity", "runtime", *valid[0]["details"].keys()] header = " | ".join(f"{header:^15}" for header in keys) print(header) @@ -117,9 +117,9 @@ def pretty_print_results(results: tuple): row = " | ".join(f"{str(value):^15}" for value in values) print(row) - print() - print("Failed") if len(invalid) > 0: + print() + print("Failed") keys = ["complexity", "error", *invalid[0]["details"].keys(), "msg"] header = " | ".join(f"{header:^15}" for header in keys) print(header) From 9bc5217e34456095d4b447ff86d69edfd4d42b26 Mon Sep 17 00:00:00 2001 From: Charles Zaloom Date: Wed, 13 Nov 2024 14:14:32 -0600 Subject: [PATCH 29/29] expanded on test --- .../semantic_segmentation/test_annotation.py | 32 ++++++++++++++++++- .../semantic_segmentation/annotation.py | 6 ++-- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/lite/tests/semantic_segmentation/test_annotation.py b/lite/tests/semantic_segmentation/test_annotation.py index b6efb2c64..89b0ba7a4 100644 --- a/lite/tests/semantic_segmentation/test_annotation.py +++ b/lite/tests/semantic_segmentation/test_annotation.py @@ -86,6 +86,7 @@ def test_segmentation(): def test_generate_segmentation(): + # N labels > 1 segmentation = generate_segmentation( datum_uid="uid1", number_of_unique_labels=3, @@ -101,6 +102,35 @@ def test_generate_segmentation(): assert all(gt.mask.dtype == np.bool_ for gt in segmentation.groundtruths) assert all(gt.mask.shape == (2, 3) for gt in segmentation.groundtruths) - assert len(segmentation.predictions) + assert len(segmentation.predictions) == 3 assert all(pd.mask.dtype == np.bool_ for pd in segmentation.predictions) assert all(pd.mask.shape == (2, 3) for pd in segmentation.predictions) + + # N labels = 1 + segmentation = generate_segmentation( + datum_uid="uid1", + 
number_of_unique_labels=1, + mask_height=2, + mask_width=3, + ) + + assert segmentation.uid == "uid1" + assert segmentation.shape == (2, 3) + assert segmentation.size == 6 + + assert len(segmentation.groundtruths) == 1 + assert all(gt.mask.dtype == np.bool_ for gt in segmentation.groundtruths) + assert all(gt.mask.shape == (2, 3) for gt in segmentation.groundtruths) + + assert len(segmentation.predictions) == 1 + assert all(pd.mask.dtype == np.bool_ for pd in segmentation.predictions) + assert all(pd.mask.shape == (2, 3) for pd in segmentation.predictions) + + # N labels = 0 + with pytest.raises(ValueError): + generate_segmentation( + datum_uid="uid1", + number_of_unique_labels=0, + mask_height=2, + mask_width=3, + ) diff --git a/lite/valor_lite/semantic_segmentation/annotation.py b/lite/valor_lite/semantic_segmentation/annotation.py index 7b19ecf8b..7e96fe926 100644 --- a/lite/valor_lite/semantic_segmentation/annotation.py +++ b/lite/valor_lite/semantic_segmentation/annotation.py @@ -136,9 +136,9 @@ def generate_segmentation( proba = [0.9, 0.1] min_proba = 0.1 else: - labels = [None] - proba = [1.0] - min_proba = 1.0 + raise ValueError( + "The number of unique labels should be greater than zero." + ) probabilities = np.array(proba, dtype=np.float64) weights = (probabilities / min_proba).astype(np.int32)
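
For reference, a minimal usage sketch of the API these patches converge on, assembled only from calls that appear in the benchmark and test code above (generate_segmentation, DataLoader.add_data, finalize, evaluate). The structure of the metrics returned by evaluator.evaluate() is not specified in these patches, so the final print is illustrative only; the datum reuse pattern mirrors benchmark_finalize / benchmark_evaluate.

# Usage sketch (assumptions: import path as in benchmark.py; evaluate() output treated as opaque).
from valor_lite.semantic_segmentation import DataLoader, generate_segmentation

# Generate a small pool of synthetic segmentations and reuse them across datums,
# reassigning the uid per datum, as the benchmark functions above do.
pool = [
    generate_segmentation(
        datum_uid=str(i),
        number_of_unique_labels=3,  # must be >= 1; zero raises a ValueError
        mask_height=100,
        mask_width=100,
    )
    for i in range(10)
]

loader = DataLoader()
for datum_idx in range(100):
    segmentation = pool[datum_idx % 10]
    segmentation.uid = str(datum_idx)  # make each datum uid unique
    loader.add_data([segmentation])

evaluator = loader.finalize()
metrics = evaluator.evaluate()
print(metrics)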