From 9b74e1ffb8f10b3c3af2d354dabde8a0380ca808 Mon Sep 17 00:00:00 2001
From: Charles Zaloom <38677807+czaloom@users.noreply.github.com>
Date: Thu, 11 Jul 2024 15:40:00 -0500
Subject: [PATCH] Metric Ingestion Patch (#668)

---
 .../backend/metrics/test_classification.py    |   7 +
 api/valor_api/backend/core/evaluation.py      |   2 -
 .../backend/metrics/classification.py         |  30 +--
 api/valor_api/backend/metrics/detection.py    |  20 +-
 api/valor_api/backend/metrics/metric_utils.py | 184 ++++++++++--------
 api/valor_api/backend/metrics/segmentation.py |  20 +-
 6 files changed, 122 insertions(+), 141 deletions(-)

diff --git a/api/tests/functional-tests/backend/metrics/test_classification.py b/api/tests/functional-tests/backend/metrics/test_classification.py
index a1feca59c..d5586704d 100644
--- a/api/tests/functional-tests/backend/metrics/test_classification.py
+++ b/api/tests/functional-tests/backend/metrics/test_classification.py
@@ -179,6 +179,7 @@ def test_compute_confusion_matrix_at_grouper_key(
     )
 
     grouper_mappings = create_grouper_mappings(
+        db=db,
         labels=labels,
         label_map=None,
         evaluation_type=enums.TaskType.CLASSIFICATION,
@@ -405,6 +406,7 @@ def test_compute_confusion_matrix_at_grouper_key_and_filter(
     )
 
     grouper_mappings = create_grouper_mappings(
+        db=db,
         labels=labels,
         label_map=None,
         evaluation_type=enums.TaskType.CLASSIFICATION,
@@ -555,6 +557,7 @@ def test_compute_confusion_matrix_at_grouper_key_using_label_map(
     )
 
     grouper_mappings = create_grouper_mappings(
+        db=db,
         labels=labels,
         label_map=label_map,
         evaluation_type=enums.TaskType.CLASSIFICATION,
@@ -734,6 +737,7 @@ def test_compute_roc_auc(
     )
 
     grouper_mappings = create_grouper_mappings(
+        db=db,
         labels=labels,
         label_map=None,
         evaluation_type=enums.TaskType.CLASSIFICATION,
@@ -843,6 +847,7 @@ def test_compute_roc_auc_groupby_metadata(
     )
 
     grouper_mappings = create_grouper_mappings(
+        db=db,
         labels=labels,
         label_map=None,
         evaluation_type=enums.TaskType.CLASSIFICATION,
@@ -951,6 +956,7 @@ def test_compute_roc_auc_with_label_map(
     )
 
     grouper_mappings = create_grouper_mappings(
+        db=db,
         labels=labels,
         label_map=label_map,
         evaluation_type=enums.TaskType.CLASSIFICATION,
@@ -1262,6 +1268,7 @@ def test__compute_curves(
     )
 
     grouper_mappings = create_grouper_mappings(
+        db=db,
         labels=labels,
         label_map=None,
         evaluation_type=enums.TaskType.CLASSIFICATION,
diff --git a/api/valor_api/backend/core/evaluation.py b/api/valor_api/backend/core/evaluation.py
index eefa3053a..85106a7b1 100644
--- a/api/valor_api/backend/core/evaluation.py
+++ b/api/valor_api/backend/core/evaluation.py
@@ -243,12 +243,10 @@ def _validate_evaluation_filter(
 
     # generate filters
     groundtruth_filter, prediction_filter = prepare_filter_for_evaluation(
-        db=db,
         filters=filters,
         dataset_names=evaluation.dataset_names,
         model_name=evaluation.model_name,
         task_type=parameters.task_type,
-        label_map=parameters.label_map,
     )
 
     datasets = (
diff --git a/api/valor_api/backend/metrics/classification.py b/api/valor_api/backend/metrics/classification.py
index 6a52f180f..35a6dc717 100644
--- a/api/valor_api/backend/metrics/classification.py
+++ b/api/valor_api/backend/metrics/classification.py
@@ -10,9 +10,8 @@ from valor_api import enums, schemas
 from valor_api.backend import core, models
 from valor_api.backend.metrics.metric_utils import (
+    commit_results,
     create_grouper_mappings,
-    create_metric_mappings,
-    get_or_create_row,
     log_evaluation_duration,
     log_evaluation_item_counts,
     prepare_filter_for_evaluation,
@@ -1035,6 +1034,7 @@ def _compute_clf_metrics(
     )
 
     grouper_mappings = create_grouper_mappings(
+        db=db,
         labels=labels,
         label_map=label_map,
         evaluation_type=enums.TaskType.CLASSIFICATION,
@@ -1089,12 +1089,10 @@ def compute_clf_metrics(
     # unpack filters and params
     parameters = schemas.EvaluationParameters(**evaluation.parameters)
     groundtruth_filter, prediction_filter = prepare_filter_for_evaluation(
-        db=db,
         filters=schemas.Filter(**evaluation.filters),
         dataset_names=evaluation.dataset_names,
         model_name=evaluation.model_name,
         task_type=parameters.task_type,
-        label_map=parameters.label_map,
     )
 
     log_evaluation_item_counts(
@@ -1120,36 +1118,20 @@ def compute_clf_metrics(
         metrics_to_return=parameters.metrics_to_return,
     )
 
-    confusion_matrices_mappings = create_metric_mappings(
+    # add confusion matrices to database
+    commit_results(
         db=db,
         metrics=confusion_matrices,
         evaluation_id=evaluation.id,
     )
 
-    for mapping in confusion_matrices_mappings:
-        get_or_create_row(
-            db,
-            models.ConfusionMatrix,
-            mapping,
-        )
-
-    metric_mappings = create_metric_mappings(
+    # add metrics to database
+    commit_results(
         db=db,
         metrics=metrics,
         evaluation_id=evaluation.id,
     )
 
-    for mapping in metric_mappings:
-        # ignore value since the other columns are unique identifiers
-        # and have empirically noticed value can slightly change due to floating
-        # point errors
-        get_or_create_row(
-            db,
-            models.Metric,
-            mapping,
-            columns_to_ignore=["value"],
-        )
-
     log_evaluation_duration(
         evaluation=evaluation,
         db=db,
diff --git a/api/valor_api/backend/metrics/detection.py b/api/valor_api/backend/metrics/detection.py
index 0bdfd4b0c..370189f1e 100644
--- a/api/valor_api/backend/metrics/detection.py
+++ b/api/valor_api/backend/metrics/detection.py
@@ -13,9 +13,8 @@ from valor_api import enums, schemas
 from valor_api.backend import core, models
 from valor_api.backend.metrics.metric_utils import (
+    commit_results,
     create_grouper_mappings,
-    create_metric_mappings,
-    get_or_create_row,
     log_evaluation_duration,
     log_evaluation_item_counts,
     prepare_filter_for_evaluation,
@@ -739,6 +738,7 @@ def _annotation_type_to_geojson(
     )
 
     grouper_mappings = create_grouper_mappings(
+        db=db,
         labels=labels,
         label_map=parameters.label_map,
         evaluation_type=enums.TaskType.OBJECT_DETECTION,
@@ -1108,6 +1108,7 @@ def _annotation_type_to_geojson(
     )
 
     grouper_mappings = create_grouper_mappings(
+        db=db,
         labels=labels,
         label_map=parameters.label_map,
         evaluation_type=enums.TaskType.OBJECT_DETECTION,
@@ -1641,12 +1642,10 @@ def compute_detection_metrics(*_, db: Session, evaluation_id: int):
     # unpack filters and params
     parameters = schemas.EvaluationParameters(**evaluation.parameters)
     groundtruth_filter, prediction_filter = prepare_filter_for_evaluation(
-        db=db,
         filters=schemas.Filter(**evaluation.filters),
         dataset_names=evaluation.dataset_names,
         model_name=evaluation.model_name,
         task_type=parameters.task_type,
-        label_map=parameters.label_map,
     )
 
     log_evaluation_item_counts(
@@ -1755,22 +1754,13 @@ def compute_detection_metrics(*_, db: Session, evaluation_id: int):
             target_type=target_type,
         )
 
-    metric_mappings = create_metric_mappings(
+    # add metrics to database
+    commit_results(
         db=db,
         metrics=metrics,
         evaluation_id=evaluation_id,
     )
 
-    for mapping in metric_mappings:
-        # ignore value since the other columns are unique identifiers
-        # and have empircally noticed value can slightly change due to floating
-        # point errors
-
-        get_or_create_row(
-            db, models.Metric, mapping, columns_to_ignore=["value"]
-        )
-    db.commit()
-
     log_evaluation_duration(
         evaluation=evaluation,
         db=db,
diff --git a/api/valor_api/backend/metrics/metric_utils.py b/api/valor_api/backend/metrics/metric_utils.py
index 6ea9a9e06..28adaada6 100644
--- a/api/valor_api/backend/metrics/metric_utils.py
+++ b/api/valor_api/backend/metrics/metric_utils.py
@@ -1,7 +1,7 @@
 from collections import defaultdict
 from typing import Callable, Sequence
 
-from sqlalchemy import select
+from sqlalchemy import and_, or_, select
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.orm import Session
 from sqlalchemy.sql import func
@@ -105,6 +105,7 @@ def _create_classification_grouper_mappings(
 
 
 def create_grouper_mappings(
+    db: Session,
     labels: list,
     label_map: LabelMapType | None,
     evaluation_type: enums.TaskType,
@@ -139,64 +140,46 @@ def create_grouper_mappings(
     )
 
     # create a map of labels to groupers; will be empty if the user didn't pass a label_map
-    mapping_dict = (
-        {tuple(label): tuple(grouper) for label, grouper in label_map}
-        if label_map
-        else {}
-    )
+    grouper_key_to_value = defaultdict(list)
+    mapping_dict = dict()
+    if label_map:
+        for label, grouper in label_map:
+            mapping_dict[tuple(label)] = tuple(grouper)
+            grouper_key_to_value[grouper[0]].append(grouper[1])
+
+    # add grouper labels to database (if they don't exist)
+    grouper_labels = set(mapping_dict.values())
+    existing_labels = {
+        (row.key, row.value)
+        for row in (
+            db.query(models.Label)
+            .where(
+                or_(
+                    *[
+                        and_(
+                            models.Label.key == key,
+                            models.Label.value.in_(values),
+                        )
+                        for key, values in grouper_key_to_value.items()
+                    ]
+                )
+            )
+            .all()
+        )
+    }
+    labels_to_create = list(grouper_labels - existing_labels)
+    core.create_labels(
+        db=db,
+        labels=[
+            schemas.Label(key=key, value=value)
+            for key, value in labels_to_create
+        ],
+    )
 
     return mapping_functions[evaluation_type](mapping_dict, labels)
 
 
-def get_or_create_row(
-    db: Session,
-    model_class: type,
-    mapping: dict,
-    columns_to_ignore: list[str] | None = None,
-):
-    """
-    Tries to get the row defined by mapping. If that exists then its mapped object is returned. Otherwise a row is created by `mapping` and the newly created object is returned.
-
-    Parameters
-    ----------
-    db : Session
-        The database Session to query against.
-    model_class : type
-        The type of model.
-    mapping : dict
-        The mapping to use when creating the row.
-    columns_to_ignore : List[str]
-        Specifies any columns to ignore in forming the WHERE expression. This can be used for numerical columns that might slightly differ but are essentially the same.
-
-    Returns
-    ----------
-    any
-        A model class object.
-    """
-    columns_to_ignore = columns_to_ignore or []
-
-    # create the query from the mapping
-    where_expressions = [
-        (getattr(model_class, k) == v)
-        for k, v in mapping.items()
-        if k not in columns_to_ignore
-    ]
-    where_expression = where_expressions[0]
-    for exp in where_expressions[1:]:
-        where_expression = where_expression & exp
-
-    db_element = db.scalar(select(model_class).where(where_expression))
-
-    if not db_element:
-        db_element = model_class(**mapping)
-        db.add(db_element)
-        db.flush()
-        db.commit()
-
-    return db_element
-
-
-def create_metric_mappings(
+def commit_results(
     db: Session,
     metrics: Sequence[
         schemas.APMetric
         | schemas.ARMetric
         | schemas.APMetricAveragedOverIOUs
         | schemas.PrecisionMetric
         | schemas.RecallMetric
         | schemas.F1Metric
         | schemas.IOUMetric
         | schemas.ConfusionMatrix
         | schemas.DetailedPrecisionRecallCurve
     ],
     evaluation_id: int,
-) -> list[dict]:
+):
     """
     Create metric mappings from a list of metrics.
 
     Parameters
     ----------
     db : Session
         The database Session to query against.
     metrics : List
         A list of metrics to create mappings for.
     evaluation_id : int
         The id of the evaluation job.
-
-    Returns
-    ----------
-    List[Dict]
-        A list of metric mappings.
""" - ret = [] + + # cache labels for metrics that use them + cached_labels = defaultdict(list) for metric in metrics: if isinstance( metric, @@ -249,29 +229,69 @@ def create_metric_mappings( schemas.IOUMetric, ), ): - label = core.fetch_label( - db=db, - label=metric.label, + cached_labels[metric.label.key].append(metric.label.value) + cached_label_to_id = { + schemas.Label(key=row.key, value=row.value): row.id + for row in ( + db.query(models.Label) + .where( + or_( + *[ + and_( + models.Label.key == key, + models.Label.value.in_(values), + ) + for key, values in cached_labels.items() + ] + ) ) + .all() + ) + } - # create the label in the database if it doesn't exist - # this is useful if the user maps existing labels to a non-existant grouping label - if not label: - label_map = core.create_labels(db=db, labels=[metric.label]) - label_id = list(label_map.values())[0] - else: - label_id = label.id - - ret.append( - metric.db_mapping( - label_id=label_id, - evaluation_id=evaluation_id, + metric_rows = [] + confusion_rows = [] + for metric in metrics: + if isinstance( + metric, + ( + schemas.APMetric, + schemas.ARMetric, + schemas.APMetricAveragedOverIOUs, + schemas.PrecisionMetric, + schemas.RecallMetric, + schemas.F1Metric, + schemas.IOUMetric, + ), + ): + metric_rows.append( + models.Metric( + **metric.db_mapping( + label_id=cached_label_to_id[metric.label], + evaluation_id=evaluation_id, + ) + ) + ) + elif isinstance(metric, schemas.ConfusionMatrix): + confusion_rows.append( + models.ConfusionMatrix( + **metric.db_mapping(evaluation_id=evaluation_id) ) ) else: - ret.append(metric.db_mapping(evaluation_id=evaluation_id)) + metric_rows.append( + models.Metric(**metric.db_mapping(evaluation_id=evaluation_id)) + ) - return ret + try: + if metric_rows: + db.add_all(metric_rows) + if confusion_rows: + db.add_all(confusion_rows) + db.commit() + except IntegrityError as e: + db.rollback() + raise e def log_evaluation_duration( @@ -444,12 +464,10 @@ def wrapper(*args, **kwargs): def prepare_filter_for_evaluation( - db: Session, filters: schemas.Filter, dataset_names: list[str], model_name: str, task_type: enums.TaskType, - label_map: LabelMapType | None = None, ) -> tuple[schemas.Filter, schemas.Filter]: """ Prepares the filter for use by an evaluation method. @@ -458,8 +476,6 @@ def prepare_filter_for_evaluation( Parameters ---------- - db : Session - The database session. filters : Filter The data filter. dataset_names : list[str] @@ -468,8 +484,6 @@ def prepare_filter_for_evaluation( A model name to filter by. task_type : TaskType A task type to filter by. - label_map : LabelMapType, optional - An optional label mapping. 
 
     Returns
     -------
diff --git a/api/valor_api/backend/metrics/segmentation.py b/api/valor_api/backend/metrics/segmentation.py
index b4fea07b0..064c9e9b0 100644
--- a/api/valor_api/backend/metrics/segmentation.py
+++ b/api/valor_api/backend/metrics/segmentation.py
@@ -8,9 +8,8 @@ from valor_api import enums, schemas
 from valor_api.backend import core, models
 from valor_api.backend.metrics.metric_utils import (
+    commit_results,
     create_grouper_mappings,
-    create_metric_mappings,
-    get_or_create_row,
     log_evaluation_duration,
     log_evaluation_item_counts,
     prepare_filter_for_evaluation,
@@ -178,6 +177,7 @@ def _compute_segmentation_metrics(
     )
 
     grouper_mappings = create_grouper_mappings(
+        db=db,
         labels=labels,
         label_map=parameters.label_map,
         evaluation_type=enums.TaskType.SEMANTIC_SEGMENTATION,
@@ -272,12 +272,10 @@ def compute_semantic_segmentation_metrics(
     # unpack filters and params
     parameters = schemas.EvaluationParameters(**evaluation.parameters)
     groundtruth_filter, prediction_filter = prepare_filter_for_evaluation(
-        db=db,
         filters=schemas.Filter(**evaluation.filters),
         dataset_names=evaluation.dataset_names,
         model_name=evaluation.model_name,
         task_type=parameters.task_type,
-        label_map=parameters.label_map,
     )
 
     log_evaluation_item_counts(
@@ -293,17 +291,9 @@ def compute_semantic_segmentation_metrics(
         prediction_filter=prediction_filter,
         groundtruth_filter=groundtruth_filter,
     )
-    metric_mappings = create_metric_mappings(db, metrics, evaluation_id)
-    for mapping in metric_mappings:
-        # ignore value since the other columns are unique identifiers
-        # and have empirically noticed value can slightly change due to floating
-        # point errors
-        get_or_create_row(
-            db,
-            models.Metric,
-            mapping,
-            columns_to_ignore=["value"],
-        )
+
+    # add metrics to database
+    commit_results(db, metrics, evaluation_id)
 
     log_evaluation_duration(
         evaluation=evaluation,
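
Not part of the patch itself: the short sketch below illustrates, under stated assumptions, how the consolidated ingestion flow introduced here is meant to be called. It assumes a live SQLAlchemy Session and metric objects already produced by one of the repository's _compute_* helpers; the wrapper name ingest_computed_metrics is hypothetical and exists only for illustration.

from sqlalchemy.orm import Session

from valor_api.backend.metrics.metric_utils import commit_results


def ingest_computed_metrics(db: Session, metrics: list, evaluation_id: int) -> None:
    # After this patch, each compute_* function hands its schema metric objects
    # (e.g. PrecisionMetric, ConfusionMatrix) straight to commit_results, which
    # bulk-adds models.Metric / models.ConfusionMatrix rows and commits once,
    # instead of issuing one get_or_create_row query per metric.
    commit_results(db=db, metrics=metrics, evaluation_id=evaluation_id)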