From 9b74e1ffb8f10b3c3af2d354dabde8a0380ca808 Mon Sep 17 00:00:00 2001
From: Charles Zaloom <38677807+czaloom@users.noreply.github.com>
Date: Thu, 11 Jul 2024 15:40:00 -0500
Subject: [PATCH] Metric Ingestion Patch (#668)

---
 .../backend/metrics/test_classification.py    |   7 +
 api/valor_api/backend/core/evaluation.py      |   2 -
 .../backend/metrics/classification.py         |  30 +--
 api/valor_api/backend/metrics/detection.py    |  20 +-
 api/valor_api/backend/metrics/metric_utils.py | 184 ++++++++++--------
 api/valor_api/backend/metrics/segmentation.py |  20 +-
 6 files changed, 122 insertions(+), 141 deletions(-)

diff --git a/api/tests/functional-tests/backend/metrics/test_classification.py b/api/tests/functional-tests/backend/metrics/test_classification.py
index a1feca59c..d5586704d 100644
--- a/api/tests/functional-tests/backend/metrics/test_classification.py
+++ b/api/tests/functional-tests/backend/metrics/test_classification.py
@@ -179,6 +179,7 @@ def test_compute_confusion_matrix_at_grouper_key(
     )
 
     grouper_mappings = create_grouper_mappings(
+        db=db,
         labels=labels,
         label_map=None,
         evaluation_type=enums.TaskType.CLASSIFICATION,
@@ -405,6 +406,7 @@ def test_compute_confusion_matrix_at_grouper_key_and_filter(
     )
 
     grouper_mappings = create_grouper_mappings(
+        db=db,
         labels=labels,
         label_map=None,
         evaluation_type=enums.TaskType.CLASSIFICATION,
@@ -555,6 +557,7 @@ def test_compute_confusion_matrix_at_grouper_key_using_label_map(
     )
 
     grouper_mappings = create_grouper_mappings(
+        db=db,
         labels=labels,
         label_map=label_map,
         evaluation_type=enums.TaskType.CLASSIFICATION,
@@ -734,6 +737,7 @@ def test_compute_roc_auc(
     )
 
     grouper_mappings = create_grouper_mappings(
+        db=db,
         labels=labels,
         label_map=None,
         evaluation_type=enums.TaskType.CLASSIFICATION,
@@ -843,6 +847,7 @@ def test_compute_roc_auc_groupby_metadata(
     )
 
     grouper_mappings = create_grouper_mappings(
+        db=db,
         labels=labels,
         label_map=None,
         evaluation_type=enums.TaskType.CLASSIFICATION,
@@ -951,6 +956,7 @@ def test_compute_roc_auc_with_label_map(
     )
 
     grouper_mappings = create_grouper_mappings(
+        db=db,
         labels=labels,
         label_map=label_map,
         evaluation_type=enums.TaskType.CLASSIFICATION,
@@ -1262,6 +1268,7 @@ def test__compute_curves(
     )
 
     grouper_mappings = create_grouper_mappings(
+        db=db,
         labels=labels,
         label_map=None,
         evaluation_type=enums.TaskType.CLASSIFICATION,
diff --git a/api/valor_api/backend/core/evaluation.py b/api/valor_api/backend/core/evaluation.py
index eefa3053a..85106a7b1 100644
--- a/api/valor_api/backend/core/evaluation.py
+++ b/api/valor_api/backend/core/evaluation.py
@@ -243,12 +243,10 @@ def _validate_evaluation_filter(
 
     # generate filters
     groundtruth_filter, prediction_filter = prepare_filter_for_evaluation(
-        db=db,
         filters=filters,
         dataset_names=evaluation.dataset_names,
         model_name=evaluation.model_name,
         task_type=parameters.task_type,
-        label_map=parameters.label_map,
     )
 
     datasets = (
diff --git a/api/valor_api/backend/metrics/classification.py b/api/valor_api/backend/metrics/classification.py
index 6a52f180f..35a6dc717 100644
--- a/api/valor_api/backend/metrics/classification.py
+++ b/api/valor_api/backend/metrics/classification.py
@@ -10,9 +10,8 @@ from valor_api import enums, schemas
 from valor_api.backend import core, models
 from valor_api.backend.metrics.metric_utils import (
+    commit_results,
     create_grouper_mappings,
-    create_metric_mappings,
-    get_or_create_row,
     log_evaluation_duration,
     log_evaluation_item_counts,
     prepare_filter_for_evaluation,
@@ -1035,6 +1034,7 @@ def _compute_clf_metrics(
     )
 
     grouper_mappings = create_grouper_mappings(
+        db=db,
         labels=labels,
         label_map=label_map,
         evaluation_type=enums.TaskType.CLASSIFICATION,
@@ -1089,12 +1089,10 @@ def compute_clf_metrics(
     # unpack filters and params
     parameters = schemas.EvaluationParameters(**evaluation.parameters)
     groundtruth_filter, prediction_filter = prepare_filter_for_evaluation(
-        db=db,
         filters=schemas.Filter(**evaluation.filters),
         dataset_names=evaluation.dataset_names,
         model_name=evaluation.model_name,
         task_type=parameters.task_type,
-        label_map=parameters.label_map,
     )
 
     log_evaluation_item_counts(
@@ -1120,36 +1118,20 @@ def compute_clf_metrics(
         metrics_to_return=parameters.metrics_to_return,
     )
 
-    confusion_matrices_mappings = create_metric_mappings(
+    # add confusion matrices to database
+    commit_results(
         db=db,
         metrics=confusion_matrices,
         evaluation_id=evaluation.id,
     )
 
-    for mapping in confusion_matrices_mappings:
-        get_or_create_row(
-            db,
-            models.ConfusionMatrix,
-            mapping,
-        )
-
-    metric_mappings = create_metric_mappings(
+    # add metrics to database
+    commit_results(
         db=db,
         metrics=metrics,
         evaluation_id=evaluation.id,
     )
 
-    for mapping in metric_mappings:
-        # ignore value since the other columns are unique identifiers
-        # and have empirically noticed value can slightly change due to floating
-        # point errors
-        get_or_create_row(
-            db,
-            models.Metric,
-            mapping,
-            columns_to_ignore=["value"],
-        )
-
     log_evaluation_duration(
         evaluation=evaluation,
         db=db,
diff --git a/api/valor_api/backend/metrics/detection.py b/api/valor_api/backend/metrics/detection.py
index 0bdfd4b0c..370189f1e 100644
--- a/api/valor_api/backend/metrics/detection.py
+++ b/api/valor_api/backend/metrics/detection.py
@@ -13,9 +13,8 @@ from valor_api import enums, schemas
 from valor_api.backend import core, models
 from valor_api.backend.metrics.metric_utils import (
+    commit_results,
     create_grouper_mappings,
-    create_metric_mappings,
-    get_or_create_row,
     log_evaluation_duration,
     log_evaluation_item_counts,
     prepare_filter_for_evaluation,
@@ -739,6 +738,7 @@ def _annotation_type_to_geojson(
     )
 
     grouper_mappings = create_grouper_mappings(
+        db=db,
         labels=labels,
         label_map=parameters.label_map,
         evaluation_type=enums.TaskType.OBJECT_DETECTION,
@@ -1108,6 +1108,7 @@ def _annotation_type_to_geojson(
     )
 
     grouper_mappings = create_grouper_mappings(
+        db=db,
         labels=labels,
         label_map=parameters.label_map,
         evaluation_type=enums.TaskType.OBJECT_DETECTION,
@@ -1641,12 +1642,10 @@ def compute_detection_metrics(*_, db: Session, evaluation_id: int):
     # unpack filters and params
     parameters = schemas.EvaluationParameters(**evaluation.parameters)
     groundtruth_filter, prediction_filter = prepare_filter_for_evaluation(
-        db=db,
         filters=schemas.Filter(**evaluation.filters),
         dataset_names=evaluation.dataset_names,
         model_name=evaluation.model_name,
         task_type=parameters.task_type,
-        label_map=parameters.label_map,
     )
 
     log_evaluation_item_counts(
@@ -1755,22 +1754,13 @@ def compute_detection_metrics(*_, db: Session, evaluation_id: int):
             target_type=target_type,
         )
 
-    metric_mappings = create_metric_mappings(
+    # add metrics to database
+    commit_results(
         db=db,
         metrics=metrics,
         evaluation_id=evaluation_id,
     )
 
-    for mapping in metric_mappings:
-        # ignore value since the other columns are unique identifiers
-        # and have empircally noticed value can slightly change due to floating
-        # point errors
-
-        get_or_create_row(
-            db, models.Metric, mapping, columns_to_ignore=["value"]
-        )
-    db.commit()
-
     log_evaluation_duration(
         evaluation=evaluation,
         db=db,
diff --git a/api/valor_api/backend/metrics/metric_utils.py b/api/valor_api/backend/metrics/metric_utils.py
index 6ea9a9e06..28adaada6 100644
--- a/api/valor_api/backend/metrics/metric_utils.py
+++ b/api/valor_api/backend/metrics/metric_utils.py
@@ -1,7 +1,7 @@
 from collections import defaultdict
 from typing import Callable, Sequence
 
-from sqlalchemy import select
+from sqlalchemy import and_, or_, select
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.orm import Session
 from sqlalchemy.sql import func
@@ -105,6 +105,7 @@ def _create_classification_grouper_mappings(
 
 
 def create_grouper_mappings(
+    db: Session,
     labels: list,
     label_map: LabelMapType | None,
     evaluation_type: enums.TaskType,
@@ -139,64 +140,46 @@ def create_grouper_mappings(
     )
 
     # create a map of labels to groupers; will be empty if the user didn't pass a label_map
-    mapping_dict = (
-        {tuple(label): tuple(grouper) for label, grouper in label_map}
-        if label_map
-        else {}
-    )
+    grouper_key_to_value = defaultdict(list)
+    mapping_dict = dict()
+    if label_map:
+        for label, grouper in label_map:
+            mapping_dict[tuple(label)] = tuple(grouper)
+            grouper_key_to_value[grouper[0]].append(grouper[1])
+
+    # add grouper labels to database (if they don't exist)
+    grouper_labels = set(mapping_dict.values())
+    existing_labels = {
+        (row.key, row.value)
+        for row in (
+            db.query(models.Label)
+            .where(
+                or_(
+                    *[
+                        and_(
+                            models.Label.key == key,
+                            models.Label.value.in_(values),
+                        )
+                        for key, values in grouper_key_to_value.items()
+                    ]
+                )
+            )
+            .all()
+        )
+    }
+    labels_to_create = list(grouper_labels - existing_labels)
+    core.create_labels(
+        db=db,
+        labels=[
+            schemas.Label(key=key, value=value)
+            for key, value in labels_to_create
+        ],
+    )
 
     return mapping_functions[evaluation_type](mapping_dict, labels)
 
 
-def get_or_create_row(
-    db: Session,
-    model_class: type,
-    mapping: dict,
-    columns_to_ignore: list[str] | None = None,
-):
-    """
-    Tries to get the row defined by mapping. If that exists then its mapped object is returned. Otherwise a row is created by `mapping` and the newly created object is returned.
-
-    Parameters
-    ----------
-    db : Session
-        The database Session to query against.
-    model_class : type
-        The type of model.
-    mapping : dict
-        The mapping to use when creating the row.
-    columns_to_ignore : List[str]
-        Specifies any columns to ignore in forming the WHERE expression. This can be used for numerical columns that might slightly differ but are essentially the same.
-
-    Returns
-    ----------
-    any
-        A model class object.
-    """
-    columns_to_ignore = columns_to_ignore or []
-
-    # create the query from the mapping
-    where_expressions = [
-        (getattr(model_class, k) == v)
-        for k, v in mapping.items()
-        if k not in columns_to_ignore
-    ]
-    where_expression = where_expressions[0]
-    for exp in where_expressions[1:]:
-        where_expression = where_expression & exp
-
-    db_element = db.scalar(select(model_class).where(where_expression))
-
-    if not db_element:
-        db_element = model_class(**mapping)
-        db.add(db_element)
-        db.flush()
-        db.commit()
-
-    return db_element
-
-
-def create_metric_mappings(
+def commit_results(
     db: Session,
     metrics: Sequence[
         schemas.APMetric
         | schemas.ARMetric
         | schemas.APMetricAveragedOverIOUs
         | schemas.PrecisionMetric
         | schemas.RecallMetric
         | schemas.F1Metric
         | schemas.IOUMetric
         | schemas.ConfusionMatrix
         | schemas.DetailedPrecisionRecallCurve
     ],
     evaluation_id: int,
-) -> list[dict]:
+):
     """
     Create metric mappings from a list of metrics.
 
     Parameters
     ----------
     db : Session
         The database Session to query against.
     metrics : List
         A list of metrics to create mappings for.
     evaluation_id : int
         The id of the evaluation job.
-
-    Returns
-    ----------
-    List[Dict]
-        A list of metric mappings.
""" - ret = [] + + # cache labels for metrics that use them + cached_labels = defaultdict(list) for metric in metrics: if isinstance( metric, @@ -249,29 +229,69 @@ def create_metric_mappings( schemas.IOUMetric, ), ): - label = core.fetch_label( - db=db, - label=metric.label, + cached_labels[metric.label.key].append(metric.label.value) + cached_label_to_id = { + schemas.Label(key=row.key, value=row.value): row.id + for row in ( + db.query(models.Label) + .where( + or_( + *[ + and_( + models.Label.key == key, + models.Label.value.in_(values), + ) + for key, values in cached_labels.items() + ] + ) ) + .all() + ) + } - # create the label in the database if it doesn't exist - # this is useful if the user maps existing labels to a non-existant grouping label - if not label: - label_map = core.create_labels(db=db, labels=[metric.label]) - label_id = list(label_map.values())[0] - else: - label_id = label.id - - ret.append( - metric.db_mapping( - label_id=label_id, - evaluation_id=evaluation_id, + metric_rows = [] + confusion_rows = [] + for metric in metrics: + if isinstance( + metric, + ( + schemas.APMetric, + schemas.ARMetric, + schemas.APMetricAveragedOverIOUs, + schemas.PrecisionMetric, + schemas.RecallMetric, + schemas.F1Metric, + schemas.IOUMetric, + ), + ): + metric_rows.append( + models.Metric( + **metric.db_mapping( + label_id=cached_label_to_id[metric.label], + evaluation_id=evaluation_id, + ) + ) + ) + elif isinstance(metric, schemas.ConfusionMatrix): + confusion_rows.append( + models.ConfusionMatrix( + **metric.db_mapping(evaluation_id=evaluation_id) ) ) else: - ret.append(metric.db_mapping(evaluation_id=evaluation_id)) + metric_rows.append( + models.Metric(**metric.db_mapping(evaluation_id=evaluation_id)) + ) - return ret + try: + if metric_rows: + db.add_all(metric_rows) + if confusion_rows: + db.add_all(confusion_rows) + db.commit() + except IntegrityError as e: + db.rollback() + raise e def log_evaluation_duration( @@ -444,12 +464,10 @@ def wrapper(*args, **kwargs): def prepare_filter_for_evaluation( - db: Session, filters: schemas.Filter, dataset_names: list[str], model_name: str, task_type: enums.TaskType, - label_map: LabelMapType | None = None, ) -> tuple[schemas.Filter, schemas.Filter]: """ Prepares the filter for use by an evaluation method. @@ -458,8 +476,6 @@ def prepare_filter_for_evaluation( Parameters ---------- - db : Session - The database session. filters : Filter The data filter. dataset_names : list[str] @@ -468,8 +484,6 @@ def prepare_filter_for_evaluation( A model name to filter by. task_type : TaskType A task type to filter by. - label_map : LabelMapType, optional - An optional label mapping. 
 
     Returns
     -------
diff --git a/api/valor_api/backend/metrics/segmentation.py b/api/valor_api/backend/metrics/segmentation.py
index b4fea07b0..064c9e9b0 100644
--- a/api/valor_api/backend/metrics/segmentation.py
+++ b/api/valor_api/backend/metrics/segmentation.py
@@ -8,9 +8,8 @@ from valor_api import enums, schemas
 from valor_api.backend import core, models
 from valor_api.backend.metrics.metric_utils import (
+    commit_results,
     create_grouper_mappings,
-    create_metric_mappings,
-    get_or_create_row,
     log_evaluation_duration,
     log_evaluation_item_counts,
     prepare_filter_for_evaluation,
@@ -178,6 +177,7 @@ def _compute_segmentation_metrics(
     )
 
     grouper_mappings = create_grouper_mappings(
+        db=db,
         labels=labels,
         label_map=parameters.label_map,
         evaluation_type=enums.TaskType.SEMANTIC_SEGMENTATION,
@@ -272,12 +272,10 @@ def compute_semantic_segmentation_metrics(
     # unpack filters and params
     parameters = schemas.EvaluationParameters(**evaluation.parameters)
     groundtruth_filter, prediction_filter = prepare_filter_for_evaluation(
-        db=db,
         filters=schemas.Filter(**evaluation.filters),
         dataset_names=evaluation.dataset_names,
         model_name=evaluation.model_name,
         task_type=parameters.task_type,
-        label_map=parameters.label_map,
     )
 
     log_evaluation_item_counts(
@@ -293,17 +291,9 @@ def compute_semantic_segmentation_metrics(
         prediction_filter=prediction_filter,
         groundtruth_filter=groundtruth_filter,
     )
-    metric_mappings = create_metric_mappings(db, metrics, evaluation_id)
-    for mapping in metric_mappings:
-        # ignore value since the other columns are unique identifiers
-        # and have empirically noticed value can slightly change due to floating
-        # point errors
-        get_or_create_row(
-            db,
-            models.Metric,
-            mapping,
-            columns_to_ignore=["value"],
-        )
+
+    # add metrics to database
+    commit_results(db, metrics, evaluation_id)
 
     log_evaluation_duration(
         evaluation=evaluation,
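
Not part of the patch itself: the short sketch below illustrates, under stated assumptions, how the consolidated ingestion flow introduced here is meant to be called. It assumes a live SQLAlchemy Session and metric objects already produced by one of the repository's _compute_* helpers; the wrapper name ingest_computed_metrics is hypothetical and exists only for illustration.

from sqlalchemy.orm import Session

from valor_api.backend.metrics.metric_utils import commit_results


def ingest_computed_metrics(db: Session, metrics: list, evaluation_id: int) -> None:
    # After this patch, each compute_* function hands its schema metric objects
    # (e.g. PrecisionMetric, ConfusionMatrix) straight to commit_results, which
    # bulk-adds models.Metric / models.ConfusionMatrix rows and commits once,
    # instead of issuing one get_or_create_row query per metric.
    commit_results(db=db, metrics=metrics, evaluation_id=evaluation_id)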