From 5eb82359ca44bc20a9e5e403989159dff34ef3c4 Mon Sep 17 00:00:00 2001 From: Charles Zaloom <38677807+czaloom@users.noreply.github.com> Date: Fri, 14 Jun 2024 17:02:41 -0500 Subject: [PATCH] Add Delete Evaluation Endpoint (#627) --- .../crud/test_evaluation_crud.py | 175 +++++++++++++++++- api/valor_api/backend/__init__.py | 2 + api/valor_api/backend/core/__init__.py | 2 + api/valor_api/backend/core/evaluation.py | 35 ++++ api/valor_api/crud/__init__.py | 3 +- api/valor_api/crud/_delete.py | 19 ++ api/valor_api/main.py | 35 ++++ client/valor/client.py | 13 ++ client/valor/coretypes.py | 24 +++ .../evaluations/test_evaluation_crud.py | 155 ++++++++++++++++ 10 files changed, 461 insertions(+), 2 deletions(-) create mode 100644 integration_tests/client/evaluations/test_evaluation_crud.py diff --git a/api/tests/functional-tests/crud/test_evaluation_crud.py b/api/tests/functional-tests/crud/test_evaluation_crud.py index 7636f76f8..482bb2bfc 100644 --- a/api/tests/functional-tests/crud/test_evaluation_crud.py +++ b/api/tests/functional-tests/crud/test_evaluation_crud.py @@ -1,9 +1,10 @@ import pytest +from sqlalchemy import func, select from sqlalchemy.exc import IntegrityError from sqlalchemy.orm import Session from valor_api import crud, enums, exceptions, schemas -from valor_api.backend import core +from valor_api.backend import core, models def test_evaluation_creation_exceptions(db: Session): @@ -226,3 +227,175 @@ def test_restart_failed_evaluation(db: Session): assert len(evaluations4) == 1 assert evaluations4[0].status == enums.EvaluationStatus.DONE assert evaluations4[0].id == evaluations1[0].id + + +@pytest.fixture +def create_evaluations(db: Session): + + rows = [ + models.Evaluation( + id=idx, + dataset_names=["1", "2"], + model_name=str(idx), + parameters=schemas.EvaluationParameters( + task_type=enums.TaskType.CLASSIFICATION + ).model_dump(), + filters=schemas.Filter().model_dump(), + status=status, + ) + for idx, status in enumerate(enums.EvaluationStatus) + ] + + try: + db.add_all(rows) + db.commit() + except IntegrityError as e: + db.rollback() + raise e + + yield [(row.id, row.status) for row in rows] + + for row in rows: + try: + db.delete(row) + except IntegrityError: + db.rollback() + + +def test_delete_evaluation(db: Session, create_evaluations): + + for idx, status in create_evaluations: + assert ( + db.scalar( + select(func.count(models.Evaluation.id)).where( + models.Evaluation.id == idx + ) + ) + == 1 + ) + if status in { + enums.EvaluationStatus.PENDING, + enums.EvaluationStatus.RUNNING, + }: + with pytest.raises(exceptions.EvaluationRunningError): + crud.delete_evaluation(db=db, evaluation_id=idx) + assert ( + db.scalar( + select(func.count(models.Evaluation.id)).where( + models.Evaluation.id == idx + ) + ) + == 1 + ) + elif status == enums.EvaluationStatus.DELETING: + with pytest.raises(exceptions.EvaluationDoesNotExistError): + crud.delete_evaluation(db=db, evaluation_id=idx) + assert ( + db.scalar( + select(func.count(models.Evaluation.id)).where( + models.Evaluation.id == idx + ) + ) + == 1 + ) + else: + crud.delete_evaluation(db=db, evaluation_id=idx) + assert ( + db.scalar( + select(func.count(models.Evaluation.id)).where( + models.Evaluation.id == idx + ) + ) + == 0 + ) + + # check for id that doesnt exist + with pytest.raises(exceptions.EvaluationDoesNotExistError): + crud.delete_evaluation(db=db, evaluation_id=10000) + + +@pytest.fixture +def create_evaluation_with_metrics(db: Session): + evaluation_id = 0 + number_of_metrics = 4 + + evaluation = 
models.Evaluation( + id=evaluation_id, + dataset_names=["1", "2"], + model_name="3", + parameters=schemas.EvaluationParameters( + task_type=enums.TaskType.CLASSIFICATION + ).model_dump(), + filters=schemas.Filter().model_dump(), + status=enums.EvaluationStatus.DONE, + ) + metrics = [ + models.Metric( + evaluation_id=evaluation_id, + label_id=None, + type="Precision", + value=float(i) / float(number_of_metrics), + parameters=dict(), + ) + for i in range(number_of_metrics) + ] + + try: + db.add(evaluation) + db.add_all(metrics) + db.commit() + except IntegrityError as e: + db.rollback() + raise e + + yield (evaluation_id, number_of_metrics) + + for row in [evaluation, *metrics]: + try: + db.delete(row) + db.commit() + except IntegrityError: + db.rollback() + + +def test_delete_evaluation_with_metrics( + db: Session, create_evaluation_with_metrics +): + row_id, num_metrics = create_evaluation_with_metrics + + assert num_metrics == 4 + assert ( + db.scalar( + select(func.count(models.Evaluation.id)).where( + models.Evaluation.id == row_id + ) + ) + == 1 + ) + assert ( + db.scalar( + select(func.count(models.Metric.id)).where( + models.Metric.evaluation_id == row_id + ) + ) + == num_metrics + ) + + crud.delete_evaluation(db=db, evaluation_id=row_id) + + assert ( + db.scalar( + select(func.count(models.Evaluation.id)).where( + models.Evaluation.id == row_id + ) + ) + == 0 + ) + assert ( + db.scalar( + select(func.count(models.Metric.id)).where( + models.Metric.evaluation_id == row_id + ) + ) + == 0 + ) diff --git a/api/valor_api/backend/__init__.py b/api/valor_api/backend/__init__.py index ecbb64656..feded4d84 100644 --- a/api/valor_api/backend/__init__.py +++ b/api/valor_api/backend/__init__.py @@ -5,6 +5,7 @@ create_or_get_evaluations, create_predictions, delete_dataset, + delete_evaluation_from_id, delete_model, get_dataset, get_dataset_status, @@ -45,6 +46,7 @@ "create_predictions", "delete_dataset", "delete_model", + "delete_evaluation_from_id", "get_dataset", "get_paginated_datasets", "get_dataset_summary", diff --git a/api/valor_api/backend/core/__init__.py b/api/valor_api/backend/core/__init__.py index a39a3e640..a4f26870e 100644 --- a/api/valor_api/backend/core/__init__.py +++ b/api/valor_api/backend/core/__init__.py @@ -32,6 +32,7 @@ from .evaluation import ( count_active_evaluations, create_or_get_evaluations, + delete_evaluation_from_id, delete_evaluations, fetch_evaluation_from_id, get_evaluation_requests_from_model, @@ -131,6 +132,7 @@ "get_paginated_evaluations", "get_evaluation_status", "set_evaluation_status", + "delete_evaluation_from_id", "get_evaluation_requests_from_model", "count_active_evaluations", "delete_dataset_annotations", diff --git a/api/valor_api/backend/core/evaluation.py b/api/valor_api/backend/core/evaluation.py index 52120db53..82003e09f 100644 --- a/api/valor_api/backend/core/evaluation.py +++ b/api/valor_api/backend/core/evaluation.py @@ -1055,3 +1055,38 @@ def delete_evaluations( except IntegrityError as e: db.rollback() raise e + + +def delete_evaluation_from_id(db: Session, evaluation_id: int): + """ + Delete a evaluation by id. + + Parameters + ---------- + db : Session + The database session. + evaluation_id : int + The evaluation identifer. + + Raises + ------ + EvaluationRunningError + If the evaluation is currently running. + EvaluationDoesNotExistError + If the evaluation does not exist. 
+    """
+    evaluation = fetch_evaluation_from_id(db=db, evaluation_id=evaluation_id)
+    if evaluation.status in {
+        enums.EvaluationStatus.PENDING,
+        enums.EvaluationStatus.RUNNING,
+    }:
+        raise exceptions.EvaluationRunningError
+    elif evaluation.status == enums.EvaluationStatus.DELETING:
+        return
+
+    try:
+        db.delete(evaluation)
+        db.commit()
+    except IntegrityError as e:
+        db.rollback()
+        raise e
diff --git a/api/valor_api/crud/__init__.py b/api/valor_api/crud/__init__.py
index 1bd0ace4a..1500baa4c 100644
--- a/api/valor_api/crud/__init__.py
+++ b/api/valor_api/crud/__init__.py
@@ -5,7 +5,7 @@
     create_or_get_evaluations,
     create_predictions,
 )
-from ._delete import delete
+from ._delete import delete, delete_evaluation
 from ._read import (
     get_dataset,
     get_dataset_summary,
@@ -44,4 +44,5 @@
     "get_prediction",
     "finalize",
     "get_evaluations",
+    "delete_evaluation",
 ]
diff --git a/api/valor_api/crud/_delete.py b/api/valor_api/crud/_delete.py
index 60c6d6706..88155f075 100644
--- a/api/valor_api/crud/_delete.py
+++ b/api/valor_api/crud/_delete.py
@@ -25,3 +25,22 @@ def delete(
         backend.delete_dataset(db, dataset_name)
     elif model_name is not None:
         backend.delete_model(db, model_name)
+
+
+def delete_evaluation(*, db: Session, evaluation_id: int):
+    """
+    Deletes an evaluation by id.
+
+    Parameters
+    ----------
+    evaluation_id : int
+        The evaluation identifier.
+
+    Raises
+    ------
+    EvaluationRunningError
+        If the evaluation is currently running.
+    EvaluationDoesNotExistError
+        If the evaluation does not exist.
+    """
+    backend.delete_evaluation_from_id(db=db, evaluation_id=evaluation_id)
diff --git a/api/valor_api/main.py b/api/valor_api/main.py
index 39a055a60..399d77030 100644
--- a/api/valor_api/main.py
+++ b/api/valor_api/main.py
@@ -1183,6 +1183,41 @@ def get_evaluations(
         raise exceptions.create_http_error(e)
 
 
+@app.delete(
+    "/evaluations/{evaluation_id}",
+    dependencies=[Depends(token_auth_scheme)],
+    tags=["Evaluations"],
+)
+def delete_evaluation(
+    evaluation_id: int,
+    db: Session = Depends(get_db),
+):
+    """
+    Delete an evaluation from the database.
+
+    DELETE Endpoint: `/evaluations/{evaluation_id}`
+
+    Parameters
+    ----------
+    evaluation_id : int
+        The evaluation identifier.
+    db : Session
+        The database session to use. This parameter is a sqlalchemy dependency and shouldn't be submitted by the user.
+
+    Raises
+    ------
+    HTTPException (404)
+        If the evaluation doesn't exist.
+    HTTPException (409)
+        If the evaluation isn't in the correct state to be deleted.
+    """
+    logger.debug(f"request to delete evaluation {evaluation_id}")
+    try:
+        crud.delete_evaluation(db=db, evaluation_id=evaluation_id)
+    except Exception as e:
+        raise exceptions.create_http_error(e)
+
+
 """ AUTHENTICATION """
diff --git a/client/valor/client.py b/client/valor/client.py
index 339de8b30..9f27f4b3b 100644
--- a/client/valor/client.py
+++ b/client/valor/client.py
@@ -836,6 +836,19 @@ def _build_query_param(param_name, element, typ):
 
         return self._requests_get_rel_host(endpoint).json()
 
+    def delete_evaluation(self, evaluation_id: int) -> None:
+        """
+        Deletes an evaluation.
+
+        `DELETE` endpoint.
+
+        Parameters
+        ----------
+        evaluation_id : int
+            The id of the evaluation to be deleted.
+        """
+        self._requests_delete_rel_host(f"evaluations/{evaluation_id}")
+
     def get_user(self) -> Union[str, None]:
         """
         Gets the users e-mail address (in the case when auth is enabled)
diff --git a/client/valor/coretypes.py b/client/valor/coretypes.py
index b756e3bec..7f20970e4 100644
--- a/client/valor/coretypes.py
+++ b/client/valor/coretypes.py
@@ -1804,3 +1804,27 @@ def evaluate(
                 request, allow_retries=allow_retries
             )
         ]
+
+    def delete_evaluation(self, evaluation_id: int, timeout: int = 0) -> None:
+        """
+        Deletes an evaluation.
+
+        Parameters
+        ----------
+        evaluation_id : int
+            The id of the evaluation to be deleted.
+        timeout : int
+            The number of seconds to wait in order to confirm that the evaluation was deleted.
+        """
+        self.conn.delete_evaluation(evaluation_id)
+        if timeout:
+            for _ in range(timeout):
+                try:
+                    self.get_evaluations(evaluation_ids=[evaluation_id])
+                except EvaluationDoesNotExist:
+                    break
+                time.sleep(1)
+            else:
+                raise TimeoutError(
+                    "Evaluation wasn't deleted within timeout interval"
+                )
diff --git a/integration_tests/client/evaluations/test_evaluation_crud.py b/integration_tests/client/evaluations/test_evaluation_crud.py
new file mode 100644
index 000000000..fae632b72
--- /dev/null
+++ b/integration_tests/client/evaluations/test_evaluation_crud.py
@@ -0,0 +1,155 @@
+import pytest
+from sqlalchemy import func, select
+from sqlalchemy.exc import IntegrityError
+from sqlalchemy.orm import Session
+
+# client
+from valor import Client, Dataset, GroundTruth, Label, Model, Prediction
+from valor.enums import EvaluationStatus
+from valor.exceptions import ClientException
+
+# api
+from valor_api import enums, schemas
+from valor_api.backend import models
+
+
+@pytest.fixture
+def create_evaluations(db: Session):
+
+    rows = [
+        models.Evaluation(
+            id=idx,
+            dataset_names=["1", "2"],
+            model_name=str(idx),
+            parameters=schemas.EvaluationParameters(
+                task_type=enums.TaskType.CLASSIFICATION
+            ).model_dump(),
+            filters=schemas.Filter().model_dump(),
+            status=status,
+        )
+        for idx, status in enumerate(enums.EvaluationStatus)
+    ]
+
+    try:
+        db.add_all(rows)
+        db.commit()
+    except IntegrityError as e:
+        db.rollback()
+        raise e
+
+    yield [(row.id, row.status) for row in rows]
+
+    for row in rows:
+        try:
+            db.delete(row)
+        except IntegrityError:
+            db.rollback()
+
+
+def test_delete_evaluation(db: Session, client: Client, create_evaluations):
+
+    for idx, status in create_evaluations:
+        assert (
+            db.scalar(
+                select(func.count(models.Evaluation.id)).where(
+                    models.Evaluation.id == idx
+                )
+            )
+            == 1
+        )
+
+        if status in {
+            enums.EvaluationStatus.PENDING,
+            enums.EvaluationStatus.RUNNING,
+        }:
+            with pytest.raises(ClientException) as e:
+                client.delete_evaluation(evaluation_id=idx)
+            assert "EvaluationRunningError" in str(e)
+            assert (
+                db.scalar(
+                    select(func.count(models.Evaluation.id)).where(
+                        models.Evaluation.id == idx
+                    )
+                )
+                == 1
+            )
+        elif status == enums.EvaluationStatus.DELETING:
+            with pytest.raises(ClientException) as e:
+                client.delete_evaluation(evaluation_id=idx)
+            assert "EvaluationDoesNotExistError" in str(e)
+            assert (
+                db.scalar(
+                    select(func.count(models.Evaluation.id)).where(
+                        models.Evaluation.id == idx
+                    )
+                )
+                == 1
+            )
+        else:
+            client.delete_evaluation(evaluation_id=idx)
+            assert (
+                db.scalar(
+                    select(func.count(models.Evaluation.id)).where(
+                        models.Evaluation.id == idx
+                    )
+                )
+                == 0
+            )
+
+    # check for an id that doesn't exist
+    with pytest.raises(ClientException) as e:
+        client.delete_evaluation(evaluation_id=10000)
"EvaluationDoesNotExistError" in str(e) + + +def test_delete_evaluation_scope( + client: Client, + gt_clfs: list[GroundTruth], + pred_clfs: list[Prediction], + dataset_name: str, + model_name: str, +): + dataset = Dataset.create(dataset_name) + for gt in gt_clfs: + dataset.add_groundtruth(gt) + dataset.finalize() + + model = Model.create(model_name) + for pd in pred_clfs: + model.add_prediction(dataset, pd) + model.finalize_inferences(dataset) + + eval1 = model.evaluate_classification(dataset) + assert eval1.wait_for_completion(timeout=30) == EvaluationStatus.DONE + eval2 = model.evaluate_classification( + dataset, filter_by=[Label.key == "k4"] + ) + assert eval2.wait_for_completion(timeout=30) + + assert eval1.id != eval2.id + assert len(client.get_evaluations(evaluation_ids=[eval1.id])) == 1 + assert len(client.get_evaluations(evaluation_ids=[eval2.id])) == 1 + + # delete eval 1 + client.delete_evaluation(eval1.id) + + assert len(client.get_evaluations(evaluation_ids=[eval1.id])) == 0 + assert len(client.get_evaluations(evaluation_ids=[eval2.id])) == 1 + + # show that we can still make evaluations + eval3 = model.evaluate_classification(dataset) + assert eval3.wait_for_completion(timeout=30) + + assert eval1.id != eval2.id + assert eval1.id != eval3.id + assert eval2.id != eval3.id + assert len(client.get_evaluations(evaluation_ids=[eval1.id])) == 0 + assert len(client.get_evaluations(evaluation_ids=[eval2.id])) == 1 + assert len(client.get_evaluations(evaluation_ids=[eval3.id])) == 1 + + # show that eval1 was repreated in eval3 + assert eval1.id != eval3.id + for metric in eval1.metrics: + assert metric in eval3.metrics + for metric in eval3.metrics: + assert metric in eval1.metrics