From 34969818f9b2d57a2245abc9df022815df8f7bb5 Mon Sep 17 00:00:00 2001 From: Charles Zaloom <38677807+czaloom@users.noreply.github.com> Date: Mon, 16 Sep 2024 16:25:08 -0400 Subject: [PATCH] Numpy-based Object Detection for Bounding Boxes (#748) --- .github/workflows/build-and-publish.yml | 17 + .../workflows/lite-benchmark-evaluations.yml | 38 + .github/workflows/lite-tests-and-coverage.yml | 35 + .../client/metrics/test_detection.py | 2 +- lite/LICENSE | 21 + lite/README.md | 1 + lite/benchmarks/.gitignore | 2 + lite/benchmarks/benchmark_objdet.py | 330 +++++++ lite/examples/.gitignore | 1 + lite/examples/coco-yolo.ipynb | 442 +++++++++ lite/pyproject.toml | 38 + lite/tests/detection/__init__.py | 0 lite/tests/detection/conftest.py | 504 ++++++++++ .../tests/detection/test_average_precision.py | 623 +++++++++++++ lite/tests/detection/test_average_recall.py | 246 +++++ lite/tests/detection/test_counts.py | 457 +++++++++ lite/tests/detection/test_dataloader.py | 34 + .../tests/detection/test_detailed_pr_curve.py | 882 ++++++++++++++++++ lite/tests/detection/test_evaluator.py | 31 + lite/tests/detection/test_filtering.py | 401 ++++++++ lite/tests/detection/test_iou.py | 30 + lite/tests/detection/test_pr_curve.py | 177 ++++ lite/tests/detection/test_precision.py | 389 ++++++++ lite/tests/detection/test_recall.py | 389 ++++++++ lite/tests/detection/test_schemas.py | 105 +++ lite/tests/detection/test_stability.py | 87 ++ lite/valor_lite/__init__.py | 0 lite/valor_lite/detection/__init__.py | 56 ++ lite/valor_lite/detection/annotation.py | 54 ++ lite/valor_lite/detection/computation.py | 506 ++++++++++ lite/valor_lite/detection/manager.py | 845 +++++++++++++++++ lite/valor_lite/detection/metric.py | 357 +++++++ lite/valor_lite/schemas.py | 15 + 33 files changed, 7114 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/lite-benchmark-evaluations.yml create mode 100644 .github/workflows/lite-tests-and-coverage.yml create mode 100644 lite/LICENSE create mode 
100644 lite/README.md create mode 100644 lite/benchmarks/.gitignore create mode 100644 lite/benchmarks/benchmark_objdet.py create mode 100644 lite/examples/.gitignore create mode 100644 lite/examples/coco-yolo.ipynb create mode 100644 lite/pyproject.toml create mode 100644 lite/tests/detection/__init__.py create mode 100644 lite/tests/detection/conftest.py create mode 100644 lite/tests/detection/test_average_precision.py create mode 100644 lite/tests/detection/test_average_recall.py create mode 100644 lite/tests/detection/test_counts.py create mode 100644 lite/tests/detection/test_dataloader.py create mode 100644 lite/tests/detection/test_detailed_pr_curve.py create mode 100644 lite/tests/detection/test_evaluator.py create mode 100644 lite/tests/detection/test_filtering.py create mode 100644 lite/tests/detection/test_iou.py create mode 100644 lite/tests/detection/test_pr_curve.py create mode 100644 lite/tests/detection/test_precision.py create mode 100644 lite/tests/detection/test_recall.py create mode 100644 lite/tests/detection/test_schemas.py create mode 100644 lite/tests/detection/test_stability.py create mode 100644 lite/valor_lite/__init__.py create mode 100644 lite/valor_lite/detection/__init__.py create mode 100644 lite/valor_lite/detection/annotation.py create mode 100644 lite/valor_lite/detection/computation.py create mode 100644 lite/valor_lite/detection/manager.py create mode 100644 lite/valor_lite/detection/metric.py create mode 100644 lite/valor_lite/schemas.py diff --git a/.github/workflows/build-and-publish.yml b/.github/workflows/build-and-publish.yml index 7063fd297..cf8a1b267 100644 --- a/.github/workflows/build-and-publish.yml +++ b/.github/workflows/build-and-publish.yml @@ -23,6 +23,23 @@ jobs: with: password: ${{ secrets.PYPI_API_TOKEN }} packages-dir: ./client/dist + build-and-publish-py-lite-package: + runs-on: ubuntu-latest + defaults: + run: + working-directory: ./lite + steps: + - uses: actions/checkout@v3 + - uses: 
actions/setup-python@v4 + with: + python-version: "3.10" + - name: Build wheel + run: pip install build && python -m build + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.VALOR_LITE_PYPI_API_TOKEN }} + packages-dir: ./lite/dist build-and-publish-ts-package: runs-on: ubuntu-latest defaults: diff --git a/.github/workflows/lite-benchmark-evaluations.yml b/.github/workflows/lite-benchmark-evaluations.yml new file mode 100644 index 000000000..7debf7a05 --- /dev/null +++ b/.github/workflows/lite-benchmark-evaluations.yml @@ -0,0 +1,38 @@ +name: Run valor-lite benchmarks + +on: + push: + branches: "**" + +permissions: + id-token: write + contents: read + +jobs: + run-benchmarks: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: install lite + run: pip install -e . + working-directory: ./lite + # - name: run classification benchmarks + # run: python benchmark_script.py + # working-directory: ./lite/benchmarks/classification + # - name: print classification results + # run: | + # export BENCHMARK_RESULTS=$(python -c "import os;import json;print(json.dumps(json.load(open('results.json', 'r')), indent=4));") + # echo "$BENCHMARK_RESULTS" + # working-directory: ./lite/benchmarks/classification + - name: run object detection benchmarks + run: python benchmark_objdet.py + working-directory: ./lite/benchmarks/ + - name: print object detection results + run: | + export BENCHMARK_RESULTS=$(python -c "import os;import json;print(json.dumps(json.load(open('manager_results.json', 'r')), indent=4));") + echo "$BENCHMARK_RESULTS" + working-directory: ./lite/benchmarks/ + - run: make stop-env diff --git a/.github/workflows/lite-tests-and-coverage.yml b/.github/workflows/lite-tests-and-coverage.yml new file mode 100644 index 000000000..e1e238263 --- /dev/null +++ b/.github/workflows/lite-tests-and-coverage.yml @@ -0,0 +1,35 @@ +name: Run 
valor-lite code coverage report + +on: + push: + branches: "**" + +permissions: + id-token: write + contents: read + +jobs: + lite-tests: + runs-on: ubuntu-latest + defaults: + run: + working-directory: . + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: run object detection tests and report coverage + run: | + pip install -e ".[test]" + COVERAGE_FILE=.coverage.functional python -m coverage run --omit "tests/*" -m pytest -v tests/detection/ + python -m coverage combine + python -m coverage report -m + python -m coverage json + export TOTAL=$(python -c "import json;print(json.load(open('coverage.json'))['totals']['percent_covered_display'])") + echo "total=$TOTAL" >> $GITHUB_ENV + if (( $TOTAL < 90 )); then + echo "Coverage is below 90%" + exit 1 + fi + working-directory: ./lite diff --git a/integration_tests/client/metrics/test_detection.py b/integration_tests/client/metrics/test_detection.py index 780be8616..8c4bc36bf 100644 --- a/integration_tests/client/metrics/test_detection.py +++ b/integration_tests/client/metrics/test_detection.py @@ -2481,7 +2481,7 @@ def test_evaluate_detection_false_negatives_two_images_one_only_with_different_c ): """In this test we have 1. An image with a matching groundtruth and prediction (same class, `"value"`, and high IOU) - 2. A second image with a groundtruth annotation with clas `"other value"` and a prediction with higher confidence + 2. A second image with a groundtruth annotation with class `"other value"` and a prediction with higher confidence then the prediction on the first image. In this case, the AP for class `"value"` should be 0.5 since the false positive has higher confidence than the true positive. 
diff --git a/lite/LICENSE b/lite/LICENSE new file mode 100644 index 000000000..2965db998 --- /dev/null +++ b/lite/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Striveworks + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/lite/README.md b/lite/README.md new file mode 100644 index 000000000..4bfbcd0a8 --- /dev/null +++ b/lite/README.md @@ -0,0 +1 @@ +# valor-lite: Compute classification, object detection, and segmentation metrics locally. 
diff --git a/lite/benchmarks/.gitignore b/lite/benchmarks/.gitignore new file mode 100644 index 000000000..945e26b14 --- /dev/null +++ b/lite/benchmarks/.gitignore @@ -0,0 +1,2 @@ +*.json +*.jsonl \ No newline at end of file diff --git a/lite/benchmarks/benchmark_objdet.py b/lite/benchmarks/benchmark_objdet.py new file mode 100644 index 000000000..43423f355 --- /dev/null +++ b/lite/benchmarks/benchmark_objdet.py @@ -0,0 +1,330 @@ +import json +import os +from dataclasses import dataclass +from datetime import datetime +from enum import Enum +from pathlib import Path +from time import time + +import requests +from tqdm import tqdm +from valor_lite.detection import DataLoader + + +class AnnotationType(str, Enum): + NONE = "none" + BOX = "box" + POLYGON = "polygon" + MULTIPOLYGON = "multipolygon" + RASTER = "raster" + + +def time_it(fn): + def wrapper(*args, **kwargs): + start = time() + results = fn(*args, **kwargs) + return (time() - start, results) + + return wrapper + + +def download_data_if_not_exists( + file_name: str, + file_path: Path, + url: str, +): + """Download the data from a public bucket if it doesn't exist locally.""" + + if not os.path.exists(file_path): + response = requests.get(url, stream=True) + if response.status_code == 200: + total_size = int(response.headers.get("content-length", 0)) + with open(file_path, "wb") as f: + with tqdm( + total=total_size, + unit="B", + unit_scale=True, + unit_divisor=1024, + desc=file_name, + ) as pbar: + for chunk in response.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + pbar.update(1024) + else: + raise RuntimeError(response) + else: + print(f"{file_name} already exists locally.") + + # sort file by datum uid + with open(file_path, "r") as f: + lines = [x for x in f] + with open(file_path, "w") as f: + for line in sorted( + lines, key=lambda x: int(json.loads(x)["datum"]["uid"]) + ): + f.write(line) + + +def write_results_to_file(write_path: Path, results: list[dict]): + """Write results to 
manager_results.json""" + current_datetime = datetime.now().strftime("%d/%m/%Y %H:%M:%S") + if os.path.isfile(write_path): + with open(write_path, "r") as file: + file.seek(0) + data = json.load(file) + else: + data = {} + + data[current_datetime] = results + + with open(write_path, "w+") as file: + json.dump(data, file, indent=4) + + +@time_it +def ingest( + manager: DataLoader, + gt_path: Path, + pd_path: Path, + limit: int, + chunk_size: int, +): + accumulated_time = 0.0 + with open(gt_path, "r") as gf: + with open(pd_path, "r") as pf: + + count = 0 + groundtruths = [] + predictions = [] + for gline, pline in zip(gf, pf): + + # groundtruth + gt_dict = json.loads(gline) + groundtruths.append(gt_dict) + + # prediction + pd_dict = json.loads(pline) + predictions.append(pd_dict) + + count += 1 + if count >= limit and limit > 0: + break + elif len(groundtruths) < chunk_size or chunk_size == -1: + continue + + timer, _ = time_it(manager.add_data_from_valor_dict)( + zip(groundtruths, predictions), True + ) + accumulated_time += timer + groundtruths = [] + predictions = [] + + if groundtruths: + timer, _ = time_it(manager.add_data_from_valor_dict)( + zip(groundtruths, predictions), True + ) + accumulated_time += timer + + return accumulated_time + + +@dataclass +class Benchmark: + limit: int + n_datums: int + n_groundtruths: int + n_predictions: int + n_labels: int + gt_type: AnnotationType + pd_type: AnnotationType + chunk_size: int + ingestion: float + preprocessing: float + precomputation: float + evaluation: float + detailed_curves: list[tuple[int, float]] + + def result(self) -> dict: + return { + "limit": self.limit, + "n_datums": self.n_datums, + "n_groundtruths": self.n_groundtruths, + "n_predictions": self.n_predictions, + "n_labels": self.n_labels, + "dtype": { + "groundtruth": self.gt_type.value, + "prediction": self.pd_type.value, + }, + "chunk_size": self.chunk_size, + "ingestion": { + "loading_from_file": f"{round(self.ingestion - self.preprocessing, 2)} 
seconds", + "numpy_conversion + IoU": f"{round(self.preprocessing, 2)} seconds", + "ranking_pairs": f"{round(self.precomputation, 2)} seconds", + "total": f"{round(self.ingestion + self.precomputation, 2)} seconds", + }, + "base_evaluation": f"{round(self.evaluation, 2)} seconds", + "detailed_pr_curve": [ + { + "n_points": 10, + "n_examples": curve[0], + "computation": f"{round(curve[1], 2)} seconds", + } + for curve in self.detailed_curves + ], + } + + +def run_benchmarking_analysis( + limits_to_test: list[int], + combinations: list[tuple[AnnotationType, AnnotationType]] | None = None, + results_file: str = "manager_results.json", + chunk_size: int = -1, + compute_pr: bool = True, + compute_detailed: bool = True, + ingestion_timeout=30, + evaluation_timeout=30, +): + """Time various function calls and export the results.""" + current_directory = Path(__file__).parent + write_path = current_directory / Path(results_file) + + gt_box_filename = "gt_objdet_coco_bbox.jsonl" + gt_polygon_filename = "gt_objdet_coco_polygon.jsonl" + gt_multipolygon_filename = "gt_objdet_coco_raster_multipolygon.jsonl" + gt_raster_filename = "gt_objdet_coco_raster_bitmask.jsonl" + pd_box_filename = "pd_objdet_yolo_bbox.jsonl" + pd_polygon_filename = "pd_objdet_yolo_polygon.jsonl" + pd_multipolygon_filename = "pd_objdet_yolo_multipolygon.jsonl" + pd_raster_filename = "pd_objdet_yolo_raster.jsonl" + + groundtruth_caches = { + AnnotationType.BOX: gt_box_filename, + AnnotationType.POLYGON: gt_polygon_filename, + AnnotationType.MULTIPOLYGON: gt_multipolygon_filename, + AnnotationType.RASTER: gt_raster_filename, + } + prediction_caches = { + AnnotationType.BOX: pd_box_filename, + AnnotationType.POLYGON: pd_polygon_filename, + AnnotationType.MULTIPOLYGON: pd_multipolygon_filename, + AnnotationType.RASTER: pd_raster_filename, + } + + # default is to perform all combinations + if combinations is None: + combinations = [ + (gt_type, pd_type) + for gt_type in groundtruth_caches + for pd_type in 
prediction_caches + ] + + # cache data locally + filenames = [ + *list(groundtruth_caches.values()), + *list(prediction_caches.values()), + ] + for filename in filenames: + file_path = current_directory / Path(filename) + url = f"https://pub-fae71003f78140bdaedf32a7c8d331d2.r2.dev/{filename}" + download_data_if_not_exists( + file_name=filename, file_path=file_path, url=url + ) + + # iterate through datum limits + results = list() + for limit in limits_to_test: + for gt_type, pd_type in combinations: + + gt_filename = groundtruth_caches[gt_type] + pd_filename = prediction_caches[pd_type] + + # === Base Evaluation === + manager = DataLoader() + + # ingest + preprocess + (ingest_time, preprocessing_time,) = ingest( + manager=manager, + gt_path=current_directory / Path(gt_filename), + pd_path=current_directory / Path(pd_filename), + limit=limit, + chunk_size=chunk_size, + ) # type: ignore - time_it wrapper + + finalization_time, evaluator = time_it(manager.finalize)() + + if ingest_time > ingestion_timeout and ingestion_timeout != -1: + raise TimeoutError( + f"Base precomputation timed out with limit of {limit}." + ) + + # test detailed pr curve with no samples + detailed_pr_curve_time_no_samples, _ = time_it( + evaluator.compute_detailed_pr_curve + )() + + # test detailed pr curve with 3 samples + detailed_pr_curve_time_three_samples, _ = time_it( + evaluator.compute_detailed_pr_curve + )(n_samples=3) + + # evaluate + eval_time, metrics = time_it(evaluator.evaluate)() + # print(metrics) + if eval_time > evaluation_timeout and evaluation_timeout != -1: + raise TimeoutError( + f"Base evaluation timed out with {evaluator.n_datums} datums." 
+ ) + + results.append( + Benchmark( + limit=limit, + n_datums=evaluator.n_datums, + n_groundtruths=evaluator.n_groundtruths, + n_predictions=evaluator.n_predictions, + n_labels=evaluator.n_labels, + gt_type=gt_type, + pd_type=pd_type, + chunk_size=chunk_size, + ingestion=ingest_time, + preprocessing=preprocessing_time, + precomputation=finalization_time, + evaluation=eval_time, + detailed_curves=[ + (0, detailed_pr_curve_time_no_samples), + (3, detailed_pr_curve_time_three_samples), + ], + ).result() + ) + + write_results_to_file(write_path=write_path, results=results) + + +if __name__ == "__main__": + + # run bounding box benchmark + run_benchmarking_analysis( + combinations=[ + (AnnotationType.BOX, AnnotationType.BOX), + ], + limits_to_test=[5000, 5000], + compute_detailed=False, + ) + + # # run polygon benchmark + # run_benchmarking_analysis( + # combinations=[ + # (AnnotationType.POLYGON, AnnotationType.POLYGON), + # ], + # limits_to_test=[5000, 5000], + # compute_detailed=False, + # ) + + # # run raster benchmark + # run_benchmarking_analysis( + # combinations=[ + # (AnnotationType.RASTER, AnnotationType.RASTER), + # ], + # limits_to_test=[500, 500], + # compute_detailed=False, + # ) diff --git a/lite/examples/.gitignore b/lite/examples/.gitignore new file mode 100644 index 000000000..7bc897f92 --- /dev/null +++ b/lite/examples/.gitignore @@ -0,0 +1 @@ +!*.ipynb \ No newline at end of file diff --git a/lite/examples/coco-yolo.ipynb b/lite/examples/coco-yolo.ipynb new file mode 100644 index 000000000..aa681d913 --- /dev/null +++ b/lite/examples/coco-yolo.ipynb @@ -0,0 +1,442 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "38ec8ecc", + "metadata": {}, + "source": [ + "# Object Detection Example\n", + "\n", + "## Introduction\n", + "\n", + "In this notebook, we'll walk through a detailed example of how you can use Valor to evaluate object detections made on [the COCO Panoptic dataset](https://cocodataset.org/#home). 
We'll use Ultralytics' `YOLOv8` model to predict what objects exist in various COCO photographs and compare performance between bounding box and image segmentation results.\n", + "\n", + "For a conceptual introduction to Valor, [check out our project overview](https://striveworks.github.io/valor/). For a higher-level example notebook, [check out our \"Getting Started\" notebook](https://github.com/Striveworks/valor/blob/main/examples/getting_started.ipynb).\n", + "\n", + "Before using this notebook, please ensure that the Valor service is running on your machine (for start-up instructions, [click here](https://striveworks.github.io/valor/getting_started/)). To connect to a non-local instance of Valor, update `client = Client(\"http://0.0.0.0:8000\")` in the first code block to point to the correct URL." + ] + }, + { + "cell_type": "markdown", + "id": "ff9b26ec", + "metadata": {}, + "source": [ + "## Defining Our Datasets\n", + "\n", + "We start by fetching our dataset and uploading it to Valor." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "a4d0a509-7500-44ba-b951-3566d4a4fac1", + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import os\n", + "import json\n", + "import builtins\n", + "import requests\n", + "\n", + "from tqdm import tqdm\n", + "from pathlib import Path\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from valor_lite.detection import DataLoader, MetricType" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "a28f5e66", + "metadata": {}, + "source": [ + "The modules included in `./integrations` are helper modules that demonstrate how to ingest datasets and model inferences into Valor. The depth of each integration varies depending on the use case. \n", + "\n", + "The `coco_integration` is designed to download, extract, and upload all in one command as you are starting off with all the data.
\n", + "\n", + "The `yolo_integration` is much simpler; it is a collection of parser functions that convert YOLO model results into Valor types." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3ea11c76", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "gt_objdet_coco_bbox.jsonl already exists locally.\n", + "pd_objdet_yolo_bbox.jsonl already exists locally.\n" + ] + } + ], + "source": [ + "def download_data_if_not_exists(\n", + " file_name: str,\n", + " file_path: Path,\n", + " url: str,\n", + "):\n", + " \"\"\"Download the data from a public bucket if it doesn't exist locally.\"\"\"\n", + "\n", + " if not os.path.exists(file_path):\n", + " response = requests.get(url, stream=True)\n", + " if response.status_code == 200:\n", + " total_size = int(response.headers.get(\"content-length\", 0))\n", + " with open(file_path, \"wb\") as f:\n", + " with tqdm(\n", + " total=total_size,\n", + " unit=\"B\",\n", + " unit_scale=True,\n", + " unit_divisor=1024,\n", + " desc=file_name,\n", + " ) as pbar:\n", + " for chunk in response.iter_content(chunk_size=1024):\n", + " if chunk:\n", + " f.write(chunk)\n", + " pbar.update(1024)\n", + " else:\n", + " raise RuntimeError(response)\n", + " else:\n", + " print(f\"{file_name} already exists locally.\")\n", + "\n", + " # sort file by datum uid\n", + " with open(file_path, \"r\") as f:\n", + " lines = [x for x in f]\n", + " with open(file_path, \"w\") as f:\n", + " for line in sorted(\n", + " lines, key=lambda x: int(json.loads(x)[\"datum\"][\"uid\"])\n", + " ):\n", + " f.write(line)\n", + "\n", + "groundtruth_file = \"gt_objdet_coco_bbox.jsonl\"\n", + "prediction_file = \"pd_objdet_yolo_bbox.jsonl\"\n", + "\n", + "# cache data locally\n", + "current_directory = Path(os.getcwd())\n", + "for filename in [groundtruth_file, prediction_file]:\n", + " file_path = current_directory / Path(filename)\n", + " url = 
f\"https://pub-fae71003f78140bdaedf32a7c8d331d2.r2.dev/{filename}\"\n", + " download_data_if_not_exists(\n", + " file_name=filename, file_path=file_path, url=url\n", + " )\n", + "\n", + "gt_path = current_directory / Path(groundtruth_file)\n", + "pd_path = current_directory / Path(prediction_file)\n", + "\n", + "gf = builtins.open(gt_path, \"r\")\n", + "groundtruths = [\n", + " json.loads(gline)\n", + " for gline in gf\n", + "]\n", + "gf.close()\n", + "\n", + "pf = builtins.open(pd_path, \"r\")\n", + "predictions = [\n", + " json.loads(pline)\n", + " for pline in pf\n", + "]\n", + "pf.close()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "db64b6c6", + "metadata": {}, + "source": [ + "# Creating an Evaluator" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "94798123", + "metadata": {}, + "source": [ + "This block utilizes `get_instance_groundtruths` from `integrations/coco_integration.py` to download, extract, and upload the COCO Panoptic validation dataset to Valor." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "89ddd815", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "5000it [00:00, 5679.12it/s]\n" + ] + } + ], + "source": [ + "loader = DataLoader()\n", + "loader.add_data_from_valor_dict(zip(groundtruths, predictions), show_progress=True)\n", + "evaluator = loader.finalize()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b2c78827", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'n_datums': 5000,\n", + " 'n_groundtruths': 36536,\n", + " 'n_predictions': 27092,\n", + " 'n_labels': 94,\n", + " 'ignored_prediction_labels': [],\n", + " 'missing_prediction_labels': [('supercategory', 'person'),\n", + " ('supercategory', 'accessory'),\n", + " ('iscrowd', '0'),\n", + " ('supercategory', 'furniture'),\n", + " ('supercategory', 'electronic'),\n", + " ('supercategory', 'kitchen'),\n", + " ('supercategory', 'appliance'),\n", + " ('supercategory', 'food'),\n", + " ('supercategory', 'indoor'),\n", + " ('supercategory', 'animal'),\n", + " ('iscrowd', '1'),\n", + " ('supercategory', 'vehicle'),\n", + " ('supercategory', 'outdoor'),\n", + " ('supercategory', 'sports'),\n", + " ('name', 'hair drier')]}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "evaluator.metadata" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3e8e7aab", + "metadata": {}, + "source": [ + "## Evaluating Performance\n", + "\n", + "With our `Dataset` and `Model` defined, we're ready to evaluate our performance and display the results. 
Note that we use the `wait_for_completion` method since all evaluations run as background tasks; this method ensures that the evaluation finishes before we display the results.\n", + "\n", + "Sometimes, we may only want to calculate metrics for a subset of our data (i.e., we may only want to see how well our model performed at a specific type of detection). To accomplish this task, we can use the `filters` parameter of `evaluation_detection` to specify what types of data to evaluate performance for.\n", + "\n", + "We will be running and comparing two different evaluations investigating the performance difference of YOLOv8's bounding box and raster outputs." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "50f5d932", + "metadata": {}, + "outputs": [], + "source": [ + "metrics = evaluator.evaluate(\n", + " iou_thresholds=[0.25],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "81dce63d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[AP(value=0.0, iou=0.25, label=('supercategory', 'person')),\n", + " AP(value=0.034964121945004485, iou=0.25, label=('name', 'person')),\n", + " AP(value=0.0, iou=0.25, label=('iscrowd', '0')),\n", + " AP(value=0.0, iou=0.25, label=('supercategory', 'furniture')),\n", + " AP(value=0.0055361585101038925, iou=0.25, label=('name', 'chair')),\n", + " AP(value=0.012869301715994604, iou=0.25, label=('name', 'potted plant')),\n", + " AP(value=0.10866229353693777, iou=0.25, label=('name', 'dining table')),\n", + " AP(value=0.0, iou=0.25, label=('supercategory', 'electronic')),\n", + " AP(value=0.02314849593109984, iou=0.25, label=('name', 'tv')),\n", + " AP(value=0.0, iou=0.25, label=('supercategory', 'appliance')),\n", + " AP(value=0.0873892652423137, iou=0.25, label=('name', 'microwave')),\n", + " AP(value=0.07156154587928347, iou=0.25, label=('name', 'refrigerator')),\n", + " AP(value=0.0, iou=0.25, label=('supercategory', 'indoor')),\n", + " AP(value=0.0, iou=0.25, 
label=('name', 'book')),\n", + " AP(value=0.01678316083256647, iou=0.25, label=('name', 'clock')),\n", + " AP(value=0.014015775501818683, iou=0.25, label=('name', 'vase')),\n", + " AP(value=0.0, iou=0.25, label=('supercategory', 'animal')),\n", + " AP(value=0.231753782860388, iou=0.25, label=('name', 'bear')),\n", + " AP(value=0.19525677806009462, iou=0.25, label=('name', 'bed')),\n", + " AP(value=0.0, iou=0.25, label=('iscrowd', '1')),\n", + " AP(value=0.0006028705915087995, iou=0.25, label=('name', 'bottle')),\n", + " AP(value=0.0, iou=0.25, label=('supercategory', 'vehicle')),\n", + " AP(value=0.0024033740498041, iou=0.25, label=('name', 'car')),\n", + " AP(value=0.059802019119639865, iou=0.25, label=('name', 'truck')),\n", + " AP(value=0.0, iou=0.25, label=('supercategory', 'outdoor')),\n", + " AP(value=0.12696681676449215, iou=0.25, label=('name', 'stop sign')),\n", + " AP(value=0.12695275463693767, iou=0.25, label=('name', 'teddy bear')),\n", + " AP(value=0.0, iou=0.25, label=('supercategory', 'sports')),\n", + " AP(value=0.0007519739315703722, iou=0.25, label=('name', 'skis')),\n", + " AP(value=0.027301028578575685, iou=0.25, label=('name', 'oven')),\n", + " AP(value=0.0007072135785007071, iou=0.25, label=('name', 'sports ball')),\n", + " AP(value=0.014991066775850518, iou=0.25, label=('name', 'baseball glove')),\n", + " AP(value=0.010214586772126536, iou=0.25, label=('name', 'tennis racket')),\n", + " AP(value=0.0, iou=0.25, label=('supercategory', 'accessory')),\n", + " AP(value=0.0020844189682126106, iou=0.25, label=('name', 'backpack')),\n", + " AP(value=0.0012280297797221582, iou=0.25, label=('name', 'handbag')),\n", + " AP(value=0.013443055499235678, iou=0.25, label=('name', 'boat')),\n", + " AP(value=0.02472893969494828, iou=0.25, label=('name', 'bird')),\n", + " AP(value=0.009243715069181337, iou=0.25, label=('name', 'cell phone')),\n", + " AP(value=0.21514217763688503, iou=0.25, label=('name', 'train')),\n", + " AP(value=0.0, iou=0.25, 
label=('supercategory', 'kitchen')),\n", + " AP(value=0.030281021162493073, iou=0.25, label=('name', 'bowl')),\n", + " AP(value=0.0, iou=0.25, label=('supercategory', 'food')),\n", + " AP(value=0.08771008312142972, iou=0.25, label=('name', 'sandwich')),\n", + " AP(value=0.008624009780546994, iou=0.25, label=('name', 'cup')),\n", + " AP(value=0.0016891419565576583, iou=0.25, label=('name', 'surfboard')),\n", + " AP(value=0.04972058716145529, iou=0.25, label=('name', 'laptop')),\n", + " AP(value=0.003592328379843429, iou=0.25, label=('name', 'mouse')),\n", + " AP(value=0.0077222703965693675, iou=0.25, label=('name', 'keyboard')),\n", + " AP(value=0.0015763816845228704, iou=0.25, label=('name', 'traffic light')),\n", + " AP(value=0.14803132929413712, iou=0.25, label=('name', 'bus')),\n", + " AP(value=0.19463347439123024, iou=0.25, label=('name', 'cat')),\n", + " AP(value=0.06197253140976744, iou=0.25, label=('name', 'airplane')),\n", + " AP(value=0.11098807002863581, iou=0.25, label=('name', 'zebra')),\n", + " AP(value=0.0, iou=0.25, label=('name', 'tie')),\n", + " AP(value=0.01789546873590162, iou=0.25, label=('name', 'apple')),\n", + " AP(value=0.00497941318169436, iou=0.25, label=('name', 'baseball bat')),\n", + " AP(value=0.008054041938151662, iou=0.25, label=('name', 'wine glass')),\n", + " AP(value=0.0013656538067599864, iou=0.25, label=('name', 'knife')),\n", + " AP(value=0.028903897960834195, iou=0.25, label=('name', 'cake')),\n", + " AP(value=0.0010068803490518544, iou=0.25, label=('name', 'spoon')),\n", + " AP(value=0.004558789212254558, iou=0.25, label=('name', 'snowboard')),\n", + " AP(value=0.032493461301708465, iou=0.25, label=('name', 'banana')),\n", + " AP(value=0.06007451642097835, iou=0.25, label=('name', 'donut')),\n", + " AP(value=0.04444354398000359, iou=0.25, label=('name', 'toilet')),\n", + " AP(value=0.004502686909260146, iou=0.25, label=('name', 'sink')),\n", + " AP(value=0.03680123352285498, iou=0.25, label=('name', 'broccoli')),\n", + " 
AP(value=0.012761324967124905, iou=0.25, label=('name', 'skateboard')),\n", + " AP(value=0.01976429998256282, iou=0.25, label=('name', 'bench')),\n", + " AP(value=0.0027046607099734366, iou=0.25, label=('name', 'fork')),\n", + " AP(value=0.006694106628751388, iou=0.25, label=('name', 'carrot')),\n", + " AP(value=0.029851407442426727, iou=0.25, label=('name', 'couch')),\n", + " AP(value=0.0005351886540005352, iou=0.25, label=('name', 'remote')),\n", + " AP(value=0.016878142346284875, iou=0.25, label=('name', 'bicycle')),\n", + " AP(value=0.06534653465346534, iou=0.25, label=('name', 'scissors')),\n", + " AP(value=0.043602866009579126, iou=0.25, label=('name', 'orange')),\n", + " AP(value=0.023613502704789624, iou=0.25, label=('name', 'sheep')),\n", + " AP(value=0.14258072068496863, iou=0.25, label=('name', 'elephant')),\n", + " AP(value=0.009255151266686511, iou=0.25, label=('name', 'frisbee')),\n", + " AP(value=0.046454982301241954, iou=0.25, label=('name', 'umbrella')),\n", + " AP(value=0.10096759191399589, iou=0.25, label=('name', 'horse')),\n", + " AP(value=0.08258136367068454, iou=0.25, label=('name', 'motorcycle')),\n", + " AP(value=0.11613747307594723, iou=0.25, label=('name', 'dog')),\n", + " AP(value=0.016052361400942596, iou=0.25, label=('name', 'kite')),\n", + " AP(value=0.100066969572512, iou=0.25, label=('name', 'pizza')),\n", + " AP(value=0.030910998396653174, iou=0.25, label=('name', 'cow')),\n", + " AP(value=0.06492906623800135, iou=0.25, label=('name', 'fire hydrant')),\n", + " AP(value=0.04156939318188106, iou=0.25, label=('name', 'suitcase')),\n", + " AP(value=0.16491634254279497, iou=0.25, label=('name', 'giraffe')),\n", + " AP(value=0.0206312990102997, iou=0.25, label=('name', 'hot dog')),\n", + " AP(value=0.02254763291455196, iou=0.25, label=('name', 'parking meter')),\n", + " AP(value=0.0, iou=0.25, label=('name', 'toothbrush')),\n", + " AP(value=0.0, iou=0.25, label=('name', 'toaster')),\n", + " AP(value=0.0, iou=0.25, label=('name', 'hair 
drier'))]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metrics[MetricType.AP]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8a3bc015", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[mAP(value=-1.0, iou=0.25, label_key='supercategory'),\n", + " mAP(value=0.04520859389645013, iou=0.25, label_key='name'),\n", + " mAP(value=-1.0, iou=0.25, label_key='iscrowd')]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metrics[MetricType.mAP]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ec56ab6e", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.rcParams['figure.figsize'] = [24, 24]\n", + "plt.figure()\n", + "recall = [x / 100 for x in range(0,101)]\n", + "for curve in metrics[MetricType.PrecisionRecallCurve]:\n", + " if curve.label[0] != \"name\":\n", + " continue\n", + " plt.plot(recall, curve.precision, label=curve.label[1])\n", + "plt.title(\"Precision-Recall Curve\")\n", + "plt.legend()\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".env-valor", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/lite/pyproject.toml b/lite/pyproject.toml new file mode 100644 index 000000000..20ec83e03 --- /dev/null +++ b/lite/pyproject.toml @@ -0,0 +1,38 @@ +[project] +name = "valor-lite" +dynamic = ["version"] +description = "Compute valor metrics directly in your client." +readme = "README.md" +requires-python = ">=3.10" +license = { file = "LICENSE" } +dependencies = [ + "Pillow >= 9.1.0", + "importlib_metadata; python_version < '3.8'", + "tqdm", + "requests", + "numpy", +] + +[project.urls] +homepage = "https://www.striveworks.com" + +[build-system] +requires = ["setuptools>=61.0", "setuptools_scm[toml]>=6.2"] +build-backend = "setuptools.build_meta" + +[project.optional-dependencies] +test = ["pytest", "coverage"] + +[tool.black] +line-length = 79 + +[tool.isort] +line_length = 79 +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +use_parentheses = true +ensure_newline_before_comments = true + +[tool.setuptools_scm] +root = ".." 
diff --git a/lite/tests/detection/__init__.py b/lite/tests/detection/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/lite/tests/detection/conftest.py b/lite/tests/detection/conftest.py new file mode 100644 index 000000000..a6a500338 --- /dev/null +++ b/lite/tests/detection/conftest.py @@ -0,0 +1,504 @@ +import pytest +from valor_lite.detection import BoundingBox, Detection + + +@pytest.fixture +def rect1() -> tuple[float, float, float, float]: + """Box with area = 1500.""" + return (10.0, 60.0, 10.0, 40.0) + + +@pytest.fixture +def rect2() -> tuple[float, float, float, float]: + """Box with area = 1100.""" + return (15.0, 70.0, 0.0, 20.0) + + +@pytest.fixture +def rect3() -> tuple[float, float, float, float]: + """Box with area = 57,510.""" + return (87.0, 158.0, 10.0, 820.0) + + +@pytest.fixture +def rect4() -> tuple[float, float, float, float]: + """Box with area = 90.""" + return (1.0, 10.0, 10.0, 20.0) + + +@pytest.fixture +def rect5() -> tuple[float, float, float, float]: + """Box with partial overlap to rect3.""" + return (87, 158, 10, 400) + + +@pytest.fixture +def basic_detections( + rect1: tuple[float, float, float, float], + rect2: tuple[float, float, float, float], + rect3: tuple[float, float, float, float], +) -> list[Detection]: + return [ + Detection( + uid="uid1", + groundtruths=[ + BoundingBox( + xmin=rect1[0], + xmax=rect1[1], + ymin=rect1[2], + ymax=rect1[3], + labels=[("k1", "v1")], + ), + BoundingBox( + xmin=rect3[0], + xmax=rect3[1], + ymin=rect3[2], + ymax=rect3[3], + labels=[("k2", "v2")], + ), + ], + predictions=[ + BoundingBox( + xmin=rect1[0], + xmax=rect1[1], + ymin=rect1[2], + ymax=rect1[3], + labels=[("k1", "v1")], + scores=[0.3], + ), + ], + ), + Detection( + uid="uid2", + groundtruths=[ + BoundingBox( + xmin=rect2[0], + xmax=rect2[1], + ymin=rect2[2], + ymax=rect2[3], + labels=[("k1", "v1")], + ), + ], + predictions=[ + BoundingBox( + xmin=rect2[0], + xmax=rect2[1], + ymin=rect2[2], + ymax=rect2[3], + 
labels=[("k2", "v2")], + scores=[0.98], + ), + ], + ), + ] + + +@pytest.fixture +def torchmetrics_detections() -> list[Detection]: + """Creates a model called "test_model" with some predicted + detections on the dataset "test_dataset". These predictions are taken + from a torchmetrics unit test (see test_metrics.py) + """ + + # predictions for four images taken from + # https://github.com/Lightning-AI/metrics/blob/107dbfd5fb158b7ae6d76281df44bd94c836bfce/tests/unittests/detection/test_map.py#L59 + + groundtruths = [ + {"boxes": [[214.1500, 41.2900, 562.4100, 285.0700]], "labels": ["4"]}, + { + "boxes": [ + [13.00, 22.75, 548.98, 632.42], + [1.66, 3.32, 270.26, 275.23], + ], + "labels": ["2", "2"], + }, + { + "boxes": [ + [61.87, 276.25, 358.29, 379.43], + [2.75, 3.66, 162.15, 316.06], + [295.55, 93.96, 313.97, 152.79], + [326.94, 97.05, 340.49, 122.98], + [356.62, 95.47, 372.33, 147.55], + [462.08, 105.09, 493.74, 146.99], + [277.11, 103.84, 292.44, 150.72], + ], + "labels": ["4", "1", "0", "0", "0", "0", "0"], + }, + { + "boxes": [ + [72.92, 45.96, 91.23, 80.57], + [50.17, 45.34, 71.28, 79.83], + [81.28, 47.04, 98.66, 78.50], + [63.96, 46.17, 84.35, 80.48], + [75.29, 23.01, 91.85, 50.85], + [56.39, 21.65, 75.66, 45.54], + [73.14, 1.10, 98.96, 28.33], + [62.34, 55.23, 78.14, 79.57], + [44.17, 45.78, 63.99, 78.48], + [58.18, 44.80, 66.42, 56.25], + ], + "labels": [ + "49", + "49", + "49", + "49", + "49", + "49", + "49", + "49", + "49", + "49", + ], + }, + ] + predictions = [ + { + "boxes": [[258.15, 41.29, 606.41, 285.07]], + "scores": [0.236], + "labels": ["4"], + }, + { + "boxes": [ + [61.00, 22.75, 565.00, 632.42], + [12.66, 3.32, 281.26, 275.23], + ], + "scores": [0.318, 0.726], + "labels": ["3", "2"], + }, + { + "boxes": [ + [87.87, 276.25, 384.29, 379.43], + [0.00, 3.66, 142.15, 316.06], + [296.55, 93.96, 314.97, 152.79], + [328.94, 97.05, 342.49, 122.98], + [356.62, 95.47, 372.33, 147.55], + [464.08, 105.09, 495.74, 146.99], + [276.11, 103.84, 291.44, 
150.72], + ], + "scores": [0.546, 0.3, 0.407, 0.611, 0.335, 0.805, 0.953], + "labels": ["4", "1", "0", "0", "0", "0", "0"], + }, + { + "boxes": [ + [72.92, 45.96, 91.23, 80.57], + [45.17, 45.34, 66.28, 79.83], + [82.28, 47.04, 99.66, 78.50], + [59.96, 46.17, 80.35, 80.48], + [75.29, 23.01, 91.85, 50.85], + [71.14, 1.10, 96.96, 28.33], + [61.34, 55.23, 77.14, 79.57], + [41.17, 45.78, 60.99, 78.48], + [56.18, 44.80, 64.42, 56.25], + ], + "scores": [ + 0.532, + 0.204, + 0.782, + 0.202, + 0.883, + 0.271, + 0.561, + 0.204, + 0.349, + ], + "labels": ["49", "49", "49", "49", "49", "49", "49", "49", "49"], + }, + ] + + return [ + Detection( + uid=str(idx), + groundtruths=[ + BoundingBox( + xmin=box[0], + ymin=box[1], + xmax=box[2], + ymax=box[3], + labels=[("class", label_value)], + ) + for box, label_value in zip(gt["boxes"], gt["labels"]) + ], + predictions=[ + BoundingBox( + xmin=box[0], + ymin=box[1], + xmax=box[2], + ymax=box[3], + labels=[("class", label_value)], + scores=[score], + ) + for box, label_value, score in zip( + pd["boxes"], pd["labels"], pd["scores"] + ) + ], + ) + for idx, (gt, pd) in enumerate(zip(groundtruths, predictions)) + ] + + +@pytest.fixture +def false_negatives_single_datum_baseline_detections() -> list[Detection]: + return [ + Detection( + uid="uid1", + groundtruths=[ + BoundingBox( + xmin=10, + xmax=20, + ymin=10, + ymax=20, + labels=[("key", "value")], + ) + ], + predictions=[ + BoundingBox( + xmin=10, + xmax=20, + ymin=10, + ymax=20, + labels=[("key", "value")], + scores=[0.8], + ), + BoundingBox( + xmin=100, + xmax=110, + ymin=100, + ymax=200, + labels=[("key", "value")], + scores=[0.7], + ), + ], + ) + ] + + +@pytest.fixture +def false_negatives_single_datum_detections() -> list[Detection]: + return [ + Detection( + uid="uid1", + groundtruths=[ + BoundingBox( + xmin=10, + xmax=20, + ymin=10, + ymax=20, + labels=[("key", "value")], + ) + ], + predictions=[ + BoundingBox( + xmin=10, + xmax=20, + ymin=10, + ymax=20, + labels=[("key", 
"value")], + scores=[0.8], + ), + BoundingBox( + xmin=100, + xmax=110, + ymin=100, + ymax=200, + labels=[("key", "value")], + scores=[0.9], + ), + ], + ) + ] + + +@pytest.fixture +def false_negatives_two_datums_one_empty_low_confidence_of_fp_detections() -> list[ + Detection +]: + + return [ + Detection( + uid="uid1", + groundtruths=[ + BoundingBox( + xmin=10, + xmax=20, + ymin=10, + ymax=20, + labels=[("key", "value")], + ) + ], + predictions=[ + BoundingBox( + xmin=10, + xmax=20, + ymin=10, + ymax=20, + labels=[("key", "value")], + scores=[0.8], + ), + ], + ), + Detection( + uid="uid2", + groundtruths=[], + predictions=[ + BoundingBox( + xmin=10, + xmax=20, + ymin=10, + ymax=20, + labels=[("key", "value")], + scores=[0.7], + ), + ], + ), + ] + + +@pytest.fixture +def false_negatives_two_datums_one_empty_high_confidence_of_fp_detections() -> list[ + Detection +]: + + return [ + Detection( + uid="uid1", + groundtruths=[ + BoundingBox( + xmin=10, + xmax=20, + ymin=10, + ymax=20, + labels=[("key", "value")], + ) + ], + predictions=[ + BoundingBox( + xmin=10, + xmax=20, + ymin=10, + ymax=20, + labels=[("key", "value")], + scores=[0.8], + ), + ], + ), + Detection( + uid="uid2", + groundtruths=[], + predictions=[ + BoundingBox( + xmin=10, + xmax=20, + ymin=10, + ymax=20, + labels=[("key", "value")], + scores=[0.9], + ), + ], + ), + ] + + +@pytest.fixture +def false_negatives_two_datums_one_only_with_different_class_low_confidence_of_fp_detections() -> list[ + Detection +]: + + return [ + Detection( + uid="uid1", + groundtruths=[ + BoundingBox( + xmin=10, + xmax=20, + ymin=10, + ymax=20, + labels=[("key", "value")], + ) + ], + predictions=[ + BoundingBox( + xmin=10, + xmax=20, + ymin=10, + ymax=20, + labels=[("key", "value")], + scores=[0.8], + ), + ], + ), + Detection( + uid="uid2", + groundtruths=[ + BoundingBox( + xmin=10, + xmax=20, + ymin=10, + ymax=20, + labels=[("key", "other value")], + ) + ], + predictions=[ + BoundingBox( + xmin=10, + xmax=20, + ymin=10, + 
ymax=20, + labels=[("key", "value")], + scores=[0.7], + ), + ], + ), + ] + + +@pytest.fixture +def false_negatives_two_images_one_only_with_different_class_high_confidence_of_fp_detections() -> list[ + Detection +]: + + return [ + Detection( + uid="uid1", + groundtruths=[ + BoundingBox( + xmin=10, + xmax=20, + ymin=10, + ymax=20, + labels=[("key", "value")], + ) + ], + predictions=[ + BoundingBox( + xmin=10, + xmax=20, + ymin=10, + ymax=20, + labels=[("key", "value")], + scores=[0.8], + ), + ], + ), + Detection( + uid="uid2", + groundtruths=[ + BoundingBox( + xmin=10, + xmax=20, + ymin=10, + ymax=20, + labels=[("key", "other value")], + ) + ], + predictions=[ + BoundingBox( + xmin=10, + xmax=20, + ymin=10, + ymax=20, + labels=[("key", "value")], + scores=[0.9], + ), + ], + ), + ] diff --git a/lite/tests/detection/test_average_precision.py b/lite/tests/detection/test_average_precision.py new file mode 100644 index 000000000..21e01ecaa --- /dev/null +++ b/lite/tests/detection/test_average_precision.py @@ -0,0 +1,623 @@ +import numpy as np +from valor_lite.detection import ( + DataLoader, + Detection, + MetricType, + compute_metrics, +) + + +def test__compute_average_precision(): + + sorted_pairs = np.array( + [ + # dt, gt, pd, iou, gl, pl, score, + [0.0, 0.0, 2.0, 0.25, 0.0, 0.0, 0.95], + [0.0, 0.0, 3.0, 0.33333, 0.0, 0.0, 0.9], + [0.0, 0.0, 4.0, 0.66667, 0.0, 0.0, 0.65], + [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.1], + [0.0, 0.0, 1.0, 0.5, 0.0, 0.0, 0.01], + ] + ) + + label_counts = np.array([[1, 5, 0]]) + iou_thresholds = np.array([0.1, 0.6]) + score_thresholds = np.array([0.0]) + + (results, _, _, _,) = compute_metrics( + sorted_pairs, + label_counts=label_counts, + iou_thresholds=iou_thresholds, + score_thresholds=score_thresholds, + ) + ( + average_precision, + mean_average_precision, + average_precision_averaged_over_ious, + mean_average_precision_averaged_over_ious, + ) = results + + expected_ap = np.array( + [ + [1.0], # iou = 0.1 + [1 / 3], # iou = 0.6 + ] + ) + 
assert expected_ap.shape == average_precision.shape + assert np.isclose(average_precision, expected_ap).all() + + # since only one class, ap == map + assert expected_ap.shape == mean_average_precision.shape + assert np.isclose(mean_average_precision, expected_ap).all() + + expected_average = np.array([2 / 3]) + + assert average_precision_averaged_over_ious.shape == expected_average.shape + assert np.isclose( + average_precision_averaged_over_ious, expected_average + ).all() + + # since only one class, ap == map + assert ( + mean_average_precision_averaged_over_ious.shape + == expected_average.shape + ) + assert np.isclose( + mean_average_precision_averaged_over_ious, expected_average + ).all() + + +def test_ap_metrics(basic_detections: list[Detection]): + """ + Basic object detection test. + + groundtruths + datum uid1 + box 1 - label (k1, v1) - tp + box 3 - label (k2, v2) - fn missing prediction + datum uid2 + box 2 - label (k1, v1) - fn misclassification + + predictions + datum uid1 + box 1 - label (k1, v1) - score 0.3 - tp + datum uid2 + box 2 - label (k2, v2) - score 0.98 - fp + """ + + manager = DataLoader() + manager.add_data(basic_detections) + evaluator = manager.finalize() + + metrics = evaluator.evaluate( + iou_thresholds=[0.1, 0.6], + ) + + assert evaluator.ignored_prediction_labels == [] + assert evaluator.missing_prediction_labels == [] + assert evaluator.n_datums == 2 + assert evaluator.n_labels == 2 + assert evaluator.n_groundtruths == 3 + assert evaluator.n_predictions == 2 + + # test AP + actual_metrics = [m.to_dict() for m in metrics[MetricType.AP]] + expected_metrics = [ + { + "type": "AP", + "value": 0.504950495049505, + "parameters": { + "iou": 0.1, + "label": {"key": "k1", "value": "v1"}, + }, + }, + { + "type": "AP", + "value": 0.504950495049505, + "parameters": { + "iou": 0.6, + "label": {"key": "k1", "value": "v1"}, + }, + }, + { + "type": "AP", + "value": 0.0, + "parameters": { + "iou": 0.1, + "label": {"key": "k2", "value": "v2"}, + }, + 
}, + { + "type": "AP", + "value": 0.0, + "parameters": { + "iou": 0.6, + "label": {"key": "k2", "value": "v2"}, + }, + }, + ] + for m in actual_metrics: + assert m in expected_metrics + for m in expected_metrics: + assert m in actual_metrics + + # test mAP + actual_metrics = [m.to_dict() for m in metrics[MetricType.mAP]] + expected_metrics = [ + { + "type": "mAP", + "value": 0.504950495049505, + "parameters": { + "iou": 0.1, + "label_key": "k1", + }, + }, + { + "type": "mAP", + "value": 0.504950495049505, + "parameters": { + "iou": 0.6, + "label_key": "k1", + }, + }, + { + "type": "mAP", + "value": 0.0, + "parameters": { + "iou": 0.1, + "label_key": "k2", + }, + }, + { + "type": "mAP", + "value": 0.0, + "parameters": { + "iou": 0.6, + "label_key": "k2", + }, + }, + ] + for m in actual_metrics: + assert m in expected_metrics + for m in expected_metrics: + assert m in actual_metrics + + # test AP Averaged Over IoUs + actual_metrics = [ + m.to_dict() for m in metrics[MetricType.APAveragedOverIOUs] + ] + expected_metrics = [ + { + "type": "APAveragedOverIOUs", + "value": 0.504950495049505, + "parameters": { + "ious": [0.1, 0.6], + "label": {"key": "k1", "value": "v1"}, + }, + }, + { + "type": "APAveragedOverIOUs", + "value": 0.0, + "parameters": { + "ious": [0.1, 0.6], + "label": {"key": "k2", "value": "v2"}, + }, + }, + ] + for m in actual_metrics: + assert m in expected_metrics + for m in expected_metrics: + assert m in actual_metrics + + # test mAP Averaged Over IoUs + actual_metrics = [ + m.to_dict() for m in metrics[MetricType.mAPAveragedOverIOUs] + ] + expected_metrics = [ + { + "type": "mAPAveragedOverIOUs", + "value": 0.504950495049505, + "parameters": { + "ious": [0.1, 0.6], + "label_key": "k1", + }, + }, + { + "type": "mAPAveragedOverIOUs", + "value": 0.0, + "parameters": { + "ious": [0.1, 0.6], + "label_key": "k2", + }, + }, + ] + for m in actual_metrics: + assert m in expected_metrics + for m in expected_metrics: + assert m in actual_metrics + + +def 
test_ap_using_torch_metrics_example( + torchmetrics_detections: list[Detection], +): + """ + cf with torch metrics/pycocotools results listed here: + https://github.com/Lightning-AI/metrics/blob/107dbfd5fb158b7ae6d76281df44bd94c836bfce/tests/unittests/detection/test_map.py#L231 + """ + manager = DataLoader() + manager.add_data(torchmetrics_detections) + evaluator = manager.finalize() + + assert evaluator.ignored_prediction_labels == [("class", "3")] + assert evaluator.missing_prediction_labels == [] + assert evaluator.n_datums == 4 + assert evaluator.n_labels == 6 + assert evaluator.n_groundtruths == 20 + assert evaluator.n_predictions == 19 + + metrics = evaluator.evaluate( + iou_thresholds=[0.5, 0.75], + ) + + # test AP + actual_metrics = [m.to_dict() for m in metrics[MetricType.AP]] + expected_metrics = [ + { + "type": "AP", + "value": 1.0, + "parameters": { + "iou": 0.5, + "label": {"key": "class", "value": "0"}, + }, + }, + { + "type": "AP", + "value": 0.7227722772277229, + "parameters": { + "iou": 0.75, + "label": {"key": "class", "value": "0"}, + }, + }, + { + "type": "AP", + "value": 1.0, + "parameters": { + "iou": 0.5, + "label": {"key": "class", "value": "1"}, + }, + }, + { + "type": "AP", + "value": 1.0, + "parameters": { + "iou": 0.75, + "label": {"key": "class", "value": "1"}, + }, + }, + { + "type": "AP", + "value": 0.504950495049505, + "parameters": { + "iou": 0.5, + "label": {"key": "class", "value": "2"}, + }, + }, + { + "type": "AP", + "value": 0.504950495049505, + "parameters": { + "iou": 0.75, + "label": {"key": "class", "value": "2"}, + }, + }, + { + "type": "AP", + "value": 1.0, + "parameters": { + "iou": 0.5, + "label": {"key": "class", "value": "4"}, + }, + }, + { + "type": "AP", + "value": 1.0, + "parameters": { + "iou": 0.75, + "label": {"key": "class", "value": "4"}, + }, + }, + { + "type": "AP", + "value": 0.7909790979097909, + "parameters": { + "iou": 0.5, + "label": {"key": "class", "value": "49"}, + }, + }, + { + "type": "AP", + 
"value": 0.5756718528995757, + "parameters": { + "iou": 0.75, + "label": {"key": "class", "value": "49"}, + }, + }, + ] + for m in actual_metrics: + assert m in expected_metrics + for m in expected_metrics: + assert m in actual_metrics + + # test mAP + actual_metrics = [m.to_dict() for m in metrics[MetricType.mAP]] + expected_metrics = [ + { + "type": "mAP", + "value": 0.8591859185918592, + "parameters": { + "iou": 0.5, + "label_key": "class", + }, + }, + { + "type": "mAP", + "value": 0.7606789250353607, + "parameters": { + "iou": 0.75, + "label_key": "class", + }, + }, + ] + for m in actual_metrics: + assert m in expected_metrics + for m in expected_metrics: + assert m in actual_metrics + + +def test_ap_false_negatives_single_datum_baseline( + false_negatives_single_datum_baseline_detections: list[Detection], +): + """This is the baseline for the below test. In this case there are two predictions and + one groundtruth, but the highest confident prediction overlaps sufficiently with the groundtruth + so there is not a penalty for the false negative so the AP is 1 + """ + + manager = DataLoader() + manager.add_data(false_negatives_single_datum_baseline_detections) + evaluator = manager.finalize() + metrics = evaluator.evaluate(iou_thresholds=[0.5]) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.AP]] + expected_metrics = [ + { + "type": "AP", + "value": 1.0, + "parameters": { + "iou": 0.5, + "label": { + "key": "key", + "value": "value", + }, + }, + } + ] + for m in expected_metrics: + assert m in actual_metrics + for m in actual_metrics: + assert m in expected_metrics + + +def test_ap_false_negatives_single_datum( + false_negatives_single_datum_detections: list[Detection], +): + """Tests where high confidence false negative was not being penalized. 
The + difference between this test and the above is that here the prediction with higher confidence + does not sufficiently overlap the groundtruth and so is penalized and we get an AP of 0.5 + """ + + manager = DataLoader() + manager.add_data(false_negatives_single_datum_detections) + evaluator = manager.finalize() + metrics = evaluator.evaluate(iou_thresholds=[0.5]) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.AP]] + expected_metrics = [ + { + "type": "AP", + "value": 0.5, + "parameters": { + "iou": 0.5, + "label": { + "key": "key", + "value": "value", + }, + }, + } + ] + for m in expected_metrics: + assert m in actual_metrics + for m in actual_metrics: + assert m in expected_metrics + + +def test_ap_false_negatives_two_datums_one_empty_low_confidence_of_fp( + false_negatives_two_datums_one_empty_low_confidence_of_fp_detections: list[ + Detection + ], +): + """In this test we have + 1. An image with a matching groundtruth and prediction (same class and high IOU) + 2. A second image with empty groundtruth annotation but a prediction with lower confidence + then the prediction on the first image. 
+ + In this case, the AP should be 1.0 since the false positive has lower confidence than the true positive + + """ + + manager = DataLoader() + manager.add_data( + false_negatives_two_datums_one_empty_low_confidence_of_fp_detections + ) + evaluator = manager.finalize() + metrics = evaluator.evaluate(iou_thresholds=[0.5]) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.AP]] + expected_metrics = [ + { + "type": "AP", + "value": 1.0, + "parameters": { + "iou": 0.5, + "label": { + "key": "key", + "value": "value", + }, + }, + } + ] + for m in expected_metrics: + assert m in actual_metrics + for m in actual_metrics: + assert m in expected_metrics + + +def test_ap_false_negatives_two_datums_one_empty_high_confidence_of_fp( + false_negatives_two_datums_one_empty_high_confidence_of_fp_detections: list[ + Detection + ], +): + """In this test we have + 1. An image with a matching groundtruth and prediction (same class and high IOU) + 2. A second image with empty groundtruth annotation and a prediction with higher confidence + then the prediction on the first image. + + In this case, the AP should be 0.5 since the false positive has higher confidence than the true positive + """ + + manager = DataLoader() + manager.add_data( + false_negatives_two_datums_one_empty_high_confidence_of_fp_detections + ) + evaluator = manager.finalize() + metrics = evaluator.evaluate(iou_thresholds=[0.5]) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.AP]] + expected_metrics = [ + { + "type": "AP", + "value": 0.5, + "parameters": { + "iou": 0.5, + "label": { + "key": "key", + "value": "value", + }, + }, + } + ] + for m in expected_metrics: + assert m in actual_metrics + for m in actual_metrics: + assert m in expected_metrics + + +def test_ap_false_negatives_two_datums_one_only_with_different_class_low_confidence_of_fp( + false_negatives_two_datums_one_only_with_different_class_low_confidence_of_fp_detections: list[ + Detection + ], +): + """In this test we have + 1. 
An image with a matching groundtruth and prediction (same class, `"value"`, and high IOU) + 2. A second image with a groundtruth annotation with class `"other value"` and a prediction with lower confidence + then the prediction on the first image. + + In this case, the AP for class `"value"` should be 1 since the false positive has lower confidence than the true positive. + AP for class `"other value"` should be 0 since there is no prediction for the `"other value"` groundtruth + """ + manager = DataLoader() + manager.add_data( + false_negatives_two_datums_one_only_with_different_class_low_confidence_of_fp_detections + ) + evaluator = manager.finalize() + metrics = evaluator.evaluate(iou_thresholds=[0.5]) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.AP]] + expected_metrics = [ + { + "type": "AP", + "value": 1.0, + "parameters": { + "iou": 0.5, + "label": { + "key": "key", + "value": "value", + }, + }, + }, + { + "type": "AP", + "value": 0.0, + "parameters": { + "iou": 0.5, + "label": { + "key": "key", + "value": "other value", + }, + }, + }, + ] + for m in expected_metrics: + assert m in actual_metrics + for m in actual_metrics: + assert m in expected_metrics + + +def test_ap_false_negatives_two_datums_one_only_with_different_class_high_confidence_of_fp( + false_negatives_two_images_one_only_with_different_class_high_confidence_of_fp_detections: list[ + Detection + ], +): + """In this test we have + 1. An image with a matching groundtruth and prediction (same class, `"value"`, and high IOU) + 2. A second image with a groundtruth annotation with class `"other value"` and a prediction with higher confidence + then the prediction on the first image. + + In this case, the AP for class `"value"` should be 0.5 since the false positive has higher confidence than the true positive. 
+ AP for class `"other value"` should be 0 since there is no prediction for the `"other value"` groundtruth + """ + manager = DataLoader() + manager.add_data( + false_negatives_two_images_one_only_with_different_class_high_confidence_of_fp_detections + ) + evaluator = manager.finalize() + metrics = evaluator.evaluate(iou_thresholds=[0.5]) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.AP]] + expected_metrics = [ + { + "type": "AP", + "value": 0.5, + "parameters": { + "iou": 0.5, + "label": { + "key": "key", + "value": "value", + }, + }, + }, + { + "type": "AP", + "value": 0.0, + "parameters": { + "iou": 0.5, + "label": { + "key": "key", + "value": "other value", + }, + }, + }, + ] + for m in expected_metrics: + assert m in actual_metrics + for m in actual_metrics: + assert m in expected_metrics diff --git a/lite/tests/detection/test_average_recall.py b/lite/tests/detection/test_average_recall.py new file mode 100644 index 000000000..c793d726c --- /dev/null +++ b/lite/tests/detection/test_average_recall.py @@ -0,0 +1,246 @@ +import numpy as np +from valor_lite.detection import ( + DataLoader, + Detection, + MetricType, + compute_metrics, +) + + +def test__compute_average_recall(): + + sorted_pairs = np.array( + [ + # dt, gt, pd, iou, gl, pl, score, + [0.0, 0.0, 2.0, 0.25, 0.0, 0.0, 0.95], + [0.0, 1.0, 3.0, 0.33333, 0.0, 0.0, 0.9], + [0.0, 0.0, 4.0, 0.66667, 0.0, 0.0, 0.65], + [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.1], + [0.0, 0.0, 1.0, 0.5, 0.0, 0.0, 0.01], + [0.0, 2.0, 5.0, 0.5, 1.0, 1.0, 0.95], + ] + ) + + label_counts = np.array([[2, 5, 0], [1, 1, 0]]) + iou_thresholds = np.array([0.1, 0.6]) + score_thresholds = np.array([0.5, 0.93, 0.98]) + + (_, results, _, _,) = compute_metrics( + sorted_pairs, + label_counts=label_counts, + iou_thresholds=iou_thresholds, + score_thresholds=score_thresholds, + ) + ( + average_recall, + mean_average_recall, + average_recall_averaged_over_scores, + mean_average_recall_averaged_over_scores, + ) = results + + expected = 
np.array( + [ + [0.75, 0.5], + [0.25, 0.5], + [0.0, 0.0], + ] + ) + assert expected.shape == average_recall.shape + assert np.isclose(average_recall, expected).all() + + expected = np.array( + [ + [(0.75 + 0.5) / 2.0], + [(0.25 + 0.5) / 2.0], + [0.0], + ] + ) + assert expected.shape == mean_average_recall.shape + assert np.isclose(mean_average_recall, expected).all() + + expected = np.array( + [1 / 3, 1 / 3], + ) + assert expected.shape == average_recall_averaged_over_scores.shape + assert np.isclose(average_recall_averaged_over_scores, expected).all() + + expected = np.array( + [1 / 3], + ) + assert expected.shape == mean_average_recall_averaged_over_scores.shape + assert np.isclose(mean_average_recall_averaged_over_scores, expected).all() + + +def test_ar_using_torch_metrics_example( + torchmetrics_detections: list[Detection], +): + """ + cf with torch metrics/pycocotools results listed here: + https://github.com/Lightning-AI/metrics/blob/107dbfd5fb158b7ae6d76281df44bd94c836bfce/tests/unittests/detection/test_map.py#L231 + """ + manager = DataLoader() + manager.add_data(torchmetrics_detections) + evaluator = manager.finalize() + + assert evaluator.ignored_prediction_labels == [("class", "3")] + assert evaluator.missing_prediction_labels == [] + assert evaluator.n_datums == 4 + assert evaluator.n_labels == 6 + assert evaluator.n_groundtruths == 20 + assert evaluator.n_predictions == 19 + + score_thresholds = [0.0] + iou_thresholds = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + + metrics = evaluator.evaluate( + iou_thresholds=iou_thresholds, + score_thresholds=score_thresholds, + ) + + # test AR + actual_metrics = [m.to_dict() for m in metrics[MetricType.AR]] + expected_metrics = [ + { + "type": "AR", + "value": 0.45, + "parameters": { + "ious": iou_thresholds, + "score": 0.0, + "label": {"key": "class", "value": "2"}, + }, + }, + { + "type": "AR", + "value": 0.5800000000000001, + "parameters": { + "ious": iou_thresholds, + "score": 0.0, + "label": 
{"key": "class", "value": "49"}, + }, + }, + { + "type": "AR", + "value": 0.78, + "parameters": { + "ious": iou_thresholds, + "score": 0.0, + "label": {"key": "class", "value": "0"}, + }, + }, + { + "type": "AR", + "value": 0.8, + "parameters": { + "ious": iou_thresholds, + "score": 0.0, + "label": {"key": "class", "value": "1"}, + }, + }, + { + "type": "AR", + "value": 0.65, + "parameters": { + "ious": iou_thresholds, + "score": 0.0, + "label": {"key": "class", "value": "4"}, + }, + }, + ] + for m in actual_metrics: + assert m in expected_metrics + for m in expected_metrics: + assert m in actual_metrics + + # test mAR + actual_metrics = [m.to_dict() for m in metrics[MetricType.mAR]] + expected_metrics = [ + { + "type": "mAR", + "value": 0.652, + "parameters": { + "ious": iou_thresholds, + "score": 0.0, + "label_key": "class", + }, + } + ] + for m in actual_metrics: + assert m in expected_metrics + for m in expected_metrics: + assert m in actual_metrics + + # test ARAveragedOverScores + actual_metrics = [ + m.to_dict() for m in metrics[MetricType.ARAveragedOverScores] + ] + expected_metrics = [ + { + "type": "ARAveragedOverScores", + "value": 0.45, + "parameters": { + "ious": iou_thresholds, + "scores": [0.0], + "label": {"key": "class", "value": "2"}, + }, + }, + { + "type": "ARAveragedOverScores", + "value": 0.5800000000000001, + "parameters": { + "ious": iou_thresholds, + "scores": [0.0], + "label": {"key": "class", "value": "49"}, + }, + }, + { + "type": "ARAveragedOverScores", + "value": 0.78, + "parameters": { + "ious": iou_thresholds, + "scores": [0.0], + "label": {"key": "class", "value": "0"}, + }, + }, + { + "type": "ARAveragedOverScores", + "value": 0.8, + "parameters": { + "ious": iou_thresholds, + "scores": [0.0], + "label": {"key": "class", "value": "1"}, + }, + }, + { + "type": "ARAveragedOverScores", + "value": 0.65, + "parameters": { + "ious": iou_thresholds, + "scores": [0.0], + "label": {"key": "class", "value": "4"}, + }, + }, + ] + for m in 
actual_metrics: + assert m in expected_metrics + for m in expected_metrics: + assert m in actual_metrics + + # test mARAveragedOverScores + actual_metrics = [ + m.to_dict() for m in metrics[MetricType.mARAveragedOverScores] + ] + expected_metrics = [ + { + "type": "mARAveragedOverScores", + "value": 0.652, + "parameters": { + "ious": iou_thresholds, + "scores": [0.0], + "label_key": "class", + }, + } + ] + for m in actual_metrics: + assert m in expected_metrics + for m in expected_metrics: + assert m in actual_metrics diff --git a/lite/tests/detection/test_counts.py b/lite/tests/detection/test_counts.py new file mode 100644 index 000000000..4ff066412 --- /dev/null +++ b/lite/tests/detection/test_counts.py @@ -0,0 +1,457 @@ +from valor_lite.detection import DataLoader, Detection, MetricType + + +def test_counts_metrics(basic_detections: list[Detection]): + """ + Basic object detection test. + + groundtruths + datum uid1 + box 1 - label (k1, v1) - tp + box 3 - label (k2, v2) - fn missing prediction + datum uid2 + box 2 - label (k1, v1) - fn misclassification + + predictions + datum uid1 + box 1 - label (k1, v1) - score 0.3 - tp + datum uid2 + box 2 - label (k2, v2) - score 0.98 - fp + """ + + manager = DataLoader() + manager.add_data(basic_detections) + evaluator = manager.finalize() + + metrics = evaluator.evaluate( + iou_thresholds=[0.1, 0.6], + score_thresholds=[0.0, 0.5], + ) + + assert evaluator.ignored_prediction_labels == [] + assert evaluator.missing_prediction_labels == [] + assert evaluator.n_datums == 2 + assert evaluator.n_labels == 2 + assert evaluator.n_groundtruths == 3 + assert evaluator.n_predictions == 2 + + # test Counts + actual_metrics = [m.to_dict() for m in metrics[MetricType.Counts]] + expected_metrics = [ + { + "type": "Counts", + "value": { + "tp": 0, + "fp": 1, + "fn": 1, + }, + "parameters": { + "iou": 0.1, + "score": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + }, + { + "type": "Counts", + "value": { + "tp": 0, + "fp": 1, + "fn": 
1, + }, + "parameters": { + "iou": 0.6, + "score": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + }, + { + "type": "Counts", + "value": { + "tp": 1, + "fp": 0, + "fn": 1, + }, + "parameters": { + "iou": 0.1, + "score": 0.0, + "label": {"key": "k1", "value": "v1"}, + }, + }, + { + "type": "Counts", + "value": { + "tp": 1, + "fp": 0, + "fn": 1, + }, + "parameters": { + "iou": 0.6, + "score": 0.0, + "label": {"key": "k1", "value": "v1"}, + }, + }, + { + "type": "Counts", + "value": { + "tp": 0, + "fp": 1, + "fn": 1, + }, + "parameters": { + "iou": 0.1, + "score": 0.5, + "label": {"key": "k2", "value": "v2"}, + }, + }, + { + "type": "Counts", + "value": { + "tp": 0, + "fp": 1, + "fn": 1, + }, + "parameters": { + "iou": 0.6, + "score": 0.5, + "label": {"key": "k2", "value": "v2"}, + }, + }, + { + "type": "Counts", + "value": { + "tp": 0, + "fp": 0, + "fn": 2, + }, + "parameters": { + "iou": 0.1, + "score": 0.5, + "label": {"key": "k1", "value": "v1"}, + }, + }, + { + "type": "Counts", + "value": { + "tp": 0, + "fp": 0, + "fn": 2, + }, + "parameters": { + "iou": 0.6, + "score": 0.5, + "label": {"key": "k1", "value": "v1"}, + }, + }, + ] + for m in actual_metrics: + assert m in expected_metrics + for m in expected_metrics: + assert m in actual_metrics + + +def test_counts_false_negatives_single_datum_baseline( + false_negatives_single_datum_baseline_detections: list[Detection], +): + """This is the baseline for the below test. 
In this case there are two predictions and + one groundtruth, but the highest confident prediction overlaps sufficiently with the groundtruth + so there is not a penalty for the false negative so the AP is 1 + """ + + manager = DataLoader() + manager.add_data(false_negatives_single_datum_baseline_detections) + evaluator = manager.finalize() + + metrics = evaluator.evaluate( + iou_thresholds=[0.5], score_thresholds=[0.0, 0.9] + ) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.Counts]] + expected_metrics = [ + { + "type": "Counts", + "value": { + "tp": 1, + "fp": 1, + "fn": 0, + }, + "parameters": { + "iou": 0.5, + "score": 0.0, + "label": { + "key": "key", + "value": "value", + }, + }, + }, + { + "type": "Counts", + "value": { + "tp": 0, + "fp": 0, + "fn": 1, + }, + "parameters": { + "iou": 0.5, + "score": 0.9, + "label": { + "key": "key", + "value": "value", + }, + }, + }, + ] + for m in actual_metrics: + assert m in expected_metrics + for m in expected_metrics: + assert m in actual_metrics + + +def test_counts_false_negatives_single_datum( + false_negatives_single_datum_detections: list[Detection], +): + """Tests where high confidence false negative was not being penalized. 
The + difference between this test and the above is that here the prediction with higher confidence + does not sufficiently overlap the groundtruth and so is penalized and we get an AP of 0.5 + """ + + manager = DataLoader() + manager.add_data(false_negatives_single_datum_detections) + evaluator = manager.finalize() + metrics = evaluator.evaluate(iou_thresholds=[0.5], score_thresholds=[0.0]) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.Counts]] + expected_metrics = [ + { + "type": "Counts", + "value": { + "tp": 1, + "fp": 1, + "fn": 0, + }, + "parameters": { + "iou": 0.5, + "score": 0.0, + "label": { + "key": "key", + "value": "value", + }, + }, + } + ] + for m in actual_metrics: + assert m in expected_metrics + for m in expected_metrics: + assert m in actual_metrics + + +def test_counts_false_negatives_two_datums_one_empty_low_confidence_of_fp( + false_negatives_two_datums_one_empty_low_confidence_of_fp_detections: list[ + Detection + ], +): + """In this test we have + 1. An image with a matching groundtruth and prediction (same class and high IOU) + 2. A second image with empty groundtruth annotation but a prediction with lower confidence + than the prediction on the first image.
+ + In this case, the AP should be 1.0 since the false positive has lower confidence than the true positive + + """ + + manager = DataLoader() + manager.add_data( + false_negatives_two_datums_one_empty_low_confidence_of_fp_detections + ) + evaluator = manager.finalize() + metrics = evaluator.evaluate(iou_thresholds=[0.5], score_thresholds=[0.0]) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.Counts]] + expected_metrics = [ + { + "type": "Counts", + "value": { + "tp": 1, + "fp": 1, + "fn": 0, + }, + "parameters": { + "iou": 0.5, + "score": 0.0, + "label": { + "key": "key", + "value": "value", + }, + }, + } + ] + for m in expected_metrics: + assert m in actual_metrics + for m in actual_metrics: + assert m in expected_metrics + + +def test_counts_false_negatives_two_datums_one_empty_high_confidence_of_fp( + false_negatives_two_datums_one_empty_high_confidence_of_fp_detections: list[ + Detection + ], +): + """In this test we have + 1. An image with a matching groundtruth and prediction (same class and high IOU) + 2. A second image with empty groundtruth annotation and a prediction with higher confidence + than the prediction on the first image.
+ + In this case, the AP should be 0.5 since the false positive has higher confidence than the true positive + """ + + manager = DataLoader() + manager.add_data( + false_negatives_two_datums_one_empty_high_confidence_of_fp_detections + ) + evaluator = manager.finalize() + metrics = evaluator.evaluate(iou_thresholds=[0.5], score_thresholds=[0.0]) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.Counts]] + expected_metrics = [ + { + "type": "Counts", + "value": { + "tp": 1, + "fp": 1, + "fn": 0, + }, + "parameters": { + "iou": 0.5, + "score": 0.0, + "label": { + "key": "key", + "value": "value", + }, + }, + } + ] + for m in expected_metrics: + assert m in actual_metrics + for m in actual_metrics: + assert m in expected_metrics + + +def test_counts_false_negatives_two_datums_one_only_with_different_class_low_confidence_of_fp( + false_negatives_two_datums_one_only_with_different_class_low_confidence_of_fp_detections: list[ + Detection + ], +): + """In this test we have + 1. An image with a matching groundtruth and prediction (same class, `"value"`, and high IOU) + 2. A second image with a groundtruth annotation with class `"other value"` and a prediction with lower confidence + than the prediction on the first image. + + In this case, the AP for class `"value"` should be 1 since the false positive has lower confidence than the true positive.
+ AP for class `"other value"` should be 0 since there is no prediction for the `"other value"` groundtruth + """ + manager = DataLoader() + manager.add_data( + false_negatives_two_datums_one_only_with_different_class_low_confidence_of_fp_detections + ) + evaluator = manager.finalize() + metrics = evaluator.evaluate(iou_thresholds=[0.5], score_thresholds=[0.0]) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.Counts]] + expected_metrics = [ + { + "type": "Counts", + "value": { + "tp": 1, + "fp": 1, + "fn": 0, + }, + "parameters": { + "iou": 0.5, + "score": 0.0, + "label": { + "key": "key", + "value": "value", + }, + }, + }, + { + "type": "Counts", + "value": { + "tp": 0, + "fp": 0, + "fn": 1, + }, + "parameters": { + "iou": 0.5, + "score": 0.0, + "label": { + "key": "key", + "value": "other value", + }, + }, + }, + ] + for m in expected_metrics: + assert m in actual_metrics + for m in actual_metrics: + assert m in expected_metrics + + +def test_counts_false_negatives_two_datums_one_only_with_different_class_high_confidence_of_fp( + false_negatives_two_images_one_only_with_different_class_high_confidence_of_fp_detections: list[ + Detection + ], +): + """In this test we have + 1. An image with a matching groundtruth and prediction (same class, `"value"`, and high IOU) + 2. A second image with a groundtruth annotation with class `"other value"` and a prediction with higher confidence + than the prediction on the first image. + + In this case, the AP for class `"value"` should be 0.5 since the false positive has higher confidence than the true positive.
+ AP for class `"other value"` should be 0 since there is no prediction for the `"other value"` groundtruth + """ + manager = DataLoader() + manager.add_data( + false_negatives_two_images_one_only_with_different_class_high_confidence_of_fp_detections + ) + evaluator = manager.finalize() + metrics = evaluator.evaluate(iou_thresholds=[0.5], score_thresholds=[0.0]) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.Counts]] + expected_metrics = [ + { + "type": "Counts", + "value": { + "tp": 1, + "fp": 1, + "fn": 0, + }, + "parameters": { + "iou": 0.5, + "score": 0.0, + "label": { + "key": "key", + "value": "value", + }, + }, + }, + { + "type": "Counts", + "value": { + "tp": 0, + "fp": 0, + "fn": 1, + }, + "parameters": { + "iou": 0.5, + "score": 0.0, + "label": { + "key": "key", + "value": "other value", + }, + }, + }, + ] + for m in expected_metrics: + assert m in actual_metrics + for m in actual_metrics: + assert m in expected_metrics diff --git a/lite/tests/detection/test_dataloader.py b/lite/tests/detection/test_dataloader.py new file mode 100644 index 000000000..210a0e421 --- /dev/null +++ b/lite/tests/detection/test_dataloader.py @@ -0,0 +1,34 @@ +import json + +import pytest +from valor_lite.detection import DataLoader + + +def test_no_data(): + loader = DataLoader() + with pytest.raises(ValueError): + loader.finalize() + + +def test_valor_integration(): + + gt_json = '{"datum": {"uid": "139", "text": null, "metadata": {"license": 2, "file_name": "000000000139.jpg", "coco_url": "http://images.cocodataset.org/val2017/000000000139.jpg", "date_captured": "2013-11-21 01:34:01", "flickr_url": "http://farm9.staticflickr.com/8035/8024364858_9c41dc1666_z.jpg", "height": 426, "width": 640}}, "annotations": [{"metadata": {}, "labels": [{"key": "supercategory", "value": "person", "score": null}, {"key": "name", "value": "person", "score": null}, {"key": "iscrowd", "value": "0", "score": null}], "bounding_box": [[[158.0, 413.0], [295.0, 413.0], [295.0, 465.0], 
[158.0, 465.0], [158.0, 413.0]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "supercategory", "value": "person", "score": null}, {"key": "name", "value": "person", "score": null}, {"key": "iscrowd", "value": "0", "score": null}], "bounding_box": [[[172.0, 384.0], [207.0, 384.0], [207.0, 399.0], [172.0, 399.0], [172.0, 384.0]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "supercategory", "value": "furniture", "score": null}, {"key": "name", "value": "chair", "score": null}, {"key": "iscrowd", "value": "0", "score": null}], "bounding_box": [[[223.0, 413.0], [303.0, 413.0], [303.0, 442.0], [223.0, 442.0], [223.0, 413.0]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "supercategory", "value": "furniture", "score": null}, {"key": "name", "value": "chair", "score": null}, {"key": "iscrowd", "value": "0", "score": null}], "bounding_box": [[[218.0, 291.0], [315.0, 291.0], [315.0, 352.0], [218.0, 352.0], [218.0, 291.0]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "supercategory", "value": "furniture", "score": null}, {"key": "name", "value": "chair", "score": null}, {"key": "iscrowd", "value": "0", "score": null}], "bounding_box": [[[219.0, 412.0], [231.0, 412.0], [231.0, 421.0], [219.0, 421.0], [219.0, 412.0]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "supercategory", "value": "furniture", "score": null}, {"key": "name", 
"value": "chair", "score": null}, {"key": "iscrowd", "value": "0", "score": null}], "bounding_box": [[[219.0, 317.0], [230.0, 317.0], [230.0, 338.0], [219.0, 338.0], [219.0, 317.0]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "supercategory", "value": "furniture", "score": null}, {"key": "name", "value": "chair", "score": null}, {"key": "iscrowd", "value": "0", "score": null}], "bounding_box": [[[218.0, 359.0], [320.0, 359.0], [320.0, 414.0], [218.0, 414.0], [218.0, 359.0]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "supercategory", "value": "furniture", "score": null}, {"key": "name", "value": "potted plant", "score": null}, {"key": "iscrowd", "value": "0", "score": null}], "bounding_box": [[[149.0, 237.0], [210.0, 237.0], [210.0, 260.0], [149.0, 260.0], [149.0, 237.0]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "supercategory", "value": "furniture", "score": null}, {"key": "name", "value": "dining table", "score": null}, {"key": "iscrowd", "value": "0", "score": null}], "bounding_box": [[[231.0, 321.0], [319.0, 321.0], [319.0, 446.0], [231.0, 446.0], [231.0, 321.0]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "supercategory", "value": "electronic", "score": null}, {"key": "name", "value": "tv", "score": null}, {"key": "iscrowd", "value": "0", "score": null}], "bounding_box": [[[168.0, 7.0], [262.0, 7.0], [262.0, 155.0], [168.0, 155.0], [168.0, 7.0]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, 
"is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "supercategory", "value": "electronic", "score": null}, {"key": "name", "value": "tv", "score": null}, {"key": "iscrowd", "value": "0", "score": null}], "bounding_box": [[[209.0, 557.0], [287.0, 557.0], [287.0, 638.0], [209.0, 638.0], [209.0, 557.0]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "supercategory", "value": "appliance", "score": null}, {"key": "name", "value": "microwave", "score": null}, {"key": "iscrowd", "value": "0", "score": null}], "bounding_box": [[[206.0, 512.0], [221.0, 512.0], [221.0, 526.0], [206.0, 526.0], [206.0, 512.0]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "supercategory", "value": "appliance", "score": null}, {"key": "name", "value": "refrigerator", "score": null}, {"key": "iscrowd", "value": "0", "score": null}], "bounding_box": [[[174.0, 493.0], [281.0, 493.0], [281.0, 512.0], [174.0, 512.0], [174.0, 493.0]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "supercategory", "value": "indoor", "score": null}, {"key": "name", "value": "book", "score": null}, {"key": "iscrowd", "value": "0", "score": null}], "bounding_box": [[[308.0, 613.0], [353.0, 613.0], [353.0, 625.0], [308.0, 625.0], [308.0, 613.0]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "supercategory", "value": "indoor", "score": null}, {"key": "name", "value": "book", "score": null}, {"key": "iscrowd", "value": "0", "score": null}], "bounding_box": [[[306.0, 605.0], [350.0, 
605.0], [350.0, 618.0], [306.0, 618.0], [306.0, 605.0]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "supercategory", "value": "indoor", "score": null}, {"key": "name", "value": "clock", "score": null}, {"key": "iscrowd", "value": "0", "score": null}], "bounding_box": [[[121.0, 448.0], [142.0, 448.0], [142.0, 461.0], [121.0, 461.0], [121.0, 448.0]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "supercategory", "value": "indoor", "score": null}, {"key": "name", "value": "vase", "score": null}, {"key": "iscrowd", "value": "0", "score": null}], "bounding_box": [[[195.0, 241.0], [212.0, 241.0], [212.0, 254.0], [195.0, 254.0], [195.0, 241.0]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "supercategory", "value": "indoor", "score": null}, {"key": "name", "value": "vase", "score": null}, {"key": "iscrowd", "value": "0", "score": null}], "bounding_box": [[[309.0, 549.0], [398.0, 549.0], [398.0, 584.0], [309.0, 584.0], [309.0, 549.0]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "supercategory", "value": "indoor", "score": null}, {"key": "name", "value": "vase", "score": null}, {"key": "iscrowd", "value": "0", "score": null}], "bounding_box": [[[209.0, 351.0], [230.0, 351.0], [230.0, 361.0], [209.0, 361.0], [209.0, 351.0]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "supercategory", "value": "indoor", "score": null}, {"key": 
"name", "value": "vase", "score": null}, {"key": "iscrowd", "value": "0", "score": null}], "bounding_box": [[[200.0, 337.0], [215.0, 337.0], [215.0, 346.0], [200.0, 346.0], [200.0, 337.0]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}]}' + pd_json = '{"datum": {"uid": "139", "text": null, "metadata": {"license": 2, "file_name": "000000000139.jpg", "coco_url": "http://images.cocodataset.org/val2017/000000000139.jpg", "date_captured": "2013-11-21 01:34:01", "flickr_url": "http://farm9.staticflickr.com/8035/8024364858_9c41dc1666_z.jpg", "height": 426, "width": 640}}, "annotations": [{"metadata": {}, "labels": [{"key": "name", "value": "tv", "score": 0.9257726073265076}, {"key": "unused_class", "value": "tv", "score": 0.9257726073265076}], "bounding_box": [[[4, 166], [155, 166], [155, 263], [4, 263], [4, 166]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "name", "value": "chair", "score": 0.866135835647583}], "bounding_box": [[[293, 217], [354, 217], [354, 319], [293, 319], [293, 217]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "name", "value": "chair", "score": 0.7706670761108398}], "bounding_box": [[[361, 217], [418, 217], [418, 310], [361, 310], [361, 217]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "name", "value": "person", "score": 0.7308055758476257}], "bounding_box": [[[416, 157], [465, 157], [465, 295], [416, 295], [416, 157]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": 
{}, "labels": [{"key": "name", "value": "chair", "score": 0.6489511728286743}], "bounding_box": [[[405, 219], [444, 219], [444, 306], [405, 306], [405, 219]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "name", "value": "clock", "score": 0.6184478998184204}], "bounding_box": [[[448, 119], [461, 119], [461, 141], [448, 141], [448, 119]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "name", "value": "refrigerator", "score": 0.6119757294654846}], "bounding_box": [[[446, 167], [513, 167], [513, 289], [446, 289], [446, 167]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "name", "value": "potted plant", "score": 0.5597260594367981}], "bounding_box": [[[226, 178], [268, 178], [268, 212], [226, 212], [226, 178]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "name", "value": "vase", "score": 0.431998074054718}], "bounding_box": [[[550, 304], [585, 304], [585, 399], [550, 399], [550, 304]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "name", "value": "potted plant", "score": 0.3539217412471771}], "bounding_box": [[[334, 175], [370, 175], [370, 221], [334, 221], [334, 175]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "name", "value": "dining table", "score": 0.27812352776527405}], "bounding_box": [[[462, 
350], [639, 350], [639, 423], [462, 423], [462, 350]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}, {"metadata": {}, "labels": [{"key": "name", "value": "tv", "score": 0.25976383686065674}], "bounding_box": [[[558, 207], [639, 207], [639, 296], [558, 296], [558, 207]]], "polygon": null, "raster": null, "embedding": null, "text": null, "context_list": null, "is_instance": true, "implied_task_types": null}]}' + + gt = json.loads(gt_json) + pd = json.loads(pd_json) + + loader = DataLoader() + loader.add_data_from_valor_dict([(gt, pd)]) + + assert len(loader.pairs) == 1 + assert loader.pairs[0].shape == (281, 7) + + assert set(loader._evaluator.label_key_to_index.keys()) == { + "iscrowd", + "name", + "supercategory", + "unused_class", + } + assert len(loader._evaluator.index_to_label) == 17 + assert loader._evaluator.n_datums == 1 diff --git a/lite/tests/detection/test_detailed_pr_curve.py b/lite/tests/detection/test_detailed_pr_curve.py new file mode 100644 index 000000000..19e8cd73a --- /dev/null +++ b/lite/tests/detection/test_detailed_pr_curve.py @@ -0,0 +1,882 @@ +import numpy as np +from valor_lite.detection import ( + DataLoader, + Detection, + Evaluator, + compute_detailed_pr_curve, +) + + +def test_detailed_pr_curve_no_data(): + evaluator = Evaluator() + curves = evaluator.compute_detailed_pr_curve() + assert isinstance(curves, list) + assert len(curves) == 0 + + +def test_compute_detailed_pr_curve(): + sorted_pairs = np.array( + [ + # dt, gt, pd, iou, gl, pl, score, + [0.0, 0.0, 1.0, 0.98, 0.0, 0.0, 0.95], + [1.0, 1.0, 2.0, 0.55, 1.0, 0.0, 0.95], + [2.0, -1.0, 3.0, 0.67, -1.0, 0.0, 0.65], + [3.0, 4.0, 4.0, 1.0, 0.0, 0.0, 0.1], + [4.0, 5.0, -1.0, 0.5, 0.0, -1.0, -1.0], + ] + ) + label_counts = np.array([[3, 4], [1, 0]]) + iou_thresholds = np.array([0.5]) + score_thresholds = np.array([score / 100.0 for score in range(1, 101)]) + + results = compute_detailed_pr_curve( 
+ data=sorted_pairs, + label_counts=label_counts, + iou_thresholds=iou_thresholds, + score_thresholds=score_thresholds, + n_samples=0, + ) + + assert len(results) == 1 + assert results.shape == (1, 100, 2, 5) # iou, score, label, metrics + + """ + @ iou=0.5, score<=0.1 + 2x tp + 1x fp misclassification + 1x fp hallucination + 0x fn misclassification + 1x fn missing prediction + """ + assert np.isclose(results[0, :10, 0, :], np.array([2, 1, 1, 0, 1])).all() + + """ + @ iou=0.5, score=0.5 + 1x tp + 1x fp misclassification + 1x fp hallucination + 1x fn misclassification + 1x fn missing prediction + """ + assert np.isclose(results[0, 10:95, 0, :], np.array([1, 1, 1, 1, 1])).all() + + """ + @ iou=0.5, score>0.95 + 0x tp + 0x fp misclassification + 2x fp hallucination + 2x fn misclassification + 1x fn missing prediction + """ + assert np.isclose(results[0, 95:, 0, :], np.array([0, 0, 2, 2, 1])).all() + + # compute with examples + + """ + + output + + label_idx + tp + ... examples + fp_misclassification + ... examples + fp_hallucination + ... examples + fn_misclassification + ... examples + fn_missing_prediction + ...
examples + """ + + n_samples = 2 + + results = compute_detailed_pr_curve( + data=sorted_pairs, + label_counts=label_counts, + iou_thresholds=iou_thresholds, + score_thresholds=score_thresholds, + n_samples=n_samples, + ) + + assert len(results) == 1 + assert results.shape == (1, 100, 2, 15) # iou, score, label, metrics + + tp_idx = 0 + fp_misclf_idx = tp_idx + n_samples + 1 + fp_halluc_idx = fp_misclf_idx + n_samples + 1 + fn_misclf_idx = fp_halluc_idx + n_samples + 1 + fn_misprd_idx = fn_misclf_idx + n_samples + 1 + + metric_indices = np.zeros((15,), dtype=bool) + for index in [ + tp_idx, + fp_misclf_idx, + fp_halluc_idx, + fn_misclf_idx, + fn_misprd_idx, + ]: + metric_indices[index] = True + + """ + @ iou=0.5, score<=0.1 + 2x tp + 1x fp misclassification + 1x fp hallucination + 0x fn misclassification + 1x fn missing prediction + """ + assert np.isclose( + results[0, :10, 0, metric_indices], + np.array([2, 1, 1, 0, 1])[:, np.newaxis], + ).all() # metrics + assert np.isclose( + results[0, :10, 0, tp_idx + 1 : fp_misclf_idx], np.array([0.0, 3.0]) + ).all() # tp + assert np.isclose( + results[0, :10, 0, fp_misclf_idx + 1 : fp_halluc_idx], + np.array([1.0, -1.0]), + ).all() # fp misclf + assert np.isclose( + results[0, :10, 0, fp_halluc_idx + 1 : fn_misclf_idx], + np.array([2.0, -1.0]), + ).all() # fp halluc + assert np.isclose( + results[0, :10, 0, fn_misclf_idx + 1 : fn_misprd_idx], + np.array([-1.0, -1.0]), + ).all() # fn misclf + assert np.isclose( + results[0, :10, 0, fn_misprd_idx + 1 :], np.array([4.0, -1.0]) + ).all() # fn misprd + + """ + @ iou=0.5, score=0.5 + 1x tp + 1x fp misclassification + 1x fp hallucination + 1x fn misclassification + 1x fn missing prediction + """ + assert np.isclose( + results[0, 10:95, 0, metric_indices], + np.array([1, 1, 1, 1, 1])[:, np.newaxis], + ).all() + assert np.isclose( + results[0, 10:95, 0, tp_idx + 1 : fp_misclf_idx], np.array([0.0, -1.0]) + ).all() # tp + assert np.isclose( + results[0, 10:95, 0, fp_misclf_idx + 1 :
fp_halluc_idx], + np.array([1.0, -1.0]), + ).all() # fp misclf + assert np.isclose( + results[0, 10:95, 0, fp_halluc_idx + 1 : fn_misclf_idx], + np.array([2.0, -1.0]), + ).all() # fp halluc + assert np.isclose( + results[0, 10:95, 0, fn_misclf_idx + 1 : fn_misprd_idx], + np.array([3.0, -1.0]), + ).all() # fn misclf + assert np.isclose( + results[0, 10:95, 0, fn_misprd_idx + 1 :], np.array([4.0, -1.0]) + ).all() # fn misprd + + """ + @ iou=0.5, score>0.95 + 0x tp + 0x fp misclassification + 2x fp hallucination + 2x fn misclassification + 1x fn missing prediction + """ + assert np.isclose( + results[0, 95:, 0, metric_indices], + np.array([0, 0, 2, 2, 1])[:, np.newaxis], + ).all() + assert np.isclose( + results[0, 95:, 0, tp_idx + 1 : fp_misclf_idx], np.array([-1.0, -1.0]) + ).all() # tp + assert np.isclose( + results[0, 95:, 0, fp_misclf_idx + 1 : fp_halluc_idx], + np.array([-1.0, -1.0]), + ).all() # fp misclf + assert np.isclose( + results[0, 95:, 0, fp_halluc_idx + 1 : fn_misclf_idx], + np.array([1.0, 2.0]), + ).all() # fp halluc + assert np.isclose( + results[0, 95:, 0, fn_misclf_idx + 1 : fn_misprd_idx], + np.array([0.0, 3.0]), + ).all() # fn misclf + assert np.isclose( + results[0, 95:, 0, fn_misprd_idx + 1 :], np.array([4.0, -1.0]) + ).all() # fn misprd + + +def test_detailed_pr_curve_using_torch_metrics_example( + torchmetrics_detections: list[Detection], +): + """ + cf with torch metrics/pycocotools results listed here: + https://github.com/Lightning-AI/metrics/blob/107dbfd5fb158b7ae6d76281df44bd94c836bfce/tests/unittests/detection/test_map.py#L231 + """ + manager = DataLoader() + manager.add_data(torchmetrics_detections) + evaluator = manager.finalize() + + assert evaluator.ignored_prediction_labels == [("class", "3")] + assert evaluator.missing_prediction_labels == [] + assert evaluator.n_datums == 4 + assert evaluator.n_labels == 6 + assert evaluator.n_groundtruths == 20 + assert evaluator.n_predictions == 19 + + metrics =
evaluator.compute_detailed_pr_curve( + iou_thresholds=[0.5, 0.75], + score_thresholds=[0.25, 0.75], + n_samples=1, + ) + + # test DetailedPrecisionRecallCurve + actual_metrics = [m.to_dict() for m in metrics] + expected_metrics = [ + { + "value": [ + { + "score": 0.25, + "tp": 1.0, + "fp_misclassification": 0.0, + "fp_hallucination": 0.0, + "fn_misclassification": 1.0, + "fn_missing_prediction": 0.0, + "tp_examples": ["2"], + "fp_misclassification_examples": [], + "fp_hallucination_examples": [], + "fn_misclassification_examples": ["0"], + "fn_missing_prediction_examples": [], + }, + { + "score": 0.75, + "tp": 0.0, + "fp_misclassification": 0.0, + "fp_hallucination": 6.0, + "fn_misclassification": 2.0, + "fn_missing_prediction": 4.0, + "tp_examples": [], + "fp_misclassification_examples": [], + "fp_hallucination_examples": ["2"], + "fn_misclassification_examples": ["0"], + "fn_missing_prediction_examples": ["2"], + }, + ], + "iou": 0.5, + "label": {"key": "class", "value": "4"}, + "type": "DetailedPrecisionRecallCurve", + }, + { + "value": [ + { + "score": 0.25, + "tp": 1.0, + "fp_misclassification": 0.0, + "fp_hallucination": 0.0, + "fn_misclassification": 1.0, + "fn_missing_prediction": 0.0, + "tp_examples": ["2"], + "fp_misclassification_examples": [], + "fp_hallucination_examples": [], + "fn_misclassification_examples": ["0"], + "fn_missing_prediction_examples": [], + }, + { + "score": 0.75, + "tp": 0.0, + "fp_misclassification": 0.0, + "fp_hallucination": 6.0, + "fn_misclassification": 2.0, + "fn_missing_prediction": 4.0, + "tp_examples": [], + "fp_misclassification_examples": [], + "fp_hallucination_examples": ["2"], + "fn_misclassification_examples": ["0"], + "fn_missing_prediction_examples": ["2"], + }, + ], + "iou": 0.75, + "label": {"key": "class", "value": "4"}, + "type": "DetailedPrecisionRecallCurve", + }, + { + "value": [ + { + "score": 0.25, + "tp": 1.0, + "fp_misclassification": 0.0, + "fp_hallucination": 1.0, + "fn_misclassification": 0.0, + 
"fn_missing_prediction": 1.0, + "tp_examples": ["1"], + "fp_misclassification_examples": [], + "fp_hallucination_examples": ["1"], + "fn_misclassification_examples": [], + "fn_missing_prediction_examples": ["1"], + }, + { + "score": 0.75, + "tp": 0.0, + "fp_misclassification": 0.0, + "fp_hallucination": 1.0, + "fn_misclassification": 1.0, + "fn_missing_prediction": 3.0, + "tp_examples": [], + "fp_misclassification_examples": [], + "fp_hallucination_examples": ["1"], + "fn_misclassification_examples": ["1"], + "fn_missing_prediction_examples": ["1"], + }, + ], + "iou": 0.5, + "label": {"key": "class", "value": "2"}, + "type": "DetailedPrecisionRecallCurve", + }, + { + "value": [ + { + "score": 0.25, + "tp": 1.0, + "fp_misclassification": 0.0, + "fp_hallucination": 1.0, + "fn_misclassification": 0.0, + "fn_missing_prediction": 1.0, + "tp_examples": ["1"], + "fp_misclassification_examples": [], + "fp_hallucination_examples": ["1"], + "fn_misclassification_examples": [], + "fn_missing_prediction_examples": ["1"], + }, + { + "score": 0.75, + "tp": 0.0, + "fp_misclassification": 0.0, + "fp_hallucination": 1.0, + "fn_misclassification": 1.0, + "fn_missing_prediction": 3.0, + "tp_examples": [], + "fp_misclassification_examples": [], + "fp_hallucination_examples": ["1"], + "fn_misclassification_examples": ["1"], + "fn_missing_prediction_examples": ["1"], + }, + ], + "iou": 0.75, + "label": {"key": "class", "value": "2"}, + "type": "DetailedPrecisionRecallCurve", + }, + ] + for m in actual_metrics: + assert m in expected_metrics + for m in expected_metrics: + assert m in actual_metrics + + +# @pytest.fixture +# def test_detailed_precision_recall_curve( +# evaluate_detection_detailed_pr_curve_groundtruths: list, +# evaluate_detection_detailed_pr_curve_predictions: list, +# detailed_precision_recall_curve_outputs: tuple, +# ): + +# expected_outputs, _ = detailed_precision_recall_curve_outputs + +# Dataloader = Dataloader( +# 
metrics_to_return=[enums.MetricType.DetailedDetailedPrecisionRecallCurve], +# ) + +# Dataloader.add_data( +# groundtruths=evaluate_detection_detailed_pr_curve_groundtruths, +# predictions=evaluate_detection_detailed_pr_curve_predictions, +# ) + +# # check that ious have been precomputed +# assert "iou_" in Dataloader.joint_df.columns +# assert all( +# [ +# col not in ["raster", "bounding_box"] +# for col in Dataloader.joint_df.columns +# ] +# ) + +# eval_job = Dataloader.evaluate() +# for key, expected_value in expected_outputs.items(): +# result = eval_job.metrics[0]["value"] +# for k in key: +# result = result[k] +# assert result == expected_value + +# # repeat tests using a lower IOU threshold +# Dataloader = Dataloader( +# metrics_to_return=[enums.MetricType.DetailedDetailedPrecisionRecallCurve], +# pr_curve_iou_threshold=0.45, +# ) + +# Dataloader.add_data( +# groundtruths=evaluate_detection_detailed_pr_curve_groundtruths, +# predictions=evaluate_detection_detailed_pr_curve_predictions, +# ) + +# eval_job_low_iou_threshold = Dataloader.evaluate() + +# for key, expected_value in expected_outputs.items(): +# result = eval_job_low_iou_threshold.metrics[0]["value"] +# for k in key: +# result = result[k] +# assert result == expected_value + + +# def test_evaluate_detection_model_with_no_predictions( +# evaluate_detection_groundtruths: list, +# evaluate_detection_model_with_no_predictions_output: list, +# ): +# """ +# Test detection evaluations when the model outputs nothing. 
+ +# gt_dets1 +# datum 1 +# - Label (k1, v1) with Annotation area = 1500 +# - Label (k2, v2) with Annotation area = 57,510 +# datum2 +# - Label (k1, v1) with Annotation area = 1100 +# """ +# predictions = [] +# for gt in evaluate_detection_groundtruths: +# predictions.append( +# schemas.Prediction( +# datum=gt.datum, +# annotations=[], +# ) +# ) + +# Dataloader = Dataloader() + +# # can't pass empty lists, but can pass predictions without annotations +# with pytest.raises(ValueError) as e: +# Dataloader.add_data( +# groundtruths=evaluate_detection_groundtruths, +# predictions=[], +# ) +# assert ( +# "it's neither a dataframe nor a list of Valor Prediction objects" +# in str(e) +# ) + +# Dataloader.add_data( +# groundtruths=evaluate_detection_groundtruths, +# predictions=predictions, +# ) + +# # check that ious have been precomputed +# assert "iou_" in Dataloader.joint_df.columns +# assert all( +# [ +# col not in ["raster", "bounding_box"] +# for col in Dataloader.joint_df.columns +# ] +# ) + +# eval_job = Dataloader.evaluate() + +# computed_metrics = eval_job.metrics + +# assert all([metric["value"] == 0 for metric in computed_metrics]) + +# for m in evaluate_detection_model_with_no_predictions_output: +# assert m in computed_metrics + +# for m in computed_metrics: +# assert m in evaluate_detection_model_with_no_predictions_output + + +# def test_evaluate_detection_functional_test( +# evaluate_detection_functional_test_groundtruths: list, +# evaluate_detection_functional_test_predictions: list, +# evaluate_detection_functional_test_outputs: tuple, +# ): + +# ( +# expected_metrics, +# pr_expected_answers, +# detailed_pr_expected_answers, +# higher_iou_threshold_pr_expected_answers, +# higher_iou_threshold_detailed_pr_expected_answers, +# ) = evaluate_detection_functional_test_outputs + +# Dataloader = Dataloader() +# Dataloader.add_data( +# groundtruths=evaluate_detection_functional_test_groundtruths, +# predictions=evaluate_detection_functional_test_predictions, +# 
) +# Dataloader.finalize() + +# ap_metrics = translate_ap_metrics( +# Dataloader.compute_ap( +# iou_thresholds=[0.5, 0.75] +# ) +# ) + +# pr_curves = translate_pr_curves( +# Dataloader.compute_pr_curve( +# iou_thresholds=[0.5], +# n_samples=1, +# ) +# ) + +# metrics_to_return=[ +# enums.MetricType.AP, +# enums.MetricType.AR, +# enums.MetricType.mAP, +# enums.MetricType.APAveragedOverIOUs, +# enums.MetricType.mAR, +# enums.MetricType.mAPAveragedOverIOUs, +# enums.MetricType.DetailedPrecisionRecallCurve, +# enums.MetricType.DetailedDetailedPrecisionRecallCurve, +# ], +# pr_curve_iou_threshold=0.5, +# pr_curve_max_examples=1, +# ) + + +# metrics = [ +# m +# for m in eval_job.metrics +# if m["type"] +# not in ["DetailedPrecisionRecallCurve", "DetailedDetailedPrecisionRecallCurve"] +# ] + +# # round all metrics to the third decimal place +# for i, m in enumerate(metrics): +# metrics[i]["value"] = round(m["value"], 3) + +# pr_metrics = [ +# m for m in eval_job.metrics if m["type"] == "DetailedPrecisionRecallCurve" +# ] +# detailed_pr_metrics = [ +# m +# for m in eval_job.metrics +# if m["type"] == "DetailedDetailedPrecisionRecallCurve" +# ] + +# for m in metrics: +# assert m in expected_metrics +# for m in metrics: +# assert m in eval_job.metrics + +# for ( +# _, +# value, +# threshold, +# metric, +# ), expected_value in pr_expected_answers.items(): +# assert ( +# pr_metrics[0]["value"][value][threshold][metric] == expected_value +# ) + +# for ( +# value, +# threshold, +# metric, +# ), expected_output in detailed_pr_expected_answers.items(): +# model_output = detailed_pr_metrics[0]["value"][value][threshold][ +# metric +# ] +# assert isinstance(model_output, dict) +# assert model_output["total"] == expected_output["total"] +# assert all( +# [ +# model_output["observations"][key]["count"] # type: ignore - we know this element is a dict +# == expected_output[key] +# for key in [ +# key +# for key in expected_output.keys() +# if key not in ["total"] +# ] +# ] +# ) + +# # 
spot check number of examples +# assert ( +# len( +# detailed_pr_metrics[0]["value"]["0"][0.95]["fn"]["observations"]["no_predictions"][ # type: ignore - we know this element is a dict +# "examples" +# ] +# ) +# == 1 +# ) +# assert ( +# len( +# detailed_pr_metrics[0]["value"]["49"][0.05]["tp"]["observations"]["all"][ # type: ignore - we know this element is a dict +# "examples" +# ] +# ) +# == 1 +# ) + +# # raise the iou threshold +# Dataloader = Dataloader( +# metrics_to_return=[ +# enums.MetricType.DetailedPrecisionRecallCurve, +# enums.MetricType.DetailedDetailedPrecisionRecallCurve, +# ], +# pr_curve_iou_threshold=0.9, +# pr_curve_max_examples=1, +# ) + +# Dataloader.add_data( +# groundtruths=evaluate_detection_functional_test_groundtruths, +# predictions=evaluate_detection_functional_test_predictions, +# ) + +# # check that ious have been precomputed +# assert "iou_" in Dataloader.joint_df.columns +# assert all( +# [ +# col not in ["raster", "bounding_box"] +# for col in Dataloader.joint_df.columns +# ] +# ) + +# eval_job_higher_threshold = Dataloader.evaluate() + +# pr_metrics = [ +# m +# for m in eval_job_higher_threshold.metrics +# if m["type"] == "DetailedPrecisionRecallCurve" +# ] +# detailed_pr_metrics = [ +# m +# for m in eval_job_higher_threshold.metrics +# if m["type"] == "DetailedDetailedPrecisionRecallCurve" +# ] + +# for ( +# key, +# value, +# threshold, +# metric, +# ), expected_count in higher_iou_threshold_pr_expected_answers.items(): +# actual_count = pr_metrics[0]["value"][value][threshold][metric] +# assert actual_count == expected_count + +# for ( +# value, +# threshold, +# metric, +# ), expected_output in ( +# higher_iou_threshold_detailed_pr_expected_answers.items() +# ): +# model_output = detailed_pr_metrics[0]["value"][value][threshold][ +# metric +# ] +# assert isinstance(model_output, dict) +# assert model_output["total"] == expected_output["total"] +# assert all( +# [ +# model_output["observations"][key]["count"] # type: ignore - we 
know this element is a dict +# == expected_output[key] +# for key in [ +# key +# for key in expected_output.keys() +# if key not in ["total"] +# ] +# ] +# ) + +# assert ( +# len( +# detailed_pr_metrics[0]["value"]["0"][0.95]["fn"]["observations"]["no_predictions"][ # type: ignore - we know this element is a dict +# "examples" +# ] +# ) +# == 1 +# ) +# assert ( +# len( +# detailed_pr_metrics[0]["value"]["49"][0.05]["tp"]["observations"]["all"][ # type: ignore - we know this element is a dict +# "examples" +# ] +# ) +# == 1 +# ) + +# # repeat the above, but with a higher pr_max_curves_example +# Dataloader = Dataloader( +# metrics_to_return=[ +# enums.MetricType.DetailedPrecisionRecallCurve, +# enums.MetricType.DetailedDetailedPrecisionRecallCurve, +# ], +# pr_curve_iou_threshold=0.9, +# pr_curve_max_examples=3, +# ) + +# Dataloader.add_data( +# groundtruths=evaluate_detection_functional_test_groundtruths, +# predictions=evaluate_detection_functional_test_predictions, +# ) + +# # check that ious have been precomputed +# assert "iou_" in Dataloader.joint_df.columns +# assert all( +# [ +# col not in ["raster", "bounding_box"] +# for col in Dataloader.joint_df.columns +# ] +# ) + +# eval_job_higher_threshold = Dataloader.evaluate() + +# pr_metrics = [ +# m +# for m in eval_job_higher_threshold.metrics +# if m["type"] == "DetailedPrecisionRecallCurve" +# ] +# detailed_pr_metrics = [ +# m +# for m in eval_job_higher_threshold.metrics +# if m["type"] == "DetailedDetailedPrecisionRecallCurve" +# ] + +# for ( +# key, +# value, +# threshold, +# metric, +# ), expected_count in higher_iou_threshold_pr_expected_answers.items(): +# actual_count = pr_metrics[0]["value"][value][threshold][metric] +# assert actual_count == expected_count + +# for ( +# value, +# threshold, +# metric, +# ), expected_output in ( +# higher_iou_threshold_detailed_pr_expected_answers.items() +# ): +# model_output = detailed_pr_metrics[0]["value"][value][threshold][ +# metric +# ] +# assert 
isinstance(model_output, dict) +# assert model_output["total"] == expected_output["total"] +# assert all( +# [ +# model_output["observations"][key]["count"] # type: ignore - we know this element is a dict +# == expected_output[key] +# for key in [ +# key +# for key in expected_output.keys() +# if key not in ["total"] +# ] +# ] +# ) + +# assert ( +# len( +# detailed_pr_metrics[0]["value"]["0"][0.95]["fn"]["observations"]["no_predictions"][ # type: ignore - we know this element is a dict +# "examples" +# ] +# ) +# == 3 +# ) +# assert ( +# len( +# detailed_pr_metrics[0]["value"]["49"][0.05]["tp"]["observations"]["all"][ # type: ignore - we know this element is a dict +# "examples" +# ] +# ) +# == 2 +# ) + +# # test behavior if pr_curve_max_examples == 0 +# Dataloader = Dataloader( +# metrics_to_return=[ +# enums.MetricType.DetailedPrecisionRecallCurve, +# enums.MetricType.DetailedDetailedPrecisionRecallCurve, +# ], +# pr_curve_iou_threshold=0.9, +# pr_curve_max_examples=0, +# ) + +# Dataloader.add_data( +# groundtruths=evaluate_detection_functional_test_groundtruths, +# predictions=evaluate_detection_functional_test_predictions, +# ) + +# # check that ious have been precomputed +# assert "iou_" in Dataloader.joint_df.columns +# assert all( +# [ +# col not in ["raster", "bounding_box"] +# for col in Dataloader.joint_df.columns +# ] +# ) + +# eval_job_higher_threshold = Dataloader.evaluate() + +# pr_metrics = [ +# m +# for m in eval_job_higher_threshold.metrics +# if m["type"] == "DetailedPrecisionRecallCurve" +# ] +# detailed_pr_metrics = [ +# m +# for m in eval_job_higher_threshold.metrics +# if m["type"] == "DetailedDetailedPrecisionRecallCurve" +# ] + +# for ( +# key, +# value, +# threshold, +# metric, +# ), expected_count in higher_iou_threshold_pr_expected_answers.items(): +# actual_count = pr_metrics[0]["value"][value][threshold][metric] +# assert actual_count == expected_count + +# for ( +# value, +# threshold, +# metric, +# ), expected_output in ( +# 
higher_iou_threshold_detailed_pr_expected_answers.items() +# ): +# model_output = detailed_pr_metrics[0]["value"][value][threshold][ +# metric +# ] +# assert isinstance(model_output, dict) +# assert model_output["total"] == expected_output["total"] +# assert all( +# [ +# model_output["observations"][key]["count"] # type: ignore - we know this element is a dict +# == expected_output[key] +# for key in [ +# key +# for key in expected_output.keys() +# if key not in ["total"] +# ] +# ] +# ) + +# # spot check number of examples +# assert ( +# len( +# detailed_pr_metrics[0]["value"]["0"][0.95]["fn"]["observations"]["no_predictions"][ # type: ignore - we know this element is a dict +# "examples" +# ] +# ) +# == 0 +# ) +# assert ( +# len( +# detailed_pr_metrics[0]["value"]["49"][0.05]["tp"]["observations"]["all"][ # type: ignore - we know this element is a dict +# "examples" +# ] +# ) +# == 0 +# ) diff --git a/lite/tests/detection/test_evaluator.py b/lite/tests/detection/test_evaluator.py new file mode 100644 index 000000000..179e8c11e --- /dev/null +++ b/lite/tests/detection/test_evaluator.py @@ -0,0 +1,31 @@ +from valor_lite.detection import DataLoader, Detection + + +def test_metadata_using_torch_metrics_example( + torchmetrics_detections: list[Detection], +): + """ + cf with torch metrics/pycocotools results listed here: + https://github.com/Lightning-AI/metrics/blob/107dbfd5fb158b7ae6d76281df44bd94c836bfce/tests/unittests/detection/test_map.py#L231 + """ + manager = DataLoader() + manager.add_data(torchmetrics_detections) + evaluator = manager.finalize() + + assert evaluator.ignored_prediction_labels == [("class", "3")] + assert evaluator.missing_prediction_labels == [] + assert evaluator.n_datums == 4 + assert evaluator.n_labels == 6 + assert evaluator.n_groundtruths == 20 + assert evaluator.n_predictions == 19 + + assert evaluator.metadata == { + "ignored_prediction_labels": [ + ("class", "3"), + ], + "missing_prediction_labels": [], + "n_datums": 4, + "n_labels": 
6, + "n_groundtruths": 20, + "n_predictions": 19, + } diff --git a/lite/tests/detection/test_filtering.py b/lite/tests/detection/test_filtering.py new file mode 100644 index 000000000..7420b4877 --- /dev/null +++ b/lite/tests/detection/test_filtering.py @@ -0,0 +1,401 @@ +from dataclasses import replace + +import numpy as np +import pytest +from valor_lite.detection import BoundingBox, DataLoader, Detection, MetricType + + +@pytest.fixture +def one_detection(basic_detections: list[Detection]) -> list[Detection]: + return [basic_detections[0]] + + +@pytest.fixture +def two_detections(basic_detections: list[Detection]) -> list[Detection]: + return basic_detections + + +@pytest.fixture +def four_detections(basic_detections: list[Detection]) -> list[Detection]: + det1 = basic_detections[0] + det2 = basic_detections[1] + det3 = replace(basic_detections[0]) + det4 = replace(basic_detections[1]) + + det3.uid = "uid3" + det4.uid = "uid4" + + return [det1, det2, det3, det4] + + +def generate_random_detections( + n_detections: int, n_boxes: int, labels: str +) -> list[Detection]: + from random import choice, uniform + + def bbox(is_prediction): + xmin, ymin = uniform(0, 10), uniform(0, 10) + xmax, ymax = uniform(xmin, 15), uniform(ymin, 15) + kw = {"scores": [uniform(0, 1)]} if is_prediction else {} + return BoundingBox( + xmin, + xmax, + ymin, + ymax, + [("cl", choice(labels))], + **kw, + ) + + return [ + Detection( + uid=f"uid{i}", + groundtruths=[bbox(is_prediction=False) for _ in range(n_boxes)], + predictions=[bbox(is_prediction=True) for _ in range(n_boxes)], + ) + for i in range(n_detections) + ] + + +def test_filtering_one_detection(one_detection: list[Detection]): + """ + Basic object detection test. 
+ + groundtruths + datum uid1 + box 1 - label (k1, v1) - tp + box 3 - label (k2, v2) - fn missing prediction + + predictions + datum uid1 + box 1 - label (k1, v1) - score 0.3 - tp + """ + + manager = DataLoader() + manager.add_data(one_detection) + evaluator = manager.finalize() + + assert ( + evaluator._ranked_pairs + == np.array( + [ + [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.3], + ] + ) + ).all() + + assert ( + evaluator._label_metadata_per_datum + == np.array( + [ + [ + [1, 1], + ], + [ + [1, 0], + ], + ] + ) + ).all() + + assert ( + evaluator._label_metadata == np.array([[1, 1, 0], [1, 0, 1]]) + ).all() + + # test datum filtering + + filter_ = evaluator.create_filter(datum_uids=["uid1"]) + assert (filter_.indices == np.array([0])).all() + assert (filter_.label_metadata == np.array([[1, 1, 0], [1, 0, 1]])).all() + + # test label filtering + + filter_ = evaluator.create_filter(labels=[("k1", "v1")]) + assert (filter_.indices == np.array([])).all() + + filter_ = evaluator.create_filter(labels=[("k2", "v2")]) + assert (filter_.indices == np.array([])).all() + + # test label key filtering + + filter_ = evaluator.create_filter(label_keys=["k1"]) + assert (filter_.indices == np.array([0])).all() + + filter_ = evaluator.create_filter(label_keys=["k2"]) + assert (filter_.indices == np.array([])).all() + + # test combo + filter_ = evaluator.create_filter( + datum_uids=["uid1"], + label_keys=["k1"], + ) + assert (filter_.indices == np.array([0])).all() + + # test evaluation + filter_ = evaluator.create_filter(datum_uids=["uid1"]) + metrics = evaluator.evaluate( + iou_thresholds=[0.5], + filter_=filter_, + ) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.AP]] + expected_metrics = [ + { + "type": "AP", + "value": 1.0, + "parameters": {"iou": 0.5, "label": {"key": "k1", "value": "v1"}}, + }, + { + "type": "AP", + "value": 0.0, + "parameters": {"iou": 0.5, "label": {"key": "k2", "value": "v2"}}, + }, + ] + for m in actual_metrics: + assert m in expected_metrics + for 
m in expected_metrics: + assert m in actual_metrics + + +def test_filtering_two_detections(two_detections: list[Detection]): + """ + Basic object detection test. + + groundtruths + datum uid1 + box 1 - label (k1, v1) - tp + box 3 - label (k2, v2) - fn missing prediction + datum uid2 + box 2 - label (k1, v1) - fn misclassification + + predictions + datum uid1 + box 1 - label (k1, v1) - score 0.3 - tp + datum uid2 + box 2 - label (k2, v2) - score 0.98 - fp + """ + + manager = DataLoader() + manager.add_data(two_detections) + evaluator = manager.finalize() + + assert ( + evaluator._ranked_pairs + == np.array( + [ + [1.0, -1.0, 0.0, 0.0, -1.0, 1.0, 0.98], + [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.3], + ] + ) + ).all() + + assert ( + evaluator._label_metadata_per_datum + == np.array( + [ + [ + [1, 1], + [1, 0], + ], + [ + [1, 0], + [0, 1], + ], + ] + ) + ).all() + + assert ( + evaluator._label_metadata == np.array([[2, 1, 0], [1, 1, 1]]) + ).all() + + # test datum filtering + + filter_ = evaluator.create_filter(datum_uids=["uid1"]) + assert (filter_.indices == np.array([1])).all() + assert (filter_.label_metadata == np.array([[1, 1, 0], [1, 0, 1]])).all() + + filter_ = evaluator.create_filter(datum_uids=["uid2"]) + assert (filter_.indices == np.array([0])).all() + + # test label filtering + + filter_ = evaluator.create_filter(labels=[("k1", "v1")]) + assert (filter_.indices == np.array([1])).all() + + filter_ = evaluator.create_filter(labels=[("k2", "v2")]) + assert (filter_.indices == np.array([])).all() + + # test label key filtering + + filter_ = evaluator.create_filter(label_keys=["k1"]) + assert (filter_.indices == np.array([1])).all() + + filter_ = evaluator.create_filter(label_keys=["k2"]) + assert (filter_.indices == np.array([])).all() + + # test combo + filter_ = evaluator.create_filter( + datum_uids=["uid1"], + label_keys=["k1"], + ) + assert (filter_.indices == np.array([1])).all() + + # test evaluation + filter_ = evaluator.create_filter(datum_uids=["uid1"]) + 
metrics = evaluator.evaluate( + iou_thresholds=[0.5], + filter_=filter_, + ) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.AP]] + expected_metrics = [ + { + "type": "AP", + "value": 1.0, + "parameters": {"iou": 0.5, "label": {"key": "k1", "value": "v1"}}, + }, + { + "type": "AP", + "value": 0.0, + "parameters": {"iou": 0.5, "label": {"key": "k2", "value": "v2"}}, + }, + ] + for m in actual_metrics: + assert m in expected_metrics + for m in expected_metrics: + assert m in actual_metrics + + +def test_filtering_four_detections(four_detections: list[Detection]): + """ + Basic object detection test. + + groundtruths + datum uid1 + box 1 - label (k1, v1) - tp + box 3 - label (k2, v2) - fn missing prediction + datum uid2 + box 2 - label (k1, v1) - fn misclassification + datum uid3 + box 1 - label (k1, v1) - tp + box 3 - label (k2, v2) - fn missing prediction + datum uid4 + box 2 - label (k1, v1) - fn misclassification + + predictions + datum uid1 + box 1 - label (k1, v1) - score 0.3 - tp + datum uid2 + box 2 - label (k2, v2) - score 0.98 - fp + datum uid3 + box 1 - label (k1, v1) - score 0.3 - tp + datum uid4 + box 2 - label (k2, v2) - score 0.98 - fp + """ + + manager = DataLoader() + manager.add_data(four_detections) + evaluator = manager.finalize() + + assert ( + evaluator._ranked_pairs + == np.array( + [ + [1.0, -1.0, 0.0, 0.0, -1.0, 1.0, 0.98], + [3.0, -1.0, 0.0, 0.0, -1.0, 1.0, 0.98], + [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.3], + [2.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.3], + ] + ) + ).all() + + assert ( + evaluator._label_metadata_per_datum + == np.array( + [ + [ + [1, 1], + [1, 0], + [1, 1], + [1, 0], + ], + [ + [1, 0], + [0, 1], + [1, 0], + [0, 1], + ], + ], + dtype=np.int32, + ) + ).all() + + assert ( + evaluator._label_metadata == np.array([[4, 2, 0], [2, 2, 1]]) + ).all() + + # test datum filtering + + filter_ = evaluator.create_filter(datum_uids=["uid1"]) + assert (filter_.indices == np.array([2])).all() + assert (filter_.label_metadata == np.array([[1, 
1, 0], [1, 0, 1]])).all() + + filter_ = evaluator.create_filter(datum_uids=["uid2"]) + assert (filter_.indices == np.array([0])).all() + + # test label filtering + + filter_ = evaluator.create_filter(labels=[("k1", "v1")]) + assert (filter_.indices == np.array([2, 3])).all() + + filter_ = evaluator.create_filter(labels=[("k2", "v2")]) + assert (filter_.indices == np.array([])).all() + + # test label key filtering + + filter_ = evaluator.create_filter(label_keys=["k1"]) + assert (filter_.indices == np.array([2, 3])).all() + + filter_ = evaluator.create_filter(label_keys=["k2"]) + assert (filter_.indices == np.array([])).all() + + # test combo + filter_ = evaluator.create_filter( + datum_uids=["uid1"], + label_keys=["k1"], + ) + assert (filter_.indices == np.array([2])).all() + + # test evaluation + filter_ = evaluator.create_filter(datum_uids=["uid1"]) + + metrics = evaluator.evaluate( + iou_thresholds=[0.5], + filter_=filter_, + ) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.AP]] + expected_metrics = [ + { + "type": "AP", + "value": 1.0, + "parameters": {"iou": 0.5, "label": {"key": "k1", "value": "v1"}}, + }, + { + "type": "AP", + "value": 0.0, + "parameters": {"iou": 0.5, "label": {"key": "k2", "value": "v2"}}, + }, + ] + for m in actual_metrics: + assert m in expected_metrics + for m in expected_metrics: + assert m in actual_metrics + + +def test_filtering_random_detections(): + loader = DataLoader() + loader.add_data(generate_random_detections(13, 4, "abc")) + evaluator = loader.finalize() + f = evaluator.create_filter(datum_uids=["uid1"]) + evaluator.evaluate(filter_=f) diff --git a/lite/tests/detection/test_iou.py b/lite/tests/detection/test_iou.py new file mode 100644 index 000000000..c929f827b --- /dev/null +++ b/lite/tests/detection/test_iou.py @@ -0,0 +1,30 @@ +import numpy as np +from valor_lite.detection import compute_iou + + +def test_compute_iou(): + + # xmin, xmax, ymin, ymax + box1 = np.array([0.0, 10.0, 0.0, 10.0]) + box2 = 
np.array([5.0, 10.0, 0.0, 10.0]) + box3 = np.array([0.0, 5.0, 5.0, 10.0]) + box4 = np.array([5.0, 15.0, 0.0, 10.0]) + box5 = np.array([0.0, 15.0, 0.0, 10.0]) + + pairs = np.array( + [ + np.concatenate((box1, box1)), + np.concatenate((box1, box2)), + np.concatenate((box1, box3)), + np.concatenate((box1, box4)), + np.concatenate((box1, box5)), + ] + ) + + ious = compute_iou(pairs) + assert len(ious) == 5 + assert ious[0] == 1.0 + assert ious[1] == 0.5 + assert ious[2] == 0.25 + assert round(ious[3], 5) == 0.33333 + assert round(ious[4], 5) == 0.66667 diff --git a/lite/tests/detection/test_pr_curve.py b/lite/tests/detection/test_pr_curve.py new file mode 100644 index 000000000..3a99e6716 --- /dev/null +++ b/lite/tests/detection/test_pr_curve.py @@ -0,0 +1,177 @@ +import numpy as np +from valor_lite.detection import ( + DataLoader, + Detection, + MetricType, + compute_metrics, +) + + +def test_pr_curve_simple(): + + sorted_pairs = np.array( + [ + # dt, gt, pd, iou, gl, pl, score, + [0.0, 0.0, 2.0, 0.25, 0.0, 0.0, 0.95], + [0.0, 0.0, 3.0, 0.33333, 0.0, 0.0, 0.9], + [0.0, 0.0, 4.0, 0.66667, 0.0, 0.0, 0.65], + [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.1], + [0.0, 0.0, 1.0, 0.5, 0.0, 0.0, 0.01], + ] + ) + + label_counts = np.array([[1, 5, 0]]) + iou_thresholds = np.array([0.1, 0.6]) + score_thresholds = np.array([0.0]) + + (_, _, _, pr_curve) = compute_metrics( + sorted_pairs, + label_counts=label_counts, + iou_thresholds=iou_thresholds, + score_thresholds=score_thresholds, + ) + + assert pr_curve.shape == (2, 1, 101) + assert np.isclose(pr_curve[0][0], 1.0).all() + assert np.isclose(pr_curve[1][0], 1 / 3).all() + + +def test_pr_curve_using_torch_metrics_example( + torchmetrics_detections: list[Detection], +): + """ + cf with torch metrics/pycocotools results listed here: + https://github.com/Lightning-AI/metrics/blob/107dbfd5fb158b7ae6d76281df44bd94c836bfce/tests/unittests/detection/test_map.py#L231 + """ + manager = DataLoader() + manager.add_data(torchmetrics_detections) + 
evaluator = manager.finalize() + + assert evaluator.ignored_prediction_labels == [("class", "3")] + assert evaluator.missing_prediction_labels == [] + assert evaluator.n_datums == 4 + assert evaluator.n_labels == 6 + assert evaluator.n_groundtruths == 20 + assert evaluator.n_predictions == 19 + + metrics = evaluator.evaluate( + iou_thresholds=[0.5, 0.75], + ) + + # AP = 1.0 + a = [1.0 for _ in range(101)] + + # AP = 0.505 + b = [1.0 for _ in range(51)] + [0.0 for _ in range(50)] + + # AP = 0.791 + c = ( + [1.0 for _ in range(71)] + + [8 / 9 for _ in range(10)] + + [0.0 for _ in range(20)] + ) + + # AP = 0.722 + d = ( + [1.0 for _ in range(41)] + + [0.8 for _ in range(40)] + + [0.0 for _ in range(20)] + ) + + # AP = 0.576 + e = ( + [1.0 for _ in range(41)] + + [0.8571428571428571 for _ in range(20)] + + [0.0 for _ in range(40)] + ) + + # test PrecisionRecallCurve + actual_metrics = [ + m.to_dict() for m in metrics[MetricType.PrecisionRecallCurve] + ] + expected_metrics = [ + { + "type": "PrecisionRecallCurve", + "value": a, + "parameters": { + "iou": 0.5, + "label": {"key": "class", "value": "0"}, + }, + }, + { + "type": "PrecisionRecallCurve", + "value": d, + "parameters": { + "iou": 0.75, + "label": {"key": "class", "value": "0"}, + }, + }, + { + "type": "PrecisionRecallCurve", + "value": a, + "parameters": { + "iou": 0.5, + "label": {"key": "class", "value": "1"}, + }, + }, + { + "type": "PrecisionRecallCurve", + "value": a, + "parameters": { + "iou": 0.75, + "label": {"key": "class", "value": "1"}, + }, + }, + { + "type": "PrecisionRecallCurve", + "value": b, + "parameters": { + "iou": 0.5, + "label": {"key": "class", "value": "2"}, + }, + }, + { + "type": "PrecisionRecallCurve", + "value": b, + "parameters": { + "iou": 0.75, + "label": {"key": "class", "value": "2"}, + }, + }, + { + "type": "PrecisionRecallCurve", + "value": a, + "parameters": { + "iou": 0.5, + "label": {"key": "class", "value": "4"}, + }, + }, + { + "type": "PrecisionRecallCurve", + "value": 
a, + "parameters": { + "iou": 0.75, + "label": {"key": "class", "value": "4"}, + }, + }, + { + "type": "PrecisionRecallCurve", + "value": c, + "parameters": { + "iou": 0.5, + "label": {"key": "class", "value": "49"}, + }, + }, + { + "type": "PrecisionRecallCurve", + "value": e, + "parameters": { + "iou": 0.75, + "label": {"key": "class", "value": "49"}, + }, + }, + ] + for m in actual_metrics: + assert m in expected_metrics + for m in expected_metrics: + assert m in actual_metrics diff --git a/lite/tests/detection/test_precision.py b/lite/tests/detection/test_precision.py new file mode 100644 index 000000000..6982cc39a --- /dev/null +++ b/lite/tests/detection/test_precision.py @@ -0,0 +1,389 @@ +from valor_lite.detection import DataLoader, Detection, MetricType + + +def test_precision_metrics(basic_detections: list[Detection]): + """ + Basic object detection test. + + groundtruths + datum uid1 + box 1 - label (k1, v1) - tp + box 3 - label (k2, v2) - fn missing prediction + datum uid2 + box 2 - label (k1, v1) - fn misclassification + + predictions + datum uid1 + box 1 - label (k1, v1) - score 0.3 - tp + datum uid2 + box 2 - label (k2, v2) - score 0.98 - fp + """ + + manager = DataLoader() + manager.add_data(basic_detections) + evaluator = manager.finalize() + + metrics = evaluator.evaluate( + iou_thresholds=[0.1, 0.6], + score_thresholds=[0.0, 0.5], + ) + + assert evaluator.ignored_prediction_labels == [] + assert evaluator.missing_prediction_labels == [] + assert evaluator.n_datums == 2 + assert evaluator.n_labels == 2 + assert evaluator.n_groundtruths == 3 + assert evaluator.n_predictions == 2 + + # test Precision + actual_metrics = [m.to_dict() for m in metrics[MetricType.Precision]] + expected_metrics = [ + { + "type": "Precision", + "value": 0.0, + "parameters": { + "iou": 0.1, + "score": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + }, + { + "type": "Precision", + "value": 0.0, + "parameters": { + "iou": 0.6, + "score": 0.0, + "label": {"key": "k2", 
"value": "v2"}, + }, + }, + { + "type": "Precision", + "value": 1.0, + "parameters": { + "iou": 0.1, + "score": 0.0, + "label": {"key": "k1", "value": "v1"}, + }, + }, + { + "type": "Precision", + "value": 1.0, + "parameters": { + "iou": 0.6, + "score": 0.0, + "label": {"key": "k1", "value": "v1"}, + }, + }, + { + "type": "Precision", + "value": 0.0, + "parameters": { + "iou": 0.1, + "score": 0.5, + "label": {"key": "k2", "value": "v2"}, + }, + }, + { + "type": "Precision", + "value": 0.0, + "parameters": { + "iou": 0.6, + "score": 0.5, + "label": {"key": "k2", "value": "v2"}, + }, + }, + { + "type": "Precision", + "value": 0.0, + "parameters": { + "iou": 0.1, + "score": 0.5, + "label": {"key": "k1", "value": "v1"}, + }, + }, + { + "type": "Precision", + "value": 0.0, + "parameters": { + "iou": 0.6, + "score": 0.5, + "label": {"key": "k1", "value": "v1"}, + }, + }, + ] + for m in actual_metrics: + assert m in expected_metrics + for m in expected_metrics: + assert m in actual_metrics + + +def test_precision_false_negatives_single_datum_baseline( + false_negatives_single_datum_baseline_detections: list[Detection], +): + """This is the baseline for the below test. 
In this case there are two predictions and + one groundtruth, but the highest confident prediction overlaps sufficiently with the groundtruth + so there is not a penalty for the false negative so the AP is 1 + """ + + manager = DataLoader() + manager.add_data(false_negatives_single_datum_baseline_detections) + evaluator = manager.finalize() + + metrics = evaluator.evaluate( + iou_thresholds=[0.5], score_thresholds=[0.0, 0.9] + ) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.Precision]] + expected_metrics = [ + { + "type": "Precision", + "value": 0.5, + "parameters": { + "iou": 0.5, + "score": 0.0, + "label": { + "key": "key", + "value": "value", + }, + }, + }, + { + "type": "Precision", + "value": 0.0, + "parameters": { + "iou": 0.5, + "score": 0.9, + "label": { + "key": "key", + "value": "value", + }, + }, + }, + ] + for m in actual_metrics: + assert m in expected_metrics + for m in expected_metrics: + assert m in actual_metrics + + +def test_precision_false_negatives_single_datum( + false_negatives_single_datum_detections: list[Detection], +): + """Tests where high confidence false negative was not being penalized. 
The + difference between this test and the above is that here the prediction with higher confidence + does not sufficiently overlap the groundtruth and so is penalized and we get an AP of 0.5 + """ + + manager = DataLoader() + manager.add_data(false_negatives_single_datum_detections) + evaluator = manager.finalize() + metrics = evaluator.evaluate(iou_thresholds=[0.5], score_thresholds=[0.0]) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.Precision]] + expected_metrics = [ + { + "type": "Precision", + "value": 0.5, + "parameters": { + "iou": 0.5, + "score": 0.0, + "label": { + "key": "key", + "value": "value", + }, + }, + } + ] + for m in expected_metrics: + assert m in actual_metrics + for m in actual_metrics: + assert m in expected_metrics + + +def test_precision_false_negatives_two_datums_one_empty_low_confidence_of_fp( + false_negatives_two_datums_one_empty_low_confidence_of_fp_detections: list[ + Detection + ], +): + """In this test we have + 1. An image with a matching groundtruth and prediction (same class and high IOU) + 2. A second image with empty groundtruth annotation but a prediction with lower confidence + then the prediction on the first image. 
+ + In this case, the AP should be 1.0 since the false positive has lower confidence than the true positive + + """ + + manager = DataLoader() + manager.add_data( + false_negatives_two_datums_one_empty_low_confidence_of_fp_detections + ) + evaluator = manager.finalize() + metrics = evaluator.evaluate(iou_thresholds=[0.5], score_thresholds=[0.0]) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.Precision]] + expected_metrics = [ + { + "type": "Precision", + "value": 0.5, + "parameters": { + "iou": 0.5, + "score": 0.0, + "label": { + "key": "key", + "value": "value", + }, + }, + } + ] + for m in expected_metrics: + assert m in actual_metrics + for m in actual_metrics: + assert m in expected_metrics + + +def test_precision_false_negatives_two_datums_one_empty_high_confidence_of_fp( + false_negatives_two_datums_one_empty_high_confidence_of_fp_detections: list[ + Detection + ], +): + """In this test we have + 1. An image with a matching groundtruth and prediction (same class and high IOU) + 2. A second image with empty groundtruth annotation and a prediction with higher confidence + then the prediction on the first image. 
+ + In this case, the AP should be 0.5 since the false positive has higher confidence than the true positive + """ + + manager = DataLoader() + manager.add_data( + false_negatives_two_datums_one_empty_high_confidence_of_fp_detections + ) + evaluator = manager.finalize() + metrics = evaluator.evaluate(iou_thresholds=[0.5], score_thresholds=[0.0]) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.Precision]] + expected_metrics = [ + { + "type": "Precision", + "value": 0.5, + "parameters": { + "iou": 0.5, + "score": 0.0, + "label": { + "key": "key", + "value": "value", + }, + }, + } + ] + for m in expected_metrics: + assert m in actual_metrics + for m in actual_metrics: + assert m in expected_metrics + + +def test_precision_false_negatives_two_datums_one_only_with_different_class_low_confidence_of_fp( + false_negatives_two_datums_one_only_with_different_class_low_confidence_of_fp_detections: list[ + Detection + ], +): + """In this test we have + 1. An image with a matching groundtruth and prediction (same class, `"value"`, and high IOU) + 2. A second image with a groundtruth annotation with class `"other value"` and a prediction with lower confidence + then the prediction on the first image. + + In this case, the AP for class `"value"` should be 1 since the false positive has lower confidence than the true positive. 
+ AP for class `"other value"` should be 0 since there is no prediction for the `"other value"` groundtruth + """ + manager = DataLoader() + manager.add_data( + false_negatives_two_datums_one_only_with_different_class_low_confidence_of_fp_detections + ) + evaluator = manager.finalize() + metrics = evaluator.evaluate(iou_thresholds=[0.5], score_thresholds=[0.0]) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.Precision]] + expected_metrics = [ + { + "type": "Precision", + "value": 0.5, + "parameters": { + "iou": 0.5, + "score": 0.0, + "label": { + "key": "key", + "value": "value", + }, + }, + }, + { + "type": "Precision", + "value": 0.0, + "parameters": { + "iou": 0.5, + "score": 0.0, + "label": { + "key": "key", + "value": "other value", + }, + }, + }, + ] + for m in expected_metrics: + assert m in actual_metrics + for m in actual_metrics: + assert m in expected_metrics + + +def test_precision_false_negatives_two_datums_one_only_with_different_class_high_confidence_of_fp( + false_negatives_two_images_one_only_with_different_class_high_confidence_of_fp_detections: list[ + Detection + ], +): + """In this test we have + 1. An image with a matching groundtruth and prediction (same class, `"value"`, and high IOU) + 2. A second image with a groundtruth annotation with class `"other value"` and a prediction with higher confidence + then the prediction on the first image. + + In this case, the AP for class `"value"` should be 0.5 since the false positive has higher confidence than the true positive. 
+ AP for class `"other value"` should be 0 since there is no prediction for the `"other value"` groundtruth + """ + manager = DataLoader() + manager.add_data( + false_negatives_two_images_one_only_with_different_class_high_confidence_of_fp_detections + ) + evaluator = manager.finalize() + metrics = evaluator.evaluate(iou_thresholds=[0.5], score_thresholds=[0.0]) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.Precision]] + expected_metrics = [ + { + "type": "Precision", + "value": 0.5, + "parameters": { + "iou": 0.5, + "score": 0.0, + "label": { + "key": "key", + "value": "value", + }, + }, + }, + { + "type": "Precision", + "value": 0.0, + "parameters": { + "iou": 0.5, + "score": 0.0, + "label": { + "key": "key", + "value": "other value", + }, + }, + }, + ] + for m in expected_metrics: + assert m in actual_metrics + for m in actual_metrics: + assert m in expected_metrics diff --git a/lite/tests/detection/test_recall.py b/lite/tests/detection/test_recall.py new file mode 100644 index 000000000..659ac95e0 --- /dev/null +++ b/lite/tests/detection/test_recall.py @@ -0,0 +1,389 @@ +from valor_lite.detection import DataLoader, Detection, MetricType + + +def test_recall_metrics(basic_detections: list[Detection]): + """ + Basic object detection test. 
+ + groundtruths + datum uid1 + box 1 - label (k1, v1) - tp + box 3 - label (k2, v2) - fn missing prediction + datum uid2 + box 2 - label (k1, v1) - fn misclassification + + predictions + datum uid1 + box 1 - label (k1, v1) - score 0.3 - tp + datum uid2 + box 2 - label (k2, v2) - score 0.98 - fp + """ + + manager = DataLoader() + manager.add_data(basic_detections) + evaluator = manager.finalize() + + metrics = evaluator.evaluate( + iou_thresholds=[0.1, 0.6], + score_thresholds=[0.0, 0.5], + ) + + assert evaluator.ignored_prediction_labels == [] + assert evaluator.missing_prediction_labels == [] + assert evaluator.n_datums == 2 + assert evaluator.n_labels == 2 + assert evaluator.n_groundtruths == 3 + assert evaluator.n_predictions == 2 + + # test Recall + actual_metrics = [m.to_dict() for m in metrics[MetricType.Recall]] + expected_metrics = [ + { + "type": "Recall", + "value": 0.0, + "parameters": { + "iou": 0.1, + "score": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + }, + { + "type": "Recall", + "value": 0.0, + "parameters": { + "iou": 0.6, + "score": 0.0, + "label": {"key": "k2", "value": "v2"}, + }, + }, + { + "type": "Recall", + "value": 0.5, + "parameters": { + "iou": 0.1, + "score": 0.0, + "label": {"key": "k1", "value": "v1"}, + }, + }, + { + "type": "Recall", + "value": 0.5, + "parameters": { + "iou": 0.6, + "score": 0.0, + "label": {"key": "k1", "value": "v1"}, + }, + }, + { + "type": "Recall", + "value": 0.0, + "parameters": { + "iou": 0.1, + "score": 0.5, + "label": {"key": "k2", "value": "v2"}, + }, + }, + { + "type": "Recall", + "value": 0.0, + "parameters": { + "iou": 0.6, + "score": 0.5, + "label": {"key": "k2", "value": "v2"}, + }, + }, + { + "type": "Recall", + "value": 0.0, + "parameters": { + "iou": 0.1, + "score": 0.5, + "label": {"key": "k1", "value": "v1"}, + }, + }, + { + "type": "Recall", + "value": 0.0, + "parameters": { + "iou": 0.6, + "score": 0.5, + "label": {"key": "k1", "value": "v1"}, + }, + }, + ] + for m in actual_metrics: + 
assert m in expected_metrics + for m in expected_metrics: + assert m in actual_metrics + + +def test_recall_false_negatives_single_datum_baseline( + false_negatives_single_datum_baseline_detections: list[Detection], +): + """This is the baseline for the below test. In this case there are two predictions and + one groundtruth, but the highest confident prediction overlaps sufficiently with the groundtruth + so there is not a penalty for the false negative so the AP is 1 + """ + + manager = DataLoader() + manager.add_data(false_negatives_single_datum_baseline_detections) + evaluator = manager.finalize() + + metrics = evaluator.evaluate( + iou_thresholds=[0.5], score_thresholds=[0.0, 0.9] + ) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.Recall]] + expected_metrics = [ + { + "type": "Recall", + "value": 1.0, + "parameters": { + "iou": 0.5, + "score": 0.0, + "label": { + "key": "key", + "value": "value", + }, + }, + }, + { + "type": "Recall", + "value": 0.0, + "parameters": { + "iou": 0.5, + "score": 0.9, + "label": { + "key": "key", + "value": "value", + }, + }, + }, + ] + for m in actual_metrics: + assert m in expected_metrics + for m in expected_metrics: + assert m in actual_metrics + + +def test_recall_false_negatives_single_datum( + false_negatives_single_datum_detections: list[Detection], +): + """Tests where high confidence false negative was not being penalized. 
The + difference between this test and the above is that here the prediction with higher confidence + does not sufficiently overlap the groundtruth and so is penalized and we get an AP of 0.5 + """ + + manager = DataLoader() + manager.add_data(false_negatives_single_datum_detections) + evaluator = manager.finalize() + metrics = evaluator.evaluate(iou_thresholds=[0.5], score_thresholds=[0.0]) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.Recall]] + expected_metrics = [ + { + "type": "Recall", + "value": 1.0, + "parameters": { + "iou": 0.5, + "score": 0.0, + "label": { + "key": "key", + "value": "value", + }, + }, + } + ] + for m in expected_metrics: + assert m in actual_metrics + for m in actual_metrics: + assert m in expected_metrics + + +def test_recall_false_negatives_two_datums_one_empty_low_confidence_of_fp( + false_negatives_two_datums_one_empty_low_confidence_of_fp_detections: list[ + Detection + ], +): + """In this test we have + 1. An image with a matching groundtruth and prediction (same class and high IOU) + 2. A second image with empty groundtruth annotation but a prediction with lower confidence + then the prediction on the first image. 
+ + In this case, the AP should be 1.0 since the false positive has lower confidence than the true positive + + """ + + manager = DataLoader() + manager.add_data( + false_negatives_two_datums_one_empty_low_confidence_of_fp_detections + ) + evaluator = manager.finalize() + metrics = evaluator.evaluate(iou_thresholds=[0.5], score_thresholds=[0.0]) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.Recall]] + expected_metrics = [ + { + "type": "Recall", + "value": 1.0, + "parameters": { + "iou": 0.5, + "score": 0.0, + "label": { + "key": "key", + "value": "value", + }, + }, + } + ] + for m in expected_metrics: + assert m in actual_metrics + for m in actual_metrics: + assert m in expected_metrics + + +def test_recall_false_negatives_two_datums_one_empty_high_confidence_of_fp( + false_negatives_two_datums_one_empty_high_confidence_of_fp_detections: list[ + Detection + ], +): + """In this test we have + 1. An image with a matching groundtruth and prediction (same class and high IOU) + 2. A second image with empty groundtruth annotation and a prediction with higher confidence + then the prediction on the first image. 
+ + In this case, the AP should be 0.5 since the false positive has higher confidence than the true positive + """ + + manager = DataLoader() + manager.add_data( + false_negatives_two_datums_one_empty_high_confidence_of_fp_detections + ) + evaluator = manager.finalize() + metrics = evaluator.evaluate(iou_thresholds=[0.5], score_thresholds=[0.0]) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.Recall]] + expected_metrics = [ + { + "type": "Recall", + "value": 1.0, + "parameters": { + "iou": 0.5, + "score": 0.0, + "label": { + "key": "key", + "value": "value", + }, + }, + } + ] + for m in expected_metrics: + assert m in actual_metrics + for m in actual_metrics: + assert m in expected_metrics + + +def test_recall_false_negatives_two_datums_one_only_with_different_class_low_confidence_of_fp( + false_negatives_two_datums_one_only_with_different_class_low_confidence_of_fp_detections: list[ + Detection + ], +): + """In this test we have + 1. An image with a matching groundtruth and prediction (same class, `"value"`, and high IOU) + 2. A second image with a groundtruth annotation with class `"other value"` and a prediction with lower confidence + then the prediction on the first image. + + In this case, the AP for class `"value"` should be 1 since the false positive has lower confidence than the true positive. 
+ AP for class `"other value"` should be 0 since there is no prediction for the `"other value"` groundtruth + """ + manager = DataLoader() + manager.add_data( + false_negatives_two_datums_one_only_with_different_class_low_confidence_of_fp_detections + ) + evaluator = manager.finalize() + metrics = evaluator.evaluate(iou_thresholds=[0.5], score_thresholds=[0.0]) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.Recall]] + expected_metrics = [ + { + "type": "Recall", + "value": 1.0, + "parameters": { + "iou": 0.5, + "score": 0.0, + "label": { + "key": "key", + "value": "value", + }, + }, + }, + { + "type": "Recall", + "value": 0.0, + "parameters": { + "iou": 0.5, + "score": 0.0, + "label": { + "key": "key", + "value": "other value", + }, + }, + }, + ] + for m in expected_metrics: + assert m in actual_metrics + for m in actual_metrics: + assert m in expected_metrics + + +def test_recall_false_negatives_two_datums_one_only_with_different_class_high_confidence_of_fp( + false_negatives_two_images_one_only_with_different_class_high_confidence_of_fp_detections: list[ + Detection + ], +): + """In this test we have + 1. An image with a matching groundtruth and prediction (same class, `"value"`, and high IOU) + 2. A second image with a groundtruth annotation with class `"other value"` and a prediction with higher confidence + then the prediction on the first image. + + In this case, the AP for class `"value"` should be 0.5 since the false positive has higher confidence than the true positive. 
+ AP for class `"other value"` should be 0 since there is no prediction for the `"other value"` groundtruth + """ + manager = DataLoader() + manager.add_data( + false_negatives_two_images_one_only_with_different_class_high_confidence_of_fp_detections + ) + evaluator = manager.finalize() + metrics = evaluator.evaluate(iou_thresholds=[0.5], score_thresholds=[0.0]) + + actual_metrics = [m.to_dict() for m in metrics[MetricType.Recall]] + expected_metrics = [ + { + "type": "Recall", + "value": 1.0, + "parameters": { + "iou": 0.5, + "score": 0.0, + "label": { + "key": "key", + "value": "value", + }, + }, + }, + { + "type": "Recall", + "value": 0.0, + "parameters": { + "iou": 0.5, + "score": 0.0, + "label": { + "key": "key", + "value": "other value", + }, + }, + }, + ] + for m in expected_metrics: + assert m in actual_metrics + for m in actual_metrics: + assert m in expected_metrics diff --git a/lite/tests/detection/test_schemas.py b/lite/tests/detection/test_schemas.py new file mode 100644 index 000000000..8345fed36 --- /dev/null +++ b/lite/tests/detection/test_schemas.py @@ -0,0 +1,105 @@ +import numpy as np +import pytest +from valor_lite.detection import Bitmask, BoundingBox, Detection + + +def test_BoundingBox(): + # groundtruth + gt = BoundingBox(xmin=0, xmax=1, ymin=0, ymax=1, labels=[("k", "v")]) + + # prediction + pd = BoundingBox( + xmin=-1, + xmax=11, + ymin=0, + ymax=1, + labels=[("k", "v")], + scores=[0.7], + ) + + with pytest.raises(ValueError): + BoundingBox( + xmin=0, + xmax=1, + ymin=0, + ymax=1, + labels=[("k", "v")], + scores=[0.7, 0.1], + ) + with pytest.raises(ValueError): + BoundingBox( + xmin=0, + xmax=1, + ymin=0, + ymax=1, + labels=[("k", "v1"), ("k", "v2")], + scores=[0.7], + ) + + # test `extrema` property + assert gt.extrema == (0, 1, 0, 1) + assert pd.extrema == (-1, 11, 0, 1) + + +def test_Bitmask(): + + mask = mask = np.zeros((10, 10), dtype=np.bool_) + mask[:5, :5] = True + + # groundtruth + gt = Bitmask(mask=mask, labels=[("k", "v")]) + + 
# prediction + Bitmask( + mask=mask, + labels=[("k", "v")], + scores=[0.7], + ) + + # test score-label matching + with pytest.raises(ValueError): + Bitmask( + mask=np.zeros((10, 10), dtype=np.bool_), + labels=[("k", "v")], + scores=[0.7, 0.1], + ) + with pytest.raises(ValueError): + Bitmask( + mask=np.zeros((10, 10), dtype=np.bool_), + labels=[("k", "v1"), ("k", "v2")], + scores=[0.7], + ) + + # test `to_box` function + with pytest.raises(NotImplementedError): + gt.to_box() + + +def test_Detection(): + + # groundtruth + gt = BoundingBox(xmin=0, xmax=1, ymin=0, ymax=1, labels=[("k", "v")]) + + # prediction + pd = BoundingBox( + xmin=-1, + xmax=11, + ymin=0, + ymax=1, + labels=[("k", "v")], + scores=[0.7], + ) + + Detection( + uid="uid", + groundtruths=[gt], + predictions=[pd], + ) + + # test that predictions must contain scores + with pytest.raises(ValueError): + Detection( + uid="uid", + groundtruths=[gt], + predictions=[gt], + ) diff --git a/lite/tests/detection/test_stability.py b/lite/tests/detection/test_stability.py new file mode 100644 index 000000000..db239a194 --- /dev/null +++ b/lite/tests/detection/test_stability.py @@ -0,0 +1,87 @@ +from random import choice, uniform + +from valor_lite.detection import BoundingBox, DataLoader, Detection + + +def generate_random_detections( + n_detections: int, n_boxes: int, labels: str +) -> list[Detection]: + def bbox(is_prediction): + xmin, ymin = uniform(0, 10), uniform(0, 10) + xmax, ymax = uniform(xmin, 15), uniform(ymin, 15) + kw = ( + {"scores": [uniform(0, 1), uniform(0, 1)]} if is_prediction else {} + ) + return BoundingBox( + xmin, + xmax, + ymin, + ymax, + [("class", choice(labels)), ("category", choice(labels))], + **kw, + ) + + return [ + Detection( + uid=f"uid{i}", + groundtruths=[bbox(is_prediction=False) for _ in range(n_boxes)], + predictions=[bbox(is_prediction=True) for _ in range(n_boxes)], + ) + for i in range(n_detections) + ] + + +def test_fuzz_detections(): + + few_labels = "abc" + many_labels = 
"abcdefghijklmnopqrstuvwxyz123456789" + quantities = [1, 5, 10] + + for _ in range(100): + + labels = choice([few_labels, many_labels]) + n_detections = choice(quantities) + n_boxes = choice(quantities) + + detections = generate_random_detections(n_detections, n_boxes, labels) + + loader = DataLoader() + loader.add_data(detections) + evaluator = loader.finalize() + evaluator.evaluate( + iou_thresholds=[0.25, 0.75], + score_thresholds=[0.25, 0.75], + ) + + +def test_fuzz_detections_with_filtering(): + + few_labels = "abcd" + many_labels = "abcdefghijklmnopqrstuvwxyz123456789" + quantities = [4, 10] + + for _ in range(100): + + labels = choice([few_labels, many_labels]) + n_detections = choice(quantities) + n_boxes = choice(quantities) + + detections = generate_random_detections(n_detections, n_boxes, labels) + + loader = DataLoader() + loader.add_data(detections) + evaluator = loader.finalize() + + label_key = "class" + datum_subset = [f"uid{i}" for i in range(len(detections) // 2)] + + filter_ = evaluator.create_filter( + datum_uids=datum_subset, + label_keys=[label_key], + ) + + evaluator.evaluate( + iou_thresholds=[0.25, 0.75], + score_thresholds=[0.25, 0.75], + filter_=filter_, + ) diff --git a/lite/valor_lite/__init__.py b/lite/valor_lite/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/lite/valor_lite/detection/__init__.py b/lite/valor_lite/detection/__init__.py new file mode 100644 index 000000000..38c61ba99 --- /dev/null +++ b/lite/valor_lite/detection/__init__.py @@ -0,0 +1,56 @@ +from .annotation import Bitmask, BoundingBox, Detection +from .computation import ( + compute_detailed_pr_curve, + compute_iou, + compute_metrics, + compute_ranked_pairs, +) +from .manager import DataLoader, Evaluator +from .metric import ( + AP, + AR, + F1, + Accuracy, + APAveragedOverIOUs, + ARAveragedOverScores, + Counts, + DetailedPrecisionRecallCurve, + DetailedPrecisionRecallPoint, + MetricType, + Precision, + PrecisionRecallCurve, + Recall, + mAP, + 
mAPAveragedOverIOUs, + mAR, + mARAveragedOverScores, +) + +__all__ = [ + "Bitmask", + "BoundingBox", + "Detection", + "MetricType", + "Counts", + "Precision", + "Recall", + "Accuracy", + "F1", + "AP", + "mAP", + "AR", + "mAR", + "APAveragedOverIOUs", + "mAPAveragedOverIOUs", + "ARAveragedOverScores", + "mARAveragedOverScores", + "PrecisionRecallCurve", + "DetailedPrecisionRecallPoint", + "DetailedPrecisionRecallCurve", + "compute_iou", + "compute_ranked_pairs", + "compute_metrics", + "compute_detailed_pr_curve", + "DataLoader", + "Evaluator", +] diff --git a/lite/valor_lite/detection/annotation.py b/lite/valor_lite/detection/annotation.py new file mode 100644 index 000000000..ca6132438 --- /dev/null +++ b/lite/valor_lite/detection/annotation.py @@ -0,0 +1,54 @@ +from dataclasses import dataclass, field + +import numpy as np +from numpy.typing import NDArray + + +@dataclass +class BoundingBox: + xmin: float + xmax: float + ymin: float + ymax: float + labels: list[tuple[str, str]] + scores: list[float] = field(default_factory=list) + + def __post_init__(self): + if len(self.scores) > 0 and len(self.labels) != len(self.scores): + raise ValueError( + "If scores are defined, there must be a 1:1 pairing with labels." + ) + + @property + def extrema(self) -> tuple[float, float, float, float]: + return (self.xmin, self.xmax, self.ymin, self.ymax) + + +@dataclass +class Bitmask: + mask: NDArray[np.bool_] + labels: list[tuple[str, str]] + scores: list[float] = field(default_factory=list) + + def __post_init__(self): + if len(self.scores) > 0 and len(self.labels) != len(self.scores): + raise ValueError( + "If scores are defined, there must be a 1:1 pairing with labels." 
+ ) + + def to_box(self) -> BoundingBox: + raise NotImplementedError + + +@dataclass +class Detection: + uid: str + groundtruths: list[BoundingBox] + predictions: list[BoundingBox] + + def __post_init__(self): + for prediction in self.predictions: + if len(prediction.scores) != len(prediction.labels): + raise ValueError( + "Predictions must provide a score for every label." + ) diff --git a/lite/valor_lite/detection/computation.py b/lite/valor_lite/detection/computation.py new file mode 100644 index 000000000..e41dcf256 --- /dev/null +++ b/lite/valor_lite/detection/computation.py @@ -0,0 +1,506 @@ +import numpy as np +from numpy.typing import NDArray + +# datum id 0 +# gt 1 +# pd 2 +# iou 3 +# gt label 4 +# pd label 5 +# score 6 + + +def compute_iou(data: NDArray[np.floating]) -> NDArray[np.floating]: + + xmin1, xmax1, ymin1, ymax1 = ( + data[:, 0], + data[:, 1], + data[:, 2], + data[:, 3], + ) + xmin2, xmax2, ymin2, ymax2 = ( + data[:, 4], + data[:, 5], + data[:, 6], + data[:, 7], + ) + + xmin = np.maximum(xmin1, xmin2) + ymin = np.maximum(ymin1, ymin2) + xmax = np.minimum(xmax1, xmax2) + ymax = np.minimum(ymax1, ymax2) + + intersection_width = np.maximum(0, xmax - xmin) + intersection_height = np.maximum(0, ymax - ymin) + intersection_area = intersection_width * intersection_height + + area1 = (xmax1 - xmin1) * (ymax1 - ymin1) + area2 = (xmax2 - xmin2) * (ymax2 - ymin2) + + union_area = area1 + area2 - intersection_area + + iou = np.zeros(data.shape[0]) + valid_union_mask = union_area >= 1e-9 + iou[valid_union_mask] = ( + intersection_area[valid_union_mask] / union_area[valid_union_mask] + ) + return iou + + +def _compute_ranked_pairs_for_datum( + data: np.ndarray, + label_counts: np.ndarray, +) -> np.ndarray: + """ + Computes ranked pairs for a datum. 
+ """ + + # remove null predictions + data = data[data[:, 2] >= 0.0] + + # sort by gt_id, iou, score + indices = np.lexsort( + ( + data[:, 1], + -data[:, 3], + -data[:, 6], + ) + ) + data = data[indices] + + # remove ignored predictions + for label_idx, count in enumerate(label_counts[:, 0]): + if count > 0: + continue + data = data[data[:, 5] != label_idx] + + # only keep the highest ranked pair + _, indices = np.unique(data[:, [0, 2, 5]], axis=0, return_index=True) + + # np.unique orders its results by value, we need to sort the indices to maintain the results of the lexsort + data = data[indices, :] + + return data + + +def compute_ranked_pairs( + data: list[NDArray[np.floating]], + label_counts: NDArray[np.integer], +) -> NDArray[np.floating]: + pairs = np.concatenate( + [ + _compute_ranked_pairs_for_datum( + datum, + label_counts=label_counts, + ) + for datum in data + ], + axis=0, + ) + indices = np.lexsort( + ( + -pairs[:, 3], # iou + -pairs[:, 6], # score + ) + ) + return pairs[indices] + + +def compute_metrics( + data: np.ndarray, + label_counts: np.ndarray, + iou_thresholds: np.ndarray, + score_thresholds: np.ndarray, +) -> tuple[ + tuple[ + NDArray[np.floating], + NDArray[np.floating], + NDArray[np.floating], + NDArray[np.floating], + ], + tuple[ + NDArray[np.floating], + NDArray[np.floating], + NDArray[np.floating], + NDArray[np.floating], + ], + NDArray[np.floating], + NDArray[np.floating], +]: + """ + Computes Object Detection metrics. + + Returns + ------- + tuple[NDArray, NDArray, NDArray NDArray] + Average Precision results. + tuple[NDArray, NDArray, NDArray NDArray] + Average Recall results. + np.ndarray + Precision, Recall, TP, FP, FN, F1 Score, Accuracy. + np.ndarray + Interpolated Precision-Recall Curves. 
+ """ + + n_rows = data.shape[0] + n_labels = label_counts.shape[0] + n_ious = iou_thresholds.shape[0] + n_scores = score_thresholds.shape[0] + + average_precision = np.zeros((n_ious, n_labels)) + average_recall = np.zeros((n_scores, n_labels)) + precision_recall = np.zeros((n_ious, n_scores, n_labels, 7)) + + pd_labels = data[:, 5].astype(int) + unique_pd_labels = np.unique(pd_labels) + gt_count = label_counts[:, 0] + running_total_count = np.zeros( + (n_ious, n_rows), + dtype=np.float64, + ) + running_tp_count = np.zeros_like(running_total_count) + running_gt_count = np.zeros_like(running_total_count) + pr_curve = np.zeros((n_ious, n_labels, 101)) + + mask_score_nonzero = data[:, 6] > 1e-9 + mask_gt_exists = data[:, 1] >= 0.0 + mask_labels_match = np.isclose(data[:, 4], data[:, 5]) + + mask_gt_exists_labels_match = mask_gt_exists & mask_labels_match + + mask_tp = mask_score_nonzero & mask_gt_exists_labels_match + mask_fp = mask_score_nonzero + mask_fn = mask_gt_exists_labels_match + + for iou_idx in range(n_ious): + mask_iou = data[:, 3] >= iou_thresholds[iou_idx] + + mask_tp_outer = mask_tp & mask_iou + mask_fp_outer = mask_fp & ( + (~mask_gt_exists_labels_match & mask_iou) | ~mask_iou + ) + mask_fn_outer = mask_fn & mask_iou + + for score_idx in range(n_scores): + mask_score_thresh = data[:, 6] >= score_thresholds[score_idx] + + mask_tp_inner = mask_tp_outer & mask_score_thresh + mask_fp_inner = mask_fp_outer & mask_score_thresh + mask_fn_inner = mask_fn_outer & ~mask_score_thresh + + # create true-positive mask score threshold + tp_candidates = data[mask_tp_inner] + _, indices_gt_unique = np.unique( + tp_candidates[:, [0, 1, 4]], axis=0, return_index=True + ) + mask_gt_unique = np.zeros(tp_candidates.shape[0], dtype=bool) + mask_gt_unique[indices_gt_unique] = True + true_positives_mask = np.zeros(n_rows, dtype=bool) + true_positives_mask[mask_tp_inner] = mask_gt_unique + + # calculate intermediates + pd_count = np.bincount(pd_labels, 
minlength=n_labels).astype(float) + tp_count = np.bincount( + pd_labels, + weights=true_positives_mask, + minlength=n_labels, + ).astype(float) + + fp_count = np.bincount( + pd_labels[mask_fp_inner], + minlength=n_labels, + ).astype(float) + + fn_count = np.bincount( + pd_labels[mask_fn_inner], + minlength=n_labels, + ) + + # calculate component metrics + recall = np.zeros_like(tp_count) + precision = np.zeros_like(tp_count) + np.divide(tp_count, gt_count, where=gt_count > 1e-9, out=recall) + np.divide(tp_count, pd_count, where=pd_count > 1e-9, out=precision) + fn_count = gt_count - tp_count + + f1_score = np.zeros_like(precision) + np.divide( + np.multiply(precision, recall), + (precision + recall), + where=(precision + recall) > 1e-9, + out=f1_score, + ) + + accuracy = np.zeros_like(tp_count) + np.divide( + tp_count, + (gt_count + pd_count), + where=(gt_count + pd_count) > 1e-9, + out=accuracy, + ) + + precision_recall[iou_idx][score_idx] = np.concatenate( + ( + tp_count[:, np.newaxis], + fp_count[:, np.newaxis], + fn_count[:, np.newaxis], + precision[:, np.newaxis], + recall[:, np.newaxis], + f1_score[:, np.newaxis], + accuracy[:, np.newaxis], + ), + axis=1, + ) + + # calculate recall for AR + average_recall[score_idx] += recall + + # create true-positive mask score threshold + tp_candidates = data[mask_tp_outer] + _, indices_gt_unique = np.unique( + tp_candidates[:, [0, 1, 4]], axis=0, return_index=True + ) + mask_gt_unique = np.zeros(tp_candidates.shape[0], dtype=bool) + mask_gt_unique[indices_gt_unique] = True + true_positives_mask = np.zeros(n_rows, dtype=bool) + true_positives_mask[mask_tp_outer] = mask_gt_unique + + # count running tp and total for AP + for pd_label in unique_pd_labels: + mask_pd_label = pd_labels == pd_label + running_gt_count[iou_idx][mask_pd_label] = gt_count[pd_label] + running_total_count[iou_idx][mask_pd_label] = np.arange( + 1, mask_pd_label.sum() + 1 + ) + mask_tp_for_counting = mask_pd_label & true_positives_mask + 
running_tp_count[iou_idx][mask_tp_for_counting] = np.arange( + 1, mask_tp_for_counting.sum() + 1 + ) + + # calculate running precision-recall points for AP + precision = np.zeros_like(running_total_count) + np.divide( + running_tp_count, + running_total_count, + where=running_total_count > 1e-9, + out=precision, + ) + recall = np.zeros_like(running_total_count) + np.divide( + running_tp_count, + running_gt_count, + where=running_gt_count > 1e-9, + out=recall, + ) + recall_index = np.floor(recall * 100.0).astype(int) + for iou_idx in range(n_ious): + p = precision[iou_idx] + r = recall_index[iou_idx] + pr_curve[iou_idx, pd_labels, r] = np.maximum( + pr_curve[iou_idx, pd_labels, r], p + ) + + # calculate average precision + running_max = np.zeros((n_ious, n_labels)) + for recall in range(100, -1, -1): + precision = pr_curve[:, :, recall] + running_max = np.maximum(precision, running_max) + average_precision += running_max + pr_curve[:, :, recall] = running_max + average_precision = average_precision / 101.0 + + # calculate average recall + average_recall /= n_ious + + # calculate mAP and mAR + label_key_mapping = label_counts[unique_pd_labels, 2] + label_keys = np.unique(label_counts[:, 2]) + mAP = np.ones((n_ious, label_keys.shape[0])) * -1.0 + mAR = np.ones((n_scores, label_keys.shape[0])) * -1.0 + for key in np.unique(label_key_mapping): + labels = unique_pd_labels[label_key_mapping == key] + key_idx = int(key) + mAP[:, key_idx] = average_precision[:, labels].mean(axis=1) + mAR[:, key_idx] = average_recall[:, labels].mean(axis=1) + + # calculate AP and mAP averaged over iou thresholds + APAveragedOverIoUs = average_precision.mean(axis=0) + mAPAveragedOverIoUs = mAP.mean(axis=0) + + # calculate AR and mAR averaged over score thresholds + ARAveragedOverIoUs = average_recall.mean(axis=0) + mARAveragedOverIoUs = mAR.mean(axis=0) + + ap_results = ( + average_precision, + mAP, + APAveragedOverIoUs, + mAPAveragedOverIoUs, + ) + ar_results = ( + average_recall, + mAR, + 
def compute_detailed_pr_curve(
    data: np.ndarray,
    label_counts: np.ndarray,
    iou_thresholds: np.ndarray,
    score_thresholds: np.ndarray,
    n_samples: int,
) -> np.ndarray:
    """
    Counts detection outcomes per label for every (IoU, score) threshold pair.

    The columns of `data` are read as: 0 datum index, 1 groundtruth index,
    2 prediction index, 3 IoU, 4 groundtruth label, 5 prediction label and
    6 score. A negative groundtruth or prediction index marks a row where
    that annotation does not exist.

    The last axis of the result is made of five consecutive groups of
    `n_samples + 1` values, in this order:

        0  true positives              (labels match, IoU and score pass)
        1  fp - misclassification      (labels differ, IoU and score pass)
        2  fp - hallucination          (prediction rows not covered above)
        3  fn - misclassification      (labels match, IoU passes, score fails)
        4  fn - missing prediction     (groundtruth rows not covered above)

    The first value of a group is the count for the label; the remaining
    `n_samples` slots hold datum indices of example rows and stay at -1
    when fewer examples exist.

    Parameters
    ----------
    data : np.ndarray
        Two-dimensional array of pairs using the column layout above.
    label_counts : np.ndarray
        Per-label metadata; only its first dimension (the number of labels)
        is used here.
    iou_thresholds : np.ndarray
        IoU thresholds to evaluate.
    score_thresholds : np.ndarray
        Score thresholds to evaluate.
    n_samples : int
        Maximum number of example datum indices stored per group.

    Returns
    -------
    np.ndarray
        Array of shape (n_ious, n_scores, n_labels, 5 * (n_samples + 1)).
    """
    gt_label_col, pd_label_col = 4, 5
    group_size = n_samples + 1
    n_labels = label_counts.shape[0]

    # offset of each outcome group along the last axis
    (
        tp_idx,
        fp_misclf_idx,
        fp_halluc_idx,
        fn_misclf_idx,
        fn_misprd_idx,
    ) = (group * group_size for group in range(5))

    curve = np.full(
        (
            iou_thresholds.shape[0],
            score_thresholds.shape[0],
            n_labels,
            5 * group_size,
        ),
        -1.0,
    )

    has_gt = data[:, 1] > -0.5
    has_pd = data[:, 2] > -0.5
    same_label = np.isclose(data[:, gt_label_col], data[:, pd_label_col])

    paired = has_gt & has_pd
    paired_match = paired & same_label
    paired_mismatch = paired & ~same_label

    for iou_idx, iou_threshold in enumerate(iou_thresholds):
        passes_iou = data[:, 3] >= iou_threshold
        match_iou = paired_match & passes_iou
        mismatch_iou = paired_mismatch & passes_iou

        for score_idx, score_threshold in enumerate(score_thresholds):
            passes_score = data[:, 6] >= score_threshold

            # rows that are neither an IoU-passing match nor a
            # score-passing mismatch
            leftover = ~match_iou & ~(paired_mismatch & passes_score)

            # (group offset, row mask, column holding the label to count)
            outcomes = (
                (tp_idx, match_iou & passes_score, pd_label_col),
                (fp_misclf_idx, mismatch_iou & passes_score, pd_label_col),
                (fp_halluc_idx, leftover & has_pd, pd_label_col),
                (fn_misclf_idx, match_iou & ~passes_score, gt_label_col),
                (fn_misprd_idx, leftover & has_gt, gt_label_col),
            )

            for offset, mask, label_col in outcomes:
                rows = data[mask]
                row_labels = rows[:, label_col].astype(int)
                curve[iou_idx, score_idx, :, offset] = np.bincount(
                    row_labels, minlength=n_labels
                )

                if n_samples <= 0:
                    continue

                first_slot = offset + 1
                for label_idx in range(n_labels):
                    examples = rows[row_labels == label_idx][:n_samples, 0]
                    curve[
                        iou_idx,
                        score_idx,
                        label_idx,
                        first_slot : first_slot + examples.shape[0],
                    ] = examples

    return curve
dict() + self.index_to_label: dict[int, tuple[str, str]] = dict() + + # label key reference + self.index_to_label_key: dict[int, str] = dict() + self.label_key_to_index: dict[str, int] = dict() + self.label_index_to_label_key_index: dict[int, int] = dict() + + # computation caches + self._detailed_pairs = np.array([]) + self._ranked_pairs = np.array([]) + self._label_metadata = np.array([]) + self._label_metadata_per_datum = np.array([]) + + @property + def ignored_prediction_labels(self) -> list[tuple[str, str]]: + glabels = set(np.where(self._label_metadata[:, 0] > 0)[0]) + plabels = set(np.where(self._label_metadata[:, 1] > 0)[0]) + return [ + self.index_to_label[label_id] for label_id in (plabels - glabels) + ] + + @property + def missing_prediction_labels(self) -> list[tuple[str, str]]: + glabels = set(np.where(self._label_metadata[:, 0] > 0)[0]) + plabels = set(np.where(self._label_metadata[:, 1] > 0)[0]) + return [ + self.index_to_label[label_id] for label_id in (glabels - plabels) + ] + + @property + def metadata(self) -> dict: + return { + "n_datums": self.n_datums, + "n_groundtruths": self.n_groundtruths, + "n_predictions": self.n_predictions, + "n_labels": self.n_labels, + "ignored_prediction_labels": self.ignored_prediction_labels, + "missing_prediction_labels": self.missing_prediction_labels, + } + + def create_filter( + self, + datum_uids: list[str] | NDArray[np.int32] | None = None, + labels: list[tuple[str, str]] | NDArray[np.int32] | None = None, + label_keys: list[str] | NDArray[np.int32] | None = None, + ) -> Filter: + """ + Creates a boolean mask that can be passed to an evaluation. + + Parameters + ---------- + datum_uids : list[str] | NDArray[np.int32], optional + An optional list of string uids or a numpy array of uid indices. + labels : list[tuple[str, str]] | NDArray[np.int32], optional + An optional list of labels or a numpy array of label indices. 
+ label_keys : list[str] | NDArray[np.int32], optional + An optional list of label keys or a numpy array of label key indices. + + Returns + ------- + Filter + A filter object that can be passed to the `evaluate` method. + """ + n_rows = self._ranked_pairs.shape[0] + + n_datums = self._label_metadata_per_datum.shape[1] + n_labels = self._label_metadata_per_datum.shape[2] + + mask_pairs = np.ones((n_rows, 1), dtype=np.bool_) + mask_datums = np.ones(n_datums, dtype=np.bool_) + mask_labels = np.ones(n_labels, dtype=np.bool_) + + if datum_uids is not None: + if isinstance(datum_uids, list): + datum_uids = np.array( + [self.uid_to_index[uid] for uid in datum_uids], + dtype=np.int32, + ) + mask = np.zeros_like(mask_pairs, dtype=np.bool_) + mask[ + np.isin(self._ranked_pairs[:, 0].astype(int), datum_uids) + ] = True + mask_pairs &= mask + + mask = np.zeros_like(mask_datums, dtype=np.bool_) + mask[datum_uids] = True + mask_datums &= mask + + if labels is not None: + if isinstance(labels, list): + labels = np.array( + [self.label_to_index[label] for label in labels] + ) + mask = np.zeros_like(mask_pairs, dtype=np.bool_) + mask[np.isin(self._ranked_pairs[:, 4].astype(int), labels)] = True + mask_pairs &= mask + + mask = np.zeros_like(mask_labels, dtype=np.bool_) + mask[labels] = True + mask_labels &= mask + + if label_keys is not None: + if isinstance(label_keys, list): + label_keys = np.array( + [self.label_key_to_index[key] for key in label_keys] + ) + label_indices = np.where( + np.isclose(self._label_metadata[:, 2], label_keys) + )[0] + mask = np.zeros_like(mask_pairs, dtype=np.bool_) + mask[ + np.isin(self._ranked_pairs[:, 4].astype(int), label_indices) + ] = True + mask_pairs &= mask + + mask = np.zeros_like(mask_labels, dtype=np.bool_) + mask[label_indices] = True + mask_labels &= mask + + mask = mask_datums[:, np.newaxis] & mask_labels[np.newaxis, :] + label_metadata_per_datum = self._label_metadata_per_datum.copy() + label_metadata_per_datum[:, ~mask] = 0 + + 
label_metadata = np.zeros_like(self._label_metadata, dtype=np.int32) + label_metadata[:, :2] = np.transpose( + np.sum( + label_metadata_per_datum, + axis=1, + ) + ) + label_metadata[:, 2] = self._label_metadata[:, 2] + + return Filter( + indices=np.where(mask_pairs)[0], + label_metadata=label_metadata, + # uids=datum_uids, + # labels=labels, + # label_keys=label_keys, + ) + + def evaluate( + self, + iou_thresholds: list[float] = [0.5, 0.75, 0.9], + score_thresholds: list[float] = [0.5], + filter_: Filter | None = None, + ) -> dict[MetricType, list]: + """ + Runs evaluation over cached data. + + Parameters + ---------- + iou_thresholds : list[float] + A list of iou thresholds to compute over. + score_thresholds : list[float] + A list of score thresholds to compute over. + filter_mask : NDArray[bool], optional + A boolean mask that filters the cached data. + """ + + data = self._ranked_pairs + label_metadata = self._label_metadata + if filter_ is not None: + data = data[filter_.indices] + label_metadata = filter_.label_metadata + + ( + ( + average_precision, + mean_average_precision, + average_precision_average_over_ious, + mean_average_precision_average_over_ious, + ), + ( + average_recall, + mean_average_recall, + average_recall_averaged_over_scores, + mean_average_recall_averaged_over_scores, + ), + precision_recall, + pr_curves, + ) = compute_metrics( + data=data, + label_counts=label_metadata, + iou_thresholds=np.array(iou_thresholds), + score_thresholds=np.array(score_thresholds), + ) + + metrics = defaultdict(list) + + metrics[MetricType.AP] = [ + AP( + value=average_precision[iou_idx][label_idx], + iou=iou_thresholds[iou_idx], + label=self.index_to_label[label_idx], + ) + for iou_idx in range(average_precision.shape[0]) + for label_idx in range(average_precision.shape[1]) + if int(label_metadata[label_idx][0]) > 0 + ] + + metrics[MetricType.mAP] = [ + mAP( + value=mean_average_precision[iou_idx][label_key_idx], + iou=iou_thresholds[iou_idx], + 
label_key=self.index_to_label_key[label_key_idx], + ) + for iou_idx in range(mean_average_precision.shape[0]) + for label_key_idx in range(mean_average_precision.shape[1]) + ] + + metrics[MetricType.APAveragedOverIOUs] = [ + APAveragedOverIOUs( + value=average_precision_average_over_ious[label_idx], + ious=iou_thresholds, + label=self.index_to_label[label_idx], + ) + for label_idx in range(self.n_labels) + if int(label_metadata[label_idx][0]) > 0 + ] + + metrics[MetricType.mAPAveragedOverIOUs] = [ + mAPAveragedOverIOUs( + value=mean_average_precision_average_over_ious[label_key_idx], + ious=iou_thresholds, + label_key=self.index_to_label_key[label_key_idx], + ) + for label_key_idx in range( + mean_average_precision_average_over_ious.shape[0] + ) + ] + + metrics[MetricType.AR] = [ + AR( + value=average_recall[score_idx][label_idx], + ious=iou_thresholds, + score=score_thresholds[score_idx], + label=self.index_to_label[label_idx], + ) + for score_idx in range(average_recall.shape[0]) + for label_idx in range(average_recall.shape[1]) + if int(label_metadata[label_idx][0]) > 0 + ] + + metrics[MetricType.mAR] = [ + mAR( + value=mean_average_recall[score_idx][label_key_idx], + ious=iou_thresholds, + score=score_thresholds[score_idx], + label_key=self.index_to_label_key[label_key_idx], + ) + for score_idx in range(mean_average_recall.shape[0]) + for label_key_idx in range(mean_average_recall.shape[1]) + ] + + metrics[MetricType.ARAveragedOverScores] = [ + ARAveragedOverScores( + value=average_recall_averaged_over_scores[label_idx], + scores=score_thresholds, + ious=iou_thresholds, + label=self.index_to_label[label_idx], + ) + for label_idx in range(self.n_labels) + if int(label_metadata[label_idx][0]) > 0 + ] + + metrics[MetricType.mARAveragedOverScores] = [ + mARAveragedOverScores( + value=mean_average_recall_averaged_over_scores[label_key_idx], + scores=score_thresholds, + ious=iou_thresholds, + label_key=self.index_to_label_key[label_key_idx], + ) + for label_key_idx 
in range( + mean_average_recall_averaged_over_scores.shape[0] + ) + ] + + metrics[MetricType.PrecisionRecallCurve] = [ + PrecisionRecallCurve( + precision=list(pr_curves[iou_idx][label_idx]), + iou=iou_threshold, + label=label, + ) + for iou_idx, iou_threshold in enumerate(iou_thresholds) + for label_idx, label in self.index_to_label.items() + if int(label_metadata[label_idx][0]) > 0 + ] + + for iou_idx, iou_threshold in enumerate(iou_thresholds): + for score_idx, score_threshold in enumerate(score_thresholds): + for label_idx, label in self.index_to_label.items(): + row = precision_recall[iou_idx][score_idx][label_idx] + kwargs = { + "label": label, + "iou": iou_threshold, + "score": score_threshold, + } + metrics[MetricType.Counts].append( + Counts( + tp=int(row[0]), + fp=int(row[1]), + fn=int(row[2]), + **kwargs, + ) + ) + metrics[MetricType.Precision].append( + Precision( + value=row[3], + **kwargs, + ) + ) + metrics[MetricType.Recall].append( + Recall( + value=row[4], + **kwargs, + ) + ) + metrics[MetricType.F1].append( + F1( + value=row[5], + **kwargs, + ) + ) + metrics[MetricType.Accuracy].append( + Accuracy( + value=row[6], + **kwargs, + ) + ) + + return metrics + + def compute_detailed_pr_curve( + self, + iou_thresholds: list[float] = [0.5], + score_thresholds: list[float] = [ + score / 10.0 for score in range(1, 11) + ], + n_samples: int = 0, + ) -> list[DetailedPrecisionRecallCurve]: + + if self._detailed_pairs.size == 0: + return list() + + metrics = compute_detailed_pr_curve( + self._detailed_pairs, + label_counts=self._label_metadata, + iou_thresholds=np.array(iou_thresholds), + score_thresholds=np.array(score_thresholds), + n_samples=n_samples, + ) + + tp_idx = 0 + fp_misclf_idx = tp_idx + n_samples + 1 + fp_halluc_idx = fp_misclf_idx + n_samples + 1 + fn_misclf_idx = fp_halluc_idx + n_samples + 1 + fn_misprd_idx = fn_misclf_idx + n_samples + 1 + + results = list() + for label_idx in range(len(metrics)): + n_ious, n_scores, _, _ = metrics.shape + for 
iou_idx in range(n_ious): + curve = DetailedPrecisionRecallCurve( + iou=iou_thresholds[iou_idx], + value=list(), + label=self.index_to_label[label_idx], + ) + for score_idx in range(n_scores): + curve.value.append( + DetailedPrecisionRecallPoint( + score=score_thresholds[score_idx], + tp=metrics[iou_idx][score_idx][label_idx][tp_idx], + tp_examples=[ + self.index_to_uid[int(datum_idx)] + for datum_idx in metrics[iou_idx][score_idx][ + label_idx + ][tp_idx + 1 : fp_misclf_idx] + if int(datum_idx) >= 0 + ], + fp_misclassification=metrics[iou_idx][score_idx][ + label_idx + ][fp_misclf_idx], + fp_misclassification_examples=[ + self.index_to_uid[int(datum_idx)] + for datum_idx in metrics[iou_idx][score_idx][ + label_idx + ][fp_misclf_idx + 1 : fp_halluc_idx] + if int(datum_idx) >= 0 + ], + fp_hallucination=metrics[iou_idx][score_idx][ + label_idx + ][fp_halluc_idx], + fp_hallucination_examples=[ + self.index_to_uid[int(datum_idx)] + for datum_idx in metrics[iou_idx][score_idx][ + label_idx + ][fp_halluc_idx + 1 : fn_misclf_idx] + if int(datum_idx) >= 0 + ], + fn_misclassification=metrics[iou_idx][score_idx][ + label_idx + ][fn_misclf_idx], + fn_misclassification_examples=[ + self.index_to_uid[int(datum_idx)] + for datum_idx in metrics[iou_idx][score_idx][ + label_idx + ][fn_misclf_idx + 1 : fn_misprd_idx] + if int(datum_idx) >= 0 + ], + fn_missing_prediction=metrics[iou_idx][score_idx][ + label_idx + ][fn_misprd_idx], + fn_missing_prediction_examples=[ + self.index_to_uid[int(datum_idx)] + for datum_idx in metrics[iou_idx][score_idx][ + label_idx + ][fn_misprd_idx + 1 :] + if int(datum_idx) >= 0 + ], + ) + ) + results.append(curve) + return results + + +class DataLoader: + def __init__(self): + self._evaluator = Evaluator() + self.pairs = list() + self.groundtruth_count = defaultdict(lambda: defaultdict(int)) + self.prediction_count = defaultdict(lambda: defaultdict(int)) + + def _add_datum(self, uid: str) -> int: + if uid not in self._evaluator.uid_to_index: + index = 
len(self._evaluator.uid_to_index) + self._evaluator.uid_to_index[uid] = index + self._evaluator.index_to_uid[index] = uid + return self._evaluator.uid_to_index[uid] + + def _add_label(self, label: tuple[str, str]) -> tuple[int, int]: + label_id = len(self._evaluator.index_to_label) + label_key_id = len(self._evaluator.index_to_label_key) + if label not in self._evaluator.label_to_index: + self._evaluator.label_to_index[label] = label_id + self._evaluator.index_to_label[label_id] = label + + # update label key index + if label[0] not in self._evaluator.label_key_to_index: + self._evaluator.label_key_to_index[label[0]] = label_key_id + self._evaluator.index_to_label_key[label_key_id] = label[0] + label_key_id += 1 + + self._evaluator.label_index_to_label_key_index[ + label_id + ] = self._evaluator.label_key_to_index[label[0]] + label_id += 1 + + return ( + self._evaluator.label_to_index[label], + self._evaluator.label_key_to_index[label[0]], + ) + + def add_data( + self, + detections: list[Detection], + show_progress: bool = False, + ): + disable_tqdm = not show_progress + for detection in tqdm(detections, disable=disable_tqdm): + + # update metadata + self._evaluator.n_datums += 1 + self._evaluator.n_groundtruths += len(detection.groundtruths) + self._evaluator.n_predictions += len(detection.predictions) + + # update datum uid index + uid_index = self._add_datum(uid=detection.uid) + + # cache labels and annotations + keyed_groundtruths = defaultdict(list) + keyed_predictions = defaultdict(list) + for gidx, gann in enumerate(detection.groundtruths): + for glabel in gann.labels: + label_idx, label_key_idx = self._add_label(glabel) + self.groundtruth_count[label_idx][uid_index] += 1 + keyed_groundtruths[label_key_idx].append( + ( + gidx, + label_idx, + gann.extrema, + ) + ) + for pidx, pann in enumerate(detection.predictions): + for plabel, pscore in zip(pann.labels, pann.scores): + label_idx, label_key_idx = self._add_label(plabel) + 
self.prediction_count[label_idx][uid_index] += 1 + keyed_predictions[label_key_idx].append( + ( + pidx, + label_idx, + pscore, + pann.extrema, + ) + ) + + gt_keys = set(keyed_groundtruths.keys()) + pd_keys = set(keyed_predictions.keys()) + joint_keys = gt_keys.intersection(pd_keys) + gt_unique_keys = gt_keys - pd_keys + pd_unique_keys = pd_keys - gt_keys + + pairs = list() + for key in joint_keys: + boxes = np.array( + [ + np.array([*gextrema, *pextrema]) + for _, _, _, pextrema in keyed_predictions[key] + for _, _, gextrema in keyed_groundtruths[key] + ] + ) + ious = compute_iou(boxes) + pairs.extend( + [ + np.array( + [ + float(uid_index), + float(gidx), + float(pidx), + ious[ + pidx * len(keyed_groundtruths[key]) + gidx + ], + float(glabel), + float(plabel), + float(score), + ] + ) + for pidx, plabel, score, _ in keyed_predictions[key] + for gidx, glabel, _ in keyed_groundtruths[key] + ] + ) + for key in gt_unique_keys: + pairs.extend( + [ + np.array( + [ + float(uid_index), + float(gidx), + -1.0, + 0.0, + float(glabel), + -1.0, + -1.0, + ] + ) + for gidx, glabel, _ in keyed_groundtruths[key] + ] + ) + for key in pd_unique_keys: + pairs.extend( + [ + np.array( + [ + float(uid_index), + -1.0, + float(pidx), + 0.0, + -1.0, + float(plabel), + float(score), + ] + ) + for pidx, plabel, score, _ in keyed_predictions[key] + ] + ) + + self.pairs.append(np.array(pairs)) + + def add_data_from_valor_dict( + self, + detections: list[tuple[dict, dict]], + show_progress: bool = False, + ): + def _get_bbox_extrema( + data: list[list[list[float]]], + ) -> tuple[float, float, float, float]: + x = [point[0] for shape in data for point in shape] + y = [point[1] for shape in data for point in shape] + return (min(x), max(x), min(y), max(y)) + + disable_tqdm = not show_progress + for groundtruth, prediction in tqdm(detections, disable=disable_tqdm): + + # update metadata + self._evaluator.n_datums += 1 + self._evaluator.n_groundtruths += len(groundtruth["annotations"]) + 
self._evaluator.n_predictions += len(prediction["annotations"]) + + # update datum uid index + uid_index = self._add_datum(uid=groundtruth["datum"]["uid"]) + + # cache labels and annotations + keyed_groundtruths = defaultdict(list) + keyed_predictions = defaultdict(list) + for gidx, gann in enumerate(groundtruth["annotations"]): + for valor_label in gann["labels"]: + glabel = (valor_label["key"], valor_label["value"]) + label_idx, label_key_idx = self._add_label(glabel) + self.groundtruth_count[label_idx][uid_index] += 1 + keyed_groundtruths[label_key_idx].append( + ( + gidx, + label_idx, + _get_bbox_extrema(gann["bounding_box"]), + ) + ) + for pidx, pann in enumerate(prediction["annotations"]): + for valor_label in pann["labels"]: + plabel = (valor_label["key"], valor_label["value"]) + pscore = valor_label["score"] + label_idx, label_key_idx = self._add_label(plabel) + self.prediction_count[label_idx][uid_index] += 1 + keyed_predictions[label_key_idx].append( + ( + pidx, + label_idx, + pscore, + _get_bbox_extrema(pann["bounding_box"]), + ) + ) + + gt_keys = set(keyed_groundtruths.keys()) + pd_keys = set(keyed_predictions.keys()) + joint_keys = gt_keys.intersection(pd_keys) + gt_unique_keys = gt_keys - pd_keys + pd_unique_keys = pd_keys - gt_keys + + pairs = list() + for key in joint_keys: + boxes = np.array( + [ + np.array([*gextrema, *pextrema]) + for _, _, _, pextrema in keyed_predictions[key] + for _, _, gextrema in keyed_groundtruths[key] + ] + ) + ious = compute_iou(boxes) + pairs.extend( + [ + np.array( + [ + float(uid_index), + float(gidx), + float(pidx), + ious[ + pidx * len(keyed_groundtruths[key]) + gidx + ], + float(glabel), + float(plabel), + float(score), + ] + ) + for pidx, plabel, score, _ in keyed_predictions[key] + for gidx, glabel, _ in keyed_groundtruths[key] + ] + ) + for key in gt_unique_keys: + pairs.extend( + [ + np.array( + [ + float(uid_index), + float(gidx), + -1.0, + 0.0, + float(glabel), + -1.0, + -1.0, + ] + ) + for gidx, glabel, _ in 
keyed_groundtruths[key] + ] + ) + for key in pd_unique_keys: + pairs.extend( + [ + np.array( + [ + float(uid_index), + -1.0, + float(pidx), + 0.0, + -1.0, + float(plabel), + float(score), + ] + ) + for pidx, plabel, score, _ in keyed_predictions[key] + ] + ) + + self.pairs.append(np.array(pairs)) + + def finalize(self) -> Evaluator: + + self.pairs = [pair for pair in self.pairs if pair.size > 0] + if len(self.pairs) == 0: + raise ValueError("No data available to create evaluator.") + + n_datums = self._evaluator.n_datums + n_labels = len(self._evaluator.index_to_label) + + self._evaluator.n_labels = n_labels + + self._evaluator._label_metadata_per_datum = np.zeros( + (2, n_datums, n_labels), dtype=np.int32 + ) + for datum_idx in range(n_datums): + for label_idx in range(n_labels): + gt_count = ( + self.groundtruth_count[label_idx].get(datum_idx, 0) + if label_idx in self.groundtruth_count + else 0 + ) + pd_count = ( + self.prediction_count[label_idx].get(datum_idx, 0) + if label_idx in self.prediction_count + else 0 + ) + self._evaluator._label_metadata_per_datum[ + :, datum_idx, label_idx + ] = np.array([gt_count, pd_count]) + + self._evaluator._label_metadata = np.array( + [ + [ + float( + np.sum( + self._evaluator._label_metadata_per_datum[ + 0, :, label_idx + ] + ) + ), + float( + np.sum( + self._evaluator._label_metadata_per_datum[ + 1, :, label_idx + ] + ) + ), + float( + self._evaluator.label_index_to_label_key_index[ + label_idx + ] + ), + ] + for label_idx in range(n_labels) + ] + ) + + self._evaluator._detailed_pairs = np.concatenate( + self.pairs, + axis=0, + ) + + self._evaluator._ranked_pairs = compute_ranked_pairs( + self.pairs, + label_counts=self._evaluator._label_metadata, + ) + + return self._evaluator diff --git a/lite/valor_lite/detection/metric.py b/lite/valor_lite/detection/metric.py new file mode 100644 index 000000000..90c2baf81 --- /dev/null +++ b/lite/valor_lite/detection/metric.py @@ -0,0 +1,357 @@ +from dataclasses import dataclass +from 
enum import Enum + +from valor_lite.schemas import Metric + + +class MetricType(str, Enum): + Counts = "Counts" + Accuracy = "Accuracy" + Precision = "Precision" + Recall = "Recall" + F1 = "F1" + AP = "AP" + AR = "AR" + mAP = "mAP" + mAR = "mAR" + APAveragedOverIOUs = "APAveragedOverIOUs" + mAPAveragedOverIOUs = "mAPAveragedOverIOUs" + ARAveragedOverScores = "ARAveragedOverScores" + mARAveragedOverScores = "mARAveragedOverScores" + PrecisionRecallCurve = "PrecisionRecallCurve" + DetailedPrecisionRecallCurve = "DetailedPrecisionRecallCurve" + + +@dataclass +class Counts: + tp: int + fp: int + fn: int + label: tuple[str, str] + iou: float + score: float + + @property + def metric(self) -> Metric: + return Metric( + type=type(self).__name__, + value={ + "tp": self.tp, + "fp": self.fp, + "fn": self.fn, + }, + parameters={ + "iou": self.iou, + "score": self.score, + "label": { + "key": self.label[0], + "value": self.label[1], + }, + }, + ) + + def to_dict(self) -> dict: + return self.metric.to_dict() + + +@dataclass +class ClassMetric: + value: float + label: tuple[str, str] + iou: float + score: float + + @property + def metric(self) -> Metric: + return Metric( + type=type(self).__name__, + value=self.value, + parameters={ + "iou": self.iou, + "score": self.score, + "label": { + "key": self.label[0], + "value": self.label[1], + }, + }, + ) + + def to_dict(self) -> dict: + return self.metric.to_dict() + + +class Precision(ClassMetric): + pass + + +class Recall(ClassMetric): + pass + + +class Accuracy(ClassMetric): + pass + + +class F1(ClassMetric): + pass + + +@dataclass +class AP: + value: float + iou: float + label: tuple[str, str] + + @property + def metric(self) -> Metric: + return Metric( + type=type(self).__name__, + value=self.value, + parameters={ + "iou": self.iou, + "label": { + "key": self.label[0], + "value": self.label[1], + }, + }, + ) + + def to_dict(self) -> dict: + return self.metric.to_dict() + + +@dataclass +class mAP: + value: float + iou: float + 
label_key: str + + @property + def metric(self) -> Metric: + return Metric( + type=type(self).__name__, + value=self.value, + parameters={ + "iou": self.iou, + "label_key": self.label_key, + }, + ) + + def to_dict(self) -> dict: + return self.metric.to_dict() + + +@dataclass +class APAveragedOverIOUs: + value: float + ious: list[float] + label: tuple[str, str] + + @property + def metric(self) -> Metric: + return Metric( + type=type(self).__name__, + value=self.value, + parameters={ + "ious": self.ious, + "label": { + "key": self.label[0], + "value": self.label[1], + }, + }, + ) + + def to_dict(self) -> dict: + return self.metric.to_dict() + + +@dataclass +class mAPAveragedOverIOUs: + value: float + ious: list[float] + label_key: str + + @property + def metric(self) -> Metric: + return Metric( + type=type(self).__name__, + value=self.value, + parameters={ + "ious": self.ious, + "label_key": self.label_key, + }, + ) + + def to_dict(self) -> dict: + return self.metric.to_dict() + + +@dataclass +class AR: + value: float + score: float + ious: list[float] + label: tuple[str, str] + + @property + def metric(self) -> Metric: + return Metric( + type=type(self).__name__, + value=self.value, + parameters={ + "score": self.score, + "ious": self.ious, + "label": { + "key": self.label[0], + "value": self.label[1], + }, + }, + ) + + def to_dict(self) -> dict: + return self.metric.to_dict() + + +@dataclass +class mAR: + value: float + score: float + ious: list[float] + label_key: str + + @property + def metric(self) -> Metric: + return Metric( + type=type(self).__name__, + value=self.value, + parameters={ + "score": self.score, + "ious": self.ious, + "label_key": self.label_key, + }, + ) + + def to_dict(self) -> dict: + return self.metric.to_dict() + + +@dataclass +class ARAveragedOverScores: + value: float + scores: list[float] + ious: list[float] + label: tuple[str, str] + + @property + def metric(self) -> Metric: + return Metric( + type=type(self).__name__, + value=self.value, 
+ parameters={ + "scores": self.scores, + "ious": self.ious, + "label": { + "key": self.label[0], + "value": self.label[1], + }, + }, + ) + + def to_dict(self) -> dict: + return self.metric.to_dict() + + +@dataclass +class mARAveragedOverScores: + value: float + scores: list[float] + ious: list[float] + label_key: str + + @property + def metric(self) -> Metric: + return Metric( + type=type(self).__name__, + value=self.value, + parameters={ + "scores": self.scores, + "ious": self.ious, + "label_key": self.label_key, + }, + ) + + def to_dict(self) -> dict: + return self.metric.to_dict() + + +@dataclass +class PrecisionRecallCurve: + """ + Interpolated over recalls 0.0, 0.01, ..., 1.0. + """ + + precision: list[float] + iou: float + label: tuple[str, str] + + @property + def metric(self) -> Metric: + return Metric( + type=type(self).__name__, + value=self.precision, + parameters={ + "iou": self.iou, + "label": {"key": self.label[0], "value": self.label[1]}, + }, + ) + + def to_dict(self) -> dict: + return self.metric.to_dict() + + +@dataclass +class DetailedPrecisionRecallPoint: + score: float + tp: int + fp_misclassification: int + fp_hallucination: int + fn_misclassification: int + fn_missing_prediction: int + tp_examples: list[str] + fp_misclassification_examples: list[str] + fp_hallucination_examples: list[str] + fn_misclassification_examples: list[str] + fn_missing_prediction_examples: list[str] + + def to_dict(self) -> dict: + return { + "score": self.score, + "tp": self.tp, + "fp_misclassification": self.fp_misclassification, + "fp_hallucination": self.fp_hallucination, + "fn_misclassification": self.fn_misclassification, + "fn_missing_prediction": self.fn_missing_prediction, + "tp_examples": self.tp_examples, + "fp_misclassification_examples": self.fp_misclassification_examples, + "fp_hallucination_examples": self.fp_hallucination_examples, + "fn_misclassification_examples": self.fn_misclassification_examples, + "fn_missing_prediction_examples": 
self.fn_missing_prediction_examples, + } + + +@dataclass +class DetailedPrecisionRecallCurve: + iou: float + value: list[DetailedPrecisionRecallPoint] + label: tuple[str, str] + + def to_dict(self) -> dict: + return { + "value": [pt.to_dict() for pt in self.value], + "iou": self.iou, + "label": { + "key": self.label[0], + "value": self.label[1], + }, + "type": "DetailedPrecisionRecallCurve", + } diff --git a/lite/valor_lite/schemas.py b/lite/valor_lite/schemas.py new file mode 100644 index 000000000..2279f464f --- /dev/null +++ b/lite/valor_lite/schemas.py @@ -0,0 +1,15 @@ +from dataclasses import dataclass + + +@dataclass +class Metric: + type: str + value: float | dict | list + parameters: dict + + def to_dict(self) -> dict: + return { + "type": self.type, + "value": self.value, + "parameters": self.parameters, + }