Implement valor_core to compute metrics locally via numpy (#651)

Striveworks · Aug 22, 2024 · c996f6d · c996f6d
1 parent 18b08b7
commit c996f6d
Show file tree

Hide file tree

Showing 30 changed files with 23,511 additions and 5 deletions.
diff --git a/.github/workflows/benchmark-evaluations.yml → ...lows/client-api-benchmark-evaluations.yml b/.github/workflows/benchmark-evaluations.yml → ...lows/client-api-benchmark-evaluations.yml
@@ -1,4 +1,4 @@
-name: Run benchmarks on pre-existing data
+name: Run API + client benchmarks
 
 on:
   push:

diff --git a/.github/workflows/tests-and-coverage.yml → ...rkflows/client-api-tests-and-coverage.yml b/.github/workflows/tests-and-coverage.yml → ...rkflows/client-api-tests-and-coverage.yml
@@ -1,4 +1,4 @@
-name: Unit, functional, integration tests and code coverage
+name: Run API + client code coverage report
 
 on:
   push:

diff --git a/.github/workflows/core-benchmark-evaluations.yml b/.github/workflows/core-benchmark-evaluations.yml
@@ -0,0 +1,40 @@
+# Runs the core classification and object detection benchmark scripts on every
+# push and prints the results.json found in each benchmark directory to the log.
+name: Run core benchmarks
+
+on:
+  push:
+    branches: "**"
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  run-benchmarks:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+      - name: install core
+        run: pip install -e .
+        working-directory: ./core
+      - name: run classification benchmarks
+        run: python benchmark_script.py
+        working-directory: ./core/benchmarks/classification
+      - name: print classification results
+        # Plain assignment (no `export`) so a failing python call fails the step.
+        run: |
+          BENCHMARK_RESULTS=$(python -c "import json;print(json.dumps(json.load(open('results.json', 'r')), indent=4));")
+          echo "$BENCHMARK_RESULTS"
+        working-directory: ./core/benchmarks/classification
+      - name: run object detection benchmarks
+        run: python benchmark_script.py
+        working-directory: ./core/benchmarks/object-detection
+      - name: print object detection results
+        run: |
+          BENCHMARK_RESULTS=$(python -c "import json;print(json.dumps(json.load(open('results.json', 'r')), indent=4));")
+          echo "$BENCHMARK_RESULTS"
+        working-directory: ./core/benchmarks/object-detection
diff --git a/.github/workflows/core-tests-and-coverage.yml b/.github/workflows/core-tests-and-coverage.yml
@@ -0,0 +1,41 @@
+# Runs the core functional and unit test suites on every push and fails the
+# job when the combined coverage total is below 90%.
+name: Run core code coverage report
+
+on:
+  push:
+    branches: "**"
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  core-tests:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: .
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+      - name: run tests and report coverage
+        run: |
+          pip install -e ".[test]"
+          COVERAGE_FILE=.coverage.functional python -m coverage run --omit "tests/*" -m pytest -v tests/functional-tests
+          COVERAGE_FILE=.coverage.unit python -m coverage run --omit "tests/*" -m pytest -v tests/unit-tests
+          python -m coverage combine
+          python -m coverage report -m
+          python -m coverage json
+          # Plain assignment (not `export`) so a failure reading coverage.json fails the step.
+          TOTAL=$(python -c "import json;print(json.load(open('coverage.json'))['totals']['percent_covered_display'])")
+          echo "total=$TOTAL" >> "$GITHUB_ENV"
+          # percent_covered_display may contain decimals (coverage `precision`
+          # setting); bash arithmetic is integer-only, so compare the whole part.
+          if (( ${TOTAL%.*} < 90 )); then
+            echo "Coverage is below 90%"
+            exit 1
+          fi
+        working-directory: ./core
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -32,19 +32,19 @@ repos:
     rev: v1.1.376
     hooks:
       - id: pyright
-        additional_dependencies:
-          [
+        additional_dependencies: [
             "requests",
             "Pillow >= 9.1.0",
             "numpy",
+            "pandas>=2.2.2",
+            "pandas-stubs", # fixes pyright issues with pandas
             "pytest",
             "python-dotenv",
             "SQLAlchemy>=2.0",
             "fastapi[all]>=0.100.0",
             "importlib_metadata; python_version < '3.8'",
             "pydantic-settings",
             "tqdm",
-            "pandas",
             "packaging",
             "PyJWT[crypto]",
             "structlog",
@@ -57,4 +57,5 @@ repos:
             "nltk",
             "rouge_score",
             "evaluate",
+            "shapely",
           ]
diff --git a/core/LICENSE b/core/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Striveworks
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/core/README.md b/core/README.md
@@ -0,0 +1,238 @@
+# valor_core: Compute classification, object detection, and segmentation metrics locally.
+
+Valor is a centralized evaluation store which makes it easy to measure, explore, and rank model performance. Valor empowers data scientists and engineers to evaluate the performance of their machine learning pipelines and use those evaluations to make better modeling decisions in the future.
+
+`valor_core` is the start of a new backbone for Valor's metric calculations. In the future, the Valor API will import `valor_core`'s evaluation functions in order to efficiently compute its classification, object detection, and segmentation metrics. This module offers a few advantages over the existing `valor` evaluation implementations, including:
+- The ability to calculate metrics locally, without running separate database and API services
+- Faster compute times due to the use of vectors and arrays
+- Easier testing, debugging, and benchmarking due to the separation of concerns between evaluation computations and Postgres operations (e.g., filtering, querying)
+
+Valor is maintained by Striveworks, a cutting-edge MLOps company based out of Austin, Texas. We'd love to learn more about your interest in Valor and answer any questions you may have; please don't hesitate to reach out to us on [Slack](https://striveworks-public.slack.com/join/shared_invite/zt-1a0jx768y-2J1fffN~b4fXYM8GecvOhA#/shared-invite/email) or [GitHub](https://github.com/striveworks/valor).
+
+For more information, please see our [user docs](https://striveworks.github.io/valor/).
+
+## Usage
+
+### Passing Lists of GroundTruth and Prediction Objects
+
+The first way to use `valor_core` is to pass lists of `GroundTruth` and `Prediction` objects to an `evaluate_...` function, like so:
+
+```python
+
+groundtruths = [
+    schemas.GroundTruth(
+            datum=img1,
+            annotations=...
+     ), …
+]
+predictions = [
+    schemas.Prediction(
+            datum=img1,
+            annotations=...
+     ), …
+]
+
+evaluation = evaluate_detection(
+        groundtruths=groundtruths,
+        predictions=predictions,
+        metrics_to_return=[
+            enums.MetricType.AP,
+            enums.MetricType.AR,
+            enums.MetricType.mAP,
+            enums.MetricType.APAveragedOverIOUs,
+            enums.MetricType.mAR,
+            enums.MetricType.mAPAveragedOverIOUs,
+            enums.MetricType.PrecisionRecallCurve,
+            enums.MetricType.DetailedPrecisionRecallCurve,
+        ],
+        pr_curve_iou_threshold=0.5,
+        pr_curve_max_examples=1,
+    )
+```
+
+### Passing DataFrames
+
+The second way to use `valor_core` is to pass in DataFrames of groundtruths and predictions:
+
+```python
+
+groundtruth_df = pd.DataFrame(
+        [
+            {
+                "datum_id": 1,
+                "datum_uid": "uid1",
+                "id": 1,
+                "annotation_id": 1,
+                "label_id": 1,
+                "label_key": "k1",
+                "label_value": "v1",
+                "is_instance": True,
+                "polygon": schemas.Polygon.from_dict(
+                    {
+                        "type": "Polygon",
+                        "coordinates": [
+                            [[10, 10], [60, 10], [60, 40], [10, 40], [10, 10]]
+                        ],
+                    }
+                ),
+                "raster": None,
+                "bounding_box": None,
+            },
+            {
+                "datum_id": 1,
+                "datum_uid": "uid1",
+                "id": 2,
+                "annotation_id": 2,
+                "label_id": 2,
+                "label_key": "k2",
+                "label_value": "v2",
+                "is_instance": True,
+                "polygon": schemas.Polygon.from_dict(
+                    {
+                        "type": "Polygon",
+                        "coordinates": [
+                            [
+                                [87, 10],
+                                [158, 10],
+                                [158, 820],
+                                [87, 820],
+                                [87, 10],
+                            ]
+                        ],
+                    }
+                ),
+                "raster": None,
+                "bounding_box": None,
+            },
+            {
+                "datum_id": 2,
+                "datum_uid": "uid2",
+                "id": 3,
+                "annotation_id": 3,
+                "label_id": 1,
+                "label_key": "k1",
+                "label_value": "v1",
+                "is_instance": True,
+                "polygon": schemas.Polygon.from_dict(
+                    {
+                        "type": "Polygon",
+                        "coordinates": [
+                            [[15, 0], [70, 0], [70, 20], [15, 20], [15, 0]]
+                        ],
+                    }
+                ),
+                "raster": None,
+                "bounding_box": None,
+            },
+        ]
+)
+prediction_df = pd.DataFrame(
+    [
+        {
+            "id": 1,
+            "annotation_id": 4,
+            "score": 0.3,
+            "datum_id": 1,
+            "datum_uid": "uid1",
+            "label_id": 1,
+            "label_key": "k1",
+            "label_value": "v1",
+            "is_instance": True,
+            "polygon": schemas.Polygon.from_dict(
+                {
+                    "type": "Polygon",
+                    "coordinates": [
+                        [[10, 10], [60, 10], [60, 40], [10, 40], [10, 10]]
+                    ],
+                }
+            ),
+            "raster": None,
+            "bounding_box": None,
+        },
+        {
+            "id": 2,
+            "annotation_id": 5,
+            "score": 0.98,
+            "datum_id": 2,
+            "datum_uid": "uid2",
+            "label_id": 2,
+            "label_key": "k2",
+            "label_value": "v2",
+            "is_instance": True,
+            "polygon": schemas.Polygon.from_dict(
+                {
+                    "type": "Polygon",
+                    "coordinates": [
+                        [[15, 0], [70, 0], [70, 20], [15, 20], [15, 0]]
+                    ],
+                }
+            ),
+            "raster": None,
+            "bounding_box": None,
+        },
+    ]
+)
+
+evaluation = evaluate_detection(
+        groundtruths=groundtruth_df,
+        predictions=prediction_df,
+        metrics_to_return=[
+            enums.MetricType.AP,
+            enums.MetricType.AR,
+            enums.MetricType.mAP,
+            enums.MetricType.APAveragedOverIOUs,
+            enums.MetricType.mAR,
+            enums.MetricType.mAPAveragedOverIOUs,
+            enums.MetricType.PrecisionRecallCurve,
+            enums.MetricType.DetailedPrecisionRecallCurve,
+        ],
+        pr_curve_iou_threshold=0.5,
+        pr_curve_max_examples=1,
+    )
+```
+
+### Using a Data Manager
+
+Finally, you can use a manager class (e.g., `ValorDetectionManager`) to run your evaluation. Using a manager class has two advantages: a) you won't have to keep all annotation types in memory in a large list, and b) we can pre-compute certain columns (e.g., `iou`) in advance of the `.evaluate()` call.
+
+
+```python
+manager = valor_core.ValorDetectionManager(...)
+img1 = schemas.Datum(
+        uid="uid1",
+        metadata={
+            "height": image_height,
+            "width": image_width,
+        },
+    )
+groundtruths = [
+    schemas.GroundTruth(
+            datum=img1,
+            annotations=...
+     ), …
+]
+predictions = [
+    schemas.Prediction(
+            datum=img1,
+            annotations=...
+     ), …
+]
+
+
+# the user passes a list of all groundtruths and predictions for a list of datums
+# this allows us to precompute IOUs at the datum_uid + label_key level
+manager.add_data(groundtruths=groundtruths, predictions=predictions)
+
+# the user calls .evaluate() to compute the evaluation
+evaluation = manager.evaluate()
+
+# the user must pass all groundtruths and predictions for a given datum at once
+# this restriction makes it so we can compute IOUs right away and throw away excess info like rasters, saving a significant amount of memory
+with pytest.raises(ValueError):
+    manager.add_data(groundtruths=groundtruths, predictions=predictions) # throws error since img1 has already been added to the manager's data
+
+# the user must also specify the label map, `convert_annotation_to_type`, etc. when instantiating the object
+# once set, these attributes can't be changed since subsequent IOU calculations will become apples-to-oranges with prior calculations
+with pytest.raises(ValueError):
+    manager.label_map = some_label_map # throws an error since label map can't be changed, only instantiated
+```
diff --git a/core/benchmarks/.gitignore b/core/benchmarks/.gitignore
@@ -0,0 +1 @@
+*.json