diff --git a/.github/workflows/benchmark-evaluations.yml b/.github/workflows/benchmark-evaluations.yml
new file mode 100644
index 000000000..8d19e0d8b
--- /dev/null
+++ b/.github/workflows/benchmark-evaluations.yml
@@ -0,0 +1,46 @@
+name: Run benchmarks on pre-existing data
+
+on:
+  push:
+    branches: "**"
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  run-benchmarks:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: build postgres
+        run: |
+          docker build ./database -t pgvalor
+      - name: setup back end test env
+        run: docker compose -p valor -f docker-compose.yml -f docker-compose.cicd-override.yml --env-file ./api/.env.testing up --build -d
+      - uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+      - name: install api
+        run: pip install -e ".[test]"
+        working-directory: ./api
+      - name: install client
+        run: pip install -e ".[test]"
+        working-directory: ./client
+      - name: run classification benchmarks
+        run: python benchmark_script.py
+        working-directory: ./integration_tests/benchmarks/classification
+      - name: print classification results
+        run: |
+          export BENCHMARK_RESULTS=$(python -c "import os;import json;print(json.dumps(json.load(open('results.json', 'r')), indent=4));")
+          echo "$BENCHMARK_RESULTS"
+        working-directory: ./integration_tests/benchmarks/classification
+      - name: run object detection benchmarks
+        run: python benchmark_script.py
+        working-directory: ./integration_tests/benchmarks/object-detection
+      - name: print object detection results
+        run: |
+          export BENCHMARK_RESULTS=$(python -c "import os;import json;print(json.dumps(json.load(open('results.json', 'r')), indent=4));")
+          echo "$BENCHMARK_RESULTS"
+        working-directory: ./integration_tests/benchmarks/object-detection
+      - run: make stop-env
diff --git a/.github/workflows/stress-test.yml b/.github/workflows/stress-test.yml
deleted file mode 100644
index 46ed2ab27..000000000
--- a/.github/workflows/stress-test.yml
+++ /dev/null
@@ -1,81 +0,0 @@
-name: Run stress tests manually via the GitHub Actions UI
-
-on:
-  workflow_dispatch:
-
-permissions:
-  id-token: write
-  contents: read
-
-env:
-  AWS_ROLE: arn:aws:iam::724664234782:role/Striveworks-Role-github_runner_npe
-  AWS_REGION: us-east-1
-
-jobs:
-  integration-stress-tests:
-    env:
-      COVERAGE_FILE: .coverage.integration-stress-tests
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
-      - name: build postgres
-        run: |
-          docker build ./database -t pgvalor
-      - name: setup back end test env
-        run: docker compose -p valor -f docker-compose.yml -f docker-compose.cicd-override.yml --env-file ./api/.env.testing up --build -d
-      - uses: actions/setup-python@v4
-        with:
-          python-version: "3.10"
-      - name: install api
-        run: pip install -e ".[test]"
-        working-directory: ./api
-      - name: install client
-        run: pip install -e ".[test]"
-        working-directory: ./client
-      - run: coverage run --source="api/valor_api,client/valor" -m pytest -v integration_tests/stress_test.py
-      - run: coverage report
-      - name: upload coverage report as artifact
-        uses: actions/upload-artifact@v3
-        with:
-          name: ${{ env.COVERAGE_FILE }}
-          path: ${{ env.COVERAGE_FILE }}
-      - run: make stop-env
-      - run: docker compose -p valor -f docker-compose.yml -f docker-compose.cicd-override.yml --env-file ./api/.env.testing up --build -d
-        env:
-          VALOR_SECRET_KEY: ${{ vars.SECRET_KEY }}
-          VALOR_USERNAME: ${{ vars.USERNAME }}
-          VALOR_PASSWORD: ${{ vars.PASSWORD }}
-      - name: sleep to give back end time to spin up
-        run: sleep 15
-
-  combine-coverage-report:
-    needs: [integration-stress-tests]
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v4
-        with:
-          python-version: "3.10"
-      - run: pip install coverage
-      - uses: actions/download-artifact@v3
-        with:
-          name: .coverage.integration-stress-tests
-      - run: coverage combine
-      - run: coverage report
-      # https://nedbatchelder.com/blog/202209/making_a_coverage_badge.html
-      - run: |
-          coverage json
-          export TOTAL=$(python -c "import json;print(json.load(open('coverage.json'))['totals']['percent_covered_display'])")
-          echo "total=$TOTAL" >> $GITHUB_ENV
-      - name: "Make badge"
-        if: github.ref == 'refs/heads/main'
-        uses: schneegans/dynamic-badges-action@v1.4.0
-        with:
-          auth: ${{ secrets.GIST_SECRET }}
-          gistID: 501428c92df8d0de6805f40fb78b1363
-          filename: valor-coverage.json
-          label: Coverage
-          message: ${{ env.total }}%
-          minColorRange: 50
-          maxColorRange: 90
-          valColorRange: ${{ env.total }}
diff --git a/integration_tests/benchmarks/.gitignore b/integration_tests/benchmarks/.gitignore
new file mode 100644
index 000000000..94a2dd146
--- /dev/null
+++ b/integration_tests/benchmarks/.gitignore
@@ -0,0 +1 @@
+*.json
\ No newline at end of file
diff --git a/integration_tests/benchmarks/classification/benchmark_script.py b/integration_tests/benchmarks/classification/benchmark_script.py
new file mode 100644
index 000000000..5a0b8cfbe
--- /dev/null
+++ b/integration_tests/benchmarks/classification/benchmark_script.py
@@ -0,0 +1,212 @@
+import json
+import os
+from datetime import datetime
+from time import time
+
+import requests
+
+from valor import (
+    Annotation,
+    Client,
+    Dataset,
+    Datum,
+    GroundTruth,
+    Label,
+    Model,
+    Prediction,
+    connect,
+)
+
+connect("http://0.0.0.0:8000")
+client = Client()
+
+
+def download_data_if_not_exists(file_path: str, file_url: str):
+    """Download the data from a public bucket if it doesn't exist in the repo."""
+    if os.path.exists(file_path):
+        return
+
+    response = json.loads(requests.get(file_url).text)
+    with open(file_path, "w+") as file:
+        json.dump(response, file, indent=4)
+
+
+def write_results_to_file(write_path: str, result_dict: dict):
+    """Write results to results.json"""
+    current_datetime = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
+
+    if os.path.isfile(write_path):
+        with open(write_path, "r") as file:
+            file.seek(0)
+            data = json.load(file)
+    else:
+        data = {}
+
+    data[current_datetime] = result_dict
+
+    with open(write_path, "w+") as file:
+        json.dump(data, file, indent=4)
+
+
+def ingest_groundtruths_and_predictions(
+    dset: Dataset, model: Model, raw: dict, pair_limit: int
+):
+    """Ingest the data into Valor."""
+
+    groundtruths = []
+    predictions = []
+    slice_ = (
+        raw["groundtruth_prediction_pairs"][:pair_limit]
+        if pair_limit != -1
+        else raw["groundtruth_prediction_pairs"]
+    )
+    for groundtruth, prediction in slice_:
+        groundtruths.append(
+            GroundTruth(
+                datum=Datum(
+                    uid=groundtruth["value"]["datum"]["uid"],
+                    metadata={"width": 224, "height": 224},
+                ),
+                annotations=[
+                    Annotation(
+                        labels=[
+                            Label(
+                                key=label["key"],
+                                value=label["value"],
+                                score=label["score"],
+                            )
+                            for label in annotation["labels"]
+                        ],
+                    )
+                    for annotation in groundtruth["value"]["annotations"]
+                ],
+            )
+        )
+
+        predictions.append(
+            Prediction(
+                datum=Datum(
+                    uid=prediction["value"]["datum"]["uid"],
+                    metadata={"width": 224, "height": 224},
+                ),
+                annotations=[
+                    Annotation(
+                        labels=[
+                            Label(
+                                key=label["key"],
+                                value=label["value"],
+                                score=label["score"],
+                            )
+                            for label in annotation["labels"]
+                        ],
+                    )
+                    for annotation in prediction["value"]["annotations"]
+                ],
+            )
+        )
+
+    for gt in groundtruths:
+        dset.add_groundtruth(gt)
+
+    for pred in predictions:
+        model.add_prediction(dset, pred)
+
+    dset.finalize()
+    model.finalize_inferences(dataset=dset)
+
+
+def run_base_evaluation(dset: Dataset, model: Model):
+    """Run a base evaluation (with no PR curves)."""
+    evaluation = model.evaluate_classification(dset)
+    evaluation.wait_for_completion(timeout=30)
+    return evaluation
+
+
+def run_pr_curve_evaluation(dset: Dataset, model: Model):
+    """Run a base evaluation with PrecisionRecallCurve included."""
+    evaluation = model.evaluate_classification(
+        dset,
+        metrics_to_return=[
+            "Accuracy",
+            "Precision",
+            "Recall",
+            "F1",
+            "ROCAUC",
+            "PrecisionRecallCurve",
+        ],
+    )
+    evaluation.wait_for_completion()
+    return evaluation
+
+
+def run_detailed_pr_curve_evaluation(dset: Dataset, model: Model):
+    """Run a base evaluation with PrecisionRecallCurve and DetailedPrecisionRecallCurve included."""
+
+    evaluation = model.evaluate_classification(
+        dset,
+        metrics_to_return=[
+            "Accuracy",
+            "Precision",
+            "Recall",
+            "F1",
+            "ROCAUC",
+            "PrecisionRecallCurve",
+            "DetailedPrecisionRecallCurve",
+        ],
+    )
+    evaluation.wait_for_completion()
+    return evaluation
+
+
+def run_benchmarking_analysis(
+    limits_to_test: list[int] = [1000, 1000],
+    results_file: str = "results.json",
+    data_file: str = "data.json",
+):
+    """Time various function calls and export the results."""
+    current_directory = os.path.dirname(os.path.realpath(__file__))
+    write_path = f"{current_directory}/{results_file}"
+    data_path = f"{current_directory}/{data_file}"
+
+    download_data_if_not_exists(
+        file_path=data_path,
+        file_url="https://pub-fae71003f78140bdaedf32a7c8d331d2.r2.dev/classification_data.json",
+    )
+
+    with open(data_path) as file:
+        file.seek(0)
+        raw_data = json.load(file)
+
+    for limit in limits_to_test:
+        dset = Dataset.create(name="bird-identification")
+        model = Model.create(name="some_model")
+
+        start_time = time()
+
+        ingest_groundtruths_and_predictions(
+            dset=dset, model=model, raw=raw_data, pair_limit=limit
+        )
+        ingest_time = time() - start_time
+
+        try:
+            eval_ = run_base_evaluation(dset=dset, model=model)
+        except TimeoutError:
+            raise TimeoutError(
+                f"Evaluation timed out when processing {limit} datums."
+            )
+
+        results = {
+            "number_of_datums": limit,
+            "number_of_unique_labels": eval_.meta["labels"],
+            "number_of_annotations": eval_.meta["annotations"],
+            "ingest_runtime": f"{(ingest_time):.1f} seconds",
+            "eval_runtime": f"{(eval_.meta['duration']):.1f} seconds",
+        }
+        write_results_to_file(write_path=write_path, result_dict=results)
+
+        client.delete_dataset(dset.name, timeout=30)
+        client.delete_model(model.name, timeout=30)
+
+
+if __name__ == "__main__":
+    run_benchmarking_analysis()
diff --git a/integration_tests/benchmarks/object-detection/benchmark_script.py b/integration_tests/benchmarks/object-detection/benchmark_script.py
new file mode 100644
index 000000000..556cd5141
--- /dev/null
+++ b/integration_tests/benchmarks/object-detection/benchmark_script.py
@@ -0,0 +1,290 @@
+import json
+import os
+from datetime import datetime
+from time import time
+
+import requests
+
+from valor import (
+    Annotation,
+    Client,
+    Dataset,
+    Datum,
+    GroundTruth,
+    Label,
+    Model,
+    Prediction,
+    connect,
+)
+from valor.schemas import MultiPolygon, Polygon, Raster
+
+connect("http://0.0.0.0:8000")
+client = Client()
+
+
+def download_data_if_not_exists(file_path: str, file_url: str):
+    """Download the data from a public bucket if it doesn't exist in the repo."""
+    if os.path.exists(file_path):
+        return
+
+    response = json.loads(requests.get(file_url).text)
+    with open(file_path, "w+") as file:
+        json.dump(response, file, indent=4)
+
+
+def _convert_wkt_to_coordinates(wkt: str) -> list[list[tuple]]:
+    """Convert a WKT string into a nested list of coordinates."""
+    return [
+        [tuple(float(y) for y in x) for x in json.loads(wkt)["coordinates"][0]]
+    ]
+
+
+def write_results_to_file(write_path: str, result_dict: dict):
+    """Write results to results.json"""
+    current_datetime = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
+
+    if os.path.isfile(write_path):
+        with open(write_path, "r") as file:
+            file.seek(0)
+            data = json.load(file)
+    else:
+        data = {}
+
+    data[current_datetime] = result_dict
+
+    with open(write_path, "w+") as file:
+        json.dump(data, file, indent=4)
+
+
+def ingest_groundtruths_and_predictions(
+    dset: Dataset, model: Model, raw: list, pair_limit: int
+):
+    """Ingest the data into Valor."""
+    groundtruths = []
+    predictions = []
+
+    for datum_id, data in raw[:pair_limit]:
+        datum = Datum(
+            uid=str(datum_id),
+            metadata=data["datum_metadata"],
+        )
+        groundtruths.append(
+            GroundTruth(
+                datum=datum,
+                annotations=list(
+                    [
+                        Annotation(
+                            is_instance=ann["is_instance"],
+                            labels=list(
+                                [
+                                    Label(
+                                        key=label["key"],
+                                        value=label["value"],
+                                    )
+                                    for label in ann["labels"]
+                                ]
+                            ),
+                            bounding_box=(
+                                _convert_wkt_to_coordinates(ann["box"])
+                                if ann["box"]
+                                else None
+                            ),
+                            raster=(
+                                Raster.from_geometry(
+                                    geometry=MultiPolygon(
+                                        [
+                                            _convert_wkt_to_coordinates(
+                                                ann["raster"]
+                                            )
+                                        ]
+                                    ),
+                                    height=data["datum_metadata"]["height"],
+                                    width=data["datum_metadata"]["width"],
+                                )
+                                if ann["raster"]
+                                else None
+                            ),
+                            polygon=(
+                                (
+                                    Polygon(
+                                        _convert_wkt_to_coordinates(
+                                            ann["polygon"]
+                                        )
+                                    )
+                                )
+                                if ann["polygon"]
+                                else None
+                            ),
+                        )
+                        for ann in data["groundtruth_annotations"]
+                    ]
+                ),
+            )
+        )
+
+        predictions.append(
+            Prediction(
+                datum=datum,
+                annotations=list(
+                    [
+                        Annotation(
+                            is_instance=ann["is_instance"],
+                            labels=list(
+                                [
+                                    Label(
+                                        key=label["key"],
+                                        value=label["value"],
+                                        score=label["score"],
+                                    )
+                                    for label in ann["labels"]
+                                ]
+                            ),
+                            bounding_box=(
+                                _convert_wkt_to_coordinates(ann["box"])
+                                if ann["box"]
+                                else None
+                            ),
+                            raster=(
+                                Raster.from_geometry(
+                                    geometry=MultiPolygon(
+                                        [
+                                            _convert_wkt_to_coordinates(
+                                                ann["raster"]
+                                            )
+                                        ]
+                                    ),
+                                    height=data["datum_metadata"]["height"],
+                                    width=data["datum_metadata"]["width"],
+                                )
+                                if ann["raster"]
+                                else None
+                            ),
+                            polygon=(
+                                (
+                                    Polygon(
+                                        _convert_wkt_to_coordinates(
+                                            ann["polygon"]
+                                        )
+                                    )
+                                )
+                                if ann["polygon"]
+                                else None
+                            ),
+                        )
+                        for ann in data["prediction_annotations"]
+                    ]
+                ),
+            )
+        )
+
+    for gt in groundtruths:
+        dset.add_groundtruth(gt)
+
+    for pred in predictions:
+        model.add_prediction(dset, pred)
+
+    dset.finalize()
+    model.finalize_inferences(dataset=dset)
+
+
+def run_base_evaluation(dset: Dataset, model: Model):
+    """Run a base evaluation (with no PR curves)."""
+    evaluation = model.evaluate_detection(dset)
+    evaluation.wait_for_completion(timeout=30)
+    return evaluation
+
+
+def run_pr_curve_evaluation(dset: Dataset, model: Model):
+    """Run a base evaluation with PrecisionRecallCurve included."""
+    evaluation = model.evaluate_detection(
+        dset,
+        metrics_to_return=[
+            "AP",
+            "AR",
+            "mAP",
+            "APAveragedOverIOUs",
+            "mAR",
+            "mAPAveragedOverIOUs",
+            "PrecisionRecallCurve",
+        ],
+    )
+    evaluation.wait_for_completion()
+    return evaluation
+
+
+def run_detailed_pr_curve_evaluation(dset: Dataset, model: Model):
+    """Run a base evaluation with PrecisionRecallCurve and DetailedPrecisionRecallCurve included."""
+
+    evaluation = model.evaluate_detection(
+        dset,
+        metrics_to_return=[
+            "AP",
+            "AR",
+            "mAP",
+            "APAveragedOverIOUs",
+            "mAR",
+            "mAPAveragedOverIOUs",
+            "PrecisionRecallCurve",
+            "DetailedPrecisionRecallCurve",
+        ],
+    )
+    evaluation.wait_for_completion()
+    return evaluation
+
+
+def run_benchmarking_analysis(
+    limits_to_test: list[int] = [3, 3],
+    results_file: str = "results.json",
+    data_file: str = "data.json",
+):
+    """Time various function calls and export the results."""
+    current_directory = os.path.dirname(os.path.realpath(__file__))
+    write_path = f"{current_directory}/{results_file}"
+    data_path = f"{current_directory}/{data_file}"
+
+    download_data_if_not_exists(
+        file_path=data_path,
+        file_url="https://pub-fae71003f78140bdaedf32a7c8d331d2.r2.dev/detection_data.json",
+    )
+
+    with open(data_path) as file:
+        file.seek(0)
+        raw_data = json.load(file)
+
+    for limit in limits_to_test:
+        dset = Dataset.create(name="coco-dataset")
+        model = Model.create(name="coco-model")
+
+        # convert dict into list of tuples so we can slice it
+        raw_data_tuple = [(key, value) for key, value in raw_data.items()]
+
+        start_time = time()
+
+        ingest_groundtruths_and_predictions(
+            dset=dset, model=model, raw=raw_data_tuple, pair_limit=limit
+        )
+        ingest_time = time() - start_time
+
+        try:
+            eval_ = run_base_evaluation(dset=dset, model=model)
+        except TimeoutError:
+            raise TimeoutError(
+                f"Evaluation timed out when processing {limit} datums."
+            )
+
+        results = {
+            "number_of_datums": limit,
+            "number_of_unique_labels": eval_.meta["labels"],
+            "number_of_annotations": eval_.meta["annotations"],
+            "ingest_runtime": f"{(ingest_time):.1f} seconds",
+            "eval_runtime": f"{(eval_.meta['duration']):.1f} seconds",
+        }
+
+        write_results_to_file(write_path=write_path, result_dict=results)
+
+        client.delete_dataset(dset.name, timeout=30)
+        client.delete_model(model.name, timeout=30)
+
+
+if __name__ == "__main__":
+    run_benchmarking_analysis()
diff --git a/integration_tests/stress_test.py b/integration_tests/stress_test.py
deleted file mode 100644
index 0dcb49b40..000000000
--- a/integration_tests/stress_test.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# NOTE: These tests aren't run automatically on each commit. They are intended to be manually kicked-off using the [GitHub UI](https://leonardomontini.dev/github-action-manual-trigger/)
-from integration_tests.client.datatype.test_data_generation import (
-    test_generate_segmentation_data,
-)
-
-from valor import Client
-
-
-def test_large_dataset_upload(client: Client):
-    """Tests the upload of a large dataset to valor (runtime: ~20 minutes)"""
-    test_generate_segmentation_data(
-        client=client,
-        dataset_name="stress_test_dataset",
-        n_images=1000,
-        n_annotations=10,
-        n_labels=2,
-    )
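A minimal local-run sketch for the new benchmark scripts, assuming the back-end test environment from docker-compose is already up so the Valor API is reachable at http://0.0.0.0:8000 (the address both scripts pass to connect()); the pair limit below is illustrative, not the CI default:

    # Run from integration_tests/benchmarks/classification (or object-detection).
    # Importing benchmark_script calls connect("http://0.0.0.0:8000"), so the API must already be running.
    from benchmark_script import run_benchmarking_analysis

    # Downloads data.json from the public bucket on first use, then appends timings to results.json.
    run_benchmarking_analysis(limits_to_test=[100])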