reformatted classification benchmark
czaloom committed Aug 15, 2024
1 parent f2169b8 commit 993fd04
Showing 1 changed file with 197 additions and 87 deletions.
284 changes: 197 additions & 87 deletions integration_tests/benchmarks/classification/benchmark_script.py
@@ -1,7 +1,9 @@
import json
import os
import time
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path

import requests

@@ -22,7 +24,7 @@
client = Client()


def download_data_if_not_exists(file_path: str, file_url: str):
def download_data_if_not_exists(file_path: Path, file_url: str):
"""Download the data from a public bucket if it doesn't exist in the repo."""
if os.path.exists(file_path):
return
@@ -32,36 +34,37 @@ def download_data_if_not_exists(file_path: str, file_url: str):
json.dump(response, file, indent=4)


def write_results_to_file(write_path: str, result_dict: dict):
def write_results_to_file(write_path: Path, results: list[dict]):
"""Write results to results.json"""
current_datetime = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

if os.path.isfile(write_path):
with open(write_path, "r") as file:
file.seek(0)
data = json.load(file)
else:
data = {}

data[current_datetime] = result_dict
data[current_datetime] = results

with open(write_path, "w+") as file:
json.dump(data, file, indent=4)
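
# Note on the on-disk format: results.json accumulates across runs, with each
# call adding one entry keyed by the run's timestamp. Illustrative layout only
# (the dates and contents below are made up):
# {
#     "15/08/2024 12:00:00": [ ...one benchmark dict per limit... ],
#     "16/08/2024 09:30:00": [ ... ]
# }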


def ingest_groundtruths_and_predictions(
dset: Dataset, model: Model, raw: dict, pair_limit: int
def ingest_groundtruths(
dset: Dataset,
raw: dict,
pair_limit: int,
timeout: int | None,
):
"""Ingest the data into Valor."""
"""Ingest groundtruths into Valor."""

groundtruths = []
predictions = []
slice_ = (
raw["groundtruth_prediction_pairs"][:pair_limit]
if pair_limit != -1
else raw["groundtruth_prediction_pairs"]
)
for groundtruth, prediction in slice_:
for groundtruth, _ in slice_:
groundtruths.append(
GroundTruth(
datum=Datum(
@@ -84,6 +87,26 @@ def ingest_groundtruths_and_predictions(
)
)

dset.add_groundtruths(groundtruths, timeout=timeout)


def ingest_predictions(
dset: Dataset,
model: Model,
raw: dict,
pair_limit: int,
timeout: int | None,
):
"""Ingest the data into Valor."""

predictions = []
slice_ = (
raw["groundtruth_prediction_pairs"][:pair_limit]
if pair_limit != -1
else raw["groundtruth_prediction_pairs"]
)
for _, prediction in slice_:

predictions.append(
Prediction(
datum=Datum(
@@ -106,65 +129,122 @@ def ingest_groundtruths_and_predictions(
)
)

dset.add_groundtruths(groundtruths, timeout=150)
model.add_predictions(dset, predictions, timeout=150)
model.add_predictions(dset, predictions, timeout=timeout)

dset.finalize()
model.finalize_inferences(dataset=dset)


def run_base_evaluation(dset: Dataset, model: Model):
def run_base_evaluation(dset: Dataset, model: Model, timeout: int | None):
"""Run a base evaluation (with no PR curves)."""
evaluation = model.evaluate_classification(dset)
evaluation.wait_for_completion()
try:
evaluation = model.evaluate_classification(dset)
evaluation.wait_for_completion(timeout=timeout)
except TimeoutError:
raise TimeoutError(
f"Base evaluation timed out when processing {evaluation.meta['datums']} datums." # type: ignore
)
return evaluation


def run_pr_curve_evaluation(dset: Dataset, model: Model):
def run_pr_curve_evaluation(dset: Dataset, model: Model, timeout: int | None):
"""Run a base evaluation with PrecisionRecallCurve included."""
evaluation = model.evaluate_classification(
dset,
metrics_to_return=[
MetricType.Accuracy,
MetricType.Precision,
MetricType.Recall,
MetricType.F1,
MetricType.ROCAUC,
MetricType.PrecisionRecallCurve,
],
)
evaluation.wait_for_completion()
try:
evaluation = model.evaluate_classification(
dset,
metrics_to_return=[
MetricType.Accuracy,
MetricType.Precision,
MetricType.Recall,
MetricType.F1,
MetricType.ROCAUC,
MetricType.PrecisionRecallCurve,
],
)
evaluation.wait_for_completion(timeout=timeout)
except TimeoutError:
raise TimeoutError(
f"PR evaluation timed out when processing {evaluation.meta['datums']} datums." # type: ignore
)
return evaluation


def run_detailed_pr_curve_evaluation(dset: Dataset, model: Model):
def run_detailed_pr_curve_evaluation(
dset: Dataset, model: Model, timeout: int | None
):
"""Run a base evaluation with PrecisionRecallCurve and DetailedPrecisionRecallCurve included."""

evaluation = model.evaluate_classification(
dset,
metrics_to_return=[
MetricType.Accuracy,
MetricType.Precision,
MetricType.Recall,
MetricType.F1,
MetricType.ROCAUC,
MetricType.PrecisionRecallCurve,
MetricType.DetailedPrecisionRecallCurve,
],
)
evaluation.wait_for_completion()
try:
evaluation = model.evaluate_classification(
dset,
metrics_to_return=[
MetricType.Accuracy,
MetricType.Precision,
MetricType.Recall,
MetricType.F1,
MetricType.ROCAUC,
MetricType.PrecisionRecallCurve,
MetricType.DetailedPrecisionRecallCurve,
],
)
evaluation.wait_for_completion(timeout=timeout)
except TimeoutError:
raise TimeoutError(
f"Detailed evaluation timed out when processing {evaluation.meta['datums']} datums." # type: ignore
)
return evaluation


@dataclass
class DataBenchmark:
ingestion: float
finalization: float
deletion: float

def result(self) -> dict[str, float | str]:
return {
"ingestion": round(self.ingestion, 2),
"finalization": round(self.finalization, 2),
"deletion": round(self.deletion, 2),
}


@dataclass
class EvaluationBenchmark:
limit: int
gt_stats: DataBenchmark
pd_stats: DataBenchmark
n_datums: int
n_annotations: int
n_labels: int
eval_base: float
eval_base_pr: float
eval_base_pr_detail: float

def result(self) -> dict[str, float | str | dict[str, str | float]]:
return {
"limit": self.limit,
"groundtruths": self.gt_stats.result(),
"predictions": self.pd_stats.result(),
"evaluation": {
"number_of_datums": self.n_datums,
"number_of_annotations": self.n_annotations,
"number_of_labels": self.n_labels,
"base": round(self.eval_base, 2),
"base+pr": round(self.eval_base_pr, 2),
"base+pr+detailed": round(self.eval_base_pr_detail, 2),
},
}
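
# Illustrative shape of EvaluationBenchmark(...).result() for a single limit;
# every number below is a placeholder, not a measurement from this commit:
# {
#     "limit": 5000,
#     "groundtruths": {"ingestion": 10.21, "finalization": 0.42, "deletion": 1.05},
#     "predictions": {"ingestion": 12.77, "finalization": 0.38, "deletion": 0.91},
#     "evaluation": {
#         "number_of_datums": 5000,
#         "number_of_annotations": 5000,
#         "number_of_labels": 200,
#         "base": 3.14,
#         "base+pr": 4.20,
#         "base+pr+detailed": 7.85,
#     },
# }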


def run_benchmarking_analysis(
limits_to_test: list[int] = [5000, 5000],
limits: list[int],
results_file: str = "results.json",
data_file: str = "data.json",
ingestion_timeout: int | None = 150,
evaluation_timeout: int | None = 30,
):
"""Time various function calls and export the results."""
current_directory = os.path.dirname(os.path.realpath(__file__))
write_path = f"{current_directory}/{results_file}"
data_path = f"{current_directory}/{data_file}"
current_directory = Path(os.path.dirname(os.path.realpath(__file__)))
write_path = current_directory / Path(results_file)
data_path = current_directory / Path(data_file)

download_data_if_not_exists(
file_path=data_path,
@@ -175,58 +255,88 @@ def run_benchmarking_analysis(
file.seek(0)
raw_data = json.load(file)

for limit in limits_to_test:
results = list()
for limit in limits:

dset = Dataset.create(name=f"bird-identification-{time.time()}")
model = Model.create(name=f"some_model-{time.time()}")

# ingest groundtruths
start_time = time.time()
ingest_groundtruths(
dset=dset,
raw=raw_data,
pair_limit=limit,
timeout=ingestion_timeout,
)
gt_ingest_time = time.time() - start_time

ingest_groundtruths_and_predictions(
dset=dset, model=model, raw=raw_data, pair_limit=limit
# finalize groundtruths
start_time = time.time()
dset.finalize()
gt_finalization_time = time.time() - start_time

# ingest predictions
start_time = time.time()
ingest_predictions(
dset=dset,
model=model,
raw=raw_data,
pair_limit=limit,
timeout=ingestion_timeout,
)
ingest_time = time.time() - start_time
pd_ingest_time = time.time() - start_time

try:
eval_ = run_base_evaluation(dset=dset, model=model)
except TimeoutError:
raise TimeoutError(
f"Evaluation timed out when processing {limit} datums."
)
# finalize predictions
start_time = time.time()
model.finalize_inferences(dset)
pd_finalization_time = time.time() - start_time

try:
eval_pr = run_pr_curve_evaluation(dset=dset, model=model)
except TimeoutError:
raise TimeoutError(
f"PR Evaluation timed out when processing {limit} datums."
)
# run evaluations
eval_base = run_base_evaluation(
dset=dset, model=model, timeout=evaluation_timeout
)
eval_pr = run_pr_curve_evaluation(
dset=dset, model=model, timeout=evaluation_timeout
)
eval_detail = run_detailed_pr_curve_evaluation(
dset=dset, model=model, timeout=evaluation_timeout
)

try:
eval_pr_detail = run_detailed_pr_curve_evaluation(
dset=dset, model=model
)
except TimeoutError:
raise TimeoutError(
f"Detailed PR Evaluation timed out when processing {limit} datums."
)
# delete model
start = time.time()
client.delete_model(model.name, timeout=30)
pd_deletion_time = time.time() - start

# delete dataset
start = time.time()
client.delete_dataset(dset.name, timeout=30)
client.delete_model(model.name, timeout=30)
deletion_time = time.time() - start

results = {
"number_of_datums": limit,
"number_of_unique_labels": eval_.meta["labels"],
"number_of_annotations": eval_.meta["annotations"],
"ingest_runtime": f"{(ingest_time):.1f} seconds",
"eval_runtime": f"{(eval_.meta['duration']):.1f} seconds",
"eval_pr_runtime": f"{(eval_pr.meta['duration']):.1f} seconds",
"eval_detailed_pr_runtime": f"{(eval_pr_detail.meta['duration']):.1f} seconds",
"del_runtime": f"{(deletion_time):.1f} seconds",
}
write_results_to_file(write_path=write_path, result_dict=results)
gt_deletion_time = time.time() - start

results.append(
EvaluationBenchmark(
limit=limit,
gt_stats=DataBenchmark(
ingestion=gt_ingest_time,
finalization=gt_finalization_time,
deletion=gt_deletion_time,
),
pd_stats=DataBenchmark(
ingestion=pd_ingest_time,
finalization=pd_finalization_time,
deletion=pd_deletion_time,
),
n_datums=eval_base.meta["datums"],
n_annotations=eval_base.meta["annotations"],
n_labels=eval_base.meta["labels"],
eval_base=eval_base.meta["duration"],
eval_base_pr=eval_pr.meta["duration"],
eval_base_pr_detail=eval_detail.meta["duration"],
).result()
)

write_results_to_file(write_path=write_path, results=results)


if __name__ == "__main__":
run_benchmarking_analysis()
run_benchmarking_analysis(limits=[5000, 5000])
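
For a quick local check, the refactored entry point can also be driven with smaller limits and explicit timeouts; a minimal sketch (the limit sizes, file name, and timeout values below are illustrative, not part of this commit):

# hypothetical smoke test; all argument values here are made up
run_benchmarking_analysis(
    limits=[100, 1000],
    results_file="smoke_results.json",
    ingestion_timeout=300,
    evaluation_timeout=60,
)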
