Skip to content

Commit

Permalink
[DOP-20061] refactor stats utils
Browse files Browse the repository at this point in the history
  • Loading branch information
TiGrib committed Nov 8, 2024
1 parent 4397d2e commit 1792d92
Showing 1 changed file with 32 additions and 107 deletions.
139 changes: 32 additions & 107 deletions tests/test_server/utils/stats.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
from datetime import datetime
from typing import Optional

from data_rentgen.db.models import Input, Output


async def relation_stats(relations: list[Input] | list[Output]) -> dict[int, dict[str, int | Optional[datetime]]]:
stats: dict[int, dict[str, int | Optional[datetime]]] = {}
async def relation_stats(relations: list[Input] | list[Output]):
stats = {}
for relation in relations:
if relation.dataset_id not in stats:
stats[relation.dataset_id] = {"num_bytes": 0, "num_rows": 0, "num_files": 0, "created_at": None}
Expand All @@ -22,117 +19,45 @@ async def relation_stats(relations: list[Input] | list[Output]) -> dict[int, dic


async def relation_stats_by_operations(relations: list[Input] | list[Output]):
    """Aggregate I/O statistics per ``(operation_id, dataset_id)`` pair.

    Args:
        relations: Input or Output rows to summarise. Each row must expose
            ``operation_id``, ``dataset_id``, ``num_bytes``, ``num_rows``,
            ``num_files`` and ``created_at``.

    Returns:
        A dict keyed by ``(str(operation_id), dataset_id)``. Every value is a
        dict holding the summed ``num_bytes``, ``num_rows`` and ``num_files``
        of the matching rows plus the latest ``created_at`` among them.
        An empty ``relations`` list produces an empty dict.
    """
    stats = {}
    # Single pass: each relation is folded into its bucket exactly once,
    # instead of rescanning the whole list for every (operation, dataset) pair.
    for relation in relations:
        key = (str(relation.operation_id), relation.dataset_id)
        if key not in stats:
            stats[key] = {"num_bytes": 0, "num_rows": 0, "num_files": 0, "created_at": None}

        entry = stats[key]
        entry["num_bytes"] += relation.num_bytes
        entry["num_rows"] += relation.num_rows
        entry["num_files"] += relation.num_files

        # ``created_at`` starts as None, so the first row seen always wins;
        # afterwards only a strictly newer timestamp replaces the stored one.
        latest = entry["created_at"]
        if latest is None or relation.created_at > latest:
            entry["created_at"] = relation.created_at

    return stats


async def relation_stats_by_runs(relations: list[Input] | list[Output]):
    """Aggregate I/O statistics per ``(run_id, dataset_id)`` pair.

    Args:
        relations: Input or Output rows to summarise. Each row must expose
            ``run_id``, ``dataset_id``, ``num_bytes``, ``num_rows``,
            ``num_files`` and ``created_at``.

    Returns:
        A dict keyed by ``(str(run_id), dataset_id)``. Every value is a dict
        holding the summed ``num_bytes``, ``num_rows`` and ``num_files`` of
        the matching rows plus the latest ``created_at`` among them.
        An empty ``relations`` list produces an empty dict.
    """
    stats = {}
    # Single pass: each relation is folded into its bucket exactly once,
    # instead of rescanning the whole list for every (run, dataset) pair.
    for relation in relations:
        key = (str(relation.run_id), relation.dataset_id)
        if key not in stats:
            stats[key] = {"num_bytes": 0, "num_rows": 0, "num_files": 0, "created_at": None}

        entry = stats[key]
        entry["num_bytes"] += relation.num_bytes
        entry["num_rows"] += relation.num_rows
        entry["num_files"] += relation.num_files

        # ``created_at`` starts as None, so the first row seen always wins;
        # afterwards only a strictly newer timestamp replaces the stored one.
        latest = entry["created_at"]
        if latest is None or relation.created_at > latest:
            entry["created_at"] = relation.created_at

    return stats


async def relation_stats_by_jobs(relations: list[Input] | list[Output]):
    """Aggregate I/O statistics per ``(job_id, dataset_id)`` pair.

    Args:
        relations: Input or Output rows to summarise. Each row must expose
            ``job_id``, ``dataset_id``, ``num_bytes``, ``num_rows``,
            ``num_files`` and ``created_at``.

    Returns:
        A dict keyed by ``(str(job_id), dataset_id)``. Every value is a dict
        holding the summed ``num_bytes``, ``num_rows`` and ``num_files`` of
        the matching rows plus the latest ``created_at`` among them.
        An empty ``relations`` list produces an empty dict.
    """
    stats = {}
    # Single pass: each relation is folded into its bucket exactly once,
    # instead of rescanning the whole list for every (job, dataset) pair.
    for relation in relations:
        key = (str(relation.job_id), relation.dataset_id)
        if key not in stats:
            stats[key] = {"num_bytes": 0, "num_rows": 0, "num_files": 0, "created_at": None}

        entry = stats[key]
        entry["num_bytes"] += relation.num_bytes
        entry["num_rows"] += relation.num_rows
        entry["num_files"] += relation.num_files

        # ``created_at`` starts as None, so the first row seen always wins;
        # afterwards only a strictly newer timestamp replaces the stored one.
        latest = entry["created_at"]
        if latest is None or relation.created_at > latest:
            entry["created_at"] = relation.created_at

    return stats

0 comments on commit 1792d92

Please sign in to comment.