Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize validation layout updates #8789

Merged
merged 41 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
a7df39d
Improve validation frame distribution in honeypot tasks
zhiltsov-max Dec 4, 2024
8f83f04
Improve frame distribution in automatic honeypot rerolls
zhiltsov-max Dec 4, 2024
4a9ce0f
Reset only active frames, fix errors
zhiltsov-max Dec 5, 2024
daebdcd
Add tests
zhiltsov-max Dec 5, 2024
1b71f6a
Add changelog
zhiltsov-max Dec 5, 2024
4c07c0c
Fix possible invalid honeypot picks in task creation
zhiltsov-max Dec 5, 2024
006e855
t
zhiltsov-max Dec 5, 2024
306c006
Optimize validation layout updates
zhiltsov-max Dec 6, 2024
f55595d
Fix indentation
zhiltsov-max Dec 6, 2024
d45a33d
Update cvat/apps/engine/cache.py
zhiltsov-max Dec 7, 2024
0f2b2ee
Refactor code: extract common functions, add typing, change chunked_l…
zhiltsov-max Dec 10, 2024
15bf9b7
Remove handling of impossible exceptions
zhiltsov-max Dec 10, 2024
ec0c353
Merge remote-tracking branch 'origin/zm/optimize-validation-layout-up…
zhiltsov-max Dec 10, 2024
4acbeb1
Merge remote-tracking branch 'origin/develop' into zm/optimize-valida…
zhiltsov-max Dec 10, 2024
1dc2b42
Fix request response and behavior in simultaneous deleted_frames and …
zhiltsov-max Dec 10, 2024
61eeb8d
Fix formatting
zhiltsov-max Dec 10, 2024
268b54b
Fix test
zhiltsov-max Dec 10, 2024
bea74b4
Merge branch 'develop' into zm/optimize-validation-layout-updates
zhiltsov-max Dec 12, 2024
e0e978a
Move import
zhiltsov-max Dec 12, 2024
043bf83
Remove extra sorting
zhiltsov-max Dec 12, 2024
e202521
Add sorting
zhiltsov-max Dec 12, 2024
5e3a797
Fix merge
zhiltsov-max Dec 12, 2024
db59fb2
Improve error message
zhiltsov-max Dec 12, 2024
e2b2807
Fix imports
zhiltsov-max Dec 12, 2024
88bd0ce
Refactor some code, fix frame counts use in random reroll
zhiltsov-max Dec 12, 2024
199ef37
Improve tests, fix random reroll in task
zhiltsov-max Dec 13, 2024
21c1866
Update changelog
zhiltsov-max Dec 13, 2024
57afe1e
Fix newline
zhiltsov-max Dec 17, 2024
3326cad
Apply suggestions from code review
zhiltsov-max Dec 18, 2024
82c4ab2
Rename variable
zhiltsov-max Dec 18, 2024
0d1555f
Add named arg in function call
zhiltsov-max Dec 18, 2024
7779b4e
Add notes on remove_segment_chunks api
zhiltsov-max Dec 18, 2024
151df1b
Fix cache removal log messages
zhiltsov-max Dec 18, 2024
86645ff
Add a model property for active validation frames
zhiltsov-max Dec 18, 2024
b67ba9a
Remove accumulating media cache
zhiltsov-max Dec 18, 2024
33b229e
Remove extra variables
zhiltsov-max Dec 18, 2024
78dbd35
Fix and refactor bulk rf m2m updates
zhiltsov-max Dec 19, 2024
e61f1ac
Fix related file field name
zhiltsov-max Dec 19, 2024
a268cd8
Fix cache keys for context image chunks
zhiltsov-max Dec 19, 2024
5c3522f
Fix honeypot skipping for unchanged honeypots
zhiltsov-max Dec 19, 2024
68d1771
Fix context image chunks removal for updated honeypot frames
zhiltsov-max Dec 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 34 additions & 25 deletions cvat/apps/engine/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
from cvat.apps.engine.task_validation import HoneypotFrameSelector
from cvat.apps.engine.rq_job_handler import RQJobMetaField, RQId
from cvat.apps.engine.utils import (
format_list, parse_exception_message, CvatChunkTimestampMismatchError,
format_list, grouped, parse_exception_message, CvatChunkTimestampMismatchError,
parse_specific_attributes, build_field_filter_params, get_list_view_name, reverse, take_by
)

Expand Down Expand Up @@ -1570,39 +1570,48 @@ def _update_frames_in_bulk(

# Update related images in 2 steps: remove all m2m for honeypots, then add (copy) new ones
# 1. remove
for updated_honeypots_batch in take_by(bulk_context.updated_honeypots, chunk_size=1000):
for updated_honeypots_batch in take_by(
bulk_context.updated_honeypots.values(), chunk_size=1000
):
models.RelatedFile.images.through.objects.filter(
image_id__in=updated_honeypots_batch
image_id__in=(db_honeypot.id for db_honeypot in updated_honeypots_batch)
).delete()

# 2. batched add (copy), collect all the new records and insert
# 2. batched add (copy): collect all the new records and insert
frame_provider = TaskFrameProvider(db_task)
validation_frame_uses_with_related_files: dict[int, list[int]] = {}
for honeypot_frame, db_honeypot in bulk_context.updated_honeypots.items():
validation_frame_uses_with_related_files.setdefault(
frame_provider.get_rel_frame_number(db_honeypot.real_frame), []
).append(honeypot_frame)
honeypots_by_validation_frame = grouped(
bulk_context.updated_honeypots,
key=lambda honeypot_frame: frame_provider.get_rel_frame_number(
bulk_context.updated_honeypots[honeypot_frame].real_frame
)
) # validation frame -> [honeypot_frame, ...]

new_m2m_objects = []
related_files_m2m_objects = (
m2m_objects_by_validation_image_id = grouped(
models.RelatedFile.images.through.objects
.filter(image_id__in=[
bulk_context.all_db_frames[f].id for f in db_task.data.validation_layout.frames
])
.all()
.filter(image_id__in=(
bulk_context.all_db_frames[validation_frame].id
for validation_frame in honeypots_by_validation_frame
))
.all(),
key=lambda m2m_obj: m2m_obj.image_id
)
for validation_frame, validation_frame_uses in (
validation_frame_uses_with_related_files.items()
):
new_m2m_objects.extend(
models.RelatedFile.images.through(
image_id=bulk_context.all_db_frames[honeypot_frame].id,
related_file_id=related_file_m2m_obj.related_file_id
)
for honeypot_frame in validation_frame_uses
for related_file_m2m_obj in related_files_m2m_objects
if related_file_m2m_obj.image_id == validation_frame
for validation_frame, validation_frame_honeypots in honeypots_by_validation_frame.items():
validation_frame_m2m_objects = m2m_objects_by_validation_image_id.get(
bulk_context.all_db_frames[validation_frame].id
)
if not validation_frame_m2m_objects:
continue

# Copy validation frame m2m objects to corresponding honeypots
for honeypot_frame in validation_frame_honeypots:
new_m2m_objects.extend(
models.RelatedFile.images.through(
image_id=bulk_context.all_db_frames[honeypot_frame].id,
related_file_id=m2m_obj.related_file_id
zhiltsov-max marked this conversation as resolved.
Show resolved Hide resolved
)
for m2m_obj in validation_frame_m2m_objects
)

models.RelatedFile.images.through.objects.bulk_create(new_m2m_objects, batch_size=1000)

Expand Down
35 changes: 34 additions & 1 deletion cvat/apps/engine/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
import sys
import traceback
from contextlib import suppress, nullcontext
from typing import Any, Dict, Generator, Iterable, Optional, Callable, Sequence, TypeVar, Union
from typing import (
Any, Callable, Dict, Generator, Iterable, Iterator, Optional, Mapping, Sequence, TypeVar, Union
)
import subprocess
import os
import urllib.parse
Expand Down Expand Up @@ -459,3 +461,34 @@ def format_list(
separator.join(items[:max_items]),
f" (and {remainder_count} more)" if 0 < remainder_count else "",
)


_K = TypeVar("_K")
_V = TypeVar("_V")


def grouped(
    items: Iterable[_V], *, key: Callable[[_V], _K]
) -> Mapping[_K, Sequence[_V]]:
    """
    Returns a mapping with input iterable elements grouped by key, for example:

    grouped(
        [("apple1", "red"), ("apple2", "green"), ("apple3", "red")],
        key=lambda v: v[1]
    )
    ->
    {
        "red": [("apple1", "red"), ("apple3", "red")],
        "green": [("apple2", "green")]
    }

    Similar to itertools.groupby, but does not require the input to be sorted
    by the key, and the resulting groups are lists that can be iterated
    multiple times.
    """
    # Note: Iterator is a subtype of Iterable, so a plain Iterable annotation
    # already accepts iterators — no union with Iterator is needed.

    # Can be implemented with itertools.groupby, but it requires extra sorting
    # of the input elements; a single setdefault pass is O(n) instead.
    grouped_items: dict[_K, list[_V]] = {}
    for item in items:
        grouped_items.setdefault(key(item), []).append(item)

    return grouped_items
Loading