Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Manifest fixes #3146

Merged
merged 11 commits into from
May 4, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Changing a label on canvas does not work when 'Show object details' enabled (<https://github.com/openvinotoolkit/cvat/pull/3084>)
- Make sure frame unzip web worker correctly terminates after unzipping all images in a requested chunk (<https://github.com/openvinotoolkit/cvat/pull/3096>)
- Reset password link was unavailable before login (<https://github.com/openvinotoolkit/cvat/pull/3140>)
- Manifest: migration (<https://github.com/openvinotoolkit/cvat/pull/3146>)

### Security

Expand Down
92 changes: 86 additions & 6 deletions cvat/apps/engine/migrations/0038_manifest.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,120 @@
# Generated by Django 3.1.1 on 2021-02-20 08:36

import glob
import itertools
import logging
import os
import sys
from re import search

from django.conf import settings
from django.db import migrations

from cvat.apps.engine.models import (DimensionType, StorageChoice,
StorageMethodChoice)
from cvat.apps.engine.media_extractors import get_mime
from utils.dataset_manifest import ImageManifestManager, VideoManifestManager

def migrate_data(apps, shema_editor):
def get_logger():
migration = os.path.basename(__file__).split(".")[0]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does splitext work here? But I'm fine with the solution as well.

logger = logging.getLogger(name=migration)
logger.setLevel(logging.INFO)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have a special file with log configs. Do you think we can put the code into https://github.com/openvinotoolkit/cvat/blob/develop/cvat/apps/engine/log.py?

file_handler = logging.FileHandler(os.path.join(settings.MIGRATIONS_LOGS_ROOT, f"{migration}.log"))
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
logger.addHandler(logging.StreamHandler(sys.stdout))
logger.addHandler(logging.StreamHandler(sys.stderr))
return logger

def _get_query_set(apps):
    """Return the ``engine.Data`` objects stored with the ``CACHE`` method.

    ``apps`` is the app registry handed to the migration callables; the
    ``Data`` model is looked up through it instead of being imported.
    """
    data_model = apps.get_model("engine", "Data")
    return data_model.objects.filter(storage_method=StorageMethodChoice.CACHE)

def migrate2meta(apps, shema_editor):
    """Reverse migration: remove manifests and recreate the legacy metadata.

    For every ``Data`` object returned by ``_get_query_set`` the
    ``manifest.jsonl`` and ``index.json`` files are deleted from its ``raw``
    folder. Video data then gets a ``meta_info.txt`` file with one
    ``"<idx> <pts>"`` line per entry of the prepared meta; image data gets
    ``dummy_<n>.txt`` files, one per group of ``db_data.chunk_size`` image
    paths. Files that already exist are left untouched.

    Any exception raised while handling one data object is logged and the
    loop moves on to the next object.
    """
    logger = get_logger()
    for db_data in _get_query_set(apps):
        try:
            upload_dir = '{}/{}/raw'.format(settings.MEDIA_DATA_ROOT, db_data.id)
            logger.info('Migrate data({}), folder - {}'.format(db_data.id, upload_dir))
            meta_path = os.path.join(upload_dir, "meta_info.txt")

            # Drop the artifacts produced by the forward migration.
            manifest_path = os.path.join(upload_dir, 'manifest.jsonl')
            if os.path.exists(manifest_path):
                os.remove(manifest_path)
                logger.info('A manifest file has been deleted')
            index_path = os.path.join(upload_dir, 'index.json')
            if os.path.exists(index_path):
                os.remove(index_path)
                logger.info('A manifest index file has been deleted')

            if db_data.storage == StorageChoice.LOCAL:
                data_dir = upload_dir
            else:
                data_dir = settings.SHARE_ROOT

            if hasattr(db_data, 'video'):
                if os.path.exists(meta_path):
                    logger.info('A meta_info.txt already exists')
                    continue
                media_file = os.path.join(data_dir, db_data.video.path)
                logger.info('Preparing of the video meta has begun')
                manager = VideoManifestManager(manifest_path=upload_dir)
                meta = manager.prepare_meta(media_file=media_file, force=True)
                with open(meta_path, "w") as meta_file:
                    meta_file.writelines(f"{idx} {pts}\n" for idx, pts, _ in meta)
            else:
                name_format = "dummy_{}.txt"
                sources = [
                    db_image.path
                    for db_image in db_data.images.all().order_by('frame')
                ]
                logger.info('Preparing of the dummy chunks has begun')
                # Group consecutive images by chunk number (position // chunk size).
                chunks = itertools.groupby(
                    enumerate(sources),
                    key=lambda numbered: numbered[0] // db_data.chunk_size,
                )
                for idx, numbered_paths in chunks:
                    chunk_name = name_format.format(idx)
                    chunk_path = os.path.join(upload_dir, chunk_name)
                    if os.path.exists(chunk_path):
                        logger.info(chunk_name + " already exists")
                        continue
                    with open(chunk_path, "w") as dummy_chunk:
                        dummy_chunk.writelines(
                            [f"{img_path}\n" for _, img_path in numbered_paths]
                        )
            logger.info('Succesfull migration for the data({})'.format(db_data.id))
        except Exception as ex:
            logger.error(str(ex))

def migrate2manifest(apps, shema_editor):
logger = get_logger()
logger.info('The data migration has been started for creating manifest`s files')
query_set = _get_query_set(apps)
logger.info('Need to update {} data objects'.format(len(query_set)))
for db_data in query_set:
try:
upload_dir = '{}/{}/raw'.format(settings.MEDIA_DATA_ROOT, db_data.id)
logger.info('Migrate data({}), folder - {}'.format(db_data.id, upload_dir))
if os.path.exists(os.path.join(upload_dir, 'meta_info.txt')):
os.remove(os.path.join(upload_dir, 'meta_info.txt'))
os.remove(os.path.join(upload_dir, 'meta_info.txt'))
logger.info('{}/meta_info.txt has been deleted'.format(upload_dir))
else:
for path in glob.glob(f'{upload_dir}/dummy_*.txt'):
os.remove(path)
logger.info(f"{path} has been deleted")
# this check is necessary for long-running data migrations: data that already has a manifest is skipped
if os.path.exists(os.path.join(upload_dir, 'manifest.jsonl')):
logger.info('Manifest file already exists')
continue
data_dir = upload_dir if db_data.storage == StorageChoice.LOCAL else settings.SHARE_ROOT
if hasattr(db_data, 'video'):
media_file = os.path.join(data_dir, db_data.video.path)
manifest = VideoManifestManager(manifest_path=upload_dir)
meta_info = manifest.prepare_meta(media_file=media_file)
logger.info('Preparing of the video meta information has begun')
meta_info = manifest.prepare_meta(media_file=media_file, force=True)
logger.info('Manifest creating has begun')
manifest.create(meta_info)
logger.info('Index creating has begun')
manifest.init_index()
else:
manifest = ImageManifestManager(manifest_path=upload_dir)
sources = []
if db_data.storage == StorageChoice.LOCAL:
for (root, _, files) in os.walk(data_dir):
sources.extend([os.path.join(root, f) for f in files])
sources.extend([os.path.join(root, f) for f in files if get_mime(f) == 'image'])
sources.sort()
# when the data is located on the share, we cannot explicitly restore the entire data structure
else:
sources = [os.path.join(data_dir, db_image.path) for db_image in db_data.images.all().order_by('frame')]
if any(list(filter(lambda x: x.dimension==DimensionType.DIM_3D, db_data.tasks.all()))):
logger.info('Preparing of images 3d meta information has begun')
content = []
for source in sources:
name, ext = os.path.splitext(os.path.relpath(source, upload_dir))
Expand All @@ -51,13 +123,15 @@ def migrate_data(apps, shema_editor):
'extension': ext
})
else:
logger.info('Preparing of 2d images meta information has begun')
meta_info = manifest.prepare_meta(sources=sources, data_dir=data_dir)
content = meta_info.content

if db_data.storage == StorageChoice.SHARE:
def _get_frame_step(str_):
match = search("step\s*=\s*([1-9]\d*)", str_)
return int(match.group(1)) if match else 1
logger.info('Data is located on the share, metadata update has been started')
step = _get_frame_step(db_data.frame_filter)
start = db_data.start_frame
stop = db_data.stop_frame + 1
Expand All @@ -67,10 +141,13 @@ def _get_frame_step(str_):
item = content.pop(0) if i in images_range else dict()
result_content.append(item)
content = result_content
logger.info('Manifest creating has begun')
manifest.create(content)
logger.info('Index creating has begun')
manifest.init_index()
logger.info('Succesfull migration for the data({})'.format(db_data.id))
except Exception as ex:
print(str(ex))
logger.error(str(ex))

class Migration(migrations.Migration):

Expand All @@ -79,5 +156,8 @@ class Migration(migrations.Migration):
]

operations = [
migrations.RunPython(migrate_data)
migrations.RunPython(
code=migrate2manifest,
reverse_code=migrate2meta
)
]
14 changes: 10 additions & 4 deletions utils/dataset_manifest/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ def index(self):
return self._index

class VideoManifestManager(_ManifestManager):
def __init__(self, manifest_path, *args, **kwargs):
def __init__(self, manifest_path):
super().__init__(manifest_path)
setattr(self._manifest, 'TYPE', 'video')
self.BASE_INFORMATION['properties'] = 3
Expand Down Expand Up @@ -381,9 +381,15 @@ def validate_base_info(self):
assert self._manifest.TYPE != json.loads(manifest_file.readline())['type']

class VideoManifestValidator(VideoManifestManager):
def __init__(self, **kwargs):
self.source_path = kwargs.pop('source_path')
super().__init__(self, **kwargs)
def __init__(self, source_path, manifest_path):
self.source_path = source_path
super().__init__(manifest_path)

@staticmethod
def _get_video_stream(container):
video_stream = next(stream for stream in container.streams if stream.type == 'video')
video_stream.thread_type = 'AUTO'
return video_stream

def validate_key_frame(self, container, video_stream, key_frame):
for packet in container.demux(video_stream):
Expand Down