From 84b8a85a2038dce6b31f8bcd011eeb29f71ab14f Mon Sep 17 00:00:00 2001
From: Maxim Zhiltsov <maxim.zhiltsov@intel.com>
Date: Wed, 7 Oct 2020 12:52:36 +0300
Subject: [PATCH] Raw image copying in dataset export (#2229)

* Raw image copying in dataset export

* use byteimage

* use opencv, switch frame data type for videos

* Fix image reading

* update dm dependency
---
 cvat/apps/dataset_manager/bindings.py     | 35 +++++++++++++++++------
 cvat/apps/dataset_manager/formats/cvat.py |  7 +++--
 cvat/apps/engine/frame_provider.py        | 27 ++++++++++-------
 cvat/apps/engine/media_extractors.py      |  2 ++
 cvat/requirements/base.txt                |  2 +-
 5 files changed, 50 insertions(+), 23 deletions(-)

diff --git a/cvat/apps/dataset_manager/bindings.py b/cvat/apps/dataset_manager/bindings.py
index 30ec8bd90786..02d4eaabdf13 100644
--- a/cvat/apps/dataset_manager/bindings.py
+++ b/cvat/apps/dataset_manager/bindings.py
@@ -13,7 +13,7 @@
 from cvat.apps.engine.frame_provider import FrameProvider
 from cvat.apps.engine.models import AttributeType, ShapeType
 from datumaro.util import cast
-from datumaro.util.image import Image
+from datumaro.util.image import ByteImage, Image
 
 from .annotation import AnnotationManager, TrackManager
 
@@ -457,18 +457,37 @@ def __init__(self, task_data, include_images=False, include_outside=False):
 
         dm_items = []
 
+        is_video = task_data.meta['task']['mode'] == 'interpolation'
+        ext = ''
+        if is_video:
+            ext = FrameProvider.VIDEO_FRAME_EXT
         if include_images:
             frame_provider = FrameProvider(task_data.db_task.data)
+            if is_video:
+                # optimization for videos: use numpy arrays instead of bytes
+                # some formats or transforms can require image data
+                def _make_image(i, **kwargs):
+                    loader = lambda _: frame_provider.get_frame(i,
+                        quality=frame_provider.Quality.ORIGINAL,
+                        out_type=frame_provider.Type.NUMPY_ARRAY)[0]
+                    return Image(loader=loader, **kwargs)
+            else:
+                # for images use encoded data to avoid recoding
+                def _make_image(i, **kwargs):
+                    loader = lambda _: frame_provider.get_frame(i,
+                        quality=frame_provider.Quality.ORIGINAL,
+                        out_type=frame_provider.Type.BUFFER)[0].getvalue()
+                    return ByteImage(data=loader, **kwargs)
 
         for frame_data in task_data.group_by_frame(include_empty=True):
-            loader = None
+            image_args = {
+                'path': frame_data.name + ext,
+                'size': (frame_data.height, frame_data.width),
+            }
             if include_images:
-                loader = lambda p, i=frame_data.idx: frame_provider.get_frame(i,
-                    quality=frame_provider.Quality.ORIGINAL,
-                    out_type=frame_provider.Type.NUMPY_ARRAY)[0]
-            dm_image = Image(path=frame_data.name, loader=loader,
-                size=(frame_data.height, frame_data.width)
-            )
+                dm_image = _make_image(frame_data.idx, **image_args)
+            else:
+                dm_image = Image(**image_args)
             dm_anno = self._read_cvat_anno(frame_data, task_data)
             dm_item = datumaro.DatasetItem(id=osp.splitext(frame_data.name)[0],
                 annotations=dm_anno, image=dm_image,
diff --git a/cvat/apps/dataset_manager/formats/cvat.py b/cvat/apps/dataset_manager/formats/cvat.py
index 3c349947b769..632da163de8a 100644
--- a/cvat/apps/dataset_manager/formats/cvat.py
+++ b/cvat/apps/dataset_manager/formats/cvat.py
@@ -531,6 +531,10 @@ def _export(dst_file, task_data, anno_callback, save_images=False):
             anno_callback(f, task_data)
 
         if save_images:
+            ext = ''
+            if task_data.meta['task']['mode'] == 'interpolation':
+                ext = FrameProvider.VIDEO_FRAME_EXT
+
             img_dir = osp.join(temp_dir, 'images')
             frame_provider = FrameProvider(task_data.db_task.data)
             frames = frame_provider.get_frames(
@@ -538,9 +542,6 @@ def _export(dst_file, task_data, anno_callback, save_images=False):
                 frame_provider.Type.BUFFER)
             for frame_id, (frame_data, _) in enumerate(frames):
                 frame_name = task_data.frame_info[frame_id]['path']
-                ext = ''
-                if not '.' in osp.basename(frame_name):
-                    ext = '.png'
                 img_path = osp.join(img_dir, frame_name + ext)
                 os.makedirs(osp.dirname(img_path), exist_ok=True)
                 with open(img_path, 'wb') as f:
diff --git a/cvat/apps/engine/frame_provider.py b/cvat/apps/engine/frame_provider.py
index ed96bf99f3c0..3870ef3d1048 100644
--- a/cvat/apps/engine/frame_provider.py
+++ b/cvat/apps/engine/frame_provider.py
@@ -6,6 +6,7 @@
 from enum import Enum
 from io import BytesIO
 
+import cv2
 import numpy as np
 from PIL import Image
 
@@ -43,6 +44,9 @@ def reset(self):
         self.pos = -1
 
 class FrameProvider:
+    VIDEO_FRAME_EXT = '.PNG'
+    VIDEO_FRAME_MIME = 'image/png'
+
     class Quality(Enum):
         COMPRESSED = 0
         ORIGINAL = 100
@@ -129,13 +133,14 @@ def _validate_chunk_number(self, chunk_number):
 
         return chunk_number_
 
-    @staticmethod
-    def _av_frame_to_png_bytes(av_frame):
-        pil_img = av_frame.to_image()
-        buf = BytesIO()
-        pil_img.save(buf, format='PNG')
-        buf.seek(0)
-        return buf
+    @classmethod
+    def _av_frame_to_png_bytes(cls, av_frame):
+        ext = cls.VIDEO_FRAME_EXT
+        image = av_frame.to_ndarray(format='bgr24')
+        success, result = cv2.imencode(ext, image)
+        if not success:
+            raise Exception("Failed to encode image to '%s' format" % (ext))
+        return BytesIO(result.tobytes())
 
     def _convert_frame(self, frame, reader_class, out_type):
         if out_type == self.Type.BUFFER:
@@ -144,11 +149,11 @@ def _convert_frame(self, frame, reader_class, out_type):
             return frame.to_image() if reader_class is VideoReader else Image.open(frame)
         elif out_type == self.Type.NUMPY_ARRAY:
             if reader_class is VideoReader:
-                image = np.array(frame.to_image())
+                image = frame.to_ndarray(format='bgr24')
             else:
                 image = np.array(Image.open(frame))
-            if len(image.shape) == 3 and image.shape[2] in {3, 4}:
-                image[:, :, :3] = image[:, :, 2::-1] # RGB to BGR
+                if len(image.shape) == 3 and image.shape[2] in {3, 4}:
+                    image[:, :, :3] = image[:, :, 2::-1] # RGB to BGR
             return image
         else:
             raise Exception('unsupported output type')
@@ -171,7 +176,7 @@ def get_frame(self, frame_number, quality=Quality.ORIGINAL,
 
         frame = self._convert_frame(frame, loader.reader_class, out_type)
         if loader.reader_class is VideoReader:
-            return (frame, 'image/png')
+            return (frame, self.VIDEO_FRAME_MIME)
         return (frame, mimetypes.guess_type(frame_name))
 
     def get_frames(self, quality=Quality.ORIGINAL, out_type=Type.BUFFER):
diff --git a/cvat/apps/engine/media_extractors.py b/cvat/apps/engine/media_extractors.py
index d9eead2b9f7e..21430838661d 100644
--- a/cvat/apps/engine/media_extractors.py
+++ b/cvat/apps/engine/media_extractors.py
@@ -244,6 +244,8 @@ def get_progress(self, pos):
         return pos / stream.duration if stream.duration else None
 
     def _get_av_container(self):
+        if isinstance(self._source_path[0], io.BytesIO):
+            self._source_path[0].seek(0) # required for re-reading
         return av.open(self._source_path[0])
 
     def get_preview(self):
diff --git a/cvat/requirements/base.txt b/cvat/requirements/base.txt
index 59406651fec9..60613b4356ca 100644
--- a/cvat/requirements/base.txt
+++ b/cvat/requirements/base.txt
@@ -44,4 +44,4 @@ tensorflow==2.2.1 # Optional requirement of Datumaro
 # archives. Don't use as a python module because it has GPL license.
 patool==1.12
 diskcache==5.0.2
-git+https://github.com/openvinotoolkit/datumaro@v0.1.0
\ No newline at end of file
+git+https://github.com/openvinotoolkit/datumaro@v0.1.2
\ No newline at end of file