From f83e820c77e238b7cdf36a9f0a94f6da5c1510b6 Mon Sep 17 00:00:00 2001
From: SkalskiP
Date: Mon, 23 Aug 2021 16:49:00 +0200
Subject: [PATCH 01/16] initial commit

---
 data/xView.yaml                        |  2 +-
 detect.py                              |  2 +-
 models/common.py                       |  2 +-
 models/tf.py                           |  2 +-
 tests/__init__.py                      |  0
 tests/utils/__init__.py                |  0
 tests/utils/datasets/__init__.py       |  0
 tests/utils/datasets/test_coco.py      |  0
 train.py                               |  2 +-
 utils/autoanchor.py                    |  2 +-
 utils/datasets/__init__.py             |  0
 utils/datasets/coco.py                 |  7 +++
 utils/datasets/core.py                 | 70 ++++++++++++++++++++++++++
 utils/{datasets.py => datasets_old.py} | 31 +++++++++++-
 utils/loggers/wandb/wandb_utils.py     |  4 +-
 val.py                                 |  2 +-
 16 files changed, 116 insertions(+), 10 deletions(-)
 create mode 100644 tests/__init__.py
 create mode 100644 tests/utils/__init__.py
 create mode 100644 tests/utils/datasets/__init__.py
 create mode 100644 tests/utils/datasets/test_coco.py
 create mode 100644 utils/datasets/__init__.py
 create mode 100644 utils/datasets/coco.py
 create mode 100644 utils/datasets/core.py
 rename utils/{datasets.py => datasets_old.py} (98%)

diff --git a/data/xView.yaml b/data/xView.yaml
index fabcdb0bdd13..b5af470058e0 100644
--- a/data/xView.yaml
+++ b/data/xView.yaml
@@ -36,7 +36,7 @@ download: |
   from PIL import Image
   from tqdm import tqdm

-  from utils.datasets import autosplit
+  from utils.datasets_old import autosplit
   from utils.general import download, xyxy2xywhn

diff --git a/detect.py b/detect.py
index 601d5daf9852..d58c4ff67e8c 100644
--- a/detect.py
+++ b/detect.py
@@ -20,7 +20,7 @@
 sys.path.append(FILE.parents[0].as_posix())  # add yolov5/ to path

 from models.experimental import attempt_load
-from utils.datasets import LoadStreams, LoadImages
+from utils.datasets_old import LoadStreams, LoadImages
 from utils.general import check_img_size, check_requirements, check_imshow, colorstr, non_max_suppression, \
     apply_classifier, scale_coords, xyxy2xywh, strip_optimizer, set_logging, increment_path, save_one_box
 from utils.plots import colors, plot_one_box
diff --git a/models/common.py b/models/common.py
index e1f5aea3abed..93f062063736 100644
--- a/models/common.py
+++ b/models/common.py
@@ -17,7 +17,7 @@
 from PIL import Image
 from torch.cuda import amp

-from utils.datasets import exif_transpose, letterbox
+from utils.datasets_old import exif_transpose, letterbox
 from utils.general import non_max_suppression, make_divisible, scale_coords, increment_path, xyxy2xywh, save_one_box
 from utils.plots import colors, plot_one_box
 from utils.torch_utils import time_sync
diff --git a/models/tf.py b/models/tf.py
index 40e7d20a9d84..ca19710b2830 100644
--- a/models/tf.py
+++ b/models/tf.py
@@ -52,7 +52,7 @@
 from models.common import Conv, Bottleneck, SPP, DWConv, Focus, BottleneckCSP, Concat, autopad, C3
 from models.experimental import MixConv2d, CrossConv, attempt_load
 from models.yolo import Detect
-from utils.datasets import LoadImages
+from utils.datasets_old import LoadImages
 from utils.general import make_divisible, check_file, check_dataset

 logger = logging.getLogger(__name__)
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/utils/datasets/__init__.py b/tests/utils/datasets/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/utils/datasets/test_coco.py b/tests/utils/datasets/test_coco.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/train.py b/train.py
index 275e0a4b1a8e..63f4ceba5abb 100644
--- a/train.py
+++ b/train.py
@@ -33,7 +33,7 @@
 from models.experimental import attempt_load
 from models.yolo import Model
 from utils.autoanchor import check_anchors
-from utils.datasets import create_dataloader
+from utils.datasets_old import create_dataloader
 from utils.general import labels_to_class_weights, increment_path, labels_to_image_weights, init_seeds, \
     strip_optimizer, get_latest_run, check_dataset, check_file, check_git_status, check_img_size, \
     check_requirements, print_mutation, set_logging, one_cycle, colorstr, methods
diff --git a/utils/autoanchor.py b/utils/autoanchor.py
index 66a2712dfd5d..5699726c7ece 100644
--- a/utils/autoanchor.py
+++ b/utils/autoanchor.py
@@ -109,7 +109,7 @@ def print_results(k):
     if isinstance(dataset, str):  # *.yaml file
         with open(dataset, errors='ignore') as f:
             data_dict = yaml.safe_load(f)  # model dict
-        from utils.datasets import LoadImagesAndLabels
+        from utils.datasets_old import LoadImagesAndLabels
         dataset = LoadImagesAndLabels(data_dict['train'], augment=True, rect=True)

     # Get label wh
diff --git a/utils/datasets/__init__.py b/utils/datasets/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/utils/datasets/coco.py b/utils/datasets/coco.py
new file mode 100644
index 000000000000..de50bdced753
--- /dev/null
+++ b/utils/datasets/coco.py
@@ -0,0 +1,7 @@
+from typing import Dict
+
+import torch
+
+
+def load_coco_annotations(coco_data: dict) -> Dict[str, torch.Tensor]:
+    pass
diff --git a/utils/datasets/core.py b/utils/datasets/core.py
new file mode 100644
index 000000000000..d4bd5824203f
--- /dev/null
+++ b/utils/datasets/core.py
@@ -0,0 +1,70 @@
+from typing import Tuple, List
+
+import torch
+from torch.utils.data import Dataset
+
+
+DatasetEntry = Tuple[torch.Tensor, torch.Tensor, str]
+
+
+def assemble_data_loader() -> None:
+    pass
+
+
+def initiate_dataset(path: str, cache_images: bool) -> Dataset:
+    if COCODataset.validate_directory_structure(path=path):
+        return COCODataset(path=path, cache_images=cache_images)
+    if YOLODataset.validate_directory_structure(path=path):
+        return YOLODataset(path=path, cache_images=cache_images)
+
+
+class COCODataset(Dataset):
+
+    def __init__(self, path: str, cache_images: bool) -> None:
+        self.path = path
+        self.cache_images = cache_images
+        self.image_file_name = []
+
+    def __len__(self) -> int:
+        pass
+
+    def __getitem__(self, index: int) -> DatasetEntry:
+        pass
+
+    @staticmethod
+    def collate_fn(batch: List[DatasetEntry]) -> torch.Tensor:
+        pass
+
+    @staticmethod
+    def validate_directory_structure(path: str) -> None:
+        pass
+
+    @staticmethod
+    def load_labels(path: str) -> List[torch.Tensor]:
+        pass
+
+
+class YOLODataset(Dataset):
+
+    def __init__(self, path: str, cache_images: bool) -> None:
+        self.path = path
+        self.cache_images = cache_images
+        self.image_file_name = []
+
+    def __len__(self) -> int:
+        pass
+
+    def __getitem__(self, index: int) -> DatasetEntry:
+        pass
+
+    @staticmethod
+    def collate_fn(batch: List[DatasetEntry]) -> torch.Tensor:
+        pass
+
+    @staticmethod
+    def validate_directory_structure(path: str) -> None:
+        pass
+
+    @staticmethod
+    def load_labels(path: str) -> List[torch.Tensor]:
+        pass
diff --git a/utils/datasets.py b/utils/datasets_old.py
similarity index 98%
rename from utils/datasets.py
rename to utils/datasets_old.py
index 25a2ba6f9561..5dadf63e47a4 100755
--- a/utils/datasets.py
+++ b/utils/datasets_old.py
@@ -24,6 +24,7 @@
 from PIL import Image, ExifTags
 from torch.utils.data import Dataset
 from tqdm import tqdm
+from typing import Tuple

 from utils.augmentations import Albumentations, augment_hsv, copy_paste, letterbox, mixup, random_perspective
 from utils.general import check_requirements, check_file, check_dataset, xywh2xyxy, xywhn2xyxy, xyxy2xywhn, \
@@ -95,6 +96,18 @@ def create_dataloader(path, imgsz, batch_size, stride, single_cls=False, hyp=Non
                       rect=False, rank=-1, workers=8, image_weights=False, quad=False, prefix=''):
     # Make sure only the first process in DDP process the dataset first, and the following others can use the cache
     with torch_distributed_zero_first(rank):
+        print('path', path)
+        print('imgsz', imgsz)
+        print('batch_size', batch_size)
+        print('augment', augment)
+        print('hyp', hyp)
+        print('rect', rect)
+        print('cache', cache)
+        print('single_cls', single_cls)
+        print('stride', stride)
+        print('pad', pad)
+        print('image_weights', image_weights)
+        print('prefix', prefix)
         dataset = LoadImagesAndLabels(path, imgsz, batch_size,
                                       augment=augment,  # augment images
                                       hyp=hyp,  # augmentation hyperparameters
@@ -364,6 +377,9 @@ def img2label_paths(img_paths):
     return [sb.join(x.rsplit(sa, 1)).rsplit('.', 1)[0] + '.txt' for x in img_paths]


+
+# image_weights - unused
+
 class LoadImagesAndLabels(Dataset):  # for training/testing
     def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, rect=False, image_weights=False,
                  cache_images=False, single_cls=False, stride=32, pad=0.0, prefix=''):
@@ -421,8 +437,12 @@ def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, r
         [cache.pop(k) for k in ('hash', 'version', 'msgs')]  # remove items
         labels, shapes, self.segments = zip(*cache.values())
         self.labels = list(labels)
+
+        print(self.labels)
+
         self.shapes = np.array(shapes, dtype=np.float64)
         self.img_files = list(cache.keys())  # update
+        print(self.img_files)
         self.label_files = img2label_paths(cache.keys())  # update
         if single_cls:
             for x in self.labels:
@@ -596,10 +616,19 @@ def __getitem__(self, index):
             img = img.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
             img = np.ascontiguousarray(img)

+        # print(type(torch.from_numpy(img)))
+        # print(type(labels_out))
+        # print(labels_out)
+        # print(type(self.img_files[index]))
+        # print(self.img_files[index])
+        # print(type(shapes))
+        # print(shapes)
+
         return torch.from_numpy(img), labels_out, self.img_files[index], shapes

     @staticmethod
     def collate_fn(batch):
+        # print('batch', type(batch), batch)
         img, label, path, shapes = zip(*batch)  # transposed
         for i, l in enumerate(label):
             l[:, 0] = i  # add target image index for build_targets()
@@ -800,7 +829,7 @@ def flatten_recursive(path='../datasets/coco128'):
         shutil.copyfile(file, new_path / Path(file).name)


-def extract_boxes(path='../datasets/coco128'):  # from utils.datasets import *; extract_boxes()
+def extract_boxes(path='../datasets/coco128'):  # from utils.datasets_old import *; extract_boxes()
     # Convert detection dataset into classification dataset, with one directory per class
     path = Path(path)  # images dir
     shutil.rmtree(path / 'classifier') if (path / 'classifier').is_dir() else None  # remove existing
diff --git a/utils/loggers/wandb/wandb_utils.py b/utils/loggers/wandb/wandb_utils.py
index 8b2095afcb8b..d3f33c70415e 100644
--- a/utils/loggers/wandb/wandb_utils.py
+++ b/utils/loggers/wandb/wandb_utils.py
@@ -12,8 +12,8 @@
 FILE = Path(__file__).absolute()
 sys.path.append(FILE.parents[3].as_posix())  # add yolov5/ to path

-from utils.datasets import LoadImagesAndLabels
-from utils.datasets import img2label_paths
+from utils.datasets_old import LoadImagesAndLabels
+from utils.datasets_old import img2label_paths
 from utils.general import check_dataset, check_file

 try:
diff --git a/val.py b/val.py
index cbee8cf1c026..98a4db80dbc1 100644
--- a/val.py
+++ b/val.py
@@ -21,7 +21,7 @@
 sys.path.append(FILE.parents[0].as_posix())  # add yolov5/ to path

 from models.experimental import attempt_load
-from utils.datasets import create_dataloader
+from utils.datasets_old import create_dataloader
 from utils.general import coco80_to_coco91_class, check_dataset, check_file, check_img_size, check_requirements, \
     box_iou, non_max_suppression, scale_coords, xyxy2xywh, xywh2xyxy, set_logging, increment_path, colorstr
 from utils.metrics import ap_per_class, ConfusionMatrix
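The factory in the new `utils/datasets/core.py` is still all stubs, but it already fixes the intended API: `initiate_dataset` sniffs the directory layout and picks a loader. A hypothetical usage sketch, not part of the patch — `validate_directory_structure` is still a `pass` stub, so the dispatch cannot work yet:

```python
# Sketch only: how initiate_dataset is meant to be called once the stubs are filled in.
from utils.datasets.core import initiate_dataset

dataset = initiate_dataset(path="../datasets/coco128", cache_images=False)
# Intended behavior: COCODataset when the root holds annotations.json + images/,
# YOLODataset when it holds images/ + labels/. As committed, both validators
# return None, so initiate_dataset currently falls through and returns None.
```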
From 494d5c838ef8cce32304c0b43d1a5f033002c4dd Mon Sep 17 00:00:00 2001
From: SkalskiP
Date: Mon, 23 Aug 2021 19:02:55 +0200
Subject: [PATCH 02/16] coco annotations loading in progress

---
 utils/datasets/coco.py | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/utils/datasets/coco.py b/utils/datasets/coco.py
index de50bdced753..436ee6ed90dc 100644
--- a/utils/datasets/coco.py
+++ b/utils/datasets/coco.py
@@ -1,7 +1,40 @@
-from typing import Dict
+import json
+from typing import Dict, Union, List

 import torch


+def read_json_file(file_path: str, **kwargs) -> Union[list, dict]:
+    with open(file_path, 'r') as file:
+        return json.load(file, **kwargs)
+
+
 def load_coco_annotations(coco_data: dict) -> Dict[str, torch.Tensor]:
+    coco_image_entries_map = map_coco_image_entries(coco_image_entries=coco_data["images"])
+    coco_annotation_entries_map = map_coco_annotation_entries(coco_annotation_entries=coco_data["annotations"])
+    return {
+        coco_image_entries_map[image_id]["file_name"]: process_coco_annotation(
+            coco_annotation_entries=coco_annotation_entries_map[image_id],
+            coco_image_data=coco_image_entries_map[image_id]
+        )
+        for image_id
+        in sorted(coco_image_entries_map.keys())
+    }
+
+
+def map_coco_image_entries(coco_image_entries: List[dict]) -> Dict[int, dict]:
+    return {
+        image_data["id"]: image_data
+        for image_data
+        in coco_image_entries
+    }
+
+
+def map_coco_annotation_entries(coco_annotation_entries: List[dict]) -> Dict[int, List[dict]]:
     pass
+
+
+def process_coco_annotation(coco_annotation_entries: List[dict], coco_image_data: dict) -> torch.Tensor:
+    image_width = coco_image_data["width"]
+    image_height = coco_image_data["height"]

From b3f57e87211e91dd031dcf9cb3b1bd6bf3d83f7e Mon Sep 17 00:00:00 2001
From: SkalskiP
Date: Mon, 23 Aug 2021 21:04:21 +0200
Subject: [PATCH 03/16] base coco annotations loading is working

---
 utils/datasets/coco.py | 45 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 37 insertions(+), 8 deletions(-)

diff --git a/utils/datasets/coco.py b/utils/datasets/coco.py
index 436ee6ed90dc..7d41231aa4b7 100644
--- a/utils/datasets/coco.py
+++ b/utils/datasets/coco.py
@@ -1,7 +1,20 @@
 import json
 from typing import Dict, Union, List
+from collections import defaultdict

 import torch
+import numpy as np
+
+
+IMAGE_KEY = "images"
+IMAGE_FILE_NAME_KEY = "file_name"
+IMAGE_ID_KEY = "id"
+IMAGE_WIDTH_KEY = "width"
+IMAGE_HEIGHT_KEY = "height"
+ANNOTATION_KEY = "annotations"
+ANNOTATION_IMAGE_ID_KEY = "image_id"
+ANNOTATION_BBOX_KEY = "bbox"
+ANNOTATION_CATEGORY_ID = "category_id"


 def read_json_file(file_path: str, **kwargs) -> Union[list, dict]:
@@ -10,10 +23,10 @@


 def load_coco_annotations(coco_data: dict) -> Dict[str, torch.Tensor]:
-    coco_image_entries_map = map_coco_image_entries(coco_image_entries=coco_data["images"])
-    coco_annotation_entries_map = map_coco_annotation_entries(coco_annotation_entries=coco_data["annotations"])
+    coco_image_entries_map = map_coco_image_entries(coco_image_entries=coco_data[IMAGE_KEY])
+    coco_annotation_entries_map = map_coco_annotation_entries(coco_annotation_entries=coco_data[ANNOTATION_KEY])
     return {
-        coco_image_entries_map[image_id]["file_name"]: process_coco_annotation(
+        coco_image_entries_map[image_id][IMAGE_FILE_NAME_KEY]: process_coco_annotation(
             coco_annotation_entries=coco_annotation_entries_map[image_id],
             coco_image_data=coco_image_entries_map[image_id]
         )
@@ -24,17 +37,33 @@

 def map_coco_image_entries(coco_image_entries: List[dict]) -> Dict[int, dict]:
     return {
-        image_data["id"]: image_data
+        image_data[IMAGE_ID_KEY]: image_data
         for image_data
         in coco_image_entries
     }


 def map_coco_annotation_entries(coco_annotation_entries: List[dict]) -> Dict[int, List[dict]]:
-    pass
+    result = defaultdict(list)
+    for coco_annotation_entry in coco_annotation_entries:
+        image_id = coco_annotation_entry[ANNOTATION_IMAGE_ID_KEY]
+        result[image_id].append(coco_annotation_entry)
+    return result


 def process_coco_annotation(coco_annotation_entries: List[dict], coco_image_data: dict) -> torch.Tensor:
-    image_width = coco_image_data["width"]
-    image_height = coco_image_data["height"]
-
+    image_width = coco_image_data[IMAGE_WIDTH_KEY]
+    image_height = coco_image_data[IMAGE_HEIGHT_KEY]
+    annotations = []
+    for coco_annotation_entry in coco_annotation_entries:
+        category_id = coco_annotation_entry[ANNOTATION_CATEGORY_ID]
+        x_min, y_min, width, height = coco_annotation_entry[ANNOTATION_BBOX_KEY]
+        annotations.append([
+            0,
+            category_id,
+            (x_min + width / 2) / image_width,
+            (y_min + height / 2) / image_height,
+            width / image_width,
+            height / image_height
+        ])
+    return torch.as_tensor(np.array(annotations))
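Patch 03 converts COCO boxes (`[x_min, y_min, width, height]` in pixels) into normalized YOLO-style rows `[batch_index, category_id, x_center, y_center, width, height]`. A small worked example against the functions above (the sample dict and values are illustrative, not from the repository):

```python
from utils.datasets.coco import load_coco_annotations

coco_data = {
    "images": [{"id": 1, "file_name": "image-1.jpg", "width": 640, "height": 480}],
    "annotations": [{"image_id": 1, "category_id": 3, "bbox": [320, 120, 64, 48]}],
}
labels = load_coco_annotations(coco_data=coco_data)
# x_center = (320 + 64 / 2) / 640 = 0.55   y_center = (120 + 48 / 2) / 480 = 0.3
# width    = 64 / 640        = 0.1         height   = 48 / 480         = 0.1
assert labels["image-1.jpg"].tolist() == [[0.0, 3.0, 0.55, 0.3, 0.1, 0.1]]
```

The leading 0 is a placeholder for the image index within a batch, matching what `LoadImagesAndLabels.collate_fn` writes into `l[:, 0]` in the old loader.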
+ """ def __init__(self, path: str, cache_images: bool) -> None: self.path = path self.cache_images = cache_images - self.image_file_name = [] + self.image_file_names = [] + self.labels = [] def __len__(self) -> int: pass diff --git a/utils/datasets/yolo.py b/utils/datasets/yolo.py new file mode 100644 index 000000000000..afcf29857fcf --- /dev/null +++ b/utils/datasets/yolo.py @@ -0,0 +1,27 @@ +import os +import glob +from pathlib import Path + +from typing import List, Union + +from utils.datasets_old import IMG_FORMATS +from utils.file import read_text_file_lines + + +def load_image_names_from_paths(paths: Union[str, List[str]]) -> List[str]: + image_paths = [] + for path in paths if isinstance(paths, list) else [paths]: + path = Path(path) # os-agnostic + if path.is_dir(): # dir + image_paths += glob.glob(str(path / '**' / '*.*'), recursive=True) + elif path.is_file(): # file + local_paths = read_text_file_lines(path) + parent = str(path.parent) + os.sep + image_paths += [ + local_path.replace('./', parent) if local_path.startswith('./') else local_path + for local_path + in local_paths + ] + else: + raise Exception(f'{path} does not exist') + return sorted([x.replace('/', os.sep) for x in image_paths if x.split('.')[-1].lower() in IMG_FORMATS]) diff --git a/utils/datasets_old.py b/utils/datasets_old.py index 5dadf63e47a4..d707faddf01e 100755 --- a/utils/datasets_old.py +++ b/utils/datasets_old.py @@ -24,7 +24,6 @@ from PIL import Image, ExifTags from torch.utils.data import Dataset from tqdm import tqdm -from typing import Tuple from utils.augmentations import Albumentations, augment_hsv, copy_paste, letterbox, mixup, random_perspective from utils.general import check_requirements, check_file, check_dataset, xywh2xyxy, xywhn2xyxy, xyxy2xywhn, \ diff --git a/utils/file.py b/utils/file.py new file mode 100644 index 000000000000..3aa1b198d3fc --- /dev/null +++ b/utils/file.py @@ -0,0 +1,11 @@ +from pathlib import Path +from typing import List, Union + + +def read_text_file_lines(file_path: Union[str, Path], remove_blank: bool = True) -> List[str]: + with open(file_path, "r") as file: + lines = [l.strip(' \n') for l in file.readlines()] + if remove_blank: + return list(filter(lambda l: len(l) > 0, lines)) + else: + return lines From 5ce1c2f9e036fc71e88861b23dc87f18271c59de Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Tue, 24 Aug 2021 08:59:03 +0200 Subject: [PATCH 05/16] plug some of util functions into new loaders --- utils/datasets/coco.py | 1 + utils/datasets/core.py | 19 ++++++++++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/utils/datasets/coco.py b/utils/datasets/coco.py index 7d41231aa4b7..39acbb5ad5e4 100644 --- a/utils/datasets/coco.py +++ b/utils/datasets/coco.py @@ -15,6 +15,7 @@ ANNOTATION_IMAGE_ID_KEY = "image_id" ANNOTATION_BBOX_KEY = "bbox" ANNOTATION_CATEGORY_ID = "category_id" +ANNOTATIONS_FILE_NAME = "annotations.json" def read_json_file(file_path: str, **kwargs) -> Union[list, dict]: diff --git a/utils/datasets/core.py b/utils/datasets/core.py index 8de2380277a1..befe1363e21e 100644 --- a/utils/datasets/core.py +++ b/utils/datasets/core.py @@ -1,8 +1,11 @@ from typing import Tuple, List +import os import torch from torch.utils.data import Dataset +from utils.datasets.coco import read_json_file, ANNOTATIONS_FILE_NAME, load_coco_annotations +from utils.datasets.yolo import load_image_names_from_paths DatasetEntry = Tuple[torch.Tensor, torch.Tensor, str] @@ -31,11 +34,16 @@ class COCODataset(Dataset): def __init__(self, path: str, cache_images: 
From 5ce1c2f9e036fc71e88861b23dc87f18271c59de Mon Sep 17 00:00:00 2001
From: SkalskiP
Date: Tue, 24 Aug 2021 08:59:03 +0200
Subject: [PATCH 05/16] plug some of util functions into new loaders

---
 utils/datasets/coco.py |  1 +
 utils/datasets/core.py | 19 ++++++++++++++-----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/utils/datasets/coco.py b/utils/datasets/coco.py
index 7d41231aa4b7..39acbb5ad5e4 100644
--- a/utils/datasets/coco.py
+++ b/utils/datasets/coco.py
@@ -15,6 +15,7 @@
 ANNOTATION_IMAGE_ID_KEY = "image_id"
 ANNOTATION_BBOX_KEY = "bbox"
 ANNOTATION_CATEGORY_ID = "category_id"
+ANNOTATIONS_FILE_NAME = "annotations.json"


 def read_json_file(file_path: str, **kwargs) -> Union[list, dict]:
diff --git a/utils/datasets/core.py b/utils/datasets/core.py
index 8de2380277a1..befe1363e21e 100644
--- a/utils/datasets/core.py
+++ b/utils/datasets/core.py
@@ -1,8 +1,11 @@
 from typing import Tuple, List
+import os

 import torch
 from torch.utils.data import Dataset

+from utils.datasets.coco import read_json_file, ANNOTATIONS_FILE_NAME, load_coco_annotations
+from utils.datasets.yolo import load_image_names_from_paths

 DatasetEntry = Tuple[torch.Tensor, torch.Tensor, str]

@@ -31,11 +34,16 @@ class COCODataset(Dataset):
     def __init__(self, path: str, cache_images: bool) -> None:
         self.path = path
         self.cache_images = cache_images
-        self.image_file_names = []
-        self.labels = []
+
+        coco_data = read_json_file(os.path.join(path, ANNOTATIONS_FILE_NAME))
+        coco_annotations = load_coco_annotations(coco_data=coco_data)
+
+        self.image_paths = coco_annotations.keys()
+        self.labels = coco_annotations.values()
+        self.images = []

     def __len__(self) -> int:
-        pass
+        return len(self.image_paths)

     def __getitem__(self, index: int) -> DatasetEntry:
         pass
@@ -70,11 +78,12 @@ class YOLODataset(Dataset):
     def __init__(self, path: str, cache_images: bool) -> None:
         self.path = path
         self.cache_images = cache_images
-        self.image_file_names = []
+        self.image_paths = load_image_names_from_paths(paths=path)
         self.labels = []
+        self.images = []

     def __len__(self) -> int:
-        pass
+        return len(self.image_paths)

     def __getitem__(self, index: int) -> DatasetEntry:
         pass
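A note on the COCO wiring above: `self.image_paths = coco_annotations.keys()` and `self.labels = coco_annotations.values()` stay aligned only because both views come from the same dict, whose iteration order is insertion order (guaranteed since Python 3.7) — here, the order of sorted image ids. A minimal illustration:

```python
coco_annotations = {"image-1.jpg": "labels-1", "image-2.jpg": "labels-2"}
image_paths = list(coco_annotations.keys())
labels = list(coco_annotations.values())
assert image_paths[0] == "image-1.jpg" and labels[0] == "labels-1"  # index-aligned
```

The raw `keys()`/`values()` views are not indexable, which is presumably why a later patch in this series wraps them in `list(...)`.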
From 65a86a683b6eb6e5f71fe85db95a1958bd0f8a70 Mon Sep 17 00:00:00 2001
From: SkalskiP
Date: Tue, 24 Aug 2021 10:26:46 +0200
Subject: [PATCH 06/16] some general utils for file management and initial
 tests added

---
 requirements.txt          |  1 +
 tests/utils/test_file.py  | 62 +++++++++++++++++++++++++++++++++++++++
 tests/utils/test_utils.py |  7 +++++
 utils/datasets/core.py    | 43 +++++++++++++++++++++++++--
 utils/datasets/yolo.py    |  8 +++++
 utils/datasets_old.py     | 12 --------
 utils/file.py             | 15 +++++++++-
 7 files changed, 132 insertions(+), 16 deletions(-)
 create mode 100644 tests/utils/test_file.py
 create mode 100644 tests/utils/test_utils.py

diff --git a/requirements.txt b/requirements.txt
index f6361d591f1b..56158860b788 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -30,3 +30,4 @@ pandas
 # pycocotools>=2.0  # COCO mAP
 # albumentations>=1.0.3
 thop  # FLOPs computation
+# pytest
diff --git a/tests/utils/test_file.py b/tests/utils/test_file.py
new file mode 100644
index 000000000000..6b0e5bf8f9e6
--- /dev/null
+++ b/tests/utils/test_file.py
@@ -0,0 +1,62 @@
+import shutil
+from pathlib import Path
+from typing import Generator, Optional, List, Callable
+
+import pytest
+
+from tests.utils.test_utils import prepare_temporary_dir
+from utils.file import dump_text_file, get_directory_content
+
+
+@pytest.fixture
+def mock_directory_path() -> Generator[str, None, None]:
+    output_path = prepare_temporary_dir(directory_name="mock_directory_path")
+    yield output_path
+    shutil.rmtree(output_path)
+
+
+def mock_directory_content(directory_path: str) -> None:
+    dump_text_file(Path(directory_path).joinpath('file_1.json').as_posix(), '')
+    dump_text_file(Path(directory_path).joinpath('file_2.txt').as_posix(), '')
+    dump_text_file(Path(directory_path).joinpath('file_3.txt').as_posix(), '')
+
+
+@pytest.mark.parametrize(
+    "extension, mock_callback, expected_result",
+    [
+        (
+            None,
+            lambda x: None,
+            0
+        ),  # empty directory
+        (
+            None,
+            mock_directory_content,
+            3
+        ),  # directory contains 3 files
+        (
+            'json',
+            mock_directory_content,
+            1
+        ),  # directory contains 1 .json file
+        (
+            'txt',
+            mock_directory_content,
+            2
+        ),  # directory contains 2 .txt files
+        (
+            'avi',
+            mock_directory_content,
+            0
+        ),  # directory contains 0 .avi files
+    ]
+)
+def test_get_directory_content(
+    mock_directory_path: str,
+    extension: Optional[str],
+    mock_callback: Callable[[str], None],
+    expected_result: int
+) -> None:
+    mock_callback(mock_directory_path)
+    result = get_directory_content(directory_path=mock_directory_path, extension=extension)
+    assert len(result) == expected_result
diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py
new file mode 100644
index 000000000000..989bd2173ec4
--- /dev/null
+++ b/tests/utils/test_utils.py
@@ -0,0 +1,7 @@
+from pathlib import Path
+
+
+def prepare_temporary_dir(directory_name: str) -> str:
+    directory_path = Path(__file__).parent.joinpath(directory_name)
+    directory_path.mkdir(parents=True, exist_ok=True)
+    return directory_path.as_posix()
diff --git a/utils/datasets/core.py b/utils/datasets/core.py
index befe1363e21e..a4789d271ad8 100644
--- a/utils/datasets/core.py
+++ b/utils/datasets/core.py
@@ -1,11 +1,11 @@
-from typing import Tuple, List
+from typing import Tuple, List, Optional
 import os

 import torch
 from torch.utils.data import Dataset

 from utils.datasets.coco import read_json_file, ANNOTATIONS_FILE_NAME, load_coco_annotations
-from utils.datasets.yolo import load_image_names_from_paths
+from utils.datasets.yolo import load_image_names_from_paths, img2label_paths

 DatasetEntry = Tuple[torch.Tensor, torch.Tensor, str]

@@ -25,6 +25,7 @@ class COCODataset(Dataset):
     """
     dataset
     ├── annotations.json
+    ├── dataset.cache [optional]
     └── images
         ├── image-1.jpg
         ├── image-2.jpg
@@ -32,6 +33,13 @@
     """

     def __init__(self, path: str, cache_images: bool) -> None:
+        """
+        Load COCO labels along with images from provided path.
+
+        Args:
+            path: `str` - path to `dataset` root directory.
+            cache_images: `bool` - flag to force caching of images.
+        """
         self.path = path
         self.cache_images = cache_images

@@ -60,25 +68,38 @@
     def load_labels(path: str) -> List[torch.Tensor]:
         pass

+    @staticmethod
+    def resolve_cache_path() -> Optional[str]:
+        pass
+

 class YOLODataset(Dataset):
     """
     dataset
-    ├── image_names.txt
+    ├── image_names.txt [optional]
     ├── images
     │   ├── image-1.jpg
     │   ├── image-2.jpg
     │   └── ...
     └── labels
+        ├── dataset.cache [optional]
         ├── image-1.txt
        ├── image-2.txt
         └── ...
     """

     def __init__(self, path: str, cache_images: bool) -> None:
+        """
+        Load YOLO labels along with images from provided path.
+
+        Args:
+            path: `str` - path to `dataset` root directory or to `image_names.txt` file.
+            cache_images: `bool` - flag to force caching of images.
+        """
         self.path = path
         self.cache_images = cache_images
         self.image_paths = load_image_names_from_paths(paths=path)
+        self.label_paths = img2label_paths(image_paths=self.image_paths)
         self.labels = []
         self.images = []

@@ -99,3 +120,19 @@
     @staticmethod
     def load_labels(path: str) -> List[torch.Tensor]:
         pass
+
+    @staticmethod
+    def resolve_cache_path() -> Optional[str]:
+        pass
+
+
+class TransformedDataset(Dataset):
+
+    def __init__(self, source_dataset: Dataset) -> None:
+        self.source_dataset = source_dataset
+
+    def __len__(self) -> int:
+        return len(self.source_dataset)
+
+    def __getitem__(self, index: int) -> DatasetEntry:
+        return self.source_dataset[index]
diff --git a/utils/datasets/yolo.py b/utils/datasets/yolo.py
index afcf29857fcf..7ab4b21518e3 100644
--- a/utils/datasets/yolo.py
+++ b/utils/datasets/yolo.py
@@ -25,3 +25,11 @@
         else:
             raise Exception(f'{path} does not exist')
     return sorted([x.replace('/', os.sep) for x in image_paths if x.split('.')[-1].lower() in IMG_FORMATS])
+
+
+def img2label_paths(image_paths: List[str]) -> List[str]:
+    """
+    Define label paths as a function of image paths.
+    """
+    sa, sb = os.sep + 'images' + os.sep, os.sep + 'labels' + os.sep  # /images/, /labels/ substrings
+    return [sb.join(x.rsplit(sa, 1)).rsplit('.', 1)[0] + '.txt' for x in image_paths]
diff --git a/utils/datasets_old.py b/utils/datasets_old.py
index d707faddf01e..a570f123b7d8 100755
--- a/utils/datasets_old.py
+++ b/utils/datasets_old.py
@@ -436,12 +436,8 @@ def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, r
         [cache.pop(k) for k in ('hash', 'version', 'msgs')]  # remove items
         labels, shapes, self.segments = zip(*cache.values())
         self.labels = list(labels)
-
-        print(self.labels)
-
         self.shapes = np.array(shapes, dtype=np.float64)
         self.img_files = list(cache.keys())  # update
-        print(self.img_files)
         self.label_files = img2label_paths(cache.keys())  # update
         if single_cls:
             for x in self.labels:
@@ -615,14 +611,6 @@ def __getitem__(self, index):
             img = img.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
             img = np.ascontiguousarray(img)

-        # print(type(torch.from_numpy(img)))
-        # print(type(labels_out))
-        # print(labels_out)
-        # print(type(self.img_files[index]))
-        # print(self.img_files[index])
-        # print(type(shapes))
-        # print(shapes)
-
         return torch.from_numpy(img), labels_out, self.img_files[index], shapes

     @staticmethod
diff --git a/utils/file.py b/utils/file.py
index 3aa1b198d3fc..e0c3066df7ec 100644
--- a/utils/file.py
+++ b/utils/file.py
@@ -1,5 +1,6 @@
+from glob import glob
 from pathlib import Path
-from typing import List, Union
+from typing import List, Union, Optional


 def read_text_file_lines(file_path: Union[str, Path], remove_blank: bool = True) -> List[str]:
@@ -9,3 +10,15 @@
         return list(filter(lambda l: len(l) > 0, lines))
     else:
         return lines
+
+
+def get_directory_content(directory_path: str, extension: Optional[str] = None) -> List[str]:
+    wild_card = '*' if extension is None else f'*.{extension}'
+    pattern = Path(directory_path).joinpath(wild_card).as_posix()
+    return glob(pattern)
+
+
+def dump_text_file(file_path: str, content: str) -> None:
+    Path(file_path).parent.mkdir(parents=True, exist_ok=True)
+    with open(file_path, 'w', encoding='utf-8') as file:
+        file.write(content)
From e770624ae12f670a0c809b16da5dbcc5d77fef02 Mon Sep 17 00:00:00 2001
From: SkalskiP
Date: Tue, 24 Aug 2021 10:26:46 +0200
Subject: [PATCH 07/16] some general utils for file management and initial
 tests added

---
 requirements.txt          |  1 +
 tests/utils/test_file.py  | 62 +++++++++++++++++++++++++++++++++++++++
 tests/utils/test_utils.py |  7 +++++
 utils/datasets/core.py    | 43 +++++++++++++++++++++++++--
 utils/datasets/yolo.py    |  8 +++++
 utils/datasets_old.py     | 24 ---------------
 utils/file.py             | 15 +++++++++-
 7 files changed, 132 insertions(+), 28 deletions(-)
 create mode 100644 tests/utils/test_file.py
 create mode 100644 tests/utils/test_utils.py

diff --git a/requirements.txt b/requirements.txt
index f6361d591f1b..56158860b788 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -30,3 +30,4 @@ pandas
 # pycocotools>=2.0  # COCO mAP
 # albumentations>=1.0.3
 thop  # FLOPs computation
+# pytest
diff --git a/tests/utils/test_file.py b/tests/utils/test_file.py
new file mode 100644
index 000000000000..6b0e5bf8f9e6
--- /dev/null
+++ b/tests/utils/test_file.py
@@ -0,0 +1,62 @@
+import shutil
+from pathlib import Path
+from typing import Generator, Optional, List, Callable
+
+import pytest
+
+from tests.utils.test_utils import prepare_temporary_dir
+from utils.file import dump_text_file, get_directory_content
+
+
+@pytest.fixture
+def mock_directory_path() -> Generator[str, None, None]:
+    output_path = prepare_temporary_dir(directory_name="mock_directory_path")
+    yield output_path
+    shutil.rmtree(output_path)
+
+
+def mock_directory_content(directory_path: str) -> None:
+    dump_text_file(Path(directory_path).joinpath('file_1.json').as_posix(), '')
+    dump_text_file(Path(directory_path).joinpath('file_2.txt').as_posix(), '')
+    dump_text_file(Path(directory_path).joinpath('file_3.txt').as_posix(), '')
+
+
+@pytest.mark.parametrize(
+    "extension, mock_callback, expected_result",
+    [
+        (
+            None,
+            lambda x: None,
+            0
+        ),  # empty directory
+        (
+            None,
+            mock_directory_content,
+            3
+        ),  # directory contains 3 files
+        (
+            'json',
+            mock_directory_content,
+            1
+        ),  # directory contains 1 .json file
+        (
+            'txt',
+            mock_directory_content,
+            2
+        ),  # directory contains 2 .txt files
+        (
+            'avi',
+            mock_directory_content,
+            0
+        ),  # directory contains 0 .avi files
+    ]
+)
+def test_get_directory_content(
+    mock_directory_path: str,
+    extension: Optional[str],
+    mock_callback: Callable[[str], None],
+    expected_result: int
+) -> None:
+    mock_callback(mock_directory_path)
+    result = get_directory_content(directory_path=mock_directory_path, extension=extension)
+    assert len(result) == expected_result
diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py
new file mode 100644
index 000000000000..989bd2173ec4
--- /dev/null
+++ b/tests/utils/test_utils.py
@@ -0,0 +1,7 @@
+from pathlib import Path
+
+
+def prepare_temporary_dir(directory_name: str) -> str:
+    directory_path = Path(__file__).parent.joinpath(directory_name)
+    directory_path.mkdir(parents=True, exist_ok=True)
+    return directory_path.as_posix()
diff --git a/utils/datasets/core.py b/utils/datasets/core.py
index befe1363e21e..a4789d271ad8 100644
--- a/utils/datasets/core.py
+++ b/utils/datasets/core.py
@@ -1,11 +1,11 @@
-from typing import Tuple, List
+from typing import Tuple, List, Optional
 import os

 import torch
 from torch.utils.data import Dataset

 from utils.datasets.coco import read_json_file, ANNOTATIONS_FILE_NAME, load_coco_annotations
-from utils.datasets.yolo import load_image_names_from_paths
+from utils.datasets.yolo import load_image_names_from_paths, img2label_paths

 DatasetEntry = Tuple[torch.Tensor, torch.Tensor, str]

@@ -25,6 +25,7 @@ class COCODataset(Dataset):
     """
     dataset
     ├── annotations.json
+    ├── dataset.cache [optional]
     └── images
         ├── image-1.jpg
         ├── image-2.jpg
@@ -32,6 +33,13 @@
     """

     def __init__(self, path: str, cache_images: bool) -> None:
+        """
+        Load COCO labels along with images from provided path.
+
+        Args:
+            path: `str` - path to `dataset` root directory.
+            cache_images: `bool` - flag to force caching of images.
+        """
         self.path = path
         self.cache_images = cache_images

@@ -60,25 +68,38 @@
     def load_labels(path: str) -> List[torch.Tensor]:
         pass

+    @staticmethod
+    def resolve_cache_path() -> Optional[str]:
+        pass
+

 class YOLODataset(Dataset):
     """
     dataset
-    ├── image_names.txt
+    ├── image_names.txt [optional]
     ├── images
     │   ├── image-1.jpg
     │   ├── image-2.jpg
     │   └── ...
     └── labels
+        ├── dataset.cache [optional]
         ├── image-1.txt
         ├── image-2.txt
         └── ...
     """

     def __init__(self, path: str, cache_images: bool) -> None:
+        """
+        Load YOLO labels along with images from provided path.
+
+        Args:
+            path: `str` - path to `dataset` root directory or to `image_names.txt` file.
+            cache_images: `bool` - flag to force caching of images.
+        """
         self.path = path
         self.cache_images = cache_images
         self.image_paths = load_image_names_from_paths(paths=path)
+        self.label_paths = img2label_paths(image_paths=self.image_paths)
         self.labels = []
         self.images = []

@@ -99,3 +120,19 @@
     @staticmethod
     def load_labels(path: str) -> List[torch.Tensor]:
         pass
+
+    @staticmethod
+    def resolve_cache_path() -> Optional[str]:
+        pass
+
+
+class TransformedDataset(Dataset):
+
+    def __init__(self, source_dataset: Dataset) -> None:
+        self.source_dataset = source_dataset
+
+    def __len__(self) -> int:
+        return len(self.source_dataset)
+
+    def __getitem__(self, index: int) -> DatasetEntry:
+        return self.source_dataset[index]
diff --git a/utils/datasets/yolo.py b/utils/datasets/yolo.py
index afcf29857fcf..7ab4b21518e3 100644
--- a/utils/datasets/yolo.py
+++ b/utils/datasets/yolo.py
@@ -25,3 +25,11 @@
         else:
             raise Exception(f'{path} does not exist')
     return sorted([x.replace('/', os.sep) for x in image_paths if x.split('.')[-1].lower() in IMG_FORMATS])
+
+
+def img2label_paths(image_paths: List[str]) -> List[str]:
+    """
+    Define label paths as a function of image paths.
+    """
+    sa, sb = os.sep + 'images' + os.sep, os.sep + 'labels' + os.sep  # /images/, /labels/ substrings
+    return [sb.join(x.rsplit(sa, 1)).rsplit('.', 1)[0] + '.txt' for x in image_paths]
diff --git a/utils/datasets_old.py b/utils/datasets_old.py
index d707faddf01e..864e1273794a 100755
--- a/utils/datasets_old.py
+++ b/utils/datasets_old.py
@@ -95,18 +95,6 @@ def create_dataloader(path, imgsz, batch_size, stride, single_cls=False, hyp=Non
                       rect=False, rank=-1, workers=8, image_weights=False, quad=False, prefix=''):
     # Make sure only the first process in DDP process the dataset first, and the following others can use the cache
     with torch_distributed_zero_first(rank):
-        print('path', path)
-        print('imgsz', imgsz)
-        print('batch_size', batch_size)
-        print('augment', augment)
-        print('hyp', hyp)
-        print('rect', rect)
-        print('cache', cache)
-        print('single_cls', single_cls)
-        print('stride', stride)
-        print('pad', pad)
-        print('image_weights', image_weights)
-        print('prefix', prefix)
         dataset = LoadImagesAndLabels(path, imgsz, batch_size,
                                       augment=augment,  # augment images
                                       hyp=hyp,  # augmentation hyperparameters
@@ -436,12 +424,8 @@ def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, r
         [cache.pop(k) for k in ('hash', 'version', 'msgs')]  # remove items
         labels, shapes, self.segments = zip(*cache.values())
         self.labels = list(labels)
-
-        print(self.labels)
-
         self.shapes = np.array(shapes, dtype=np.float64)
         self.img_files = list(cache.keys())  # update
-        print(self.img_files)
         self.label_files = img2label_paths(cache.keys())  # update
         if single_cls:
             for x in self.labels:
@@ -615,14 +599,6 @@ def __getitem__(self, index):
             img = img.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
             img = np.ascontiguousarray(img)

-        # print(type(torch.from_numpy(img)))
-        # print(type(labels_out))
-        # print(labels_out)
-        # print(type(self.img_files[index]))
-        # print(self.img_files[index])
-        # print(type(shapes))
-        # print(shapes)
-
         return torch.from_numpy(img), labels_out, self.img_files[index], shapes

     @staticmethod
diff --git a/utils/file.py b/utils/file.py
index 3aa1b198d3fc..e0c3066df7ec 100644
--- a/utils/file.py
+++ b/utils/file.py
@@ -1,5 +1,6 @@
+from glob import glob
 from pathlib import Path
-from typing import List, Union
+from typing import List, Union, Optional


 def read_text_file_lines(file_path: Union[str, Path], remove_blank: bool = True) -> List[str]:
@@ -9,3 +10,15 @@
         return list(filter(lambda l: len(l) > 0, lines))
     else:
         return lines
+
+
+def get_directory_content(directory_path: str, extension: Optional[str] = None) -> List[str]:
+    wild_card = '*' if extension is None else f'*.{extension}'
+    pattern = Path(directory_path).joinpath(wild_card).as_posix()
+    return glob(pattern)
+
+
+def dump_text_file(file_path: str, content: str) -> None:
+    Path(file_path).parent.mkdir(parents=True, exist_ok=True)
+    with open(file_path, 'w', encoding='utf-8') as file:
+        file.write(content)

From 8ef034bcf11938812c7c8c4864fb2ee796662d45 Mon Sep 17 00:00:00 2001
From: SkalskiP
Date: Tue, 24 Aug 2021 13:09:35 +0200
Subject: [PATCH 08/16] work on labels cache in progress

---
 utils/datasets/cache.py | 39 +++++++++++++++++++++++++++++++++++++++
 utils/datasets_old.py   |  1 +
 2 files changed, 40 insertions(+)
 create mode 100644 utils/datasets/cache.py

diff --git a/utils/datasets/cache.py b/utils/datasets/cache.py
new file mode 100644
index 000000000000..b943ae914432
--- /dev/null
+++ b/utils/datasets/cache.py
@@ -0,0 +1,39 @@
+from pathlib import Path
+from typing import Optional, Union
+
+import numpy as np
+
+
+class ImageCache:
+    pass
+
+
+class LabelCache:
+
+    VERSION = 0.4
+    VERSION_KEY = "version"
+    HASH_KEY = "hash"
+    RESULTS_KEY = "results"
+
+    @staticmethod
+    def load(path: Union[str, Path], hash: str) -> Optional[dict]:
+        cache = LabelCache._safe_load(path=path)
+        if cache is not None and all([
+            cache.get(LabelCache.VERSION_KEY) == LabelCache.VERSION,
+            cache.get(LabelCache.HASH_KEY) == hash
+        ]):
+            return cache
+        else:
+            return None
+
+    @staticmethod
+    def save(path: Union[str, Path], hash: str) -> None:
+        pass
+
+    @staticmethod
+    def _safe_load(path: Union[str, Path]) -> Optional[dict]:
+        try:
+            return np.load(path, allow_pickle=True).item()
+        except Exception:
+            return None
diff --git a/utils/datasets_old.py b/utils/datasets_old.py
index 864e1273794a..09548726fcfd 100755
--- a/utils/datasets_old.py
+++ b/utils/datasets_old.py
@@ -405,6 +405,7 @@ def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, r
         # Check cache
         self.label_files = img2label_paths(self.img_files)  # labels
         cache_path = (p if p.is_file() else Path(self.label_files[0]).parent).with_suffix('.cache')
+
         try:
             cache, exists = np.load(cache_path, allow_pickle=True).item(), True  # load dict
             assert cache['version'] == 0.4 and cache['hash'] == get_hash(self.label_files + self.img_files)

From 4246597bc18a33640508326ebf1bb05e8d9aafbb Mon Sep 17 00:00:00 2001
From: SkalskiP
Date: Tue, 24 Aug 2021 13:09:35 +0200
Subject: [PATCH 09/16] work on labels cache in progress

---
 utils/datasets/cache.py | 53 +++++++++++++++++++++++++++++++++++++++++
 utils/datasets/core.py  | 42 ++++++++++++++++++++++----------
 utils/datasets_old.py   | 11 ++++-----
 3 files changed, 87 insertions(+), 19 deletions(-)
 create mode 100644 utils/datasets/cache.py

diff --git a/utils/datasets/cache.py b/utils/datasets/cache.py
new file mode 100644
index 000000000000..62dffcc51f6f
--- /dev/null
+++ b/utils/datasets/cache.py
@@ -0,0 +1,53 @@
+import hashlib
+import os
+from pathlib import Path
+from typing import Optional, Union, List
+
+import numpy as np
+
+
+def get_hash(paths: List[str]) -> str:
+    """
+    Returns a single hash value of a list of paths (files or dirs)
+    """
+    size = sum(os.path.getsize(p) for p in paths if os.path.exists(p))  # sizes
+    h = hashlib.md5(str(size).encode())  # hash sizes
+    h.update(''.join(paths).encode())  # hash paths
+    return h.hexdigest()  # return hash
+
+
+class ImageCache:
+
+    def __init__(self) -> None:
+        pass
+
+
+class LabelCache:
+
+    VERSION = 0.4
+    VERSION_KEY = "version"
+    HASH_KEY = "hash"
+    RESULTS_KEY = "results"
+
+    @staticmethod
+    def load(path: Union[str, Path], hash: str) -> Optional[dict]:
+        cache = LabelCache._safe_load(path=path)
+        if cache is not None and all([
+            cache.get(LabelCache.VERSION_KEY) == LabelCache.VERSION,
+            cache.get(LabelCache.HASH_KEY) == hash
+        ]):
+            return cache
+        else:
+            return None
+
+    @staticmethod
+    def save(path: Union[str, Path], hash: str) -> None:
+        pass
+
+    @staticmethod
+    def _safe_load(path: Union[str, Path]) -> Optional[dict]:
+        try:
+            return np.load(path, allow_pickle=True).item()
+        except Exception:
+            return None
diff --git a/utils/datasets/core.py b/utils/datasets/core.py
index a4789d271ad8..4e8d802007f3 100644
--- a/utils/datasets/core.py
+++ b/utils/datasets/core.py
@@ -1,9 +1,11 @@
+from pathlib import Path
 from typing import Tuple, List, Optional
 import os

 import torch
 from torch.utils.data import Dataset

+from utils.datasets.cache import LabelCache, get_hash
 from utils.datasets.coco import read_json_file, ANNOTATIONS_FILE_NAME, load_coco_annotations
 from utils.datasets.yolo import load_image_names_from_paths, img2label_paths

@@ -14,7 +16,7 @@ def assemble_data_loader() -> None:
     pass


-def initiate_dataset(path: str, cache_images: bool) -> Dataset:
+def initiate_dataset(path: str, cache_images: Optional[str]) -> Dataset:
     if COCODataset.validate_directory_structure(path=path):
         return COCODataset(path=path, cache_images=cache_images)
     if YOLODataset.validate_directory_structure(path=path):
@@ -32,13 +34,17 @@ class COCODataset(Dataset):

-    def __init__(self, path: str, cache_images: bool) -> None:
+    def __init__(self, path: str, cache_images: Optional[str]) -> None:
         """
         Load COCO labels along with images from provided path.

         Args:
             path: `str` - path to `dataset` root directory.
-            cache_images: `bool` - flag to force caching of images.
+            cache_images: `Optional[str]` - flag enabling image caching. Can be equal to one of three values:
+                `"ram"`, `"disc"` or `None`. `"ram"` - all images are stored in memory to enable the fastest
+                access. This may however result in exceeding the limit of available memory. `"disc"` - all
+                images are stored on the hard drive, but in raw, uncompressed form. This prevents memory
+                overflow and offers faster access to data than a regular image read. `None` - image caching
+                is turned off.
         """
@@ -69,7 +75,7 @@

     @staticmethod
-    def resolve_cache_path() -> Optional[str]:
+    def resolve_cache_path() -> Path:
         pass
@@ -77,29 +83,40 @@ class YOLODataset(Dataset):
     """
     dataset
     ├── image_names.txt [optional]
+    ├── image_names.cache [optional]
     ├── images
     │   ├── image-1.jpg
     │   ├── image-2.jpg
     │   └── ...
     └── labels
-        ├── dataset.cache [optional]
         ├── image-1.txt
         ├── image-2.txt
         └── ...
     """

-    def __init__(self, path: str, cache_images: bool) -> None:
+    def __init__(self, path: str, cache_images: Optional[str]) -> None:
         """
         Load YOLO labels along with images from provided path.

         Args:
-            path: `str` - path to `dataset` root directory or to `image_names.txt` file.
-            cache_images: `bool` - flag to force caching of images.
+            path: `str` - path to `images` directory or to `image_names.txt` file.
+            cache_images: `Optional[str]` - flag enabling image caching. Can be equal to one of three values:
+                `"ram"`, `"disc"` or `None`. `"ram"` - all images are stored in memory to enable the fastest
+                access. This may however result in exceeding the limit of available memory. `"disc"` - all
+                images are stored on the hard drive, but in raw, uncompressed form. This prevents memory
+                overflow and offers faster access to data than a regular image read. `None` - image caching
+                is turned off.
         """
         self.path = path
         self.cache_images = cache_images
-        self.image_paths = load_image_names_from_paths(paths=path)
-        self.label_paths = img2label_paths(image_paths=self.image_paths)
+        self.image_paths: List[str] = load_image_names_from_paths(paths=path)
+        self.label_paths: List[str] = img2label_paths(image_paths=self.image_paths)
+
+        cache_path = YOLODataset.resolve_cache_path(path=self.path, label_paths=self.label_paths)
+        print(cache_path)
+        hash = get_hash(self.label_paths + self.image_paths)
+        cache = LabelCache.load(path=cache_path, hash=hash)
+        print(cache.keys())
+
         self.labels = []
         self.images = []
@@ -122,8 +139,9 @@

     @staticmethod
-    def resolve_cache_path() -> Optional[str]:
-        pass
+    def resolve_cache_path(path: str, label_paths: List[str]) -> Path:
+        path = Path(path)
+        return (path if path.is_file() else Path(label_paths[0]).parent).with_suffix('.cache')
diff --git a/utils/datasets_old.py b/utils/datasets_old.py
index 864e1273794a..09548726fcfd 100755
--- a/utils/datasets_old.py
+++ b/utils/datasets_old.py
@@ -103,7 +103,6 @@
                                       single_cls=single_cls,
                                       stride=int(stride),
                                       pad=pad,
-                                      image_weights=image_weights,
                                       prefix=prefix)

     batch_size = min(batch_size, len(dataset))
@@ -364,17 +363,14 @@
     return [sb.join(x.rsplit(sa, 1)).rsplit('.', 1)[0] + '.txt' for x in img_paths]


-
-# image_weights - unused
-
 class LoadImagesAndLabels(Dataset):  # for training/testing
-    def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, rect=False, image_weights=False,
+
+    def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, rect=False,
                  cache_images=False, single_cls=False, stride=32, pad=0.0, prefix=''):
         self.img_size = img_size
         self.augment = augment
         self.hyp = hyp
-        self.image_weights = image_weights
-        self.rect = False if image_weights else rect
+        self.rect = rect
         self.mosaic = self.augment and not self.rect  # load 4 images at a time into a mosaic (only during training)
         self.mosaic_border = [-img_size // 2, -img_size // 2]
         self.stride = stride
@@ -405,6 +401,7 @@
         # Check cache
         self.label_files = img2label_paths(self.img_files)  # labels
         cache_path = (p if p.is_file() else Path(self.label_files[0]).parent).with_suffix('.cache')
+
         try:
             cache, exists = np.load(cache_path, allow_pickle=True).item(), True  # load dict
             assert cache['version'] == 0.4 and cache['hash'] == get_hash(self.label_files + self.img_files)
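How the label-cache invalidation above is meant to work: `get_hash` digests the total byte size plus the joined path strings of every label and image file, so adding, removing, renaming, or resizing any file yields a different hash and the stale `.cache` file is silently ignored. A usage sketch (the cache path and the `label_paths`/`image_paths` lists are assumed):

```python
from utils.datasets.cache import LabelCache, get_hash

current_hash = get_hash(label_paths + image_paths)      # cheap: sizes + paths, no file reads
cache = LabelCache.load(path="dataset.cache", hash=current_hash)
if cache is None:                                       # missing, corrupt, or stale cache
    ...                                                 # rebuild labels, then persist via LabelCache.save(...)
```

Note that `LabelCache.save` is still a stub at this point in the series, so only the read path is exercised.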
From 54b164da281bfc7640b74c8b62900e5a98e4ef36 Mon Sep 17 00:00:00 2001
From: SkalskiP
Date: Wed, 25 Aug 2021 22:48:22 +0200
Subject: [PATCH 10/16] RAM cache done

---
 utils/datasets/cache.py | 77 +++++++++++++++++++++++++++++++++++++++--
 utils/datasets/core.py  |  2 --
 utils/datasets/error.py |  3 ++
 utils/datasets_old.py   |  2 +-
 4 files changed, 78 insertions(+), 6 deletions(-)
 create mode 100644 utils/datasets/error.py

diff --git a/utils/datasets/cache.py b/utils/datasets/cache.py
index 62dffcc51f6f..825432bd9248 100644
--- a/utils/datasets/cache.py
+++ b/utils/datasets/cache.py
@@ -1,9 +1,15 @@
 import hashlib
 import os
+from multiprocessing.pool import ThreadPool
 from pathlib import Path
-from typing import Optional, Union, List
+from typing import Optional, Union, List, Dict
+from abc import ABC, abstractmethod

+from tqdm import tqdm
 import numpy as np
+import cv2
+
+from utils.datasets.error import CacheError


 def get_hash(paths: List[str]) -> str:
@@ -16,11 +22,76 @@
     return h.hexdigest()  # return hash


-class ImageCache:
+class BaseImageCache(ABC):
+
+    _cache_size = 0
+    _loading_completed = False
+
+    def __init__(self, cache_type: str, thread_count: int = 8) -> None:
+        self._thread_count = min(thread_count, os.cpu_count())
+        self._cache_type = cache_type
+
+    @property
+    def cache_size(self) -> float:
+        return self._cache_size
+
+    def load_images(self, paths: List[str]) -> None:
+        self._load_images(paths=paths)
+        self._loading_completed = True
+        print(f"Image caching completed. ({self._cache_size / 1E9:.1f}GB {self._cache_type})")
+
+    def get_image(self, path: str) -> np.ndarray:
+        if not self._loading_completed:
+            raise CacheError("Could not obtain the image. Image cache is not yet initialized.")
+        image = self._get_image(path=path)
+        if image is None:
+            raise CacheError(f"Image with {path} path could not be found in cache.")
+        return image

-    def __init__(self) -> None:
+    @abstractmethod
+    def _load_images(self, paths: List[str]) -> None:
         pass

+    @abstractmethod
+    def _get_image(self, path: str) -> Optional[np.ndarray]:
+        pass
+
+
+class DiscImageCache(BaseImageCache):
+
+    def __init__(self, thread_count: int = 8) -> None:
+        super().__init__(cache_type="disc", thread_count=thread_count)
+
+    def _load_images(self, paths: List[str]) -> None:
+        pass
+
+    def _get_image(self, path: str) -> Optional[np.ndarray]:
+        pass
+
+
+class RAMImageCache(BaseImageCache):
+
+    def __init__(self, thread_count: int = 8) -> None:
+        super().__init__(cache_type="ram", thread_count=thread_count)
+        self._images: Dict[str, np.ndarray] = {}
+
+    def _load_images(self, paths: List[str]) -> None:
+        results = ThreadPool(self._thread_count).imap(lambda x: self._load_image(x), paths)
+        bar = tqdm(enumerate(results), total=len(paths))
+        for i in bar:
+            bar.desc = f"Caching images ({self._cache_size / 1E9:.1f}GB {self._cache_type})"
+        bar.close()
+
+    def _get_image(self, path: str) -> Optional[np.ndarray]:
+        return self._images.get(path)
+
+    def _load_image(self, path: str) -> None:
+        image = cv2.imread(path)
+        if image is None:
+            raise CacheError(f"Image with {path} path could not be found.")
+        self._images[path] = image
+        self._cache_size += image.nbytes
+

 class LabelCache:
diff --git a/utils/datasets/core.py b/utils/datasets/core.py
index 4e8d802007f3..fd3a5b0f3c0f 100644
--- a/utils/datasets/core.py
+++ b/utils/datasets/core.py
@@ -112,10 +112,8 @@
         self.label_paths: List[str] = img2label_paths(image_paths=self.image_paths)

         cache_path = YOLODataset.resolve_cache_path(path=self.path, label_paths=self.label_paths)
-        print(cache_path)
         hash = get_hash(self.label_paths + self.image_paths)
         cache = LabelCache.load(path=cache_path, hash=hash)
-        print(cache.keys())

         self.labels = []
         self.images = []
diff --git a/utils/datasets/error.py b/utils/datasets/error.py
new file mode 100644
index 000000000000..011567242b5e
--- /dev/null
+++ b/utils/datasets/error.py
@@ -0,0 +1,3 @@
+
+class CacheError(Exception):
+    pass
diff --git a/utils/datasets_old.py b/utils/datasets_old.py
index 09548726fcfd..6b43f2dfdcc0 100755
--- a/utils/datasets_old.py
+++ b/utils/datasets_old.py
@@ -91,7 +91,7 @@ def exif_transpose(image):
     return image


-def create_dataloader(path, imgsz, batch_size, stride, single_cls=False, hyp=None, augment=False, cache=False, pad=0.0,
+def create_dataloader(path, imgsz, batch_size, stride, single_cls=False, hyp=None, augment=False, cache=None, pad=0.0,
                       rect=False, rank=-1, workers=8, image_weights=False, quad=False, prefix=''):
     # Make sure only the first process in DDP process the dataset first, and the following others can use the cache
     with torch_distributed_zero_first(rank):
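Usage of the finished RAM cache (paths are illustrative; every path must be readable by `cv2.imread`, otherwise a `CacheError` is raised during loading):

```python
from utils.datasets.cache import RAMImageCache

cache = RAMImageCache(thread_count=8)
cache.load_images(paths=["images/image-1.jpg", "images/image-2.jpg"])  # one-shot warm-up
image = cache.get_image(path="images/image-1.jpg")   # ndarray straight from memory
cache.get_image(path="images/other.jpg")             # raises CacheError: not in cache
```

One design note: the worker threads write into a plain dict, which is generally safe under CPython's GIL since each key is assigned once, but `self._cache_size += image.nbytes` is a read-modify-write race — concurrent updates can under-count the reported cache size (the cached images themselves are unaffected).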
abstractmethod -from tqdm import tqdm +from multiprocessing.pool import ThreadPool +from typing import Optional, List, Dict -import numpy as np import cv2 +import numpy as np +from tqdm import tqdm from utils.datasets.error import CacheError -def get_hash(paths: List[str]) -> str: - """ - Returns a single hash value of a list of paths (files or dirs) - """ - size = sum(os.path.getsize(p) for p in paths if os.path.exists(p)) # sizes - h = hashlib.md5(str(size).encode()) # hash sizes - h.update(''.join(paths).encode()) # hash paths - return h.hexdigest() # return hash +NUM_THREADS = min(8, os.cpu_count()) class BaseImageCache(ABC): @@ -61,12 +52,19 @@ class DiscImageCache(BaseImageCache): def __init__(self, thread_count: int = 8) -> None: super().__init__(cache_type="disc", thread_count=thread_count) + self._image_paths: Dict[str, str] = {} def _load_images(self, paths: List[str]) -> None: - pass + pass # TODO def _get_image(self, path: str) -> Optional[np.ndarray]: - pass + pass # TODO + + def _load_image(self, path: str) -> None: + pass # TODO + + def _init_cache(self, paths: List[str]) -> None: + pass # TODO class RAMImageCache(BaseImageCache): @@ -93,32 +91,31 @@ def _load_image(self, path: str) -> None: self._cache_size += image.nbytes -class LabelCache: +class ImageProvider: - VERSION = 0.4 - VERSION_KEY = "version" - HASH_KEY = "hash" - RESULTS_KEY = "results" + def __init__(self, cache_images: Optional[str], paths: List[str]) -> None: + self._cache_images = cache_images + self._cache = ImageProvider._init_cache(cache_images=cache_images, paths=paths) - @staticmethod - def load(path: Union[str, Path], hash: str) -> Optional[dict]: - cache = LabelCache._safe_load(path=path) - if all([ - cache, - cache[LabelCache.VERSION_KEY] == LabelCache.VERSION, - cache[LabelCache.HASH_KEY] == hash - ]): - return cache + def get_image(self, path: str) -> np.ndarray: + if self._cache_images: + return self._cache.get_image(path=path) else: - return None + image = cv2.imread(path) + if image is None: + raise CacheError(f"Image with {path} path could not be found.") + return image @staticmethod - def save(path: Union[str, Path], hash: str) -> None: - pass - - @staticmethod - def _safe_load(path: Union[str, Path]) -> Optional[dict]: - try: - return np.load(path, allow_pickle=True).item() - except: + def _init_cache(cache_images: Optional[str], paths: List[str]) -> Optional[BaseImageCache]: + if cache_images == "disc": + cache = DiscImageCache(thread_count=NUM_THREADS) + cache.load_images(paths=paths) + return cache + if cache_images == 'ram': + cache = RAMImageCache(thread_count=NUM_THREADS) + cache.load_images(paths=paths) + return cache + if cache_images is None: return None + raise CacheError(f"Unsupported cache type. Expected disc, ram or None. 
From d71a659e75efb146111a3c3ed4df26746758e5bc Mon Sep 17 00:00:00 2001
From: SkalskiP
Date: Mon, 30 Aug 2021 16:17:23 +0200
Subject: [PATCH 12/16] save progress

---
 utils/datasets/image_cache.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/utils/datasets/image_cache.py b/utils/datasets/image_cache.py
index 6881766d88c1..faf63194cd3e 100644
--- a/utils/datasets/image_cache.py
+++ b/utils/datasets/image_cache.py
@@ -53,6 +53,7 @@ class DiscImageCache(BaseImageCache):
     def __init__(self, thread_count: int = 8) -> None:
         super().__init__(cache_type="disc", thread_count=thread_count)
         self._image_paths: Dict[str, str] = {}
+        self._cache_path: Optional[str] = None
 
     def _load_images(self, paths: List[str]) -> None:
         pass  # TODO
@@ -63,7 +64,7 @@ def _get_image(self, path: str) -> Optional[np.ndarray]:
     def _load_image(self, path: str) -> None:
         pass  # TODO
 
-    def _init_cache(self, paths: List[str]) -> None:
+    def _init_cache(self, paths: List[str]) -> str:
         pass  # TODO
From 5423cc0cde584cd247c50aa9f85944ec493195c1 Mon Sep 17 00:00:00 2001
From: SkalskiP
Date: Tue, 31 Aug 2021 08:57:49 +0200
Subject: [PATCH 13/16] disc cache completed

---
 utils/datasets/image_cache.py | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/utils/datasets/image_cache.py b/utils/datasets/image_cache.py
index faf63194cd3e..5f489f2eeb9d 100644
--- a/utils/datasets/image_cache.py
+++ b/utils/datasets/image_cache.py
@@ -1,6 +1,7 @@
 import os
 from abc import ABC, abstractmethod
 from multiprocessing.pool import ThreadPool
+from pathlib import Path
 from typing import Optional, List, Dict
 
 import cv2
@@ -27,6 +28,8 @@ def cache_size(self) -> float:
         return self._cache_size
 
     def load_images(self, paths: List[str]) -> None:
+        if self._loading_completed:
+            raise CacheError("load_images method can only be called once.")
         self._load_images(paths=paths)
         self._loading_completed = True
         print(f"Image caching completed. ({self._cache_size / 1E9:.1f}GB {self._cache_type})")
@@ -56,16 +59,36 @@ def __init__(self, thread_count: int = 8) -> None:
         self._cache_path: Optional[str] = None
 
     def _load_images(self, paths: List[str]) -> None:
-        pass  # TODO
+        self._cache_path = self._init_cache(paths=paths)
+        self._image_paths = {
+            path: Path(self._cache_path) / Path(path).with_suffix('.npy').name
+            for path
+            in paths
+        }
+        results = ThreadPool(self._thread_count).imap(lambda x: self._load_image(x), paths)
+        bar = tqdm(enumerate(results), total=len(paths))
+        for i in bar:
+            bar.desc = f"Caching images ({self._cache_size / 1E9:.1f}GB {self._cache_type})"
+        bar.close()
 
     def _get_image(self, path: str) -> Optional[np.ndarray]:
-        pass  # TODO
+        target_path = self._image_paths.get(path)
+        if target_path is None:
+            return None
+        return np.load(target_path)
 
     def _load_image(self, path: str) -> None:
-        pass  # TODO
+        image = cv2.imread(path)
+        if image is None:
+            raise CacheError(f"Image at path {path} could not be read.")
+        target_path = self._image_paths[path]
+        np.save(target_path, image)
+        self._cache_size += image.nbytes
 
     def _init_cache(self, paths: List[str]) -> str:
-        pass  # TODO
+        cache_path = Path(paths[0]).parent.as_posix() + '_npy'
+        Path(cache_path).mkdir(parents=True, exist_ok=True)
+        return cache_path
 
 
 class RAMImageCache(BaseImageCache):
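DiscImageCache now materialises the cache as a sibling directory named after the image folder with an _npy suffix, holding one raw decoded array per image; a cached read is then a plain np.load with no JPEG decoding. A sketch of the resulting layout and round trip (paths are placeholders):

import cv2
import numpy as np

# data/images/0001.jpg  ->  data/images_npy/0001.npy  (layout created by _init_cache)
image = cv2.imread('data/images/0001.jpg')      # decoded once, at caching time
np.save('data/images_npy/0001.npy', image)      # stored raw and uncompressed
restored = np.load('data/images_npy/0001.npy')  # later reads skip decoding entirely
assert (restored == image).all()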
From 7fec4d2c4006939c08140c54a1564fe80f786c44 Mon Sep 17 00:00:00 2001
From: SkalskiP
Date: Tue, 31 Aug 2021 11:24:06 +0200
Subject: [PATCH 14/16] yolo label loader in progress

---
 utils/datasets/core.py        | 75 ++++++++++++++++-------------------
 utils/datasets/label_cache.py |  4 +-
 utils/datasets/yolo.py        |  9 +++++
 3 files changed, 45 insertions(+), 43 deletions(-)

diff --git a/utils/datasets/core.py b/utils/datasets/core.py
index 4e9bf65ae66a..22268476b1ab 100644
--- a/utils/datasets/core.py
+++ b/utils/datasets/core.py
@@ -14,14 +14,7 @@ def assemble_data_loader() -> None:
-    pass
-
-
-def initiate_dataset(path: str, cache_images: Optional[str]) -> Dataset:
-    if COCODataset.validate_directory_structure(path=path):
-        return COCODataset(path=path, cache_images=cache_images)
-    if YOLODataset.validate_directory_structure(path=path):
-        return YOLODataset(path=path, cache_images=cache_images)
+    pass  # TODO
 
 
 class COCODataset(Dataset):
@@ -49,35 +42,31 @@ def __init__(self, path: str, cache_images: Optional[str]) -> None:
         """
         self.path = path
         self.cache_images = cache_images
-
-        coco_data = read_json_file(os.path.join(path, ANNOTATIONS_FILE_NAME))
-        coco_annotations = load_coco_annotations(coco_data=coco_data)
-
-        self.image_paths = list(coco_annotations.keys())
-        self.labels = list(coco_annotations.values())
+        self.image_paths, self.labels = self._load_image_paths_and_labels(path=path)
         self.image_provider = ImageProvider(cache_images=cache_images, paths=self.image_paths)
 
     def __len__(self) -> int:
         return len(self.image_paths)
 
     def __getitem__(self, index: int) -> DatasetEntry:
-        pass
+        image_path = self.image_paths[index]
+        labels = self.labels[index]
+        image = self.image_provider.get_image(path=image_path)
+        return torch.from_numpy(image), labels, image_path
 
     @staticmethod
     def collate_fn(batch: List[DatasetEntry]) -> torch.Tensor:
-        pass
-
-    @staticmethod
-    def validate_directory_structure(path: str) -> None:
-        pass
+        pass  # TODO:
 
     @staticmethod
-    def load_labels(path: str) -> List[torch.Tensor]:
-        pass
+    def _load_image_paths_and_labels(path: str) -> Tuple[List[str], List[torch.Tensor]]:
+        coco_data = read_json_file(os.path.join(path, ANNOTATIONS_FILE_NAME))
+        coco_annotations = load_coco_annotations(coco_data=coco_data)
+        return list(coco_annotations.keys()), list(coco_annotations.values())
 
     @staticmethod
     def resolve_cache_path() -> Path:
-        pass
+        pass  # TODO:
@@ -109,36 +98,40 @@ def __init__(self, path: str, cache_images: Optional[str]) -> None:
         """
         self.path = path
         self.cache_images = cache_images
-        self.image_paths: List[str] = load_image_names_from_paths(paths=path)
-        self.label_paths: List[str] = img2label_paths(image_paths=self.image_paths)
-
-        cache_path = YOLODataset.resolve_cache_path(path=self.path, label_paths=self.label_paths)
-        label_cache = LabelCache.load(
-            path=cache_path,
-            hash=get_hash(self.label_paths + self.image_paths)
-        )
+        self.image_paths, self.labels = self._load_image_paths_and_labels(path=path)
         self.image_provider = ImageProvider(cache_images=cache_images, paths=self.image_paths)
 
-        self.labels = []
-        self.images = []
-
     def __len__(self) -> int:
         return len(self.image_paths)
 
     def __getitem__(self, index: int) -> DatasetEntry:
-        pass
+        image_path = self.image_paths[index]
+        labels = self.labels[index]
+        image = self.image_provider.get_image(path=image_path)
+        return torch.from_numpy(image), labels, image_path
 
     @staticmethod
     def collate_fn(batch: List[DatasetEntry]) -> torch.Tensor:
         pass
 
     @staticmethod
-    def validate_directory_structure(path: str) -> None:
-        pass
-
-    @staticmethod
-    def load_labels(path: str) -> List[torch.Tensor]:
-        pass
+    def _load_image_paths_and_labels(path: str) -> Tuple[List[str], List[torch.Tensor]]:
+        image_paths = load_image_names_from_paths(paths=path)
+        label_paths = img2label_paths(image_paths=image_paths)
+
+        # TODO: finalize yolo labels cache plugin
+        # cache_path = YOLODataset.resolve_cache_path(path=path, label_paths=label_paths)
+        # label_cache = LabelCache.load(
+        #     path=cache_path,
+        #     hash=get_hash(label_paths + image_paths)
+        # )
+        # labels = [
+        #     label_cache[image_path]
+        #     for image_path
+        #     in image_paths
+        # ]
+        labels = []  # placeholder until the label cache plugin is finalized
+
+        return image_paths, labels
 
     @staticmethod
     def resolve_cache_path(path: str, label_paths: List[str]) -> Path:
diff --git a/utils/datasets/label_cache.py b/utils/datasets/label_cache.py
index 551698c06344..0883331c3769 100644
--- a/utils/datasets/label_cache.py
+++ b/utils/datasets/label_cache.py
@@ -36,8 +36,8 @@ def load(path: Union[str, Path], hash: str) -> Optional[dict]:
         return None
 
     @staticmethod
-    def save(path: Union[str, Path], hash: str) -> None:
-        pass
+    def save(path: Union[str, Path], hash: str, data: dict) -> None:
+        pass  # TODO
 
     @staticmethod
     def _safe_load(path: Union[str, Path]) -> Optional[dict]:
diff --git a/utils/datasets/yolo.py b/utils/datasets/yolo.py
index 7ab4b21518e3..474255fdafc9 100644
--- a/utils/datasets/yolo.py
+++ b/utils/datasets/yolo.py
@@ -33,3 +33,12 @@ def img2label_paths(image_paths: List[str]) -> List[str]:
     """
     sa, sb = os.sep + 'images' + os.sep, os.sep + 'labels' + os.sep  # /images/, /labels/ substrings
     return [sb.join(x.rsplit(sa, 1)).rsplit('.', 1)[0] + '.txt' for x in image_paths]
+
+
+class YOLOLabelsLoader:
+
+    def __init__(self) -> None:
+        pass  # TODO
+
+    def load_label(self) -> None:
+        pass  # TODO
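With __getitem__ filled in, a dataset entry is a (image tensor, labels, path) triple served through ImageProvider. Since collate_fn is still a stub, entries can only be pulled by direct indexing for now; a sketch, assuming a hypothetical local dataset root:

from utils.datasets.core import COCODataset

dataset = COCODataset(path='datasets/coco128', cache_images=None)  # path is a placeholder
image, labels, image_path = dataset[0]  # image tensor, labels, source path
print(image.shape, image_path)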
From 2a433ea45220ee54a3b874a8ba0df808b2670472 Mon Sep 17 00:00:00 2001
From: SkalskiP
Date: Tue, 31 Aug 2021 12:52:46 +0200
Subject: [PATCH 15/16] save progress

---
 utils/datasets/coco.py        |  5 +++++
 utils/datasets/core.py        |  5 +++++
 utils/datasets/image_cache.py | 17 +++++++++++++++++
 utils/datasets/label_cache.py |  5 +++++
 utils/datasets/todo.txt       |  4 ++++
 utils/datasets/yolo.py        | 11 +++++++++--
 6 files changed, 45 insertions(+), 2 deletions(-)
 create mode 100644 utils/datasets/todo.txt

diff --git a/utils/datasets/coco.py b/utils/datasets/coco.py
index 39acbb5ad5e4..ba7b460b4c7b 100644
--- a/utils/datasets/coco.py
+++ b/utils/datasets/coco.py
@@ -1,3 +1,8 @@
+# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
+"""
+COCO dataset loading utils
+"""
+
 import json
 from typing import Dict, Union, List
 from collections import defaultdict
diff --git a/utils/datasets/core.py b/utils/datasets/core.py
index 22268476b1ab..a296b127cde3 100644
--- a/utils/datasets/core.py
+++ b/utils/datasets/core.py
@@ -1,3 +1,8 @@
+# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
+"""
+Dataset loaders
+"""
+
 import os
 from pathlib import Path
 from typing import Tuple, List, Optional
diff --git a/utils/datasets/image_cache.py b/utils/datasets/image_cache.py
index 5f489f2eeb9d..9ba07a54eb3b 100644
--- a/utils/datasets/image_cache.py
+++ b/utils/datasets/image_cache.py
@@ -1,3 +1,8 @@
+# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
+"""
+Image loading and caching helpers
+"""
+
 import os
 from abc import ABC, abstractmethod
 from multiprocessing.pool import ThreadPool
@@ -118,6 +123,18 @@ def _load_image(self, path: str) -> None:
 class ImageProvider:
 
     def __init__(self, cache_images: Optional[str], paths: List[str]) -> None:
+        """
+        High level class responsible for loading images. ImageProvider has the ability to cache images on disk or in
+        memory to speed up the loading process.
+
+        Args:
+            cache_images: `Optional[str]` - flag enabling image caching. Can be equal to one of three values: `"ram"`,
+                `"disc"` or `None`. `"ram"` - all images are stored in memory to enable fastest access. This may however
+                result in exceeding the limit of available memory. `"disc"` - all images are stored on hard drive but in
+                raw, uncompressed form. This prevents memory overflow, and offers faster access to data than a regular
+                image read. `None` - image caching is turned off.
+            paths: `List[str]` - list of image paths that you would like to cache.
+        """
         self._cache_images = cache_images
         self._cache = ImageProvider._init_cache(cache_images=cache_images, paths=paths)
diff --git a/utils/datasets/label_cache.py b/utils/datasets/label_cache.py
index 0883331c3769..c54b5c5f6df6 100644
--- a/utils/datasets/label_cache.py
+++ b/utils/datasets/label_cache.py
@@ -1,3 +1,8 @@
+# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
+"""
+Labels caching helpers
+"""
+
 import hashlib
 import os
 from pathlib import Path
diff --git a/utils/datasets/todo.txt b/utils/datasets/todo.txt
new file mode 100644
index 000000000000..9c68ee9b5079
--- /dev/null
+++ b/utils/datasets/todo.txt
@@ -0,0 +1,4 @@
+# handle corrupted images
+# handle prefix, most likely by using proper logging
+# coco label caching
+# yolo label loading
\ No newline at end of file
diff --git a/utils/datasets/yolo.py b/utils/datasets/yolo.py
index 474255fdafc9..d3a0de921133 100644
--- a/utils/datasets/yolo.py
+++ b/utils/datasets/yolo.py
@@ -1,3 +1,8 @@
+# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
+"""
+YOLO dataset loading utils
+"""
+
 import os
 import glob
 from pathlib import Path
@@ -37,8 +42,10 @@ def img2label_paths(image_paths: List[str]) -> List[str]:
 
 class YOLOLabelsLoader:
 
-    def __init__(self) -> None:
-        pass  # TODO
+    def __init__(self, image_paths: List[str], labels_paths: List[str]) -> None:
+        self.image_paths = image_paths
+        self.labels_paths = labels_paths
+        self.missing_labels = 0
 
     def load_label(self) -> None:
         pass  # TODO
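The docstring added above fixes the contract of ImageProvider: construction eagerly fills the chosen cache, and get_image serves from it afterwards. A short usage sketch (file paths are placeholders):

from utils.datasets.image_cache import ImageProvider

image_paths = ['data/images/0001.jpg', 'data/images/0002.jpg']  # hypothetical files

provider = ImageProvider(cache_images=None, paths=image_paths)  # cv2.imread on every access
# ImageProvider(cache_images='ram', paths=image_paths)          # fastest, bounded by memory
# ImageProvider(cache_images='disc', paths=image_paths)         # raw .npy copies on disk

image = provider.get_image(path=image_paths[0])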
+ """ self._cache_images = cache_images self._cache = ImageProvider._init_cache(cache_images=cache_images, paths=paths) diff --git a/utils/datasets/label_cache.py b/utils/datasets/label_cache.py index 0883331c3769..c54b5c5f6df6 100644 --- a/utils/datasets/label_cache.py +++ b/utils/datasets/label_cache.py @@ -1,3 +1,8 @@ +# YOLOv5 🚀 by Ultralytics, GPL-3.0 license +""" +Labels caching helpers +""" + import hashlib import os from pathlib import Path diff --git a/utils/datasets/todo.txt b/utils/datasets/todo.txt new file mode 100644 index 000000000000..9c68ee9b5079 --- /dev/null +++ b/utils/datasets/todo.txt @@ -0,0 +1,4 @@ +# handle corrupted images +# handle prefix, most likely by using proper logging +# coco label caching +# yolo label loading \ No newline at end of file diff --git a/utils/datasets/yolo.py b/utils/datasets/yolo.py index 474255fdafc9..d3a0de921133 100644 --- a/utils/datasets/yolo.py +++ b/utils/datasets/yolo.py @@ -1,3 +1,8 @@ +# YOLOv5 🚀 by Ultralytics, GPL-3.0 license +""" +YOLO dataset loading utils +""" + import os import glob from pathlib import Path @@ -37,8 +42,10 @@ def img2label_paths(image_paths: List[str]) -> List[str]: class YOLOLabelsLoader: - def __init__(self) -> None: - pass # TODO + def __init__(self, image_paths: List[str], labels_paths: List[str]) -> None: + self.image_paths = image_paths + self.labels_paths = labels_paths + self.missing_labels = 0 def load_label(self) -> None: pass # TODO From 753138cadc8ee8a0d740cc0283d3906dcf23b0ed Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Tue, 31 Aug 2021 14:03:49 +0200 Subject: [PATCH 16/16] coco dataset is working, single class transformation is working --- utils/datasets/coco.py | 1 - utils/datasets/core.py | 77 ++++++++++++++++++++++++++++++++--------- utils/datasets/error.py | 4 +++ utils/datasets/todo.txt | 4 ++- 4 files changed, 67 insertions(+), 19 deletions(-) diff --git a/utils/datasets/coco.py b/utils/datasets/coco.py index ba7b460b4c7b..a1d87d1f9fcb 100644 --- a/utils/datasets/coco.py +++ b/utils/datasets/coco.py @@ -20,7 +20,6 @@ ANNOTATION_IMAGE_ID_KEY = "image_id" ANNOTATION_BBOX_KEY = "bbox" ANNOTATION_CATEGORY_ID = "category_id" -ANNOTATIONS_FILE_NAME = "annotations.json" def read_json_file(file_path: str, **kwargs) -> Union[list, dict]: diff --git a/utils/datasets/core.py b/utils/datasets/core.py index a296b127cde3..5809e64dfac1 100644 --- a/utils/datasets/core.py +++ b/utils/datasets/core.py @@ -10,7 +10,8 @@ import torch from torch.utils.data import Dataset -from utils.datasets.coco import read_json_file, ANNOTATIONS_FILE_NAME, load_coco_annotations +from utils.datasets.coco import read_json_file, load_coco_annotations +from utils.datasets.error import COCODatasetError from utils.datasets.image_cache import ImageProvider from utils.datasets.label_cache import LabelCache, get_hash from utils.datasets.yolo import load_image_names_from_paths, img2label_paths @@ -33,7 +34,10 @@ class COCODataset(Dataset): └── ... """ - def __init__(self, path: str, cache_images: Optional[str]) -> None: + ANNOTATIONS_FILE_NAME = "annotations.json" + IMAGES_DIRECTORY_NAME = "images" + + def __init__(self, path: str, cache_images: Optional[str] = None) -> None: """ Load COCO labels along with images from provided path. @@ -45,6 +49,7 @@ def __init__(self, path: str, cache_images: Optional[str]) -> None: raw, uncompressed form. This prevents memory overflow, and offers faster access to data then regular image read. `None` - image caching is turned of. 
""" + self._validate_dataset_path(path=path) self.path = path self.cache_images = cache_images self.image_paths, self.labels = self._load_image_paths_and_labels(path=path) @@ -65,9 +70,23 @@ def collate_fn(batch: List[DatasetEntry]) -> torch.Tensor: @staticmethod def _load_image_paths_and_labels(path: str) -> Tuple[List[str], List[torch.Tensor]]: - coco_data = read_json_file(os.path.join(path, ANNOTATIONS_FILE_NAME)) + images_path = os.path.join(path, COCODataset.IMAGES_DIRECTORY_NAME) + annotations_path = os.path.join(path, COCODataset.ANNOTATIONS_FILE_NAME) + coco_data = read_json_file(file_path=annotations_path) coco_annotations = load_coco_annotations(coco_data=coco_data) - return list(coco_annotations.keys()), list(coco_annotations.values()) + image_paths = [ + os.path.join(images_path, image_name) + for image_name + in coco_annotations.keys() + ] + return image_paths, list(coco_annotations.values()) + + @staticmethod + def _validate_dataset_path(path: str) -> None: + images_path = os.path.join(path, COCODataset.IMAGES_DIRECTORY_NAME) + annotations_path = os.path.join(path, COCODataset.ANNOTATIONS_FILE_NAME) + if not os.path.isfile(annotations_path) or not os.path.isdir(images_path): + raise COCODatasetError("Given path does not point to COCO dataset.") @staticmethod def resolve_cache_path() -> Path: @@ -89,7 +108,7 @@ class YOLODataset(Dataset): └── ... """ - def __init__(self, path: str, cache_images: Optional[str]) -> None: + def __init__(self, path: str, cache_images: Optional[str] = None) -> None: """ Load YOLO labels along with images from provided path. @@ -125,16 +144,16 @@ def _load_image_paths_and_labels(path: str) -> Tuple[List[str], List[torch.Tenso label_paths = img2label_paths(image_paths=image_paths) # TODO: finalize yolo labels cache plugin - # cache_path = YOLODataset.resolve_cache_path(path=path, label_paths=label_paths) - # label_cache = LabelCache.load( - # path=cache_path, - # hash=get_hash(label_paths + image_paths) - # ) - # labels = [ - # label_cache[image_path] - # for image_path - # in image_paths - # ] + cache_path = YOLODataset.resolve_cache_path(path=path, label_paths=label_paths) + label_cache = LabelCache.load( + path=cache_path, + hash=get_hash(label_paths + image_paths) + ) + labels = [ + label_cache[image_path] + for image_path + in image_paths + ] return image_paths, labels @@ -146,11 +165,35 @@ def resolve_cache_path(path: str, label_paths: List[str]) -> Path: class TransformedDataset(Dataset): - def __init__(self, source_dataset: Dataset) -> None: + def __init__( + self, + source_dataset: Dataset, + img_size: int = 640, + batch_size: int = 16, + augment: bool = False, + hyp=None, + rect=False, + single_cls: bool = False, + stride: int = 32, + pad: float = 0.0 + ) -> None: self.source_dataset = source_dataset + self.img_size = img_size + self.batch_size = batch_size + self.augment = augment + self.hyp = hyp + self.rect = rect + self.stride = stride + self.single_cls = single_cls + self.pad = pad def __len__(self) -> int: return len(self.source_dataset) def __getitem__(self, index: int) -> DatasetEntry: - return self.source_dataset[index] + image, labels, image_path = self.source_dataset[index] + + if self.single_cls: + labels[:, 0] = 0 + + return image, labels, image_path diff --git a/utils/datasets/error.py b/utils/datasets/error.py index 011567242b5e..9d7341073faf 100644 --- a/utils/datasets/error.py +++ b/utils/datasets/error.py @@ -1,3 +1,7 @@ class CacheError(Exception): pass + + +class COCODatasetError(Exception): + pass diff --git 
diff --git a/utils/datasets/todo.txt b/utils/datasets/todo.txt
index 9c68ee9b5079..d2a55949f8c1 100644
--- a/utils/datasets/todo.txt
+++ b/utils/datasets/todo.txt
@@ -1,4 +1,6 @@
 # handle corrupted images
 # handle prefix, most likely by using proper logging
 # coco label caching
-# yolo label loading
\ No newline at end of file
+# yolo label loading
+
+# why do we need information about batch_size if we only return one image
\ No newline at end of file
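With the series complete, a COCO-format directory can be loaded and wrapped in TransformedDataset, whose single_cls flag collapses every annotation to class 0. A closing sketch, assuming a hypothetical root containing images/ and annotations.json:

from utils.datasets.core import COCODataset, TransformedDataset

base = COCODataset(path='datasets/coco128')  # placeholder path; cache_images defaults to None
dataset = TransformedDataset(source_dataset=base, single_cls=True)

image, labels, image_path = dataset[0]
assert (labels[:, 0] == 0).all()  # single_cls rewrote every class id to 0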