From e08005c4011f60aa097e17255ef8c01b1850672e Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Wed, 23 Mar 2022 01:19:37 +0100
Subject: [PATCH] Multi-threaded VisDrone and VOC downloads (#7108)

* Multi-threaded VOC download

* Update VOC.yaml

* Update

* Update general.py

* Update general.py
---
 data/GlobalWheat2020.yaml |  1 +
 data/Objects365.yaml      |  1 +
 data/SKU-110K.yaml        |  1 +
 data/VOC.yaml             |  2 +-
 data/VisDrone.yaml        |  2 +-
 data/coco.yaml            |  1 +
 utils/general.py          | 11 +++++++----
 7 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/data/GlobalWheat2020.yaml b/data/GlobalWheat2020.yaml
index 869dace0be2b..c1ba289f2833 100644
--- a/data/GlobalWheat2020.yaml
+++ b/data/GlobalWheat2020.yaml
@@ -34,6 +34,7 @@ names: ['wheat_head']  # class names
 download: |
   from utils.general import download, Path
 
+
   # Download
   dir = Path(yaml['path'])  # dataset root dir
   urls = ['https://zenodo.org/record/4298502/files/global-wheat-codalab-official.zip',
diff --git a/data/Objects365.yaml b/data/Objects365.yaml
index 4c7cf3fdb2c8..bd6e5d6e1144 100644
--- a/data/Objects365.yaml
+++ b/data/Objects365.yaml
@@ -65,6 +65,7 @@ download: |
   from utils.general import Path, download, np, xyxy2xywhn
 
+
   # Make Directories
   dir = Path(yaml['path'])  # dataset root dir
   for p in 'images', 'labels':
diff --git a/data/SKU-110K.yaml b/data/SKU-110K.yaml
index 9481b7a04aee..46459eab6bb7 100644
--- a/data/SKU-110K.yaml
+++ b/data/SKU-110K.yaml
@@ -24,6 +24,7 @@ download: |
   from tqdm import tqdm
   from utils.general import np, pd, Path, download, xyxy2xywh
 
+
   # Download
   dir = Path(yaml['path'])  # dataset root dir
   parent = Path(dir.parent)  # download dir
diff --git a/data/VOC.yaml b/data/VOC.yaml
index 975d56466de1..be04fb1e2ecb 100644
--- a/data/VOC.yaml
+++ b/data/VOC.yaml
@@ -62,7 +62,7 @@ download: |
   urls = [url + 'VOCtrainval_06-Nov-2007.zip',  # 446MB, 5012 images
           url + 'VOCtest_06-Nov-2007.zip',  # 438MB, 4953 images
           url + 'VOCtrainval_11-May-2012.zip']  # 1.95GB, 17126 images
-  download(urls, dir=dir / 'images', delete=False)
+  download(urls, dir=dir / 'images', delete=False, threads=3)
 
   # Convert
   path = dir / f'images/VOCdevkit'
diff --git a/data/VisDrone.yaml b/data/VisDrone.yaml
index 83a5c7d55e06..2a3b2f03e674 100644
--- a/data/VisDrone.yaml
+++ b/data/VisDrone.yaml
@@ -54,7 +54,7 @@ download: |
           'https://github.com/ultralytics/yolov5/releases/download/v1.0/VisDrone2019-DET-val.zip',
           'https://github.com/ultralytics/yolov5/releases/download/v1.0/VisDrone2019-DET-test-dev.zip',
           'https://github.com/ultralytics/yolov5/releases/download/v1.0/VisDrone2019-DET-test-challenge.zip']
-  download(urls, dir=dir)
+  download(urls, dir=dir, threads=4)
 
   # Convert
   for d in 'VisDrone2019-DET-train', 'VisDrone2019-DET-val', 'VisDrone2019-DET-test-dev':
diff --git a/data/coco.yaml b/data/coco.yaml
index 3ed7e48a2185..7494fc2f9cd1 100644
--- a/data/coco.yaml
+++ b/data/coco.yaml
@@ -30,6 +30,7 @@ names: ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 't
 download: |
   from utils.general import download, Path
 
+
   # Download labels
   segments = False  # segment or box labels
   dir = Path(yaml['path'])  # dataset root dir
diff --git a/utils/general.py b/utils/general.py
index e8b3b05c5fe1..b0c5e9d69ab7 100755
--- a/utils/general.py
+++ b/utils/general.py
@@ -449,8 +449,9 @@ def check_dataset(data, autodownload=True):
     if val:
         val = [Path(x).resolve() for x in (val if isinstance(val, list) else [val])]  # val path
         if not all(x.exists() for x in val):
-            LOGGER.info('\nDataset not found, missing paths: %s' % [str(x) for x in val if not x.exists()])
+            LOGGER.info(emojis('\nDataset not found ⚠️, missing paths %s' % [str(x) for x in val if not x.exists()]))
             if s and autodownload:  # download script
+                t = time.time()
                 root = path.parent if 'path' in data else '..'  # unzip directory i.e. '../'
                 if s.startswith('http') and s.endswith('.zip'):  # URL
                     f = Path(s).name  # filename
@@ -465,9 +466,11 @@ def check_dataset(data, autodownload=True):
                     r = os.system(s)
                 else:  # python script
                     r = exec(s, {'yaml': data})  # return None
-                LOGGER.info(f"Dataset autodownload {f'success, saved to {root}' if r in (0, None) else 'failure'}\n")
+                dt = f'({round(time.time() - t, 1)}s)'
+                s = f"success ✅ {dt}, saved to {colorstr('bold', root)}" if r in (0, None) else f"failure {dt} ❌"
+                LOGGER.info(emojis(f"Dataset download {s}"))
             else:
-                raise Exception('Dataset not found.')
+                raise Exception(emojis('Dataset not found ❌'))
 
     return data  # dictionary
 
@@ -491,7 +494,7 @@ def download_one(url, dir):
         if curl:
             os.system(f"curl -L '{url}' -o '{f}' --retry 9 -C -")  # curl download, retry and resume on fail
         else:
-            torch.hub.download_url_to_file(url, f, progress=True)  # torch download
+            torch.hub.download_url_to_file(url, f, progress=threads == 1)  # torch download
     if unzip and f.suffix in ('.zip', '.gz'):
         LOGGER.info(f'Unzipping {f}...')
         if f.suffix == '.zip':
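
For context on the threads= argument: utils.general.download() runs its per-file worker on several threads when threads > 1, which is why download_one() above now passes progress=threads == 1 so the per-file progress bar only appears in single-threaded mode. The snippet below is a minimal standalone sketch of that thread-pool download pattern, not the yolov5 implementation; fetch_one(), fetch() and the example URLs are hypothetical names used only for illustration.

# Minimal sketch (assumed names, not yolov5 code): download a list of URLs over a small thread pool
from multiprocessing.pool import ThreadPool
from pathlib import Path
from urllib.request import urlretrieve


def fetch_one(url, dir, quiet=False):
    # Download a single URL into dir, skipping files that already exist
    f = Path(dir) / Path(url).name
    if not f.exists():
        if not quiet:
            print(f'Downloading {url} to {f}...')
        urlretrieve(url, f)
    return f


def fetch(urls, dir='.', threads=3):
    # Fan the URL list out over `threads` workers; downloads are I/O-bound, so threads overlap well
    Path(dir).mkdir(parents=True, exist_ok=True)
    if threads > 1:
        with ThreadPool(threads) as pool:
            return pool.starmap(fetch_one, [(u, dir, True) for u in urls])  # quiet per-file output
    return [fetch_one(u, dir) for u in urls]


# Hypothetical usage, mirroring the VisDrone change above
urls = ['https://example.com/a.zip', 'https://example.com/b.zip', 'https://example.com/c.zip']
# fetch(urls, dir='datasets', threads=4)

Suppressing per-file progress output in multi-threaded mode is the usual choice here: several bars interleaving on one terminal are unreadable, and a single log line per completed file is enough.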