From f527704cd32c42bc0bba9cce04601783b8563204 Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Fri, 18 Jun 2021 10:21:47 +0200 Subject: [PATCH] Cache v0.3: improved corrupt image/label reporting (#3676) * Cache v0.3: improved corrupt image/label reporting Fix for https://github.com/ultralytics/yolov5/issues/3656#issuecomment-863660899 * cleanup --- utils/datasets.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/utils/datasets.py b/utils/datasets.py index bcb8c36e0e64..f927abb20f5a 100755 --- a/utils/datasets.py +++ b/utils/datasets.py @@ -390,7 +390,7 @@ def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, r cache_path = (p if p.is_file() else Path(self.label_files[0]).parent).with_suffix('.cache') # cached labels if cache_path.is_file(): cache, exists = torch.load(cache_path), True # load - if cache['hash'] != get_hash(self.label_files + self.img_files): # changed + if cache['hash'] != get_hash(self.label_files + self.img_files) or cache['version'] != 0.3: cache, exists = self.cache_labels(cache_path, prefix), False # re-cache else: cache, exists = self.cache_labels(cache_path, prefix), False # cache @@ -400,11 +400,12 @@ def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, r if exists: d = f"Scanning '{cache_path}' images and labels... {nf} found, {nm} missing, {ne} empty, {nc} corrupted" tqdm(None, desc=prefix + d, total=n, initial=n) # display cache results + if cache['msgs']: + logging.info('\n'.join(cache['msgs'])) # display warnings assert nf > 0 or not augment, f'{prefix}No labels in {cache_path}. Can not train without labels. See {help_url}' # Read cache - cache.pop('hash') # remove hash - cache.pop('version') # remove version + [cache.pop(k) for k in ('hash', 'version', 'msgs')] # remove items labels, shapes, self.segments = zip(*cache.values()) self.labels = list(labels) self.shapes = np.array(shapes, dtype=np.float64) @@ -461,26 +462,31 @@ def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, r def cache_labels(self, path=Path('./labels.cache'), prefix=''): # Cache dataset labels, check images and read shapes x = {} # dict - nm, nf, ne, nc = 0, 0, 0, 0 # number missing, found, empty, corrupt + nm, nf, ne, nc, msgs = 0, 0, 0, 0, [] # number missing, found, empty, corrupt, messages desc = f"{prefix}Scanning '{path.parent / path.stem}' images and labels..." with Pool(num_threads) as pool: pbar = tqdm(pool.imap_unordered(verify_image_label, zip(self.img_files, self.label_files, repeat(prefix))), desc=desc, total=len(self.img_files)) - for im_file, l, shape, segments, nm_f, nf_f, ne_f, nc_f in pbar: + for im_file, l, shape, segments, nm_f, nf_f, ne_f, nc_f, msg in pbar: nm += nm_f nf += nf_f ne += ne_f nc += nc_f if im_file: x[im_file] = [l, shape, segments] + if msg: + msgs.append(msg) pbar.desc = f"{desc}{nf} found, {nm} missing, {ne} empty, {nc} corrupted" pbar.close() + if msgs: + logging.info('\n'.join(msgs)) if nf == 0: logging.info(f'{prefix}WARNING: No labels found in {path}. See {help_url}') x['hash'] = get_hash(self.label_files + self.img_files) x['results'] = nf, nm, ne, nc, len(self.img_files) - x['version'] = 0.2 # cache version + x['msgs'] = msgs # warnings + x['version'] = 0.3 # cache version try: torch.save(x, path) # save cache for next time logging.info(f'{prefix}New cache created: {path}') @@ -1084,11 +1090,11 @@ def verify_image_label(args): else: nm = 1 # label missing l = np.zeros((0, 5), dtype=np.float32) - return im_file, l, shape, segments, nm, nf, ne, nc + return im_file, l, shape, segments, nm, nf, ne, nc, '' except Exception as e: nc = 1 - logging.info(f'{prefix}WARNING: Ignoring corrupted image and/or label {im_file}: {e}') - return [None, None, None, None, nm, nf, ne, nc] + msg = f'{prefix}WARNING: Ignoring corrupted image and/or label {im_file}: {e}' + return [None, None, None, None, nm, nf, ne, nc, msg] def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False):