From eb49297b316f9eb127101203873a641ab2983b76 Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Thu, 26 Aug 2021 15:51:04 +0200 Subject: [PATCH] Auto-fix corrupt JPEGs (#4548) * Autofix corrupt JPEGs This PR automatically re-saves corrupt JPEGs and trains with the resaved images. WARNING: this will overwrite the existing corrupt JPEGs in a dataset and replace them with correct JPEGs, though the filesize may increase and the image contents may not be exactly the same due to lossy JPEG compression schemes. Results may vary by JPEG decoder and hardware. Current behavior is to exclude corrupt JPEGs from training with a warning to the user, but many users have been complaining about large parts of their dataset being excluded from training. * Clarify re-save reason --- utils/datasets.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/utils/datasets.py b/utils/datasets.py index 25a2ba6f9561..eea8ad348452 100755 --- a/utils/datasets.py +++ b/utils/datasets.py @@ -314,7 +314,7 @@ def __init__(self, sources='streams.txt', img_size=640, stride=32, auto=True): print('') # newline # check for common shapes - s = np.stack([letterbox(x, self.img_size, stride=self.stride, auto=self.auto)[0].shape for x in self.imgs], 0) # shapes + s = np.stack([letterbox(x, self.img_size, stride=self.stride, auto=self.auto)[0].shape for x in self.imgs]) self.rect = np.unique(s, axis=0).shape[0] == 1 # rect inference if all shapes equal if not self.rect: print('WARNING: Different stream shapes detected. For optimal performance supply similarly-shaped streams.') @@ -568,7 +568,7 @@ def __getitem__(self, index): if self.augment: # Albumentations img, labels = self.albumentations(img, labels) - nl = len(labels) # update after albumentations + nl = len(labels) # update after albumentations # HSV color-space augment_hsv(img, hgain=hyp['hsv_h'], sgain=hyp['hsv_s'], vgain=hyp['hsv_v']) @@ -861,7 +861,7 @@ def autosplit(path='../datasets/coco128/images', weights=(0.9, 0.1, 0.0), annota def verify_image_label(args): # Verify one image-label pair im_file, lb_file, prefix = args - nm, nf, ne, nc = 0, 0, 0, 0 # number missing, found, empty, corrupt + nm, nf, ne, nc, msg, segments = 0, 0, 0, 0, '', [] # number (missing, found, empty, corrupt), message, segments try: # verify images im = Image.open(im_file) @@ -872,10 +872,11 @@ def verify_image_label(args): if im.format.lower() in ('jpg', 'jpeg'): with open(im_file, 'rb') as f: f.seek(-2, 2) - assert f.read() == b'\xff\xd9', 'corrupted JPEG' + if f.read() != b'\xff\xd9': # corrupt JPEG + im.save(im_file, format='JPEG', subsampling=0, quality=100) # re-save image + msg = f'{prefix}WARNING: corrupt JPEG restored and saved {im_file}' # verify labels - segments = [] # instance segments if os.path.isfile(lb_file): nf = 1 # label found with open(lb_file, 'r') as f: @@ -896,7 +897,7 @@ def verify_image_label(args): else: nm = 1 # label missing l = np.zeros((0, 5), dtype=np.float32) - return im_file, l, shape, segments, nm, nf, ne, nc, '' + return im_file, l, shape, segments, nm, nf, ne, nc, msg except Exception as e: nc = 1 msg = f'{prefix}WARNING: Ignoring corrupted image and/or label {im_file}: {e}'