Skip to content

Commit

Permalink
Auto-fix corrupt JPEGs (#4548)
Browse files Browse the repository at this point in the history
* Autofix corrupt JPEGs

This PR automatically re-saves corrupt JPEGs and trains with the resaved images. WARNING: this will overwrite the existing corrupt JPEGs in a dataset and replace them with correct JPEGs, though the filesize may increase and the image contents may not be exactly the same due to lossy JPEG compression schemes. Results may vary by JPEG decoder and hardware.

Current behavior is to exclude corrupt JPEGs from training with a warning to the user, but many users have been complaining about large parts of their dataset being excluded from training.

* Clarify re-save reason
  • Loading branch information
glenn-jocher authored Aug 26, 2021
1 parent 2da6444 commit 11f85e7
Showing 1 changed file with 7 additions and 6 deletions.
13 changes: 7 additions & 6 deletions utils/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ def __init__(self, sources='streams.txt', img_size=640, stride=32, auto=True):
print('') # newline

# check for common shapes
s = np.stack([letterbox(x, self.img_size, stride=self.stride, auto=self.auto)[0].shape for x in self.imgs], 0) # shapes
s = np.stack([letterbox(x, self.img_size, stride=self.stride, auto=self.auto)[0].shape for x in self.imgs])
self.rect = np.unique(s, axis=0).shape[0] == 1 # rect inference if all shapes equal
if not self.rect:
print('WARNING: Different stream shapes detected. For optimal performance supply similarly-shaped streams.')
Expand Down Expand Up @@ -568,7 +568,7 @@ def __getitem__(self, index):
if self.augment:
# Albumentations
img, labels = self.albumentations(img, labels)
nl = len(labels) # update after albumentations
nl = len(labels) # update after albumentations

# HSV color-space
augment_hsv(img, hgain=hyp['hsv_h'], sgain=hyp['hsv_s'], vgain=hyp['hsv_v'])
Expand Down Expand Up @@ -861,7 +861,7 @@ def autosplit(path='../datasets/coco128/images', weights=(0.9, 0.1, 0.0), annota
def verify_image_label(args):
# Verify one image-label pair
im_file, lb_file, prefix = args
nm, nf, ne, nc = 0, 0, 0, 0 # number missing, found, empty, corrupt
nm, nf, ne, nc, msg, segments = 0, 0, 0, 0, '', [] # number (missing, found, empty, corrupt), message, segments
try:
# verify images
im = Image.open(im_file)
Expand All @@ -872,10 +872,11 @@ def verify_image_label(args):
if im.format.lower() in ('jpg', 'jpeg'):
with open(im_file, 'rb') as f:
f.seek(-2, 2)
assert f.read() == b'\xff\xd9', 'corrupted JPEG'
if f.read() != b'\xff\xd9': # corrupt JPEG
im.save(im_file, format='JPEG', subsampling=0, quality=100) # re-save image
msg = f'{prefix}WARNING: corrupt JPEG restored and saved {im_file}'

# verify labels
segments = [] # instance segments
if os.path.isfile(lb_file):
nf = 1 # label found
with open(lb_file, 'r') as f:
Expand All @@ -896,7 +897,7 @@ def verify_image_label(args):
else:
nm = 1 # label missing
l = np.zeros((0, 5), dtype=np.float32)
return im_file, l, shape, segments, nm, nf, ne, nc, ''
return im_file, l, shape, segments, nm, nf, ne, nc, msg
except Exception as e:
nc = 1
msg = f'{prefix}WARNING: Ignoring corrupted image and/or label {im_file}: {e}'
Expand Down

0 comments on commit 11f85e7

Please sign in to comment.