Skip to content

Commit

Permalink
Update dataset_stats() for zipped datasets (#3926)
Browse files Browse the repository at this point in the history
* Update `dataset_stats()` for zipped datasets

@kalenmike

* cleanup
  • Loading branch information
glenn-jocher authored Jul 8, 2021
1 parent 850970e commit 8c6f9e1
Showing 1 changed file with 17 additions and 3 deletions.
20 changes: 17 additions & 3 deletions utils/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -888,9 +888,11 @@ def verify_image_label(args):

def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False):
""" Return dataset statistics dictionary with images and instances counts per split per class
Usage: from utils.datasets import *; dataset_stats('coco128.yaml', verbose=True)
Usage1: from utils.datasets import *; dataset_stats('coco128.yaml', verbose=True)
Usage2: from utils.datasets import *; dataset_stats('../datasets/coco128.zip', verbose=True)
Arguments
path: Path to data.yaml
path: Path to data.yaml or data.zip (with data.yaml inside data.zip)
autodownload: Attempt to download dataset if not found locally
verbose: Print stats dictionary
"""
Expand All @@ -899,8 +901,20 @@ def round_labels(labels):
# Update labels to integer class and 6 decimal place floats
return [[int(c), *[round(x, 6) for x in points]] for c, *points in labels]

with open(check_file(path)) as f:
def unzip(path):
    # Unzip data.zip (if given one) and locate the dataset yaml inside it.
    # CONSTRAINT: path/to/abc.zip MUST unzip to 'path/to/abc/'
    # Arguments
    #   path: pathlib.Path pointing to data.yaml or data.zip
    # Returns
    #   (zipped, data_dir, yaml_path) where data_dir is None if path is not a zip
    # Raises
    #   zipfile.BadZipFile / OSError if the archive cannot be read or extracted
    #   FileNotFoundError if no *.yaml exists under the extracted dataset directory
    import zipfile  # local import so this helper does not depend on file-level imports

    if not str(path).endswith('.zip'):  # path is data.yaml
        return False, None, path

    # Extract in-process rather than via `assert os.system('unzip ...')`: the assert form is
    # stripped under `python -O` (nothing would be extracted), and the unquoted shell string
    # breaks on paths containing spaces or shell metacharacters.
    with zipfile.ZipFile(path) as zf:
        zf.extractall(path.parent)

    data_dir = path.with_suffix('')  # dataset directory
    yaml_path = next(data_dir.rglob('*.yaml'), None)  # first yaml found, as before
    if yaml_path is None:
        raise FileNotFoundError(f'No *.yaml found in {data_dir} after unzipping {path}')
    return True, data_dir, yaml_path  # zipped, data_dir, yaml_path

zipped, data_dir, yaml_path = unzip(Path(path))
with open(check_file(yaml_path)) as f:
data = yaml.safe_load(f) # data dict
if zipped:
data['path'] = data_dir # TODO: should this be dir.resolve()?
check_dataset(data, autodownload) # download dataset if missing
nc = data['nc'] # number of classes
stats = {'nc': nc, 'names': data['names']} # statistics dictionary
Expand Down

0 comments on commit 8c6f9e1

Please sign in to comment.