Merge pull request #289 from rleigh-codelibre/idr012-create-hdf
idr012: Add create-hdf script
Showing 1 changed file with 198 additions and 0 deletions.
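The new script scans the idr0012 per-well feature tables (*_ftrs.tab), infers a structured NumPy dtype that covers every column, and writes the data to idr012.h5 as one HDF5 group per plate with one gzip-compressed, chunked dataset per well.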
#!/usr/bin/env python3

from __future__ import print_function

import csv
import re
import sys
from pathlib import Path

import h5py
import numpy as np

idr012_location = Path('/uod/idr/filesets/idr0012-fuchs-cellmorph/downloaded/primary/data/www.ebi.ac.uk/huber-srv/cellmorph/data')

tsv_objects = {}
plates = set()

f = h5py.File("idr012.h5", "w")

for p in idr012_location.glob('*/*.tab'):
    r = re.search(r'^(HT\d+)([A-Z]\d+)_ftrs\.tab$', str(p.name))
    if r is None:
        # Skip any .tab file that does not follow the <plate><well>_ftrs.tab naming
        continue
    plate = r.group(1)
    well = r.group(2)
    print("%s, %s %s" % (p, plate, well))
    tsv_objects[(plate, well)] = p
    plates.add(plate)

for plate in sorted(plates):
    f.create_group(plate)
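# At this point tsv_objects maps (plate, well) -> feature-table path and the
# output file has one (still empty) group per plate; the per-well datasets are
# created further down once a common dtype has been determined.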


def determine_tsv_dtype(files):
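    """Scan every feature table and derive one NumPy structured dtype.

    Each column is classified as integer, float or string; integer widths
    are chosen from the observed min/max, float widths from the number of
    significant digits, and string widths from the longest encoded value.
    Integer columns containing blank cells are kept as text, since there is
    no integer representation for a missing value.
    """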
    fieldnames = None
    coltypes = []

    float_sig_digits = re.compile(r'^-?0*\.?0*(\d+)0*\.?0*(e-?\d+)?$')
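    # group(1) is a contiguous run of digits used as a rough significant-digit
    # count; values such as 1.23, whose digits span the decimal point, do not
    # match and simply leave the running 'sig-digits' count unchanged.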

    for file in files:
        print('Checking CSV dtypes: %s' % (file))
        with open(str(file)) as csvfile:
            reader = csv.DictReader(csvfile, delimiter='\t')
            if fieldnames is None:
                fieldnames = reader.fieldnames
                for col in range(len(reader.fieldnames)):
                    typeinfo = {}
                    colname = reader.fieldnames[col]
                    typeinfo['field'] = colname
                    typeinfo['column'] = col
                    typeinfo['stype'] = None
                    typeinfo['smax'] = 0
                    typeinfo['min'] = None
                    typeinfo['max'] = None
                    typeinfo['sig-digits'] = 0
                    typeinfo['contains-blank'] = False
                    coltypes.append(typeinfo)
            else:
                if fieldnames != reader.fieldnames:
                    raise Exception("Field name mismatch between CSV files")

            for row in reader:
                for col in range(len(reader.fieldnames)):
                    typeinfo = coltypes[col]
                    value = row[typeinfo['field']]

                    if value == '':
                        typeinfo['contains-blank'] = True
                        continue

                    pv = None
                    try:
                        pv = int(value)
                        if typeinfo['stype'] is None:
                            typeinfo['stype'] = 'i'
                    except ValueError:
                        try:
                            pv = float(value)
                            if typeinfo['stype'] is None or \
                               typeinfo['stype'] == 'i':
                                typeinfo['stype'] = 'f'
                        except ValueError:
                            typeinfo['stype'] = 's'
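
                    # Type promotion is one-way: a column starts as integer,
                    # widens to float on the first non-integer numeric value,
                    # and is pinned to string once any non-numeric value is seen.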

                    if typeinfo['stype'] == 'f':
                        sigmatch = float_sig_digits.match(value)
                        if sigmatch:
                            sig = len(sigmatch.group(1))
                            if sig is None:
                                sig = 0
                            if typeinfo['sig-digits'] is None:
                                typeinfo['sig-digits'] = sig
                            else:
                                typeinfo['sig-digits'] = max(
                                    typeinfo['sig-digits'], sig)

                    if pv is not None:
                        if typeinfo['min'] is None:
                            typeinfo['min'] = pv
                        else:
                            typeinfo['min'] = min(typeinfo['min'], pv)
                        if typeinfo['max'] is None:
                            typeinfo['max'] = pv
                        else:
                            typeinfo['max'] = max(typeinfo['max'], pv)

                    slen = len(value.encode('utf-8'))
                    typeinfo['smax'] = max(typeinfo['smax'], slen)

    for col in range(len(coltypes)):
        try:
            typeinfo = coltypes[col]
            if typeinfo['stype'] == 'i' and \
               typeinfo['contains-blank'] is True:
                # No way to represent null values, so retain as text
                typeinfo['dtype'] = (typeinfo['field'],
                                     'S' + str(typeinfo['smax']))
            elif typeinfo['stype'] == 'i':
                vmin = typeinfo['min']
                vmax = typeinfo['max']
                if vmin < 0:
                    if vmin >= -2**7+1 and vmax <= 2**7-1:
                        typeinfo['dtype'] = (typeinfo['field'], 'i1')
                    elif vmin >= -2**15+1 and vmax <= 2**15-1:
                        typeinfo['dtype'] = (typeinfo['field'], 'i2')
                    elif vmin >= -2**31+1 and vmax <= 2**31-1:
                        typeinfo['dtype'] = (typeinfo['field'], 'i4')
                    elif vmin >= -2**63+1 and vmax <= 2**63-1:
                        typeinfo['dtype'] = (typeinfo['field'], 'i8')
                    else:
                        raise Exception("Signed integer too large: " +
                                        "%s, %s/%s" %
                                        (typeinfo['field'],
                                         vmin, vmax))
                else:
                    if vmax <= 2**8-1:
                        typeinfo['dtype'] = (typeinfo['field'], 'u1')
                    elif vmax <= 2**16-1:
                        typeinfo['dtype'] = (typeinfo['field'], 'u2')
                    elif vmax <= 2**32-1:
                        typeinfo['dtype'] = (typeinfo['field'], 'u4')
                    elif vmax <= 2**64-1:
                        typeinfo['dtype'] = (typeinfo['field'], 'u8')
                    else:
                        raise Exception("Unsigned integer too large: " +
                                        "%s, %s/%s" %
                                        (typeinfo['field'],
                                         vmin, vmax))
            elif typeinfo['stype'] == 'f':
                sig = typeinfo['sig-digits']
                if sig <= 6:
                    typeinfo['dtype'] = (typeinfo['field'], 'f4')
                elif sig <= 15:
                    typeinfo['dtype'] = (typeinfo['field'], 'f8')
                else:
                    typeinfo['dtype'] = (typeinfo['field'], 'f16')
                    print("Float too large: %s, %s sig digits" %
                          (typeinfo['field'], sig), file=sys.stderr)
            else:
                typeinfo['dtype'] = (typeinfo['field'],
                                     'S' + str(typeinfo['smax']))
        except Exception as e:
            raise Exception("Invalid field %s" % (typeinfo['field'])) from e

    dtype = np.dtype([col['dtype'] for col in coltypes])

    return dtype


# Determine uniform dtype for all wells

dtype = determine_tsv_dtype(tsv_objects.values())
print(dtype)

# Save well data, grouped by plate

for plate, well in sorted(tsv_objects.keys()):
    g = f[plate]
    od = g.create_dataset(well, (0,), maxshape=(None,), dtype=dtype,
                          chunks=(128,), compression='gzip')
    print("Importing object data: %s %s" % (plate, well))
    with open(tsv_objects[(plate, well)], newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t')
        # skip header
        next(reader, None)
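
        # Rows for this well are accumulated in memory, then the dataset is
        # grown once and written in a single assignment below; blank cells in
        # float columns are stored as NaN.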

        ndata = []

        for row in reader:
            for col in range(len(dtype)):
                if row[col] == '' and dtype[col].kind == 'f':
                    row[col] = 'nan'
            drow = np.array(tuple(row), dtype=dtype)
            ndata.append(drow)

        od.resize((od.shape[0] + len(ndata),))

        print("Writing feature data")
        od[-len(ndata):] = ndata
        print("Done")