Skip to content

Commit

Permalink
Merge pull request #289 from rleigh-codelibre/idr012-create-hdf
Browse files Browse the repository at this point in the history
idr012: Add create-hdf script
  • Loading branch information
sbesson authored Oct 19, 2020
2 parents 8550167 + 2c4ce69 commit d031d5a
Showing 1 changed file with 198 additions and 0 deletions.
198 changes: 198 additions & 0 deletions idr0012-fuchs-cellmorph/create-hdf
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
#!/usr/bin/env python3

from __future__ import print_function

import csv
import re
import sys
from pathlib import Path

import h5py
import numpy as np

idr012_location = Path('/uod/idr/filesets/idr0012-fuchs-cellmorph/downloaded/primary/data/www.ebi.ac.uk/huber-srv/cellmorph/data')

tsv_objects = {}
plates = set()

f = h5py.File("idr012.h5", "w")

for p in (idr012_location.glob('*/*.tab')):
r = re.search('^(HT\d+)([A-Z]\d+)_ftrs.tab$', str(p.name))
plate = r.group(1)
well = r.group(2)
print("%s, %s %s" % (p, plate, well))
tsv_objects[(plate, well)] = p
plates.add(plate)

for plate in sorted(plates):
f.create_group(plate)


def determine_tsv_dtype(files):
fieldnames = None
coltypes = []

float_sig_digits = re.compile('^-?0*\.?0*(\d+)0*\.?0*(e-?\d+)?$')

for file in files:
print('Checking CSV dtypes: %s' % (file))
with open(str(file)) as csvfile:
reader = csv.DictReader(csvfile, delimiter='\t')
if fieldnames is None:
fieldnames = reader.fieldnames
for col in range(len(reader.fieldnames)):
typeinfo = {}
colname = reader.fieldnames[col]
typeinfo['field'] = colname
typeinfo['column'] = col
typeinfo['stype'] = None
typeinfo['smax'] = 0
typeinfo['min'] = None
typeinfo['max'] = None
typeinfo['sig-digits'] = 0
typeinfo['contains-blank'] = False
coltypes.append(typeinfo)
else:
if fieldnames != reader.fieldnames:
raise Exception("Field name mismatch between CSV files")

for row in reader:
for col in range(len(reader.fieldnames)):
typeinfo = coltypes[col]
value = row[typeinfo['field']]

if value == '':
typeinfo['contains-blank'] = True
continue

pv = None
try:
pv = int(value)
if typeinfo['stype'] is None:
typeinfo['stype'] = 'i'
except ValueError:
try:
pv = float(value)
if typeinfo['stype'] is None or \
typeinfo['stype'] == 'i':
typeinfo['stype'] = 'f'
except ValueError:
typeinfo['stype'] = 's'

if typeinfo['stype'] == 'f':
sigmatch = float_sig_digits.match(value)
if sigmatch:
sig = len(sigmatch.group(1))
if sig is None:
sig = 0
if typeinfo['sig-digits'] is None:
typeinfo['sig-digits'] = sig
else:
typeinfo['sig-digits'] = max(
typeinfo['sig-digits'], sig)

if pv is not None:
if typeinfo['min'] is None:
typeinfo['min'] = pv
else:
typeinfo['min'] = min(typeinfo['min'], pv)
if typeinfo['max'] is None:
typeinfo['max'] = pv
else:
typeinfo['max'] = max(typeinfo['max'], pv)

slen = len(value.encode('utf-8'))
typeinfo['smax'] = max(typeinfo['smax'], slen)

for col in range(len(reader.fieldnames)):
try:
typeinfo = coltypes[col]
if typeinfo['stype'] == 'i' and \
typeinfo['contains-blank'] is True:
# No way to represent null values, so retain as text
typeinfo['dtype'] = (typeinfo['field'],
'S' + str(typeinfo['smax']))
elif typeinfo['stype'] == 'i':
vmin = typeinfo['min']
vmax = typeinfo['max']
if vmin < 0:
if vmin >= -2**7+1 and vmax <= 2**7-1:
typeinfo['dtype'] = (typeinfo['field'], 'i1')
elif vmin >= -2**15+1 and vmax <= 2**15-1:
typeinfo['dtype'] = (typeinfo['field'], 'i2')
elif vmin >= -2**31+1 and vmax <= 2**31-1:
typeinfo['dtype'] = (typeinfo['field'], 'i4')
elif vmin >= -2**63+1 and vmax <= 2**63-1:
typeinfo['dtype'] = (typeinfo['field'], 'i8')
else:
raise Exception("Signed integer too large: " +
"%s, %s/%s" %
(typeinfo['field'],
vmin, vmax))
else:
if vmax <= 2**8-1:
typeinfo['dtype'] = (typeinfo['field'], 'u1')
elif vmax <= 2**16-1:
typeinfo['dtype'] = (typeinfo['field'], 'u2')
elif vmax <= 2**32-1:
typeinfo['dtype'] = (typeinfo['field'], 'u4')
elif vmax <= 2**64-1:
typeinfo['dtype'] = (typeinfo['field'], 'u8')
else:
raise Exception("Unsigned integer too large:" +
"%s, %s/%s" %
(typeinfo['field'],
vmin, vmax))
elif typeinfo['stype'] == 'f':
sig = typeinfo['sig-digits']
if sig <= 6:
typeinfo['dtype'] = (typeinfo['field'], 'f4')
elif sig <= 15:
typeinfo['dtype'] = (typeinfo['field'], 'f8')
else:
typeinfo['dtype'] = (typeinfo['field'], 'f16')
print("Float too large: %s, %s sig digits" %
(typeinfo['field'], sig), file=sys.stderr)
else:
typeinfo['dtype'] = (typeinfo['field'],
'S' + str(typeinfo['smax']))
except Exception as e:
raise Exception("Invalid field %s" % (typeinfo['field']))

dtype = np.dtype([col['dtype'] for col in coltypes])

return dtype


# Determine uniform dtype for all wells

dtype = determine_tsv_dtype(tsv_objects.values())
print(dtype)

# Save well data, grouped by plate

for plate, well in sorted(tsv_objects.keys()):
g = f[plate]
od = g.create_dataset(well, (0,), maxshape=(None,), dtype=dtype,
chunks=(128,), compression='gzip')
print("Importing object data: %s %s" % (plate, well))
with open(tsv_objects[(plate, well)], newline='') as csvfile:
reader = csv.reader(csvfile, delimiter='\t')
# skip header
next(reader, None)

ndata = []

for row in reader:
for col in range(len(dtype)):
if row[col] == '' and dtype[col].kind == 'f':
row[col] = 'nan'
drow = np.array(tuple(row), dtype=dtype)
ndata.append(drow)

od.resize((od.shape[0] + len(ndata),))

print("Writing feature data")
od[-len(ndata):] = ndata
print("Done")

0 comments on commit d031d5a

Please sign in to comment.