Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

idr012: Add create-hdf script #289

Merged
merged 1 commit into from
Oct 19, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
198 changes: 198 additions & 0 deletions idr0012-fuchs-cellmorph/create-hdf
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
#!/usr/bin/env python3

from __future__ import print_function

import csv
import re
import sys
from pathlib import Path

import h5py
import numpy as np

idr012_location = Path('/uod/idr/filesets/idr0012-fuchs-cellmorph/downloaded/primary/data/www.ebi.ac.uk/huber-srv/cellmorph/data')

tsv_objects = {}
plates = set()

f = h5py.File("idr012.h5", "w")

for p in (idr012_location.glob('*/*.tab')):
r = re.search('^(HT\d+)([A-Z]\d+)_ftrs.tab$', str(p.name))
plate = r.group(1)
well = r.group(2)
print("%s, %s %s" % (p, plate, well))
tsv_objects[(plate, well)] = p
plates.add(plate)

for plate in sorted(plates):
f.create_group(plate)


def determine_tsv_dtype(files):
fieldnames = None
coltypes = []

float_sig_digits = re.compile('^-?0*\.?0*(\d+)0*\.?0*(e-?\d+)?$')

for file in files:
print('Checking CSV dtypes: %s' % (file))
with open(str(file)) as csvfile:
reader = csv.DictReader(csvfile, delimiter='\t')
if fieldnames is None:
fieldnames = reader.fieldnames
for col in range(len(reader.fieldnames)):
typeinfo = {}
colname = reader.fieldnames[col]
typeinfo['field'] = colname
typeinfo['column'] = col
typeinfo['stype'] = None
typeinfo['smax'] = 0
typeinfo['min'] = None
typeinfo['max'] = None
typeinfo['sig-digits'] = 0
typeinfo['contains-blank'] = False
coltypes.append(typeinfo)
else:
if fieldnames != reader.fieldnames:
raise Exception("Field name mismatch between CSV files")

for row in reader:
for col in range(len(reader.fieldnames)):
typeinfo = coltypes[col]
value = row[typeinfo['field']]

if value == '':
typeinfo['contains-blank'] = True
continue

pv = None
try:
pv = int(value)
if typeinfo['stype'] is None:
typeinfo['stype'] = 'i'
except ValueError:
try:
pv = float(value)
if typeinfo['stype'] is None or \
typeinfo['stype'] == 'i':
typeinfo['stype'] = 'f'
except ValueError:
typeinfo['stype'] = 's'

if typeinfo['stype'] == 'f':
sigmatch = float_sig_digits.match(value)
if sigmatch:
sig = len(sigmatch.group(1))
if sig is None:
sig = 0
if typeinfo['sig-digits'] is None:
typeinfo['sig-digits'] = sig
else:
typeinfo['sig-digits'] = max(
typeinfo['sig-digits'], sig)

if pv is not None:
if typeinfo['min'] is None:
typeinfo['min'] = pv
else:
typeinfo['min'] = min(typeinfo['min'], pv)
if typeinfo['max'] is None:
typeinfo['max'] = pv
else:
typeinfo['max'] = max(typeinfo['max'], pv)

slen = len(value.encode('utf-8'))
typeinfo['smax'] = max(typeinfo['smax'], slen)

for col in range(len(reader.fieldnames)):
try:
typeinfo = coltypes[col]
if typeinfo['stype'] == 'i' and \
typeinfo['contains-blank'] is True:
# No way to represent null values, so retain as text
typeinfo['dtype'] = (typeinfo['field'],
'S' + str(typeinfo['smax']))
elif typeinfo['stype'] == 'i':
vmin = typeinfo['min']
vmax = typeinfo['max']
if vmin < 0:
if vmin >= -2**7+1 and vmax <= 2**7-1:
typeinfo['dtype'] = (typeinfo['field'], 'i1')
elif vmin >= -2**15+1 and vmax <= 2**15-1:
typeinfo['dtype'] = (typeinfo['field'], 'i2')
elif vmin >= -2**31+1 and vmax <= 2**31-1:
typeinfo['dtype'] = (typeinfo['field'], 'i4')
elif vmin >= -2**63+1 and vmax <= 2**63-1:
typeinfo['dtype'] = (typeinfo['field'], 'i8')
else:
raise Exception("Signed integer too large: " +
"%s, %s/%s" %
(typeinfo['field'],
vmin, vmax))
else:
if vmax <= 2**8-1:
typeinfo['dtype'] = (typeinfo['field'], 'u1')
elif vmax <= 2**16-1:
typeinfo['dtype'] = (typeinfo['field'], 'u2')
elif vmax <= 2**32-1:
typeinfo['dtype'] = (typeinfo['field'], 'u4')
elif vmax <= 2**64-1:
typeinfo['dtype'] = (typeinfo['field'], 'u8')
else:
raise Exception("Unsigned integer too large:" +
"%s, %s/%s" %
(typeinfo['field'],
vmin, vmax))
elif typeinfo['stype'] == 'f':
sig = typeinfo['sig-digits']
if sig <= 6:
typeinfo['dtype'] = (typeinfo['field'], 'f4')
elif sig <= 15:
typeinfo['dtype'] = (typeinfo['field'], 'f8')
else:
typeinfo['dtype'] = (typeinfo['field'], 'f16')
print("Float too large: %s, %s sig digits" %
(typeinfo['field'], sig), file=sys.stderr)
else:
typeinfo['dtype'] = (typeinfo['field'],
'S' + str(typeinfo['smax']))
except Exception as e:
raise Exception("Invalid field %s" % (typeinfo['field']))

dtype = np.dtype([col['dtype'] for col in coltypes])

return dtype


# Determine uniform dtype for all wells

dtype = determine_tsv_dtype(tsv_objects.values())
print(dtype)

# Save well data, grouped by plate

for plate, well in sorted(tsv_objects.keys()):
g = f[plate]
od = g.create_dataset(well, (0,), maxshape=(None,), dtype=dtype,
chunks=(128,), compression='gzip')
print("Importing object data: %s %s" % (plate, well))
with open(tsv_objects[(plate, well)], newline='') as csvfile:
reader = csv.reader(csvfile, delimiter='\t')
# skip header
next(reader, None)

ndata = []

for row in reader:
for col in range(len(dtype)):
if row[col] == '' and dtype[col].kind == 'f':
row[col] = 'nan'
drow = np.array(tuple(row), dtype=dtype)
ndata.append(drow)

od.resize((od.shape[0] + len(ndata),))

print("Writing feature data")
od[-len(ndata):] = ndata
print("Done")