Merge pull request #289 from rleigh-codelibre/idr012-create-hdf

idr012: Add create-hdf script
IDR · Oct 19, 2020 · d031d5a · d031d5a
2 parents 8550167 + 2c4ce69
commit d031d5a
Showing 1 changed file with 198 additions and 0 deletions.
diff --git a/idr0012-fuchs-cellmorph/create-hdf b/idr0012-fuchs-cellmorph/create-hdf
@@ -0,0 +1,198 @@
+#!/usr/bin/env python3
+
+from __future__ import print_function
+
+import csv
+import re
+import sys
+from pathlib import Path
+
+import h5py
+import numpy as np
+
+idr012_location = Path('/uod/idr/filesets/idr0012-fuchs-cellmorph/downloaded/primary/data/www.ebi.ac.uk/huber-srv/cellmorph/data')
+
+tsv_objects = {}
+plates = set()
+
+f = h5py.File("idr012.h5", "w")
+
+for p in (idr012_location.glob('*/*.tab')):
+    r = re.search('^(HT\d+)([A-Z]\d+)_ftrs.tab$', str(p.name))
+    plate = r.group(1)
+    well = r.group(2)
+    print("%s, %s %s" % (p, plate, well))
+    tsv_objects[(plate, well)] = p
+    plates.add(plate)
+
+for plate in sorted(plates):
+    f.create_group(plate)
+
+
+def determine_tsv_dtype(files):
+    fieldnames = None
+    coltypes = []
+
+    float_sig_digits = re.compile('^-?0*\.?0*(\d+)0*\.?0*(e-?\d+)?$')
+
+    for file in files:
+        print('Checking CSV dtypes: %s' % (file))
+        with open(str(file)) as csvfile:
+            reader = csv.DictReader(csvfile, delimiter='\t')
+            if fieldnames is None:
+                fieldnames = reader.fieldnames
+                for col in range(len(reader.fieldnames)):
+                    typeinfo = {}
+                    colname = reader.fieldnames[col]
+                    typeinfo['field'] = colname
+                    typeinfo['column'] = col
+                    typeinfo['stype'] = None
+                    typeinfo['smax'] = 0
+                    typeinfo['min'] = None
+                    typeinfo['max'] = None
+                    typeinfo['sig-digits'] = 0
+                    typeinfo['contains-blank'] = False
+                    coltypes.append(typeinfo)
+            else:
+                if fieldnames != reader.fieldnames:
+                    raise Exception("Field name mismatch between CSV files")
+
+            for row in reader:
+                for col in range(len(reader.fieldnames)):
+                    typeinfo = coltypes[col]
+                    value = row[typeinfo['field']]
+
+                    if value == '':
+                        typeinfo['contains-blank'] = True
+                        continue
+
+                    pv = None
+                    try:
+                        pv = int(value)
+                        if typeinfo['stype'] is None:
+                            typeinfo['stype'] = 'i'
+                    except ValueError:
+                        try:
+                            pv = float(value)
+                            if typeinfo['stype'] is None or \
+                               typeinfo['stype'] == 'i':
+                                typeinfo['stype'] = 'f'
+                        except ValueError:
+                            typeinfo['stype'] = 's'
+
+                    if typeinfo['stype'] == 'f':
+                        sigmatch = float_sig_digits.match(value)
+                        if sigmatch:
+                            sig = len(sigmatch.group(1))
+                            if sig is None:
+                                sig = 0
+                            if typeinfo['sig-digits'] is None:
+                                typeinfo['sig-digits'] = sig
+                            else:
+                                typeinfo['sig-digits'] = max(
+                                    typeinfo['sig-digits'], sig)
+
+                    if pv is not None:
+                        if typeinfo['min'] is None:
+                            typeinfo['min'] = pv
+                        else:
+                            typeinfo['min'] = min(typeinfo['min'], pv)
+                        if typeinfo['max'] is None:
+                            typeinfo['max'] = pv
+                        else:
+                            typeinfo['max'] = max(typeinfo['max'], pv)
+
+                    slen = len(value.encode('utf-8'))
+                    typeinfo['smax'] = max(typeinfo['smax'], slen)
+
+            for col in range(len(reader.fieldnames)):
+                try:
+                    typeinfo = coltypes[col]
+                    if typeinfo['stype'] == 'i' and \
+                       typeinfo['contains-blank'] is True:
+                        # No way to represent null values, so retain as text
+                        typeinfo['dtype'] = (typeinfo['field'],
+                                             'S' + str(typeinfo['smax']))
+                    elif typeinfo['stype'] == 'i':
+                        vmin = typeinfo['min']
+                        vmax = typeinfo['max']
+                        if vmin < 0:
+                            if vmin >= -2**7+1 and vmax <= 2**7-1:
+                                typeinfo['dtype'] = (typeinfo['field'], 'i1')
+                            elif vmin >= -2**15+1 and vmax <= 2**15-1:
+                                typeinfo['dtype'] = (typeinfo['field'], 'i2')
+                            elif vmin >= -2**31+1 and vmax <= 2**31-1:
+                                typeinfo['dtype'] = (typeinfo['field'], 'i4')
+                            elif vmin >= -2**63+1 and vmax <= 2**63-1:
+                                typeinfo['dtype'] = (typeinfo['field'], 'i8')
+                            else:
+                                raise Exception("Signed integer too large: " +
+                                                "%s, %s/%s" %
+                                                (typeinfo['field'],
+                                                 vmin, vmax))
+                        else:
+                            if vmax <= 2**8-1:
+                                typeinfo['dtype'] = (typeinfo['field'], 'u1')
+                            elif vmax <= 2**16-1:
+                                typeinfo['dtype'] = (typeinfo['field'], 'u2')
+                            elif vmax <= 2**32-1:
+                                typeinfo['dtype'] = (typeinfo['field'], 'u4')
+                            elif vmax <= 2**64-1:
+                                typeinfo['dtype'] = (typeinfo['field'], 'u8')
+                            else:
+                                raise Exception("Unsigned integer too large:" +
+                                                "%s, %s/%s" %
+                                                (typeinfo['field'],
+                                                 vmin, vmax))
+                    elif typeinfo['stype'] == 'f':
+                        sig = typeinfo['sig-digits']
+                        if sig <= 6:
+                            typeinfo['dtype'] = (typeinfo['field'], 'f4')
+                        elif sig <= 15:
+                            typeinfo['dtype'] = (typeinfo['field'], 'f8')
+                        else:
+                            typeinfo['dtype'] = (typeinfo['field'], 'f16')
+                            print("Float too large: %s, %s sig digits" %
+                                  (typeinfo['field'], sig), file=sys.stderr)
+                    else:
+                        typeinfo['dtype'] = (typeinfo['field'],
+                                             'S' + str(typeinfo['smax']))
+                except Exception as e:
+                    raise Exception("Invalid field %s" % (typeinfo['field']))
+
+    dtype = np.dtype([col['dtype'] for col in coltypes])
+
+    return dtype
+
+
+# Determine uniform dtype for all wells
+
+dtype = determine_tsv_dtype(tsv_objects.values())
+print(dtype)
+
+# Save well data, grouped by plate
+
+for plate, well in sorted(tsv_objects.keys()):
+    g = f[plate]
+    od = g.create_dataset(well, (0,), maxshape=(None,), dtype=dtype,
+                          chunks=(128,), compression='gzip')
+    print("Importing object data: %s %s" % (plate, well))
+    with open(tsv_objects[(plate, well)], newline='') as csvfile:
+        reader = csv.reader(csvfile, delimiter='\t')
+        # skip header
+        next(reader, None)
+
+        ndata = []
+
+        for row in reader:
+            for col in range(len(dtype)):
+                if row[col] == '' and dtype[col].kind == 'f':
+                    row[col] = 'nan'
+            drow = np.array(tuple(row), dtype=dtype)
+            ndata.append(drow)
+
+        od.resize((od.shape[0] + len(ndata),))
+
+        print("Writing feature data")
+        od[-len(ndata):] = ndata
+        print("Done")