-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathpippi_merge.py
156 lines (130 loc) · 4.99 KB
/
pippi_merge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#############################################################
# pippi: parse it, plot it
# ------------------------
# Merge program for pippi.
#
# Author: Pat Scott ([email protected])
# Originally developed: March 2012
#############################################################
from __future__ import print_function
from pippi_utils import *
def merge(filenames):
#Functions to merge multiple chains. If these are ascii files,
#spit them out to stdout (for e.g. piping into a file, or some
#other program). Basically just 'cat' with a little bit of
#error checking. If these are hdf5 files, then cat the matching
#records of the files and spit them out into the last argument.
if (len(filenames) == 0):
usage()
return
files = {}
dataset_collection = []
dataset_set_collection = []
# Work out whether we are doing an hdf5 merge or an ascii merge
try:
import h5py
f = h5py.File(filenames[0],'r')
h5merge = True
print()
print("Files identified as hdf5. Interpreting final argument as output filename.")
print()
print("Concatenating common datasets and outputting to {0}...".format(filenames[-1]))
print()
except:
h5merge = False
if any(x.endswith(".hdf5") for x in filenames) and not h5merge:
sys.exit("ERROR: Python package h5py not detected, but judging from the filenames you're trying to merge hdf5 files.")
if h5merge:
# We are doing an hdf5 merge.
# Try to open the output file (the last filename given)
try:
fout = h5py.File(filenames[-1],'w-')
except:
print("Could not create output file {0}!".format(filenames[-1]))
print("Please make sure it does not exist already.")
print()
return
print(" Determining common datasets...")
for fname in filenames[0:-1]:
print(" Opening: {0}".format(fname))
try:
f = h5py.File(fname,'r')
except:
print("Could not open file {0}!".format(fname))
print()
return
files[fname] = f
datasets = {}
# Identify the datasets in the file
get_datasets(f,datasets)
dataset_collection.append(datasets)
dataset_set_collection.append(set(datasets))
#Find all common datasets, ensuring they have the same types and shapes
common_datasets = set()
for x in set.intersection(*dataset_set_collection):
datatype = dataset_collection[0][x].dtype
datashape = dataset_collection[0][x].shape
if all(f[x].dtype == datatype and f[x].shape[1:] == datashape[1:] for f in dataset_collection):
common_datasets.add(x)
print()
print(" Common datasets: ")
for x in common_datasets: print(" {0}".format(x))
print()
#Find the length of each dataset and create it (empty) in the new file
print(" Creating empty datasets of required lengths in {0}...".format(filenames[-1]))
out_dsets = {}
for ds in common_datasets:
length = 0
for f in dataset_collection: length += f[ds].len()
datatype = dataset_collection[0][ds].dtype
datashape = (length,) + dataset_collection[0][ds].shape[1:]
out_dsets[ds] = fout.create_dataset(ds, datashape, dtype=datatype)
print()
#Copy the data over to the new file
print(" Adding data to empty datasets in {0}...".format(filenames[-1]))
for ds in common_datasets:
print(" Populating {0}".format(ds))
index_low = 0
for f in dataset_collection:
index_high = index_low + f[ds].len()
out_dsets[ds][index_low:index_high,...] = f[ds][...]
index_low = index_high
print()
print("Done.")
print()
else: # We are doing an ASCII merge
firstLine = True
for filename in filenames:
#Try to open input file
infile = safe_open(filename)
#Read the first line
line = infile.readline()
#Skip comments at the beginning of the file
while (line[0] == '#'):
line = infile.readline()
#Work out the number of columns in the first valid line
columns = len(line.split())
while (columns != 0):
try:
if (firstLine):
#Save the number of columns if this is the first line of the first chain
columnsPredicted = columns
firstLine = False
elif (columns != columnsPredicted):
#Crash if a later chain or line has a different number of columns to the first one
sys.exit('Error: chains do not match (number of columns differ). Quitting...')
#Output the current line to stdout and get the next one
print(line.rstrip('\n'))
#Read the next line
line = infile.readline()
#Work out the number of columns in the next line
columns = len(line.split())
except IOError:
break
#Shut the chain file and move on to the next
infile.close
def get_datasets(g,datasets):
import h5py
for name, item in g.items():
if isinstance(item,h5py.Group): get_datasets(item,datasets)
if isinstance(item,h5py.Dataset): datasets[item.name] = item