from warnings import warn
from pathlib import Path
from functools import partial

import numpy as np
import xarray
from natsort import natsorted


def _auto_open_mfboutdataset(datapath, chunks={}, info=True, keep_guards=True):
    filepaths, filetype = _expand_filepaths(datapath)

    # Open just one file to read processor splitting
    nxpe, nype, mxg, myg, mxsub, mysub = _read_splitting(filepaths[0], info)

    paths_grid, concat_dims = _arrange_for_concatenation(filepaths, nxpe, nype)

    _preprocess = partial(_trim, ghosts={'x': mxg, 'y': myg})

    ds = xarray.open_mfdataset(paths_grid, concat_dims=concat_dims,
                               data_vars='minimal', preprocess=_preprocess,
                               engine=filetype, chunks=chunks,
                               infer_order_from_coords=False)

    ds, metadata = _strip_metadata(ds)

    return ds, metadata
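
# Illustrative usage sketch (not part of the original module). The glob pattern
# and the variable name 'n' below are assumptions, shown only to clarify how
# this loader is intended to be called:
#
#     ds, metadata = _auto_open_mfboutdataset("data/BOUT.dmp.*.nc")
#     print(metadata)      # e.g. {'NXPE': 4, 'NYPE': 2, 'MXG': 2, 'MYG': 0, ...}
#     ds['n'].isel(t=-1)   # lazily-loaded, dask-backed DataArray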


def _expand_filepaths(datapath):
    """Determines the filetype and expands the given path into a sorted list
    of dump-file paths."""
    path = Path(datapath)

    filetype = _check_filetype(path)

    filepaths = _expand_wildcards(path)

    if not filepaths:
        raise IOError("No datafiles found matching datapath={}"
                      .format(datapath))

    if len(filepaths) > 128:
        warn("Trying to open a large number of files - setting xarray's"
             " `file_cache_maxsize` global option to {} to accommodate this. "
             "Recommend using `xr.set_options(file_cache_maxsize=NUM)`"
             " to explicitly set this to a large enough value."
             .format(str(len(filepaths))), UserWarning)
        xarray.set_options(file_cache_maxsize=len(filepaths))

    return filepaths, filetype


def _check_filetype(path):
    if path.suffix == '.nc':
        filetype = 'netcdf4'
    elif path.suffix == '.h5netcdf':
        filetype = 'h5netcdf'
    else:
        raise IOError(f"Do not know how to read file extension "
                      f"\"{path.suffix}\"")

    return filetype


def _expand_wildcards(path):
    """Return list of filepaths matching wildcard"""

    # Find first parent directory which does not contain a wildcard
    base_dir = next(parent for parent in path.parents if '*' not in str(parent))

    # Find path relative to parent
    search_pattern = str(path.relative_to(base_dir))

    # Search this relative path from the parent directory for all files
    # matching the user's input
    filepaths = list(base_dir.glob(search_pattern))

    # Sort by numbers in filepath before returning.
    # Use "natural" sort to avoid lexicographic ordering of numbers,
    # e.g. ['0', '1', '10', '11', ...]
    return natsorted(filepaths, key=lambda filepath: str(filepath))
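
# For example (assuming the standard BOUT++ dump-file naming convention),
# natural sorting returns
#     ['BOUT.dmp.0.nc', 'BOUT.dmp.1.nc', 'BOUT.dmp.2.nc', ..., 'BOUT.dmp.10.nc']
# rather than the lexicographic ordering
#     ['BOUT.dmp.0.nc', 'BOUT.dmp.1.nc', 'BOUT.dmp.10.nc', 'BOUT.dmp.2.nc', ...]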


def _read_splitting(filepath, info=True):
    ds = xarray.open_dataset(str(filepath))

    # TODO check that BOUT doesn't ever set the number of guards to be
    # different to the number of ghosts

    # Account for case of no parallelisation, when nxpe etc won't be in dataset
    def get_scalar(ds, key, default=1, info=True):
        if key in ds:
            return ds[key].values
        else:
            if info is True:
                print(f"{key} not found, setting to {default}")
            return default

    nxpe = get_scalar(ds, 'NXPE', default=1, info=info)
    nype = get_scalar(ds, 'NYPE', default=1, info=info)
    mxg = get_scalar(ds, 'MXG', default=2, info=info)
    myg = get_scalar(ds, 'MYG', default=0, info=info)
    mxsub = get_scalar(ds, 'MXSUB', default=ds.dims['x'] - 2 * mxg, info=info)
    mysub = get_scalar(ds, 'MYSUB', default=ds.dims['y'] - 2 * myg, info=info)

    # Avoid trying to open this file twice
    ds.close()

    return nxpe, nype, mxg, myg, mxsub, mysub
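
# For orientation (the values here are hypothetical): with NXPE=4 and NYPE=2
# the global grid is split across 4*2 = 8 dump files, and each file holds a
# local block of (MXSUB + 2*MXG) x (MYSUB + 2*MYG) points, i.e. its interior
# region plus guard/ghost cells on each side - which is why the defaults above
# are derived as ds.dims['x'] - 2*mxg and ds.dims['y'] - 2*myg.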


def _arrange_for_concatenation(filepaths, nxpe=1, nype=1):
    """
    Arrange filepaths into a nested list-of-lists which represents their
    ordering across different processors and consecutive simulation runs.

    Filepaths must be a sorted list. Uses the fact that BOUT's output files
    are named as num = nxpe*i + j, and assumes that any consecutive simulation
    runs are in directories which, when sorted, are in the correct order
    (e.g. /run0/*, /run1/*, ...).
    """

    nprocs = nxpe * nype
    n_runs = int(len(filepaths) / nprocs)
    if len(filepaths) % nprocs != 0:
        raise ValueError("Each run directory does not contain an equal number "
                         "of output files. If the parallelization scheme of "
                         "your simulation changed partway through, then please "
                         "load each directory separately and concatenate them "
                         "along the time dimension with xarray.concat().")

    # Create list of lists of filepaths, so that xarray knows how they should
    # be concatenated by xarray.open_mfdataset()
    # Only possible with this Pull Request to xarray
    # https://github.com/pydata/xarray/pull/2553
    paths = iter(filepaths)
    paths_grid = [[[next(paths) for x in range(nxpe)]
                   for y in range(nype)]
                  for t in range(n_runs)]

    # Dimensions along which no concatenation is needed are still present as
    # single-element lists, so we need to concatenate along dim=None for those
    concat_dims = [None, None, None]
    if len(filepaths) > nprocs:
        concat_dims[0] = 't'
    if nype > 1:
        concat_dims[1] = 'y'
    if nxpe > 1:
        concat_dims[2] = 'x'

    return paths_grid, concat_dims
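
# A small worked example (hypothetical filenames): for nxpe=2, nype=1 and two
# consecutive runs,
#     filepaths = ['run0/BOUT.dmp.0.nc', 'run0/BOUT.dmp.1.nc',
#                  'run1/BOUT.dmp.0.nc', 'run1/BOUT.dmp.1.nc']
# the function returns
#     paths_grid  = [[['run0/BOUT.dmp.0.nc', 'run0/BOUT.dmp.1.nc']],
#                    [['run1/BOUT.dmp.0.nc', 'run1/BOUT.dmp.1.nc']]]
#     concat_dims = ['t', None, 'x']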


def _trim(ds, ghosts={}, keep_guards=True):
    """
    Trims all ghost and guard cells off a single dataset read from a single
    BOUT dump file, to prepare for concatenation.

    Parameters
    ----------
    ghosts : dict, optional
        Number of ghost cells to trim from each end of each dimension,
        e.g. {'x': 2, 'y': 2}
    keep_guards : bool, optional
        If False, guard cells would also be trimmed; not yet implemented.
    """

    # TODO generalise this function to handle guard cells being optional
    if not keep_guards:
        raise NotImplementedError

    selection = {}
    for dim in ds.dims:
        if ghosts.get(dim, False):
            selection[dim] = slice(ghosts[dim], -ghosts[dim])

    trimmed_ds = ds.isel(**selection)
    return trimmed_ds
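
# For example, with ghosts={'x': 2, 'y': 2} every variable is sliced with
# slice(2, -2) along 'x' and 'y', while dimensions not listed in the dict
# (e.g. 't', 'z') are left untouched.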


def _strip_metadata(ds):
    """
    Extract the metadata (nxpe, myg etc.) from the Dataset.

    Assumes that all scalar variables are metadata, not physical data!
    """

    # Find only the scalar variables
    variables = list(ds.variables)
    scalar_vars = [var for var in variables
                   if not any(dim in ['t', 'x', 'y', 'z']
                              for dim in ds[var].dims)]

    # Save metadata as a dictionary
    metadata_vals = [np.asscalar(ds[var].values) for var in scalar_vars]
    metadata = dict(zip(scalar_vars, metadata_vals))

    return ds.drop(scalar_vars), metadata
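
# Roughly (illustrative values only): if the Dataset contains scalar variables
# such as NXPE, NYPE, MXG and MYG alongside the field data, then
#     ds, metadata = _strip_metadata(ds)
# returns the Dataset with those scalars dropped and
#     metadata == {'NXPE': 4, 'NYPE': 2, 'MXG': 2, 'MYG': 0, ...}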