-
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
WIP: indexing with broadcasting #1473
Changes from 7 commits
105bd64
23b4fe0
726ba5d
df7011f
f9232cb
17b6465
33c51d3
03a336f
d5af395
866de91
08e7444
84afc98
7b33269
50ea56e
bac0089
1206c28
c2747be
0671f39
ffccff1
becf539
1ae4b4c
1967bf5
c2eeff3
df12c04
5ba367d
d25c1f1
0115994
1b4e854
36d052f
9bd53ca
563cafa
1712060
884423a
c2e6f42
002eafa
eedfb3f
bb2e515
5983a67
7a5ff79
0b559bc
bad828e
a821a2b
6550880
464e711
7dd171d
e8f006b
a8ec82b
32749d4
31401d4
f42ddfd
8d96ad3
19f7204
a8f60ba
69f8570
5eb00b7
d133766
631f6e9
72587de
3231445
dd325c5
434a004
d518f7a
ba3cc88
f63f3d5
aa10635
fd73e82
6202aff
f9746fd
f78c932
1c027cd
d11829f
0777128
f580c99
4ebe852
20f5cb9
ab08af8
92dded6
a624424
a4cd724
f66c9b6
24309c4
fd698de
9cbaff9
a5c7766
f242166
bff18f0
21c11c4
1975f66
73ad94e
b49f813
1fd6b3a
46dd7c7
11f3e4f
1d3eddc
bcb25f1
7104964
173968b
addb91a
7ad7d36
91dd833
118a5d8
dc9f8a6
5726c89
523ecaa
a3a83db
765ae45
3deaf5c
c8c8a12
a16a04b
1b34cd4
24599a7
d5d967b
d0d6a6f
4f08e2e
fbbe35c
db23c93
031be9a
969f9cf
8608451
b4e5b36
9523039
dc60348
8a62ad9
6b96960
cb84154
9726531
caa79fe
170abc5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -371,7 +371,7 @@ def shape(self): | |
return tuple(shape) | ||
|
||
def __array__(self, dtype=None): | ||
array = orthogonally_indexable(self.array) | ||
array = broadcasted_indexable(self.array) | ||
return np.asarray(array[self.key], dtype=None) | ||
|
||
def __getitem__(self, key): | ||
|
@@ -434,7 +434,7 @@ def __setitem__(self, key, value): | |
self.array[key] = value | ||
|
||
|
||
def orthogonally_indexable(array): | ||
def broadcasted_indexable(array): | ||
if isinstance(array, np.ndarray): | ||
return NumpyIndexingAdapter(array) | ||
if isinstance(array, pd.Index): | ||
|
@@ -445,24 +445,10 @@ def orthogonally_indexable(array): | |
|
||
|
||
class NumpyIndexingAdapter(utils.NDArrayMixin): | ||
"""Wrap a NumPy array to use orthogonal indexing (array indexing | ||
accesses different dimensions independently, like netCDF4-python variables) | ||
"""Wrap a NumPy array to use broadcasted indexing | ||
""" | ||
# note: this object is somewhat similar to biggus.NumpyArrayAdapter in that | ||
# it implements orthogonal indexing, except it casts to a numpy array, | ||
# isn't lazy and supports writing values. | ||
def __init__(self, array): | ||
self.array = np.asarray(array) | ||
|
||
def __array__(self, dtype=None): | ||
return np.asarray(self.array, dtype=dtype) | ||
|
||
def _convert_key(self, key): | ||
key = expanded_indexer(key, self.ndim) | ||
if any(not isinstance(k, integer_types + (slice,)) for k in key): | ||
# key would trigger fancy indexing | ||
key = orthogonal_indexer(key, self.shape) | ||
return key | ||
self.array = array | ||
|
||
def _ensure_ndarray(self, value): | ||
# We always want the result of indexing to be a NumPy array. If it's | ||
|
@@ -474,29 +460,37 @@ def _ensure_ndarray(self, value): | |
return value | ||
|
||
def __getitem__(self, key): | ||
key = self._convert_key(key) | ||
return self._ensure_ndarray(self.array[key]) | ||
|
||
def __setitem__(self, key, value): | ||
key = self._convert_key(key) | ||
self.array[key] = value | ||
|
||
|
||
class DaskIndexingAdapter(utils.NDArrayMixin): | ||
"""Wrap a dask array to support orthogonal indexing | ||
"""Wrap a dask array to support broadcasted-indexing. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe "xarray-style" is better than "broadcasted" |
||
""" | ||
def __init__(self, array): | ||
self.array = array | ||
|
||
def __getitem__(self, key): | ||
key = expanded_indexer(key, self.ndim) | ||
if any(not isinstance(k, integer_types + (slice,)) for k in key): | ||
""" key: tuple of Variable, slice, integer """ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it should only be a NumPy or dask array here, not a Variable. |
||
# basic or orthogonal indexing | ||
if all(isinstance(k, (integer_types, slice)) or k.squeeze().ndim <= 1 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For simple cases where everything is an integers or slice, we want to just use a single indexing call like In the hard case when some arguments are arrays, we should try There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. After looking at dask.array in a little more detail, I think we need to keep a work-around for "orthogonal" indexing in dask. It looks like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See fujiisoup#1 |
||
for k in key): | ||
value = self.array | ||
for axis, subkey in reversed(list(enumerate(key))): | ||
if hasattr(subkey, 'squeeze'): | ||
subkey = subkey.squeeze() | ||
if subkey.ndim == 0: # make at least 1-d array | ||
subkey = subkey.flatten() | ||
value = value[(slice(None),) * axis + (subkey,)] | ||
return value | ||
else: | ||
value = self.array[key] | ||
return value | ||
# TODO Dask does not support nd-array indexing. | ||
# flatten() -> .vindex[] -> reshape() should be used | ||
# instead of `.load()` | ||
value = np.asarray(self.array)[key] | ||
return value | ||
|
||
|
||
class PandasIndexAdapter(utils.NDArrayMixin): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,7 +18,8 @@ | |
from . import utils | ||
from .pycompat import (basestring, OrderedDict, zip, integer_types, | ||
dask_array_type) | ||
from .indexing import (PandasIndexAdapter, orthogonally_indexable) | ||
from .indexing import (DaskIndexingAdapter, PandasIndexAdapter, | ||
broadcasted_indexable) | ||
|
||
import xarray as xr # only for Dataset and DataArray | ||
|
||
|
@@ -297,7 +298,7 @@ def data(self, data): | |
|
||
@property | ||
def _indexable_data(self): | ||
return orthogonally_indexable(self._data) | ||
return broadcasted_indexable(self._data) | ||
|
||
def load(self): | ||
"""Manually trigger loading of this variable's data from disk or a | ||
|
@@ -376,29 +377,89 @@ def _item_key_to_tuple(self, key): | |
else: | ||
return key | ||
|
||
def _broadcast_indexes(self, key): | ||
""" | ||
Parameters | ||
----------- | ||
key: One of | ||
array | ||
a mapping of dimension names to index. | ||
|
||
Returns | ||
------- | ||
dims: Tuple of strings. | ||
Dimension of the resultant variable. | ||
indexers: list of integer, array-like, or slice. This is aligned | ||
along self.dims. | ||
""" | ||
key = self._item_key_to_tuple(key) # key is a tuple | ||
# key is a tuple of full size | ||
key = indexing.expanded_indexer(key, self.ndim) | ||
basic_indexing_types = integer_types + (slice,) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's make this a module level constant instead, since we access it in a few places. |
||
if all([isinstance(k, basic_indexing_types) for k in key]): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Avoid the unneeded extra |
||
return self._broadcast_indexes_basic(key) | ||
else: | ||
return self._broadcast_indexes_advanced(key) | ||
|
||
def _broadcast_indexes_basic(self, key): | ||
dims = tuple(dim for k, dim in zip(key, self.dims) | ||
if not isinstance(k, integer_types)) | ||
return dims, key | ||
|
||
def nonzero(self): | ||
""" Equivalent numpy's nonzero but returns a tuple of Varibles. """ | ||
if isinstance(self._data, (np.ndarray, pd.Index, PandasIndexAdapter)): | ||
nonzeros = np.nonzero(self._data) | ||
elif isinstance(self._data, dask_array_type): | ||
# TODO we should replace dask's native nonzero | ||
# after https://github.com/dask/dask/issues/1076 is implemented. | ||
nonzeros = np.nonzero(self.load()._data) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would just do |
||
|
||
return tuple([as_variable(nz, name=dim) for nz, dim | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Construct the variable objects directly here rather than using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No need to include |
||
in zip(nonzeros, self.dims)]) | ||
|
||
def _isbool_type(self): | ||
""" Return if the variabe is bool or not """ | ||
if isinstance(self._data, (np.ndarray, PandasIndexAdapter, pd.Index)): | ||
return self._data.dtype is np.dtype('bool') | ||
elif isinstance(self._data, dask_array_type): | ||
raise NotImplementedError | ||
|
||
def _broadcast_indexes_advanced(self, key): | ||
variables = [] | ||
|
||
for dim, value in zip(self.dims, key): | ||
if isinstance(value, slice): | ||
value = np.arange(self.sizes[dim])[value] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use the
|
||
|
||
try: # TODO we need our own Exception. | ||
variable = as_variable(value, name=dim) | ||
except ValueError as e: | ||
if "cannot set variable" in str(e): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My current implementation for this exception handling is rather bad. except DimensionMismatchError:
raise IndexError('...') There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree. Can you switch this for Inherit from |
||
raise IndexError("Unlabelled multi-dimensional array " | ||
"cannot be used for indexing.") | ||
else: | ||
raise e | ||
if variable._isbool_type(): # boolean indexing case | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we make this just |
||
variables.extend(list(variable.nonzero())) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm a little hesitate to allow multi-dimensional boolean indexers here. The problem with using Instead, I would error for boolean indexers with more than one dimension, and then convert with Multi-dimensional boolean indexers are useful, but I think the main use-case is indexing with a single argument like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree. I add the sanity check for the boolean array. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You don't need |
||
else: | ||
variables.append(variable) | ||
variables = _broadcast_compat_variables(*variables) | ||
dims = variables[0].dims # all variables have the same dims | ||
key = tuple(variable.data for variable in variables) | ||
return dims, key | ||
|
||
def __getitem__(self, key): | ||
"""Return a new Array object whose contents are consistent with | ||
getting the provided key from the underlying data. | ||
|
||
NB. __getitem__ and __setitem__ implement "orthogonal indexing" like | ||
netCDF4-python, where the key can only include integers, slices | ||
(including `Ellipsis`) and 1d arrays, each of which are applied | ||
orthogonally along their respective dimensions. | ||
NB. __getitem__ and __setitem__ implement "diagonal indexing" like | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure I like the name "diagonal indexing". |
||
np.ndarray. | ||
|
||
The difference does not matter in most cases unless you are using | ||
numpy's "fancy indexing," which can otherwise result in data arrays | ||
whose shapes is inconsistent (or just uninterpretable with) with the | ||
variable's dimensions. | ||
|
||
If you really want to do indexing like `x[x > 0]`, manipulate the numpy | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we still need this around, until we support boolean indexing with multi-dimensional indexers. |
||
array `x.values` directly. | ||
This method will replace __getitem__ after we make sure its stability. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. delete |
||
""" | ||
key = self._item_key_to_tuple(key) | ||
key = indexing.expanded_indexer(key, self.ndim) | ||
dims = tuple(dim for k, dim in zip(key, self.dims) | ||
if not isinstance(k, integer_types)) | ||
values = self._indexable_data[key] | ||
# orthogonal indexing should ensure the dimensionality is consistent | ||
dims, index_tuple = self._broadcast_indexes(key) | ||
values = self._indexable_data[index_tuple] | ||
if hasattr(values, 'ndim'): | ||
assert values.ndim == len(dims), (values.ndim, len(dims)) | ||
else: | ||
|
@@ -412,15 +473,15 @@ def __setitem__(self, key, value): | |
|
||
See __getitem__ for more details. | ||
""" | ||
key = self._item_key_to_tuple(key) | ||
dims, index_tuple = self._broadcast_indexes(key) | ||
if isinstance(self._data, dask_array_type): | ||
raise TypeError("this variable's data is stored in a dask array, " | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is actually no longer true -- There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
'which does not support item assignment. To ' | ||
'assign to this variable, you must first load it ' | ||
'into memory explicitly using the .load_data() ' | ||
'method or accessing its .values attribute.') | ||
data = orthogonally_indexable(self._data) | ||
data[key] = value | ||
data = broadcasted_indexable(self._data) | ||
data[index_tuple] = value | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
If it's an |
||
|
||
@property | ||
def attrs(self): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
maybe
xarray_indexable
would be a better name here