Skip to content

Commit

Permalink
Merge pull request #367 from davidhassell/dask-get-filenames
Browse files Browse the repository at this point in the history
dask: `Data.get_filenames`
  • Loading branch information
davidhassell authored Apr 4, 2022
2 parents d4411e9 + 2752f1c commit e827ca4
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 19 deletions.
2 changes: 2 additions & 0 deletions cf/data/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from .abstract import FileArray

from .cachedarray import CachedArray
from .netcdfarray import NetCDFArray
from .umarray import UMArray
Expand Down
45 changes: 26 additions & 19 deletions cf/data/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
from ..mixin_container import Container
from ..units import Units
from . import ( # GatheredSubarray,; RaggedContiguousSubarray,; RaggedIndexedContiguousSubarray,; RaggedIndexedSubarray,
FileArray,
NetCDFArray,
UMArray,
)
Expand Down Expand Up @@ -9414,37 +9415,43 @@ def insert_dimension(self, position=0, inplace=False):

return d

@daskified(_DASKIFIED_VERBOSE)
def get_filenames(self):
    """Return the names of files containing parts of the data array.

    The file names are collected from the dask graph of the data:
    every graph key that has no dependencies is inspected, and if
    its value is a `FileArray` then that array's file name is
    included.

    :Returns:

        `set`
            The file names in normalized, absolute form. If the
            data is in memory then an empty `set` is returned.

    **Examples**

    >>> f = cf.NetCDFArray(TODODASK)
    >>> d = cf.Data(f)
    >>> d.get_filenames()
    {TODODASK}

    >>> d = cf.Data([1, 2, 3])
    >>> d.get_filenames()
    set()

    """
    out = set()

    # NOTE(review): `_get_dask` presumably returns the underlying
    # dask array, whose `dask` attribute is its high level graph --
    # confirm against the rest of the class.
    dx = self._get_dask()
    hlg = dx.dask

    # Flatten the graph once, outside the loop, so that each key can
    # be mapped directly to its value.
    dsk = hlg.to_dict()

    for key, dependencies in hlg.get_all_dependencies().items():
        if dependencies:
            # This key is derived from other keys, so it can not be
            # raw data.
            continue

        # This key has no dependencies, and so is raw data.
        a = dsk[key]
        if isinstance(a, FileArray):
            out.add(abspath(a.get_filename()))

    # Guard against a missing file name having been recorded as
    # None.
    out.discard(None)

    return out

@daskified(_DASKIFIED_VERBOSE)
Expand Down
4 changes: 4 additions & 0 deletions cf/test/test_Data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3922,6 +3922,10 @@ def test_Data_set_units(self):
with self.assertRaises(ValueError):
d.set_units("km")

# Placeholder for the `Data.get_filenames` test. It is skipped whenever
# TEST_DASKIFIED_ONLY is true, with the stated reason that an updated
# NetCDFArray is needed before the method can be tested.
@unittest.skipIf(TEST_DASKIFIED_ONLY, "Needs updated NetCDFArray to test")
def test_Data_get_filenames(self):
    # No assertions yet: the body is intentionally empty.
    pass

def test_Data_tolist(self):
for x in (1, [1, 2], [[1, 2], [3, 4]]):
d = cf.Data(x)
Expand Down

0 comments on commit e827ca4

Please sign in to comment.