Add sunpy style sample data downloading (#421)
* Add sunpy style sample data downloading

* Random cleanup: don't write files to working directory

* Attempt to override data download on RTD

* Fix a whoopsie

* Fix warning and error

* Add some very basic tests

* Change doc paths to match sample data

* rtd debug

* Feck you RTD

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Add changelog

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Cadair and pre-commit-ci[bot] authored Jul 23, 2024
1 parent e5fd787 commit 0f19a59
Showing 13 changed files with 202 additions and 20 deletions.
9 changes: 4 additions & 5 deletions .readthedocs.yaml
@@ -3,17 +3,16 @@ version: 2
build:
  os: ubuntu-22.04
  tools:
    python: "mambaforge-4.10"
    python: "mambaforge-latest"
  jobs:
    post_checkout:
      - git fetch --unshallow || true
    pre_install:
      - git update-index --assume-unchanged .rtd-environment.yml docs/conf.py
    pre_build:
      - parfive https://g-a36282.cd214.a567.data.globus.org/user_tools_tutorial_data/BKPLX_stokesI.tar https://g-a36282.cd214.a567.data.globus.org/user_tools_tutorial_data/AJQWW_single_mosaic.tar
      - mkdir -p $HOME/dkist_data/BKPLX $HOME/dkist_data/AJQWW
      - tar -xv -f BKPLX_stokesI.tar --directory $HOME/dkist_data/BKPLX
      - tar -xv -f AJQWW_single_mosaic.tar --directory $HOME/dkist_data/AJQWW
      # Note $DKIST_SAMPLE_DIR is set by RTD to ~/dkist_data
      - python -c "from dkist.data.sample import download_all_sample_data; download_all_sample_data()"
      - python -c "from dkist.data.sample import VISP_BKPLX; print(VISP_BKPLX)"

conda:
  environment: .rtd-environment.yml
1 change: 1 addition & 0 deletions changelog/421.trivial.rst
@@ -0,0 +1 @@
Added two partial datasets to `dkist.data.sample` for documentation and testing.
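
The changelog entry summarises the user-facing behaviour; a rough usage sketch follows (the names `VISP_BKPLX` and `download_all_sample_data` are taken from the diff below, and treating the returned values as `pathlib.Path` directories follows from `_get_sample_datasets`, but this is an illustration rather than documented API):

```python
# Sketch only: attribute access lazily downloads and unpacks the sample tarball
# via the module-level __getattr__ added in dkist/data/sample.py.
from dkist.data.sample import VISP_BKPLX, download_all_sample_data

# VISP_BKPLX resolves to the directory the tarball was extracted into,
# e.g. <sample data dir>/VISP_BKPLX.
print(VISP_BKPLX)

# Fetch every sample dataset that is not already on disk; returns a list of paths.
print(download_all_sample_data())
```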
18 changes: 17 additions & 1 deletion dkist/__init__.py
@@ -4,7 +4,10 @@
from importlib.metadata import PackageNotFoundError
from importlib.metadata import version as _version

import platformdirs as _platformdirs

from .logger import setup_default_dkist_logger as _setup_log
import dkist.config as _config

log = _setup_log(__name__)

@@ -14,7 +17,7 @@
    __version__ = "unknown"


__all__ = ["TiledDataset", "Dataset", "load_dataset", "system_info"]
__all__ = ["TiledDataset", "Dataset", "load_dataset", "system_info", "conf"]


def write_default_config(overwrite=False):
@@ -29,6 +32,19 @@ def write_default_config(overwrite=False):
    return _config.create_config_file("dkist", "dkist", overwrite=overwrite)


class Conf(_config.ConfigNamespace):
    """
    Configuration Parameters for the `dkist` Package.
    """
    sample_data_directory = _config.ConfigItem(
        _platformdirs.user_data_dir(appname="dkist"),
        "Location to download sample data to."
    )


conf = Conf()


# Do internal imports last (so logger etc is initialised)
from dkist.dataset import Dataset, TiledDataset, load_dataset
from dkist.utils.sysinfo import system_info
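
The `Conf`/`ConfigItem` pattern above matches the astropy configuration machinery; assuming `dkist.config` re-exports that API, the sample data location could be inspected or overridden roughly as below (the `set_temp` context manager is an astropy feature and an assumption here):

```python
import dkist

# Defaults to platformdirs.user_data_dir(appname="dkist"), per the ConfigItem above.
print(dkist.conf.sample_data_directory)

# Assuming astropy-style ConfigNamespace semantics, the location can be overridden
# for the current session, or temporarily with set_temp().
dkist.conf.sample_data_directory = "/data/dkist_sample"

with dkist.conf.set_temp("sample_data_directory", "/tmp/dkist_sample"):
    from dkist.data.sample import download_all_sample_data
    download_all_sample_data()
```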
110 changes: 110 additions & 0 deletions dkist/data/_sample.py
@@ -0,0 +1,110 @@
import os
import tarfile
from pathlib import Path
from urllib.parse import urljoin

from parfive import Downloader, Results

from astropy.io import fits

from dkist import conf

VISP_HEADER = fits.Header.fromtextfile(Path(__file__).parent / "VISP_HEADER.hdr")
_SAMPLE_DATASETS = {
    "VISP_BKPLX": ("https://g-a36282.cd214.a567.data.globus.org/user_tools_tutorial_data/", "BKPLX_stokesI.tar"),
    "VBI_AJQWW": ("https://g-a36282.cd214.a567.data.globus.org/user_tools_tutorial_data/", "AJQWW_single_mosaic.tar"),
}


def _download_and_extract_sample_data(names, overwrite, path):
    """
    Downloads a list of files.

    Parameters
    ----------
    names : list[str]
        The names of the datasets to download and extract
    overwrite : bool
        Will overwrite a file on disk if True.
    path : `pathlib.Path`
        The sample data path to save the tar files

    Returns
    -------
    `parfive.Results`
        Download results. Will behave like a list of files.
    """
    dl = Downloader(overwrite=overwrite, progress=True)

    existing_files = []

    for name in names:
        base_url, filename = _SAMPLE_DATASETS[name]
        if (filepath := path / filename).exists():
            existing_files.append(filepath)
            continue

        url = urljoin(base_url, filename)
        dl.enqueue_file(url, path=path)

    results = Results()
    if dl.queued_downloads:
        results = dl.download()
    results += existing_files

    file_folder = {filename: name for name, (_, filename) in _SAMPLE_DATASETS.items() if name in names}

    for i, tarpath in enumerate(results):
        output_path = path / file_folder[Path(tarpath).name]
        with tarfile.open(tarpath, "r:*") as tar:
            tar.extractall(path=output_path, filter="data")
        results[i] = output_path

    return results


def _get_sample_datasets(dataset_names, no_download=False, force_download=False):
    """
    Returns a list of disk locations corresponding to a list of filenames for
    sample data, downloading the sample data files as necessary.

    Parameters
    ----------
    no_download : `bool`
        If ``True``, do not download any files, even if they are not present.
        Default is ``False``.
    force_download : `bool`
        If ``True``, download all files, and overwrite any existing ones.
        Default is ``False``.

    Returns
    -------
    `list` of `pathlib.Path`
        List of disk locations corresponding to the list of filenames. An entry
        will be ``None`` if ``no_download == True`` and the file is not present.

    Raises
    ------
    RuntimeError
        Raised if any of the files cannot be downloaded from any of the mirrors.
    """
    sampledata_dir = Path(conf.sample_data_directory)
    if env_override := os.environ.get("DKIST_SAMPLE_DIR"):
        # For some reason, RTD adds ' around the path in the env var.
        sampledata_dir = Path(env_override.strip("'"))
    sampledata_dir = sampledata_dir.expanduser()

    datasets = dict((k,v) for k, v in _SAMPLE_DATASETS.items() if k in dataset_names) # noqa: C402
    download_paths = [sampledata_dir / fn for _, fn in datasets.values()]

    if no_download:
        return [sampledata_dir / name for name in datasets.keys() if (sampledata_dir / name).exists()]

    results = _download_and_extract_sample_data(datasets.keys(), overwrite=force_download, path=sampledata_dir)

    if results.errors:
        raise RuntimeError(
            f"{len(results.errors)} sample data files failed "
            "to download, the first error is above.") from results.errors[0].exception

    return list(results)
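
Directory resolution in `_get_sample_datasets` is: use `DKIST_SAMPLE_DIR` if set (stripping the stray quotes Read the Docs adds), otherwise fall back to `conf.sample_data_directory`, then expand `~`. A small sketch of driving that override from user code, mirroring what the RTD config above and the test fixture below do:

```python
import os
from pathlib import Path

# Point the sample-data machinery at an explicit directory before first access;
# this takes precedence over dkist.conf.sample_data_directory.
os.environ["DKIST_SAMPLE_DIR"] = str(Path.home() / "dkist_data")

from dkist.data.sample import VBI_AJQWW, VISP_BKPLX

# Each dataset is extracted into <sample dir>/<dataset name>,
# e.g. ~/dkist_data/VISP_BKPLX and ~/dkist_data/VBI_AJQWW.
print(VISP_BKPLX, VBI_AJQWW)
```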
28 changes: 23 additions & 5 deletions dkist/data/sample.py
@@ -1,9 +1,27 @@
import pathlib
"""
This module provides some (partial) sample datasets.
"""

from astropy.io import fits
from ._sample import _SAMPLE_DATASETS, VISP_HEADER, _get_sample_datasets

__all__ = ["VISP_HEADER"]
__all__ = ["download_all_sample_data", *sorted(_SAMPLE_DATASETS.keys()), "VISP_HEADER"]

_data_dir = pathlib.Path(__file__).parent

VISP_HEADER = fits.Header.fromtextfile(_data_dir / "VISP_HEADER.hdr")
# See PEP 562 (https://peps.python.org/pep-0562/) for module-level __dir__()
def __dir__():
    return __all__


# See PEP 562 (https://peps.python.org/pep-0562/) for module-level __getattr__()
def __getattr__(name):
    if name in _SAMPLE_DATASETS:
        return _get_sample_datasets(name)[0]

    raise AttributeError(f"module '{__name__}' has no attribute '{name}'")


def download_all_sample_data():
    """
    Download all sample data at once that has not already been downloaded.
    """
    return _get_sample_datasets(_SAMPLE_DATASETS.keys())
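
The lazy attributes above rely on PEP 562's module-level `__getattr__` and `__dir__`; here is a self-contained illustration of the idiom, using hypothetical `_REGISTRY` and `_fetch` names that are not part of the dkist API:

```python
# lazy_assets.py -- minimal PEP 562 sketch of the pattern used in dkist/data/sample.py
_REGISTRY = {"EXAMPLE_A": "a.tar", "EXAMPLE_B": "b.tar"}  # hypothetical entries

__all__ = ["fetch_all", *sorted(_REGISTRY)]


def _fetch(name):
    # Stand-in for _get_sample_datasets(): download/extract and return a path.
    return f"/path/to/{name}"


def __dir__():
    # Advertise the lazy attributes to dir() and tab completion.
    return __all__


def __getattr__(name):
    # Only called when normal module attribute lookup fails, i.e. on first access.
    if name in _REGISTRY:
        return _fetch(name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def fetch_all():
    return [_fetch(name) for name in _REGISTRY]
```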
Empty file added dkist/data/tests/__init__.py
38 changes: 38 additions & 0 deletions dkist/data/tests/test_sample.py
@@ -0,0 +1,38 @@
import os
from unittest.mock import call

import pytest


@pytest.fixture
def tmp_sample_dir(tmp_path):
    old_path = os.environ.get("DKIST_SAMPLE_DIR", "")
    os.environ["DKIST_SAMPLE_DIR"] = str(tmp_path)
    yield tmp_path
    os.environ["DKIST_SAMPLE_DIR"] = old_path


def test_module_dir():
    import dkist.data.sample

    assert "VBI_AJQWW" in dir(dkist.data.sample)
    assert "VISP_BKPLX" in dir(dkist.data.sample)


@pytest.mark.parametrize("attrname", ["VBI_AJQWW", "VISP_BKPLX"])
def test_module_getattr(mocker, attrname):
    mock = mocker.patch("dkist.data.sample._get_sample_datasets")
    import dkist.data.sample

    getattr(dkist.data.sample, attrname)

    mock.assert_has_calls([call(attrname), call().__getitem__(0)])


@pytest.mark.internet_off
def test_fail(tmp_sample_dir):
    """
    No remote data means this test should fail.
    """
    with pytest.raises(RuntimeError, match="1 sample data files failed"):
        from dkist.data.sample import VISP_BKPLX # noqa: F401
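
These tests rely on the `mocker` fixture from `pytest-mock`, and the `internet_off` marker is presumably supplied by the project's test plugins or conftest; a minimal way to run just this file, assuming a development install with those test dependencies available:

```python
import pytest

# Run only the new sample-data tests; assumes pytest-mock (for `mocker`) and
# whatever provides the `internet_off` marker are installed in the environment.
pytest.main(["-v", "dkist/data/tests/test_sample.py"])
```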
4 changes: 2 additions & 2 deletions dkist/tests/test_benchmarks.py
@@ -32,11 +32,11 @@ def test_pixel_to_world(benchmark, visp_dataset_no_headers):
@pytest.mark.parametrize("axes", [
    ["y", None, None, "x"],
])
def test_plot_dataset(benchmark, axes, visp_dataset_no_headers):
def test_plot_dataset(benchmark, axes, visp_dataset_no_headers, tmp_path):
    @benchmark
    def plot_and_save_fig(ds=visp_dataset_no_headers, axes=axes):
        ds.plot(plot_axes=axes)
        plt.savefig("tmpplot")
        plt.savefig(tmp_path / "tmpplot.png")
        plt.close()


2 changes: 1 addition & 1 deletion docs/tutorial/2_search_and_asdf_download.md
@@ -107,7 +107,7 @@ Note that you can also pass more than one result to be downloaded.
A simple example of both of these is:

```{code-cell} python
Fido.fetch(visp[:3], path="~/dkist_data/{instrument}/{dataset_id}/")
Fido.fetch(visp[:3], path="~/dkist_data/{instrument}_{dataset_id}/")
```

This will put each of our ASDF files in a directory named with the corresponding Dataset ID and Instrument.
2 changes: 1 addition & 1 deletion docs/tutorial/3_the_dataset.md
@@ -43,7 +43,7 @@ from sunpy.net import Fido, attrs as a

```{code-cell} ipython3
res = Fido.search(a.dkist.Dataset('BKPLX'))
files = Fido.fetch(res, path="~/dkist_data/{dataset_id}")
files = Fido.fetch(res, path="~/dkist_data/{instrument}_{dataset_id}")
files
```

2 changes: 1 addition & 1 deletion docs/tutorial/4_more_dataset.md
@@ -23,7 +23,7 @@ import dkist
import dkist.net
res = Fido.search(a.dkist.Dataset('BKPLX'))
files = Fido.fetch(res, path="~/dkist_data/{dataset_id}")
files = Fido.fetch(res, path="~/dkist_data/{instrument}_{dataset_id}")
ds = dkist.load_dataset(files)
```

4 changes: 2 additions & 2 deletions docs/tutorial/5_downloading_data.md
@@ -42,7 +42,7 @@ import dkist.net
from sunpy.net import Fido, attrs as a
res = Fido.search(a.dkist.Dataset('BKPLX'))
files = Fido.fetch(res, path="~/dkist_data/{dataset_id}")
files = Fido.fetch(res, path="~/dkist_data/{instrument}_{dataset_id}")
ds = dkist.load_dataset(files)
```

@@ -84,7 +84,7 @@ So for example:
```{code-cell} ipython3
:tags: [skip-execution]
ds[0, 0].files.download(path="~/dkist_data/{dataset_id}")
ds[0, 0].files.download(path="~/dkist_data/{instrument}_{dataset_id}")
```

would save the file to `~/dkist_data/VISP_BKPLX/VISP_2023_10_16T18_21_47_508_00630200_I_BKPLX_L1.fits`.
4 changes: 2 additions & 2 deletions docs/tutorial/6_visualization.md
@@ -29,7 +29,7 @@ import dkist.net

```{code-cell} ipython3
res = Fido.search(a.dkist.Dataset("BKPLX"))
asdf_file = Fido.fetch(res, path="~/dkist_data/{dataset_id}")
asdf_file = Fido.fetch(res, path="~/dkist_data/{instrument}_{dataset_id}")
ds = dkist.load_dataset(asdf_file)
```
@@ -106,7 +106,7 @@ For the next few examples we'll go back to using some VBI data.

```{code-cell} ipython3
res = Fido.search(a.dkist.Dataset("AJQWW"))
asdf_file = Fido.fetch(res, path="~/dkist_data/{dataset_id}")
asdf_file = Fido.fetch(res, path="~/dkist_data/{instrument}_{dataset_id}")
# We extract the top left tile of the VBI mosaic
ds = dkist.load_dataset(asdf_file)[0, 0]
