Merge pull request #450 from AllenInstitute/GH-448/data-access-page
GH-448 Add data access page
NileGraddis authored Jun 29, 2020
2 parents 066dfb0 + 4f3c086 commit ac92c1a
Showing 18 changed files with 22,519 additions and 11 deletions.
2 changes: 1 addition & 1 deletion MANIFEST.in
@@ -10,4 +10,4 @@ recursive-exclude * *.py[co]

recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif

-recursive-include ipfx *.json *.txt *.md
+recursive-include ipfx *.json *.txt *.md *.csv
56 changes: 56 additions & 0 deletions docs/download_data.rst
@@ -0,0 +1,56 @@
Data Access
===========

The electrophysiology data files for the PatchSeq experiments released by the
Allen Institute are stored in `Neurodata Without Borders 2.0 <https://nwb.org>`_ (NWB) format.
The files are hosted on the `Distributed Archives for Neurophysiology Data Integration (DANDI) <https://dandiarchive.org>`_.

The PatchSeq data release is composed of two archives:

Mouse data archive (114 GB): `<https://dandiarchive.org/dandiset/000020>`_

Human data archive (12 GB): `<https://dandiarchive.org/dandiset/000023>`_

Each archive is accompanied by a file manifest and an experiment metadata table.

The file manifest table describes the files included in the archive: each file's
location (``archive_uri`` column) and the corresponding cell (``cell_specimen_id`` column).
The manifest combines several data modalities recorded from each cell (see the
``technique`` column); the intracellular electrophysiological recordings stored on DANDI
are the rows with ``technique = intracellular_electrophysiology``.

In turn, the experiment metadata table describes the experimental conditions for each
cell (``specimen_id`` column). You can use this table to select cells satisfying
particular experimental conditions and then, given the selected specimen ids, look up
the DANDI URLs of their data files in the file manifest (see the example below).

IPFX includes a utility that provides the file manifest and experiment metadata of the published archives.

For example, to obtain detailed information about the Human data archive:

.. code-block:: python

    from ipfx.data_access import get_archive_info

    archive_url, file_manifest, experiment_metadata = get_archive_info(dataset="human")

Here ``archive_url`` is the DANDI URL for the Human data,
``file_manifest`` is a ``pandas.DataFrame`` of the file manifest, and
``experiment_metadata`` is a ``pandas.DataFrame`` of the experiment metadata.
To obtain the same information for the Mouse data, pass ``dataset="mouse"`` instead.
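
As a sketch of this select-then-locate workflow (the selection criterion below is
purely illustrative; the column names follow the manifest and metadata descriptions
above):

.. code-block:: python

    from ipfx.data_access import get_archive_info

    archive_url, file_manifest, experiment_metadata = get_archive_info(dataset="human")

    # Select cells of interest from the experiment metadata
    # (taking the first five specimen ids is purely illustrative)
    specimen_ids = experiment_metadata["specimen_id"].head(5)

    # Keep only the intracellular electrophysiology files for those cells
    ephys_files = file_manifest[
        (file_manifest["technique"] == "intracellular_electrophysiology")
        & (file_manifest["cell_specimen_id"].isin(specimen_ids))
    ]

    # DANDI URLs of the selected recordings
    print(ephys_files["archive_uri"].tolist())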

You can download data files by entering a file's ``archive_uri`` directly in your browser.
Alternatively, a more powerful option is to install DANDI's command line client:

.. code-block:: bash

    pip install dandi

With the client installed, you can download individual files or an entire archive:

.. code-block:: bash

    dandi download --output-dir <DIRECTORY> <URL>

where ``<DIRECTORY>`` is an existing directory on your file system
and ``<URL>`` is the URL of a file or an archive.
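
For example, a plausible invocation to download the Human archive (dandiset 000023,
linked above) into a local ``patchseq_human`` directory would be:

.. code-block:: bash

    mkdir -p patchseq_human
    dandi download --output-dir patchseq_human https://dandiarchive.org/dandiset/000023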

5 changes: 4 additions & 1 deletion docs/gallery/analysis_examples/all_analysis.py
@@ -9,7 +9,10 @@
from ipfx.dataset.create import create_ephys_data_set
from ipfx.data_set_features import extract_data_set_features
from ipfx.utilities import drop_failed_sweeps
-# Download and access the experimental data
+
+# Download and access the experimental data from the DANDI archive, following the instructions in the documentation
+# The example below uses an NWB file provided with the package
+
nwb_file = os.path.join(
os.path.dirname(os.getcwd()),
"data",
4 changes: 3 additions & 1 deletion docs/gallery/analysis_examples/lsq_analysis.py
@@ -16,7 +16,9 @@
import os
import matplotlib.pyplot as plt

-# Download and access the experimental data
+# Download and access the experimental data from the DANDI archive, following the instructions in the documentation
+# The example below uses an NWB file provided with the package
+
nwb_file = os.path.join(
os.path.dirname(os.getcwd()),
"data",
4 changes: 3 additions & 1 deletion docs/gallery/analysis_examples/ramp_analysis.py
@@ -17,7 +17,9 @@
)
from ipfx.stimulus_protocol_analysis import RampAnalysis

-# Download and access the experimental data
+# Download and access the experimental data from the DANDI archive, following the instructions in the documentation
+# The example below uses an NWB file provided with the package
+
nwb_file = os.path.join(
os.path.dirname(os.getcwd()),
"data",
4 changes: 3 additions & 1 deletion docs/gallery/analysis_examples/short_square_analysis.py
@@ -16,7 +16,9 @@
from ipfx.epochs import get_stim_epoch
from ipfx.utilities import drop_failed_sweeps

-# Download and access the experimental data
+# Download and access the experimental data from the DANDI archive, following the instructions in the documentation
+# The example below uses an NWB file provided with the package
+
nwb_file = os.path.join(
os.path.dirname(os.getcwd()),
"data",
4 changes: 3 additions & 1 deletion docs/gallery/qc_examples/sweep_qc.py
@@ -14,7 +14,9 @@
from ipfx.stimulus import StimulusOntology


-# Download and access the experimental data
+# Download and access the experimental data from the DANDI archive, following the instructions in the documentation
+# The example below uses an NWB file provided with the package
+
nwb_file = os.path.join(
os.path.dirname(os.getcwd()),
"data",
4 changes: 3 additions & 1 deletion docs/gallery/spikes_examples/detect_spikes_single_sweep.py
@@ -10,7 +10,9 @@
from ipfx.dataset.create import create_ephys_data_set
from ipfx.feature_extractor import SpikeFeatureExtractor

-# Download and access the experimental data
+# Download and access the experimental data from the DANDI archive, following the instructions in the documentation
+# The example below uses an NWB file provided with the package
+
nwb_file = os.path.join(
os.path.dirname(os.getcwd()),
"data",
@@ -10,7 +10,9 @@
from ipfx.dataset.create import create_ephys_data_set
from ipfx.feature_extractor import SpikeFeatureExtractor

-# Download and access the experimental data
+# Download and access the experimental data from the DANDI archive, following the instructions in the documentation
+# The example below uses an NWB file provided with the package
+
nwb_file = os.path.join(
os.path.dirname(os.getcwd()),
"data",
4 changes: 3 additions & 1 deletion docs/gallery/spikes_examples/estimate_params.py
@@ -12,7 +12,9 @@
from ipfx.utilities import drop_failed_sweeps
import matplotlib.pyplot as plt

-# Download and access the experimental data
+# Download and access the experimental data from the DANDI archive, following the instructions in the documentation
+# The example below uses an NWB file provided with the package
+
nwb_file = os.path.join(
os.path.dirname(os.getcwd()),
"data",
4 changes: 3 additions & 1 deletion docs/gallery/spikes_examples/spike_train_features.py
@@ -11,7 +11,9 @@
SpikeFeatureExtractor, SpikeTrainFeatureExtractor
)

-# Download and access the experimental data
+# Download and access the experimental data from the DANDI archive, following the instructions in the documentation
+# The example below uses an NWB file provided with the package
+
nwb_file = os.path.join(
os.path.dirname(os.getcwd()),
"data",
3 changes: 2 additions & 1 deletion docs/index.rst
@@ -5,12 +5,13 @@
installation
quick_start
tutorial
+download_data
stimuli
-authors
auto_examples/index
pipeline
API Documentation <ipfx>
Github <https://github.com/alleninstitute/ipfx>
+authors
Releases <https://github.com/alleninstitute/ipfx/releases>


2 changes: 2 additions & 0 deletions docs/tutorial.rst
@@ -90,6 +90,8 @@ all available features for a given dataset in one call.
IPFX supports datasets stored in `Neurodata Without Borders 2.0 <https://nwb.org>`_ (NWB) format
via a :py:class:`~ipfx.dataset.ephys_data_set.EphysDataSet` class, which provides a well-known interface to all of the data in an experiment.
The data released by the Allen Institute is hosted on the DANDI public archive in the NWB format.
+Refer to the :doc:`download_data` page for instructions on downloading the data files.
+
To create an instance of the :py:class:`~ipfx.dataset.ephys_data_set.EphysDataSet`:

.. code-block:: python
59 changes: 59 additions & 0 deletions ipfx/data_access.py
@@ -0,0 +1,59 @@
import pandas as pd
from typing import Tuple
import os

PARENT_DIR = os.path.dirname(__file__)
DATA_DIR = os.path.join(PARENT_DIR, "data_release")
ARCHIVE_INFO = pd.DataFrame(
    {
        "dataset": ["human", "mouse"],
        "size (GB)": [12, 114],
        "archive_url": [
            "https://dandiarchive.org/dandiset/000023",
            "https://dandiarchive.org/dandiset/000020",
        ],
        "file_manifest_path": [
            os.path.join(DATA_DIR, "2020-06-26_human_file_manifest.csv"),
            os.path.join(DATA_DIR, "2020-06-26_mouse_file_manifest.csv"),
        ],
        "experiment_metadata_path": [
            os.path.join(DATA_DIR, "20200625_patchseq_metadata_human.csv"),
            os.path.join(DATA_DIR, "20200625_patchseq_metadata_mouse.csv"),
        ],
    }
).set_index("dataset")


def get_archive_info(
    dataset: str,
    archive_info: pd.DataFrame = ARCHIVE_INFO
) -> Tuple[str, pd.DataFrame, pd.DataFrame]:
    """
    Provide information about a released archive.

    Parameters
    ----------
    dataset : name of the dataset to query. Currently supported options are:
        - human
        - mouse
    archive_info : dataframe of metadata and manifest files for each supported
        dataset. Dataset name is the index.

    Returns
    -------
    archive_url : DANDI URL of the archive
    file_manifest : pandas.DataFrame of the files included in the archive
    experiment_metadata : pandas.DataFrame of the experimental conditions for each cell
    """
    if dataset in archive_info.index.values:
        file_manifest_path = archive_info.at[dataset, "file_manifest_path"]
        metadata_path = archive_info.at[dataset, "experiment_metadata_path"]
        archive_url = archive_info.at[dataset, "archive_url"]
    else:
        raise ValueError(
            f"No archive for the dataset '{dataset}'. Choose from the known "
            f"datasets: {archive_info.index.values}"
        )

    file_manifest = pd.read_csv(file_manifest_path)
    experiment_metadata = pd.read_csv(metadata_path)

    return archive_url, file_manifest, experiment_metadata