merge with master; custom download scripts cloud awareness
edmondb committed Nov 20, 2024
1 parent 7bf651f commit 323f462
Showing 22 changed files with 2,323 additions and 126 deletions.
5 changes: 5 additions & 0 deletions README.md
@@ -111,6 +111,11 @@ The recommended way of setting your local data directory is to set the `SPEDAS_D

Mission-specific data directories (e.g., `MMS_DATA_DIR` for MMS, `THM_DATA_DIR` for THEMIS) can also be set, and these will override `SPEDAS_DATA_DIR`.

## Cloud Repositories

`SPEDAS_DATA_DIR` and mission-specific data directories can also be set to the URI of a cloud repository (e.g., an S3 bucket). If the data directory is set to a URI, files will be downloaded from the data server to the URI location. The data will then be streamed from the URI without needing to download the files locally.

To access the specified cloud repository, you must configure read and write permissions for it yourself. See [the AWS CLI configuration guide](https://docs.aws.amazon.com/cli/v1/userguide/cli-configure-files.html) for how to prepare your AWS configuration and credentials files.
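As an illustration (the bucket name below is a placeholder, and real use requires AWS credentials with read/write access to the bucket), pointing PySPEDAS at a cloud repository is just a matter of setting the environment variable to a URI before loading data:

```python
import os

# Illustrative only: "my-spedas-bucket" is a hypothetical bucket name, and
# actual use requires AWS credentials that can read and write to it.
os.environ["SPEDAS_DATA_DIR"] = "s3://my-spedas-bucket/pydata"

# Mission-specific overrides accept URIs the same way:
os.environ["MMS_DATA_DIR"] = "s3://my-spedas-bucket/pydata/mms"

print(os.environ["SPEDAS_DATA_DIR"])
```

Once set, subsequent load calls download to and stream from the bucket instead of the local disk.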

## Usage

6 changes: 6 additions & 0 deletions docs/source/getting_started.rst
@@ -31,6 +31,12 @@ By default, the data is stored in your pyspedas directory in a folder named 'pyd

Mission-specific data directories (e.g., **MMS_DATA_DIR** for MMS, **THM_DATA_DIR** for THEMIS) can also be set, and these will override **SPEDAS_DATA_DIR**.

Cloud Repositories
------------------------
**SPEDAS_DATA_DIR** and mission-specific data directories can also be set to the URI of a cloud repository (e.g., an S3 bucket). If the data directory is set to a URI, files will be downloaded from the data server to the URI location. The data will then be streamed from the URI without needing to download the files locally.

To access the specified cloud repository, you must configure read and write permissions for it yourself. Refer to the `AWS CLI configuration guide <https://docs.aws.amazon.com/cli/v1/userguide/cli-configure-files.html>`_ for how to prepare your AWS configuration and credentials files.
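For intuition, here is a rough stand-in for the URI check PySPEDAS performs on a data directory (this sketch is not the actual ``is_fsspec_uri`` implementation, just an approximation of the idea):

```python
def looks_like_uri(path: str) -> bool:
    """Rough stand-in for the real check: a data directory is treated as
    remote storage when it carries a protocol prefix such as s3://."""
    protocol, _, rest = path.partition("://")
    return bool(rest) and protocol not in ("", "file")

print(looks_like_uri("s3://my-bucket/pydata"))  # True
print(looks_like_uri("/home/user/pydata"))      # False
```

When the check succeeds, reads and writes go through the corresponding ``fsspec`` filesystem rather than the local OS path functions.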

Loading and Plotting Data
---------------------------
You can load data into tplot variables by calling pyspedas.mission.instrument(), e.g.,
3 changes: 2 additions & 1 deletion docs/source/projects.rst
@@ -8,7 +8,8 @@ Some key points that apply to most or all of these load routines:

* PySPEDAS maintains a cache of previously downloaded data. The cache location to use is controlled by the SPEDAS_DATA_DIR environment variable.
Many missions allow the user to set a data directory specific to that mission, overriding the global SPEDAS_DATA_DIR setting. For example,
THM_DATA_DIR can be used to specify the local directory to use for the THEMIS mission.
The cache location can be a local file directory or a URI location (e.g., an S3 bucket).

* By default, PySPEDAS contacts the data server to get a list of filenames to fulfill the request,
and compares the modification times on the server and locally cached files to determine
54 changes: 43 additions & 11 deletions pyspedas/projects/cluster/load_csa.py
```diff
@@ -22,6 +22,8 @@
 from typing import List
 from .config import CONFIG
 
+from pyspedas.utilities.download import is_fsspec_uri
+import fsspec
 
 def cl_master_datatypes():
     """Return list of data types."""
```
Expand Down Expand Up @@ -193,9 +195,17 @@ def load_csa(trange:List[str]=['2001-02-01', '2001-02-03'],
# Encode the url urllib.parse.quote
url = base_url + (query_string)

local_path = CONFIG['local_data_dir']
Path(local_path).mkdir(parents=True, exist_ok=True)
out_gz = os.path.join(local_path, 'temp_cluster_file.tar.gz') # Temp file name
local_path = CONFIG['local_data_dir'] # could be URI
if is_fsspec_uri(local_path):
local_protocol, lpath = local_path.split("://")
local_fs = fsspec.filesystem(local_protocol, anon=False)

out_gz = '/'.join([local_path, 'temp_cluster_file.tar.gz']) # Temp file name
fileobj = local_fs.open(out_gz, 'wb')
else:
Path(local_path).mkdir(parents=True, exist_ok=True)
out_gz = os.path.join(local_path, 'temp_cluster_file.tar.gz') # Temp file name
fileobj = open(out_gz, 'wb')

# Download the file.
logging.info("Downloading Cluster data, please wait....")
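The point of this branch is that both the local and the URI paths end by producing a writable file handle, so the download code can write the HTTP response through `fileobj` without caring where the bytes land. A minimal local-only sketch of that pattern (the temp directory and payload are illustrative; the URI branch would obtain the handle from `fsspec` instead):

```python
import os
import tempfile
from pathlib import Path

local_path = tempfile.mkdtemp()  # stands in for CONFIG['local_data_dir']
Path(local_path).mkdir(parents=True, exist_ok=True)
out_gz = os.path.join(local_path, 'temp_cluster_file.tar.gz')
fileobj = open(out_gz, 'wb')

# Later, the response body is written through the handle regardless of
# which branch created it:
with fileobj as w:
    w.write(b'...tar.gz bytes...')

print(os.path.exists(out_gz))  # True
```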
```diff
@@ -211,24 +221,46 @@
     logging.info("Download complete.")
 
     # Open the downloaded file.
-    with open(out_gz, 'wb') as w:
+    with fileobj as w:
         w.write(r.content)
 
     # Extract the tar archive.
-    tar = tarfile.open(out_gz, "r:gz")
-    f = tar.getnames()
-    if sys.version_info >= (3, 12):
-        tar.extractall(path=local_path, filter='fully_trusted')
+    if is_fsspec_uri(out_gz):
+        # Cloud-Awareness: Opens byte stream for tarfile package.
+        bo = local_fs.open(out_gz, "rb")
+        tar = tarfile.open(fileobj=bo)
     else:
-        tar.extractall(path=local_path)
+        tar = tarfile.open(out_gz, "r:gz")
+    f = tar.getnames()
+
+    for member in tar.getmembers():
+        if member.isfile():
+            p = '/'.join([local_path, member.path])
+            if is_fsspec_uri(p):
+                membo = local_fs.open(p, "wb")
+            else:
+                os.makedirs(str(Path(p).parent), exist_ok=True)
+                membo = open(p, "wb")
+
+            # Python > 3.9 requirement from setup.py
+            # note: data is written after file is read into memory
+            # https://stackoverflow.com/a/62247729
+            with tar.extractfile(member.path) as tarbo:
+                membo.write(tarbo.read())
+
+            membo.close()
     tar.close()
     # Remove the tar.gz file but keep the extracted.
-    os.remove(out_gz)
+    if is_fsspec_uri(out_gz):
+        local_fs.delete(out_gz)
+    else:
+        os.remove(out_gz)
 
     # Get unique set of files.
     f_set = set(f)
     # File list with full path.
-    out_files = [os.path.join(local_path, s) for s in list(f_set)]
+    sep = "/" if is_fsspec_uri(local_path) else os.path.sep
+    out_files = [sep.join([local_path, s]) for s in list(f_set)]
     out_files = sorted(out_files)
 
     if downloadonly:
```
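The member-by-member loop in this hunk replaces `tar.extractall` so each file can be written to either a local path or a cloud URI through the same code path. A self-contained, local-only sketch of that loop (the archive and file names are illustrative, built in memory for the example):

```python
import io
import os
import tarfile
import tempfile

local_path = tempfile.mkdtemp()  # stands in for the data directory

# Build a tiny in-memory tar.gz to extract from.
buf = io.BytesIO()
with tarfile.open(fileobj=buf, mode="w:gz") as t:
    data = b"hello"
    info = tarfile.TarInfo(name="CL_file.cdf")
    info.size = len(data)
    t.addfile(info, io.BytesIO(data))
buf.seek(0)

# Extract member by member, as the cloud-aware code does: each member is
# read fully into memory, then written out through a plain file handle
# (an fsspec handle would be used for a URI target instead).
tar = tarfile.open(fileobj=buf)
for member in tar.getmembers():
    if member.isfile():
        p = "/".join([local_path, member.path])
        os.makedirs(os.path.dirname(p), exist_ok=True)
        with open(p, "wb") as membo, tar.extractfile(member.path) as tarbo:
            membo.write(tarbo.read())
tar.close()

print(sorted(os.listdir(local_path)))  # ['CL_file.cdf']
```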