Ncpi index fix (#232)
* Renamed anvil to ncpi

* Update ncpi datasets catalog

* Modified script to download NCPI datasets into platform subfolders

* Updated NCPI integration dataset

* Removed unused variable

* Removed ncpi top level folder to spread results among subfolders

* Change output dir to data instead of ncpi subdir

* Moved NCPI subdirs into main data folder for ingest as per Yaphet's request

Co-authored-by: Alex Waldrop <[email protected]>
alexwaldrop and Alex Waldrop authored Jun 30, 2022
1 parent c68a991 commit d74a991
Showing 8 changed files with 202 additions and 494 deletions.
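The commit message above moves the downloaded dbGaP data dictionaries out of a single top-level folder and into per-platform subfolders directly under data/. As a minimal, illustrative sketch (not part of the change itself), a downstream consumer could walk that layout like this, assuming get_ncpi_data_dicts.py has already populated data/<Platform>/<phs accession> directories; the platform names below are taken from the tarballs added in this commit:

import os

# Illustrative only: count downloaded dbGaP studies per platform subfolder,
# assuming the script has already written data/<Platform>/<study accession>.
data_dir = "../data"                            # same relative path the script uses
platforms = ["AnVIL", "BDC", "CRDC", "KFDRC"]   # platforms seen in this commit's tarballs

for platform in platforms:
    platform_dir = os.path.join(data_dir, platform)
    if not os.path.isdir(platform_dir):
        continue
    studies = [entry for entry in os.listdir(platform_dir)
               if entry.startswith("phs") and os.path.isdir(os.path.join(platform_dir, entry))]
    print(f"{platform}: {len(studies)} studies downloaded")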
20 changes: 12 additions & 8 deletions bin/get_anvil_data_dicts.py → bin/get_ncpi_data_dicts.py
@@ -3,7 +3,7 @@
 # This script is used to generate the input to index Anvil Datasets on Dug
 # Parse, Download dbgap datasets currently hosted on Anvil Platform (tsv downloaded from https://anvilproject.org/data)
 # Output all datasets to an output tarball into the data directory to be indexed
-# NOTE: The anvil-dataset-catalog-results.tsv should be updated manually to ensure you sync all current Anvil datasets
+# NOTE: The ncpi-dataset-catalog-results.tsv should be updated manually to ensure you sync all current Anvil datasets

 #######

@@ -14,8 +14,8 @@

 # Hard-coded relative paths for the anvil catalog input file and output bolus
 # This obviously isn't very elegant but it'll do for now
-input_file = "../data/anvil-dataset-catalog-results.tsv"
-output_dir = "../data/anvil_dbgap_data_dicts/"
+input_file = "../data/ncpi-dataset-catalog-results.tsv"
+output_dir = "../data/"


 # Helper function
@@ -78,20 +78,24 @@ def main():
             header = True
             continue

+        # Get platform and make subdir if necessary
+        platform = row["Platform"].split(";")
+        platform = platform[0] if "BDC" not in platform else "BDC"
+
         # Add any phs dbgap studies to queue of files to get
         study_id = row["Study Accession"]
         if study_id.startswith("phs") and study_id not in studies:
             studies[study_id] = True
             try:
                 # Try to download to output folder if the study hasn't already been downloaded
-                if not os.path.exists(f"{output_dir}/{study_id}"):
+                if not os.path.exists(f"{output_dir}/{platform}/{study_id}"):
                     print(f"Downloading: {study_id}")
-                    if not download_dbgap_study(study_id, output_dir):
+                    if not download_dbgap_study(study_id, f"{output_dir}/{platform}"):
                         missing_data_dict_studies[study_id] = True

             except Exception as e:
                 # If anything happens, delete the folder so we don't mistake it for success
-                shutil.rmtree(f"{output_dir}/{study_id}")
+                shutil.rmtree(f"{output_dir}/{platform}/{study_id}")

     # Count the number subdir currently in output_dir as the number of downloaded
     num_downloaded = len([path for path in os.walk(output_dir) if path[0] != output_dir])
@@ -104,14 +108,14 @@

     # Write out list of datasets with no data dicts
     with open(f"{output_dir}/download_summary.txt", "w") as sum_file:
-        sum_file.write(f"Unique dbgap datasets in anvil table: {num_possible}\n")
+        sum_file.write(f"Unique dbgap datasets in ncpi table: {num_possible}\n")
         sum_file.write(f"Successfully Downloaded: {num_downloaded}\n")
         sum_file.write(f"Total dbgap datasests missing data dicts: {num_missing_data_dicts}\n")
         sum_file.write(f"Dbgap datasests missing data dicts:\n")
         for item in missing_data_dict_studies:
             sum_file.write(f"{item}\n")

-    print(f"Unique dbgap datasets in anvil table: {num_possible}\n")
+    print(f"Unique dbgap datasets in ncpi table: {num_possible}\n")
     print(f"Successfully Downloaded: {num_downloaded}\n")
     print(f"Total dbgap datasests missing data dicts: {num_missing_data_dicts}\n")

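To make the new platform-selection rule in the hunk above concrete, here is a standalone sketch of the same logic (not the script itself); the Platform column is split on ";" and BDC takes precedence whenever it is listed. The sample Platform values are hypothetical catalog entries.

# Standalone sketch of the platform-selection rule added in this commit.
def pick_platform(platform_field: str) -> str:
    platforms = platform_field.split(";")
    # Same rule as the diff: BDC wins whenever it appears, otherwise take the first entry
    return platforms[0] if "BDC" not in platforms else "BDC"

print(pick_platform("AnVIL"))       # AnVIL
print(pick_platform("AnVIL;BDC"))   # BDC
print(pick_platform("KFDRC;CRDC"))  # KFDRC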
Binary file added data/AnVIL.tar.gz
Binary file added data/BDC.tar.gz
Binary file added data/CRDC.tar.gz
Binary file added data/KFDRC.tar.gz
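The four tarballs above correspond to the per-platform subfolders the updated script writes under data/. The diff does not show how they were produced, so the following is only a plausible packaging sketch (an assumption, not the commit's actual build step), bundling each data/<Platform>/ folder into a matching data/<Platform>.tar.gz:

import os
import tarfile

# Assumption: each data/<Platform>/ folder produced by get_ncpi_data_dicts.py is
# packaged into a matching data/<Platform>.tar.gz like the four added in this commit.
data_dir = "../data"
for platform in ["AnVIL", "BDC", "CRDC", "KFDRC"]:
    platform_dir = os.path.join(data_dir, platform)
    if os.path.isdir(platform_dir):
        with tarfile.open(os.path.join(data_dir, f"{platform}.tar.gz"), "w:gz") as tar:
            tar.add(platform_dir, arcname=platform)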
486 changes: 0 additions & 486 deletions data/anvil-dataset-catalog-results.tsv

This file was deleted.

Binary file removed data/anvil_dbgap_data_dicts.tar.gz
190 changes: 190 additions & 0 deletions data/ncpi-dataset-catalog-results.tsv

Large diffs are not rendered by default.
