Ncpi index fix (#232)
* Renamed anvil to ncpi

* Update ncpi datasets catalog

* Modified script to download NCPI datasets into platform subfolders

* Updated NCPI integration dataset

* Removed unused variable

* Removed ncpi top level folder to spread results among subfolders

* Change output dir to data instead of ncpi subdir

* Moved NCPI subdirs into main data folder for ingest as per Yaphet's request

Co-authored-by: Alex Waldrop <[email protected]>
alexwaldrop and Alex Waldrop authored Jun 30, 2022
1 parent c68a991 commit d74a991
Showing 8 changed files with 202 additions and 494 deletions.
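The commit message above moves the downloaded dbGaP data dictionaries out of a single top-level folder and into per-platform subfolders directly under data/. As a minimal, illustrative sketch (not part of the change itself), a downstream consumer could walk that layout like this, assuming get_ncpi_data_dicts.py has already populated data/<Platform>/<phs accession> directories; the platform names below are taken from the tarballs added in this commit:

import os

# Illustrative only: count downloaded dbGaP studies per platform subfolder,
# assuming the script has already written data/<Platform>/<study accession>.
data_dir = "../data"                            # same relative path the script uses
platforms = ["AnVIL", "BDC", "CRDC", "KFDRC"]   # platforms seen in this commit's tarballs

for platform in platforms:
    platform_dir = os.path.join(data_dir, platform)
    if not os.path.isdir(platform_dir):
        continue
    studies = [entry for entry in os.listdir(platform_dir)
               if entry.startswith("phs") and os.path.isdir(os.path.join(platform_dir, entry))]
    print(f"{platform}: {len(studies)} studies downloaded")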
20 changes: 12 additions & 8 deletions bin/get_anvil_data_dicts.py → bin/get_ncpi_data_dicts.py
@@ -3,7 +3,7 @@
 # This script is used to generate the input to index Anvil Datasets on Dug
 # Parse, Download dbgap datasets currently hosted on Anvil Platform (tsv downloaded from https://anvilproject.org/data)
 # Output all datasets to an output tarball into the data directory to be indexed
-# NOTE: The anvil-dataset-catalog-results.tsv should be updated manually to ensure you sync all current Anvil datasets
+# NOTE: The ncpi-dataset-catalog-results.tsv should be updated manually to ensure you sync all current Anvil datasets

 #######

@@ -14,8 +14,8 @@

 # Hard-coded relative paths for the anvil catalog input file and output bolus
 # This obviously isn't very elegant but it'll do for now
-input_file = "../data/anvil-dataset-catalog-results.tsv"
-output_dir = "../data/anvil_dbgap_data_dicts/"
+input_file = "../data/ncpi-dataset-catalog-results.tsv"
+output_dir = "../data/"


 # Helper function
@@ -78,20 +78,24 @@ def main():
             header = True
             continue

+        # Get platform and make subdir if necessary
+        platform = row["Platform"].split(";")
+        platform = platform[0] if "BDC" not in platform else "BDC"
+
         # Add any phs dbgap studies to queue of files to get
         study_id = row["Study Accession"]
         if study_id.startswith("phs") and study_id not in studies:
             studies[study_id] = True
             try:
                 # Try to download to output folder if the study hasn't already been downloaded
-                if not os.path.exists(f"{output_dir}/{study_id}"):
+                if not os.path.exists(f"{output_dir}/{platform}/{study_id}"):
                     print(f"Downloading: {study_id}")
-                    if not download_dbgap_study(study_id, output_dir):
+                    if not download_dbgap_study(study_id, f"{output_dir}/{platform}"):
                         missing_data_dict_studies[study_id] = True

             except Exception as e:
                 # If anything happens, delete the folder so we don't mistake it for success
-                shutil.rmtree(f"{output_dir}/{study_id}")
+                shutil.rmtree(f"{output_dir}/{platform}/{study_id}")

     # Count the number subdir currently in output_dir as the number of downloaded
     num_downloaded = len([path for path in os.walk(output_dir) if path[0] != output_dir])
@@ -104,14 +108,14 @@

     # Write out list of datasets with no data dicts
     with open(f"{output_dir}/download_summary.txt", "w") as sum_file:
-        sum_file.write(f"Unique dbgap datasets in anvil table: {num_possible}\n")
+        sum_file.write(f"Unique dbgap datasets in ncpi table: {num_possible}\n")
         sum_file.write(f"Successfully Downloaded: {num_downloaded}\n")
         sum_file.write(f"Total dbgap datasests missing data dicts: {num_missing_data_dicts}\n")
         sum_file.write(f"Dbgap datasests missing data dicts:\n")
         for item in missing_data_dict_studies:
             sum_file.write(f"{item}\n")

-    print(f"Unique dbgap datasets in anvil table: {num_possible}\n")
+    print(f"Unique dbgap datasets in ncpi table: {num_possible}\n")
     print(f"Successfully Downloaded: {num_downloaded}\n")
     print(f"Total dbgap datasests missing data dicts: {num_missing_data_dicts}\n")

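To make the new platform-selection rule in the hunk above concrete, here is a standalone sketch of the same logic (not the script itself); the Platform column is split on ";" and BDC takes precedence whenever it is listed. The sample Platform values are hypothetical catalog entries.

# Standalone sketch of the platform-selection rule added in this commit.
def pick_platform(platform_field: str) -> str:
    platforms = platform_field.split(";")
    # Same rule as the diff: BDC wins whenever it appears, otherwise take the first entry
    return platforms[0] if "BDC" not in platforms else "BDC"

print(pick_platform("AnVIL"))       # AnVIL
print(pick_platform("AnVIL;BDC"))   # BDC
print(pick_platform("KFDRC;CRDC"))  # KFDRC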
Binary file added data/AnVIL.tar.gz
Binary file added data/BDC.tar.gz
Binary file added data/CRDC.tar.gz
Binary file added data/KFDRC.tar.gz
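The four tarballs above correspond to the per-platform subfolders the updated script writes under data/. The diff does not show how they were produced, so the following is only a plausible packaging sketch (an assumption, not the commit's actual build step), bundling each data/<Platform>/ folder into a matching data/<Platform>.tar.gz:

import os
import tarfile

# Assumption: each data/<Platform>/ folder produced by get_ncpi_data_dicts.py is
# packaged into a matching data/<Platform>.tar.gz like the four added in this commit.
data_dir = "../data"
for platform in ["AnVIL", "BDC", "CRDC", "KFDRC"]:
    platform_dir = os.path.join(data_dir, platform)
    if os.path.isdir(platform_dir):
        with tarfile.open(os.path.join(data_dir, f"{platform}.tar.gz"), "w:gz") as tar:
            tar.add(platform_dir, arcname=platform)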
486 changes: 0 additions & 486 deletions data/anvil-dataset-catalog-results.tsv

This file was deleted.

Binary file removed data/anvil_dbgap_data_dicts.tar.gz
190 changes: 190 additions & 0 deletions data/ncpi-dataset-catalog-results.tsv

Large diffs are not rendered by default.
