Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add BDC dbGaP IDs #285

Merged
merged 8 commits into from
May 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 21 additions & 8 deletions bin/get_dbgap_data_dicts.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@
from ftplib import FTP, error_perm
import csv
import click
import requests
from urllib.parse import urljoin

# Default to logging at the INFO level.
logging.basicConfig(level=logging.DEBUG)
logging.basicConfig(level=logging.INFO)

# Helper function
def download_dbgap_study(dbgap_accession_id, dbgap_output_dir):
Expand All @@ -39,13 +41,14 @@ def download_dbgap_study(dbgap_accession_id, dbgap_output_dir):
# Step 1: First we try and get all the data_dict files
try:
ftp.cwd(f"{study_id_path}/pheno_variable_summaries")
except error_perm:
except error_perm as e1:
logging.warning(f"Exception {e1} thrown when trying to access {study_id_path}/pheno_variable_summaries on the dbGaP FTP server.")
# Delete subdirectory so we don't think it's full
shutil.rmtree(local_path)
try:
files_in_dir = ftp.nlst(study_id_path)
except:
logging.error(f"dbGaP study accession identifier not found in dbGaP data: {dbgap_accession_id}")
except error_perm as e2:
logging.error(f"dbGaP study accession identifier not found on dbGaP server ({e2}): {study_id_path}")
return 0

logging.warning(f"No data dictionaries available for study {dbgap_accession_id}: {files_in_dir}")
Expand All @@ -55,8 +58,18 @@ def download_dbgap_study(dbgap_accession_id, dbgap_output_dir):
for ftp_filename in ftp_filelist:
if 'data_dict' in ftp_filename:
with open(f"{local_path}/{ftp_filename}", "wb") as data_dict_file:
ftp.retrbinary(f"RETR {ftp_filename}", data_dict_file.write)
logging.debug(f"Downloaded {ftp_filename} to {local_path}/{ftp_filename}")
logging.debug(f"Downloading {ftp_filename} to {local_path}/{ftp_filename}")

# ftp.retrbinary() seems to cause this program to crash.
# Luckily, dbGaP is also available on HTTP!
filename_url = f"https://ftp.ncbi.nlm.nih.gov/{study_id_path}/pheno_variable_summaries/{ftp_filename}"
response = requests.get(filename_url)
if not response.ok:
logging.error(f"Could not download {filename_url}: {response}")
continue

data_dict_file.write(response.content)
logging.info(f"Downloaded {ftp_filename} to {local_path}/{ftp_filename} in {response.elapsed.microseconds} microseconds.")
count_downloaded_vars += 1

# Step 2: Check to see if there's a GapExchange file in the parent folder
Expand All @@ -67,7 +80,7 @@ def download_dbgap_study(dbgap_accession_id, dbgap_output_dir):
if 'GapExchange' in ftp_filename:
with open(f"{local_path}/{ftp_filename}", "wb") as data_dict_file:
ftp.retrbinary(f"RETR {ftp_filename}", data_dict_file.write)
logging.debug(f"Downloaded {ftp_filename} to {local_path}/{ftp_filename}")
logging.info(f"Downloaded {ftp_filename} to {local_path}/{ftp_filename}")
ftp.quit()
return count_downloaded_vars

Expand Down Expand Up @@ -151,7 +164,7 @@ def get_dbgap_data_dicts(input_file, format, field, outdir, group_by, skip):
# TODO: this skip logic was added to deal with phs000285.v3.p2 and phs000007.v32.p13, which doesn't work
# for some reason.
if dbgap_id in dbgap_ids_to_skip:
logging.info(f"Skipping dbGaP ID {dbgap_id}")
logging.info(f"Skipping dbGaP accession {dbgap_id}")
continue

dbgap_dir = os.path.join(output_dir_for_row, dbgap_id)
Expand Down
12 changes: 12 additions & 0 deletions data/bdc_dbgap_download.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
#
# Download the data dictionaries from dbGaP as listed in bdc_dbgap_ids.csv
# into the bdc_dbgap_data_dicts/ directory.
#
# All paths below are relative to the current working directory, so run this
# script from the directory that contains bdc_dbgap_ids.csv.
#

# Abort on the first failing command, on use of an unset variable, or on a
# failure anywhere in a pipeline, so the downloader never runs after a
# failed setup step.
set -euo pipefail

CSV_FILE=bdc_dbgap_ids.csv
OUTPUT_DIR=bdc_dbgap_data_dicts
SCRIPT=../bin/get_dbgap_data_dicts.py

# Quote every expansion so paths containing spaces or glob characters are
# passed through as single arguments.
mkdir -p "$OUTPUT_DIR"

# Read the input as CSV using its "Accession" field, and pass
# phs000571.v6.p2 to --skip so that accession is not downloaded.
python "$SCRIPT" "$CSV_FILE" --format CSV --field Accession --outdir "$OUTPUT_DIR" --skip phs000571.v6.p2
Loading