From e1e31204d304d05cafd22fe0f02f72dd161e64bb Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 13 Nov 2023 17:09:18 -0500 Subject: [PATCH 1/2] Improved PharmGKB download and made it work. --- src/datahandlers/pharmgkb.py | 36 ++++++++++++++++++++++++++++ src/snakefiles/datacollect.snakefile | 14 +++++++++++ 2 files changed, 50 insertions(+) create mode 100644 src/datahandlers/pharmgkb.py diff --git a/src/datahandlers/pharmgkb.py b/src/datahandlers/pharmgkb.py new file mode 100644 index 00000000..50105c1a --- /dev/null +++ b/src/datahandlers/pharmgkb.py @@ -0,0 +1,36 @@ +from os import path +from zipfile import ZipFile +import requests +from pathlib import Path + +from src.prefixes import GTOPDB +from src.babel_utils import pull_via_urllib + + +def download_and_unzip(url, filepath): + """ + Download the `.zip` file at a URL to a location, then uncompress it into a subfolder of the same folder named after the zip file (e.g. `genes.zip` is extracted into `genes/`). + + :param url: The URL of the `.zip` file to download, e.g. https://api.pharmgkb.org/v1/download/file/data/genes.zip + :param filepath: The location to write the downloaded `.zip` file to. + :raises requests.HTTPError: If the server responds with an error status. + """ + with requests.get(url, stream=True) as r: + r.raise_for_status() + with open(filepath, 'wb') as f: + for chunk in r.iter_content(chunk_size=8192): + f.write(chunk) + bname = Path(filepath).stem + ddir = path.dirname(filepath) + with ZipFile(filepath, 'r') as zipObj: + zipObj.extractall(path.join(ddir, bname)) + + +def pull_pharmgkb(genes_zip_filepath, chemicals_zip_filepath, drugs_zip_filepath): + """ + Download the PharmaGKB files we need. 
+ """ + download_and_unzip('https://api.pharmgkb.org/v1/download/file/data/genes.zip', genes_zip_filepath) + download_and_unzip('https://api.pharmgkb.org/v1/download/file/data/chemicals.zip', chemicals_zip_filepath) + download_and_unzip('https://api.pharmgkb.org/v1/download/file/data/drugs.zip', drugs_zip_filepath) + diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile index f9b32534..760e4d17 100644 --- a/src/snakefiles/datacollect.snakefile +++ b/src/snakefiles/datacollect.snakefile @@ -29,6 +29,7 @@ import src.datahandlers.chebi as chebi import src.datahandlers.hgncfamily as hgncfamily import src.datahandlers.pantherfamily as pantherfamily import src.datahandlers.complexportal as complexportal +import src.datahandlers.pharmgkb as pharmgkb import src.prefixes as prefixes @@ -553,3 +554,16 @@ rule get_chebi: config['download_directory'] + '/CHEBI/database_accession.tsv', run: chebi.pull_chebi() + +# PharmGKB + +rule get_pharmgkb: + output: + genes_zip_filepath = config['download_directory'] + '/PharmGKB/genes.zip', + genes_tsv_filepath = config['download_directory'] + '/PharmGKB/genes/genes.tsv', + chemicals_zip_filepath = config['download_directory'] + '/PharmGKB/chemicals.zip', + chemicals_tsv_filepath = config['download_directory'] + '/PharmGKB/chemicals/chemicals.tsv', + drugs_zip_filepath = config['download_directory'] + '/PharmGKB/drugs.zip', + drugs_tsv_filepath = config['download_directory'] + '/PharmGKB/drugs/drugs.tsv', + run: + pharmgkb.pull_pharmgkb(output.genes_zip_filepath, output.chemicals_zip_filepath, output.drugs_zip_filepath) \ No newline at end of file From 273715a61bc267ba9e88f7e0b032ee419fb281c3 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 13 Nov 2023 17:16:17 -0500 Subject: [PATCH 2/2] Added phenotypes and variants as downloads. 
--- src/datahandlers/pharmgkb.py | 4 +++- src/snakefiles/datacollect.snakefile | 14 ++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/datahandlers/pharmgkb.py b/src/datahandlers/pharmgkb.py index 50105c1a..e440473d 100644 --- a/src/datahandlers/pharmgkb.py +++ b/src/datahandlers/pharmgkb.py @@ -26,11 +26,13 @@ def download_and_unzip(url, filepath): zipObj.extractall(path.join(ddir, bname)) -def pull_pharmgkb(genes_zip_filepath, chemicals_zip_filepath, drugs_zip_filepath): +def pull_pharmgkb(genes_zip_filepath, chemicals_zip_filepath, drugs_zip_filepath, phenotypes_zip_filepath, variants_zip_filepath): """ Download the PharmaGKB files we need. """ download_and_unzip('https://api.pharmgkb.org/v1/download/file/data/genes.zip', genes_zip_filepath) download_and_unzip('https://api.pharmgkb.org/v1/download/file/data/chemicals.zip', chemicals_zip_filepath) download_and_unzip('https://api.pharmgkb.org/v1/download/file/data/drugs.zip', drugs_zip_filepath) + download_and_unzip('https://api.pharmgkb.org/v1/download/file/data/phenotypes.zip', phenotypes_zip_filepath) + download_and_unzip('https://api.pharmgkb.org/v1/download/file/data/variants.zip', variants_zip_filepath) diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile index 760e4d17..34811719 100644 --- a/src/snakefiles/datacollect.snakefile +++ b/src/snakefiles/datacollect.snakefile @@ -565,5 +565,15 @@ rule get_pharmgkb: chemicals_tsv_filepath = config['download_directory'] + '/PharmGKB/chemicals/chemicals.tsv', drugs_zip_filepath = config['download_directory'] + '/PharmGKB/drugs.zip', drugs_tsv_filepath = config['download_directory'] + '/PharmGKB/drugs/drugs.tsv', - run: - pharmgkb.pull_pharmgkb(output.genes_zip_filepath, output.chemicals_zip_filepath, output.drugs_zip_filepath) \ No newline at end of file + phenotypes_zip_filepath = config['download_directory'] + '/PharmGKB/phenotypes.zip', + phenotypes_tsv_filepath = config['download_directory'] + 
'/PharmGKB/phenotypes/phenotypes.tsv', + variants_zip_filepath = config['download_directory'] + '/PharmGKB/variants.zip', + variants_tsv_filepath = config['download_directory'] + '/PharmGKB/variants/variants.tsv', + run: + pharmgkb.pull_pharmgkb( + output.genes_zip_filepath, + output.chemicals_zip_filepath, + output.drugs_zip_filepath, + output.phenotypes_zip_filepath, + output.variants_zip_filepath, + ) \ No newline at end of file