diff --git a/src/datahandlers/pharmgkb.py b/src/datahandlers/pharmgkb.py
new file mode 100644
index 00000000..e440473d
--- /dev/null
+++ b/src/datahandlers/pharmgkb.py
@@ -0,0 +1,59 @@
+from os import path
+from zipfile import ZipFile
+import requests
+from pathlib import Path
+
+from src.prefixes import GTOPDB
+from src.babel_utils import pull_via_urllib
+
+# Base URL of the PharmGKB download API; every archive is fetched from this URL plus its file name.
+PHARMGKB_DOWNLOAD_URL = 'https://api.pharmgkb.org/v1/download/file/data/'
+
+
+def download_and_unzip(url, filepath, timeout=60):
+    """
+    Download a `.zip` file to a location, then uncompress it in the same folder.
+
+    The archive is extracted into a directory next to the downloaded file, named after
+    the file without its extension (e.g. `genes.zip` is extracted into `genes/`).
+
+    :param url: The full URL to download, e.g. https://api.pharmgkb.org/v1/download/file/data/genes.zip
+    :param filepath: The location to write the downloaded `.zip` file to.
+    :param timeout: Seconds to wait on the server before giving up (passed to `requests.get`).
+    :raises requests.HTTPError: If the server responds with an error status.
+    """
+    # Create the destination directory if needed; open() below fails if it is missing.
+    Path(filepath).parent.mkdir(parents=True, exist_ok=True)
+    # Stream the response to disk in chunks rather than loading it into memory. The
+    # timeout matters: by default requests will wait indefinitely on a stalled server.
+    with requests.get(url, stream=True, timeout=timeout) as r:
+        r.raise_for_status()
+        with open(filepath, 'wb') as f:
+            for chunk in r.iter_content(chunk_size=8192):
+                f.write(chunk)
+    bname = Path(filepath).stem
+    ddir = path.dirname(filepath)
+    with ZipFile(filepath, 'r') as zipObj:
+        zipObj.extractall(path.join(ddir, bname))
+
+
+def pull_pharmgkb(genes_zip_filepath, chemicals_zip_filepath, drugs_zip_filepath, phenotypes_zip_filepath, variants_zip_filepath):
+    """
+    Download the PharmGKB files we need, unzipping each one next to where it is saved.
+
+    :param genes_zip_filepath: The location to write `genes.zip` to.
+    :param chemicals_zip_filepath: The location to write `chemicals.zip` to.
+    :param drugs_zip_filepath: The location to write `drugs.zip` to.
+    :param phenotypes_zip_filepath: The location to write `phenotypes.zip` to.
+    :param variants_zip_filepath: The location to write `variants.zip` to.
+    """
+    # Map each PharmGKB archive name to the path it should be saved at.
+    downloads = {
+        'genes.zip': genes_zip_filepath,
+        'chemicals.zip': chemicals_zip_filepath,
+        'drugs.zip': drugs_zip_filepath,
+        'phenotypes.zip': phenotypes_zip_filepath,
+        'variants.zip': variants_zip_filepath,
+    }
+    for filename, filepath in downloads.items():
+        download_and_unzip(PHARMGKB_DOWNLOAD_URL + filename, filepath)
diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile
index f9b32534..34811719 100644
--- a/src/snakefiles/datacollect.snakefile
+++ b/src/snakefiles/datacollect.snakefile
@@ -29,6 +29,7 @@ import src.datahandlers.chebi as chebi
 import src.datahandlers.hgncfamily as hgncfamily
 import src.datahandlers.pantherfamily as pantherfamily
 import src.datahandlers.complexportal as complexportal
+import src.datahandlers.pharmgkb as pharmgkb
 import src.prefixes as prefixes
 
 
@@ -553,3 +554,28 @@ rule get_chebi:
         config['download_directory'] + '/CHEBI/database_accession.tsv',
     run:
         chebi.pull_chebi()
+
+# PharmGKB
+# Each .zip is downloaded and then extracted into a sibling directory named after it;
+# the .tsv outputs below are expected to come from that extraction.
+
+rule get_pharmgkb:
+    output:
+        genes_zip_filepath = config['download_directory'] + '/PharmGKB/genes.zip',
+        genes_tsv_filepath = config['download_directory'] + '/PharmGKB/genes/genes.tsv',
+        chemicals_zip_filepath = config['download_directory'] + '/PharmGKB/chemicals.zip',
+        chemicals_tsv_filepath = config['download_directory'] + '/PharmGKB/chemicals/chemicals.tsv',
+        drugs_zip_filepath = config['download_directory'] + '/PharmGKB/drugs.zip',
+        drugs_tsv_filepath = config['download_directory'] + '/PharmGKB/drugs/drugs.tsv',
+        phenotypes_zip_filepath = config['download_directory'] + '/PharmGKB/phenotypes.zip',
+        phenotypes_tsv_filepath = config['download_directory'] + '/PharmGKB/phenotypes/phenotypes.tsv',
+        variants_zip_filepath = config['download_directory'] + '/PharmGKB/variants.zip',
+        variants_tsv_filepath = config['download_directory'] + '/PharmGKB/variants/variants.tsv',
+    run:
+        pharmgkb.pull_pharmgkb(
+            output.genes_zip_filepath,
+            output.chemicals_zip_filepath,
+            output.drugs_zip_filepath,
+            output.phenotypes_zip_filepath,
+            output.variants_zip_filepath,
+        )