Skip to content

Commit

Permalink
Merge pull request #297 from TranslatorSRI/improve-biomart
Browse files Browse the repository at this point in the history
We've been having problems with BioMart downloads caused by trying to download too many columns at once (#194). Until we figure out how to fully work around that, this PR allows us to designate a list of Ensembl BioMart datasets that should be skipped during downloads.
  • Loading branch information
gaurav authored Sep 23, 2024
2 parents 4ec58a0 + 084f563 commit 716a784
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 8 deletions.
10 changes: 9 additions & 1 deletion config.json
Original file line number Diff line number Diff line change
Expand Up @@ -81,5 +81,13 @@
"HMDB",
"PUBCHEM.COMPOUND"
]
}
},

"ensembl_datasets_to_skip": [
"elucius_gene_ensembl",
"hgfemale_gene_ensembl",
"charengus_gene_ensembl",
"otshawytscha_gene_ensembl",
"aocellaris_gene_ensembl"
]
}
23 changes: 16 additions & 7 deletions src/datahandlers/ensembl.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from src.babel_utils import make_local_name
import traceback

from src.babel_utils import make_local_name, get_config
from apybiomart import find_datasets, query, find_attributes
import os

Expand All @@ -13,7 +15,7 @@
def pull_ensembl(complete_file):
f = find_datasets()

skip_dataset_ids = {'hgfemale_gene_ensembl'}
skip_dataset_ids = set(get_config()['ensembl_datasets_to_skip'])

cols = {"ensembl_gene_id", "ensembl_peptide_id", "description", "external_gene_name", "external_gene_source",
"external_synonym", "chromosome_name", "source", "gene_biotype", "entrezgene_id", "zfin_id_id", 'mgi_id',
Expand All @@ -29,11 +31,18 @@ def pull_ensembl(complete_file):
# but then updates the config? That sounds bogus.
if os.path.exists(outfile):
continue
atts = find_attributes(ds)
existingatts = set(atts['Attribute_ID'].to_list())
attsIcanGet = cols.intersection(existingatts)
df = query(attributes=list(attsIcanGet), filters={}, dataset=ds)
df.to_csv(outfile, index=False, sep='\t')
try:
atts = find_attributes(ds)
existingatts = set(atts['Attribute_ID'].to_list())
attsIcanGet = cols.intersection(existingatts)
df = query(attributes=list(attsIcanGet), filters={}, dataset=ds)
df.to_csv(outfile, index=False, sep='\t')
except Exception as exc:
biomart_dir = os.path.dirname(outfile)
print(f'Deleting BioMart directory {biomart_dir} so its clear it needs to be downloaded again.')
os.rmdir(biomart_dir)

raise exc
with open(complete_file, 'w') as outf:
outf.write(f'Downloaded gene sets for {len(f)} data sets.')

Expand Down

0 comments on commit 716a784

Please sign in to comment.