Skip to content

Commit

Permalink
fix: adjust to upstream data changes (#95)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Jul 14, 2024
1 parent 2d01fde commit 871af29
Show file tree
Hide file tree
Showing 20 changed files with 54 additions and 79 deletions.
3 changes: 0 additions & 3 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,6 @@ rule all:
f"output/full/viguno/hpo-{DV.hpo}+{PV.viguno}/phenotype.hpoa",
f"output/full/viguno/hpo-{DV.hpo}+{PV.viguno}/phenotype_to_genes.txt",
f"output/full/viguno/hpo-{DV.hpo}+{PV.viguno}/hpo.bin",
f"output/full/viguno/hpo-{DV.hpo}+{PV.viguno}/scores-fun-sim-avg-resnik-gene/IDENTITY",
# ----- background/population structural variants and annotations thereof
f"output/full/tracks/track-strucvars-dbvar-grch37-{DV.dbvar}+{DV.tracks}/dbvar.bed.gz",
f"output/full/tracks/track-strucvars-dbvar-grch38-{DV.dbvar}+{DV.tracks}/dbvar.bed.gz",
Expand Down Expand Up @@ -275,7 +274,6 @@ rule all:
f"output/reduced-dev/viguno/hpo-{DV.hpo}+{PV.viguno}/phenotype.hpoa",
f"output/reduced-dev/viguno/hpo-{DV.hpo}+{PV.viguno}/phenotype_to_genes.txt",
f"output/reduced-dev/viguno/hpo-{DV.hpo}+{PV.viguno}/hpo.bin",
f"output/reduced-dev/viguno/hpo-{DV.hpo}+{PV.viguno}/scores-fun-sim-avg-resnik-gene/IDENTITY",
# -- annonars
f"output/reduced-dev/annonars/cadd-grch37-{DV.cadd}+{PV.annonars}/rocksdb/IDENTITY",
f"output/reduced-dev/annonars/cadd-grch38-{DV.cadd}+{PV.annonars}/rocksdb/IDENTITY",
Expand Down Expand Up @@ -307,7 +305,6 @@ rule all:
f"output/reduced-exomes/viguno/hpo-{DV.hpo}+{PV.viguno}/phenotype.hpoa",
f"output/reduced-exomes/viguno/hpo-{DV.hpo}+{PV.viguno}/phenotype_to_genes.txt",
f"output/reduced-exomes/viguno/hpo-{DV.hpo}+{PV.viguno}/hpo.bin",
f"output/reduced-exomes/viguno/hpo-{DV.hpo}+{PV.viguno}/scores-fun-sim-avg-resnik-gene/IDENTITY",
# -- annonars
f"output/reduced-exomes/annonars/cadd-grch37-{DV.cadd}+{PV.annonars}/rocksdb/IDENTITY",
f"output/reduced-exomes/annonars/cadd-grch38-{DV.cadd}+{PV.annonars}/rocksdb/IDENTITY",
Expand Down
2 changes: 1 addition & 1 deletion download_urls.yml
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@
strategy: head
count: 10000

- url: https://ftp.ensembl.org/pub/release-109/gtf/homo_sapiens/Homo_sapiens.GRCh38.109.gtf.gz
- url: https://ftp.ensembl.org/pub/release-112/gtf/homo_sapiens/Homo_sapiens.GRCh38.112.gtf.gz
excerpt_strategy:
strategy: head
count: 10000
Expand Down
6 changes: 3 additions & 3 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ dependencies:
# Parallel (de)compression.
- pigz
# Varfish related
- annonars =0.34.0
- viguno =0.2.0
- mehari =0.21.1
- annonars =0.39.0
- viguno =0.3.1
- mehari =0.25.5
- varfish-server-worker =0.12.0
# S3 uploads
- s5cmd =2.1.0
Expand Down
4 changes: 2 additions & 2 deletions excerpt-data/1963f3c58ea066be/omim_unmapped_terms.tsv
Git LFS file not shown
4 changes: 2 additions & 2 deletions excerpt-data/652646c24140df2a/mondo.obo
Git LFS file not shown
3 changes: 0 additions & 3 deletions excerpt-data/6efab26cca71a549/Homo_sapiens.GRCh38.109.gtf.gz

This file was deleted.

3 changes: 0 additions & 3 deletions excerpt-data/6efab26cca71a549/url.txt

This file was deleted.

4 changes: 2 additions & 2 deletions excerpt-data/709832e39857a725/fixSeqLiftOverPsl.txt.gz
Git LFS file not shown
4 changes: 2 additions & 2 deletions excerpt-data/8ee47118be15da10/current_README
Git LFS file not shown
4 changes: 2 additions & 2 deletions excerpt-data/a9229376a47367b7/database
Git LFS file not shown
4 changes: 2 additions & 2 deletions excerpt-data/a9dd799550b2e5ae/martservice
Git LFS file not shown
3 changes: 3 additions & 0 deletions excerpt-data/e2748774e1b011f7/Homo_sapiens.GRCh38.112.gtf.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions excerpt-data/e2748774e1b011f7/url.txt
Git LFS file not shown
20 changes: 1 addition & 19 deletions rules/output/viguno/hpo.smk
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import os
VIGUNO_SIMULATE_THREADS = int(os.environ.get("VIGUNO_SIMULATE_THREADS", 96))


rule output_viguno_pheno: # -- copy HPO and simulate
rule output_viguno_pheno: # -- copy HPO
input:
obo="work/download/hpo/{v_hpo}/hp.obo",
hpoa="work/download/hpo/{v_hpo}/phenotype.hpoa",
Expand All @@ -16,7 +16,6 @@ rule output_viguno_pheno: # -- copy HPO and simulate
obo="output/full/viguno/hpo-{v_hpo}+{v_viguno}/hp.obo",
hpoa="output/full/viguno/hpo-{v_hpo}+{v_viguno}/phenotype.hpoa",
phenotype_to_genes="output/full/viguno/hpo-{v_hpo}+{v_viguno}/phenotype_to_genes.txt",
rocksdb_identity="output/full/viguno/hpo-{v_hpo}+{v_viguno}/scores-fun-sim-avg-resnik-gene/IDENTITY",
wildcard_constraints:
v_hpo=RE_VERSION,
v_viguno=RE_VERSION,
Expand All @@ -31,22 +30,6 @@ rule output_viguno_pheno: # -- copy HPO and simulate
awk -F $'\t' 'BEGIN {{ OFS=FS }} {{ print $3, $4, $1, $2, $6 }}' \
{input.genes_to_phenotype} \
> {output.phenotype_to_genes}
viguno simulate \
--ic-base gene \
--similarity resnik \
--combiner fun-sim-avg \
--path-hpo-dir $(dirname {input.obo}) \
--path-out-rocksdb $(dirname {output.rocksdb_identity}) \
--min-terms 1 \
$(if [[ "{RUNS_IN_CI}" == "True" ]]; then \
echo --max-terms 1; \
echo --num-simulations 10; \
echo --only-gene ARID1B; \
else \
echo --max-terms 10; \
fi) \
--seed 42
"""


Expand All @@ -55,7 +38,6 @@ rule global_hpo_to_bin: # -- convert to .bin
obo="work/download/hpo/{v_hpo}/hp.obo",
hpoa="work/download/hpo/{v_hpo}/phenotype.hpoa",
genes_to_phenotype="work/download/hpo/{v_hpo}/phenotype_to_genes.txt",
rocksdb_identity="output/full/viguno/hpo-{v_hpo}+{v_viguno}/scores-fun-sim-avg-resnik-gene/IDENTITY",
output:
bin="output/full/viguno/hpo-{v_hpo}+{v_viguno}/hpo.bin",
spec_yaml=("output/full/viguno/hpo-{v_hpo}+{v_viguno}/spec.yaml"),
Expand Down
2 changes: 2 additions & 0 deletions rules/reduced/annonars.smk
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ rule subset_annonars: # -- create exomes subset
shell:
r"""
annonars db-utils copy \
--skip-cfs dbsnp_by_rsid \
--skip-cfs clinvar_by_accession \
--path-in $(dirname {input.rocksdb_identity}) \
--path-out $(dirname {output.rocksdb_identity}) \
--path-beds {input.bed}
Expand Down
23 changes: 0 additions & 23 deletions rules/reduced/hpo.smk
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,12 @@ rule subset_viguno_pheno_exomes: # -- create exomes subset
obo="output/full/viguno/hpo-{v_hpo}+{v_viguno}/hp.obo",
hpoa="output/full/viguno/hpo-{v_hpo}+{v_viguno}/phenotype.hpoa",
phenotype_to_genes="output/full/viguno/hpo-{v_hpo}+{v_viguno}/phenotype_to_genes.txt",
rocksdb_identity="output/full/viguno/hpo-{v_hpo}+{v_viguno}/scores-fun-sim-avg-resnik-gene/IDENTITY",
bin="output/full/viguno/hpo-{v_hpo}+{v_viguno}/hpo.bin",
spec_yaml="output/full/viguno/hpo-{v_hpo}+{v_viguno}/spec.yaml",
output:
obo="output/reduced-exomes/viguno/hpo-{v_hpo}+{v_viguno}/hp.obo",
hpoa="output/reduced-exomes/viguno/hpo-{v_hpo}+{v_viguno}/phenotype.hpoa",
phenotype_to_genes="output/reduced-exomes/viguno/hpo-{v_hpo}+{v_viguno}/phenotype_to_genes.txt",
rocksdb_identity="output/reduced-exomes/viguno/hpo-{v_hpo}+{v_viguno}/scores-fun-sim-avg-resnik-gene/IDENTITY",
bin="output/reduced-exomes/viguno/hpo-{v_hpo}+{v_viguno}/hpo.bin",
spec_yaml="output/reduced-exomes/viguno/hpo-{v_hpo}+{v_viguno}/spec.yaml",
wildcard_constraints:
Expand All @@ -28,8 +26,6 @@ rule subset_viguno_pheno_exomes: # -- create exomes subset
cp -a {input.phenotype_to_genes} {output.phenotype_to_genes}
cp -a {input.bin} {output.bin}
cp -a {input.spec_yaml} {output.spec_yaml}
cp -ar $(dirname {input.rocksdb_identity})/. $(dirname {output.rocksdb_identity})/.
"""


Expand All @@ -38,14 +34,12 @@ rule subset_worker_pheno_dev: # -- create development subset
obo="output/full/viguno/hpo-{v_hpo}+{v_viguno}/hp.obo",
hpoa="output/full/viguno/hpo-{v_hpo}+{v_viguno}/phenotype.hpoa",
phenotype_to_genes="output/full/viguno/hpo-{v_hpo}+{v_viguno}/phenotype_to_genes.txt",
rocksdb_identity="output/full/viguno/hpo-{v_hpo}+{v_viguno}/scores-fun-sim-avg-resnik-gene/IDENTITY",
bin="output/full/viguno/hpo-{v_hpo}+{v_viguno}/hpo.bin",
spec_yaml="output/full/viguno/hpo-{v_hpo}+{v_viguno}/spec.yaml",
output:
obo="output/reduced-dev/viguno/hpo-{v_hpo}+{v_viguno}/hp.obo",
hpoa="output/reduced-dev/viguno/hpo-{v_hpo}+{v_viguno}/phenotype.hpoa",
phenotype_to_genes="output/reduced-dev/viguno/hpo-{v_hpo}+{v_viguno}/phenotype_to_genes.txt",
rocksdb_identity="output/reduced-dev/viguno/hpo-{v_hpo}+{v_viguno}/scores-fun-sim-avg-resnik-gene/IDENTITY",
bin="output/reduced-dev/viguno/hpo-{v_hpo}+{v_viguno}/hpo.bin",
spec_yaml="output/reduced-dev/viguno/hpo-{v_hpo}+{v_viguno}/spec.yaml",
wildcard_constraints:
Expand All @@ -61,21 +55,4 @@ rule subset_worker_pheno_dev: # -- create development subset
cp -a {input.phenotype_to_genes} {output.phenotype_to_genes}
cp -a {input.bin} {output.bin}
cp -a {input.spec_yaml} {output.spec_yaml}
viguno simulate \
--ic-base gene \
--similarity resnik \
--combiner fun-sim-avg \
--path-hpo-dir $(dirname {input.obo}) \
--path-out-rocksdb $(dirname {output.rocksdb_identity}) \
--min-terms 1 \
$(if [[ "{RUNS_IN_CI}" == "True" ]]; then \
echo --max-terms 1; \
echo --num-simulations 10; \
echo --only-gene ARID1B; \
else \
echo --max-terms 10; \
echo --num-simulations 100; \
fi) \
--seed 42
"""
2 changes: 1 addition & 1 deletion rules/work/genes/ensembl.smk
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ rule genes_ensembl_create_xlink: # -- create ENSEMBL gene information xlink tab
wget --no-check-certificate \
-O $TMPDIR/current_README \
https://ftp.ensembl.org/pub/current_README
grep "Ensembl Release {DV.ensembl} Databases" $TMPDIR/current_README \
grep "The current release is Ensembl {DV.ensembl}" $TMPDIR/current_README \
|| (echo "Ensembl version is not {DV.ensembl}." && exit 1)
echo -e "ensembl_gene_id\tensembl_transcript_id\tentrez_id\tgene_symbol" \
Expand Down
3 changes: 2 additions & 1 deletion scripts/genes-integrate-diseases.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,7 +545,8 @@ def parse_mondo_obo(path: str) -> List[MondoDisease]:
relation=list(map(MondoDiseaseRelation, synonym.scope.split(" "))),
)
)
result.append(MondoDisease(mondo_id=term.id, name=term.name, synonyms=synonyms))
if term.name:
result.append(MondoDisease(mondo_id=term.id, name=term.name, synonyms=synonyms))
return result


Expand Down
30 changes: 23 additions & 7 deletions varfish_db_downloader/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import requests
import requests_ftp
from loguru import logger
from reretry import retry

from varfish_db_downloader import __version__, wget

Expand Down Expand Up @@ -138,6 +139,10 @@ def urls_download(urls, data_dir, urls_yaml, force):
raise click.ClickException("URL discrepancy (see logs above)")


class UrlCheckFailed(Exception):
    """Raised when an upstream URL check receives a non-OK response.

    The exception message is the string form of the response object.
    """

    pass


@wget_.command()
@click.option("--urls-yaml", default="download_urls.yml")
@click.argument("urls", nargs=-1)
Expand All @@ -148,18 +153,29 @@ def urls_check_upstream(urls, urls_yaml):

requests_ftp.monkeypatch_session()

def try_get_failed(e: UrlCheckFailed) -> None:
    """Log a failed URL check attempt.

    Passed to the ``retry`` decorator as ``fail_callback``.
    """
    logger.info("    failed: {} (maybe retry)", e)

@retry(tries=5, delay=1, backoff=2, logger=None, fail_callback=try_get_failed)
def try_get(session: requests.Session, url: str):
    """Issue a streaming GET for ``url`` and fail if the response is not OK.

    The request goes through the ``session`` and ``url`` passed in, rather
    than through variables captured from the enclosing loop, so the helper
    checks exactly what the caller asked it to check.

    The ``retry`` decorator re-invokes this function on failure (configured
    with ``tries=5, delay=1, backoff=2``) and reports each failure through
    ``try_get_failed``.

    Args:
        session: Session used to perform the request.
        url: URL to check.

    Raises:
        UrlCheckFailed: If the response's ``ok`` flag is false; the message
            is the string form of the response.
    """
    # The ``with`` block closes the response on exit, so no explicit close
    # is needed; ``stream=True`` avoids downloading the body just to check.
    with session.get(url, allow_redirects=True, stream=True) as r:
        if not r.ok:
            raise UrlCheckFailed(str(r))

error_count = 0
for entry in wget.load_urls_yaml(urls_yaml):
s = requests.Session()
if not entry.skip_upstream_check and (not urls or entry.url in urls):
logger.info(" checking {}...", entry.url)
with s.get(entry.url, allow_redirects=True, stream=True) as r:
if r.ok:
logger.info(" => OK")
r.close()
else:
error_count += 1
logger.warning(" NOT OK: {}", r)
try:
try_get(s, entry.url)
except UrlCheckFailed as e:
error_count += 1
logger.warning(" NOT OK: {}", e)
else:
logger.info(" => OK")
else:
logger.info(" Skipping {}...", entry.url)

Expand Down
6 changes: 3 additions & 3 deletions varfish_db_downloader/versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,8 @@ class DataVersions:
clingen_gene=TODAY,
clingen_variant=TODAY,
ensembl_37="87",
ensembl_38="109",
ensembl="111",
ensembl_38="112",
ensembl="112",
today=TODAY,
dbnsfp="4.5",
dbscsnv="1.1",
Expand All @@ -155,7 +155,7 @@ class DataVersions:
ucsc_genomic_super_dups_38="20141019",
ucsc_alt_seq_liftover_37="20200322",
ucsc_alt_seq_liftover_38="20221103",
ucsc_fix_seq_liftover_37="20200524",
ucsc_fix_seq_liftover_37="20200609",
ucsc_fix_seq_liftover_38="20221103",
refseq_37="105",
refseq_38="GCF_000001405.40+RS_2023_03",
Expand Down

0 comments on commit 871af29

Please sign in to comment.