Skip to content

Commit

Permalink
feat: importing GTex data into annonars genes database (#59)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Sep 13, 2023
1 parent 7d43865 commit a2e63f0
Show file tree
Hide file tree
Showing 10 changed files with 149 additions and 2 deletions.
1 change: 1 addition & 0 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,7 @@ include: "rules/work/genes/dbnsfp.smk"
include: "rules/work/genes/clingen.smk"
include: "rules/work/genes/ensembl.smk"
include: "rules/work/genes/gnomad.smk"
include: "rules/work/genes/gtex.smk"
include: "rules/work/genes/hgnc.smk"
include: "rules/work/genes/mehari_data_tx.smk"
include: "rules/work/genes/ncbi.smk"
Expand Down
10 changes: 10 additions & 0 deletions download_urls.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
- url: https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
excerpt_strategy:
strategy: no-excerpt
count: null

- url: https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz
excerpt_strategy:
strategy: manual
count: null

- url: https://github.com/Orphanet/orphapacket/archive/refs/tags/v10.1.tar.gz
excerpt_strategy:
strategy: no-excerpt
Expand Down
3 changes: 2 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ dependencies:
- cattrs
- click
- loguru
- numpy
- pyyaml
- requests
- requests-ftp
Expand Down Expand Up @@ -40,7 +41,7 @@ dependencies:
# Parallel (de)compression.
- pigz
# Varfish related
- annonars =0.15.0
- annonars =0.18.0
- viguno =0.1.6
- mehari =0.6.2
- varfish-server-worker =0.10.1
Expand Down
Git LFS file not shown
3 changes: 3 additions & 0 deletions excerpt-data/2295c2a0487d0dab/url.txt
Git LFS file not shown
Git LFS file not shown
3 changes: 3 additions & 0 deletions excerpt-data/9e484a896c7516d6/url.txt
Git LFS file not shown
4 changes: 3 additions & 1 deletion rules/output/annonars/genes.smk
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file
orpha="work/genes/orphapacket/{v_orpha}+{date}/orpha_diseases.tsv",
rcnv="work/genes/rcnv/2022/rcnv_collins_2022.tsv",
shet="work/genes/shet/2019/shet_weghorn_2019.tsv",
gtex="work/genes/annonars/gtex_v8/genes_tpm.jsonl.gz",
output:
rocksdb_identity=(
"output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{v_hpo}+{v_orpha}+{date}+{v_annonars}/"
Expand Down Expand Up @@ -46,7 +47,8 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file
--path-in-orpha {input.orpha} \
--path-in-ncbi {input.ncbi} \
--path-in-rcnv {input.rcnv} \
--path-in-shet {input.shet}
--path-in-shet {input.shet} \
--path-in-gtex {input.gtex}
varfish-db-downloader tpl \
--template rules/output/annonars/genes.spec.yaml \
Expand Down
2 changes: 2 additions & 0 deletions rules/output/annonars/genes.spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,5 @@ x-created-from:
version: 2022-Collins-et-al
- name: sHet scores
version: 2019-Weghorn-et-a.
- name: GTex data
version: v8
119 changes: 119 additions & 0 deletions rules/work/genes/gtex.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import json
import csv
import gzip
import sys
import typing

import attrs
import cattrs
import numpy as np


@attrs.frozen(auto_attribs=True)
class GtexTissueRecord:
tissue: str
tissue_detailed: str
tpms: typing.List[float] = attrs.field(factory=list)


@attrs.frozen(auto_attribs=True)
class GtexGeneRecord:
hgnc_id: str
ensembl_gene_id: str
ensembl_gene_version: str
records: typing.List[GtexTissueRecord]


rule genes_gtex_v8_download: # -- download GTex v8 gene expression data
output:
attributes="work/download/genes/gtex/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt",
attributes_md5="work/download/genes/gtex/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt.md5",
genes_tpm="work/download/genes/gtex/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz",
genes_tpm_md5="work/download/genes/gtex/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz.md5",
shell:
r"""
wget --no-check-certificate \
-O {output.attributes} \
https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
wget --no-check-certificate \
-O {output.genes_tpm} \
https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz
md5sum {output.attributes} > {output.attributes_md5}
md5sum {output.genes_tpm} > {output.genes_tpm_md5}
"""


rule genes_gtex_v8_map: # -- map GTex v8 gene files for annonars
input:
attributes="work/download/genes/gtex/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt",
genes_tpm="work/download/genes/gtex/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz",
genes_xlink=f"output/full/mehari/genes-xlink-{DV.today}/genes-xlink.tsv",
output:
genes_tpm="work/genes/annonars/gtex_v8/genes_tpm.jsonl.gz",
run:
# Load mapping from sample ID to sample tissue details
smtsd_count = {}
with open(input.attributes, "rt") as inputf:
reader = csv.DictReader(inputf, delimiter="\t")
sampid_to_tissue = {}
for row in reader:
sampid_to_tissue[row["SAMPID"]] = (row["SMTS"], row["SMTSD"])
smtsd_count.setdefault(row["SMTSD"], 0)
smtsd_count[row["SMTSD"]] += 1
print("Sample counts per tissue:", file=sys.stderr)
for smtsd, count in sorted(smtsd_count.items(), key=lambda x: x[1], reverse=True):
print(f"{smtsd}: {count}", file=sys.stderr)
# Load mapping from ENSEMBL to HGNC gene ID
with open(input.genes_xlink, "rt") as inputf:
reader = csv.DictReader(inputf, delimiter="\t")
ensembl_to_hgnc = {row["ensembl_gene_id"]: row["hgnc_id"] for row in reader}

# Map GTEx v8 gene expression data to counts JSONL data for annonars
print("Transmogrifying expression data...", file=sys.stderr)
with gzip.open(input.genes_tpm, "rt") as inputf, gzip.open(
output.genes_tpm, "wt"
) as outputf:
for _ in range(2):
next(inputf)
reader = csv.DictReader(inputf, delimiter="\t")
for row in reader:
ensembl_gene_id, ensembl_gene_version = row["Name"].split(".", 1)
hgnc_id = ensembl_to_hgnc.get(ensembl_gene_id)
if hgnc_id is None:
print(f"Skipping {ensembl_gene_id}.{ensembl_gene_version}", file=sys.stderr)
continue

tissue_records = {}

for sampid, tpm in row.items():
if not sampid.startswith("GTEX-"):
continue
smts, smtsd = sampid_to_tissue[sampid]
if smtsd not in tissue_records:
tissue_records[smtsd] = GtexTissueRecord(tissue=smts, tissue_detailed=smtsd)
tissue_records[smtsd].tpms.append(float(tpm))

records = []
for tissue_record in tissue_records.values():
records.append(
attrs.evolve(
tissue_record,
tpms=np.quantile(
np.array(tissue_record.tpms), [0.0, 0.25, 0.5, 0.75, 1.0]
).tolist(),
)
)

gene_record = GtexGeneRecord(
hgnc_id=hgnc_id,
ensembl_gene_id=ensembl_gene_id,
ensembl_gene_version=ensembl_gene_version,
records=list(sorted(records, key=lambda r: (r.tissue, r.tissue_detailed))),
)
print(
json.dumps(cattrs.unstructure(gene_record)),
file=outputf,
)
print("... done transmogrifying GTex data", file=sys.stderr)

0 comments on commit a2e63f0

Please sign in to comment.