feat: importing GTex data into annonars genes database (#59)

varfish-org · Sep 13, 2023 · a2e63f0 · a2e63f0
1 parent 7d43865
commit a2e63f0
Show file tree

Hide file tree

Showing 10 changed files with 149 additions and 2 deletions.
diff --git a/Snakefile b/Snakefile
@@ -327,6 +327,7 @@ include: "rules/work/genes/dbnsfp.smk"
 include: "rules/work/genes/clingen.smk"
 include: "rules/work/genes/ensembl.smk"
 include: "rules/work/genes/gnomad.smk"
+include: "rules/work/genes/gtex.smk"
 include: "rules/work/genes/hgnc.smk"
 include: "rules/work/genes/mehari_data_tx.smk"
 include: "rules/work/genes/ncbi.smk"

diff --git a/download_urls.yml b/download_urls.yml
@@ -1,3 +1,13 @@
+- url: https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
+  excerpt_strategy:
+    strategy: no-excerpt
+    count: null
+
+- url: https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz
+  excerpt_strategy:
+    strategy: manual
+    count: null
+
 - url: https://github.com/Orphanet/orphapacket/archive/refs/tags/v10.1.tar.gz
   excerpt_strategy:
     strategy: no-excerpt

diff --git a/environment.yml b/environment.yml
@@ -11,6 +11,7 @@ dependencies:
   - cattrs
   - click
   - loguru
+  - numpy
   - pyyaml
   - requests
   - requests-ftp
@@ -40,7 +41,7 @@ dependencies:
   # Parallel (de)compression.
   - pigz
   # Varfish related
-  - annonars =0.15.0
+  - annonars =0.18.0
   - viguno =0.1.6
   - mehari =0.6.2
   - varfish-server-worker =0.10.1

diff --git a/excerpt-data/2295c2a0487d0dab/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz b/excerpt-data/2295c2a0487d0dab/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz
diff --git a/excerpt-data/2295c2a0487d0dab/url.txt b/excerpt-data/2295c2a0487d0dab/url.txt
diff --git a/excerpt-data/9e484a896c7516d6/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt b/excerpt-data/9e484a896c7516d6/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
diff --git a/excerpt-data/9e484a896c7516d6/url.txt b/excerpt-data/9e484a896c7516d6/url.txt
diff --git a/rules/output/annonars/genes.smk b/rules/output/annonars/genes.smk
@@ -13,6 +13,7 @@ rule output_annonars_genes:  # -- build annonars genes RocksDB file
         orpha="work/genes/orphapacket/{v_orpha}+{date}/orpha_diseases.tsv",
         rcnv="work/genes/rcnv/2022/rcnv_collins_2022.tsv",
         shet="work/genes/shet/2019/shet_weghorn_2019.tsv",
+        gtex="work/genes/annonars/gtex_v8/genes_tpm.jsonl.gz",
     output:
         rocksdb_identity=(
             "output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{v_hpo}+{v_orpha}+{date}+{v_annonars}/"
@@ -46,7 +47,8 @@ rule output_annonars_genes:  # -- build annonars genes RocksDB file
             --path-in-orpha {input.orpha} \
             --path-in-ncbi {input.ncbi} \
             --path-in-rcnv {input.rcnv} \
-            --path-in-shet {input.shet}
+            --path-in-shet {input.shet} \
+            --path-in-gtex {input.gtex}
 
         varfish-db-downloader tpl \
             --template rules/output/annonars/genes.spec.yaml \

diff --git a/rules/output/annonars/genes.spec.yaml b/rules/output/annonars/genes.spec.yaml
@@ -40,3 +40,5 @@ x-created-from:
     version: 2022-Collins-et-al
   - name: sHet scores
     version: 2019-Weghorn-et-a.
+  - name: GTex data
+    version: v8
diff --git a/rules/work/genes/gtex.smk b/rules/work/genes/gtex.smk
@@ -0,0 +1,119 @@
+import json
+import csv
+import gzip
+import sys
+import typing
+
+import attrs
+import cattrs
+import numpy as np
+
+
+@attrs.frozen(auto_attribs=True)
+class GtexTissueRecord:
+    tissue: str
+    tissue_detailed: str
+    tpms: typing.List[float] = attrs.field(factory=list)
+
+
+@attrs.frozen(auto_attribs=True)
+class GtexGeneRecord:
+    hgnc_id: str
+    ensembl_gene_id: str
+    ensembl_gene_version: str
+    records: typing.List[GtexTissueRecord]
+
+
+rule genes_gtex_v8_download:  # -- download GTex v8 gene expression data
+    output:
+        attributes="work/download/genes/gtex/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt",
+        attributes_md5="work/download/genes/gtex/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt.md5",
+        genes_tpm="work/download/genes/gtex/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz",
+        genes_tpm_md5="work/download/genes/gtex/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz.md5",
+    shell:
+        r"""
+        wget --no-check-certificate \
+            -O {output.attributes} \
+            https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
+
+        wget --no-check-certificate \
+            -O {output.genes_tpm} \
+            https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz
+
+        md5sum {output.attributes} > {output.attributes_md5}
+        md5sum {output.genes_tpm} > {output.genes_tpm_md5}
+        """
+
+
+rule genes_gtex_v8_map:  # -- map GTex v8 gene files for annonars
+    input:
+        attributes="work/download/genes/gtex/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt",
+        genes_tpm="work/download/genes/gtex/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz",
+        genes_xlink=f"output/full/mehari/genes-xlink-{DV.today}/genes-xlink.tsv",
+    output:
+        genes_tpm="work/genes/annonars/gtex_v8/genes_tpm.jsonl.gz",
+    run:
+        # Load mapping from sample ID to sample tissue details
+        smtsd_count = {}
+        with open(input.attributes, "rt") as inputf:
+            reader = csv.DictReader(inputf, delimiter="\t")
+            sampid_to_tissue = {}
+            for row in reader:
+                sampid_to_tissue[row["SAMPID"]] = (row["SMTS"], row["SMTSD"])
+                smtsd_count.setdefault(row["SMTSD"], 0)
+                smtsd_count[row["SMTSD"]] += 1
+        print("Sample counts per tissue:", file=sys.stderr)
+        for smtsd, count in sorted(smtsd_count.items(), key=lambda x: x[1], reverse=True):
+            print(f"{smtsd}: {count}", file=sys.stderr)
+            # Load mapping from ENSEMBL to HGNC gene ID
+        with open(input.genes_xlink, "rt") as inputf:
+            reader = csv.DictReader(inputf, delimiter="\t")
+            ensembl_to_hgnc = {row["ensembl_gene_id"]: row["hgnc_id"] for row in reader}
+
+            # Map GTEx v8 gene expression data to counts JSONL data for annonars
+        print("Transmogrifying expression data...", file=sys.stderr)
+        with gzip.open(input.genes_tpm, "rt") as inputf, gzip.open(
+            output.genes_tpm, "wt"
+        ) as outputf:
+            for _ in range(2):
+                next(inputf)
+            reader = csv.DictReader(inputf, delimiter="\t")
+            for row in reader:
+                ensembl_gene_id, ensembl_gene_version = row["Name"].split(".", 1)
+                hgnc_id = ensembl_to_hgnc.get(ensembl_gene_id)
+                if hgnc_id is None:
+                    print(f"Skipping {ensembl_gene_id}.{ensembl_gene_version}", file=sys.stderr)
+                    continue
+
+                tissue_records = {}
+
+                for sampid, tpm in row.items():
+                    if not sampid.startswith("GTEX-"):
+                        continue
+                    smts, smtsd = sampid_to_tissue[sampid]
+                    if smtsd not in tissue_records:
+                        tissue_records[smtsd] = GtexTissueRecord(tissue=smts, tissue_detailed=smtsd)
+                    tissue_records[smtsd].tpms.append(float(tpm))
+
+                records = []
+                for tissue_record in tissue_records.values():
+                    records.append(
+                        attrs.evolve(
+                            tissue_record,
+                            tpms=np.quantile(
+                                np.array(tissue_record.tpms), [0.0, 0.25, 0.5, 0.75, 1.0]
+                            ).tolist(),
+                        )
+                    )
+
+                gene_record = GtexGeneRecord(
+                    hgnc_id=hgnc_id,
+                    ensembl_gene_id=ensembl_gene_id,
+                    ensembl_gene_version=ensembl_gene_version,
+                    records=list(sorted(records, key=lambda r: (r.tissue, r.tissue_detailed))),
+                )
+                print(
+                    json.dumps(cattrs.unstructure(gene_record)),
+                    file=outputf,
+                )
+        print("... done transmogrifying GTex data", file=sys.stderr)