feat(IPVC-2331): add splign manual steps (#23)

Co-authored-by: Shane Giles <[email protected]>
invitae · Apr 17, 2024 · 254176f · 254176f
1 parent 8431f27
commit 254176f
Show file tree

Hide file tree

Showing 8 changed files with 143 additions and 24 deletions.
diff --git a/README.md b/README.md
@@ -357,6 +357,19 @@ docker compose run seqrepo-load
 UTA_ETL_SKIP_GENE_LOAD=true docker compose run uta-load
 ```
 
+#### 2C. Manual splign transcripts
+To load splign-manual transcripts, the workflow expects an input txdata.yaml file and splign alignments. Define this path 
+using the environment variable $UTA_SPLIGN_MANUAL_DIR. These file paths should exist:
+- `$UTA_SPLIGN_MANUAL_DIR/splign-manual/txdata.yaml`
+- `$UTA_SPLIGN_MANUAL_DIR/splign-manual/alignments/*.splign`
+
+[txdata.yaml](loading/data/splign-manual/txdata.yaml) defines the transcripts and their metadata. The [alignments dir](loading/data/splign-manual/alignments) contains the splign alignments.
+To run the workflow:
+```
+export UTA_SPLIGN_MANUAL_DIR=$(pwd)/loading/data/splign-manual/
+docker compose run splign-manual
+```
+
 UTA has updated and the database has been dumped into a pgd file in `UTA_ETL_WORK_DIR`. SeqRepo has been updated in place.
 
 

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -60,3 +60,15 @@ services:
       - ${UTA_ETL_LOG_DIR}:/mito-extract/logs
     working_dir: /opt/repos/uta
     network_mode: host
+  splign-manual:
+    image: uta-update
+    command: sbin/uta-splign-manual 2024-02-20 ${UTA_ETL_OLD_UTA_VERSION} /uta-splign-manual/input /uta-splign-manual/work /uta-splign-manual/logs
+    depends_on:
+      uta:
+        condition: service_healthy
+    volumes:
+      - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo
+      - ${UTA_SPLIGN_MANUAL_DIR}:/uta-splign-manual/input
+      - ${UTA_ETL_WORK_DIR}:/uta-splign-manual/work
+      - ${UTA_ETL_LOG_DIR}:/uta-splign-manual/logs
+    network_mode: host
diff --git a/loading/data/splign-manual/README.md b/loading/data/splign-manual/README.md
@@ -50,7 +50,7 @@ For a given RefSeq transcript (e.g., NM_000996.3), do the following:
    - Click on the gene id to go to the gene page (e.g., `6165`)
    - N.B. Strand is inferred from the orientation of aligned exons.
 
-1. Enter the gene and CDS info in txdata.yaml
+1. Enter the gene, geneID, and CDS info in txdata.yaml
 
 1. Get the chromosome and coordinates from the gene page
    - From the "Genomic Context" section, note the chromosomal

diff --git a/loading/data/splign-manual/txdata.yaml b/loading/data/splign-manual/txdata.yaml
@@ -7,74 +7,88 @@ NM_000000.0:      # transcript_accession
   cds:            # CDS start and end, 1-based inclusive
   hgnc:           # HGNC *symbol*
   genomic_region: # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/259291
+  gene_id:        # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/259291
 
 
 NM_001025190.1:
   # This RefSeq was permanently suppressed because it is now thought that this gene is a pseudogene
   cds: 1,3162
   hgnc: MSLNL
   genomic_region: NC_000016.9 (819428..831996, complement)
+  gene_id: 401827
 
 NM_006060.6:
   cds: 222,1781
   hgnc: IKZF1
   genomic_region: NC_000007.13 (50344378..50367358) , (50444231..50472799)
+  gene_id: 10320
 
 NM_000769.4:
   cds: 26,1498
   hgnc: CYP2C19
   genomic_region: NC_000010.10 (96522463..96612671)
+  gene_id: 1557
 
 NM_001807.4:
   cds: 17,2287
   hgnc: CEL
   genomic_region: NC_000009.11 (135936741..135947250)
+  gene_id: 1056
 
 NM_002116.7:
   cds: 85,1182
   hgnc: HLA-A
   genomic_region: NC_000006.11 (29910247..29913661)
+  gene_id: 3105
 
 NM_002122.3:
   cds: 54,821
   hgnc: HLA-DQA1
   genomic_region: NC_000006.11 (32605169..32612152)
+  gene_id: 3117
 
 NM_006060.5:
   cds: 269,1828
   hgnc: IKZF1
   genomic_region: NC_000007.13 (50344378..50367358) , (50444231..50472799)
+  gene_id: 10320
 
 NM_000996.3:
   cds: 65,397
   hgnc: RPL35A
   genomic_region: NC_000003.11 (197677023..197682722)
+  gene_id: 6165
 
 NM_001261826.2:
   cds: 293,3940
   hgnc: AP3D1
   genomic_region: NC_000019.9 (2100987..2151556, complement)
+  gene_id: 8943
 
 NM_001355436.1:
   cds: 144,7130
   hgnc: SPTB
   genomic_region: NC_000014.8 (65213001..65346604, complement)
+  gene_id: 6710
 
 NM_001428.4:
   cds: 117,1421
   hgnc: ENO1
   genomic_region: NC_000001.10 (8921059..8939151, complement)
+  gene_id: 2023
 
 NM_032589.2:
-  genomic_region: NM_032589.2 was permanently suppressed because currently there is support for the transcript but not for the protein.
+  # NM_032589.2 was permanently suppressed because currently there is support for the transcript but not for the protein.
   cds: 150,425
   hgnc: DSCR8
   genomic_region: NC_000021.8 (39493545..39528605)
+  gene_id: 84677
 
 NM_176886.1:
   cds: 1,900
   hgnc: TAS2R45
   genomic_region: NW_003571050.1 (327525..328424, complement)
+  gene_id: 259291
 
 
 
@@ -90,6 +104,7 @@ NM_002457.4:
   cds: 28,15897
   hgnc: MUC2
   genomic_region: NC_000011.9 (1074875..1104417)
+  gene_id: 4583
 
 
 # Case 2: overall low coverage and/or identity. 
@@ -99,6 +114,7 @@ NM_001277444.1:
   cds: 76,3411
   hgnc: NBPF9
   genomic_region: NC_000001.10 (144811743..144830407)
+  gene_id: 400818
 
 
 # Case 3: high identity alignments but with large gaps.  These
@@ -110,18 +126,21 @@ NM_031421.4:
   cds: 131,2149
   hgnc: TTC25
   genomic_region: NC_000017.10 (40086888..40117669)
+  gene_id: 83538
 
 NM_001349168.1:
   # Splign alignment has 159 nt unaligned exonic sequence. This is unusable. -Reece 2020-04-08
   cds: 239,4762
   hgnc: DCAF1
   genomic_region: NC_000003.11 (51433298..51534018, complement)
+  gene_id: 9730
 
 NM_001733.5:
   # Splign alignment has 232 nt unaligned exonic sequence. This is unusable. -Reece 2020-04-08
   cds: 220,2337
   hgnc: C1R
   genomic_region: NC_000012.11 (7241205..7245043, complement) , (7187513..7189412, complement)
+  gene_id: 715
 
 # Transcript, gene, and genomic alignment info
 # cds start,end (in human, 1-based coordinates) and hgnc symbol
@@ -132,53 +151,64 @@ NM_001038633.3:      # transcript_accession
   cds: 893,1684            # CDS start and end, 1-based inclusive
   hgnc: RSPO1           # HGNC *symbol*
   genomic_region: NC_000001.10 (38076821..38100595, complement) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/284654
+  gene_id: 284654
 
 NM_005363.3:      # transcript_accession
   cds: 208,1152           # CDS start and end, 1-based inclusive
   hgnc: MAGEA6          # HGNC *symbol*
   genomic_region: NC_000023.10 (151867245..151870814) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/4105
+  gene_id: 4105
 
 NM_006561.3:      # transcript_accession
   cds: 161,1726           # CDS start and end, 1-based inclusive
   hgnc: CELF2           # HGNC *symbol*
   genomic_region: NC_000010.10 (10838851..11378674) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/10659
+  gene_id: 10659
 
 NM_001242908.1:      # transcript_accession
   cds: 714,1505           # CDS start and end, 1-based inclusive
   hgnc: RSPO1          # HGNC *symbol*
   genomic_region: NC_000001.10 (38076821..38100595, complement) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/284654
+  gene_id: 284654
 
 NM_001242909.1:      # transcript_accession
   cds: 474,1184           # CDS start and end, 1-based inclusive
   hgnc: RSPO1          # HGNC *symbol*
   genomic_region: NC_000001.10 (38076821..38100595, complement) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/259291
+  gene_id: 284654
 
 NM_001242910.1:      # transcript_accession
   cds: 714,1316           # CDS start and end, 1-based inclusive
   hgnc: RSPO1          # HGNC *symbol*
   genomic_region: NC_000001.10 (38076821..38100595, complement) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/284654
+  gene_id: 284654
 
 NM_001012709.1:      # transcript_accession
   cds: 46,912            # CDS start and end, 1-based inclusive
   hgnc: KRTAP5-4           # HGNC *symbol*
   genomic_region: NC_000011.9 (1642188..1643368, complement)  # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/387267
+  gene_id: 387267
 
 NM_001123068.1:      # transcript_accession
   cds: 34,528            # CDS start and end, 1-based inclusive
   hgnc: COAS-2          # HGNC *symbol*
   genomic_region: NC_000001.10 (143767144..143767881, complement) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/644591
+  gene_id: 644591
 
 NM_130797.2:      # transcript_accession
   cds: 130,2727           # CDS start and end, 1-based inclusive
   hgnc: DPPX          # HGNC *symbol*
   genomic_region: NC_000007.13 (153584419..154264025) , (154400205..154685995) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/1804
+  gene_id: 1804
 
 NM_033060.2:      # transcript_accession
   cds: 42,425           # CDS start and end, 1-based inclusive
   hgnc: KRTAP4-1          # HGNC *symbol*
   genomic_region: NC_000017.10 (39340352..39341147, complement) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/1804
+  gene_id: 85285
 
 NM_033060.3:      # transcript_accession
   cds: 58,441           # CDS start and end, 1-based inclusive
   hgnc: KRTAP4-1          # HGNC *symbol*
   genomic_region: NC_000017.10 (39340352..39341163, complement) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/1804
+  gene_id: 85285
diff --git a/.../data/splign-manual/generate-loading-data → sbin/generate-loading-data b/.../data/splign-manual/generate-loading-data → sbin/generate-loading-data
@@ -6,6 +6,7 @@ import argparse
 import csv
 import gzip
 import logging
+import os
 import sys
 
 import yaml
@@ -20,7 +21,6 @@ method = "splign-manual"
 
 txinfo_fn = "txinfo.gz"
 exonset_fn = "exonset.gz"
-txdata_fn = "txdata.yaml"
 
 
 def parse_args(argv):
@@ -31,6 +31,16 @@ def parse_args(argv):
         "FILES",
         nargs="*"
         )
+    ap.add_argument(
+        "--txdata",
+        required=True,
+        help="Path to txdata.yaml"
+        )
+    ap.add_argument(
+        "--output-dir",
+        required=True,
+        help="Path to output directory"
+        )
     opts = ap.parse_args(argv)
     return opts
 
@@ -61,16 +71,19 @@ def parse_splign(fn, txdata):
     try:
         txd = txdata[tx_ac]
     except KeyError:
-        raise KeyError(f"{tx_ac}: no cds or gene_symbol info in {txdata_fn}")
+        raise KeyError(f"{tx_ac}: no cds or gene_symbol info in txdata")
 
     gene_symbol = txd["hgnc"]
-    gene_id = txd.get("gene_id")
-    if gene_symbol is None:
-        _logger.warning(f"No gene symbol in {txdata_fn} for {tx_ac}")
+
+    gene_id = txd["gene_id"]
+    if gene_id is None:
+        msg = f"No gene id in txdata for {tx_ac}"
+        _logger.error(msg)
+        raise ValueError(msg)
 
     cds = txd["cds"]
     if cds is None:
-        _logger.warning(f"No CDS info {txdata_fn} for {tx_ac}; will be non-coding transcript")
+        _logger.warning(f"No CDS info txdata for {tx_ac}; will be non-coding transcript")
         cds_se_i = None
     else:
         cds = [int(i) for i in txd["cds"].split(",")]
@@ -99,10 +112,10 @@ if __name__ == "__main__":
 
     opts = parse_args(sys.argv[1:])
 
-    txdata = yaml.load(open(txdata_fn), Loader=yaml.SafeLoader)
+    txdata = yaml.load(open(opts.txdata), Loader=yaml.SafeLoader)
 
-    txinfo_out = uta.formats.txinfo.TxInfoWriter(gzip.open(txinfo_fn, "wt"))
-    exonset_out = uta.formats.exonset.ExonSetWriter(gzip.open(exonset_fn, "wt"))
+    txinfo_out = uta.formats.txinfo.TxInfoWriter(gzip.open(os.path.join(opts.output_dir, txinfo_fn), "wt"))
+    exonset_out = uta.formats.exonset.ExonSetWriter(gzip.open(os.path.join(opts.output_dir, exonset_fn), "wt"))
 
     for fn in opts.FILES:
         _logger.info("# " + fn)

diff --git a/sbin/uta-diff b/sbin/uta-diff
@@ -66,7 +66,7 @@ if __name__ == "__main__":
     print("""UTA comparison: url={url}, s1={s1}, s2={s2}
 t: time taken (seconds)
 n1, n2: total number of rows in schemas s1 and s2
-nu1, nu2, c: number of rows unique to s1, unique to s2, and common to both
+nu1, nu2, nc: number of rows unique to s1, unique to s2, and common to both
 cols: cols used for comparison
 """.format(url=url, s1=s1, s2=s2))
     print(pt)

diff --git a/sbin/uta-splign-manual b/sbin/uta-splign-manual
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+
+# Process splign-manual alignments
+
+set -euxo pipefail
+
+seqrepo_version=$1
+source_uta_v=$2
+input_dir=$3
+working_dir=$4
+log_dir=$5
+
+if [ -z "$seqrepo_version" ] || [ -z "$source_uta_v" ] || [ -z "$input_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ]
+then
+    echo 'Usage: sbin/uta-splign-manual <source_uta_version> <input_dir> <working_dir> <log_dir>'
+    exit 1
+fi
+
+# set local variables and create working directories
+loading_uta_v="uta"
+working_dir="$working_dir/splign-manual"
+log_dir="$log_dir/splign-manual"
+mkdir -p "$log_dir"
+mkdir -p "$working_dir"
+
+# Generate txinfo.gz and exonset.gz files
+python sbin/generate-loading-data $input_dir/alignments/*.splign --txdata $input_dir/txdata.yaml --output-dir $working_dir 2>&1 | tee "$log_dir/generate-loading-data.log"
+
+# Generate fasta files
+seqrepo export $(gzip -cdq $working_dir/txinfo.gz  | cut -f2 | tail +2) --instance-name "$seqrepo_version" | gzip -c > $working_dir/seqs.fa.gz 2>&1 | tee "$log_dir/seqrepo-export.log"
+
+# Generate seqinfo.gz file
+sbin/fasta-to-seqinfo -o NCBI $working_dir/seqs.fa.gz | gzip -c > $working_dir/seqinfo.gz 2>&1 | tee "$log_dir/fasta-to-seqinfo.log"
+
+# Load seqinfo
+uta --conf=etc/global.conf --conf=etc/[email protected] load-seqinfo $working_dir/seqinfo.gz 2>&1 | tee "$log_dir/load-seqinfo.log"
+
+# Load txinfo
+uta --conf=etc/global.conf --conf=etc/[email protected] load-txinfo $working_dir/txinfo.gz 2>&1 | tee "$log_dir/load-txinfo.log"
+
+# Load exonset
+uta --conf=etc/global.conf --conf=etc/[email protected] load-exonset $working_dir/exonset.gz 2>&1 | tee "$log_dir/load-exonset.log"
+
+# Align exons
+uta --conf=etc/global.conf --conf=etc/[email protected] align-exons 2>&1 | tee "$log_dir/align-exons.log"
+
+### run diff
+sbin/uta-diff "$source_uta_v" "$loading_uta_v" 2>&1 | tee "$log_dir/uta-diff.log"
+
+### psql_dump
+pg_dump -U uta_admin -h localhost -d uta -t "$loading_uta_v.gene" | gzip -c > "$working_dir/uta.pgd.gz"