feat: 438 vep annotation order (#441)

bihealth · Sep 14, 2023 · c28d67e · c28d67e
1 parent ac1a34f
commit c28d67e
Show file tree

Hide file tree

Showing 12 changed files with 345 additions and 31 deletions.
diff --git a/snappy_pipeline/workflows/cbioportal_export/__init__.py b/snappy_pipeline/workflows/cbioportal_export/__init__.py
@@ -5,9 +5,6 @@
 snappy_pipeline. It does the necessary data transformations to comply with the
 formats expected by cBioPortal and writes out the necessary metadata and data
 files.
-
-The output is a archive (tarball) that can be uploaded to a cbioportal instance
-for import.
 """
 
 from collections import OrderedDict
@@ -436,11 +433,11 @@ def __init__(self, parent):
     def get_args(self, action):
         # Validate action
         self._validate_action(action)
-        if action == "gistic":
-            return {"action_type": "gistic", "extra_args": {"pipeline_id": "ENSEMBL"}}
         if action == "log2":
+            return {"action_type": "log2", "extra_args": {"pipeline_id": "ENSEMBL"}}
+        if action == "gistic":
             return {
-                "action_type": "log2",
+                "action_type": "gistic",
                 "extra_args": {
                     "pipeline_id": "ENSEMBL",
                     "amplification": "9",

diff --git a/snappy_pipeline/workflows/somatic_variant_annotation/Snakefile b/snappy_pipeline/workflows/somatic_variant_annotation/Snakefile
@@ -92,7 +92,7 @@ rule somatic_variant_annotation_jannovar_annotate_somatic_vcf:
     log:
         **wf.get_log_file("jannovar", "annotate_somatic_vcf"),
     wrapper:
-        wf.wrapper_path("jannovar_par/annotate_somatic_vcf")
+        wf.wrapper_path("jannovar/annotate_somatic_vcf")
 
 
 # Variant Statistics Computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

diff --git a/snappy_pipeline/workflows/somatic_variant_annotation/__init__.py b/snappy_pipeline/workflows/somatic_variant_annotation/__init__.py
@@ -26,7 +26,31 @@
 Step Output
 ===========
 
-TODO
+Users can annotate all genes & transcripts overlapping with the variant locus, or
+they can select one representative gene and transcript for annotation.
+In the latter case, the output vcf file will only contain one annotation per variant, while
+in the former case, there might be over 100 annotations for each variant.
+
+The ordering of features driving the representative annotation choice is under user control.
+The default order is:
+
+1. ``biotype``: protein coding genes come first, it is unclear what is the order for other types of genes
+2. ``mane``: the `MANE transcript <https://www.ncbi.nlm.nih.gov/refseq/MANE/>`_ is selected before other transcripts
+3. ``appris``: the `APPRIS principal isoform <https://academic.oup.com/bioinformatics/article/38/Supplement_2/ii89/6701991>`_ is selected before alternates
+4. ``tsl``: `Transcript Support Level <http://www.ensembl.org/info/genome/genebuild/transcript_quality_tags.html>`_ values in increasing order
+5. ``ccds``: Transcripts with `CCDS <https://www.ncbi.nlm.nih.gov/projects/CCDS/CcdsBrowse.cgi>`_ ids are selected before those without
+6. ``canonical``: ENSEMBL canonical transcripts are selected before the others
+7. ``rank``: VEP internal ranking is used
+8. ``length``: longer transcripts are preferred to shorter ones
+
+This order is (hopefully) suitable for cBioPortal export, as well defined transcripts from protein-coding genes are selected when possible.
+However, it is recommended to check the full annotation for variants in or nearby disease-relevant genes.
+
+All annotators generate a vcf with one annotation per transcript, and some annotators
+(only ENSEMBL's Variant Effect Predictor in the current implementation) can also produce another
+output containing all annotations.
+The single annotation vcf is named ``<mapper>.<caller>.<annotator>.vcf.gz`` and
+the full annotation output is named ``<mapper>.<caller>.<annotator>.full.vcf.gz``
 
 ====================
 Global Configuration
@@ -115,7 +139,7 @@
       assembly: GRCh38
       cache_version: 102        # WARNING- this must match the wrapper's vep version!
       tx_flag: "gencode_basic"  # The flag selecting the transcripts.  One of "gencode_basic", "refseq", and "merged".
-      pick: yes                 # Other option: no (report one or all consequences)
+      pick_order: ["biotype", "mane", "appris", "tsl", "ccds", "canonical", "rank", "length"]
       num_threads: 8
       buffer_size: 1000
       output_options:
@@ -131,6 +155,9 @@ class AnnotateSomaticVcfStepPart(BaseStepPart):
         The ``tumor_library`` wildcard can actually be the name of a donor!
     """
 
+    #: Only creates vcf with one annotation per variant
+    has_full = False
+
     def __init__(self, parent):
         super().__init__(parent)
         # Build shortcut from cancer bio sample name to matched cancre sample
@@ -169,6 +196,9 @@ def get_output_files(self, action):
             "{{mapper}}.{{var_caller}}.{annotator}.{{tumor_library}}"
         ).format(annotator=self.annotator)
         key_ext = {"vcf": ".vcf.gz", "vcf_tbi": ".vcf.gz.tbi"}
+        if self.has_full:
+            key_ext["full"] = ".full.vcf.gz"
+            key_ext["full_tbi"] = ".full.vcf.gz.tbi"
         for key, ext in key_ext.items():
             yield key, prefix + ext
             yield key + "_md5", prefix + ext + ".md5"
@@ -261,6 +291,12 @@ class VepAnnotateSomaticVcfStepPart(AnnotateSomaticVcfStepPart):
     #: Class available actions
     actions = ("run",)
 
+    #: Also creates vcf with all annotations
+    has_full = True
+
+    #: Allowed keywords for pick order
+    PICK_ORDER = ("biotype", "mane", "appris", "tsl", "ccds", "canonical", "rank", "length")
+
     def get_resource_usage(self, action):
         """Get Resource Usage
 
@@ -282,8 +318,10 @@ def check_config(self):
             return
         if not self.config["vep"]["tx_flag"] in ("merged", "refseq", "gencode_basic"):
             raise InvalidConfiguration("tx_flag must be 'gencode_basic', or 'merged' or 'refseq'")
-        if not self.config["vep"]["pick"] in ("yes", "no"):
-            raise InvalidConfiguration("pick must be either 'yes' or 'no'")
+        if not all([x in self.PICK_ORDER for x in self.config["vep"]["pick_order"]]):
+            raise InvalidConfiguration(
+                "pick order keywords must be in {}".format(", ".join(self.PICK_ORDER))
+            )
 
 
 class SomaticVariantAnnotationWorkflow(BaseStep):
@@ -362,6 +400,23 @@ def get_result_files(self):
                 ".conda_list.txt.md5",
             ),
         )
+        # Annotators with full output
+        full = list(
+            map(
+                lambda x: x.replace("jannovar", "jannovar_annotate_somatic_vcf"),
+                filter(
+                    lambda x: self.sub_steps[x].has_full,
+                    set(self.config["tools"]) & set(ANNOTATION_TOOLS),
+                ),
+            )
+        )
+        yield from self._yield_result_files_matched(
+            os.path.join("output", name_pattern, "out", name_pattern + ".full{ext}"),
+            mapper=self.config["tools_ngs_mapping"],
+            caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED),
+            annotator=full,
+            ext=EXT_VALUES,
+        )
         # joint calling
         name_pattern = "{mapper}.{caller}.{annotator}.{donor.name}"
         yield from self._yield_result_files_joint(

diff --git a/snappy_wrappers/wrappers/cbioportal/merge_tables/script.R b/snappy_wrappers/wrappers/cbioportal/merge_tables/script.R
@@ -37,13 +37,13 @@ merge_tables <- function(fns, mappings, type=c("log2", "gistic", "segment", "exp
     }
 
     if (type == "gistic") {
-        stopifnot(all(c("pipeline_id") %in% names(args)))
+        stopifnot(all(c("pipeline_id", "amplification") %in% names(args)))
         # Copy numbers (in "cn" column) are transformed into (pseudo-) gistic codes:
         # 0: Deep deletion, 1: heterozygous deletion, 2: copy number neutral, 3: gain, 4: amplification
         # In https://doi.org/10.1038/s41586-022-04738-6, the amplification is defined as 
         # copy number greater or equal to 9 copies (args$amplification = 9).
         args$amplification <- as.numeric(args$amplification)
-        cn <- read_sample_files(fns, args$pipeline_id, "cn")
+        cn <- round(read_sample_files(fns, args$pipeline_id, "cn"))
         cn[args$amplification<=cn] <- args$amplification
         cn[2<cn & cn<args$amplification] <- 3
         cn[cn==args$amplification] <- 4
@@ -57,15 +57,14 @@ merge_tables <- function(fns, mappings, type=c("log2", "gistic", "segment", "exp
     }
 
     if (type == "log2") {
-        stopifnot(all(c("pipeline_id", "amplification") %in% names(args)))
+        stopifnot(all(c("pipeline_id") %in% names(args)))
         tmp <- read_sample_files(fns, args$pipeline_id, "log2")
         method <- "max"
     }
 
     if (remove_feature_version) rownames(tmp) <- sub("\\.[0-9]+$", "", rownames(tmp))
 
     rslt <- map_feature_id(tmp, mappings, from=args$pipeline_id, to="SYMBOL", method=method)
-    if (type == "gistic") rslt <- rslt+2
 
     rslt <- data.frame(
         Hugo_Symbol=rownames(rslt),

diff --git a/snappy_wrappers/wrappers/jannovar/annotate_somatic_vcf/wrapper.py b/snappy_wrappers/wrappers/jannovar/annotate_somatic_vcf/wrapper.py
@@ -9,7 +9,7 @@
 
 # Get shortcuts to static data and step configuration
 static_config = snakemake.config["static_data_config"]
-anno_config = snakemake.config["step_config"]["somatic_variant_annotation"]
+anno_config = snakemake.config["step_config"]["somatic_variant_annotation"]["jannovar"]
 
 # Build list of arguments to pass to Jannovar
 annotation_args = []
@@ -81,8 +81,12 @@
 annotation_snippet = " \\\n    ".join(annotation_args)
 
 # Build intervals argument
-arg_intervals = " ".join(
-    ["--interval {}".format(interval) for interval in snakemake.params["args"]["intervals"]]
+arg_intervals = (
+    " ".join(
+        ["--interval {}".format(interval) for interval in snakemake.params["args"]["intervals"]]
+    )
+    if "args" in snakemake.params and "intervals" in snakemake.params["args"]
+    else ""
 )
 
 shell(
@@ -113,8 +117,8 @@
     -XX:+UseG1GC \
     --input-vcf {snakemake.input.vcf} \
     --output-vcf {snakemake.output.vcf} \
-    --database {snakemake.config[step_config][somatic_variant_annotation][path_jannovar_ser]} \
-    --ref-fasta {snakemake.config[static_data_config][reference][path]} \
+    --database {anno_config[path_jannovar_ser]} \
+    --ref-fasta {static_config[reference][path]} \
     {arg_intervals} \
     {annotation_snippet}
 

diff --git a/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/jannovar.yaml b/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/jannovar.yaml
@@ -348,6 +348,7 @@ output:                                               # Output columns definitio
             add_sequence: False                       # The arguments "add_sequence", "want_short" & "want_long"
             want_short: True                          # allow the user to change this behaviour.
             want_long: False
+            illegal: True                             # Skip variant when HGVSp is illegal
         on_missing: skip
         # default: ""
 

diff --git a/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/protein_mutation_parser.py b/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/protein_mutation_parser.py
@@ -230,6 +230,25 @@ def _build_protein_pattern():
     return re.compile(pattern)
 
 
+@functools.lru_cache
+def _build_silent_dinucleotide():
+    prefix = "^(([A-z0-9_\.\(\)-]+):)?p\.\(?"  # noqa: W605
+    postfix = "\)?$"  # noqa: W605
+
+    aa = "(" + "|".join(aa_codes_short) + "|" + "|".join(aa_codes_long) + ")"
+    aaTer = (
+        "("
+        + "|".join(aa_codes_short)
+        + "|"
+        + "|".join(aa_codes_long)
+        + "|\*|Ter"  # noqa: W605
+        + ")"
+    )
+    nb = "([0-9]+)"
+
+    return re.compile(prefix + aa + aaTer + "*" + nb + "=" + postfix)
+
+
 def _aa(aa, want_short=True, want_long=False):
     if want_short and aa in long_to_short.keys():
         return long_to_short[aa]
@@ -361,7 +380,7 @@ def _parse_mutation_groups(groups, want_short=True, want_long=False):
             + "1ext"
             + groups[iGroup + 1]
         )
-    iGroup += 3
+    iGroup += 2
 
     if groups[iGroup] is not None:
         # Extension C terminus
@@ -414,10 +433,12 @@ def parse_protein_mutation(x, args):
     add_sequence = False
     want_short = True
     want_long = False
+    illegal = True
     if args is not None and isinstance(args, dict):
         add_sequence = args.get("add_sequence", add_sequence)
         want_short = args.get("want_short", want_short)
         want_long = args.get("want_long", want_long)
+        illegal = args.get("illegal", illegal)
     if want_short and want_long:
         raise Exception("Cannot force both short & long amino-acide representations")
 
@@ -429,7 +450,14 @@ def parse_protein_mutation(x, args):
             )
             rslts.append(rslt)
         except ProteinMutationFormatException as e:
-            raise Exception(e)
+            if not illegal:
+                m = _build_silent_dinucleotide().match(mut)
+                if not m:
+                    rslts.append("")
+                else:
+                    raise Exception(e)
+            else:
+                raise Exception(e)
 
     return rslts
 

diff --git a/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/vep.yaml b/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/vep.yaml
@@ -350,6 +350,7 @@ output:                                               # Output columns definitio
             add_sequence: False                       # The arguments "add_sequence", "want_short" & "want_long"
             want_short: True                          # allow the user to change this behaviour.
             want_long: False
+            illegal: False                            # Allow some illegal HGVSp format, replacing them with empty
         on_missing: skip
         # default: ""