Skip to content

Commit

Permalink
feat: 438 vep annotation order (#441)
Browse files Browse the repository at this point in the history
  • Loading branch information
ericblanc20 authored Sep 14, 2023
1 parent ac1a34f commit c28d67e
Show file tree
Hide file tree
Showing 12 changed files with 345 additions and 31 deletions.
9 changes: 3 additions & 6 deletions snappy_pipeline/workflows/cbioportal_export/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,6 @@
snappy_pipeline. It does the necessary data transformations to comply with the
formats expected by cBioPortal and writes out the necessary metadata and data
files.
The output is a archive (tarball) that can be uploaded to a cbioportal instance
for import.
"""

from collections import OrderedDict
Expand Down Expand Up @@ -436,11 +433,11 @@ def __init__(self, parent):
def get_args(self, action):
# Validate action
self._validate_action(action)
if action == "gistic":
return {"action_type": "gistic", "extra_args": {"pipeline_id": "ENSEMBL"}}
if action == "log2":
return {"action_type": "log2", "extra_args": {"pipeline_id": "ENSEMBL"}}
if action == "gistic":
return {
"action_type": "log2",
"action_type": "gistic",
"extra_args": {
"pipeline_id": "ENSEMBL",
"amplification": "9",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ rule somatic_variant_annotation_jannovar_annotate_somatic_vcf:
log:
**wf.get_log_file("jannovar", "annotate_somatic_vcf"),
wrapper:
wf.wrapper_path("jannovar_par/annotate_somatic_vcf")
wf.wrapper_path("jannovar/annotate_somatic_vcf")


# Variant Statistics Computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
63 changes: 59 additions & 4 deletions snappy_pipeline/workflows/somatic_variant_annotation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,31 @@
Step Output
===========
TODO
Users can annotate all genes & transcripts overlapping with the variant locus, or
they can select one representative gene and transcript for annotation.
In the latter case, the output vcf file will only contain one annotation per variant, while
in the former case, there might be over 100 annotations for each variant.
The ordering of features driving the representative annotation choice is under user control.
The default order is:
1. ``biotype``: protein coding genes come first, it is unclear what is the order for other types of genes
2. ``mane``: the `MANE transcript <https://www.ncbi.nlm.nih.gov/refseq/MANE/>`_ is selected before other transcripts
3. ``appris``: the `APPRIS principal isoform <https://academic.oup.com/bioinformatics/article/38/Supplement_2/ii89/6701991>`_ is selected before alternates
4. ``tsl``: `Transcript Support Level <http://www.ensembl.org/info/genome/genebuild/transcript_quality_tags.html>`_ values in increasing order
5. ``ccds``: Transcripts with `CCDS <https://www.ncbi.nlm.nih.gov/projects/CCDS/CcdsBrowse.cgi>`_ ids are selected before those without
6. ``canonical``: ENSEMBL canonical transcripts are selected before the others
7. ``rank``: VEP internal ranking is used
8. ``length``: longer transcripts are preferred to shorter ones
This order is (hopefully) suitable for cBioPortal export, as well defined transcripts from protein-coding genes are selected when possible.
However, it is recommended to check the full annotation for variants in or nearby disease-relevant genes.
All annotators generate a vcf with one annotation per transcript, and some annotators
(only ENSEMBL's Variant Effect Predictor in the current implementation) can also produce another
output containing all annotations.
The single annotation vcf is named ``<mapper>.<caller>.<annotator>.vcf.gz`` and
the full annotation output is named ``<mapper>.<caller>.<annotator>.full.vcf.gz``
====================
Global Configuration
Expand Down Expand Up @@ -115,7 +139,7 @@
assembly: GRCh38
cache_version: 102 # WARNING- this must match the wrapper's vep version!
tx_flag: "gencode_basic" # The flag selecting the transcripts. One of "gencode_basic", "refseq", and "merged".
pick: yes # Other option: no (report one or all consequences)
pick_order: ["biotype", "mane", "appris", "tsl", "ccds", "canonical", "rank", "length"]
num_threads: 8
buffer_size: 1000
output_options:
Expand All @@ -131,6 +155,9 @@ class AnnotateSomaticVcfStepPart(BaseStepPart):
The ``tumor_library`` wildcard can actually be the name of a donor!
"""

#: Only creates vcf with one annotation per variant
has_full = False

def __init__(self, parent):
super().__init__(parent)
# Build shortcut from cancer bio sample name to matched cancre sample
Expand Down Expand Up @@ -169,6 +196,9 @@ def get_output_files(self, action):
"{{mapper}}.{{var_caller}}.{annotator}.{{tumor_library}}"
).format(annotator=self.annotator)
key_ext = {"vcf": ".vcf.gz", "vcf_tbi": ".vcf.gz.tbi"}
if self.has_full:
key_ext["full"] = ".full.vcf.gz"
key_ext["full_tbi"] = ".full.vcf.gz.tbi"
for key, ext in key_ext.items():
yield key, prefix + ext
yield key + "_md5", prefix + ext + ".md5"
Expand Down Expand Up @@ -261,6 +291,12 @@ class VepAnnotateSomaticVcfStepPart(AnnotateSomaticVcfStepPart):
#: Class available actions
actions = ("run",)

#: Also creates vcf with all annotations
has_full = True

#: Allowed keywords for pick order
PICK_ORDER = ("biotype", "mane", "appris", "tsl", "ccds", "canonical", "rank", "length")

def get_resource_usage(self, action):
"""Get Resource Usage
Expand All @@ -282,8 +318,10 @@ def check_config(self):
return
if not self.config["vep"]["tx_flag"] in ("merged", "refseq", "gencode_basic"):
raise InvalidConfiguration("tx_flag must be 'gencode_basic', or 'merged' or 'refseq'")
if not self.config["vep"]["pick"] in ("yes", "no"):
raise InvalidConfiguration("pick must be either 'yes' or 'no'")
if not all([x in self.PICK_ORDER for x in self.config["vep"]["pick_order"]]):
raise InvalidConfiguration(
"pick order keywords must be in {}".format(", ".join(self.PICK_ORDER))
)


class SomaticVariantAnnotationWorkflow(BaseStep):
Expand Down Expand Up @@ -362,6 +400,23 @@ def get_result_files(self):
".conda_list.txt.md5",
),
)
# Annotators with full output
full = list(
map(
lambda x: x.replace("jannovar", "jannovar_annotate_somatic_vcf"),
filter(
lambda x: self.sub_steps[x].has_full,
set(self.config["tools"]) & set(ANNOTATION_TOOLS),
),
)
)
yield from self._yield_result_files_matched(
os.path.join("output", name_pattern, "out", name_pattern + ".full{ext}"),
mapper=self.config["tools_ngs_mapping"],
caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED),
annotator=full,
ext=EXT_VALUES,
)
# joint calling
name_pattern = "{mapper}.{caller}.{annotator}.{donor.name}"
yield from self._yield_result_files_joint(
Expand Down
7 changes: 3 additions & 4 deletions snappy_wrappers/wrappers/cbioportal/merge_tables/script.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,13 @@ merge_tables <- function(fns, mappings, type=c("log2", "gistic", "segment", "exp
}

if (type == "gistic") {
stopifnot(all(c("pipeline_id") %in% names(args)))
stopifnot(all(c("pipeline_id", "amplification") %in% names(args)))
# Copy numbers (in "cn" column) are transformed into (pseudo-) gistic codes:
# 0: Deep deletion, 1: heterozygous deletion, 2: copy number neutral, 3: gain, 4: amplification
# In https://doi.org/10.1038/s41586-022-04738-6, the amplification is defined as
# copy number greater or equal to 9 copies (args$amplification = 9).
args$amplification <- as.numeric(args$amplification)
cn <- read_sample_files(fns, args$pipeline_id, "cn")
cn <- round(read_sample_files(fns, args$pipeline_id, "cn"))
cn[args$amplification<=cn] <- args$amplification
cn[2<cn & cn<args$amplification] <- 3
cn[cn==args$amplification] <- 4
Expand All @@ -57,15 +57,14 @@ merge_tables <- function(fns, mappings, type=c("log2", "gistic", "segment", "exp
}

if (type == "log2") {
stopifnot(all(c("pipeline_id", "amplification") %in% names(args)))
stopifnot(all(c("pipeline_id") %in% names(args)))
tmp <- read_sample_files(fns, args$pipeline_id, "log2")
method <- "max"
}

if (remove_feature_version) rownames(tmp) <- sub("\\.[0-9]+$", "", rownames(tmp))

rslt <- map_feature_id(tmp, mappings, from=args$pipeline_id, to="SYMBOL", method=method)
if (type == "gistic") rslt <- rslt+2

rslt <- data.frame(
Hugo_Symbol=rownames(rslt),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

# Get shortcuts to static data and step configuration
static_config = snakemake.config["static_data_config"]
anno_config = snakemake.config["step_config"]["somatic_variant_annotation"]
anno_config = snakemake.config["step_config"]["somatic_variant_annotation"]["jannovar"]

# Build list of arguments to pass to Jannovar
annotation_args = []
Expand Down Expand Up @@ -81,8 +81,12 @@
annotation_snippet = " \\\n ".join(annotation_args)

# Build intervals argument
arg_intervals = " ".join(
["--interval {}".format(interval) for interval in snakemake.params["args"]["intervals"]]
arg_intervals = (
" ".join(
["--interval {}".format(interval) for interval in snakemake.params["args"]["intervals"]]
)
if "args" in snakemake.params and "intervals" in snakemake.params["args"]
else ""
)

shell(
Expand Down Expand Up @@ -113,8 +117,8 @@
-XX:+UseG1GC \
--input-vcf {snakemake.input.vcf} \
--output-vcf {snakemake.output.vcf} \
--database {snakemake.config[step_config][somatic_variant_annotation][path_jannovar_ser]} \
--ref-fasta {snakemake.config[static_data_config][reference][path]} \
--database {anno_config[path_jannovar_ser]} \
--ref-fasta {static_config[reference][path]} \
{arg_intervals} \
{annotation_snippet}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@ output: # Output columns definitio
add_sequence: False # The arguments "add_sequence", "want_short" & "want_long"
want_short: True # allow the user to change this behaviour.
want_long: False
illegal: True # Skip variant when HGVSp is illegal
on_missing: skip
# default: ""

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,25 @@ def _build_protein_pattern():
return re.compile(pattern)


@functools.lru_cache
def _build_silent_dinucleotide():
prefix = "^(([A-z0-9_\.\(\)-]+):)?p\.\(?" # noqa: W605
postfix = "\)?$" # noqa: W605

aa = "(" + "|".join(aa_codes_short) + "|" + "|".join(aa_codes_long) + ")"
aaTer = (
"("
+ "|".join(aa_codes_short)
+ "|"
+ "|".join(aa_codes_long)
+ "|\*|Ter" # noqa: W605
+ ")"
)
nb = "([0-9]+)"

return re.compile(prefix + aa + aaTer + "*" + nb + "=" + postfix)


def _aa(aa, want_short=True, want_long=False):
if want_short and aa in long_to_short.keys():
return long_to_short[aa]
Expand Down Expand Up @@ -361,7 +380,7 @@ def _parse_mutation_groups(groups, want_short=True, want_long=False):
+ "1ext"
+ groups[iGroup + 1]
)
iGroup += 3
iGroup += 2

if groups[iGroup] is not None:
# Extension C terminus
Expand Down Expand Up @@ -414,10 +433,12 @@ def parse_protein_mutation(x, args):
add_sequence = False
want_short = True
want_long = False
illegal = True
if args is not None and isinstance(args, dict):
add_sequence = args.get("add_sequence", add_sequence)
want_short = args.get("want_short", want_short)
want_long = args.get("want_long", want_long)
illegal = args.get("illegal", illegal)
if want_short and want_long:
raise Exception("Cannot force both short & long amino-acide representations")

Expand All @@ -429,7 +450,14 @@ def parse_protein_mutation(x, args):
)
rslts.append(rslt)
except ProteinMutationFormatException as e:
raise Exception(e)
if not illegal:
m = _build_silent_dinucleotide().match(mut)
if not m:
rslts.append("")
else:
raise Exception(e)
else:
raise Exception(e)

return rslts

Expand Down
1 change: 1 addition & 0 deletions snappy_wrappers/wrappers/vcf2maf/vcf_to_table/vep.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,7 @@ output: # Output columns definitio
add_sequence: False # The arguments "add_sequence", "want_short" & "want_long"
want_short: True # allow the user to change this behaviour.
want_long: False
illegal: False # Allow some illegal HGVSp format, replacing them with empty
on_missing: skip
# default: ""

Expand Down
Loading

0 comments on commit c28d67e

Please sign in to comment.