From e8f5716993cb53177ac08bf4a6062de48375b7a9 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Tue, 25 Apr 2023 13:16:00 +0200 Subject: [PATCH 1/7] feat: change varfish-annotator to mehari (#392) --- .../workflows/varfish_export/__init__.py | 8 +- .../mehari/annotate_seqvars/environment.yaml | 1 + .../mehari/annotate_seqvars/wrapper.py | 123 +++++++++++ .../annotate_strucvars/environment.yaml | 1 + .../annotate_strucvars/fix_manta_invs.py | 192 +++++++++++++++++ .../mehari/annotate_strucvars/wrapper.py | 148 ++++++++++++++ .../wrappers/mehari/bam_qc/environment.yaml | 1 + .../wrappers/mehari/bam_qc/wrapper.py | 1 + .../wrappers/mehari/environment.yaml | 11 + .../annotate_svs/fix_manta_invs.py | 193 +----------------- 10 files changed, 481 insertions(+), 198 deletions(-) create mode 120000 snappy_wrappers/wrappers/mehari/annotate_seqvars/environment.yaml create mode 100644 snappy_wrappers/wrappers/mehari/annotate_seqvars/wrapper.py create mode 120000 snappy_wrappers/wrappers/mehari/annotate_strucvars/environment.yaml create mode 100644 snappy_wrappers/wrappers/mehari/annotate_strucvars/fix_manta_invs.py create mode 100644 snappy_wrappers/wrappers/mehari/annotate_strucvars/wrapper.py create mode 120000 snappy_wrappers/wrappers/mehari/bam_qc/environment.yaml create mode 120000 snappy_wrappers/wrappers/mehari/bam_qc/wrapper.py create mode 100644 snappy_wrappers/wrappers/mehari/environment.yaml mode change 100644 => 120000 snappy_wrappers/wrappers/varfish_annotator/annotate_svs/fix_manta_invs.py diff --git a/snappy_pipeline/workflows/varfish_export/__init__.py b/snappy_pipeline/workflows/varfish_export/__init__.py index 1012444ea..dde735975 100644 --- a/snappy_pipeline/workflows/varfish_export/__init__.py +++ b/snappy_pipeline/workflows/varfish_export/__init__.py @@ -121,12 +121,8 @@ release: GRCh37 # REQUIRED: default 'GRCh37' # Path to BED file with exons; used for reducing data to near-exon small variants. path_exon_bed: null # REQUIRED: exon BED file to use - # Path to Jannovar RefSeq ``.ser`` file for annotation - path_refseq_ser: REQUIRED # REQUIRED: path to RefSeq .ser file - # Path to Jannovar ENSEMBL ``.ser`` file for annotation - path_ensembl_ser: REQUIRED # REQUIRED: path to ENSEMBL .ser file - # Path to VarFish annotator database file to use for annotating. - path_db: REQUIRED # REQUIRED: spath to varfish-annotator DB file to use + # Path to mehari database. + path_mehari_db: REQUIRED # REQUIRED: path to mehari database """ diff --git a/snappy_wrappers/wrappers/mehari/annotate_seqvars/environment.yaml b/snappy_wrappers/wrappers/mehari/annotate_seqvars/environment.yaml new file mode 120000 index 000000000..2e107ac86 --- /dev/null +++ b/snappy_wrappers/wrappers/mehari/annotate_seqvars/environment.yaml @@ -0,0 +1 @@ +../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/mehari/annotate_seqvars/wrapper.py b/snappy_wrappers/wrappers/mehari/annotate_seqvars/wrapper.py new file mode 100644 index 000000000..b08d61c4c --- /dev/null +++ b/snappy_wrappers/wrappers/mehari/annotate_seqvars/wrapper.py @@ -0,0 +1,123 @@ +import os + +from snakemake.shell import shell + +__author__ = "Manuel Holtgrewe " + +# Get shortcut to configuration of varfish_export step +step_name = snakemake.params.args["step_name"] +export_config = snakemake.config["step_config"][step_name] + +DEF_HELPER_FUNCS = r""" +compute-md5() +{ + if [[ $# -ne 2 ]]; then + >&2 echo "Invalid number of arguments: $#" + exit 1 + fi + md5sum $1 \ + | awk '{ gsub(/.*\//, "", $2); print; }' \ + > $2 +} +""" + +shell( + r""" +set -x + +# Write files for reproducibility ----------------------------------------------------------------- + +{DEF_HELPER_FUNCS} + +# Write out information about conda and save a copy of the wrapper with picked variables +# as well as the environment.yaml file. +conda list >{snakemake.log.conda_list} +conda info >{snakemake.log.conda_info} +compute-md5 {snakemake.log.conda_list} {snakemake.log.conda_list_md5} +compute-md5 {snakemake.log.conda_info} {snakemake.log.conda_info_md5} +cp {__real_file__} {snakemake.log.wrapper} +compute-md5 {snakemake.log.wrapper} {snakemake.log.wrapper_md5} +cp $(dirname {__file__})/environment.yaml {snakemake.log.env_yaml} +compute-md5 {snakemake.log.env_yaml} {snakemake.log.env_yaml_md5} + +# Also pipe stderr to log file -------------------------------------------------------------------- + +if [[ -n "{snakemake.log.log}" ]]; then + if [[ "$(set +e; tty; set -e)" != "" ]]; then + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + exec 2> >(tee -a "{snakemake.log.log}" >&2) + else + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + echo "No tty, logging disabled" >"{snakemake.log.log}" + fi +fi + +# Create auto-cleaned temporary directory +export TMPDIR=$(mktemp -d) +trap "rm -rf $TMPDIR" EXIT + +# Run actual tools -------------------------------------------------------------------------------- + +# Extract around BED file, if given. +if [[ -n "{export_config[path_exon_bed]}" ]] && [[ "{export_config[path_exon_bed]}" != "None" ]]; then + set -e + bcftools view \ + -R {export_config[path_exon_bed]} \ + {snakemake.input.vcf} \ + | bcftools sort -T $TMPDIR \ + | bcftools norm -d all \ + | bgzip -c \ + > $TMPDIR/tmp.vcf.gz + tabix -f $TMPDIR/tmp.vcf.gz +else + set -e + ln -sr {snakemake.input.vcf} $TMPDIR/tmp.vcf.gz + ln -sr {snakemake.input.vcf}.tbi $TMPDIR/tmp.vcf.gz.tbi +fi + +release=$(echo {export_config[release]} | tr '[:upper:]' '[:lower:]') + +# Execute VarFish Annotator +mehari \ + annotate \ + seqvars \ + --release ${release} \ + --path-db {export_config[path_mehari_db]} \ + --path-input-ped {snakemake.input.ped} \ + --path-input-vcf $TMPDIR/tmp.vcf.gz \ + --path-output-tsv >(gzip -c > {snakemake.output.gts}) + +cat | gzip -c > {snakemake.output.db-infos} <"): + ref_seq.append(seq) + return "".join(ref_seq).upper() + + +def convert_manta_inversions(args, reader, writer, inv_bnds): + for record in reader: + if record.ID[0] in inv_bnds: + continue # skip mate record + _, mate_pos, inv_type = analyze_inv(record) + if inv_type != InversionType.NONE: + if inv_type == InversionType.INV5: + # Adjust POS for 5' inversion + record.POS -= 1 + mate_pos -= 1 + record.REF = samtools_faidx( + args.reference_fasta, record.CHROM, record.POS, record.POS + ) + + id_suffix = record.ID[0].split("MantaBND")[1] + idx = id_suffix.rfind(":") + record.ID = ["MantaINV%s" % id_suffix[:idx]] + + record.ALT = [vcfpy.SymbolicAllele("INV")] + + record.INFO["END"] = mate_pos + record.INFO["SVTYPE"] = "INV" + record.INFO["SVLEN"] = [mate_pos - record.POS] + if record.INFO.get("IMPRECISE"): + mate_id = record.INFO["MATEID"][0] + record.INFO["CIEND"] = inv_bnds[mate_id].INFO["CIPOS"] + elif record.INFO.get("HOMLEN"): + record.INFO["CIEND"] = [-record.INFO["HOMLEN"][0], 0] + + if inv_type == InversionType.INV5 and "HOMSEQ" in record.INFO: + cipos = record.INFO["CIPOS"] + hom_seq_start = record.POS + cipos[0] + 1 + hom_seq_end = record.POS + cipos[1] + record.INFO["HOMSEQ"] = [ + samtools_faidx(args.reference_fasta, record.CHROM, hom_seq_start, hom_seq_end) + ] + + # remove BND-specific info + for key in ("MATEID", "BND_DEPTH", "MATE_BND_DEPTH"): + if key in record.INFO: + record.INFO.pop(key) + + # update INFO/EVENT + if "EVENT" in record.INFO: + eid_suffix = record.INFO["EVENT"].split("MantaBND")[1] + idx = eid_suffix.rfind(":") + record.INFO["EVENT"] = "MantaINV%s" % eid_suffix[:idx] + + # set INV3/INV5 tags + if inv_type == InversionType.INV3: + record.INFO["INV3"] = True + elif inv_type == InversionType.INV5: + record.INFO["INV5"] = True + + record.INFO = dict(sorted(record.INFO.items())) + + writer.write_record(record) + + +def run_manta(args, reader): + """Run MANTA conversion.""" + # First, collect all inversion BND records. + with vcfpy.Reader.from_path(args.input_vcf) as reader2: + inv_bnds = collect_inv_bnds(reader2) + # Go through file a second time and convert inversions. + header = reader.header.copy() + header.add_info_line( + { + "ID": "INV3", + "Number": "0", + "Type": "Flag", + "Description": "Inversion breakends open 3' of reported location", + } + ) + header.add_info_line( + { + "ID": "INV5", + "Number": "0", + "Type": "Flag", + "Description": "Inversion breakends open 5' of reported location", + } + ) + header.add_line( + vcfpy.AltAlleleHeaderLine.from_mapping( + { + "ID": "INV", + "Description": "Inversion", + } + ) + ) + with vcfpy.Writer.from_path(args.output_vcf, header) as writer: + convert_manta_inversions(args, reader, writer, inv_bnds) + + +def run(args): + """Main entry point after parsing command line. + + Non-MANTA VCF files are handled directly by copying. MANTA files are processed with + run_manta(). + """ + with vcfpy.Reader.from_path(args.input_vcf) as reader: + if not looks_like_manta(reader.header): # simply copy + with vcfpy.Writer.from_path(args.output_vcf, reader.header) as writer: + for record in reader: + writer.write_record(record) + else: + run_manta(args, reader) + + +def main(argv=None): + parser = argparse.ArgumentParser() + parser.add_argument("--reference-fasta", required=True, help="Reference FASTA file.") + parser.add_argument("--input-vcf", required=True, help="Input VCF file.") + parser.add_argument("--output-vcf", required=True, help="Output VCF file.") + + args = parser.parse_args(argv) + return run(args) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/snappy_wrappers/wrappers/mehari/annotate_strucvars/wrapper.py b/snappy_wrappers/wrappers/mehari/annotate_strucvars/wrapper.py new file mode 100644 index 000000000..0565841fb --- /dev/null +++ b/snappy_wrappers/wrappers/mehari/annotate_strucvars/wrapper.py @@ -0,0 +1,148 @@ +import os + +from snakemake.shell import shell + +__author__ = "Manuel Holtgrewe " + +# Optionally get path to coverage VCF file. +coverage_vcf = " ".join(getattr(snakemake.input, "vcf_cov", [])) + +# Get shortcut to configuration of varfish_export step +step_name = snakemake.params.args["step_name"] +export_config = snakemake.config["step_config"][step_name] +# Get shortcut to "fix_manta_invs.py" postprocessing script +fix_manta_invs = os.path.join( + os.path.dirname(__file__), + "fix_manta_invs.py", +) + +DEF_HELPER_FUNCS = r""" +compute-md5() +{ + if [[ $# -ne 2 ]]; then + >&2 echo "Invalid number of arguments: $#" + exit 1 + fi + md5sum $1 \ + | awk '{ gsub(/.*\//, "", $2); print; }' \ + > $2 +} +""" + +shell( + r""" +set -x + +# Write files for reproducibility ----------------------------------------------------------------- + +{DEF_HELPER_FUNCS} + +# Write out information about conda and save a copy of the wrapper with picked variables +# as well as the environment.yaml file. +conda list >{snakemake.log.conda_list} +conda info >{snakemake.log.conda_info} +compute-md5 {snakemake.log.conda_list} {snakemake.log.conda_list_md5} +compute-md5 {snakemake.log.conda_info} {snakemake.log.conda_info_md5} +cp {__real_file__} {snakemake.log.wrapper} +compute-md5 {snakemake.log.wrapper} {snakemake.log.wrapper_md5} +cp $(dirname {__file__})/environment.yaml {snakemake.log.env_yaml} +compute-md5 {snakemake.log.env_yaml} {snakemake.log.env_yaml_md5} + +# Also pipe stderr to log file -------------------------------------------------------------------- + +if [[ -n "{snakemake.log.log}" ]]; then + if [[ "$(set +e; tty; set -e)" != "" ]]; then + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + exec 2> >(tee -a "{snakemake.log.log}" >&2) + else + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + echo "No tty, logging disabled" >"{snakemake.log.log}" + fi +fi + +# Create auto-cleaned temporary directory +export TMPDIR=$(mktemp -d) +trap "rm -rf $TMPDIR" EXIT + +# Run actual tools -------------------------------------------------------------------------------- + +samples=$(cut -f 2 {snakemake.input.ped} | tr '\n' ',' | sed -e 's/,$//g') + +# Fix the Manta inversions +i=0 +for vcf in {snakemake.input.vcf}; do + let "i=$i+1" + num=$(printf %03d $i) + + python3 {fix_manta_invs} \ + --reference-fasta {snakemake.config[static_data_config][reference][path]} \ + --input-vcf $vcf \ + --output-vcf $TMPDIR/fixed_bnd_to_inv_unsorted.$num.vcf + bcftools sort -o $TMPDIR/fixed_bnd_to_inv.$num.vcf $TMPDIR/fixed_bnd_to_inv_unsorted.$num.vcf + + # Add the missing "GT" tag + echo '##FORMAT=' \ + > $TMPDIR/header.gt.txt + + bcftools annotate \ + -h $TMPDIR/header.gt.txt \ + $TMPDIR/fixed_bnd_to_inv.$num.vcf \ + -O z \ + -o $TMPDIR/final_for_import.$num.vcf.gz + tabix -s1 -b2 -e2 -f $TMPDIR/final_for_import.$num.vcf.gz +done + +# Compatibility mode with currently deployed VarFish Server +compatibility_option="--opt-out callers-array" + +# Execute VarFish Annotator +varfish-annotator \ + annotate-svs \ + -XX:MaxHeapSize=10g \ + -XX:+UseG1GC \ + \ + --release {export_config[release]} \ + \ + $(if [[ "{coverage_vcf}" != "" ]]; then \ + for path in {coverage_vcf}; do \ + echo --coverage-vcf $path; \ + done; \ + fi) \ + \ + --db-path {export_config[path_db]} \ + --refseq-ser-path {export_config[path_refseq_ser]} \ + --ensembl-ser-path {export_config[path_ensembl_ser]} \ + --input-ped {snakemake.input.ped} \ + \ + $(for vcf in $TMPDIR/final_for_import.*.vcf.gz; do \ + echo --input-vcf $vcf; \ + done) \ + --output-db-info {snakemake.output.db_infos} \ + --output-gts {snakemake.output.gts} \ + --output-feature-effects {snakemake.output.feature_effects} \ + $compatibility_option + +# Compute MD5 sums on output files +compute-md5 {snakemake.output.db_infos} {snakemake.output.db_infos_md5} +compute-md5 {snakemake.output.gts} {snakemake.output.gts_md5} +compute-md5 {snakemake.output.feature_effects} {snakemake.output.feature_effects_md5} + +# Create output links ----------------------------------------------------------------------------- + +for path in {snakemake.output.output_links}; do + dst=$path + src=work/${{dst#output/}} + ln -sr $src $dst +done +""" +) + +# Compute MD5 sums of logs. +shell( + r""" +{DEF_HELPER_FUNCS} + +sleep 1s # try to wait for log file flush +compute-md5 {snakemake.log.log} {snakemake.log.log_md5} +""" +) diff --git a/snappy_wrappers/wrappers/mehari/bam_qc/environment.yaml b/snappy_wrappers/wrappers/mehari/bam_qc/environment.yaml new file mode 120000 index 000000000..2e107ac86 --- /dev/null +++ b/snappy_wrappers/wrappers/mehari/bam_qc/environment.yaml @@ -0,0 +1 @@ +../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/mehari/bam_qc/wrapper.py b/snappy_wrappers/wrappers/mehari/bam_qc/wrapper.py new file mode 120000 index 000000000..16f251594 --- /dev/null +++ b/snappy_wrappers/wrappers/mehari/bam_qc/wrapper.py @@ -0,0 +1 @@ +../../varfish_annotator/bam_qc/wrapper.py \ No newline at end of file diff --git a/snappy_wrappers/wrappers/mehari/environment.yaml b/snappy_wrappers/wrappers/mehari/environment.yaml new file mode 100644 index 000000000..fec4b885e --- /dev/null +++ b/snappy_wrappers/wrappers/mehari/environment.yaml @@ -0,0 +1,11 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bcftools >=1.9 + - htslib >=1.9 + - samtools >=1.9 + - mehari ==0.2.0 + - jq + - vcfpy + - pysam diff --git a/snappy_wrappers/wrappers/varfish_annotator/annotate_svs/fix_manta_invs.py b/snappy_wrappers/wrappers/varfish_annotator/annotate_svs/fix_manta_invs.py deleted file mode 100644 index 8b306e707..000000000 --- a/snappy_wrappers/wrappers/varfish_annotator/annotate_svs/fix_manta_invs.py +++ /dev/null @@ -1,192 +0,0 @@ -"""Helper script to fix MANTA inversions.""" - -import argparse -import enum -import subprocess -import sys - -import vcfpy - - -def looks_like_manta(header): - """Checks VCF header whether the file looks like coming from MANTA.""" - for line in header.lines: - if line.key == "source" and line.value.startswith("GenerateSVCandidates"): - return True - elif line.key == "source" and line.value.startswith("DRAGEN"): - return True - return False - - -@enum.unique -class InversionType(enum.Enum): - """Inversion type.""" - - INV3 = "INV3" - INV5 = "INV5" - NONE = "NONE" - - -def analyze_inv(record): - def get_mate_info(alt, c): - arr = alt.split(c) - mate_chrom, mate_pos = arr[1].split(":") - return mate_chrom, int(mate_pos) - - alt = record.ALT[0].serialize() - if alt.startswith("["): - mate_chrom, mate_pos = get_mate_info(alt, "[") - if mate_chrom == record.CHROM: - return mate_chrom, mate_pos, InversionType.INV5 - elif alt.endswith("]"): - mate_chrom, mate_pos = get_mate_info(alt, "]") - if mate_chrom == record.CHROM: - return mate_chrom, mate_pos, InversionType.INV3 - return None, None, InversionType.NONE - - -def collect_inv_bnds(reader): - result = {} - for record in reader: - _, _, inv_type = analyze_inv(record) - if inv_type != InversionType.NONE: - if record.ID[0] in result: - result[record.ID[0]] = record - else: - result[record.INFO["MATEID"][0]] = None - return result - - -def samtools_faidx(reference_fasta, chrom, start_pos, end_pos): - region = f"{chrom}:{start_pos}-{end_pos}" - samtools_out = subprocess.check_output( - ["samtools", "faidx", reference_fasta, region], text=True - ) - ref_seq = [] - for seq in samtools_out.split("\n"): - if not seq.startswith(">"): - ref_seq.append(seq) - return "".join(ref_seq).upper() - - -def convert_manta_inversions(args, reader, writer, inv_bnds): - for record in reader: - if record.ID[0] in inv_bnds: - continue # skip mate record - _, mate_pos, inv_type = analyze_inv(record) - if inv_type != InversionType.NONE: - if inv_type == InversionType.INV5: - # Adjust POS for 5' inversion - record.POS -= 1 - mate_pos -= 1 - record.REF = samtools_faidx( - args.reference_fasta, record.CHROM, record.POS, record.POS - ) - - id_suffix = record.ID[0].split("MantaBND")[1] - idx = id_suffix.rfind(":") - record.ID = ["MantaINV%s" % id_suffix[:idx]] - - record.ALT = [vcfpy.SymbolicAllele("INV")] - - record.INFO["END"] = mate_pos - record.INFO["SVTYPE"] = "INV" - record.INFO["SVLEN"] = [mate_pos - record.POS] - if record.INFO.get("IMPRECISE"): - mate_id = record.INFO["MATEID"][0] - record.INFO["CIEND"] = inv_bnds[mate_id].INFO["CIPOS"] - elif record.INFO.get("HOMLEN"): - record.INFO["CIEND"] = [-record.INFO["HOMLEN"][0], 0] - - if inv_type == InversionType.INV5 and "HOMSEQ" in record.INFO: - cipos = record.INFO["CIPOS"] - hom_seq_start = record.POS + cipos[0] + 1 - hom_seq_end = record.POS + cipos[1] - record.INFO["HOMSEQ"] = [ - samtools_faidx(args.reference_fasta, record.CHROM, hom_seq_start, hom_seq_end) - ] - - # remove BND-specific info - for key in ("MATEID", "BND_DEPTH", "MATE_BND_DEPTH"): - if key in record.INFO: - record.INFO.pop(key) - - # update INFO/EVENT - if "EVENT" in record.INFO: - eid_suffix = record.INFO["EVENT"].split("MantaBND")[1] - idx = eid_suffix.rfind(":") - record.INFO["EVENT"] = "MantaINV%s" % eid_suffix[:idx] - - # set INV3/INV5 tags - if inv_type == InversionType.INV3: - record.INFO["INV3"] = True - elif inv_type == InversionType.INV5: - record.INFO["INV5"] = True - - record.INFO = dict(sorted(record.INFO.items())) - - writer.write_record(record) - - -def run_manta(args, reader): - """Run MANTA conversion.""" - # First, collect all inversion BND records. - with vcfpy.Reader.from_path(args.input_vcf) as reader2: - inv_bnds = collect_inv_bnds(reader2) - # Go through file a second time and convert inversions. - header = reader.header.copy() - header.add_info_line( - { - "ID": "INV3", - "Number": "0", - "Type": "Flag", - "Description": "Inversion breakends open 3' of reported location", - } - ) - header.add_info_line( - { - "ID": "INV5", - "Number": "0", - "Type": "Flag", - "Description": "Inversion breakends open 5' of reported location", - } - ) - header.add_line( - vcfpy.AltAlleleHeaderLine.from_mapping( - { - "ID": "INV", - "Description": "Inversion", - } - ) - ) - with vcfpy.Writer.from_path(args.output_vcf, header) as writer: - convert_manta_inversions(args, reader, writer, inv_bnds) - - -def run(args): - """Main entry point after parsing command line. - - Non-MANTA VCF files are handled directly by copying. MANTA files are processed with - run_manta(). - """ - with vcfpy.Reader.from_path(args.input_vcf) as reader: - if not looks_like_manta(reader.header): # simply copy - with vcfpy.Writer.from_path(args.output_vcf, reader.header) as writer: - for record in reader: - writer.write_record(record) - else: - run_manta(args, reader) - - -def main(argv=None): - parser = argparse.ArgumentParser() - parser.add_argument("--reference-fasta", required=True, help="Reference FASTA file.") - parser.add_argument("--input-vcf", required=True, help="Input VCF file.") - parser.add_argument("--output-vcf", required=True, help="Output VCF file.") - - args = parser.parse_args(argv) - return run(args) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/snappy_wrappers/wrappers/varfish_annotator/annotate_svs/fix_manta_invs.py b/snappy_wrappers/wrappers/varfish_annotator/annotate_svs/fix_manta_invs.py new file mode 120000 index 000000000..0eefaaa27 --- /dev/null +++ b/snappy_wrappers/wrappers/varfish_annotator/annotate_svs/fix_manta_invs.py @@ -0,0 +1 @@ +../../mehari/annotate_strucvars/fix_manta_invs.py \ No newline at end of file From 92184c67823cb45cf2deaf297f093194ca01b858 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Tue, 2 May 2023 12:02:20 +0200 Subject: [PATCH 2/7] use mehari in varfish_export --- .../workflows/varfish_export/Snakefile | 70 +++++++++---------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/snappy_pipeline/workflows/varfish_export/Snakefile b/snappy_pipeline/workflows/varfish_export/Snakefile index e8deaef8f..463ba6fe8 100644 --- a/snappy_pipeline/workflows/varfish_export/Snakefile +++ b/snappy_pipeline/workflows/varfish_export/Snakefile @@ -45,67 +45,67 @@ rule varfish_export_write_pedigree_run: wf.substep_dispatch("write_pedigree", "run", wildcards, output) -# Run varfish-annotator-cli annotate ------------------------------------------ +# Run varfish-annotator-cli annotate-seqvars ----------------------------------- -rule varfish_export_varfish_annotator_annotate: +rule varfish_export_mehari_annotate_seqvars: input: - unpack(wf.get_input_files("varfish_annotator", "annotate")), + unpack(wf.get_input_files("mehari", "annotate_seqvars")), output: - **wf.get_output_files("varfish_annotator", "annotate"), - threads: wf.get_resource("varfish_annotator", "annotate", "threads") + **wf.get_output_files("mehari", "annotate_seqvars"), + threads: wf.get_resource("mehari", "annotate_seqvars", "threads") resources: - time=wf.get_resource("varfish_annotator", "annotate", "time"), - memory=wf.get_resource("varfish_annotator", "annotate", "memory"), - partition=wf.get_resource("varfish_annotator", "annotate", "partition"), - tmpdir=wf.get_resource("varfish_annotator", "annotate", "tmpdir"), + time=wf.get_resource("mehari", "annotate_seqvars", "time"), + memory=wf.get_resource("mehari", "annotate_seqvars", "memory"), + partition=wf.get_resource("mehari", "annotate_seqvars", "partition"), + tmpdir=wf.get_resource("mehari", "annotate_seqvars", "tmpdir"), log: - **wf.get_log_file("varfish_annotator", "annotate"), + **wf.get_log_file("mehari", "annotate_seqvars"), params: - **{"args": wf.get_params("varfish_annotator", "annotate")}, + **{"args": wf.get_params("mehari", "annotate_seqvars")}, wrapper: - wf.wrapper_path("varfish_annotator/annotate") + wf.wrapper_path("mehari/annotate_seqvars") -# Run varfish-annotator-cli annotate-svs --------------------------------------- +# Run varfish-annotator-cli annotate-strucvars --------------------------------- -rule varfish_export_varfish_annotator_annotate_svs: +rule varfish_export_mehari_annotate_strucvars: input: - unpack(wf.get_input_files("varfish_annotator", "annotate_svs")), + unpack(wf.get_input_files("mehari", "annotate_strucvars")), output: - **wf.get_output_files("varfish_annotator", "annotate_svs"), - threads: wf.get_resource("varfish_annotator", "annotate_svs", "threads") + **wf.get_output_files("mehari", "annotate_strucvars"), + threads: wf.get_resource("mehari", "annotate_strucvars", "threads") resources: - time=wf.get_resource("varfish_annotator", "annotate_svs", "time"), - memory=wf.get_resource("varfish_annotator", "annotate_svs", "memory"), - partition=wf.get_resource("varfish_annotator", "annotate_svs", "partition"), - tmpdir=wf.get_resource("varfish_annotator", "annotate_svs", "tmpdir"), + time=wf.get_resource("mehari", "annotate_strucvars", "time"), + memory=wf.get_resource("mehari", "annotate_strucvars", "memory"), + partition=wf.get_resource("mehari", "annotate_strucvars", "partition"), + tmpdir=wf.get_resource("mehari", "annotate_strucvars", "tmpdir"), log: - **wf.get_log_file("varfish_annotator", "annotate_svs"), + **wf.get_log_file("mehari", "annotate_strucvars"), params: - **{"args": wf.get_params("varfish_annotator", "annotate_svs")}, + **{"args": wf.get_params("mehari", "annotate_strucvars")}, wrapper: - wf.wrapper_path("varfish_annotator/annotate_svs") + wf.wrapper_path("mehari/annotate_strucvars") # Gather statistics about the alignment --------------------------------------- -rule varfish_export_varfish_annotator_bam_qc: +rule varfish_export_mehari_bam_qc: input: - unpack(wf.get_input_files("varfish_annotator", "bam_qc")), + unpack(wf.get_input_files("mehari", "bam_qc")), output: - **wf.get_output_files("varfish_annotator", "bam_qc"), - threads: wf.get_resource("varfish_annotator", "bam_qc", "threads") + **wf.get_output_files("mehari", "bam_qc"), + threads: wf.get_resource("mehari", "bam_qc", "threads") resources: - time=wf.get_resource("varfish_annotator", "bam_qc", "time"), - memory=wf.get_resource("varfish_annotator", "bam_qc", "memory"), - partition=wf.get_resource("varfish_annotator", "bam_qc", "partition"), - tmpdir=wf.get_resource("varfish_annotator", "bam_qc", "tmpdir"), + time=wf.get_resource("mehari", "bam_qc", "time"), + memory=wf.get_resource("mehari", "bam_qc", "memory"), + partition=wf.get_resource("mehari", "bam_qc", "partition"), + tmpdir=wf.get_resource("mehari", "bam_qc", "tmpdir"), log: - **wf.get_log_file("varfish_annotator", "bam_qc"), + **wf.get_log_file("mehari", "bam_qc"), params: - **{"args": wf.get_params("varfish_annotator", "bam_qc")}, + **{"args": wf.get_params("mehari", "bam_qc")}, wrapper: - wf.wrapper_path("varfish_annotator/bam_qc") + wf.wrapper_path("mehari/bam_qc") From 84620a03d509864acd5f37df2c85e948e406a88a Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Tue, 2 May 2023 12:08:06 +0200 Subject: [PATCH 3/7] renaming --- .../workflows/varfish_export/__init__.py | 25 ++++++++----------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/snappy_pipeline/workflows/varfish_export/__init__.py b/snappy_pipeline/workflows/varfish_export/__init__.py index dde735975..2e44e582c 100644 --- a/snappy_pipeline/workflows/varfish_export/__init__.py +++ b/snappy_pipeline/workflows/varfish_export/__init__.py @@ -126,17 +126,14 @@ """ -class VarfishAnnotatorAnnotateStepPart(VariantCallingGetLogFileMixin, BaseStepPart): - """This step part is responsible for annotating the variants with VarFish Annotator""" +class MehariStepPart(VariantCallingGetLogFileMixin, BaseStepPart): + """This step part is responsible for annotating the variants with Mehari""" - name = "varfish_annotator" + name = "mehari" actions = ("annotate", "annotate_svs", "bam_qc") def __init__(self, parent): super().__init__(parent) - self.base_path_out = ( - "work/{mapper}.{var_caller}.varfish_annotated.{index_ngs_library}/out/.done" - ) # Build shortcut from index library name to pedigree self.index_ngs_library_to_pedigree = {} for sheet in self.parent.shortcut_sheets: @@ -155,7 +152,7 @@ def get_log_file(self, action: str) -> SnakemakeDictItemsGenerator: self._validate_action(action) prefix = ( "work/{mapper}.varfish_export.{index_ngs_library}/log/" - f"{{mapper}}.varfish_annotator_{action}.{{index_ngs_library}}" + f"{{mapper}}.mehari_{action}.{{index_ngs_library}}" ) key_ext = ( ("wrapper", ".wrapper.py"), @@ -271,7 +268,7 @@ def _get_output_files_annotate(self): # Generate paths in "work/" directory prefix = ( "work/{mapper}.varfish_export.{index_ngs_library}/out/" - "{mapper}.varfish_annotator_annotate.{index_ngs_library}" + "{mapper}.mehari_annotate.{index_ngs_library}" ) work_paths = { # annotate will write out PED file "ped": f"{prefix}.ped", @@ -386,7 +383,7 @@ def _get_input_files_annotate_svs(self, wildcards): def _get_output_files_annotate_svs(self): prefix = ( "work/{mapper}.varfish_export.{index_ngs_library}/out/" - "{mapper}.varfish_annotator_annotate_svs.{index_ngs_library}" + "{mapper}.mehari_annotate_svs.{index_ngs_library}" ) work_paths = { "gts": f"{prefix}.gts.tsv.gz", @@ -435,7 +432,7 @@ def _get_input_files_bam_qc(self, wildcards): def _get_output_files_bam_qc(self) -> SnakemakeDictItemsGenerator: prefix = ( "work/{mapper}.varfish_export.{index_ngs_library}/out/" - "{mapper}.varfish_annotator_bam_qc.{index_ngs_library}" + "{mapper}.mehari_bam_qc.{index_ngs_library}" ) work_paths = { "bam_qc": f"{prefix}.bam-qc.tsv.gz", @@ -498,9 +495,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ) # Register sub step classes so the sub steps are available - self.register_sub_step_classes( - (WritePedigreeStepPart, VarfishAnnotatorAnnotateStepPart, LinkOutStepPart) - ) + self.register_sub_step_classes((WritePedigreeStepPart, MehariStepPart, LinkOutStepPart)) # Register sub workflows self.register_sub_workflow("variant_calling", self.config["path_variant_calling"]) @@ -560,8 +555,8 @@ def get_result_files(self): We will process all primary DNA libraries and perform joint calling within pedigrees """ - for action in self.sub_steps["varfish_annotator"].actions: - yield from self.sub_steps["varfish_annotator"].get_result_files(action) + for action in self.sub_steps["mehari"].actions: + yield from self.sub_steps["mehari"].get_result_files(action) def check_config(self): self.ensure_w_config( From e95ebbd9aef398e7642ea286d7c3f4b1c001dc6d Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Tue, 2 May 2023 12:13:28 +0200 Subject: [PATCH 4/7] tests --- .../workflows/varfish_export/__init__.py | 41 ++--- .../test_workflows_varfish_export.py | 144 +++++++++--------- 2 files changed, 95 insertions(+), 90 deletions(-) diff --git a/snappy_pipeline/workflows/varfish_export/__init__.py b/snappy_pipeline/workflows/varfish_export/__init__.py index 2e44e582c..b48e0a239 100644 --- a/snappy_pipeline/workflows/varfish_export/__init__.py +++ b/snappy_pipeline/workflows/varfish_export/__init__.py @@ -130,7 +130,7 @@ class MehariStepPart(VariantCallingGetLogFileMixin, BaseStepPart): """This step part is responsible for annotating the variants with Mehari""" name = "mehari" - actions = ("annotate", "annotate_svs", "bam_qc") + actions = ("annotate_seqvars", "annotate_strucvars", "bam_qc") def __init__(self, parent): super().__init__(parent) @@ -180,16 +180,16 @@ def get_resource_usage(self, action: str) -> ResourceUsage: @listify def get_result_files(self, action): # Generate templates to the output paths from action's result files. - if action == "annotate": - raw_path_tpls = self._get_output_files_annotate().values() - elif action == "annotate_svs": - # Only annotate SVs if path to step for calling them is configured. + if action == "annotate_seqvars": + raw_path_tpls = self._get_output_files_annotate_seqvars().values() + elif action == "annotate_strucvars": + # Only annotate_seqvars SVs if path to step for calling them is configured. if ( not self.parent.config["path_sv_calling_targeted"] and not self.parent.config["path_sv_calling_wgs"] ): return - raw_path_tpls = self._get_output_files_annotate_svs().values() + raw_path_tpls = self._get_output_files_annotate_strucvars().values() elif action == "bam_qc": raw_path_tpls = self._get_output_files_bam_qc().values() # Filter the templates to the paths in the output directory. @@ -198,7 +198,8 @@ def get_result_files(self, action): # Create concrete paths for all pedigrees in the sample sheet. index_ngs_libraries = self._get_index_ngs_libraries( require_consistent_pedigree_kits=( - bool(self.parent.config["path_sv_calling_targeted"]) and (action == "annotate_svs") + bool(self.parent.config["path_sv_calling_targeted"]) + and (action == "annotate_strucvars") ) ) kwargs = { @@ -242,7 +243,7 @@ def _is_pedigree_good(self, pedigree: Pedigree) -> bool: return not msg @dictify - def _get_input_files_annotate(self, wildcards): + def _get_input_files_annotate_seqvars(self, wildcards): yield "ped", "work/write_pedigree.{index_ngs_library}/out/{index_ngs_library}.ped" variant_calling = self.parent.sub_workflows["variant_calling"] @@ -264,13 +265,13 @@ def _get_input_files_annotate(self, wildcards): yield "vcf", vcfs @dictify - def _get_output_files_annotate(self): + def _get_output_files_annotate_seqvars(self): # Generate paths in "work/" directory prefix = ( "work/{mapper}.varfish_export.{index_ngs_library}/out/" - "{mapper}.mehari_annotate.{index_ngs_library}" + "{mapper}.mehari_annotate_seqvars.{index_ngs_library}" ) - work_paths = { # annotate will write out PED file + work_paths = { # annotate_seqvars will write out PED file "ped": f"{prefix}.ped", "ped_md5": f"{prefix}.ped.md5", "gts": f"{prefix}.gts.tsv.gz", @@ -282,10 +283,12 @@ def _get_output_files_annotate(self): # Generate paths in "output/" directory yield "output_links", [ re.sub(r"^work/", "output/", work_path) - for work_path in chain(work_paths.values(), self.get_log_file("annotate").values()) + for work_path in chain( + work_paths.values(), self.get_log_file("annotate_seqvars").values() + ) ] - def _get_params_annotate(self, wildcards: Wildcards) -> typing.Dict[str, typing.Any]: + def _get_params_annotate_seqvars(self, wildcards: Wildcards) -> typing.Dict[str, typing.Any]: pedigree = self.index_ngs_library_to_pedigree[wildcards.index_ngs_library] for donor in pedigree.donors: if ( @@ -296,7 +299,7 @@ def _get_params_annotate(self, wildcards: Wildcards) -> typing.Dict[str, typing. return {"step_name": "varfish_export"} @dictify - def _get_input_files_annotate_svs(self, wildcards): + def _get_input_files_annotate_strucvars(self, wildcards): yield "ped", "work/write_pedigree.{index_ngs_library}/out/{index_ngs_library}.ped" if self.parent.config["path_sv_calling_targeted"]: @@ -380,10 +383,10 @@ def _get_input_files_annotate_svs(self, wildcards): yield "vcf_cov", cov_vcfs @dictify - def _get_output_files_annotate_svs(self): + def _get_output_files_annotate_strucvars(self): prefix = ( "work/{mapper}.varfish_export.{index_ngs_library}/out/" - "{mapper}.mehari_annotate_svs.{index_ngs_library}" + "{mapper}.mehari_annotate_strucvars.{index_ngs_library}" ) work_paths = { "gts": f"{prefix}.gts.tsv.gz", @@ -397,11 +400,13 @@ def _get_output_files_annotate_svs(self): # Generate paths in "output/" directory yield "output_links", [ re.sub(r"^work/", "output/", work_path) - for work_path in chain(work_paths.values(), self.get_log_file("annotate_svs").values()) + for work_path in chain( + work_paths.values(), self.get_log_file("annotate_strucvars").values() + ) ] #: Alias the get params function. - _get_params_annotate_svs = _get_params_annotate + _get_params_annotate_strucvars = _get_params_annotate_seqvars @dictify def _get_input_files_bam_qc(self, wildcards): diff --git a/tests/snappy_pipeline/workflows/test_workflows_varfish_export.py b/tests/snappy_pipeline/workflows/test_workflows_varfish_export.py index 6595c69f5..02b2f4141 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_varfish_export.py +++ b/tests/snappy_pipeline/workflows/test_workflows_varfish_export.py @@ -88,11 +88,11 @@ def varfish_export_workflow( ) -# Tests for VarfishAnnotatorAnnotateStepPart ------------------------------------------------------- +# Tests for MehariAnnotateStepPart ------------------------------------------------------- -def test_varfish_annotator_step_part_get_input_files_annotate(varfish_export_workflow): - """Tests VarfishAnnotatorAnnotateStepPart._get_input_files_annotate()""" +def test_mehari_step_part_get_input_files_annotate(varfish_export_workflow): + """Tests MehariAnnotateStepPart._get_input_files_annotate()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", @@ -109,12 +109,12 @@ def test_varfish_annotator_step_part_get_input_files_annotate(varfish_export_wor "vcf": [base_name + ".vcf.gz"], } # Get actual - actual = varfish_export_workflow.get_input_files("varfish_annotator", "annotate")(wildcards) + actual = varfish_export_workflow.get_input_files("mehari", "annotate_seqvars")(wildcards) assert actual == expected -def test_varfish_annotator_step_part_get_input_files_bam_qc(varfish_export_workflow): - """Tests VarfishAnnotatorAnnotateStepPart._get_input_files_bam_qc()""" +def test_mehari_step_part_get_input_files_bam_qc(varfish_export_workflow): + """Tests MehariAnnotateStepPart._get_input_files_bam_qc()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", @@ -137,16 +137,16 @@ def test_varfish_annotator_step_part_get_input_files_bam_qc(varfish_export_workf "cov_qc": [base_name_cov.format(i=i) for i in donor_indices], } # Get actual - actual = varfish_export_workflow.get_input_files("varfish_annotator", "bam_qc")(wildcards) + actual = varfish_export_workflow.get_input_files("mehari", "bam_qc")(wildcards) assert actual == expected -def test_varfish_annotator_step_part_get_output_files_annotate(varfish_export_workflow): - """Tests VarfishAnnotatorAnnotateStepPart._get_output_files_annotate()""" +def test_mehari_step_part_get_output_files_annotate(varfish_export_workflow): + """Tests MehariAnnotateStepPart._get_output_files_annotate()""" # Define expected base_name_out = ( "work/{mapper}.varfish_export.{index_ngs_library}/out/" - "{mapper}.varfish_annotator_annotate.{index_ngs_library}" + "{mapper}.mehari_annotate_seqvars.{index_ngs_library}" ) expected = { "gts": base_name_out + ".gts.tsv.gz", @@ -156,69 +156,69 @@ def test_varfish_annotator_step_part_get_output_files_annotate(varfish_export_wo "ped": base_name_out + ".ped", "ped_md5": base_name_out + ".ped.md5", "output_links": [ - "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.varfish_annotator_annotate.{index_ngs_library}.ped", - "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.varfish_annotator_annotate.{index_ngs_library}.ped.md5", - "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.varfish_annotator_annotate.{index_ngs_library}.gts.tsv.gz", - "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.varfish_annotator_annotate.{index_ngs_library}.gts.tsv.gz.md5", - "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.varfish_annotator_annotate.{index_ngs_library}.db-infos.tsv.gz", - "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.varfish_annotator_annotate.{index_ngs_library}.db-infos.tsv.gz.md5", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_annotate.{index_ngs_library}.wrapper.py", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_annotate.{index_ngs_library}.wrapper.py.md5", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_annotate.{index_ngs_library}.log", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_annotate.{index_ngs_library}.log.md5", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_annotate.{index_ngs_library}.conda_info.txt", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_annotate.{index_ngs_library}.conda_info.txt.md5", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_annotate.{index_ngs_library}.conda_list.txt", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_annotate.{index_ngs_library}.conda_list.txt.md5", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_annotate.{index_ngs_library}.environment.yaml", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_annotate.{index_ngs_library}.environment.yaml.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.ped", + "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.ped.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.gts.tsv.gz", + "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.gts.tsv.gz.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.db-infos.tsv.gz", + "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.db-infos.tsv.gz.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.wrapper.py", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.wrapper.py.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.log", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.log.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.conda_info.txt", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.conda_info.txt.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.conda_list.txt", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.conda_list.txt.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.environment.yaml", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.environment.yaml.md5", ], } # Get actual - actual = varfish_export_workflow.get_output_files("varfish_annotator", "annotate") + actual = varfish_export_workflow.get_output_files("mehari", "annotate_seqvars") assert actual == expected -def test_varfish_annotator_step_part_get_output_files_bam_qc(varfish_export_workflow): - """Tests VarfishAnnotatorAnnotateStepPart._get_output_files_bam_qc()""" +def test_mehari_step_part_get_output_files_bam_qc(varfish_export_workflow): + """Tests MehariAnnotateStepPart._get_output_files_bam_qc()""" # Define expected base_name_out = ( "work/{mapper}.varfish_export.{index_ngs_library}/out/" - "{mapper}.varfish_annotator_bam_qc.{index_ngs_library}" + "{mapper}.mehari_bam_qc.{index_ngs_library}" ) expected = { "bam_qc": base_name_out + ".bam-qc.tsv.gz", "bam_qc_md5": base_name_out + ".bam-qc.tsv.gz.md5", "output_links": [ - "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.varfish_annotator_bam_qc.{index_ngs_library}.bam-qc.tsv.gz", - "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.varfish_annotator_bam_qc.{index_ngs_library}.bam-qc.tsv.gz.md5", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_bam_qc.{index_ngs_library}.wrapper.py", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_bam_qc.{index_ngs_library}.wrapper.py.md5", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_bam_qc.{index_ngs_library}.log", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_bam_qc.{index_ngs_library}.log.md5", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_bam_qc.{index_ngs_library}.conda_info.txt", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_bam_qc.{index_ngs_library}.conda_info.txt.md5", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_bam_qc.{index_ngs_library}.conda_list.txt", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_bam_qc.{index_ngs_library}.conda_list.txt.md5", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_bam_qc.{index_ngs_library}.environment.yaml", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_bam_qc.{index_ngs_library}.environment.yaml.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.mehari_bam_qc.{index_ngs_library}.bam-qc.tsv.gz", + "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.mehari_bam_qc.{index_ngs_library}.bam-qc.tsv.gz.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_bam_qc.{index_ngs_library}.wrapper.py", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_bam_qc.{index_ngs_library}.wrapper.py.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_bam_qc.{index_ngs_library}.log", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_bam_qc.{index_ngs_library}.log.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_bam_qc.{index_ngs_library}.conda_info.txt", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_bam_qc.{index_ngs_library}.conda_info.txt.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_bam_qc.{index_ngs_library}.conda_list.txt", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_bam_qc.{index_ngs_library}.conda_list.txt.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_bam_qc.{index_ngs_library}.environment.yaml", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_bam_qc.{index_ngs_library}.environment.yaml.md5", ], } # Get actual - actual = varfish_export_workflow.get_output_files("varfish_annotator", "bam_qc") + actual = varfish_export_workflow.get_output_files("mehari", "bam_qc") assert actual == expected -def test_varfish_annotator_step_part_get_log_file_annotate(varfish_export_workflow): - """Tests VarfishAnnotatorAnnotateStepPart._get_log_file()_annotate""" +def test_mehari_step_part_get_log_file_annotate(varfish_export_workflow): + """Tests MehariAnnotateStepPart._get_log_file()_annotate""" # Define expected base_name_log = ( "work/{mapper}.varfish_export.{index_ngs_library}/log/" - "{mapper}.varfish_annotator_annotate.{index_ngs_library}" + "{mapper}.mehari_annotate_seqvars.{index_ngs_library}" ) base_name_wrapper = ( "work/{mapper}.varfish_export.{index_ngs_library}/log/" - "{mapper}.varfish_annotator_annotate.{index_ngs_library}" + "{mapper}.mehari_annotate_seqvars.{index_ngs_library}" ) wrapper_dict = { "wrapper": base_name_wrapper + ".wrapper.py", @@ -229,20 +229,20 @@ def test_varfish_annotator_step_part_get_log_file_annotate(varfish_export_workfl log_dict = get_expected_log_files_dict(base_out=base_name_log) expected = {**wrapper_dict, **log_dict} # Get actual - actual = varfish_export_workflow.get_log_file("varfish_annotator", "annotate") + actual = varfish_export_workflow.get_log_file("mehari", "annotate_seqvars") assert actual == expected -def test_varfish_annotator_step_part_get_log_file_bam_qc(varfish_export_workflow): - """Tests VarfishAnnotatorAnnotateStepPart._get_log_file()_bam_qc""" +def test_mehari_step_part_get_log_file_bam_qc(varfish_export_workflow): + """Tests MehariAnnotateStepPart._get_log_file()_bam_qc""" # Define expected base_name_log = ( "work/{mapper}.varfish_export.{index_ngs_library}/log/" - "{mapper}.varfish_annotator_bam_qc.{index_ngs_library}" + "{mapper}.mehari_bam_qc.{index_ngs_library}" ) base_name_wrapper = ( "work/{mapper}.varfish_export.{index_ngs_library}/log/" - "{mapper}.varfish_annotator_bam_qc.{index_ngs_library}" + "{mapper}.mehari_bam_qc.{index_ngs_library}" ) wrapper_dict = { "wrapper": base_name_wrapper + ".wrapper.py", @@ -253,40 +253,40 @@ def test_varfish_annotator_step_part_get_log_file_bam_qc(varfish_export_workflow log_dict = get_expected_log_files_dict(base_out=base_name_log) expected = {**wrapper_dict, **log_dict} # Get actual - actual = varfish_export_workflow.get_log_file("varfish_annotator", "bam_qc") + actual = varfish_export_workflow.get_log_file("mehari", "bam_qc") assert actual == expected -def test_varfish_annotator_step_part_get_params_annotate(varfish_export_workflow): - """Tests VarfishAnnotatorAnnotateStepPart._get_params_annotate()""" +def test_mehari_step_part_get_params_annotate(varfish_export_workflow): + """Tests MehariAnnotateStepPart._get_params_annotate()""" wildcards = Wildcards(fromdict={"index_ngs_library": "P001-N1-DNA1-WGS1"}) expected = {"step_name": "varfish_export"} - actual = varfish_export_workflow.get_params("varfish_annotator", "annotate")(wildcards) + actual = varfish_export_workflow.get_params("mehari", "annotate_seqvars")(wildcards) assert actual == expected -def test_varfish_annotator_step_part_get_params_bam_qc(varfish_export_workflow): - """Tests VarfishAnnotatorAnnotateStepPart._get_params_bam_qc()""" +def test_mehari_step_part_get_params_bam_qc(varfish_export_workflow): + """Tests MehariAnnotateStepPart._get_params_bam_qc()""" wildcards = Wildcards(fromdict={"index_ngs_library": "P001-N1-DNA1-WGS1"}) expected = { "P001-N1-DNA1-WGS1": "P001-N1-DNA1-WGS1", "P002-N1-DNA1-WGS1": "P002-N1-DNA1-WGS1", "P003-N1-DNA1-WGS1": "P003-N1-DNA1-WGS1", } - actual = varfish_export_workflow.get_params("varfish_annotator", "bam_qc")(wildcards) + actual = varfish_export_workflow.get_params("mehari", "bam_qc")(wildcards) assert actual == expected -def test_varfish_annotator_step_part_get_resource_usage(varfish_export_workflow): - """Tests VarfishAnnotatorAnnotateStepPart.get_resource_usage()""" - all_actions = varfish_export_workflow.substep_getattr("varfish_annotator", "actions") +def test_mehari_step_part_get_resource_usage(varfish_export_workflow): + """Tests MehariAnnotateStepPart.get_resource_usage()""" + all_actions = varfish_export_workflow.substep_getattr("mehari", "actions") # Define expected expected_dict = {"threads": 2, "time": "1-00:00:00", "memory": "14G", "partition": "medium"} # Evaluate for action in all_actions: for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' in action '{action}'." - actual = varfish_export_workflow.get_resource("varfish_annotator", action, resource) + actual = varfish_export_workflow.get_resource("mehari", action, resource) assert actual == expected, msg_error @@ -296,28 +296,28 @@ def test_varfish_annotator_step_part_get_resource_usage(varfish_export_workflow) def test_varfish_export_workflow(varfish_export_workflow): """Test simple functionality of the workflow""" # Check created sub steps - expected = ["link_out", "varfish_annotator", "write_pedigree"] + expected = ["link_out", "mehari", "write_pedigree"] actual = list(sorted(varfish_export_workflow.sub_steps.keys())) assert actual == expected # Check result file construction tpl = ( "output/bwa.varfish_export.P00{i}-N1-DNA1-WGS1/{dir_}/" - "bwa.varfish_annotator_{action}.P00{i}-N1-DNA1-WGS1.{ext}" + "bwa.mehari_{action}.P00{i}-N1-DNA1-WGS1.{ext}" ) # Files in `out` directory expected = [ tpl.format(dir_="out", i=i, ext=ext, action=action) for i in (1, 4) # only for indices for (action, ext) in ( - ("annotate", "gts.tsv.gz"), - ("annotate", "gts.tsv.gz.md5"), - ("annotate", "db-infos.tsv.gz"), - ("annotate", "db-infos.tsv.gz.md5"), + ("annotate_seqvars", "gts.tsv.gz"), + ("annotate_seqvars", "gts.tsv.gz.md5"), + ("annotate_seqvars", "db-infos.tsv.gz"), + ("annotate_seqvars", "db-infos.tsv.gz.md5"), ("bam_qc", "bam-qc.tsv.gz"), ("bam_qc", "bam-qc.tsv.gz.md5"), - ("annotate", "ped"), - ("annotate", "ped.md5"), + ("annotate_seqvars", "ped"), + ("annotate_seqvars", "ped.md5"), ) ] # Files in `log` directory @@ -336,7 +336,7 @@ def test_varfish_export_workflow(varfish_export_workflow): "environment.yaml", "environment.yaml.md5", ) - for action in ("annotate", "bam_qc") + for action in ("annotate_seqvars", "bam_qc") ] expected = sorted(expected) actual = sorted(varfish_export_workflow.get_result_files()) From 0bf5b197eacad9f0a81578f48eb914a8b7b0356c Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Tue, 2 May 2023 13:26:48 +0200 Subject: [PATCH 5/7] fix annotation --- .../mehari/annotate_seqvars/wrapper.py | 2 +- .../mehari/annotate_strucvars/wrapper.py | 37 +++++-------------- 2 files changed, 10 insertions(+), 29 deletions(-) diff --git a/snappy_wrappers/wrappers/mehari/annotate_seqvars/wrapper.py b/snappy_wrappers/wrappers/mehari/annotate_seqvars/wrapper.py index b08d61c4c..9400d560e 100644 --- a/snappy_wrappers/wrappers/mehari/annotate_seqvars/wrapper.py +++ b/snappy_wrappers/wrappers/mehari/annotate_seqvars/wrapper.py @@ -77,7 +77,7 @@ release=$(echo {export_config[release]} | tr '[:upper:]' '[:lower:]') -# Execute VarFish Annotator +# Perform Mehari sequence variant annotation. mehari \ annotate \ seqvars \ diff --git a/snappy_wrappers/wrappers/mehari/annotate_strucvars/wrapper.py b/snappy_wrappers/wrappers/mehari/annotate_strucvars/wrapper.py index 0565841fb..2c1f789a8 100644 --- a/snappy_wrappers/wrappers/mehari/annotate_strucvars/wrapper.py +++ b/snappy_wrappers/wrappers/mehari/annotate_strucvars/wrapper.py @@ -92,35 +92,16 @@ tabix -s1 -b2 -e2 -f $TMPDIR/final_for_import.$num.vcf.gz done -# Compatibility mode with currently deployed VarFish Server -compatibility_option="--opt-out callers-array" - -# Execute VarFish Annotator -varfish-annotator \ - annotate-svs \ - -XX:MaxHeapSize=10g \ - -XX:+UseG1GC \ - \ - --release {export_config[release]} \ - \ - $(if [[ "{coverage_vcf}" != "" ]]; then \ - for path in {coverage_vcf}; do \ - echo --coverage-vcf $path; \ - done; \ - fi) \ - \ - --db-path {export_config[path_db]} \ - --refseq-ser-path {export_config[path_refseq_ser]} \ - --ensembl-ser-path {export_config[path_ensembl_ser]} \ - --input-ped {snakemake.input.ped} \ - \ - $(for vcf in $TMPDIR/final_for_import.*.vcf.gz; do \ - echo --input-vcf $vcf; \ +# Perform Mehari structural variant annotation. +mehari \ + annotate \ + strucvars \ + --path-db {export_config[path_mehari_db]} \ + --path-input-ped {snakemake.input.ped} \ + $(for p in $TMPDIR/final_for_import.*.vcf.gz; do \ + echo --path-input-vcf $p; \ done) \ - --output-db-info {snakemake.output.db_infos} \ - --output-gts {snakemake.output.gts} \ - --output-feature-effects {snakemake.output.feature_effects} \ - $compatibility_option + --path-output-tsv >(gzip -c > {snakemake.output.gts}) # Compute MD5 sums on output files compute-md5 {snakemake.output.db_infos} {snakemake.output.db_infos_md5} From e6d3b1079a9d68a54d94c0aa5896fc5a56068a70 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Tue, 2 May 2023 14:51:12 +0200 Subject: [PATCH 6/7] wip --- .../wrappers/mehari/annotate_strucvars/wrapper.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/snappy_wrappers/wrappers/mehari/annotate_strucvars/wrapper.py b/snappy_wrappers/wrappers/mehari/annotate_strucvars/wrapper.py index 2c1f789a8..d36788a17 100644 --- a/snappy_wrappers/wrappers/mehari/annotate_strucvars/wrapper.py +++ b/snappy_wrappers/wrappers/mehari/annotate_strucvars/wrapper.py @@ -103,6 +103,16 @@ done) \ --path-output-tsv >(gzip -c > {snakemake.output.gts}) +cat | gzip -c > {snakemake.output.db-infos} < Date: Tue, 2 May 2023 16:18:38 +0200 Subject: [PATCH 7/7] make work in practice --- snappy_pipeline/workflows/varfish_export/__init__.py | 2 -- .../wrappers/mehari/annotate_seqvars/wrapper.py | 7 ++----- .../wrappers/mehari/annotate_strucvars/wrapper.py | 6 +----- snappy_wrappers/wrappers/mehari/environment.yaml | 2 +- 4 files changed, 4 insertions(+), 13 deletions(-) diff --git a/snappy_pipeline/workflows/varfish_export/__init__.py b/snappy_pipeline/workflows/varfish_export/__init__.py index b48e0a239..f4c07d0d5 100644 --- a/snappy_pipeline/workflows/varfish_export/__init__.py +++ b/snappy_pipeline/workflows/varfish_export/__init__.py @@ -391,8 +391,6 @@ def _get_output_files_annotate_strucvars(self): work_paths = { "gts": f"{prefix}.gts.tsv.gz", "gts_md5": f"{prefix}.gts.tsv.gz.md5", - "feature_effects": f"{prefix}.feature-effects.tsv.gz", - "feature_effects_md5": f"{prefix}.feature-effects.tsv.gz.md5", "db_infos": f"{prefix}.db-infos.tsv.gz", "db_infos_md5": f"{prefix}.db-infos.tsv.gz.md5", } diff --git a/snappy_wrappers/wrappers/mehari/annotate_seqvars/wrapper.py b/snappy_wrappers/wrappers/mehari/annotate_seqvars/wrapper.py index 9400d560e..dce342909 100644 --- a/snappy_wrappers/wrappers/mehari/annotate_seqvars/wrapper.py +++ b/snappy_wrappers/wrappers/mehari/annotate_seqvars/wrapper.py @@ -75,19 +75,16 @@ ln -sr {snakemake.input.vcf}.tbi $TMPDIR/tmp.vcf.gz.tbi fi -release=$(echo {export_config[release]} | tr '[:upper:]' '[:lower:]') - # Perform Mehari sequence variant annotation. mehari \ annotate \ seqvars \ - --release ${release} \ --path-db {export_config[path_mehari_db]} \ --path-input-ped {snakemake.input.ped} \ --path-input-vcf $TMPDIR/tmp.vcf.gz \ - --path-output-tsv >(gzip -c > {snakemake.output.gts}) + --path-output-tsv {snakemake.output.gts} -cat | gzip -c > {snakemake.output.db-infos} < {snakemake.output.db_infos} genomebuild db_name release GRCh37 clinvar 20210728 GRCh37 gnomad_exomes r2.1.1 diff --git a/snappy_wrappers/wrappers/mehari/annotate_strucvars/wrapper.py b/snappy_wrappers/wrappers/mehari/annotate_strucvars/wrapper.py index d36788a17..7ea72deac 100644 --- a/snappy_wrappers/wrappers/mehari/annotate_strucvars/wrapper.py +++ b/snappy_wrappers/wrappers/mehari/annotate_strucvars/wrapper.py @@ -103,20 +103,16 @@ done) \ --path-output-tsv >(gzip -c > {snakemake.output.gts}) -cat | gzip -c > {snakemake.output.db-infos} < {snakemake.output.db_infos} genomebuild db_name release GRCh37 clinvar 20210728 GRCh37 gnomad_exomes r2.1.1 GRCh37 gnomad_genomes r2.1.1 EOF -# Copy out PED file to output -cp -H {snakemake.input.ped} {snakemake.output.ped} - # Compute MD5 sums on output files compute-md5 {snakemake.output.db_infos} {snakemake.output.db_infos_md5} compute-md5 {snakemake.output.gts} {snakemake.output.gts_md5} -compute-md5 {snakemake.output.feature_effects} {snakemake.output.feature_effects_md5} # Create output links ----------------------------------------------------------------------------- diff --git a/snappy_wrappers/wrappers/mehari/environment.yaml b/snappy_wrappers/wrappers/mehari/environment.yaml index fec4b885e..065eca483 100644 --- a/snappy_wrappers/wrappers/mehari/environment.yaml +++ b/snappy_wrappers/wrappers/mehari/environment.yaml @@ -5,7 +5,7 @@ dependencies: - bcftools >=1.9 - htslib >=1.9 - samtools >=1.9 - - mehari ==0.2.0 + - mehari ==0.2.1 - jq - vcfpy - pysam