diff --git a/snappy_pipeline/workflows/varfish_export/Snakefile b/snappy_pipeline/workflows/varfish_export/Snakefile
index e8deaef8f..463ba6fe8 100644
--- a/snappy_pipeline/workflows/varfish_export/Snakefile
+++ b/snappy_pipeline/workflows/varfish_export/Snakefile
@@ -45,67 +45,67 @@ rule varfish_export_write_pedigree_run:
         wf.substep_dispatch("write_pedigree", "run", wildcards, output)


-# Run varfish-annotator-cli annotate ------------------------------------------
+# Run mehari annotate seqvars ---------------------------------------------------


-rule varfish_export_varfish_annotator_annotate:
+rule varfish_export_mehari_annotate_seqvars:
     input:
-        unpack(wf.get_input_files("varfish_annotator", "annotate")),
+        unpack(wf.get_input_files("mehari", "annotate_seqvars")),
     output:
-        **wf.get_output_files("varfish_annotator", "annotate"),
-    threads: wf.get_resource("varfish_annotator", "annotate", "threads")
+        **wf.get_output_files("mehari", "annotate_seqvars"),
+    threads: wf.get_resource("mehari", "annotate_seqvars", "threads")
     resources:
-        time=wf.get_resource("varfish_annotator", "annotate", "time"),
-        memory=wf.get_resource("varfish_annotator", "annotate", "memory"),
-        partition=wf.get_resource("varfish_annotator", "annotate", "partition"),
-        tmpdir=wf.get_resource("varfish_annotator", "annotate", "tmpdir"),
+        time=wf.get_resource("mehari", "annotate_seqvars", "time"),
+        memory=wf.get_resource("mehari", "annotate_seqvars", "memory"),
+        partition=wf.get_resource("mehari", "annotate_seqvars", "partition"),
+        tmpdir=wf.get_resource("mehari", "annotate_seqvars", "tmpdir"),
     log:
-        **wf.get_log_file("varfish_annotator", "annotate"),
+        **wf.get_log_file("mehari", "annotate_seqvars"),
     params:
-        **{"args": wf.get_params("varfish_annotator", "annotate")},
+        **{"args": wf.get_params("mehari", "annotate_seqvars")},
     wrapper:
-        wf.wrapper_path("varfish_annotator/annotate")
+        wf.wrapper_path("mehari/annotate_seqvars")


-# Run varfish-annotator-cli annotate-svs ---------------------------------------
+# Run mehari annotate strucvars -------------------------------------------------


-rule varfish_export_varfish_annotator_annotate_svs:
+rule varfish_export_mehari_annotate_strucvars:
     input:
-        unpack(wf.get_input_files("varfish_annotator", "annotate_svs")),
+        unpack(wf.get_input_files("mehari", "annotate_strucvars")),
     output:
-        **wf.get_output_files("varfish_annotator", "annotate_svs"),
-    threads: wf.get_resource("varfish_annotator", "annotate_svs", "threads")
+        **wf.get_output_files("mehari", "annotate_strucvars"),
+    threads: wf.get_resource("mehari", "annotate_strucvars", "threads")
     resources:
-        time=wf.get_resource("varfish_annotator", "annotate_svs", "time"),
-        memory=wf.get_resource("varfish_annotator", "annotate_svs", "memory"),
-        partition=wf.get_resource("varfish_annotator", "annotate_svs", "partition"),
-        tmpdir=wf.get_resource("varfish_annotator", "annotate_svs", "tmpdir"),
+        time=wf.get_resource("mehari", "annotate_strucvars", "time"),
+        memory=wf.get_resource("mehari", "annotate_strucvars", "memory"),
+        partition=wf.get_resource("mehari", "annotate_strucvars", "partition"),
+        tmpdir=wf.get_resource("mehari", "annotate_strucvars", "tmpdir"),
     log:
-        **wf.get_log_file("varfish_annotator", "annotate_svs"),
+        **wf.get_log_file("mehari", "annotate_strucvars"),
     params:
-        **{"args": wf.get_params("varfish_annotator", "annotate_svs")},
+        **{"args": wf.get_params("mehari", "annotate_strucvars")},
     wrapper:
-        wf.wrapper_path("varfish_annotator/annotate_svs")
+        wf.wrapper_path("mehari/annotate_strucvars")


# Gather
statistics about the alignment --------------------------------------- -rule varfish_export_varfish_annotator_bam_qc: +rule varfish_export_mehari_bam_qc: input: - unpack(wf.get_input_files("varfish_annotator", "bam_qc")), + unpack(wf.get_input_files("mehari", "bam_qc")), output: - **wf.get_output_files("varfish_annotator", "bam_qc"), - threads: wf.get_resource("varfish_annotator", "bam_qc", "threads") + **wf.get_output_files("mehari", "bam_qc"), + threads: wf.get_resource("mehari", "bam_qc", "threads") resources: - time=wf.get_resource("varfish_annotator", "bam_qc", "time"), - memory=wf.get_resource("varfish_annotator", "bam_qc", "memory"), - partition=wf.get_resource("varfish_annotator", "bam_qc", "partition"), - tmpdir=wf.get_resource("varfish_annotator", "bam_qc", "tmpdir"), + time=wf.get_resource("mehari", "bam_qc", "time"), + memory=wf.get_resource("mehari", "bam_qc", "memory"), + partition=wf.get_resource("mehari", "bam_qc", "partition"), + tmpdir=wf.get_resource("mehari", "bam_qc", "tmpdir"), log: - **wf.get_log_file("varfish_annotator", "bam_qc"), + **wf.get_log_file("mehari", "bam_qc"), params: - **{"args": wf.get_params("varfish_annotator", "bam_qc")}, + **{"args": wf.get_params("mehari", "bam_qc")}, wrapper: - wf.wrapper_path("varfish_annotator/bam_qc") + wf.wrapper_path("mehari/bam_qc") diff --git a/snappy_pipeline/workflows/varfish_export/__init__.py b/snappy_pipeline/workflows/varfish_export/__init__.py index 1012444ea..f4c07d0d5 100644 --- a/snappy_pipeline/workflows/varfish_export/__init__.py +++ b/snappy_pipeline/workflows/varfish_export/__init__.py @@ -121,26 +121,19 @@ release: GRCh37 # REQUIRED: default 'GRCh37' # Path to BED file with exons; used for reducing data to near-exon small variants. path_exon_bed: null # REQUIRED: exon BED file to use - # Path to Jannovar RefSeq ``.ser`` file for annotation - path_refseq_ser: REQUIRED # REQUIRED: path to RefSeq .ser file - # Path to Jannovar ENSEMBL ``.ser`` file for annotation - path_ensembl_ser: REQUIRED # REQUIRED: path to ENSEMBL .ser file - # Path to VarFish annotator database file to use for annotating. - path_db: REQUIRED # REQUIRED: spath to varfish-annotator DB file to use + # Path to mehari database. 
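+    # For example (the value below is a hypothetical local path; use whatever
+    # mehari database directory matches the configured release):
+    #
+    #   path_mehari_db: /data/mehari/db/grch37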
+    path_mehari_db: REQUIRED  # REQUIRED: path to mehari database
 """


-class VarfishAnnotatorAnnotateStepPart(VariantCallingGetLogFileMixin, BaseStepPart):
-    """This step part is responsible for annotating the variants with VarFish Annotator"""
+class MehariStepPart(VariantCallingGetLogFileMixin, BaseStepPart):
+    """This step part is responsible for annotating the variants with Mehari"""

-    name = "varfish_annotator"
-    actions = ("annotate", "annotate_svs", "bam_qc")
+    name = "mehari"
+    actions = ("annotate_seqvars", "annotate_strucvars", "bam_qc")

     def __init__(self, parent):
         super().__init__(parent)
-        self.base_path_out = (
-            "work/{mapper}.{var_caller}.varfish_annotated.{index_ngs_library}/out/.done"
-        )
         # Build shortcut from index library name to pedigree
         self.index_ngs_library_to_pedigree = {}
         for sheet in self.parent.shortcut_sheets:
@@ -159,7 +152,7 @@ def get_log_file(self, action: str) -> SnakemakeDictItemsGenerator:
         self._validate_action(action)
         prefix = (
             "work/{mapper}.varfish_export.{index_ngs_library}/log/"
-            f"{{mapper}}.varfish_annotator_{action}.{{index_ngs_library}}"
+            f"{{mapper}}.mehari_{action}.{{index_ngs_library}}"
         )
         key_ext = (
             ("wrapper", ".wrapper.py"),
@@ -187,16 +180,16 @@ def get_resource_usage(self, action: str) -> ResourceUsage:
     @listify
     def get_result_files(self, action):
         # Generate templates to the output paths from action's result files.
-        if action == "annotate":
-            raw_path_tpls = self._get_output_files_annotate().values()
-        elif action == "annotate_svs":
-            # Only annotate SVs if path to step for calling them is configured.
+        if action == "annotate_seqvars":
+            raw_path_tpls = self._get_output_files_annotate_seqvars().values()
+        elif action == "annotate_strucvars":
+            # Only annotate SVs if the path to the step for calling them is configured.
             if (
                 not self.parent.config["path_sv_calling_targeted"]
                 and not self.parent.config["path_sv_calling_wgs"]
             ):
                 return
-            raw_path_tpls = self._get_output_files_annotate_svs().values()
+            raw_path_tpls = self._get_output_files_annotate_strucvars().values()
         elif action == "bam_qc":
             raw_path_tpls = self._get_output_files_bam_qc().values()
         # Filter the templates to the paths in the output directory.
@@ -205,7 +198,8 @@ def get_result_files(self, action):
         # Create concrete paths for all pedigrees in the sample sheet.
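+        # Note: with targeted SV calling configured, a pedigree is only exported
+        # for annotate_strucvars when all of its libraries use a consistent kit.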
index_ngs_libraries = self._get_index_ngs_libraries( require_consistent_pedigree_kits=( - bool(self.parent.config["path_sv_calling_targeted"]) and (action == "annotate_svs") + bool(self.parent.config["path_sv_calling_targeted"]) + and (action == "annotate_strucvars") ) ) kwargs = { @@ -249,7 +243,7 @@ def _is_pedigree_good(self, pedigree: Pedigree) -> bool: return not msg @dictify - def _get_input_files_annotate(self, wildcards): + def _get_input_files_annotate_seqvars(self, wildcards): yield "ped", "work/write_pedigree.{index_ngs_library}/out/{index_ngs_library}.ped" variant_calling = self.parent.sub_workflows["variant_calling"] @@ -271,13 +265,13 @@ def _get_input_files_annotate(self, wildcards): yield "vcf", vcfs @dictify - def _get_output_files_annotate(self): + def _get_output_files_annotate_seqvars(self): # Generate paths in "work/" directory prefix = ( "work/{mapper}.varfish_export.{index_ngs_library}/out/" - "{mapper}.varfish_annotator_annotate.{index_ngs_library}" + "{mapper}.mehari_annotate_seqvars.{index_ngs_library}" ) - work_paths = { # annotate will write out PED file + work_paths = { # annotate_seqvars will write out PED file "ped": f"{prefix}.ped", "ped_md5": f"{prefix}.ped.md5", "gts": f"{prefix}.gts.tsv.gz", @@ -289,10 +283,12 @@ def _get_output_files_annotate(self): # Generate paths in "output/" directory yield "output_links", [ re.sub(r"^work/", "output/", work_path) - for work_path in chain(work_paths.values(), self.get_log_file("annotate").values()) + for work_path in chain( + work_paths.values(), self.get_log_file("annotate_seqvars").values() + ) ] - def _get_params_annotate(self, wildcards: Wildcards) -> typing.Dict[str, typing.Any]: + def _get_params_annotate_seqvars(self, wildcards: Wildcards) -> typing.Dict[str, typing.Any]: pedigree = self.index_ngs_library_to_pedigree[wildcards.index_ngs_library] for donor in pedigree.donors: if ( @@ -303,7 +299,7 @@ def _get_params_annotate(self, wildcards: Wildcards) -> typing.Dict[str, typing. return {"step_name": "varfish_export"} @dictify - def _get_input_files_annotate_svs(self, wildcards): + def _get_input_files_annotate_strucvars(self, wildcards): yield "ped", "work/write_pedigree.{index_ngs_library}/out/{index_ngs_library}.ped" if self.parent.config["path_sv_calling_targeted"]: @@ -387,16 +383,14 @@ def _get_input_files_annotate_svs(self, wildcards): yield "vcf_cov", cov_vcfs @dictify - def _get_output_files_annotate_svs(self): + def _get_output_files_annotate_strucvars(self): prefix = ( "work/{mapper}.varfish_export.{index_ngs_library}/out/" - "{mapper}.varfish_annotator_annotate_svs.{index_ngs_library}" + "{mapper}.mehari_annotate_strucvars.{index_ngs_library}" ) work_paths = { "gts": f"{prefix}.gts.tsv.gz", "gts_md5": f"{prefix}.gts.tsv.gz.md5", - "feature_effects": f"{prefix}.feature-effects.tsv.gz", - "feature_effects_md5": f"{prefix}.feature-effects.tsv.gz.md5", "db_infos": f"{prefix}.db-infos.tsv.gz", "db_infos_md5": f"{prefix}.db-infos.tsv.gz.md5", } @@ -404,11 +398,13 @@ def _get_output_files_annotate_svs(self): # Generate paths in "output/" directory yield "output_links", [ re.sub(r"^work/", "output/", work_path) - for work_path in chain(work_paths.values(), self.get_log_file("annotate_svs").values()) + for work_path in chain( + work_paths.values(), self.get_log_file("annotate_strucvars").values() + ) ] #: Alias the get params function. 
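+    #: (annotate_strucvars reuses the seqvars params getter, which only passes
+    #: the step name through to the wrapper.)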
- _get_params_annotate_svs = _get_params_annotate + _get_params_annotate_strucvars = _get_params_annotate_seqvars @dictify def _get_input_files_bam_qc(self, wildcards): @@ -439,7 +435,7 @@ def _get_input_files_bam_qc(self, wildcards): def _get_output_files_bam_qc(self) -> SnakemakeDictItemsGenerator: prefix = ( "work/{mapper}.varfish_export.{index_ngs_library}/out/" - "{mapper}.varfish_annotator_bam_qc.{index_ngs_library}" + "{mapper}.mehari_bam_qc.{index_ngs_library}" ) work_paths = { "bam_qc": f"{prefix}.bam-qc.tsv.gz", @@ -502,9 +498,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ) # Register sub step classes so the sub steps are available - self.register_sub_step_classes( - (WritePedigreeStepPart, VarfishAnnotatorAnnotateStepPart, LinkOutStepPart) - ) + self.register_sub_step_classes((WritePedigreeStepPart, MehariStepPart, LinkOutStepPart)) # Register sub workflows self.register_sub_workflow("variant_calling", self.config["path_variant_calling"]) @@ -564,8 +558,8 @@ def get_result_files(self): We will process all primary DNA libraries and perform joint calling within pedigrees """ - for action in self.sub_steps["varfish_annotator"].actions: - yield from self.sub_steps["varfish_annotator"].get_result_files(action) + for action in self.sub_steps["mehari"].actions: + yield from self.sub_steps["mehari"].get_result_files(action) def check_config(self): self.ensure_w_config( diff --git a/snappy_wrappers/wrappers/mehari/annotate_seqvars/environment.yaml b/snappy_wrappers/wrappers/mehari/annotate_seqvars/environment.yaml new file mode 120000 index 000000000..2e107ac86 --- /dev/null +++ b/snappy_wrappers/wrappers/mehari/annotate_seqvars/environment.yaml @@ -0,0 +1 @@ +../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/mehari/annotate_seqvars/wrapper.py b/snappy_wrappers/wrappers/mehari/annotate_seqvars/wrapper.py new file mode 100644 index 000000000..dce342909 --- /dev/null +++ b/snappy_wrappers/wrappers/mehari/annotate_seqvars/wrapper.py @@ -0,0 +1,120 @@ +import os + +from snakemake.shell import shell + +__author__ = "Manuel Holtgrewe " + +# Get shortcut to configuration of varfish_export step +step_name = snakemake.params.args["step_name"] +export_config = snakemake.config["step_config"][step_name] + +DEF_HELPER_FUNCS = r""" +compute-md5() +{ + if [[ $# -ne 2 ]]; then + >&2 echo "Invalid number of arguments: $#" + exit 1 + fi + md5sum $1 \ + | awk '{ gsub(/.*\//, "", $2); print; }' \ + > $2 +} +""" + +shell( + r""" +set -x + +# Write files for reproducibility ----------------------------------------------------------------- + +{DEF_HELPER_FUNCS} + +# Write out information about conda and save a copy of the wrapper with picked variables +# as well as the environment.yaml file. 
+conda list >{snakemake.log.conda_list}
+conda info >{snakemake.log.conda_info}
+compute-md5 {snakemake.log.conda_list} {snakemake.log.conda_list_md5}
+compute-md5 {snakemake.log.conda_info} {snakemake.log.conda_info_md5}
+cp {__real_file__} {snakemake.log.wrapper}
+compute-md5 {snakemake.log.wrapper} {snakemake.log.wrapper_md5}
+cp $(dirname {__file__})/environment.yaml {snakemake.log.env_yaml}
+compute-md5 {snakemake.log.env_yaml} {snakemake.log.env_yaml_md5}
+
+# Also pipe stderr to log file --------------------------------------------------------------------
+
+if [[ -n "{snakemake.log.log}" ]]; then
+    if [[ "$(set +e; tty; set -e)" != "" ]]; then
+        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
+        exec 2> >(tee -a "{snakemake.log.log}" >&2)
+    else
+        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
+        echo "No tty, logging disabled" >"{snakemake.log.log}"
+    fi
+fi
+
+# Create auto-cleaned temporary directory
+export TMPDIR=$(mktemp -d)
+trap "rm -rf $TMPDIR" EXIT
+
+# Run actual tools --------------------------------------------------------------------------------
+
+# Extract around BED file, if given.
+if [[ -n "{export_config[path_exon_bed]}" ]] && [[ "{export_config[path_exon_bed]}" != "None" ]]; then
+    set -e
+    bcftools view \
+        -R {export_config[path_exon_bed]} \
+        {snakemake.input.vcf} \
+    | bcftools sort -T $TMPDIR \
+    | bcftools norm -d all \
+    | bgzip -c \
+    > $TMPDIR/tmp.vcf.gz
+    tabix -f $TMPDIR/tmp.vcf.gz
+else
+    set -e
+    ln -sr {snakemake.input.vcf} $TMPDIR/tmp.vcf.gz
+    ln -sr {snakemake.input.vcf}.tbi $TMPDIR/tmp.vcf.gz.tbi
+fi
+
+# Perform Mehari sequence variant annotation.
+mehari \
+    annotate \
+    seqvars \
+    --path-db {export_config[path_mehari_db]} \
+    --path-input-ped {snakemake.input.ped} \
+    --path-input-vcf $TMPDIR/tmp.vcf.gz \
+    --path-output-tsv {snakemake.output.gts}
+
+cat <<EOF | gzip -c > {snakemake.output.db_infos}
+genomebuild	db_name	release
+GRCh37	clinvar	20210728
+GRCh37	gnomad_exomes	r2.1.1
+GRCh37	gnomad_genomes	r2.1.1
+EOF
+
+# Copy out PED file to output
+cp -H {snakemake.input.ped} {snakemake.output.ped}
+
+# Compute MD5 sums on output files
+compute-md5 {snakemake.output.db_infos} {snakemake.output.db_infos_md5}
+compute-md5 {snakemake.output.gts} {snakemake.output.gts_md5}
+compute-md5 {snakemake.output.ped} {snakemake.output.ped_md5}
+
+# Create output links -----------------------------------------------------------------------------
+
+for path in {snakemake.output.output_links}; do
+  dst=$path
+  src=work/${{dst#output/}}
+  ln -sr $src $dst
+done
+"""
+)
+
+# Compute MD5 sums of logs.
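+# (Done in a separate shell() call so the tee'd stderr log from the block above
+# can be hashed; the sleep below is a best-effort wait for the final flush.)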
+shell( + r""" +{DEF_HELPER_FUNCS} + +sleep 1s # try to wait for log file flush +compute-md5 {snakemake.log.log} {snakemake.log.log_md5} +""" +) diff --git a/snappy_wrappers/wrappers/mehari/annotate_strucvars/environment.yaml b/snappy_wrappers/wrappers/mehari/annotate_strucvars/environment.yaml new file mode 120000 index 000000000..2e107ac86 --- /dev/null +++ b/snappy_wrappers/wrappers/mehari/annotate_strucvars/environment.yaml @@ -0,0 +1 @@ +../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/mehari/annotate_strucvars/fix_manta_invs.py b/snappy_wrappers/wrappers/mehari/annotate_strucvars/fix_manta_invs.py new file mode 100644 index 000000000..8b306e707 --- /dev/null +++ b/snappy_wrappers/wrappers/mehari/annotate_strucvars/fix_manta_invs.py @@ -0,0 +1,192 @@ +"""Helper script to fix MANTA inversions.""" + +import argparse +import enum +import subprocess +import sys + +import vcfpy + + +def looks_like_manta(header): + """Checks VCF header whether the file looks like coming from MANTA.""" + for line in header.lines: + if line.key == "source" and line.value.startswith("GenerateSVCandidates"): + return True + elif line.key == "source" and line.value.startswith("DRAGEN"): + return True + return False + + +@enum.unique +class InversionType(enum.Enum): + """Inversion type.""" + + INV3 = "INV3" + INV5 = "INV5" + NONE = "NONE" + + +def analyze_inv(record): + def get_mate_info(alt, c): + arr = alt.split(c) + mate_chrom, mate_pos = arr[1].split(":") + return mate_chrom, int(mate_pos) + + alt = record.ALT[0].serialize() + if alt.startswith("["): + mate_chrom, mate_pos = get_mate_info(alt, "[") + if mate_chrom == record.CHROM: + return mate_chrom, mate_pos, InversionType.INV5 + elif alt.endswith("]"): + mate_chrom, mate_pos = get_mate_info(alt, "]") + if mate_chrom == record.CHROM: + return mate_chrom, mate_pos, InversionType.INV3 + return None, None, InversionType.NONE + + +def collect_inv_bnds(reader): + result = {} + for record in reader: + _, _, inv_type = analyze_inv(record) + if inv_type != InversionType.NONE: + if record.ID[0] in result: + result[record.ID[0]] = record + else: + result[record.INFO["MATEID"][0]] = None + return result + + +def samtools_faidx(reference_fasta, chrom, start_pos, end_pos): + region = f"{chrom}:{start_pos}-{end_pos}" + samtools_out = subprocess.check_output( + ["samtools", "faidx", reference_fasta, region], text=True + ) + ref_seq = [] + for seq in samtools_out.split("\n"): + if not seq.startswith(">"): + ref_seq.append(seq) + return "".join(ref_seq).upper() + + +def convert_manta_inversions(args, reader, writer, inv_bnds): + for record in reader: + if record.ID[0] in inv_bnds: + continue # skip mate record + _, mate_pos, inv_type = analyze_inv(record) + if inv_type != InversionType.NONE: + if inv_type == InversionType.INV5: + # Adjust POS for 5' inversion + record.POS -= 1 + mate_pos -= 1 + record.REF = samtools_faidx( + args.reference_fasta, record.CHROM, record.POS, record.POS + ) + + id_suffix = record.ID[0].split("MantaBND")[1] + idx = id_suffix.rfind(":") + record.ID = ["MantaINV%s" % id_suffix[:idx]] + + record.ALT = [vcfpy.SymbolicAllele("INV")] + + record.INFO["END"] = mate_pos + record.INFO["SVTYPE"] = "INV" + record.INFO["SVLEN"] = [mate_pos - record.POS] + if record.INFO.get("IMPRECISE"): + mate_id = record.INFO["MATEID"][0] + record.INFO["CIEND"] = inv_bnds[mate_id].INFO["CIPOS"] + elif record.INFO.get("HOMLEN"): + record.INFO["CIEND"] = [-record.INFO["HOMLEN"][0], 0] + + if inv_type == InversionType.INV5 and "HOMSEQ" in 
record.INFO: + cipos = record.INFO["CIPOS"] + hom_seq_start = record.POS + cipos[0] + 1 + hom_seq_end = record.POS + cipos[1] + record.INFO["HOMSEQ"] = [ + samtools_faidx(args.reference_fasta, record.CHROM, hom_seq_start, hom_seq_end) + ] + + # remove BND-specific info + for key in ("MATEID", "BND_DEPTH", "MATE_BND_DEPTH"): + if key in record.INFO: + record.INFO.pop(key) + + # update INFO/EVENT + if "EVENT" in record.INFO: + eid_suffix = record.INFO["EVENT"].split("MantaBND")[1] + idx = eid_suffix.rfind(":") + record.INFO["EVENT"] = "MantaINV%s" % eid_suffix[:idx] + + # set INV3/INV5 tags + if inv_type == InversionType.INV3: + record.INFO["INV3"] = True + elif inv_type == InversionType.INV5: + record.INFO["INV5"] = True + + record.INFO = dict(sorted(record.INFO.items())) + + writer.write_record(record) + + +def run_manta(args, reader): + """Run MANTA conversion.""" + # First, collect all inversion BND records. + with vcfpy.Reader.from_path(args.input_vcf) as reader2: + inv_bnds = collect_inv_bnds(reader2) + # Go through file a second time and convert inversions. + header = reader.header.copy() + header.add_info_line( + { + "ID": "INV3", + "Number": "0", + "Type": "Flag", + "Description": "Inversion breakends open 3' of reported location", + } + ) + header.add_info_line( + { + "ID": "INV5", + "Number": "0", + "Type": "Flag", + "Description": "Inversion breakends open 5' of reported location", + } + ) + header.add_line( + vcfpy.AltAlleleHeaderLine.from_mapping( + { + "ID": "INV", + "Description": "Inversion", + } + ) + ) + with vcfpy.Writer.from_path(args.output_vcf, header) as writer: + convert_manta_inversions(args, reader, writer, inv_bnds) + + +def run(args): + """Main entry point after parsing command line. + + Non-MANTA VCF files are handled directly by copying. MANTA files are processed with + run_manta(). + """ + with vcfpy.Reader.from_path(args.input_vcf) as reader: + if not looks_like_manta(reader.header): # simply copy + with vcfpy.Writer.from_path(args.output_vcf, reader.header) as writer: + for record in reader: + writer.write_record(record) + else: + run_manta(args, reader) + + +def main(argv=None): + parser = argparse.ArgumentParser() + parser.add_argument("--reference-fasta", required=True, help="Reference FASTA file.") + parser.add_argument("--input-vcf", required=True, help="Input VCF file.") + parser.add_argument("--output-vcf", required=True, help="Output VCF file.") + + args = parser.parse_args(argv) + return run(args) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/snappy_wrappers/wrappers/mehari/annotate_strucvars/wrapper.py b/snappy_wrappers/wrappers/mehari/annotate_strucvars/wrapper.py new file mode 100644 index 000000000..7ea72deac --- /dev/null +++ b/snappy_wrappers/wrappers/mehari/annotate_strucvars/wrapper.py @@ -0,0 +1,135 @@ +import os + +from snakemake.shell import shell + +__author__ = "Manuel Holtgrewe " + +# Optionally get path to coverage VCF file. 
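+# (The "vcf_cov" input is only present when the SV calling step provides
+# coverage VCFs; getattr() with a default keeps it optional.)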
+coverage_vcf = " ".join(getattr(snakemake.input, "vcf_cov", []))
+
+# Get shortcut to configuration of varfish_export step
+step_name = snakemake.params.args["step_name"]
+export_config = snakemake.config["step_config"][step_name]
+# Get shortcut to "fix_manta_invs.py" postprocessing script
+fix_manta_invs = os.path.join(
+    os.path.dirname(__file__),
+    "fix_manta_invs.py",
+)
+
+DEF_HELPER_FUNCS = r"""
+compute-md5()
+{
+    if [[ $# -ne 2 ]]; then
+        >&2 echo "Invalid number of arguments: $#"
+        exit 1
+    fi
+    md5sum $1 \
+    | awk '{ gsub(/.*\//, "", $2); print; }' \
+    > $2
+}
+"""
+
+shell(
+    r"""
+set -x
+
+# Write files for reproducibility -----------------------------------------------------------------
+
+{DEF_HELPER_FUNCS}
+
+# Write out information about conda and save a copy of the wrapper with picked variables
+# as well as the environment.yaml file.
+conda list >{snakemake.log.conda_list}
+conda info >{snakemake.log.conda_info}
+compute-md5 {snakemake.log.conda_list} {snakemake.log.conda_list_md5}
+compute-md5 {snakemake.log.conda_info} {snakemake.log.conda_info_md5}
+cp {__real_file__} {snakemake.log.wrapper}
+compute-md5 {snakemake.log.wrapper} {snakemake.log.wrapper_md5}
+cp $(dirname {__file__})/environment.yaml {snakemake.log.env_yaml}
+compute-md5 {snakemake.log.env_yaml} {snakemake.log.env_yaml_md5}
+
+# Also pipe stderr to log file --------------------------------------------------------------------
+
+if [[ -n "{snakemake.log.log}" ]]; then
+    if [[ "$(set +e; tty; set -e)" != "" ]]; then
+        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
+        exec 2> >(tee -a "{snakemake.log.log}" >&2)
+    else
+        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
+        echo "No tty, logging disabled" >"{snakemake.log.log}"
+    fi
+fi
+
+# Create auto-cleaned temporary directory
+export TMPDIR=$(mktemp -d)
+trap "rm -rf $TMPDIR" EXIT
+
+# Run actual tools --------------------------------------------------------------------------------
+
+samples=$(cut -f 2 {snakemake.input.ped} | tr '\n' ',' | sed -e 's/,$//g')
+
+# Fix the Manta inversions
+i=0
+for vcf in {snakemake.input.vcf}; do
+    let "i=$i+1"
+    num=$(printf %03d $i)
+
+    python3 {fix_manta_invs} \
+        --reference-fasta {snakemake.config[static_data_config][reference][path]} \
+        --input-vcf $vcf \
+        --output-vcf $TMPDIR/fixed_bnd_to_inv_unsorted.$num.vcf
+    bcftools sort -o $TMPDIR/fixed_bnd_to_inv.$num.vcf $TMPDIR/fixed_bnd_to_inv_unsorted.$num.vcf
+
+    # Add the missing "GT" tag
+    echo '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">' \
+    > $TMPDIR/header.gt.txt
+
+    bcftools annotate \
+        -h $TMPDIR/header.gt.txt \
+        $TMPDIR/fixed_bnd_to_inv.$num.vcf \
+        -O z \
+        -o $TMPDIR/final_for_import.$num.vcf.gz
+    tabix -s1 -b2 -e2 -f $TMPDIR/final_for_import.$num.vcf.gz
+done
+
+# Perform Mehari structural variant annotation.
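+# (Each post-processed VCF below is passed via its own --path-input-vcf flag,
+# and the output TSV is gzip-compressed on the fly via process substitution.)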
+mehari \
+    annotate \
+    strucvars \
+    --path-db {export_config[path_mehari_db]} \
+    --path-input-ped {snakemake.input.ped} \
+    $(for p in $TMPDIR/final_for_import.*.vcf.gz; do \
+        echo --path-input-vcf $p; \
+    done) \
+    --path-output-tsv >(gzip -c > {snakemake.output.gts})
+
+cat <<EOF | gzip -c > {snakemake.output.db_infos}
+genomebuild	db_name	release
+GRCh37	clinvar	20210728
+GRCh37	gnomad_exomes	r2.1.1
+GRCh37	gnomad_genomes	r2.1.1
+EOF
+
+# Compute MD5 sums on output files
+compute-md5 {snakemake.output.db_infos} {snakemake.output.db_infos_md5}
+compute-md5 {snakemake.output.gts} {snakemake.output.gts_md5}
+
+# Create output links -----------------------------------------------------------------------------
+
+for path in {snakemake.output.output_links}; do
+  dst=$path
+  src=work/${{dst#output/}}
+  ln -sr $src $dst
+done
+"""
+)
+
+# Compute MD5 sums of logs.
+shell(
+    r"""
+{DEF_HELPER_FUNCS}
+
+sleep 1s  # try to wait for log file flush
+compute-md5 {snakemake.log.log} {snakemake.log.log_md5}
+"""
+)
diff --git a/snappy_wrappers/wrappers/mehari/bam_qc/environment.yaml b/snappy_wrappers/wrappers/mehari/bam_qc/environment.yaml
new file mode 120000
index 000000000..2e107ac86
--- /dev/null
+++ b/snappy_wrappers/wrappers/mehari/bam_qc/environment.yaml
@@ -0,0 +1 @@
+../environment.yaml
\ No newline at end of file
diff --git a/snappy_wrappers/wrappers/mehari/bam_qc/wrapper.py b/snappy_wrappers/wrappers/mehari/bam_qc/wrapper.py
new file mode 120000
index 000000000..16f251594
--- /dev/null
+++ b/snappy_wrappers/wrappers/mehari/bam_qc/wrapper.py
@@ -0,0 +1 @@
+../../varfish_annotator/bam_qc/wrapper.py
\ No newline at end of file
diff --git a/snappy_wrappers/wrappers/mehari/environment.yaml b/snappy_wrappers/wrappers/mehari/environment.yaml
new file mode 100644
index 000000000..065eca483
--- /dev/null
+++ b/snappy_wrappers/wrappers/mehari/environment.yaml
@@ -0,0 +1,11 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - bcftools >=1.9
+  - htslib >=1.9
+  - samtools >=1.9
+  - mehari ==0.2.1
+  - jq
+  - vcfpy
+  - pysam
diff --git a/snappy_wrappers/wrappers/varfish_annotator/annotate_svs/fix_manta_invs.py b/snappy_wrappers/wrappers/varfish_annotator/annotate_svs/fix_manta_invs.py
deleted file mode 100644
index 8b306e707..000000000
--- a/snappy_wrappers/wrappers/varfish_annotator/annotate_svs/fix_manta_invs.py
+++ /dev/null
@@ -1,192 +0,0 @@
-"""Helper script to fix MANTA inversions."""
-
-import argparse
-import enum
-import subprocess
-import sys
-
-import vcfpy
-
-
-def looks_like_manta(header):
-    """Checks VCF header whether the file looks like coming from MANTA."""
-    for line in header.lines:
-        if line.key == "source" and line.value.startswith("GenerateSVCandidates"):
-            return True
-        elif line.key == "source" and line.value.startswith("DRAGEN"):
-            return True
-    return False
-
-
-@enum.unique
-class InversionType(enum.Enum):
-    """Inversion type."""
-
-    INV3 = "INV3"
-    INV5 = "INV5"
-    NONE = "NONE"
-
-
-def analyze_inv(record):
-    def get_mate_info(alt, c):
-        arr = alt.split(c)
-        mate_chrom, mate_pos = arr[1].split(":")
-        return mate_chrom, int(mate_pos)
-
-    alt = record.ALT[0].serialize()
-    if alt.startswith("["):
-        mate_chrom, mate_pos = get_mate_info(alt, "[")
-        if mate_chrom == record.CHROM:
-            return mate_chrom, mate_pos, InversionType.INV5
-    elif alt.endswith("]"):
-        mate_chrom, mate_pos = get_mate_info(alt, "]")
-        if mate_chrom == record.CHROM:
-            return mate_chrom, mate_pos, InversionType.INV3
-    return None, None, InversionType.NONE
-
-
-def collect_inv_bnds(reader):
-    result = {}
-    for record in
reader: - _, _, inv_type = analyze_inv(record) - if inv_type != InversionType.NONE: - if record.ID[0] in result: - result[record.ID[0]] = record - else: - result[record.INFO["MATEID"][0]] = None - return result - - -def samtools_faidx(reference_fasta, chrom, start_pos, end_pos): - region = f"{chrom}:{start_pos}-{end_pos}" - samtools_out = subprocess.check_output( - ["samtools", "faidx", reference_fasta, region], text=True - ) - ref_seq = [] - for seq in samtools_out.split("\n"): - if not seq.startswith(">"): - ref_seq.append(seq) - return "".join(ref_seq).upper() - - -def convert_manta_inversions(args, reader, writer, inv_bnds): - for record in reader: - if record.ID[0] in inv_bnds: - continue # skip mate record - _, mate_pos, inv_type = analyze_inv(record) - if inv_type != InversionType.NONE: - if inv_type == InversionType.INV5: - # Adjust POS for 5' inversion - record.POS -= 1 - mate_pos -= 1 - record.REF = samtools_faidx( - args.reference_fasta, record.CHROM, record.POS, record.POS - ) - - id_suffix = record.ID[0].split("MantaBND")[1] - idx = id_suffix.rfind(":") - record.ID = ["MantaINV%s" % id_suffix[:idx]] - - record.ALT = [vcfpy.SymbolicAllele("INV")] - - record.INFO["END"] = mate_pos - record.INFO["SVTYPE"] = "INV" - record.INFO["SVLEN"] = [mate_pos - record.POS] - if record.INFO.get("IMPRECISE"): - mate_id = record.INFO["MATEID"][0] - record.INFO["CIEND"] = inv_bnds[mate_id].INFO["CIPOS"] - elif record.INFO.get("HOMLEN"): - record.INFO["CIEND"] = [-record.INFO["HOMLEN"][0], 0] - - if inv_type == InversionType.INV5 and "HOMSEQ" in record.INFO: - cipos = record.INFO["CIPOS"] - hom_seq_start = record.POS + cipos[0] + 1 - hom_seq_end = record.POS + cipos[1] - record.INFO["HOMSEQ"] = [ - samtools_faidx(args.reference_fasta, record.CHROM, hom_seq_start, hom_seq_end) - ] - - # remove BND-specific info - for key in ("MATEID", "BND_DEPTH", "MATE_BND_DEPTH"): - if key in record.INFO: - record.INFO.pop(key) - - # update INFO/EVENT - if "EVENT" in record.INFO: - eid_suffix = record.INFO["EVENT"].split("MantaBND")[1] - idx = eid_suffix.rfind(":") - record.INFO["EVENT"] = "MantaINV%s" % eid_suffix[:idx] - - # set INV3/INV5 tags - if inv_type == InversionType.INV3: - record.INFO["INV3"] = True - elif inv_type == InversionType.INV5: - record.INFO["INV5"] = True - - record.INFO = dict(sorted(record.INFO.items())) - - writer.write_record(record) - - -def run_manta(args, reader): - """Run MANTA conversion.""" - # First, collect all inversion BND records. - with vcfpy.Reader.from_path(args.input_vcf) as reader2: - inv_bnds = collect_inv_bnds(reader2) - # Go through file a second time and convert inversions. - header = reader.header.copy() - header.add_info_line( - { - "ID": "INV3", - "Number": "0", - "Type": "Flag", - "Description": "Inversion breakends open 3' of reported location", - } - ) - header.add_info_line( - { - "ID": "INV5", - "Number": "0", - "Type": "Flag", - "Description": "Inversion breakends open 5' of reported location", - } - ) - header.add_line( - vcfpy.AltAlleleHeaderLine.from_mapping( - { - "ID": "INV", - "Description": "Inversion", - } - ) - ) - with vcfpy.Writer.from_path(args.output_vcf, header) as writer: - convert_manta_inversions(args, reader, writer, inv_bnds) - - -def run(args): - """Main entry point after parsing command line. - - Non-MANTA VCF files are handled directly by copying. MANTA files are processed with - run_manta(). 
- """ - with vcfpy.Reader.from_path(args.input_vcf) as reader: - if not looks_like_manta(reader.header): # simply copy - with vcfpy.Writer.from_path(args.output_vcf, reader.header) as writer: - for record in reader: - writer.write_record(record) - else: - run_manta(args, reader) - - -def main(argv=None): - parser = argparse.ArgumentParser() - parser.add_argument("--reference-fasta", required=True, help="Reference FASTA file.") - parser.add_argument("--input-vcf", required=True, help="Input VCF file.") - parser.add_argument("--output-vcf", required=True, help="Output VCF file.") - - args = parser.parse_args(argv) - return run(args) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/snappy_wrappers/wrappers/varfish_annotator/annotate_svs/fix_manta_invs.py b/snappy_wrappers/wrappers/varfish_annotator/annotate_svs/fix_manta_invs.py new file mode 120000 index 000000000..0eefaaa27 --- /dev/null +++ b/snappy_wrappers/wrappers/varfish_annotator/annotate_svs/fix_manta_invs.py @@ -0,0 +1 @@ +../../mehari/annotate_strucvars/fix_manta_invs.py \ No newline at end of file diff --git a/tests/snappy_pipeline/workflows/test_workflows_varfish_export.py b/tests/snappy_pipeline/workflows/test_workflows_varfish_export.py index 6595c69f5..02b2f4141 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_varfish_export.py +++ b/tests/snappy_pipeline/workflows/test_workflows_varfish_export.py @@ -88,11 +88,11 @@ def varfish_export_workflow( ) -# Tests for VarfishAnnotatorAnnotateStepPart ------------------------------------------------------- +# Tests for MehariAnnotateStepPart ------------------------------------------------------- -def test_varfish_annotator_step_part_get_input_files_annotate(varfish_export_workflow): - """Tests VarfishAnnotatorAnnotateStepPart._get_input_files_annotate()""" +def test_mehari_step_part_get_input_files_annotate(varfish_export_workflow): + """Tests MehariAnnotateStepPart._get_input_files_annotate()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", @@ -109,12 +109,12 @@ def test_varfish_annotator_step_part_get_input_files_annotate(varfish_export_wor "vcf": [base_name + ".vcf.gz"], } # Get actual - actual = varfish_export_workflow.get_input_files("varfish_annotator", "annotate")(wildcards) + actual = varfish_export_workflow.get_input_files("mehari", "annotate_seqvars")(wildcards) assert actual == expected -def test_varfish_annotator_step_part_get_input_files_bam_qc(varfish_export_workflow): - """Tests VarfishAnnotatorAnnotateStepPart._get_input_files_bam_qc()""" +def test_mehari_step_part_get_input_files_bam_qc(varfish_export_workflow): + """Tests MehariAnnotateStepPart._get_input_files_bam_qc()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", @@ -137,16 +137,16 @@ def test_varfish_annotator_step_part_get_input_files_bam_qc(varfish_export_workf "cov_qc": [base_name_cov.format(i=i) for i in donor_indices], } # Get actual - actual = varfish_export_workflow.get_input_files("varfish_annotator", "bam_qc")(wildcards) + actual = varfish_export_workflow.get_input_files("mehari", "bam_qc")(wildcards) assert actual == expected -def test_varfish_annotator_step_part_get_output_files_annotate(varfish_export_workflow): - """Tests VarfishAnnotatorAnnotateStepPart._get_output_files_annotate()""" +def test_mehari_step_part_get_output_files_annotate(varfish_export_workflow): + """Tests MehariAnnotateStepPart._get_output_files_annotate()""" # Define expected base_name_out = ( "work/{mapper}.varfish_export.{index_ngs_library}/out/" - 
"{mapper}.varfish_annotator_annotate.{index_ngs_library}" + "{mapper}.mehari_annotate_seqvars.{index_ngs_library}" ) expected = { "gts": base_name_out + ".gts.tsv.gz", @@ -156,69 +156,69 @@ def test_varfish_annotator_step_part_get_output_files_annotate(varfish_export_wo "ped": base_name_out + ".ped", "ped_md5": base_name_out + ".ped.md5", "output_links": [ - "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.varfish_annotator_annotate.{index_ngs_library}.ped", - "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.varfish_annotator_annotate.{index_ngs_library}.ped.md5", - "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.varfish_annotator_annotate.{index_ngs_library}.gts.tsv.gz", - "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.varfish_annotator_annotate.{index_ngs_library}.gts.tsv.gz.md5", - "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.varfish_annotator_annotate.{index_ngs_library}.db-infos.tsv.gz", - "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.varfish_annotator_annotate.{index_ngs_library}.db-infos.tsv.gz.md5", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_annotate.{index_ngs_library}.wrapper.py", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_annotate.{index_ngs_library}.wrapper.py.md5", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_annotate.{index_ngs_library}.log", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_annotate.{index_ngs_library}.log.md5", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_annotate.{index_ngs_library}.conda_info.txt", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_annotate.{index_ngs_library}.conda_info.txt.md5", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_annotate.{index_ngs_library}.conda_list.txt", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_annotate.{index_ngs_library}.conda_list.txt.md5", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_annotate.{index_ngs_library}.environment.yaml", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_annotate.{index_ngs_library}.environment.yaml.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.ped", + "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.ped.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.gts.tsv.gz", + "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.gts.tsv.gz.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.db-infos.tsv.gz", + "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.db-infos.tsv.gz.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.wrapper.py", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.wrapper.py.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.log", 
+ "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.log.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.conda_info.txt", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.conda_info.txt.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.conda_list.txt", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.conda_list.txt.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.environment.yaml", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_annotate_seqvars.{index_ngs_library}.environment.yaml.md5", ], } # Get actual - actual = varfish_export_workflow.get_output_files("varfish_annotator", "annotate") + actual = varfish_export_workflow.get_output_files("mehari", "annotate_seqvars") assert actual == expected -def test_varfish_annotator_step_part_get_output_files_bam_qc(varfish_export_workflow): - """Tests VarfishAnnotatorAnnotateStepPart._get_output_files_bam_qc()""" +def test_mehari_step_part_get_output_files_bam_qc(varfish_export_workflow): + """Tests MehariAnnotateStepPart._get_output_files_bam_qc()""" # Define expected base_name_out = ( "work/{mapper}.varfish_export.{index_ngs_library}/out/" - "{mapper}.varfish_annotator_bam_qc.{index_ngs_library}" + "{mapper}.mehari_bam_qc.{index_ngs_library}" ) expected = { "bam_qc": base_name_out + ".bam-qc.tsv.gz", "bam_qc_md5": base_name_out + ".bam-qc.tsv.gz.md5", "output_links": [ - "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.varfish_annotator_bam_qc.{index_ngs_library}.bam-qc.tsv.gz", - "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.varfish_annotator_bam_qc.{index_ngs_library}.bam-qc.tsv.gz.md5", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_bam_qc.{index_ngs_library}.wrapper.py", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_bam_qc.{index_ngs_library}.wrapper.py.md5", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_bam_qc.{index_ngs_library}.log", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_bam_qc.{index_ngs_library}.log.md5", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_bam_qc.{index_ngs_library}.conda_info.txt", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_bam_qc.{index_ngs_library}.conda_info.txt.md5", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_bam_qc.{index_ngs_library}.conda_list.txt", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_bam_qc.{index_ngs_library}.conda_list.txt.md5", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_bam_qc.{index_ngs_library}.environment.yaml", - "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.varfish_annotator_bam_qc.{index_ngs_library}.environment.yaml.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.mehari_bam_qc.{index_ngs_library}.bam-qc.tsv.gz", + "output/{mapper}.varfish_export.{index_ngs_library}/out/{mapper}.mehari_bam_qc.{index_ngs_library}.bam-qc.tsv.gz.md5", + 
"output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_bam_qc.{index_ngs_library}.wrapper.py", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_bam_qc.{index_ngs_library}.wrapper.py.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_bam_qc.{index_ngs_library}.log", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_bam_qc.{index_ngs_library}.log.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_bam_qc.{index_ngs_library}.conda_info.txt", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_bam_qc.{index_ngs_library}.conda_info.txt.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_bam_qc.{index_ngs_library}.conda_list.txt", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_bam_qc.{index_ngs_library}.conda_list.txt.md5", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_bam_qc.{index_ngs_library}.environment.yaml", + "output/{mapper}.varfish_export.{index_ngs_library}/log/{mapper}.mehari_bam_qc.{index_ngs_library}.environment.yaml.md5", ], } # Get actual - actual = varfish_export_workflow.get_output_files("varfish_annotator", "bam_qc") + actual = varfish_export_workflow.get_output_files("mehari", "bam_qc") assert actual == expected -def test_varfish_annotator_step_part_get_log_file_annotate(varfish_export_workflow): - """Tests VarfishAnnotatorAnnotateStepPart._get_log_file()_annotate""" +def test_mehari_step_part_get_log_file_annotate(varfish_export_workflow): + """Tests MehariAnnotateStepPart._get_log_file()_annotate""" # Define expected base_name_log = ( "work/{mapper}.varfish_export.{index_ngs_library}/log/" - "{mapper}.varfish_annotator_annotate.{index_ngs_library}" + "{mapper}.mehari_annotate_seqvars.{index_ngs_library}" ) base_name_wrapper = ( "work/{mapper}.varfish_export.{index_ngs_library}/log/" - "{mapper}.varfish_annotator_annotate.{index_ngs_library}" + "{mapper}.mehari_annotate_seqvars.{index_ngs_library}" ) wrapper_dict = { "wrapper": base_name_wrapper + ".wrapper.py", @@ -229,20 +229,20 @@ def test_varfish_annotator_step_part_get_log_file_annotate(varfish_export_workfl log_dict = get_expected_log_files_dict(base_out=base_name_log) expected = {**wrapper_dict, **log_dict} # Get actual - actual = varfish_export_workflow.get_log_file("varfish_annotator", "annotate") + actual = varfish_export_workflow.get_log_file("mehari", "annotate_seqvars") assert actual == expected -def test_varfish_annotator_step_part_get_log_file_bam_qc(varfish_export_workflow): - """Tests VarfishAnnotatorAnnotateStepPart._get_log_file()_bam_qc""" +def test_mehari_step_part_get_log_file_bam_qc(varfish_export_workflow): + """Tests MehariAnnotateStepPart._get_log_file()_bam_qc""" # Define expected base_name_log = ( "work/{mapper}.varfish_export.{index_ngs_library}/log/" - "{mapper}.varfish_annotator_bam_qc.{index_ngs_library}" + "{mapper}.mehari_bam_qc.{index_ngs_library}" ) base_name_wrapper = ( "work/{mapper}.varfish_export.{index_ngs_library}/log/" - "{mapper}.varfish_annotator_bam_qc.{index_ngs_library}" + "{mapper}.mehari_bam_qc.{index_ngs_library}" ) wrapper_dict = { "wrapper": base_name_wrapper + ".wrapper.py", @@ -253,40 +253,40 @@ def test_varfish_annotator_step_part_get_log_file_bam_qc(varfish_export_workflow log_dict = get_expected_log_files_dict(base_out=base_name_log) expected = {**wrapper_dict, **log_dict} # Get actual - actual = 
varfish_export_workflow.get_log_file("varfish_annotator", "bam_qc") + actual = varfish_export_workflow.get_log_file("mehari", "bam_qc") assert actual == expected -def test_varfish_annotator_step_part_get_params_annotate(varfish_export_workflow): - """Tests VarfishAnnotatorAnnotateStepPart._get_params_annotate()""" +def test_mehari_step_part_get_params_annotate(varfish_export_workflow): + """Tests MehariAnnotateStepPart._get_params_annotate()""" wildcards = Wildcards(fromdict={"index_ngs_library": "P001-N1-DNA1-WGS1"}) expected = {"step_name": "varfish_export"} - actual = varfish_export_workflow.get_params("varfish_annotator", "annotate")(wildcards) + actual = varfish_export_workflow.get_params("mehari", "annotate_seqvars")(wildcards) assert actual == expected -def test_varfish_annotator_step_part_get_params_bam_qc(varfish_export_workflow): - """Tests VarfishAnnotatorAnnotateStepPart._get_params_bam_qc()""" +def test_mehari_step_part_get_params_bam_qc(varfish_export_workflow): + """Tests MehariAnnotateStepPart._get_params_bam_qc()""" wildcards = Wildcards(fromdict={"index_ngs_library": "P001-N1-DNA1-WGS1"}) expected = { "P001-N1-DNA1-WGS1": "P001-N1-DNA1-WGS1", "P002-N1-DNA1-WGS1": "P002-N1-DNA1-WGS1", "P003-N1-DNA1-WGS1": "P003-N1-DNA1-WGS1", } - actual = varfish_export_workflow.get_params("varfish_annotator", "bam_qc")(wildcards) + actual = varfish_export_workflow.get_params("mehari", "bam_qc")(wildcards) assert actual == expected -def test_varfish_annotator_step_part_get_resource_usage(varfish_export_workflow): - """Tests VarfishAnnotatorAnnotateStepPart.get_resource_usage()""" - all_actions = varfish_export_workflow.substep_getattr("varfish_annotator", "actions") +def test_mehari_step_part_get_resource_usage(varfish_export_workflow): + """Tests MehariAnnotateStepPart.get_resource_usage()""" + all_actions = varfish_export_workflow.substep_getattr("mehari", "actions") # Define expected expected_dict = {"threads": 2, "time": "1-00:00:00", "memory": "14G", "partition": "medium"} # Evaluate for action in all_actions: for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' in action '{action}'." 
- actual = varfish_export_workflow.get_resource("varfish_annotator", action, resource) + actual = varfish_export_workflow.get_resource("mehari", action, resource) assert actual == expected, msg_error @@ -296,28 +296,28 @@ def test_varfish_annotator_step_part_get_resource_usage(varfish_export_workflow) def test_varfish_export_workflow(varfish_export_workflow): """Test simple functionality of the workflow""" # Check created sub steps - expected = ["link_out", "varfish_annotator", "write_pedigree"] + expected = ["link_out", "mehari", "write_pedigree"] actual = list(sorted(varfish_export_workflow.sub_steps.keys())) assert actual == expected # Check result file construction tpl = ( "output/bwa.varfish_export.P00{i}-N1-DNA1-WGS1/{dir_}/" - "bwa.varfish_annotator_{action}.P00{i}-N1-DNA1-WGS1.{ext}" + "bwa.mehari_{action}.P00{i}-N1-DNA1-WGS1.{ext}" ) # Files in `out` directory expected = [ tpl.format(dir_="out", i=i, ext=ext, action=action) for i in (1, 4) # only for indices for (action, ext) in ( - ("annotate", "gts.tsv.gz"), - ("annotate", "gts.tsv.gz.md5"), - ("annotate", "db-infos.tsv.gz"), - ("annotate", "db-infos.tsv.gz.md5"), + ("annotate_seqvars", "gts.tsv.gz"), + ("annotate_seqvars", "gts.tsv.gz.md5"), + ("annotate_seqvars", "db-infos.tsv.gz"), + ("annotate_seqvars", "db-infos.tsv.gz.md5"), ("bam_qc", "bam-qc.tsv.gz"), ("bam_qc", "bam-qc.tsv.gz.md5"), - ("annotate", "ped"), - ("annotate", "ped.md5"), + ("annotate_seqvars", "ped"), + ("annotate_seqvars", "ped.md5"), ) ] # Files in `log` directory @@ -336,7 +336,7 @@ def test_varfish_export_workflow(varfish_export_workflow): "environment.yaml", "environment.yaml.md5", ) - for action in ("annotate", "bam_qc") + for action in ("annotate_seqvars", "bam_qc") ] expected = sorted(expected) actual = sorted(varfish_export_workflow.get_result_files())
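For reference, the sequence variant export performed by the new annotate_seqvars wrapper reduces to roughly the following invocation (a sketch with placeholder paths; the real wrapper additionally subsets the input to the exon BED file, writes the PED and db-infos files, and checksums every output):

    mehari \
        annotate \
        seqvars \
        --path-db /path/to/mehari/db \
        --path-input-ped pedigree.ped \
        --path-input-vcf input.vcf.gz \
        --path-output-tsv export.gts.tsv.gz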