From 310eb7b28cf87a984b8d14afe2874ec14a052dab Mon Sep 17 00:00:00 2001 From: kirtanav98 <123595850+kirtanav98@users.noreply.github.com> Date: Fri, 5 Jan 2024 14:43:17 -0500 Subject: [PATCH] Rewrite the SplitVariants task command in TasksGenotypeBatch.wdl to call svtk only once (#618) * add python file for SplitVariants task * edited TasksGenotype.wdl command to call SplitVariants.py * changed command in TasksGenotypeBatch.wdl * changed docker to include correct tag for sv-pipeline * reformatted python script to match github lint8 formatting specifications * reformatted python script to match github lint8 formatting specifications * made changes based on first review * made edit to python script to lint correctly * made edit to python script to lint correctly, and added extra clarifying comments to code. * made edit to python script to lint correctly, and added extra clarifying comments to code. * made edits based on second review. * made edits based on second review. * made edits based on second review. * made edits based on second review. 
# --- Non-Python text from the original patch, preserved verbatim ------------
# * addressed changes in the last review
# ---
#  .../scripts/split_variants.py | 95 +++++++++++++++++++
#  wdl/TasksGenotypeBatch.wdl    | 20 +---
#  2 files changed, 100 insertions(+), 15 deletions(-)
#  create mode 100644 src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
#
# diff --git a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
# new file mode 100644
# index 000000000..34e36dba9
# --- /dev/null
# +++ b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
# @@ -0,0 +1,95 @@
# -----------------------------------------------------------------------------
#!/usr/bin/env python3
"""Split a BED file of structural variants into fixed-size chunk files.

Each input record is assigned to at most one category and appended to that
category's current chunk.  Chunks are written to the working directory as
``<prefix>.<suffix>.bed`` where ``<suffix>`` is a six-letter, lower-case,
base-26 counter (``aaaaaa``, ``aaaaab``, ..., ``aaaaaz``, ``aaaaba``, ...).

Categories (column 5 of the input is the SV type):

* ``gt5kb`` - DEL/DUP with ``end - start >= 5000``
* ``lt5kb`` - DEL/DUP with ``end - start < 5000``
* ``ins``   - INS (only when BCA output is enabled)
* ``bca``   - everything else (only when BCA output is enabled)
"""
import argparse
import logging

# 0-based column indices of the input BED records.
_START_FIELD = 1
_END_FIELD = 2
_SVTYPE_FIELD = 4

# DEL/DUP records at or above this length go to 'gt5kb', the rest to 'lt5kb'.
_SIZE_CUTOFF = 5000

# Output suffixes are fixed-width so that file names sort in write order.
_SUFFIX_WIDTH = 6
_ALPHABET = 'abcdefghijklmnopqrstuvwxyz'

# Every output category, in the order leftover chunks are flushed.
_PREFIXES = ('gt5kb', 'lt5kb', 'bca', 'ins')


def _classify(fields, bca):
    """Return the output prefix for one split record, or None to drop it."""
    svtype = fields[_SVTYPE_FIELD]
    if svtype in ('DEL', 'DUP'):
        length = int(fields[_END_FIELD]) - int(fields[_START_FIELD])
        return 'gt5kb' if length >= _SIZE_CUTOFF else 'lt5kb'
    if not bca:
        return None
    return 'ins' if svtype == 'INS' else 'bca'


def _write_chunk(prefix, suffix, lines):
    """Write one chunk of records to ``<prefix>.<padded suffix>.bed``."""
    output_file = f"{prefix}.{suffix.rjust(_SUFFIX_WIDTH, 'a')}.bed"
    with open(output_file, 'w') as outfile:
        # Terminate every record, including the last one, so that chunk files
        # can be concatenated without fusing records.
        outfile.writelines(f"{line}\n" for line in lines)
    logging.info("File '%s' written.", output_file)


def process_bed_file(input_bed, n_per_split, bca=True):
    """Split ``input_bed`` into per-category chunk files in the working directory.

    Args:
        input_bed: Path to a tab-separated BED file with at least five columns
            (start, end and SV type are read from columns 2, 3 and 5).
        n_per_split: Maximum number of records per output file.  Accepts an
            int or a numeric string; must be at least 1.
        bca: When true, also emit the ``ins`` and ``bca`` categories; when
            false, records that are not DEL/DUP are dropped.

    Raises:
        ValueError: If ``n_per_split`` is not a positive integer, if a DEL/DUP
            record has non-integer coordinates, or if a category runs out of
            six-letter suffixes.
        IndexError: If a non-blank record has fewer than five columns.
    """
    # The command line may deliver the chunk size as text; comparing an int
    # counter against a str would never be equal and would silently disable
    # chunking, so normalise it up front.
    n_per_split = int(n_per_split)
    if n_per_split < 1:
        raise ValueError(f"n_per_split must be a positive integer, got {n_per_split}")

    # NOTE(review): records are written with their columns in input order.
    # The awk pipeline this script replaced printed columns 1,2,3,4,6,5
    # (last two swapped) - confirm downstream consumers expect this order.
    pending = {prefix: [] for prefix in _PREFIXES}
    suffixes = {prefix: 'a' for prefix in _PREFIXES}

    with open(input_bed, 'r') as infile:
        for raw_line in infile:
            # Strip only the line terminator so trailing empty columns survive.
            record = raw_line.rstrip('\r\n')
            if not record.strip():
                continue

            prefix = _classify(record.split('\t'), bca)
            if prefix is None:
                continue

            bucket = pending[prefix]
            bucket.append(record)

            # A full chunk is flushed immediately and the category moves on
            # to its next suffix.
            if len(bucket) == n_per_split:
                _write_chunk(prefix, suffixes[prefix], bucket)
                pending[prefix] = []
                suffixes[prefix] = increment_suffix(suffixes[prefix])

    # Flush the partially filled chunks left over at end of input.
    for prefix, bucket in pending.items():
        if bucket:
            _write_chunk(prefix, suffixes[prefix], bucket)


def increment_suffix(suffix):
    """Return the suffix that follows ``suffix`` in base-26 order.

    The suffix is treated as a number written with the digits ``a``-``z`` and
    implicitly left-padded with ``a``: ``'a' -> 'b'``, ``'z' -> 'ba'``,
    ``'az' -> 'ba'``, ``'zz' -> 'baa'``.

    Raises:
        ValueError: If ``suffix`` is already the last six-letter suffix
            (``'zzzzzz'``) or contains a character outside ``a``-``z``.
    """
    chars = list(suffix)
    # Increment the right-most letter, carrying leftwards past every 'z'.
    for pos in reversed(range(len(chars))):
        if chars[pos] != 'z':
            chars[pos] = _ALPHABET[_ALPHABET.index(chars[pos]) + 1]
            return ''.join(chars)
        chars[pos] = 'a'

    # Every letter overflowed: carry into the implicit leading 'a' if the
    # fixed width still has room for another letter.
    if len(chars) >= _SUFFIX_WIDTH:
        raise ValueError('All possible files generated.')
    return 'b' + ''.join(chars)


def main():
    """Parse command-line arguments, configure logging and split the BED file."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--bed", help="Path to input bed file", required=True)
    parser.add_argument("--n", help="number of variants per file", required=True, type=int)
    parser.add_argument("--bca", default=False, help="Flag to set to True if the VCF contains BCAs",
                        action='store_true')
    parser.add_argument("--log-level", required=False, default="INFO",
                        help="Specify level of logging information")
    args = parser.parse_args()

    # Translate the --log-level name (e.g. "debug") into the logging constant.
    log_level = args.log_level
    numeric_level = getattr(logging, log_level.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % log_level)
    logging.basicConfig(level=numeric_level, format='%(levelname)s: %(message)s')
    process_bed_file(args.bed, args.n, args.bca)


if __name__ == '__main__':
    main()

# --- Remainder of the original patch (WDL hunk), preserved verbatim ----------
# NOTE(review): the replacement command block below no longer starts with
# `set -euo pipefail`, so a failing `svtk vcf2bed` would not by itself stop
# the task before the Python script runs - confirm this is intended.
#
# diff --git a/wdl/TasksGenotypeBatch.wdl b/wdl/TasksGenotypeBatch.wdl
# index 1b853a056..4e0d44021 100644
# --- a/wdl/TasksGenotypeBatch.wdl
# +++ b/wdl/TasksGenotypeBatch.wdl
# @@ -28,22 +28,12 @@ task SplitVariants {
#      Array[File] ins_beds = glob("ins.*")
#    }
#    command <<<
# -    set -euo pipefail
# -    svtk vcf2bed ~{vcf} stdout \
# -      | awk -v OFS="\t" '(($5=="DEL" || $5=="DUP") && $3-$2>=5000) {print $1, $2, $3, $4, $6, $5}' \
# -      | split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - gt5kb.
# -    svtk vcf2bed ~{vcf} stdout \
# -      | awk -v OFS="\t" '(($5=="DEL" || $5=="DUP") && $3-$2<5000) {print $1, $2, $3, $4, $6, $5}' \
# -      | split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - lt5kb.
# -    if [ ~{generate_bca} == "true" ]; then
# -      svtk vcf2bed ~{vcf} stdout \
# -        | awk -v OFS="\t" '($5!="DEL" && $5!="DUP" && $5!="INS") {print $1, $2, $3, $4, $6, $5}' \
# -        | split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - bca.
# -      svtk vcf2bed ~{vcf} stdout \
# -        | awk -v OFS="\t" '($5=="INS") {print $1, $2, $3, $4, $6, $5}' \
# -        | split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - ins.
# -    fi
# +    svtk vcf2bed ~{vcf} bed_file.bed
# +    python /opt/sv-pipeline/04_variant_resolution/scripts/split_variants.py \
# +      --bed bed_file.bed \
# +      ~{"--n " + n_per_split} \
# +      ~{if generate_bca then "--bca" else ""}
#    >>>
#    runtime {