From 310eb7b28cf87a984b8d14afe2874ec14a052dab Mon Sep 17 00:00:00 2001
From: kirtanav98 <123595850+kirtanav98@users.noreply.github.com>
Date: Fri, 5 Jan 2024 14:43:17 -0500
Subject: [PATCH] Rewrite the SplitVariants task command in
 TasksGenotypeBatch.wdl to call svtk only once (#618)

* add python file for SplitVariants task

* edited TasksGenotype.wdl command to call SplitVariants.py

* changed command in TasksGenotypeBatch.wdl

* changed docker to include correct tag for sv-pipeline

* reformatted python script to match github lint8 formatting specifications

* reformatted python script to match github lint8 formatting specifications

* made changes based on first review

* made edit to python script to lint correctly

* made edit to python script to lint correctly, and added extra clarifying comments to code.

* made edit to python script to lint correctly, and added extra clarifying comments to code.

* made edits based on second review.

* made edits based on second review.

* made edits based on second review.

* made edits based on second review.

* addressed changes in the last review
---
 .../scripts/split_variants.py                 | 95 +++++++++++++++++++
 wdl/TasksGenotypeBatch.wdl                    | 20 +---
 2 files changed, 100 insertions(+), 15 deletions(-)
 create mode 100644 src/sv-pipeline/04_variant_resolution/scripts/split_variants.py

diff --git a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
new file mode 100644
index 000000000..34e36dba9
--- /dev/null
+++ b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
@@ -0,0 +1,95 @@
+#!/bin/python
+import argparse
+import logging
+
+
+# Function to process the bed file by checking for conditions
+def process_bed_file(input_bed, n_per_split, bca=True):
+    SVTYPE_FIELD = 4
+    END_FIELD = 2
+    START_FIELD = 1
+
+    # Dictionary to store the conditions to be checked with matching prefixes
+    condition_prefixes = {
+        'gt5kb': {
+            'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_FIELD]) - int(line[START_FIELD]) >= 5000)},
+        'lt5kb': {
+            'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_FIELD]) - int(line[START_FIELD]) < 5000)},
+        'bca': {'condition': lambda line: bca and (
+                line[SVTYPE_FIELD] != 'DEL' and line[SVTYPE_FIELD] != 'DUP' and line[SVTYPE_FIELD] != 'INS')},
+        'ins': {'condition': lambda line: bca and line[SVTYPE_FIELD] == 'INS'}
+    }
+
+    current_lines = {prefix: [] for prefix in condition_prefixes.keys()}
+    current_counts = {prefix: 0 for prefix in condition_prefixes.keys()}
+    current_suffixes = {prefix: 'a' for prefix in condition_prefixes.keys()}
+
+    # Open the bed file and process
+    with open(input_bed, 'r') as infile:
+        for line in infile:
+            # process bed file line by line
+            line = line.strip().split('\t')
+
+            # Checks which condition and prefix the current line matches and appends it to the corresponding
+            # array and increments the counter for that array
+            for prefix, conditions in condition_prefixes.items():
+                if conditions['condition'](line):
+                    current_lines[prefix].append('\t'.join(line))
+                    current_counts[prefix] += 1
+
+                    # If the current array has the maximum allowed lines added to it create a new array
+                    # with the preceding suffix and write the current array to a file
+                    if current_counts[prefix] == n_per_split:
+                        output_suffix = current_suffixes[prefix].rjust(6, 'a')
+                        output_file = f"{prefix}.{output_suffix}.bed"
+                        with open(output_file, 'w') as outfile:
+                            outfile.write('\n'.join(current_lines[prefix]))
+
+                        logging.info(f"File '{output_file}' written.")
+                        current_lines[prefix] = []
+                        current_counts[prefix] = 0
+                        current_suffixes[prefix] = increment_suffix(current_suffixes[prefix])
+
+    # Handle remaining lines after the loop
+    for prefix, lines in current_lines.items():
+        if lines:
+            output_suffix = current_suffixes[prefix].rjust(6, 'a')
+            output_file = f"{prefix}.{output_suffix}.bed"
+            with open(output_file, 'w') as outfile:
+                outfile.write('\n'.join(lines))
+
+            logging.info(f"File '{output_file}' written.")
+
+
+# Function to generate the pattern for suffixes
+def increment_suffix(suffix):
+    # define the alphabet and ending
+    alphabet = 'abcdefghijklmnopqrstuvwxyz'
+    if suffix == 'z' * 6:
+        raise ValueError('All possible files generated.')
+    else:
+        # if there are available suffixes increment to next available suffix
+        index = alphabet.index(suffix[0])
+        next_char = alphabet[(index + 1) % 26]
+        return next_char + suffix[1:]
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--bed", help="Path to input bed file", required=True)
+    parser.add_argument("--n", help="number of variants per file", required=True)
+    parser.add_argument("--bca", default=False, help="Flag to set to True if the VCF contains BCAs", action='store_true')
+    parser.add_argument("--log-level", required=False, default="INFO", help="Specify level of logging information")
+    args = parser.parse_args()
+
+    # Set logging level from --log-level input
+    log_level = args.log_level
+    numeric_level = getattr(logging, log_level.upper(), None)
+    if not isinstance(numeric_level, int):
+        raise ValueError('Invalid log level: %s' % log_level)
+    logging.basicConfig(level=numeric_level, format='%(levelname)s: %(message)s')
+    process_bed_file(args.bed, args.n, args.bca)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/wdl/TasksGenotypeBatch.wdl b/wdl/TasksGenotypeBatch.wdl
index 1b853a056..4e0d44021 100644
--- a/wdl/TasksGenotypeBatch.wdl
+++ b/wdl/TasksGenotypeBatch.wdl
@@ -28,22 +28,12 @@ task SplitVariants {
     Array[File] ins_beds = glob("ins.*")
   }
   command <<<
-
     set -euo pipefail
-    svtk vcf2bed ~{vcf} stdout \
-      | awk -v OFS="\t" '(($5=="DEL" || $5=="DUP") && $3-$2>=5000) {print $1, $2, $3, $4, $6, $5}' \
-      | split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - gt5kb.
-    svtk vcf2bed ~{vcf} stdout \
-      | awk -v OFS="\t" '(($5=="DEL" || $5=="DUP") && $3-$2<5000) {print $1, $2, $3, $4, $6, $5}' \
-      | split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - lt5kb.
-    if [ ~{generate_bca} == "true" ]; then
-      svtk vcf2bed ~{vcf} stdout \
-        | awk -v OFS="\t" '($5!="DEL" && $5!="DUP" && $5!="INS") {print $1, $2, $3, $4, $6, $5}' \
-        | split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - bca.
-      svtk vcf2bed ~{vcf} stdout \
-        | awk -v OFS="\t" '($5=="INS") {print $1, $2, $3, $4, $6, $5}' \
-        | split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - ins.
-    fi
+    svtk vcf2bed ~{vcf} bed_file.bed
+    python /opt/sv-pipeline/04_variant_resolution/scripts/split_variants.py \
+      --bed bed_file.bed \
+      ~{"--n " + n_per_split} \
+      ~{if generate_bca then "--bca" else ""}
 
   >>>
   runtime {