Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewrite the SplitVariants task command in TasksGenotypeBatch.wdl to call svtk only once #618

Merged
merged 15 commits into from
Jan 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 95 additions & 0 deletions src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/bin/python
import argparse
import logging


# Function to process the bed file by checking for conditions
def process_bed_file(input_bed, n_per_split, bca=True):
SVTYPE_FIELD = 4
END_FIELD = 2
START_FIELD = 1

# Dictionary to store the conditions to be checked with matching prefixes
condition_prefixes = {
'gt5kb': {
'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_FIELD]) - int(line[START_FIELD]) >= 5000)},
'lt5kb': {
'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_FIELD]) - int(line[START_FIELD]) < 5000)},
'bca': {'condition': lambda line: bca and (
line[SVTYPE_FIELD] != 'DEL' and line[SVTYPE_FIELD] != 'DUP' and line[SVTYPE_FIELD] != 'INS')},
'ins': {'condition': lambda line: bca and line[SVTYPE_FIELD] == 'INS'}
}

current_lines = {prefix: [] for prefix in condition_prefixes.keys()}
current_counts = {prefix: 0 for prefix in condition_prefixes.keys()}
current_suffixes = {prefix: 'a' for prefix in condition_prefixes.keys()}

# Open the bed file and process
with open(input_bed, 'r') as infile:
for line in infile:
# process bed file line by line
line = line.strip().split('\t')

# Checks which condition and prefix the current line matches and appends it to the corresponding
# array and increments the counter for that array
for prefix, conditions in condition_prefixes.items():
if conditions['condition'](line):
current_lines[prefix].append('\t'.join(line))
current_counts[prefix] += 1

# If the current array has the maximum allowed lines added to it create a new array
# with the preceding suffix and write the current array to a file
if current_counts[prefix] == n_per_split:
output_suffix = current_suffixes[prefix].rjust(6, 'a')
output_file = f"{prefix}.{output_suffix}.bed"
with open(output_file, 'w') as outfile:
outfile.write('\n'.join(current_lines[prefix]))

logging.info(f"File '{output_file}' written.")
current_lines[prefix] = []
current_counts[prefix] = 0
current_suffixes[prefix] = increment_suffix(current_suffixes[prefix])

# Handle remaining lines after the loop
for prefix, lines in current_lines.items():
if lines:
output_suffix = current_suffixes[prefix].rjust(6, 'a')
output_file = f"{prefix}.{output_suffix}.bed"
with open(output_file, 'w') as outfile:
outfile.write('\n'.join(lines))

logging.info(f"File '{output_file}' written.")


# Function to generate the pattern for suffixes
def increment_suffix(suffix):
# define the alphabet and ending
alphabet = 'abcdefghijklmnopqrstuvwxyz'
if suffix == 'z' * 6:
raise ValueError('All possible files generated.')
else:
# if there are available suffixes increment to next available suffix
index = alphabet.index(suffix[0])
next_char = alphabet[(index + 1) % 26]
return next_char + suffix[1:]


def main():
parser = argparse.ArgumentParser()
parser.add_argument("--bed", help="Path to input bed file", required=True)
parser.add_argument("--n", help="number of variants per file", required=True)
parser.add_argument("--bca", default=False, help="Flag to set to True if the VCF contains BCAs", action='store_true')
parser.add_argument("--log-level", required=False, default="INFO", help="Specify level of logging information")
args = parser.parse_args()

# Set logging level from --log-level input
log_level = args.log_level
numeric_level = getattr(logging, log_level.upper(), None)
if not isinstance(numeric_level, int):
raise ValueError('Invalid log level: %s' % log_level)
logging.basicConfig(level=numeric_level, format='%(levelname)s: %(message)s')
process_bed_file(args.bed, args.n, args.bca)


if __name__ == '__main__':
main()
20 changes: 5 additions & 15 deletions wdl/TasksGenotypeBatch.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -28,22 +28,12 @@ task SplitVariants {
Array[File] ins_beds = glob("ins.*")
}
command <<<

set -euo pipefail
svtk vcf2bed ~{vcf} stdout \
| awk -v OFS="\t" '(($5=="DEL" || $5=="DUP") && $3-$2>=5000) {print $1, $2, $3, $4, $6, $5}' \
| split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - gt5kb.
svtk vcf2bed ~{vcf} stdout \
| awk -v OFS="\t" '(($5=="DEL" || $5=="DUP") && $3-$2<5000) {print $1, $2, $3, $4, $6, $5}' \
| split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - lt5kb.
if [ ~{generate_bca} == "true" ]; then
svtk vcf2bed ~{vcf} stdout \
| awk -v OFS="\t" '($5!="DEL" && $5!="DUP" && $5!="INS") {print $1, $2, $3, $4, $6, $5}' \
| split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - bca.
svtk vcf2bed ~{vcf} stdout \
| awk -v OFS="\t" '($5=="INS") {print $1, $2, $3, $4, $6, $5}' \
| split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - ins.
fi
svtk vcf2bed ~{vcf} bed_file.bed
python /opt/sv-pipeline/04_variant_resolution/scripts/split_variants.py \
--bed bed_file.bed \
~{"--n " + n_per_split} \
~{if generate_bca then "--bca" else ""}

>>>
runtime {
Expand Down
Loading