Skip to content

Commit

Permalink
Merge pull request #207 from harvardinformatics/filter_intervals
Browse files Browse the repository at this point in the history
parallel sentieon intervals and fix indel snps
  • Loading branch information
tsackton authored Jun 17, 2024
2 parents 857625c + 52db16e commit 7ca25b4
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 19 deletions.
7 changes: 4 additions & 3 deletions workflow/modules/postprocess/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ rule drop_indel_SNPs:
vcf = "results/{refGenome}/postprocess/{prefix}_clean_snps_1.vcf.gz",
idx = "results/{refGenome}/postprocess/{prefix}_clean_snps_1.vcf.gz.tbi"
output:
indel_snps = temp("results/{refGenome}/postprocess/{prefix}_indel_snp_positions.txt"),
keep_snps = temp("results/{refGenome}/postprocess/{prefix}_keep_snp_positions.txt"),
vcf = "results/{refGenome}/{prefix}_clean_snps.vcf.gz",
idx = "results/{refGenome}/{prefix}_clean_snps.vcf.gz.tbi"
conda:
Expand All @@ -172,7 +172,8 @@ rule drop_indel_SNPs:
mem_mb = lambda wildcards, attempt: attempt * 4000
shell:
"""
bcftools query -f '%CHROM\t%POS\t%REF\n' {input.vcf} | awk 'length($3) == 1 {{print $1"\t"$2}}' > {output.indel_snps}
bcftools view -R {output.indel_snps} {input.vcf} -Oz -o {output.vcf}
bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\n' {input.vcf} | awk 'length($3) == 1 {{print $1"\t"$2}}' | bgzip -c > {output.keep_snps}
tabix -s1 -b2 -e2 {output.keep_snps}
bcftools view -T {output.keep_snps} {input.vcf} -Oz -o {output.vcf}
bcftools index -t {output.vcf}
"""
53 changes: 37 additions & 16 deletions workflow/rules/sentieon.smk
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ rule sentieon_combine_gvcf:

rule filter_vcf:
"""
This rule applies filters to the raw vcf.
This rule applies filters to the raw vcf using GNU Parallel.
"""
input:
vcf = "results/{refGenome}/vcfs/raw.vcf.gz",
Expand All @@ -125,20 +125,41 @@ rule filter_vcf:
conda:
"../envs/bam2vcf.yml"
log:
"logs/{refGenome}/sentieon_combine_gvcf/{prefix}_log.txt"
"logs/{refGenome}/sentieon_filter_vcfs/{prefix}_log.txt"
shadow: "minimal"
benchmark:
"benchmarks/{refGenome}/sentieon_combine_gvcf/{prefix}_benchmark.txt"
"benchmarks/{refGenome}/sentieon_filter_vcfs/{prefix}_benchmark.txt"
shell:
"gatk VariantFiltration "
"-R {input.ref} "
"-V {input.vcf} "
"--output {output.vcf} "
"--filter-name \"RPRS_filter\" "
"--filter-expression \"(vc.isSNP() && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -8.0)) || ((vc.isIndel() || vc.isMixed()) && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -20.0)) || (vc.hasAttribute('QD') && QD < 2.0)\" "
"--filter-name \"FS_SOR_filter\" "
"--filter-expression \"(vc.isSNP() && ((vc.hasAttribute('FS') && FS > 60.0) || (vc.hasAttribute('SOR') && SOR > 3.0))) || ((vc.isIndel() || vc.isMixed()) && ((vc.hasAttribute('FS') && FS > 200.0) || (vc.hasAttribute('SOR') && SOR > 10.0)))\" "
"--filter-name \"MQ_filter\" "
"--filter-expression \"vc.isSNP() && ((vc.hasAttribute('MQ') && MQ < 40.0) || (vc.hasAttribute('MQRankSum') && MQRankSum < -12.5))\" "
"--filter-name \"QUAL_filter\" "
"--filter-expression \"QUAL < 30.0\" "
"--invalidate-previous-filters true &> {log}"
"""
# get the contig names from the .fai index
contigs=$(cut -f1 {input.indexes[5]})
# create a function that will be passed to gnu parallel
filter_contig() {{
contig=$1
echo $contig
gatk --java-options "-Xmx4g" VariantFiltration \
-R {input.ref} \
-L ${{contig}} \
-V {input.vcf} \
--output {wildcards.refGenome}_{wildcards.prefix}_filter_${{contig}}.vcf.gz \
--filter-name "RPRS_filter" \
--filter-expression "(vc.isSNP() && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -8.0)) || ((vc.isIndel() || vc.isMixed()) && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -20.0)) || (vc.hasAttribute('QD') && QD < 2.0)" \
--filter-name "FS_SOR_filter" \
--filter-expression "(vc.isSNP() && ((vc.hasAttribute('FS') && FS > 60.0) || (vc.hasAttribute('SOR') && SOR > 3.0))) || ((vc.isIndel() || vc.isMixed()) && ((vc.hasAttribute('FS') && FS > 200.0) || (vc.hasAttribute('SOR') && SOR > 10.0)))" \
--filter-name "MQ_filter" \
--filter-expression "vc.isSNP() && ((vc.hasAttribute('MQ') && MQ < 40.0) || (vc.hasAttribute('MQRankSum') && MQRankSum < -12.5))" \
--filter-name "QUAL_filter" \
--filter-expression "QUAL < 30.0" \
--invalidate-previous-filters true
}}
export -f filter_contig
# pass each contig to gnu parallel
parallel -j {threads} filter_contig ::: ${{contigs}} 2> {log}
bcftools concat {wildcards.refGenome}_{wildcards.prefix}_filter_*.vcf.gz --threads {threads} -Oz -o {output.vcf} 2>> {log}
tabix -p vcf {output.vcf} 2>> {log}
"""

0 comments on commit 7ca25b4

Please sign in to comment.