Skip to content

Commit

Permalink
Adding rule to merge chunked genotype refinement VCFs
Browse files Browse the repository at this point in the history
  • Loading branch information
skchronicles committed Jun 20, 2024
1 parent 1173b0d commit ce3085f
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 7 deletions.
5 changes: 5 additions & 0 deletions config/cluster/slurm.json
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,11 @@
"mem": "48G",
"time": "2-00:00:00"
},
"gatk_germline_gather_genotype_refinement": {
"threads": "2",
"mem": "64G",
"time": "1-12:00:00"
},
"hla": {
"threads": "8",
"partition": "norm",
Expand Down
5 changes: 5 additions & 0 deletions config/cluster/uge.json
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,11 @@
"partition": "",
"threads": "2"
},
"gatk_germline_gather_genotype_refinement": {
"mem": "32G",
"partition": "",
"threads": "2"
},
"hla": {
"mem": "12G",
"partition": "",
Expand Down
88 changes: 81 additions & 7 deletions workflow/rules/gatk_germline.smk
Original file line number Diff line number Diff line change
Expand Up @@ -632,9 +632,9 @@ rule gatk_germline_apply_vqsr_indels:

rule gatk_germline_scatter_genotype_refinement:
"""
Data-processing step to calculate genotype posterior probabilities
given known population genotypes. The tool calculates the posterior
genotype probability for each sample genotype in a given VCF format
Data-processing step to calculate genotype posterior probabilities
given known population genotypes. The tool calculates the posterior
genotype probability for each sample genotype in a given VCF format
callset. This tool uses use extra information like allele frequencies
in relevant populations to further refine the genotype assignments.
@Input:
Expand All @@ -644,10 +644,11 @@ rule gatk_germline_scatter_genotype_refinement:
SNPs and INDELs for a given region.
(gather-per-cohort-scatter-across-regions)
@Output:
Genotype refined VCF with the following information: Genotype
posteriors added to the FORMAT fields ("PP"), genotypes and GQ
assigned according to these posteriors, per-site genotype priors
added to the INFO field ("PG").
Genotype refined VCF with the following information at a given
region (chr:start-stop): Genotype posteriors added to the FORMAT
fields ("PP"), genotypes and GQ assigned according to these
posteriors, per-site genotype priors added to the INFO field
("PG").
"""
input:
vcf = join(workpath, "haplotypecaller", "VCFs", "snp_indel_recal_chunks", "snps_and_indels_recal_variants_{region}.vcf.gz"),
Expand Down Expand Up @@ -702,3 +703,76 @@ rule gatk_germline_scatter_genotype_refinement:
# Create an tabix index for the VCF file
tabix -p vcf {output.vcf}
"""


rule gatk_germline_gather_genotype_refinement:
"""
Data-processing step to merge the chunked (chr:start-stop) genotype
refinements into a single VCF file.
@Input:
Genotype refined VCF across all regions.
(gather-per-cohort) ~ singleton.
@Output:
Genotype refined VCF file.
"""
input:
tmps = expand(
join(workpath, "haplotypecaller", "VCFs", "gtype_temp_chunks", "snps_and_indels_recal_refinement_variants_{region}.vcf.gz"),
region=regions
),
vcfs = expand(
join(workpath, "haplotypecaller", "VCFs", "gtype_fixed_chunks", "snps_and_indels_recal_refinement_variants_{region}.GTfix.vcf.gz"),
region=regions
),
output:
t_lsl = join(workpath, "haplotypecaller", "VCFs", "gtype_temp_chunks", "snps_and_indels_recal_refinement.region.vcfs.list"),
t_vcf = join(workpath, "haplotypecaller", "VCFs", "snps_and_indels_recal_refinement_variants.vcf.gz"),
v_lsl = join(workpath, "haplotypecaller", "VCFs", "gtype_fixed_chunks", "snps_and_indels_recal_refinement.region.GTfix.vcf.list"),
v_vcf = join(workpath, "haplotypecaller", "VCFs", "snps_and_indels_recal_refinement_variants.GTfix.vcf.gz"),
params:
rname = "gatk_gl_gather_gtype_refine",
t_dir = join(workpath, "haplotypecaller", "VCFs", "gtype_temp_chunks"),
v_dir = join(workpath, "haplotypecaller", "VCFs", "gtype_fixed_chunks"),
# For UGE/SGE clusters memory is allocated
# per cpu, so we must calculate total mem
# as the product of threads and memory
memory = lambda _: int(
int(allocated("mem", "gatk_germline_gather_genotype_refinement", cluster).lower().rstrip('g')) * \
int(allocated("threads", "gatk_germline_gather_genotype_refinement", cluster))
)-2 if run_mode == "uge" \
else int(allocated("mem", "gatk_germline_gather_genotype_refinement", cluster).lower().rstrip('g')) - 2,
threads: int(allocated("threads", "gatk_germline_gather_genotype_refinement", cluster))
container: config['images']['genome-seek']
envmodules:
config['tools']['gatk4']
shell: """
# Get a list of region VCF files to
# merge into a single VCF. Avoids an
# ARG_MAX issue which will limit max
# length of a command.
find {params.t_dir}/ \\
-maxdepth 1 \\
-type f \\
-iname '*.vcf.gz' \\
> {output.t_lsl}
# Merge the chunked (chr:start-stop) genotype
# refinements into a single VCF file.
gatk --java-options '-Xmx{params.memory}g' MergeVcfs \\
--OUTPUT {output.t_vcf} \\
--INPUT {output.t_lsl}
# Get a list of region VCF files to
# merge into a single VCF. Avoids an
# ARG_MAX issue which will limit max
# length of a command.
find {params.v_dir}/ \\
-maxdepth 1 \\
-type f \\
-iname '*.vcf.gz' \\
> {output.v_lsl}
# Merge the chunked (chr:start-stop) genotype
# refinements into a single VCF file.
gatk --java-options '-Xmx{params.memory}g' MergeVcfs \\
--OUTPUT {output.v_vcf} \\
--INPUT {output.v_lsl}
"""

0 comments on commit ce3085f

Please sign in to comment.