Skip to content

Commit

Permalink
adding step to quantify bowtie2 alignment statistics
Browse files Browse the repository at this point in the history
  • Loading branch information
kwells4 committed Dec 4, 2023
1 parent d43619a commit e2f54db
Show file tree
Hide file tree
Showing 2 changed files with 370 additions and 0 deletions.
277 changes: 277 additions & 0 deletions ATAC_seq/alignment.snake
Original file line number Diff line number Diff line change
@@ -0,0 +1,277 @@
# This runs star on the files from the samples document can be single or paired end

# Function to return paths of input files
def _get_input(wildcards):
if wildcards.trim_method == "no":
# Grab path of the fastq file
fastq1 = SAMPLE_LIST.loc[wildcards.sample, "fastq1"]
data_dir = SAMPLE_LIST.loc[wildcards.sample, "data_dir"]
fastq1 = data_dir + "/" + fastq1
# Make sure file exists
fastq1 = _check_path(fastq1)
if IS_PAIRED:
# Grab path of second read
fastq2 = SAMPLE_LIST.loc[wildcards.sample, "fastq2"]
fastq2 = data_dir + "/" + fastq2
# Make sure file exists
fastq2 = _check_path(fastq2)
return(fastq1, fastq2)
else:
return(fastq1)
else:
fastq1 = (wildcards.results + "/" + wildcards.trim_method + "_trim/" +
wildcards.sample + "_R1_trimmed.fastq.gz")
if IS_PAIRED:
# Grab path of second read
fastq2 = (wildcards.results + "/" + wildcards.trim_method + "_trim/" +
wildcards.sample + "_R2_trimmed.fastq.gz")
return(fastq1, fastq2)
else:
return(fastq1)

rule align:
input:
"{results}/{trim_method}_trim/{sample}.txt"
output:
bam_output = "{results}/bowtie2_{trim_method}_trim/{sample}_Aligned.sortedByCoord.out.bam",
bai_file = "{results}/bowtie2_{trim_method}_trim/{sample}_Aligned.sortedByCoord.out.bam.bai",
log_output = "{results}/bowtie2_{trim_method}_trim/{sample}_metrics.out"
params:
job_name = "{sample}_bowtie2",
memory = "select[mem>60] rusage[mem=60]",
genome = GENOME,
fastqs = _get_input,
paired = IS_PAIRED
singularity:
GENERAL_CONTAINER
log:
"{results}/logs/bowtie2/bowtie2_{sample}_{trim_method}_trim"
message:
"Aligning reads for {wildcards.sample}"
threads:
6
shell:
"""
if [ {params.paired} ]
then
all_fastqs=({params.fastqs})
bowtie2 \
--very-sensitive \
-k 10 \
-x {params.genome} \
--threads {threads} \
--rg-id {wildcards.sample} \
--rg 'SM:{wildcards.sample}' \
--met-file {output.log_output} \
-1 ${{all_fastqs[0]}} \
-2 ${{all_fastqs[1]}} | \
samtools view -b - | \
samtools sort -o {output.bam_output} -
else
bowtie2 \
--very-sensitive \
-k 10 \
-x {params.genome} \
--threads {threads} \
--rg-id {wildcards.sample} \
--rg 'SM:{wildcards.sample}' \
--met-file {output.log_output} \
-U {params.fastqs} | \
samtools view -b - | \
samtools sort -o {output.bam_output} -
fi
samtools index {output.bam_output}
"""

# rule remove_duplicate_reads:
# input:
# "{results}/bowtie2_{trim_method}_trim/{sample}_Aligned.sortedByCoord.out.bam"
# output:
# bam = "{results}/bowtie2_{trim_method}_trim/{sample}_remove_dup.bam",
# bai = "{results}/bowtie2_{trim_method}_trim/{sample}_remove_dup.bam.bai",
# mt_bam = "{results}/bowtie2_{trim_method}_trim/{sample}_remove_mt.bam",
# mt_bai = "{results}/bowtie2_{trim_method}_trim/{sample}_remove_mt.bam.bai",
# params:
# job_name = "{sample}_remove_dup",
# memory = "select[mem>60] rusage[mem=60]",
# out_pre = "{results}/bowtie2_{trim_method}_trim/{sample}",
# temp_file = "{results}/bowtie2_{trim_method}_trim/{sample}_remove_dup_tmp.bam",
# mapq_threshold = 20
# singularity:
# GENERAL_CONTAINER
# log:
# "{results}/logs/remove_duplicate_reads/remove_dup_{sample}_{trim_method}_trim"
# shell:
# """
# # Remove mapping to mitochondria
# samtools idxstats {input} \
# | cut -f 1 \
# | grep -v MT \
# | xargs samtools view \
# -b {input} > {output.mt_bam}

# samtools index {output.mt_bam}

# # Remove duplicates
# samtools sort -n -O BAM -T {params.out_pre} {output.mt_bam} \
# | samtools fixmate -m - - \
# | samtools sort -T {params.out_pre}_second - \
# | samtools markdup -r -s - {params.temp_file}

# samtools index {params.temp_file}


# # From Greenleaf lab protocol
# # -F 1804: exclude flag, exludes unmapped, next segment unmapped,
# # secondary alignments, not passing platform q, PCR or optical duplicates
# # -f 2: flags to require, properly aligned
# # -q 20: exlude low MAPQ, set as parameter to adjust
# samtools view \
# -F 1804 \
# -f 2 -q {params.mapq_threshold} -b {params.temp_file} > {output.bam}

# rm {params.temp_file}
# rm {params.temp_file}.bai


# samtools index {output.bam}
# """


rule remove_mito:
input:
"{results}/bowtie2_{trim_method}_trim/{sample}_Aligned.sortedByCoord.out.bam"
output:
mt_bam = "{results}/bowtie2_{trim_method}_trim/{sample}_remove_mt.bam",
mt_bai = "{results}/bowtie2_{trim_method}_trim/{sample}_remove_mt.bam.bai"
params:
job_name = "{sample}_remove_dup",
memory = "select[mem>60] rusage[mem=60]",
singularity:
GENERAL_CONTAINER
log:
"{results}/logs/remove_mito/remove_mito_{sample}_{trim_method}_trim"
shell:
"""
# Remove mapping to mitochondria
samtools idxstats {input} \
| cut -f 1 \
| grep -v MT \
| xargs samtools view \
-b {input} > {output.mt_bam}
samtools index {output.mt_bam}
"""


rule mark_duplicates:
input:
mt_bam = "{results}/bowtie2_{trim_method}_trim/{sample}_remove_mt.bam",
mt_bai = "{results}/bowtie2_{trim_method}_trim/{sample}_remove_mt.bam.bai",
output:
dup_bam = "{results}/bowtie2_{trim_method}_trim/{sample}_mark_dup.bam",
picard_metrics = "{results}/bowtie2_{trim_method}_trim/{sample}_picard_dupMark.txt"
params:
job_name = "{sample}_remove_dup",
memory = "select[mem>60] rusage[mem=60]",
picard_jar = PICARD_JAR,
temp_file = "{results}/bowtie2_{trim_method}_trim/{sample}_dup_temp.bam"
singularity:
PICARD_CONTAINER
log:
"{results}/logs/mark_duplicate_reads/mark_dup_{sample}_{trim_method}_trim"
shell:
"""
# Sort sam
java -jar {params.picard_jar} SortSam \
-I {input.mt_bam} -O {params.temp_file} \
-SO coordinate
# Mark duplicates
java -jar {params.picard_jar} MarkDuplicates \
-I {params.temp_file} -O {output.dup_bam} \
--METRICS_FILE {output.picard_metrics}
rm {params.temp_file}
"""

rule remove_duplicate_reads:
input:
"{results}/bowtie2_{trim_method}_trim/{sample}_mark_dup.bam"
output:
bam = "{results}/bowtie2_{trim_method}_trim/{sample}_remove_dup.bam",
bai = "{results}/bowtie2_{trim_method}_trim/{sample}_remove_dup.bam.bai",
dup_bai = "{results}/bowtie2_{trim_method}_trim/{sample}_mark_dup.bam.bai"
params:
job_name = "{sample}_remove_dup",
memory = "select[mem>60] rusage[mem=60]",
mapq_threshold = 20
singularity:
GENERAL_CONTAINER
log:
"{results}/logs/remove_duplicate_reads/remove_dup_{sample}_{trim_method}_trim"
shell:
"""
samtools index {input}
# From tutorial https://cambiotraining.github.io/chipseq/Practicals/ATACseq/ATACseq_tutorial.html
# -f 3: only include alignments marked with the SAM flag 3, which means “properly paired and mapped”
# -F 4: exclude aligned reads with flag 4: the read itself did not map
# -F 8: exclude aligned reads with flag 8: their mates did not map
# -F 256: exclude alignments with flag 256, which means that Bowtie mapped the read to multiple
# places in the reference genome, and this alignment is not the best
# -F 1024: exclude alignments marked with SAM flag 1024, which indicates that the read is an optical
# or PCR duplicate (this flag would be set by Picard)
# -F 2048: exclude alignments marked with SAM flag 2048, indicating chimeric alignments, where
# Bowtie decided that parts of the read mapped to different regions in the genome. These records
# are the individual aligned segments of the read. They usually indicate structural variation.
# We’re not going to base peak calls on them.
# Finally, we use a basic quality filter, -q 15, to request high-quality alignments.
samtools view \
-b -h \
-f 3 \
-F 4 \
-F 8 \
-F 256 \
-F 1024 \
-F 2048 \
-q params.mapq_threshold \
-o {output.bam} \
{input}
samtools index {output.bam}
"""

# Get alignment stats
rule aligment_stats:
input:
expand(
"{{results}}/bowtie2_{{trim_method}}_trim/{sample}_Aligned.sortedByCoord.out.bam",
sample = SAMPLES
)
output:
"{results}/bowtie2_{trim_method}_trim/bowtie_stats_finished.txt"
params:
job_name = "bowtie_stats",
memory = "select[mem>4] rusage[mem=4]",
out_dir = "{results}/bowtie2_{trim_method}_trim",
in_dir = "{results}/logs/bowtie2"
singularity:
GENERAL_CONTAINER
log:
"{results}/logs/bowtie2_stats/stats_{trim_method}"
threads:
1
shell:
"""
python src/scripts/get_bowtie_stats.py \
-d {params.in_dir} \
-o {params.out_dir} \
-t {wildcards.trim_method}
echo "finished" > {output}
"""
93 changes: 93 additions & 0 deletions ATAC_seq/get_bowtie_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import os
import argparse
import glob
import re

def main():
options = setup()

if not os.path.exists(options.out_dir):
os.makedirs(options.out_dir)

read_files(options.in_dir, options.out_dir, options.trim_method)

#########
# Setup #
#########

def setup():
"""
Gets command line arguments and returns a Namespace object
"""

# House keeping to read in arguments from the command line
parser = argparse.ArgumentParser()

parser.add_argument("-d", "--directory", dest = "in_dir",
help = "The directory that holds the log files, default is the working directory",
default = os.getcwd,
action = "store",
metavar = "\b")

parser.add_argument("-o", "--out", dest = "out_dir",
help = "The directory to put the output, default is outs",
default = "outs",
action = "store",
metavar = "\b")

parser.add_argument("-t", "--trim-method", dest = "trim_method",
help = "The trim method used",
default = "cutadapt",
action = "store",
metavar = "\b")

args = parser.parse_args()

return(args)

#############
# Functions #
#############

def read_files(in_dir, out_dir, trim_method):
files = glob.glob(in_dir + "/*.err")
for file in files:
sample = re.sub("_" + trim_method + "_trim.err", "", file.split("/")[-1])
output_file = os.path.join(out_dir, sample + "_bowtie_stats.csv")
sample_dict = {}
with open(file, "r") as in_file:
for line in in_file:
save_line = line.strip().split(" ")
if "reads; of these:" in line:
sample_dict["total_reads"] = save_line[0]
elif "were paired; of these:" in line:
sample_dict["paired_reads"] = save_line[0]
sample_dict["paired_percent"] = save_line[1]
elif ") aligned concordantly 0 times" in line:
sample_dict["no_alignment"] = save_line[0]
sample_dict["no_alignment_percent"] = save_line[1]
elif "aligned concordantly exactly 1 time" in line:
sample_dict["one_alignment"] = save_line[0]
sample_dict["one_alignment_percent"] = save_line[1]
elif "aligned concordantly >1 times" in line:
sample_dict["multi_alignment"] = save_line[0]
sample_dict["multi_alignment_percent"] = save_line[1]
elif "overall alignment rate" in line:
sample_dict["overall_alignment"] = save_line[0]
with open(output_file, "w") as out_file:
out_file.write("total_reads,paired_reads,paired_percent,no_alignment,no_alignment_percent,one_alignment,one_alignment_percent,multi_alignment,multi_alignment_percent,overall_alignment\n")

out_file.write("{},{},{},{},{},{},{},{},{},{}\n".format(sample_dict["total_reads"],
sample_dict["paired_reads"],
sample_dict["paired_percent"],
sample_dict["no_alignment"],
sample_dict["no_alignment_percent"],
sample_dict["one_alignment"],
sample_dict["one_alignment_percent"],
sample_dict["multi_alignment"],
sample_dict["multi_alignment_percent"],
sample_dict["overall_alignment"]))


if __name__ == "__main__":
main()

0 comments on commit e2f54db

Please sign in to comment.