diff --git a/ATAC_seq/alignment.snake b/ATAC_seq/alignment.snake
new file mode 100644
index 0000000..f8a3e12
--- /dev/null
+++ b/ATAC_seq/alignment.snake
@@ -0,0 +1,277 @@
+# Runs bowtie2 on the fastq files listed in the sample sheet; reads can be
+# single or paired end
+
+# Function to return paths of input files
+def _get_input(wildcards):
+    if wildcards.trim_method == "no":
+        # Grab path of the first fastq file
+        fastq1 = SAMPLE_LIST.loc[wildcards.sample, "fastq1"]
+        data_dir = SAMPLE_LIST.loc[wildcards.sample, "data_dir"]
+        fastq1 = data_dir + "/" + fastq1
+        # Make sure file exists
+        fastq1 = _check_path(fastq1)
+        if IS_PAIRED:
+            # Grab path of second read
+            fastq2 = SAMPLE_LIST.loc[wildcards.sample, "fastq2"]
+            fastq2 = data_dir + "/" + fastq2
+            # Make sure file exists
+            fastq2 = _check_path(fastq2)
+            return(fastq1, fastq2)
+        else:
+            return(fastq1)
+    else:
+        fastq1 = (wildcards.results + "/" + wildcards.trim_method + "_trim/" +
+                  wildcards.sample + "_R1_trimmed.fastq.gz")
+        if IS_PAIRED:
+            # Grab path of second read
+            fastq2 = (wildcards.results + "/" + wildcards.trim_method + "_trim/" +
+                      wildcards.sample + "_R2_trimmed.fastq.gz")
+            return(fastq1, fastq2)
+        else:
+            return(fastq1)
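+
+# For context: _get_input assumes SAMPLE_LIST is a pandas DataFrame indexed by
+# sample name with "fastq1", "fastq2" and "data_dir" columns (SAMPLE_LIST,
+# IS_PAIRED and _check_path are defined outside this file). A minimal,
+# hypothetical sketch of how such a table could be built:
+#
+#   import pandas as pd
+#   SAMPLE_LIST = pd.read_csv("samples.tsv", sep = "\t").set_index("sample", drop = False)
+#   #            fastq1               fastq2               data_dir
+#   # sample
+#   # KO_rep1    KO_rep1_R1.fastq.gz  KO_rep1_R2.fastq.gz  /path/to/raw_data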
+
+rule align:
+    input:
+        "{results}/{trim_method}_trim/{sample}.txt"
+    output:
+        bam_output = "{results}/bowtie2_{trim_method}_trim/{sample}_Aligned.sortedByCoord.out.bam",
+        bai_file = "{results}/bowtie2_{trim_method}_trim/{sample}_Aligned.sortedByCoord.out.bam.bai",
+        log_output = "{results}/bowtie2_{trim_method}_trim/{sample}_metrics.out"
+    params:
+        job_name = "{sample}_bowtie2",
+        memory = "select[mem>60] rusage[mem=60]",
+        genome = GENOME,
+        fastqs = _get_input,
+        paired = IS_PAIRED
+    singularity:
+        GENERAL_CONTAINER
+    log:
+        "{results}/logs/bowtie2/bowtie2_{sample}_{trim_method}_trim"
+    message:
+        "Aligning reads for {wildcards.sample}"
+    threads:
+        6
+    shell:
+        """
+        # params.fastqs holds one path (single end) or two paths (paired end).
+        # The comparison below assumes IS_PAIRED is a python bool (True/False).
+        if [ "{params.paired}" = "True" ]
+        then
+            all_fastqs=({params.fastqs})
+            bowtie2 \
+                --very-sensitive \
+                -k 10 \
+                -x {params.genome} \
+                --threads {threads} \
+                --rg-id {wildcards.sample} \
+                --rg 'SM:{wildcards.sample}' \
+                --met-file {output.log_output} \
+                -1 ${{all_fastqs[0]}} \
+                -2 ${{all_fastqs[1]}} | \
+                samtools view -b - | \
+                samtools sort -o {output.bam_output} -
+        else
+            bowtie2 \
+                --very-sensitive \
+                -k 10 \
+                -x {params.genome} \
+                --threads {threads} \
+                --rg-id {wildcards.sample} \
+                --rg 'SM:{wildcards.sample}' \
+                --met-file {output.log_output} \
+                -U {params.fastqs} | \
+                samtools view -b - | \
+                samtools sort -o {output.bam_output} -
+        fi
+
+        samtools index {output.bam_output}
+        """
+
+# rule remove_duplicate_reads:
+#     input:
+#         "{results}/bowtie2_{trim_method}_trim/{sample}_Aligned.sortedByCoord.out.bam"
+#     output:
+#         bam = "{results}/bowtie2_{trim_method}_trim/{sample}_remove_dup.bam",
+#         bai = "{results}/bowtie2_{trim_method}_trim/{sample}_remove_dup.bam.bai",
+#         mt_bam = "{results}/bowtie2_{trim_method}_trim/{sample}_remove_mt.bam",
+#         mt_bai = "{results}/bowtie2_{trim_method}_trim/{sample}_remove_mt.bam.bai"
+#     params:
+#         job_name = "{sample}_remove_dup",
+#         memory = "select[mem>60] rusage[mem=60]",
+#         out_pre = "{results}/bowtie2_{trim_method}_trim/{sample}",
+#         temp_file = "{results}/bowtie2_{trim_method}_trim/{sample}_remove_dup_tmp.bam",
+#         mapq_threshold = 20
+#     singularity:
+#         GENERAL_CONTAINER
+#     log:
+#         "{results}/logs/remove_duplicate_reads/remove_dup_{sample}_{trim_method}_trim"
+#     shell:
+#         """
+#         # Remove mapping to mitochondria
+#         samtools idxstats {input} \
+#             | cut -f 1 \
+#             | grep -v MT \
+#             | xargs samtools view \
+#             -b {input} > {output.mt_bam}
+
+#         samtools index {output.mt_bam}
+
+#         # Remove duplicates
+#         samtools sort -n -O BAM -T {params.out_pre} {output.mt_bam} \
+#             | samtools fixmate -m - - \
+#             | samtools sort -T {params.out_pre}_second - \
+#             | samtools markdup -r -s - {params.temp_file}
+
+#         samtools index {params.temp_file}
+
+#         # From Greenleaf lab protocol
+#         # -F 1804: exclude flag, excludes unmapped, next segment unmapped,
+#         # secondary alignments, not passing platform q, PCR or optical duplicates
+#         # -f 2: flags to require, properly aligned
+#         # -q 20: exclude low MAPQ, set as parameter to adjust
+#         samtools view \
+#             -F 1804 \
+#             -f 2 -q {params.mapq_threshold} -b {params.temp_file} > {output.bam}
+
+#         rm {params.temp_file}
+#         rm {params.temp_file}.bai
+
+#         samtools index {output.bam}
+#         """
+
+
+rule remove_mito:
+    input:
+        "{results}/bowtie2_{trim_method}_trim/{sample}_Aligned.sortedByCoord.out.bam"
+    output:
+        mt_bam = "{results}/bowtie2_{trim_method}_trim/{sample}_remove_mt.bam",
+        mt_bai = "{results}/bowtie2_{trim_method}_trim/{sample}_remove_mt.bam.bai"
+    params:
+        job_name = "{sample}_remove_mito",
+        memory = "select[mem>60] rusage[mem=60]"
+    singularity:
+        GENERAL_CONTAINER
+    log:
+        "{results}/logs/remove_mito/remove_mito_{sample}_{trim_method}_trim"
+    shell:
+        """
+        # Remove reads mapping to the mitochondrial genome (assumes the
+        # Ensembl-style contig name "MT"; UCSC references use "chrM")
+        samtools idxstats {input} \
+            | cut -f 1 \
+            | grep -v MT \
+            | xargs samtools view \
+            -b {input} > {output.mt_bam}
+
+        samtools index {output.mt_bam}
+        """
+
+
+rule mark_duplicates:
+    input:
+        mt_bam = "{results}/bowtie2_{trim_method}_trim/{sample}_remove_mt.bam",
+        mt_bai = "{results}/bowtie2_{trim_method}_trim/{sample}_remove_mt.bam.bai"
+    output:
+        dup_bam = "{results}/bowtie2_{trim_method}_trim/{sample}_mark_dup.bam",
+        picard_metrics = "{results}/bowtie2_{trim_method}_trim/{sample}_picard_dupMark.txt"
+    params:
+        job_name = "{sample}_mark_dup",
+        memory = "select[mem>60] rusage[mem=60]",
+        picard_jar = PICARD_JAR,
+        temp_file = "{results}/bowtie2_{trim_method}_trim/{sample}_dup_temp.bam"
+    singularity:
+        PICARD_CONTAINER
+    log:
+        "{results}/logs/mark_duplicate_reads/mark_dup_{sample}_{trim_method}_trim"
+    shell:
+        """
+        # Sort the bam by coordinate
+        java -jar {params.picard_jar} SortSam \
+            -I {input.mt_bam} -O {params.temp_file} \
+            -SO coordinate
+
+        # Mark duplicates
+        java -jar {params.picard_jar} MarkDuplicates \
+            -I {params.temp_file} -O {output.dup_bam} \
+            --METRICS_FILE {output.picard_metrics}
+
+        rm {params.temp_file}
+        """
+
+rule remove_duplicate_reads:
+    input:
+        "{results}/bowtie2_{trim_method}_trim/{sample}_mark_dup.bam"
+    output:
+        bam = "{results}/bowtie2_{trim_method}_trim/{sample}_remove_dup.bam",
+        bai = "{results}/bowtie2_{trim_method}_trim/{sample}_remove_dup.bam.bai",
+        dup_bai = "{results}/bowtie2_{trim_method}_trim/{sample}_mark_dup.bam.bai"
+    params:
+        job_name = "{sample}_remove_dup",
+        memory = "select[mem>60] rusage[mem=60]",
+        mapq_threshold = 20
+    singularity:
+        GENERAL_CONTAINER
+    log:
+        "{results}/logs/remove_duplicate_reads/remove_dup_{sample}_{trim_method}_trim"
+    shell:
+        """
+        samtools index {input}
+
+        # Filters from https://cambiotraining.github.io/chipseq/Practicals/ATACseq/ATACseq_tutorial.html
+        # -f 3: only include alignments with SAM flags 1 and 2 set, i.e. "paired"
+        #       and "properly paired"
+        # -F 4: exclude aligned reads with flag 4: the read itself did not map
+        # -F 8: exclude aligned reads with flag 8: their mates did not map
+        # -F 256: exclude alignments with flag 256, which means that bowtie2 mapped
+        #       the read to multiple places in the reference genome and this
+        #       alignment is not the best
+        # -F 1024: exclude alignments marked with SAM flag 1024, which indicates
+        #       that the read is an optical or PCR duplicate (set by Picard
+        #       MarkDuplicates above)
+        # -F 2048: exclude alignments marked with SAM flag 2048, indicating chimeric
+        #       alignments, where parts of the read mapped to different regions of
+        #       the genome. These records are the individual aligned segments of the
+        #       read, usually indicate structural variation, and should not be used
+        #       for peak calling.
+        # Finally, a basic quality filter, -q set from params.mapq_threshold,
+        # requests high-quality alignments.
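+        # The flags are bit masks: -f 3 requires bits 1 (paired) + 2 (properly
+        # paired); the excluded bits 4 + 8 + 256 + 1024 + 2048 sum to 3340, so the
+        # same exclusions could also be written as a single mask, e.g.
+        #   samtools view -b -h -f 3 -F 3340 -q {params.mapq_threshold} -o {output.bam} {input}
+        # (some samtools versions may not combine repeated -F options, in which
+        # case the single combined mask is the safer spelling)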
+        samtools view \
+            -b -h \
+            -f 3 \
+            -F 4 \
+            -F 8 \
+            -F 256 \
+            -F 1024 \
+            -F 2048 \
+            -q {params.mapq_threshold} \
+            -o {output.bam} \
+            {input}
+
+        samtools index {output.bam}
+        """
+
+# Get alignment stats
+rule alignment_stats:
+    input:
+        expand(
+            "{{results}}/bowtie2_{{trim_method}}_trim/{sample}_Aligned.sortedByCoord.out.bam",
+            sample = SAMPLES
+        )
+    output:
+        "{results}/bowtie2_{trim_method}_trim/bowtie_stats_finished.txt"
+    params:
+        job_name = "bowtie_stats",
+        memory = "select[mem>4] rusage[mem=4]",
+        out_dir = "{results}/bowtie2_{trim_method}_trim",
+        in_dir = "{results}/logs/bowtie2"
+    singularity:
+        GENERAL_CONTAINER
+    log:
+        "{results}/logs/bowtie2_stats/stats_{trim_method}"
+    threads:
+        1
+    shell:
+        """
+        python src/scripts/get_bowtie_stats.py \
+            -d {params.in_dir} \
+            -o {params.out_dir} \
+            -t {wildcards.trim_method}
+
+        echo "finished" > {output}
+        """
\ No newline at end of file
diff --git a/ATAC_seq/get_bowtie_stats.py b/ATAC_seq/get_bowtie_stats.py
new file mode 100644
index 0000000..fef37fd
--- /dev/null
+++ b/ATAC_seq/get_bowtie_stats.py
@@ -0,0 +1,93 @@
+import os
+import argparse
+import glob
+import re
+
+def main():
+    options = setup()
+
+    if not os.path.exists(options.out_dir):
+        os.makedirs(options.out_dir)
+
+    read_files(options.in_dir, options.out_dir, options.trim_method)
+
+#########
+# Setup #
+#########
+
+def setup():
+    """
+    Gets command line arguments and returns a Namespace object
+    """
+
+    # Housekeeping to read in arguments from the command line
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("-d", "--directory", dest = "in_dir",
+        help = "The directory that holds the log files, default is the working directory",
+        default = os.getcwd(),
+        action = "store",
+        metavar = "\b")
+
+    parser.add_argument("-o", "--out", dest = "out_dir",
+        help = "The directory to put the output, default is outs",
+        default = "outs",
+        action = "store",
+        metavar = "\b")
+
+    parser.add_argument("-t", "--trim-method", dest = "trim_method",
+        help = "The trim method used",
+        default = "cutadapt",
+        action = "store",
+        metavar = "\b")
+
+    args = parser.parse_args()
+
+    return(args)
+
+#############
+# Functions #
+#############
+
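+# read_files parses the alignment summary that bowtie2 prints to stderr, which
+# the cluster log setup is expected to capture as one "*_<trim_method>_trim.err"
+# file per sample inside in_dir. An illustrative paired-end summary (the numbers
+# here are made up) looks like:
+#
+#   10000 reads; of these:
+#     10000 (100.00%) were paired; of these:
+#       650 (6.50%) aligned concordantly 0 times
+#       8000 (80.00%) aligned concordantly exactly 1 time
+#       1350 (13.50%) aligned concordantly >1 times
+#   98.50% overall alignment rate
+#
+# Each branch below matches one of these lines, keeping the leading count and,
+# where present, the parenthesised percentage, then writes one CSV per sample.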
+def read_files(in_dir, out_dir, trim_method):
+    files = glob.glob(in_dir + "/*.err")
+    for file in files:
+        sample = re.sub("_" + trim_method + "_trim.err", "", file.split("/")[-1])
+        output_file = os.path.join(out_dir, sample + "_bowtie_stats.csv")
+        sample_dict = {}
+        with open(file, "r") as in_file:
+            for line in in_file:
+                save_line = line.strip().split(" ")
+                if "reads; of these:" in line:
+                    sample_dict["total_reads"] = save_line[0]
+                elif "were paired; of these:" in line:
+                    sample_dict["paired_reads"] = save_line[0]
+                    sample_dict["paired_percent"] = save_line[1]
+                elif ") aligned concordantly 0 times" in line:
+                    sample_dict["no_alignment"] = save_line[0]
+                    sample_dict["no_alignment_percent"] = save_line[1]
+                elif "aligned concordantly exactly 1 time" in line:
+                    sample_dict["one_alignment"] = save_line[0]
+                    sample_dict["one_alignment_percent"] = save_line[1]
+                elif "aligned concordantly >1 times" in line:
+                    sample_dict["multi_alignment"] = save_line[0]
+                    sample_dict["multi_alignment_percent"] = save_line[1]
+                elif "overall alignment rate" in line:
+                    sample_dict["overall_alignment"] = save_line[0]
+        with open(output_file, "w") as out_file:
+            out_file.write("total_reads,paired_reads,paired_percent,no_alignment,no_alignment_percent,one_alignment,one_alignment_percent,multi_alignment,multi_alignment_percent,overall_alignment\n")
+
+            out_file.write("{},{},{},{},{},{},{},{},{},{}\n".format(sample_dict["total_reads"],
+                                                                    sample_dict["paired_reads"],
+                                                                    sample_dict["paired_percent"],
+                                                                    sample_dict["no_alignment"],
+                                                                    sample_dict["no_alignment_percent"],
+                                                                    sample_dict["one_alignment"],
+                                                                    sample_dict["one_alignment_percent"],
+                                                                    sample_dict["multi_alignment"],
+                                                                    sample_dict["multi_alignment_percent"],
+                                                                    sample_dict["overall_alignment"]))
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file