Skip to content

Commit

Permalink
added rule to fix truncated art_illumina SAM
Browse files Browse the repository at this point in the history
  • Loading branch information
farchaab committed Aug 30, 2024
1 parent 2e62511 commit 8c0354f
Showing 1 changed file with 45 additions and 4 deletions.
49 changes: 45 additions & 4 deletions mess/workflow/rules/processing/reads.smk
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ fastq_dir = dir.out.long
sam_in = os.path.join(dir.out.bam, "{sample}", "{fasta}", "{contig}.sam")
if SEQ_TECH == "illumina":
fastq_dir = dir.out.short
sam_in = os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.sam")
sam_in = os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fixed")

fastq = os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fq")
fastq_gz = temp(os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fq.gz"))
Expand Down Expand Up @@ -151,6 +151,29 @@ if BAM:
"""


rule fix_art_sam:
"""
rule to replace SAM cigar string with read length + M
Fixes truncated art_illumina SAM files with some genomes
"""
input:
os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.sam"),
output:
temp(os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fixed")),
resources:
mem_mb=config.resources.sml.mem,
mem=str(config.resources.sml.mem) + "MB",
time=config.resources.sml.time,
params:
maxlen=MEAN_LEN,
shell:
"""
awk 'BEGIN {{OFS="\t"}} {{ if ($1 ~ /^@/) {{ print $0 }} \\
else {{ $6 = "{params.maxlen}M"; print $0 }} }}' \\
{input} > {output}
"""


rule convert_sam_to_bam:
input:
sam_in,
Expand All @@ -171,7 +194,7 @@ rule convert_sam_to_bam:
containers.bioconvert
shell:
"""
bioconvert {input} {output} -t {threads} 2> {log}
bioconvert sam2bam {input} {output} -t {threads} 2> {log}
"""


Expand Down Expand Up @@ -243,7 +266,7 @@ rule sort_bams:
containers.bioconvert
shell:
"""
samtools sort -@ {threads} {input} -o {output} 2> {log}
samtools sort -@ {threads} {input} -o {output} 2> {log}
"""


Expand All @@ -265,7 +288,7 @@ rule get_bam_coverage:
containers.bioconvert
shell:
"""
samtools coverage {input} > {output}
samtools coverage {input} | tee {output} {log}
"""


Expand All @@ -274,6 +297,7 @@ rule get_tax_profile:
cov=os.path.join(dir.out.bam, "{sample}.txt"),
tax=get_cov_table,
output:
counts=os.path.join(dir.out.tax, "{sample}.tsv"),
seq_abundance=temp(os.path.join(dir.out.tax, "{sample}_seq.tsv")),
tax_abundance=temp(os.path.join(dir.out.tax, "{sample}_tax.tsv")),
resources:
Expand All @@ -288,6 +312,23 @@ rule get_tax_profile:
cov_df = pd.read_csv(input.cov, sep="\t")
cov_df.rename(columns={"#rname": "contig"}, inplace=True)
merge_df = tax_df.merge(cov_df)
merge_df[
[
"samplename",
"fasta",
"contig",
"tax_id",
"startpos",
"endpos",
"numreads",
"covbases",
"coverage",
"cov_sim",
"meandepth",
"meanbaseq",
"meanmapq",
]
].to_csv(output.counts, sep="\t", index=False)
for col in ["numreads", "meandepth"]:
if col == "numreads":
out = output.seq_abundance
Expand Down

0 comments on commit 8c0354f

Please sign in to comment.