Skip to content

Commit

Permalink
Merge pull request #31 from metagenlab/circular
Browse files Browse the repository at this point in the history
Add support for circular genomes
  • Loading branch information
farchaab authored Oct 17, 2024
2 parents 616529a + 754427b commit 8086d5b
Show file tree
Hide file tree
Showing 9 changed files with 224 additions and 89 deletions.
6 changes: 5 additions & 1 deletion mess/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,11 @@
local_sim_options = (
{
"name": "Local genomes options",
"options": ["--asm-summary", "--fasta"],
"options": [
"--asm-summary",
"--fasta",
"--rotate",
],
},
)

Expand Down
8 changes: 7 additions & 1 deletion mess/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,6 @@ def run_snakemake(
if prefix:
snake_command += ["--conda-prefix", prefix]
if sdm == "apptainer":

snake_command += [
f"--sdm apptainer --apptainer-args '-B {workflow_basedir}:{workflow_basedir}'"
]
Expand Down Expand Up @@ -248,6 +247,13 @@ def sim_options(func):
default=False,
required=False,
),
click.option(
"--rotate",
help="Number of times to shuffle genome start for circular assemblies (2 or more for circular)",
type=int,
default=1,
show_default=True,
),
click.option(
"--tech",
help="Sequencing technology",
Expand Down
90 changes: 71 additions & 19 deletions mess/workflow/rules/preflight/functions.smk
Original file line number Diff line number Diff line change
Expand Up @@ -89,19 +89,47 @@ def list_fastas(wildcards):


table_cache = {}
def get_value(value, wildcards):

vals = (
f"{wildcards.sample}",
f"{wildcards.fasta}",
f"{wildcards.contig}",
)
idx_col = ["samplename", "fasta", "contig"]

def get_value(table, wildcards, value):
if ROTATE > 1:
idx_col += ["n"]
vals += (int(wildcards.n),)

table = checkpoints.split_contigs.get(**wildcards).output[0]
if table not in table_cache:
df = pd.read_csv(
table,
sep="\t",
index_col=["samplename", "fasta", "contig"],
index_col=idx_col,
)
table_cache[table] = df
df = table_cache[table]
val = df.loc[wildcards.sample].loc[wildcards.fasta].loc[wildcards.contig][value]
return val
return df.loc[vals, value]


fa_table_cache = {}


def get_random_start(wildcards):
    """Return the 'random_start' position for one rotated contig replicate.

    Reads the TSV emitted by the split_contigs checkpoint, indexed by
    (fasta, contig, n). The parsed DataFrame is memoized per table path in
    ``fa_table_cache`` so the file is parsed at most once per run.
    """
    table = checkpoints.split_contigs.get(**wildcards).output[0]
    # Parse and cache the checkpoint table on first access only.
    if table not in fa_table_cache:
        fa_table_cache[table] = pd.read_csv(
            table,
            sep="\t",
            index_col=["fasta", "contig", "n"],
        )
    # Build the MultiIndex key; "n" is stored as an integer in the index.
    key = (str(wildcards.fasta), str(wildcards.contig), int(wildcards.n))
    return fa_table_cache[table].loc[key, "random_start"]





def get_asm_summary(wildcards):
Expand Down Expand Up @@ -130,22 +158,46 @@ def aggregate(wildcards, outdir, level, ext):
else:
contigs = list(set(contigs))
if PAIRED and ext != "bam":
return expand(
os.path.join(outdir, "{sample}", "{fasta}", "{contig}{p}.{ext}"),
sample=wildcards.sample,
fasta=wildcards.fasta,
p=wildcards.p,
contig=contigs,
ext=ext,
)
if ROTATE > 1:
return expand(
os.path.join(
outdir, "{sample}", "{fasta}", "{contig}_{n}{p}.{ext}"
),
sample=wildcards.sample,
fasta=wildcards.fasta,
n=list(range(1, ROTATE + 1)),
p=wildcards.p,
contig=contigs,
ext=ext,
)
else:
return expand(
os.path.join(outdir, "{sample}", "{fasta}", "{contig}{p}.{ext}"),
sample=wildcards.sample,
fasta=wildcards.fasta,
p=wildcards.p,
contig=contigs,
ext=ext,
)

else:
return expand(
os.path.join(outdir, "{sample}", "{fasta}", "{contig}.{ext}"),
sample=wildcards.sample,
fasta=wildcards.fasta,
contig=contigs,
ext=ext,
)
if ROTATE > 1:
return expand(
os.path.join(outdir, "{sample}", "{fasta}", "{contig}_{n}.{ext}"),
sample=wildcards.sample,
fasta=wildcards.fasta,
contig=contigs,
n=list(range(1, ROTATE + 1)),
ext=ext,
)
else:
return expand(
os.path.join(outdir, "{sample}", "{fasta}", "{contig}.{ext}"),
sample=wildcards.sample,
fasta=wildcards.fasta,
contig=contigs,
ext=ext,
)
if level == "fasta":
df = pd.read_csv(table, sep="\t", index_col=["samplename", "fasta"])
fastas = list(set(df.loc[wildcards.sample].index))
Expand Down
36 changes: 36 additions & 0 deletions mess/workflow/rules/processing/fastas.smk
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,45 @@ checkpoint split_contigs:
output:
tsv=os.path.join(dir.out.processing, "cov.tsv"),
dir=directory(os.path.join(dir.out.processing, "split")),
params:
rotate=ROTATE,
resources:
mem_mb=config.resources.sml.mem,
mem=str(config.resources.sml.mem) + "MB",
time=config.resources.sml.time,
script:
os.path.join(dir.scripts, "split_contigs.py")


# Only generate rotated replicates when more than one start position was
# requested (--rotate > 1); otherwise the split contigs are used directly.
if ROTATE > 1:

# Rotate a circular contig to a new start coordinate with `seqkit restart`,
# producing one FASTA per replicate index {n}. The start position comes from
# get_random_start (the "random_start" column of the split_contigs table).
# NOTE(review): assumes `seqkit restart -i` accepts that value as a 1-based
# start coordinate — confirm against the seqkit docs.
rule rotate_contigs:
input:
fa=os.path.join(dir.out.processing, "split", "{fasta}_{contig}.fna"),
df=get_cov_table,
output:
os.path.join(dir.out.processing, "rotate", "{fasta}_{contig}_{n}.fna"),
params:
# Per-wildcard start offset looked up from the checkpoint table.
get_random_start,
log:
os.path.join(
dir.out.logs,
"seqkit",
"restart",
"{fasta}_{contig}_{n}.log",
),
resources:
mem_mb=config.resources.sml.mem,
mem=str(config.resources.sml.mem) + "MB",
time=config.resources.sml.time,
threads: config.resources.sml.cpu
conda:
os.path.join(dir.conda, "seqkit.yml")
container:
containers.seqkit
shell:
# restart at the chosen offset, strip descriptions to bare IDs,
# then rename the record to "<contig>_<n>" so replicates are unique.
"""
seqkit restart -i {params} {input.fa} | \
seqkit seq -i | \
seqkit replace -p .+ -r {wildcards.contig}_{wildcards.n} > {output}
"""
71 changes: 32 additions & 39 deletions mess/workflow/rules/processing/reads.smk
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
fastq_dir = dir.out.long
sam_in = os.path.join(dir.out.bam, "{sample}", "{fasta}", "{contig}.sam")
contig = "{contig}"
if ROTATE > 1:
contig = "{contig}_{n}"
sam_in = os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".sam")
if SEQ_TECH == "illumina":
fastq_dir = dir.out.short
sam_in = os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fixed")
sam_in = os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".fixed")

fastq = os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fq")
fastq_gz = temp(os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fq.gz"))
Expand All @@ -15,27 +18,19 @@ if PASSES > 1:

rule ccs_sam_to_bam:
input:
os.path.join(dir.out.long, "{sample}", "{fasta}", "{contig}.sam"),
os.path.join(dir.out.long, "{sample}", "{fasta}", contig + ".sam"),
output:
temp(
os.path.join(
dir.out.base,
"ccs",
"{sample}",
"{fasta}",
"{contig}.ccs.bam",
contig + ".ccs.bam",
)
),
benchmark:
os.path.join(
dir.out.bench,
"samtools",
"sam2bam",
"{sample}",
"{fasta}_{contig}.txt",
)
log:
os.path.join(dir.out.logs, "ccs", "{sample}", "{fasta}", "{contig}.log"),
os.path.join(dir.out.logs, "ccs", "{sample}", "{fasta}", contig + ".log"),
resources:
mem_mb=config.resources.sml.mem,
mem=str(config.resources.sml.mem) + "MB",
Expand All @@ -53,28 +48,28 @@ if PASSES > 1:

rule ccs_bam_to_fastq:
input:
os.path.join(dir.out.base, "ccs", "{sample}", "{fasta}", "{contig}.ccs.bam"),
os.path.join(
dir.out.base, "ccs", "{sample}", "{fasta}", contig + ".ccs.bam"
),
output:
fq=temp(
os.path.join(
dir.out.long,
"{sample}",
"{fasta}",
"{contig}.fq.gz",
contig + ".fq.gz",
)
),
json=temp(
os.path.join(
dir.out.long,
"{sample}",
"{fasta}",
"{contig}.zmw_metrics.json.gz",
contig + ".zmw_metrics.json.gz",
)
),
benchmark:
os.path.join(dir.out.bench, "ccs", "{sample}", "{fasta}", "{contig}.txt")
log:
os.path.join(dir.out.logs, "ccs", "{sample}", "{fasta}", "{contig}.log"),
os.path.join(dir.out.logs, "ccs", "{sample}", "{fasta}", contig + ".log"),
params:
passes=PASSES,
accuracy=ACCURACY,
Expand All @@ -100,10 +95,10 @@ if BAM:

rule add_reference_name:
input:
maf=os.path.join(dir.out.long, "{sample}", "{fasta}", "{contig}.maf"),
fa=os.path.join(dir.out.long, "{sample}", "{fasta}", "{contig}.ref"),
maf=os.path.join(dir.out.long, "{sample}", "{fasta}", contig + ".maf"),
fa=os.path.join(dir.out.long, "{sample}", "{fasta}", contig + ".ref"),
output:
temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", "{contig}.maf")),
temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".maf")),
params:
seqname=lambda wildcards, input: get_header(input.fa),
resources:
Expand All @@ -117,24 +112,16 @@ if BAM:

rule convert_maf_to_sam:
input:
os.path.join(dir.out.bam, "{sample}", "{fasta}", "{contig}.maf"),
os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".maf"),
output:
temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", "{contig}.sam")),
benchmark:
os.path.join(
dir.out.logs,
"bioconvert",
"maf2sam",
"{sample}",
"{fasta}_{contig}.txt",
)
temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".sam")),
log:
os.path.join(
dir.out.logs,
"bioconvert",
"maf2sam",
"{sample}",
"{fasta}_{contig}.log",
"{fasta}" + "_" + contig + ".log",
),
resources:
mem_mb=config.resources.sml.mem,
Expand All @@ -157,9 +144,9 @@ rule fix_art_sam:
Fixes truncated art_illumina SAM files with some genomes
"""
input:
os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.sam"),
os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".sam"),
output:
temp(os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fixed")),
temp(os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".fixed")),
resources:
mem_mb=config.resources.sml.mem,
mem=str(config.resources.sml.mem) + "MB",
Expand All @@ -178,10 +165,14 @@ rule convert_sam_to_bam:
input:
sam_in,
output:
temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", "{contig}.bam")),
temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".bam")),
log:
os.path.join(
dir.out.logs, "bioconvert", "sam2bam", "{sample}", "{fasta}{contig}.log"
dir.out.logs,
"bioconvert",
"sam2bam",
"{sample}",
"{fasta}" + contig + ".log",
),
resources:
mem_mb=config.resources.sml.mem,
Expand Down Expand Up @@ -288,7 +279,7 @@ rule get_bam_coverage:
containers.bioconvert
shell:
"""
samtools coverage {input} | tee {output} {log}
samtools coverage {input} > {output} 2> {log}
"""


Expand All @@ -306,10 +297,12 @@ rule get_tax_profile:
time=config.resources.sml.time,
threads: config.resources.sml.cpu
params:
paired=PAIRED,
rotate=ROTATE,
run:
tax_df = pd.read_csv(input.tax, sep="\t")
tax_df = tax_df[tax_df.samplename == wildcards.sample]
if params.rotate > 1:
tax_df["contig"] = tax_df["contig"] + "_" + tax_df["n"].astype(str)
cov_df = pd.read_csv(input.cov, sep="\t")
cov_df.rename(columns={"#rname": "contig"}, inplace=True)
merge_df = tax_df.merge(cov_df)
Expand Down
Loading

0 comments on commit 8086d5b

Please sign in to comment.