From 7ab026b8c3382ce09d79cdba94b926d4c494c642 Mon Sep 17 00:00:00 2001 From: farchaab Date: Mon, 14 Oct 2024 17:48:38 +0200 Subject: [PATCH 01/13] added rotate option --- mess/util.py | 7 +++++++ mess/workflow/simulate.smk | 6 ++++++ 2 files changed, 13 insertions(+) diff --git a/mess/util.py b/mess/util.py index 2fba7cd..35fab30 100644 --- a/mess/util.py +++ b/mess/util.py @@ -248,6 +248,13 @@ def sim_options(func): default=False, required=False, ), + click.option( + "--rotate", + help="Number of times to shuffle genome breakpoints for circular genomes", + type=int, + default=1, + show_default=True, + ), click.option( "--tech", help="Sequencing technology", diff --git a/mess/workflow/simulate.smk b/mess/workflow/simulate.smk index 32706ab..5e0a8cb 100644 --- a/mess/workflow/simulate.smk +++ b/mess/workflow/simulate.smk @@ -77,6 +77,12 @@ MEAN_LEN = config.args.mean_len # calculate coverages include: os.path.join("rules", "processing", "coverages.smk") + + +# fasta processing options +ROTATE = config.args.rotate + + include: os.path.join("rules", "processing", "fastas.smk") From 8f01d5d056acc58f8d5e0e1e2d59c2b2e8922594 Mon Sep 17 00:00:00 2001 From: farchaab Date: Mon, 14 Oct 2024 17:49:47 +0200 Subject: [PATCH 02/13] adapted functions when rotating contigs --- mess/workflow/rules/preflight/functions.smk | 50 ++++++++++++++++----- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/mess/workflow/rules/preflight/functions.smk b/mess/workflow/rules/preflight/functions.smk index 6e75413..748348f 100644 --- a/mess/workflow/rules/preflight/functions.smk +++ b/mess/workflow/rules/preflight/functions.smk @@ -91,17 +91,29 @@ def list_fastas(wildcards): table_cache = {} -def get_value(table, wildcards, value): +def get_value(value, wildcards): + + vals = ( + f"{wildcards.sample}", + f"{wildcards.fasta}", + f"{wildcards.contig}", + ) + idx_col = ["samplename", "fasta", "contig"] + + if ROTATE > 1: + idx_col += ["n"] + vals += (int(wildcards.n),) + + table = checkpoints.split_contigs.get(**wildcards).output[0] if table not in table_cache: df = pd.read_csv( table, sep="\t", - index_col=["samplename", "fasta", "contig"], + index_col=idx_col, ) table_cache[table] = df df = table_cache[table] - val = df.loc[wildcards.sample].loc[wildcards.fasta].loc[wildcards.contig][value] - return val + return df.loc[vals, value] def get_asm_summary(wildcards): @@ -130,14 +142,28 @@ def aggregate(wildcards, outdir, level, ext): else: contigs = list(set(contigs)) if PAIRED and ext != "bam": - return expand( - os.path.join(outdir, "{sample}", "{fasta}", "{contig}{p}.{ext}"), - sample=wildcards.sample, - fasta=wildcards.fasta, - p=wildcards.p, - contig=contigs, - ext=ext, - ) + if ROTATE > 1: + return expand( + os.path.join( + outdir, "{sample}", "{fasta}", "{contig}_{n}{p}.{ext}" + ), + sample=wildcards.sample, + fasta=wildcards.fasta, + n=list(range(1, ROTATE + 1)), + p=wildcards.p, + contig=contigs, + ext=ext, + ) + else: + return expand( + os.path.join(outdir, "{sample}", "{fasta}", "{contig}{p}.{ext}"), + sample=wildcards.sample, + fasta=wildcards.fasta, + p=wildcards.p, + contig=contigs, + ext=ext, + ) + else: return expand( os.path.join(outdir, "{sample}", "{fasta}", "{contig}.{ext}"), From 769806fd613e51ed82c6b95ea7a4319d082ae28a Mon Sep 17 00:00:00 2001 From: farchaab Date: Mon, 14 Oct 2024 17:49:57 +0200 Subject: [PATCH 03/13] added rule to rotate contigs --- mess/workflow/rules/processing/fastas.smk | 36 +++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/mess/workflow/rules/processing/fastas.smk b/mess/workflow/rules/processing/fastas.smk index e9346d6..f046c7c 100644 --- a/mess/workflow/rules/processing/fastas.smk +++ b/mess/workflow/rules/processing/fastas.smk @@ -50,9 +50,45 @@ checkpoint split_contigs: output: tsv=os.path.join(dir.out.processing, "cov.tsv"), dir=directory(os.path.join(dir.out.processing, "split")), + params: + rotate=ROTATE, resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", time=config.resources.sml.time, script: os.path.join(dir.scripts, "split_contigs.py") + + +if ROTATE > 1: + + rule rotate_contigs: + input: + os.path.join(dir.out.processing, "split", "{fasta}_{contig}.fna"), + output: + os.path.join( + dir.out.processing, "split", "{sample}", "{fasta}_{contig}_{n}.fna" + ), + params: + lambda wildcards: get_value("contig_start", wildcards), + log: + os.path.join( + dir.out.logs, + "seqkit", + "restart", + "{sample}", + "{fasta}_{contig}_{n}.log", + ), + resources: + mem_mb=config.resources.sml.mem, + mem=str(config.resources.sml.mem) + "MB", + time=config.resources.sml.time, + threads: config.resources.sml.cpu + conda: + os.path.join(dir.conda, "seqkit.yml") + container: + containers.seqkit + shell: + """ + seqkit restart -i {params} {input} > {output} + """ From 40dc2c40073abd1e7d1381abd5d1c12a6f987c7c Mon Sep 17 00:00:00 2001 From: farchaab Date: Mon, 14 Oct 2024 17:50:20 +0200 Subject: [PATCH 04/13] adapted inputs and logs when rotating contigs --- mess/workflow/rules/simulate/short_reads.smk | 37 ++++++++++++-------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/mess/workflow/rules/simulate/short_reads.smk b/mess/workflow/rules/simulate/short_reads.smk index a7cf6b5..2624034 100644 --- a/mess/workflow/rules/simulate/short_reads.smk +++ b/mess/workflow/rules/simulate/short_reads.smk @@ -19,39 +19,48 @@ if BAM: art_args += "-sam -M" -sam_out = temp(os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}.txt")) +fq_prefix = os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}") +if ROTATE > 1: + fq_prefix = os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}_{n}") + +sam_out = temp(fq_prefix + ".txt") if BAM: - sam_out = temp(os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}.sam")) + sam_out = temp(temp(fq_prefix + ".sam")) fastq_out = [ - temp(os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}1.fq")), - temp(os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}2.fq")), + temp(fq_prefix + "1.fq"), + temp(fq_prefix + "2.fq"), ] -fq_prefix = os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}") - if not PAIRED: - fastq_out = temp(os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}.fq")) + fastq_out = temp(fq_prefix + ".fq") + +fasta = os.path.join(dir.out.processing, "split", "{fasta}_{contig}.fna") +if ROTATE > 1: + fasta = os.path.join( + dir.out.processing, "split", "{sample}", "{fasta}_{contig}_{n}.fna" + ) rule art_illumina: input: - fa=os.path.join(dir.out.processing, "split", "{fasta}_{contig}.fna"), - df=get_cov_table, + fa=fasta, output: sam=sam_out, fastqs=fastq_out, params: args=art_args, read_len=MEAN_LEN, - cov=lambda wildcards, input: get_value(input.df, wildcards, "cov_sim"), - seed=lambda wildcards, input: int(get_value(input.df, wildcards, "seed")), + cov=lambda wildcards: get_value("cov_sim", wildcards), + seed=lambda wildcards: int(get_value("seed", wildcards)), prefix=fq_prefix, - benchmark: - os.path.join(dir.out.bench, "art", "{sample}", "{fasta}", "{contig}.txt") log: - os.path.join(dir.out.logs, "art", "{sample}", "{fasta}", "{contig}.log"), + os.path.join(dir.out.logs, "art", "{sample}", "{fasta}", "{contig}.log") + if ROTATE == 1 + else os.path.join( + dir.out.logs, "art", "{sample}", "{fasta}", "{contig}_{n}.log" + ), resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", From 8a1cf455ec5c551b4cfcec47f777a7de5d6ceab4 Mon Sep 17 00:00:00 2001 From: farchaab Date: Mon, 14 Oct 2024 17:50:51 +0200 Subject: [PATCH 05/13] added contig_start and n columns when rotating contigs --- mess/workflow/scripts/split_contigs.py | 30 ++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/mess/workflow/scripts/split_contigs.py b/mess/workflow/scripts/split_contigs.py index 4505b2c..2bec93e 100644 --- a/mess/workflow/scripts/split_contigs.py +++ b/mess/workflow/scripts/split_contigs.py @@ -2,6 +2,12 @@ import pandas as pd import os from itertools import chain +import random + + +def get_random_start(seed, contig_length, n): + random.seed(seed) + return [random.randint(1, contig_length) for _ in range(n)] def split_fasta(fa, outdir): @@ -11,7 +17,9 @@ def split_fasta(fa, outdir): for record in SeqIO.parse(fa, "fasta"): SeqIO.write(record, fasta_name + "_" + record.id + ".fna", "fasta") - record_ids.append({"contig": record.id, "fasta": name}) + record_ids.append( + {"contig": record.id, "fasta": name, "contig_length": len(record.seq)} + ) return record_ids @@ -24,6 +32,20 @@ def split_fasta(fa, outdir): id2fa = list(chain.from_iterable(id2fa)) contig_df = pd.DataFrame.from_records(id2fa) df = pd.merge(contig_df, cov_df, how="left", on="fasta") -df[["samplename", "fasta", "contig", "tax_id", "seed", "cov_sim"]].to_csv( - snakemake.output.tsv, sep="\t", index=False -) + +cols = ["samplename", "fasta", "contig", "contig_length", "tax_id", "seed", "cov_sim"] + +if snakemake.params.rotate > 1: + cols += ["n", "contig_start"] + df["contig_start"] = df.apply( + lambda row: get_random_start( + row["seed"], row["contig_length"], snakemake.params.rotate + ), + axis=1, + ) + df_expanded = df.explode("contig_start").reset_index(drop=True) + df_expanded["n"] = df_expanded.groupby(["samplename", "contig"]).cumcount() + 1 + df = df_expanded + df["cov_sim"] = df["cov_sim"] / snakemake.params.rotate + +df[cols].to_csv(snakemake.output.tsv, sep="\t", index=False) From fb77724c78b5adf0e6772ae1a86716d6475f12ca Mon Sep 17 00:00:00 2001 From: farchaab Date: Wed, 16 Oct 2024 14:55:49 +0200 Subject: [PATCH 06/13] added get_random_start, updated expand when rotate --- mess/workflow/rules/preflight/functions.smk | 44 ++++++++++++++++----- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/mess/workflow/rules/preflight/functions.smk b/mess/workflow/rules/preflight/functions.smk index 748348f..edf0a4a 100644 --- a/mess/workflow/rules/preflight/functions.smk +++ b/mess/workflow/rules/preflight/functions.smk @@ -89,8 +89,6 @@ def list_fastas(wildcards): table_cache = {} - - def get_value(value, wildcards): vals = ( @@ -116,6 +114,24 @@ def get_value(value, wildcards): return df.loc[vals, value] +fa_table_cache = {} +def get_random_start(wildcards): + table = checkpoints.split_contigs.get(**wildcards).output[0] + if table not in fa_table_cache: + df = pd.read_csv( + table, + sep="\t", + index_col=["fasta","contig","n"] + ) + fa_table_cache[table] = df + vals=(f"{wildcards.fasta}",f"{wildcards.contig}",int(wildcards.n)) + df = fa_table_cache[table] + return df.loc[vals, "random_start"] + + + + + def get_asm_summary(wildcards): try: table = checkpoints.download_assemblies.get(**wildcards).output[0] @@ -165,13 +181,23 @@ def aggregate(wildcards, outdir, level, ext): ) else: - return expand( - os.path.join(outdir, "{sample}", "{fasta}", "{contig}.{ext}"), - sample=wildcards.sample, - fasta=wildcards.fasta, - contig=contigs, - ext=ext, - ) + if ROTATE > 1: + return expand( + os.path.join(outdir, "{sample}", "{fasta}", "{contig}_{n}.{ext}"), + sample=wildcards.sample, + fasta=wildcards.fasta, + contig=contigs, + n=list(range(1, ROTATE + 1)), + ext=ext, + ) + else: + return expand( + os.path.join(outdir, "{sample}", "{fasta}", "{contig}.{ext}"), + sample=wildcards.sample, + fasta=wildcards.fasta, + contig=contigs, + ext=ext, + ) if level == "fasta": df = pd.read_csv(table, sep="\t", index_col=["samplename", "fasta"]) fastas = list(set(df.loc[wildcards.sample].index)) From 76330031eb53954157acf7b6cdd483ef8bb9040e Mon Sep 17 00:00:00 2001 From: farchaab Date: Wed, 16 Oct 2024 15:01:35 +0200 Subject: [PATCH 07/13] changed random_start range, renamed column --- mess/workflow/scripts/split_contigs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mess/workflow/scripts/split_contigs.py b/mess/workflow/scripts/split_contigs.py index 2bec93e..6750dc4 100644 --- a/mess/workflow/scripts/split_contigs.py +++ b/mess/workflow/scripts/split_contigs.py @@ -7,7 +7,7 @@ def get_random_start(seed, contig_length, n): random.seed(seed) - return [random.randint(1, contig_length) for _ in range(n)] + return [random.randint(0, contig_length - 1) for _ in range(n)] def split_fasta(fa, outdir): @@ -36,14 +36,14 @@ def split_fasta(fa, outdir): cols = ["samplename", "fasta", "contig", "contig_length", "tax_id", "seed", "cov_sim"] if snakemake.params.rotate > 1: - cols += ["n", "contig_start"] - df["contig_start"] = df.apply( + cols += ["n", "random_start"] + df["random_start"] = df.apply( lambda row: get_random_start( row["seed"], row["contig_length"], snakemake.params.rotate ), axis=1, ) - df_expanded = df.explode("contig_start").reset_index(drop=True) + df_expanded = df.explode("random_start").reset_index(drop=True) df_expanded["n"] = df_expanded.groupby(["samplename", "contig"]).cumcount() + 1 df = df_expanded df["cov_sim"] = df["cov_sim"] / snakemake.params.rotate From 5978576d6065d766740bac720497ab58202f3764 Mon Sep 17 00:00:00 2001 From: farchaab Date: Wed, 16 Oct 2024 15:01:53 +0200 Subject: [PATCH 08/13] changed rotate fasta path --- mess/workflow/rules/simulate/short_reads.smk | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mess/workflow/rules/simulate/short_reads.smk b/mess/workflow/rules/simulate/short_reads.smk index 2624034..7d60e2a 100644 --- a/mess/workflow/rules/simulate/short_reads.smk +++ b/mess/workflow/rules/simulate/short_reads.smk @@ -38,14 +38,13 @@ if not PAIRED: fasta = os.path.join(dir.out.processing, "split", "{fasta}_{contig}.fna") if ROTATE > 1: - fasta = os.path.join( - dir.out.processing, "split", "{sample}", "{fasta}_{contig}_{n}.fna" - ) + fasta = os.path.join(dir.out.processing, "rotate", "{fasta}_{contig}_{n}.fna") rule art_illumina: input: fa=fasta, + df=get_cov_table, output: sam=sam_out, fastqs=fastq_out, From 22a0157fb507a9bbf441454eb4aaed780db17855 Mon Sep 17 00:00:00 2001 From: farchaab Date: Wed, 16 Oct 2024 15:03:49 +0200 Subject: [PATCH 09/13] fixed rotate_contigs rule --- mess/workflow/rules/processing/fastas.smk | 38 ++++++++++------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/mess/workflow/rules/processing/fastas.smk b/mess/workflow/rules/processing/fastas.smk index f046c7c..d47941b 100644 --- a/mess/workflow/rules/processing/fastas.smk +++ b/mess/workflow/rules/processing/fastas.smk @@ -64,31 +64,25 @@ if ROTATE > 1: rule rotate_contigs: input: - os.path.join(dir.out.processing, "split", "{fasta}_{contig}.fna"), + fa=os.path.join(dir.out.processing, "split", "{fasta}_{contig}.fna"), + df=get_cov_table, output: - os.path.join( - dir.out.processing, "split", "{sample}", "{fasta}_{contig}_{n}.fna" - ), + os.path.join(dir.out.processing, "rotate", "{fasta}_{contig}_{n}.fna"), params: - lambda wildcards: get_value("contig_start", wildcards), - log: - os.path.join( - dir.out.logs, - "seqkit", - "restart", - "{sample}", - "{fasta}_{contig}_{n}.log", - ), + random_start=get_random_start, resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", time=config.resources.sml.time, - threads: config.resources.sml.cpu - conda: - os.path.join(dir.conda, "seqkit.yml") - container: - containers.seqkit - shell: - """ - seqkit restart -i {params} {input} > {output} - """ + run: + for record in SeqIO.parse(input[0], "fasta"): + header = record.id + "_" + wildcards.n + record.name = header + record.id = header + record.description = header + record.seq = ( + record.seq[params.random_start :] + + record.seq[: params.random_start] + ) + SeqIO.write(record, output[0], "fasta") + From 03fd7457ae45b659855c3c2b9b56687581315a5d Mon Sep 17 00:00:00 2001 From: farchaab Date: Wed, 16 Oct 2024 15:04:30 +0200 Subject: [PATCH 10/13] updated paths to work with rotate --- mess/workflow/rules/processing/reads.smk | 71 +++++++++++------------- 1 file changed, 32 insertions(+), 39 deletions(-) diff --git a/mess/workflow/rules/processing/reads.smk b/mess/workflow/rules/processing/reads.smk index fc7d8f3..60d0c8e 100644 --- a/mess/workflow/rules/processing/reads.smk +++ b/mess/workflow/rules/processing/reads.smk @@ -1,8 +1,11 @@ fastq_dir = dir.out.long -sam_in = os.path.join(dir.out.bam, "{sample}", "{fasta}", "{contig}.sam") +contig = "{contig}" +if ROTATE > 1: + contig = "{contig}_{n}" +sam_in = os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".sam") if SEQ_TECH == "illumina": fastq_dir = dir.out.short - sam_in = os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fixed") + sam_in = os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".fixed") fastq = os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fq") fastq_gz = temp(os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fq.gz")) @@ -15,7 +18,7 @@ if PASSES > 1: rule ccs_sam_to_bam: input: - os.path.join(dir.out.long, "{sample}", "{fasta}", "{contig}.sam"), + os.path.join(dir.out.long, "{sample}", "{fasta}", contig + ".sam"), output: temp( os.path.join( @@ -23,19 +26,11 @@ if PASSES > 1: "ccs", "{sample}", "{fasta}", - "{contig}.ccs.bam", + contig + ".ccs.bam", ) ), - benchmark: - os.path.join( - dir.out.bench, - "samtools", - "sam2bam", - "{sample}", - "{fasta}_{contig}.txt", - ) log: - os.path.join(dir.out.logs, "ccs", "{sample}", "{fasta}", "{contig}.log"), + os.path.join(dir.out.logs, "ccs", "{sample}", "{fasta}", contig + ".log"), resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", @@ -53,14 +48,16 @@ if PASSES > 1: rule ccs_bam_to_fastq: input: - os.path.join(dir.out.base, "ccs", "{sample}", "{fasta}", "{contig}.ccs.bam"), + os.path.join( + dir.out.base, "ccs", "{sample}", "{fasta}", contig + ".ccs.bam" + ), output: fq=temp( os.path.join( dir.out.long, "{sample}", "{fasta}", - "{contig}.fq.gz", + contig + ".fq.gz", ) ), json=temp( @@ -68,13 +65,11 @@ if PASSES > 1: dir.out.long, "{sample}", "{fasta}", - "{contig}.zmw_metrics.json.gz", + contig + ".zmw_metrics.json.gz", ) ), - benchmark: - os.path.join(dir.out.bench, "ccs", "{sample}", "{fasta}", "{contig}.txt") log: - os.path.join(dir.out.logs, "ccs", "{sample}", "{fasta}", "{contig}.log"), + os.path.join(dir.out.logs, "ccs", "{sample}", "{fasta}", contig + ".log"), params: passes=PASSES, accuracy=ACCURACY, @@ -100,10 +95,10 @@ if BAM: rule add_reference_name: input: - maf=os.path.join(dir.out.long, "{sample}", "{fasta}", "{contig}.maf"), - fa=os.path.join(dir.out.long, "{sample}", "{fasta}", "{contig}.ref"), + maf=os.path.join(dir.out.long, "{sample}", "{fasta}", contig + ".maf"), + fa=os.path.join(dir.out.long, "{sample}", "{fasta}", contig + ".ref"), output: - temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", "{contig}.maf")), + temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".maf")), params: seqname=lambda wildcards, input: get_header(input.fa), resources: @@ -117,24 +112,16 @@ if BAM: rule convert_maf_to_sam: input: - os.path.join(dir.out.bam, "{sample}", "{fasta}", "{contig}.maf"), + os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".maf"), output: - temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", "{contig}.sam")), - benchmark: - os.path.join( - dir.out.logs, - "bioconvert", - "maf2sam", - "{sample}", - "{fasta}_{contig}.txt", - ) + temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".sam")), log: os.path.join( dir.out.logs, "bioconvert", "maf2sam", "{sample}", - "{fasta}_{contig}.log", + "{fasta}" + "_" + contig + ".log", ), resources: mem_mb=config.resources.sml.mem, @@ -157,9 +144,9 @@ rule fix_art_sam: Fixes truncated art_illumina SAM files with some genomes """ input: - os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.sam"), + os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".sam"), output: - temp(os.path.join(fastq_dir, "{sample}", "{fasta}", "{contig}.fixed")), + temp(os.path.join(fastq_dir, "{sample}", "{fasta}", contig + ".fixed")), resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", @@ -178,10 +165,14 @@ rule convert_sam_to_bam: input: sam_in, output: - temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", "{contig}.bam")), + temp(os.path.join(dir.out.bam, "{sample}", "{fasta}", contig + ".bam")), log: os.path.join( - dir.out.logs, "bioconvert", "sam2bam", "{sample}", "{fasta}{contig}.log" + dir.out.logs, + "bioconvert", + "sam2bam", + "{sample}", + "{fasta}" + contig + ".log", ), resources: mem_mb=config.resources.sml.mem, @@ -288,7 +279,7 @@ rule get_bam_coverage: containers.bioconvert shell: """ - samtools coverage {input} | tee {output} {log} + samtools coverage {input} > {output} 2> {log} """ @@ -306,10 +297,12 @@ rule get_tax_profile: time=config.resources.sml.time, threads: config.resources.sml.cpu params: - paired=PAIRED, + rotate=ROTATE, run: tax_df = pd.read_csv(input.tax, sep="\t") tax_df = tax_df[tax_df.samplename == wildcards.sample] + if params.rotate > 1: + tax_df["contig"] = tax_df["contig"] + "_" + tax_df["n"].astype(str) cov_df = pd.read_csv(input.cov, sep="\t") cov_df.rename(columns={"#rname": "contig"}, inplace=True) merge_df = tax_df.merge(cov_df) From 649754e549a5120647713df835121e2d78c6ea73 Mon Sep 17 00:00:00 2001 From: farchaab Date: Wed, 16 Oct 2024 15:04:50 +0200 Subject: [PATCH 11/13] update rule to work with rotate --- mess/workflow/rules/simulate/long_reads.smk | 32 +++++++++++++-------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/mess/workflow/rules/simulate/long_reads.smk b/mess/workflow/rules/simulate/long_reads.smk index fa9af5a..c6e4e46 100644 --- a/mess/workflow/rules/simulate/long_reads.smk +++ b/mess/workflow/rules/simulate/long_reads.smk @@ -1,20 +1,25 @@ prefix = os.path.join(dir.out.long, "{sample}", "{fasta}", "{contig}") +fasta = os.path.join(dir.out.processing, "split", "{fasta}_{contig}.fna") +if ROTATE > 1: + prefix = os.path.join(dir.out.long, "{sample}", "{fasta}", "{contig}_{n}") + fasta = os.path.join(dir.out.processing, "rotate", "{fasta}_{contig}_{n}.fna") +id_prefix = os.path.basename(prefix) + if PASSES > 1: - pbsim3_out = temp(os.path.join(dir.out.long, "{sample}", "{fasta}", "{contig}.sam")) + pbsim3_out = temp(prefix + ".sam") rename = f"mv {prefix}_0001.sam {prefix}.sam" else: - pbsim3_out = temp(os.path.join(dir.out.long, "{sample}", "{fasta}", "{contig}.fq")) + pbsim3_out = temp(prefix + ".fq") rename = f"mv {prefix}_0001.fastq {prefix}.fq" rule pbsim3: input: - fa=os.path.join(dir.out.processing, "split", "{fasta}_{contig}.fna"), - df=get_cov_table, + fa=fasta, output: pbsim3_out, - temp(os.path.join(dir.out.long, "{sample}", "{fasta}", "{contig}.maf")), - temp(os.path.join(dir.out.long, "{sample}", "{fasta}", "{contig}.ref")), + temp(prefix + ".maf"), + temp(prefix + ".ref"), params: model=os.path.join(MODEL_PATH, f"{MODEL}.model"), ratio=RATIO, @@ -24,14 +29,17 @@ rule pbsim3: maxlen=MAX_LEN, passes=PASSES, accuracy=ACCURACY, - cov=lambda wildcards, input: get_value(input.df, wildcards, "cov_sim"), - seed=lambda wildcards, input: int(get_value(input.df, wildcards, "seed")), + cov=lambda wildcards: get_value("cov_sim", wildcards), + seed=lambda wildcards: int(get_value("seed", wildcards)), prefix=prefix, + id_prefix=id_prefix, reads_rename=rename, - benchmark: - os.path.join(dir.out.bench, "pbsim3", "{sample}", "{fasta}", "{contig}.txt") log: - os.path.join(dir.out.logs, "pbsim3", "{sample}", "{fasta}", "{contig}.log"), + os.path.join(dir.out.logs, "pbsim3", "{sample}", "{fasta}", "{contig}.log") + if ROTATE == 1 + else os.path.join( + dir.out.logs, "pbsim3", "{sample}", "{fasta}", "{contig}_{n}.log" + ), resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", @@ -43,7 +51,7 @@ rule pbsim3: shell: """ pbsim --strategy wgs --method qshmm \\ - --id-prefix {wildcards.contig} \\ + --id-prefix {params.id_prefix} \\ --difference-ratio {params.ratio} \\ --length-min {params.minlen} \\ --length-max {params.maxlen} \\ From cc54873ddc48b72445af7c2d9fe412e1b0ec1136 Mon Sep 17 00:00:00 2001 From: farchaab Date: Thu, 17 Oct 2024 08:59:27 +0200 Subject: [PATCH 12/13] changed rotate_contigs to use seqkit --- mess/workflow/rules/processing/fastas.smk | 32 ++++++++++++++--------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/mess/workflow/rules/processing/fastas.smk b/mess/workflow/rules/processing/fastas.smk index d47941b..c24014c 100644 --- a/mess/workflow/rules/processing/fastas.smk +++ b/mess/workflow/rules/processing/fastas.smk @@ -69,20 +69,26 @@ if ROTATE > 1: output: os.path.join(dir.out.processing, "rotate", "{fasta}_{contig}_{n}.fna"), params: - random_start=get_random_start, + get_random_start, + log: + os.path.join( + dir.out.logs, + "seqkit", + "restart", + "{fasta}_{contig}_{n}.log", + ), resources: mem_mb=config.resources.sml.mem, mem=str(config.resources.sml.mem) + "MB", time=config.resources.sml.time, - run: - for record in SeqIO.parse(input[0], "fasta"): - header = record.id + "_" + wildcards.n - record.name = header - record.id = header - record.description = header - record.seq = ( - record.seq[params.random_start :] - + record.seq[: params.random_start] - ) - SeqIO.write(record, output[0], "fasta") - + threads: config.resources.sml.cpu + conda: + os.path.join(dir.conda, "seqkit.yml") + container: + containers.seqkit + shell: + """ + seqkit restart -i {params} {input.fa} | \ + seqkit seq -i | \ + seqkit replace -p .+ -r {wildcards.contig}_{wildcards.n} > {output} + """ From 754427b541cebaf71180840acf5a382037f008e1 Mon Sep 17 00:00:00 2001 From: farchaab Date: Thu, 17 Oct 2024 09:12:22 +0200 Subject: [PATCH 13/13] updated rotate option in cli --- mess/__main__.py | 6 +++++- mess/util.py | 3 +-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/mess/__main__.py b/mess/__main__.py index 3b6fdee..9e2a51a 100644 --- a/mess/__main__.py +++ b/mess/__main__.py @@ -105,7 +105,11 @@ local_sim_options = ( { "name": "Local genomes options", - "options": ["--asm-summary", "--fasta"], + "options": [ + "--asm-summary", + "--fasta", + "--rotate", + ], }, ) diff --git a/mess/util.py b/mess/util.py index 35fab30..193a4c2 100644 --- a/mess/util.py +++ b/mess/util.py @@ -116,7 +116,6 @@ def run_snakemake( if prefix: snake_command += ["--conda-prefix", prefix] if sdm == "apptainer": - snake_command += [ f"--sdm apptainer --apptainer-args '-B {workflow_basedir}:{workflow_basedir}'" ] @@ -250,7 +249,7 @@ def sim_options(func): ), click.option( "--rotate", - help="Number of times to shuffle genome breakpoints for circular genomes", + help="Number of times to shuffle genome start for circular assemblies (2 or more for circular)", type=int, default=1, show_default=True,