
Commit

Merge pull request #35 from metagenlab/readproc
Added error-free BAMs, optimized read processing and functions
farchaab authored Nov 1, 2024
2 parents 89471d5 + 568e080 commit 6dd8f2d
Showing 22 changed files with 232 additions and 296 deletions.
15 changes: 3 additions & 12 deletions .github/workflows/unit-tests.yml
@@ -46,6 +46,9 @@ jobs:
        with:
          fetch-depth: 0

      - name: Setup apptainer
        uses: eWaterCycle/[email protected]

      - name: Setup MeSS environment
        uses: conda-incubator/setup-miniconda@v3
        with:
@@ -54,18 +57,6 @@
          python-version: ${{ matrix.python-version }}
          auto-update-conda: true

      - name: Setup apt dependencies
        run: |
          sudo add-apt-repository -y ppa:apptainer/ppa
          sudo apt-get update
          sudo apt install -y squashfuse fuse2fs gocryptfs apptainer
      - name: Disable apparmor namespace restrictions for apptainer
        run: |
          sudo sh -c 'echo kernel.apparmor_restrict_unprivileged_userns=0 \
            >/etc/sysctl.d/90-disable-userns-restrictions.conf'
          sudo sysctl -p /etc/sysctl.d/90-disable-userns-restrictions.conf
      - name: Install MeSS and pytest-cov
        run: |
          pip install -e .
8 changes: 7 additions & 1 deletion mess/__main__.py
@@ -64,7 +64,13 @@
    },
    {
        "name": "art_illumina options",
        "options": ["--custom-err", "--paired", "--frag-len", "--frag-sd"],
        "options": [
            "--custom-err",
            "--errfree",
            "--paired",
            "--frag-len",
            "--frag-sd",
        ],
    },
    {
        "name": "pbsim3 options",
3 changes: 2 additions & 1 deletion mess/config/config.yaml
@@ -11,7 +11,8 @@ args:
  configfile:
  custom_err:
  dist:
  error:
  error:
  errfree:
  fasta:
  frag_len:
  frag_sd:
2 changes: 1 addition & 1 deletion mess/test_data/minimal_test.tsv
@@ -1,3 +1,3 @@
taxon nb cov_sim sample
staphylococcus_aureus 1 0.1 sample1
1290 1 0.1 sample2
1290 1 0.1 sample2
5 changes: 5 additions & 0 deletions mess/util.py
@@ -291,6 +291,11 @@ def sim_options(func):
            type=str,
            default=None,
        ),
        click.option(
            "--errfree",
            help="Generate error free alignments with art_illumina",
            is_flag=True,
        ),
        click.option(
            "--replicates",
            help="Number of replicates per sample",
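Note on the new flag: --errfree is a plain boolean (is_flag=True), so it takes no value on the command line. A minimal, self-contained sketch of the same click pattern; the standalone simulate command below is hypothetical and stands in for the sim_options decorator above:

import click


@click.command()
@click.option(
    "--errfree",
    help="Generate error free alignments with art_illumina",
    is_flag=True,
)
def simulate(errfree):
    # Absent -> False, present -> True; downstream rules can then request
    # ART's zero-error SAM output (its -ef/--errfree option) as well.
    click.echo(f"error-free alignments: {errfree}")


if __name__ == "__main__":
    simulate()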
2 changes: 2 additions & 0 deletions mess/workflow/Snakefile
@@ -112,6 +112,7 @@ include: os.path.join("rules", "processing", "fastas.smk")
CUSTOM_ERR = config.args.custom_err
ERROR = config.args.error
BAM = config.args.bam
ERRFREE = config.args.errfree
MIN_LEN = config.args.min_len
MAX_LEN = config.args.max_len
SD_LEN = config.args.sd_len
@@ -131,6 +132,7 @@ else:


# reads post-processing options
random.seed(SEED)
SHUFFLE = dict(zip(SAMPLES, random.sample(range(1, 100000), len(SAMPLES))))
SKIP_SHUFFLE = config.args.skip_shuffle
RANKS = config.args.ranks
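Note on random.seed(SEED): seeding before drawing the per-sample shuffle seeds makes the SHUFFLE mapping reproducible across reruns. A small standalone sketch, where SEED and SAMPLES are stand-ins for the workflow globals:

import random

SEED = 42  # stand-in for the workflow's seed setting
SAMPLES = ["sample1", "sample2"]

# With a fixed seed, every rerun draws the same per-sample values,
# so shuffled read order is stable between workflow invocations.
random.seed(SEED)
SHUFFLE = dict(zip(SAMPLES, random.sample(range(1, 100000), len(SAMPLES))))
print(SHUFFLE)  # identical output on every run with the same SEED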
4 changes: 2 additions & 2 deletions mess/workflow/envs/conda/assembly_finder.yml
@@ -4,8 +4,8 @@ channels:
  - bioconda
  - defaults
dependencies:
  - assembly_finder =0.7.6
  - ncbi-datasets-cli =16.26.2
  - assembly_finder =0.8.0
  - ncbi-datasets-cli =16.31.0
  - taxonkit =0.17.0
  - csvtk =0.30.0
  - rsync =3.3.0
7 changes: 7 additions & 0 deletions mess/workflow/envs/conda/samtools.yml
@@ -0,0 +1,7 @@
name: samtools
channels:
  - conda-forge
  - bioconda
  - defaults
dependencies:
  - samtools =1.21
3 changes: 2 additions & 1 deletion mess/workflow/envs/containers.yml
@@ -1,9 +1,10 @@
art: docker://quay.io/biocontainers/art:2016.06.05--heacdb12_11
assembly_finder: docker://ghcr.io/metagenlab/assembly_finder:v0.7.7
assembly_finder: docker://ghcr.io/metagenlab/assembly_finder:v0.8.0
bioconvert: docker://quay.io/biocontainers/bioconvert:1.1.1--pyhdfd78af_0
curl: docker://quay.io/biocontainers/curl:7.80.0
pigz: docker://quay.io/biocontainers/pigz:2.8
pbccs: docker://quay.io/biocontainers/pbccs:6.4.0--h9ee0642_0
pbsim3: docker://quay.io/biocontainers/pbsim3:3.0.4--h4ac6f70_0
seqkit: docker://quay.io/biocontainers/seqkit:2.8.2--h9ee0642_0
taxonkit: docker://quay.io/biocontainers/taxonkit:0.17.0--h9ee0642_1
samtools: docker://quay.io/biocontainers/samtools:1.21--h50ea8bc_0
6 changes: 1 addition & 5 deletions mess/workflow/rules/download/assembly_finder.smk
@@ -20,7 +20,7 @@ rule get_unique_entries:
)


af_args = ""
af_args = "--no-use-conda "
if TAXON:
    af_args += "--taxon "
if LIMIT:
@@ -59,14 +59,11 @@ checkpoint download_assemblies:
        tsv=os.path.join(dir.out.base, "uniq_entries.tsv"),
    output:
        asm=os.path.join(dir.out.base, "assembly_finder/assembly_summary.tsv"),
        seq=os.path.join(dir.out.base, "assembly_finder/sequence_report.tsv"),
        tax=os.path.join(dir.out.base, "assembly_finder/taxonomy.tsv"),
    params:
        args=af_args,
        taxonkit=TAXONKIT,
        out=os.path.join(dir.out.base, "assembly_finder"),
    benchmark:
        os.path.join(dir.out.bench, "assembly_finder.txt")
    log:
        os.path.join(dir.out.logs, "assembly_finder.log"),
    resources:
@@ -84,6 +81,5 @@
        --taxonkit {params.taxonkit} \\
        --threads {threads} \\
        {params.args} \\
        --no-use-conda \\
        -o {params.out} 2> {log}
        """
1 change: 1 addition & 0 deletions mess/workflow/rules/preflight/directories.smk
@@ -34,6 +34,7 @@ dir.out.short = os.path.join(dir.out.processing, "short")
dir.out.long = os.path.join(dir.out.processing, "long")
dir.out.fastq = os.path.join(dir.out.base, "fastq")
dir.out.bam = os.path.join(dir.out.base, "bam")
dir.out.ef = os.path.join(dir.out.bam, "error-free")
dir.out.tax = os.path.join(dir.out.base, "tax")


164 changes: 54 additions & 110 deletions mess/workflow/rules/preflight/functions.smk
@@ -10,34 +10,39 @@ import random
wildcard_constraints:
    sample="[^/]+",
    contig="[^/]+",
    fasta="[^/]+",


def list_reads(wildcards):
    fastqs = "{sample}.fq.gz"
    args = {"sample": SAMPLES}

    if PAIRED:
        reads = expand(
            os.path.join(dir.out.fastq, "{sample}_R{p}.fq.gz"),
            sample=SAMPLES,
            p=PAIRS,
        )
    else:
        reads = expand(
            os.path.join(dir.out.fastq, "{sample}.fq.gz"),
            sample=SAMPLES,
        )
        fastqs = "{sample}_R{p}.fq.gz"
        args.update({"p": PAIRS})
    reads = collect(os.path.join(dir.out.fastq, fastqs), **args)

    if BAM:
        bams = expand(
        bams = collect(
            os.path.join(dir.out.bam, "{sample}.{bam}"),
            sample=SAMPLES,
            bam=["bam", "bam.bai"],
        )
        tax = expand(
        tax = collect(
            os.path.join(dir.out.tax, "{sample}_{abundance}.txt"),
            sample=SAMPLES,
            abundance=["seq", "tax"],
        )
        reads = reads + bams + tax

    if ERRFREE:
        bams_ef = expand(
            os.path.join(dir.out.ef, "{sample}.{bam}"),
            sample=SAMPLES,
            bam=["bam", "bam.bai"],
        )
        reads = reads + bams_ef

    return reads
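Note on the refactor: collect is, to my knowledge, Snakemake's alias for expand, so the rewrite builds one pattern plus keyword arguments instead of two expand branches. A pure-Python model of the combination logic; this expand helper is a simplified assumption, not Snakemake's implementation:

import os
from itertools import product


def expand(pattern, **kwargs):
    # Simplified stand-in: format the pattern with every combination of
    # wildcard values, mirroring what Snakemake's expand/collect returns.
    keys = list(kwargs)
    return [
        pattern.format(**dict(zip(keys, combo)))
        for combo in product(*(kwargs[k] for k in keys))
    ]


SAMPLES = ["sample1", "sample2"]
PAIRS = [1, 2]

fastqs = "{sample}_R{p}.fq.gz"
args = {"sample": SAMPLES, "p": PAIRS}
print(expand(os.path.join("fastq", fastqs), **args))
# ['fastq/sample1_R1.fq.gz', 'fastq/sample1_R2.fq.gz',
#  'fastq/sample2_R1.fq.gz', 'fastq/sample2_R2.fq.gz']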


@@ -67,32 +72,43 @@ def parse_samples(indir, replicates):
fasta_cache = {}


def fasta_input(wildcards):
    table = checkpoints.calculate_genome_coverages.get(**wildcards).output[0]
def get_fasta_table(wildcards):
    fa_table = checkpoints.calculate_genome_coverages.get(**wildcards).output[0]
    if fa_table not in fasta_cache:
        fa_df = pd.read_csv(fa_table, sep="\t", index_col="fasta")
        fasta_cache[fa_table] = fa_df
    fa_df = fasta_cache[fa_table]
    return fa_df

    df = pd.read_csv(table, sep="\t", index_col="fasta")

def fasta_input(wildcards):
    df = get_fasta_table(wildcards)
    # some samples share the same genome path; drop duplicates to avoid
    # processing the same fasta more than once
    try:
        return df.loc[wildcards.fasta]["path"].drop_duplicates()
    except AttributeError:
        return df.loc[wildcards.fasta]["path"]


def list_fastas(wildcards):
    table = checkpoints.calculate_genome_coverages.get(**wildcards).output[0]
    if table not in fasta_cache:
        df = pd.read_csv(table, sep="\t")
        fasta_cache[table] = df
    df = fasta_cache[table]
    fastas = list(set(df["fasta"]))
    return expand(os.path.join(dir.out.processing, "{fasta}.fasta"), fasta=fastas)
    df = get_fasta_table(wildcards)
    return expand(
        os.path.join(dir.out.processing, "{fasta}.fasta"), fasta=list(set(df.index))
    )


table_cache = {}


def get_value(value, wildcards):
def get_cov_table(wildcards, key, idx_col):
    cov_table = checkpoints.split_contigs.get(**wildcards).output[0]
    if key not in table_cache:
        cov_df = pd.read_csv(cov_table, sep="\t", index_col=idx_col).sort_index()
        table_cache[key] = cov_df
    cov_df = table_cache[key]
    return cov_df


def get_value(value, wildcards):
    vals = (
        f"{wildcards.sample}",
        f"{wildcards.fasta}",
@@ -103,17 +119,8 @@ def get_value(value, wildcards):
    if CIRCULAR:
        idx_col += ["n"]
        vals += (int(wildcards.n),)

    table = checkpoints.split_contigs.get(**wildcards).output[0]
    if table not in table_cache:
        df = pd.read_csv(
            table,
            sep="\t",
            index_col=idx_col,
        ).sort_index()
        table_cache[table] = df
    df = table_cache[table]
    return df.loc[vals, value]
    val_df = get_cov_table(wildcards, "values", idx_col)
    return val_df.loc[vals, value]


def get_asm_summary(wildcards):
@@ -128,10 +135,6 @@ def get_asm_summary(wildcards):
    return table


def get_cov_table(wildcards):
    return checkpoints.split_contigs.get(**wildcards).output[0]


def is_circular():
    if os.path.isfile(INPUT):
        files = [INPUT]
@@ -144,78 +147,19 @@
    return False


def aggregate(wildcards, outdir, level, ext):
    table = checkpoints.split_contigs.get(**wildcards).output[0]
    df = pd.read_csv(table, sep="\t", index_col=["samplename", "fasta"]).sort_index()
    if level == "contig":
        contigs = list(
            df.loc[(wildcards.sample, wildcards.fasta)]["contig"].drop_duplicates()
        )
        if "rotate" in df.columns:
            rotates = int(
                df.loc[(wildcards.sample, wildcards.fasta)]["rotate"]
                .drop_duplicates()
                .values
            )

        if PAIRED and ext != "bam":
            if "rotate" in df.columns:
                return expand(
                    os.path.join(
                        outdir, "{sample}", "{fasta}", "{contig}_{n}{p}.{ext}"
                    ),
                    sample=wildcards.sample,
                    fasta=wildcards.fasta,
                    n=list(range(1, rotates + 1)),
                    p=wildcards.p,
                    contig=contigs,
                    ext=ext,
                )
            else:
                return expand(
                    os.path.join(outdir, "{sample}", "{fasta}", "{contig}{p}.{ext}"),
                    sample=wildcards.sample,
                    fasta=wildcards.fasta,
                    p=wildcards.p,
                    contig=contigs,
                    ext=ext,
                )

        else:
            if "rotate" in df.columns:
                return expand(
                    os.path.join(outdir, "{sample}", "{fasta}", "{contig}_{n}.{ext}"),
                    sample=wildcards.sample,
                    fasta=wildcards.fasta,
                    contig=contigs,
                    n=list(range(1, rotates + 1)),
                    ext=ext,
                )
            else:
                return expand(
                    os.path.join(outdir, "{sample}", "{fasta}", "{contig}.{ext}"),
                    sample=wildcards.sample,
                    fasta=wildcards.fasta,
                    contig=contigs,
                    ext=ext,
                )
    if level == "fasta":
        fastas = list(set(df.loc[wildcards.sample].index))
def aggregate(wildcards, outdir, ext):
    df = get_cov_table(wildcards, "aggregate", ["samplename"])
    files = []
    for row in df.loc[[wildcards.sample]].itertuples():
        prefix = f"{row.contig}"
        if CIRCULAR:
            prefix += f"_{row.n}"
        if PAIRED and ext != "bam":
            return expand(
                os.path.join(outdir, "{sample}", "{fasta}{p}.{ext}"),
                sample=wildcards.sample,
                fasta=fastas,
                p=wildcards.p,
                ext=ext,
            )
        else:
            return expand(
                os.path.join(outdir, "{sample}", "{fasta}.{ext}"),
                sample=wildcards.sample,
                fasta=fastas,
                ext=ext,
            )
            prefix += f"{wildcards.p}"
        files.append(
            os.path.join(outdir, wildcards.sample, row.fasta, f"{prefix}.{ext}")
        )
    return files


def get_header(fa):
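Why fasta_input wraps drop_duplicates() in try/except AttributeError: with a non-unique index, df.loc[key]["path"] returns a Series, but with a single matching row it returns a bare string, which has no drop_duplicates method. A toy illustration with invented paths:

import pandas as pd

df = pd.DataFrame(
    {"fasta": ["a", "a", "b"], "path": ["/data/a.fa", "/data/a.fa", "/data/b.fa"]}
).set_index("fasta")

# Two rows share index "a": .loc gives a Series, so drop_duplicates works.
print(df.loc["a"]["path"].drop_duplicates().tolist())  # ['/data/a.fa']
# Index "b" matches one row: .loc["b"]["path"] is a plain str, and calling
# .drop_duplicates() on it raises AttributeError, hence the fallback branch.
print(df.loc["b"]["path"])  # prints /data/b.fa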
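The rewritten aggregate collapses four expand branches into one row-wise loop over the coverage table. A condensed, runnable model of the new logic; the table contents and the sample/pair values below are mock data, not the workflow's actual columns beyond those shown in the diff:

import os

import pandas as pd

CIRCULAR, PAIRED = True, True

cov = pd.DataFrame(
    {
        "samplename": ["s1", "s1"],
        "fasta": ["fa1", "fa1"],
        "contig": ["c1", "c2"],
        "n": [1, 1],
    }
).set_index("samplename")


def aggregate(sample, p, outdir, ext):
    files = []
    # One expected file per table row: rotation suffix _{n} when CIRCULAR,
    # read-pair suffix when PAIRED (skipped for bam outputs).
    for row in cov.loc[[sample]].itertuples():
        prefix = f"{row.contig}"
        if CIRCULAR:
            prefix += f"_{row.n}"
        if PAIRED and ext != "bam":
            prefix += f"{p}"
        files.append(os.path.join(outdir, sample, row.fasta, f"{prefix}.{ext}"))
    return files


print(aggregate("s1", "1", "short", "fq.gz"))
# ['short/s1/fa1/c1_11.fq.gz', 'short/s1/fa1/c2_11.fq.gz']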
2 changes: 0 additions & 2 deletions mess/workflow/rules/preflight/targets_download.smk
@@ -2,10 +2,8 @@
All target download files are declared here
"""


TargetDownloads = [
    os.path.join(dir.out.base, "uniq_entries.tsv"),
    os.path.join(dir.out.base, "assembly_finder/assembly_summary.tsv"),
    os.path.join(dir.out.base, "assembly_finder/sequence_report.tsv"),
    os.path.join(dir.out.base, "assembly_finder/taxonomy.tsv"),
]