Skip to content

Commit

Permalink
Merge pull request #29 from metagenlab/dev
Browse files Browse the repository at this point in the history
Fix duplicated contig concatenation
  • Loading branch information
farchaab authored Oct 11, 2024
2 parents 3dc4635 + e4500e1 commit 152d9f6
Show file tree
Hide file tree
Showing 15 changed files with 107 additions and 55 deletions.
7 changes: 7 additions & 0 deletions .github/workflows/build-docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@ on:
- ".github/workflows/build-docs.yml"
- "docs/**"
- "mkdocs.yml"
- "!Dockerfile"
- "!.github/workflows/docker-publish.yml"
- "!.github/workflows/unit-tests.yml"
- "!tests/**"
- "!mess/**"
- "!setup.py"
- "!README.md"
# Cancel if a newer run is started
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/docker-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ on:
- "tests/**"
- "mess/**"
- "setup.py"
- "!docs/**"
- "!README.md"

release:
types: [published]

Expand Down
26 changes: 17 additions & 9 deletions .github/workflows/unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,24 @@ name: Tests

on:
push:
branches:
- main
paths-ignore:
- "docs/**"
- "*.md"
branches: [main]
paths:
- ".github/workflows/unit-tests.yml"
- "tests/**"
- "mess/**"
- "setup.py"
- "!.github/workflows/build-docs.yml"
- "!docs/**"
- "!mkdocs.yml"
pull_request:
paths-ignore:
- "docs/**"
- "*.md"

paths:
- ".github/workflows/unit-tests.yml"
- "tests/**"
- "mess/**"
- "setup.py"
- "!.github/workflows/build-docs.yml"
- "!docs/**"
- "!mkdocs.yml"

permissions:
contents: read
Expand Down
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ input["samples.tsv
or
samples/*.tsv"] --> taxons
subgraph genome_download["`**genome download**`"]
subgraph genome_download["genome download"]
dlchoice{download ?}
taxons["taxons or
accessions"] --> dlchoice
Expand All @@ -52,6 +52,7 @@ abundances["abundances
(sequence, taxonomic)"] --> depth
end
style community_design color:#15161a
style community_design color:#15161a
fasta --> simulator
depth --> simulator
Expand All @@ -66,6 +67,7 @@ simulator --> CAMI-profile
classDef red fill:#faeaea,color:#fff,stroke:#333;
classDef blue fill:#eaecfa,color:#fff,stroke:#333;
class genome_download blue
class community_design red
```
## :books: Documentation
Expand Down Expand Up @@ -146,11 +148,15 @@ On average, using `samples.tsv` (see [table](#arrow_right-input)), MeSS runs in
| 1 | ff/0d03b1 | 73355 | MESS (1) | COMPLETED | 0 | 2024-09-04 12:55:12.903 | 1m 52s | 1m 52s | 112.6% | 1.7 GB | 8.8 GB | 3.5 GB | 2.4 GB |
| 1 | 07/d352bf | 83576 | MESS (1) | COMPLETED | 0 | 2024-09-04 12:57:30.600 | 1m 50s | 1m 50s | 113.2% | 1.7 GB | 8.9 GB | 3.5 GB | 2.4 GB |



> [!NOTE]
> Average resources usage measured 3 times with one CPU (within a [nextflow](https://github.com/nextflow-io/nextflow) process)
>
> Resource usage was measured excluding dependencies deployment time (conda env creation or container pulling)


More details on resource usage in the [documentation](https://metagenlab.github.io/MeSS/benchmarks/resource-usage/)


Expand Down
12 changes: 9 additions & 3 deletions mess/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,9 @@ def cli():
"-i",
"--input",
help="Path to input table(s)",
type=click.Path(exists=True, file_okay=True, dir_okay=True, readable=True),
type=click.Path(
exists=True, file_okay=True, dir_okay=True, readable=True, resolve_path=True
),
required=True,
)
@download_options
Expand Down Expand Up @@ -323,7 +325,9 @@ def run(
"-i",
"--input",
help="Path to input table(s)",
type=click.Path(exists=True, file_okay=True, dir_okay=True, readable=True),
type=click.Path(
exists=True, file_okay=True, dir_okay=True, readable=True, resolve_path=True
),
required=True,
)
@download_options
Expand Down Expand Up @@ -353,7 +357,9 @@ def download(
"-i",
"--input",
help="Path to input table(s)",
type=click.Path(exists=True, file_okay=True, dir_okay=True, readable=True),
type=click.Path(
exists=True, file_okay=True, dir_okay=True, readable=True, resolve_path=True
),
required=True,
)
@sim_options
Expand Down
2 changes: 1 addition & 1 deletion mess/mess.VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.9.1
0.10.0
4 changes: 2 additions & 2 deletions mess/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,13 +309,13 @@ def sim_options(func):
click.option(
"--mu",
help="Mu of lognormal dist",
type=int,
type=float,
default=1,
),
click.option(
"--sigma",
help="Sigma of lognormal dist",
type=int,
type=float,
default=0,
),
click.option(
Expand Down
2 changes: 1 addition & 1 deletion mess/workflow/envs/containers.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
art: docker://quay.io/biocontainers/art:2016.06.05--heacdb12_11
assembly_finder: docker://ghcr.io/metagenlab/assembly_finder:v0.7.6
assembly_finder: docker://ghcr.io/metagenlab/assembly_finder:v0.7.7
bioconvert: docker://quay.io/biocontainers/bioconvert:1.1.1--pyhdfd78af_0
curl: docker://quay.io/biocontainers/curl:7.80.0
pigz: docker://quay.io/biocontainers/pigz:2.8
Expand Down
4 changes: 2 additions & 2 deletions mess/workflow/rules/download/assembly_finder.smk
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ rule get_unique_entries:
if "nb" not in df.columns:
df["nb"] = [params.nb] * len(df)
df[["taxon", "nb"]].drop_duplicates().to_csv(
output[0], sep="\t", index=None
output[0], sep="\t", index=False
)
else:
df[["accession"]].drop_duplicates().to_csv(
output[0], sep="\t", index=None, header=False
output[0], sep="\t", index=False, header=False
)


Expand Down
2 changes: 1 addition & 1 deletion mess/workflow/rules/preflight/functions.smk
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def aggregate(wildcards, outdir, level, ext):
if isinstance(contigs, str):
contigs = [contigs]
else:
contigs = list(contigs)
contigs = list(set(contigs))
if PAIRED and ext != "bam":
return expand(
os.path.join(outdir, "{sample}", "{fasta}", "{contig}{p}.{ext}"),
Expand Down
4 changes: 3 additions & 1 deletion mess/workflow/rules/processing/reads.smk
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,7 @@ rule get_tax_profile:
paired=PAIRED,
run:
tax_df = pd.read_csv(input.tax, sep="\t")
tax_df = tax_df[tax_df.samplename == wildcards.sample]
cov_df = pd.read_csv(input.cov, sep="\t")
cov_df.rename(columns={"#rname": "contig"}, inplace=True)
merge_df = tax_df.merge(cov_df)
Expand All @@ -332,9 +333,10 @@ rule get_tax_profile:
for col in ["numreads", "meandepth"]:
if col == "numreads":
out = output.seq_abundance
df = merge_df.groupby("tax_id")[col].sum().reset_index()
elif col == "meandepth":
out = output.tax_abundance
df = merge_df.groupby("tax_id")[col].mean().reset_index()
df = merge_df.groupby("tax_id")[col].mean().reset_index()
df["abundance"] = df[col] / df[col].sum()
df[["tax_id", "abundance"]].to_csv(
out, sep="\t", header=False, index=False
Expand Down
78 changes: 49 additions & 29 deletions mess/workflow/scripts/calculate_cov.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,23 +14,28 @@ def get_lognormal_dist(df, mu, sigma):
"""
function for simulating lognormal sequence distribution
"""
df["lognormal"] = np.random.lognormal(mean=mu, sigma=sigma, size=len(df))
df["seq_abundance"] = df["lognormal"] / df["lognormal"].sum()
return df


def get_even_dist(table, cols):
dfs = []
for sample in set(df["samplename"]):
subdf = df[df["samplename"] == sample]
subdf.loc[:, "lognormal"] = np.random.lognormal(
mean=mu, sigma=sigma, size=len(subdf)
)
subdf.loc[:, "seq_abundance"] = subdf["lognormal"] / subdf["lognormal"].sum()
dfs.append(subdf)
return pd.concat(dfs)


def get_even_dist(table):
"""
function that calculates even abundances across taxonomy
"""
abundances = (
table[cols]
table.groupby(["samplename"])["tax_id"]
.value_counts(normalize=True)
.reset_index()
.rename(columns={"proportion": "tax_abundance"})
)

return table.merge(abundances)
counts = table.groupby("samplename")["tax_id"].value_counts().reset_index()
return table.merge(abundances).merge(counts)


"""
Expand Down Expand Up @@ -78,54 +83,65 @@ def get_even_dist(table, cols):
# Get total bases
bases = parse_size(snakemake.params.bases)


if "fasta" not in df.columns:
df["fasta"] = df["accession"]

# Calculate proportion with dist
if snakemake.params.dist == "even":
df = get_even_dist(df, ["tax_id"])
df["sum_seq_length"] = df.groupby("samplename")["total_sequence_length"].transform(
"sum"
)
df["sum_cov"] = bases / df["sum_seq_length"]
df["cov_sim"] = df["sum_cov"] * df["tax_abundance"]
df = get_even_dist(df)
df["tax_abundance"] = df["proportion"] / df["count"]
df["genome_bases"] = df["total_sequence_length"] * df["tax_abundance"]
df["sum_genome_bases"] = df.groupby("samplename")["genome_bases"].transform("sum")
df["cov_obtained"] = bases / df["sum_genome_bases"]
df["cov_sim"] = df["tax_abundance"] * df["cov_obtained"]
df["sum_cov"] = df.groupby("samplename")["cov_sim"].transform("sum")
df["bases"] = df["cov_sim"] * df["total_sequence_length"]
df["reads"] = df["bases"] / snakemake.params.read_len
df["sum_bases"] = df.groupby("samplename")["bases"].transform("sum")
df["seq_abundance"] = df["bases"] / df["sum_bases"]


elif snakemake.params.dist == "lognormal":
df = get_lognormal_dist(df, snakemake.params.mu, snakemake.params.sigma)
df = get_lognormal_dist(df, mu=snakemake.params.mu, sigma=snakemake.params.sigma)
df["bases"] = df["seq_abundance"] * bases
df["reads"] = df["bases"] / (snakemake.params.read_len * p)
df["reads"] = df["bases"] / snakemake.params.read_len
df["cov_sim"] = df["bases"] / df["total_sequence_length"]
df["sum_cov"] = df.groupby("samplename")["cov_sim"].transform("sum")
df["tax_abundance"] = df["cov_sim"] / df["sum_cov"]
df["sum_bases"] = df.groupby("samplename")["bases"].transform("sum")
df["reads"] = df["bases"] / (snakemake.params.read_len * p)
df["seq_abundance"] = df["bases"] / df["sum_bases"]
else:
if "tax_abundance" in entry_df.columns:
df["cov_obtained"] = bases / df["total_sequence_length"]
df["sum_cov_obtained"] = df.groupby("samplename")["cov_obtained"].transform("sum")
df["cov_sim"] = df["tax_abundance"] * df["sum_cov_obtained"]
df["genome_bases"] = df["total_sequence_length"] * df["tax_abundance"]
df["sum_genome_bases"] = df.groupby("samplename")["genome_bases"].transform(
"sum"
)
df["cov_obtained"] = bases / df["sum_genome_bases"]
df["cov_sim"] = df["tax_abundance"] * df["cov_obtained"]
df["sum_cov"] = df.groupby("samplename")["cov_sim"].transform("sum")
df["bases"] = df["cov_sim"] * df["total_sequence_length"]
df["reads"] = df["bases"] / snakemake.params.read_len
df["sum_bases"] = df.groupby("samplename")["bases"].transform("sum")
df["reads"] = df["bases"] / (snakemake.params.read_len * p)
df["seq_abundance"] = df["bases"] / df["sum_bases"]

if "seq_abundance" in entry_df.columns:
df["bases"] = df["seq_abundance"] * bases
df["reads"] = df["bases"] / (snakemake.params.read_len * p)
df["reads"] = df["bases"] / snakemake.params.read_len
df["cov_sim"] = df["bases"] / df["total_sequence_length"]
df["sum_cov"] = df.groupby("samplename")["cov_sim"].transform("sum")
df["tax_abundance"] = df["cov_sim"] / df["sum_cov"]

if "reads" in entry_df.columns:
df["bases"] = df["reads"] * (snakemake.params.read_len * p)
df["bases"] = df["reads"] * snakemake.params.read_len * p
df["sum_bases"] = df.groupby("samplename")["bases"].transform("sum")
df["seq_abundance"] = df["bases"] / df["sum_bases"]
df["cov_sim"] = df["bases"] / df["total_sequence_length"]
df["sum_cov"] = df.groupby("samplename")["cov_sim"].transform("sum")
df["tax_abundance"] = df["cov_sim"] / df["sum_cov"]

if "bases" in entry_df.columns:
df["reads"] = df["bases"] / (snakemake.params.read_len * p)
df["reads"] = df["bases"] / snakemake.params.read_len
df["sum_bases"] = df.groupby("samplename")["bases"].transform("sum")
df["seq_abundance"] = df["bases"] / df["sum_bases"]
df["cov_sim"] = df["bases"] / df["total_sequence_length"]
Expand All @@ -137,11 +153,9 @@ def get_even_dist(table, cols):
df["tax_abundance"] = df["cov_sim"] / df["sum_cov"]
df["bases"] = df["cov_sim"] * df["total_sequence_length"]
df["sum_bases"] = df.groupby("samplename")["bases"].transform("sum")
df["reads"] = df["bases"] / (snakemake.params.read_len * p)
df["reads"] = df["bases"] / snakemake.params.read_len
df["seq_abundance"] = df["bases"] / df["sum_bases"]

if "fasta" not in df.columns:
df["fasta"] = df["accession"]

df["seed"] = random.sample(range(1, 1000000), len(df))

Expand All @@ -159,6 +173,12 @@ def get_even_dist(table, cols):
"tax_abundance",
"seed",
]

df["reads"] = df["reads"].apply(lambda x: int(round(x)))
df["bases"] = df["bases"].apply(lambda x: int(round(x)))
df["tax_abundance"] = df["tax_abundance"].apply(lambda x: round(x, 3))
df["seq_abundance"] = df["seq_abundance"].apply(lambda x: round(x, 3))

df = df.astype(
{"seed": int, "tax_id": int, "total_sequence_length": int, "number_of_contigs": int}
)
Expand Down
4 changes: 2 additions & 2 deletions mess/workflow/scripts/replicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
if len(snakemake.params.replicates) < 2:
samples_df.rename(columns={"sample": "samplename"}, inplace=True)
else:
samples_df["rep"] = snakemake.params.replicates * len(samples_df)
samples_df["rep"] = snakemake.params.replicates * len(df)
samples_df["samplename"] = [
sample + "-" + str(rep)
for sample, rep in zip(samples_df["sample"], samples_df["rep"])
Expand All @@ -30,4 +30,4 @@
continue

samples_df.reset_index(inplace=True, drop=True)
samples_df.to_csv(snakemake.output[0], sep="\t", index=None)
samples_df.to_csv(snakemake.output[0], sep="\t", index=False)
2 changes: 1 addition & 1 deletion mess/workflow/scripts/samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,4 @@
df["sample"] = [sample] * len(df)

dfs = pd.concat(dfs).reset_index(drop=True).sort_values("sample")
dfs.to_csv(snakemake.output[0], sep="\t", index=None)
dfs.to_csv(snakemake.output[0], sep="\t", index=False)
2 changes: 1 addition & 1 deletion mess/workflow/scripts/split_contigs.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,5 +25,5 @@ def split_fasta(fa, outdir):
contig_df = pd.DataFrame.from_records(id2fa)
df = pd.merge(contig_df, cov_df, how="left", on="fasta")
df[["samplename", "fasta", "contig", "tax_id", "seed", "cov_sim"]].to_csv(
snakemake.output.tsv, sep="\t", index=None
snakemake.output.tsv, sep="\t", index=False
)

0 comments on commit 152d9f6

Please sign in to comment.