Skip to content

Commit

Permalink
first init
Browse files Browse the repository at this point in the history
  • Loading branch information
alethomas committed Jul 31, 2024
1 parent 5394fbb commit 67f112b
Show file tree
Hide file tree
Showing 6 changed files with 81 additions and 55 deletions.
2 changes: 1 addition & 1 deletion config/pep/samples.csv
Original file line number Diff line number Diff line change
@@ -1 +1 @@
sample_name,fq1,fq2
sample_name,fq1,fq2,technology
59 changes: 44 additions & 15 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os

ILLUMINA = "illumina"
ONT = "ont"

configfile: "config/config.yaml"

Expand All @@ -20,30 +22,57 @@ def get_samples():
return list(pep.sample_table["sample_name"].values)


def get_fastqs(wildcards):
return (
pep.sample_table.loc[wildcards.sample]["fq1"],
pep.sample_table.loc[wildcards.sample]["fq2"],
)
def get_technology(wildcards, sample=None):
if sample is None:
sample = wildcards.sample

return pep.sample_table.loc[sample]["technology"]


def is_ont(wildcards, sample=None):
if sample is None:
return get_technology(wildcards) == ONT
return get_technology(None, sample) == ONT


def get_local_fastqs(wildcards):
path = get_data_path()
return (
"{data}{{date}}/{{sample}}_R1.fastq.gz".format(data=path),
"{data}{{date}}/{{sample}}_R2.fastq.gz".format(data=path),
)
def is_illumina(wildcards, sample=None):
if sample is None:
return get_technology(wildcards) == ILLUMINA
return get_technology(None, sample) == ILLUMINA


def get_fastqs(wildcards):
if is_illumina(wildcards):
return pep.sample_table.loc[wildcards.sample][["fq1", "fq2"]]
elif is_ont(wildcards):
return pep.sample_table.loc[wildcards.sample][["fq1"]]


def get_adapters(wildcards):
return config["adapter-seqs"]


def get_trimmed_fastqs(wildcards):
return [
"results/{date}/qc/fastp/{sample}.1.fastq.gz",
"results/{date}/qc/fastp/{sample}.2.fastq.gz",
]
if is_illumina(wildcards):
return [
"results/{date}/qc/fastp-pe/{sample}.1.fastq.gz",
"results/{date}/qc/fastp-pe/{sample}.2.fastq.gz",
]
elif is_ont(wildcards):
return ["results/{date}/qc/fastp-se/{sample}.fastq.gz"]


def get_fastp_results(wildcards):
"""Returns paths of files to aggregate the fastp results for the multiqc rule."""
# fastp is only used on Illumina and Ion Torrent data
files = []
samples = get_samples()
for sample in samples:
if is_illumina(None, sample):
files.append("results/{{date}}/qc/fastp-pe/{sample}.fastp.json".format(sample=sample))
elif is_ont(None, sample):
files.append("results/{{date}}/qc/fastp-se/{sample}.fastp.json".format(sample=sample))
return files


def get_trimmed_fastq(wildcards):
Expand Down
57 changes: 27 additions & 30 deletions workflow/rules/qc.smk
Original file line number Diff line number Diff line change
@@ -1,44 +1,43 @@
RAW_DATA_PATH = get_data_path()


rule local_fastqs:
rule fastp_pe:
input:
fastqs=get_fastqs,
sample=get_fastqs,
output:
raw1=temp(f"{RAW_DATA_PATH}{{date}}/{{sample}}_R1.fastq.gz"),
raw2=temp(f"{RAW_DATA_PATH}{{date}}/{{sample}}_R2.fastq.gz"),
trimmed=temp(
[
"results/{date}/qc/fastp-pe/{sample}.1.fastq.gz",
"results/{date}/qc/fastp-pe/{sample}.2.fastq.gz",
]
),
html=temp("results/{date}/qc/fastp-pe/{sample}.html"),
json=temp("results/{date}/qc/fastp-pe/{sample}.fastp.json"),
params:
outdir=lambda wildcards, output: Path(output.raw1).parent,
adapters=get_adapters,
extra="--qualified_quality_phred {phred} --length_required {minlen}".format(
phred=(config["quality-criteria"]["min-PHRED"]),
minlen=(config["quality-criteria"]["min-length-reads"]),
),
log:
"logs/{date}/copy_data/{sample}.log",
conda:
"../envs/unix.yaml"
shell:
"(mkdir -p {params.outdir} && "
"cp -v {input.fastqs[0]} {output.raw1} && "
"cp -v {input.fastqs[1]} {output.raw2}) > {log} 2>&1"
"logs/{date}/fastp/fastp-pe/{sample}.log",
threads: 2
wrapper:
"v3.3.3/bio/fastp"


rule fastp:
rule fastp_se:
input:
sample=get_local_fastqs,
sample=get_fastqs,
output:
trimmed=temp(
[
"results/{date}/qc/fastp/{sample}.1.fastq.gz",
"results/{date}/qc/fastp/{sample}.2.fastq.gz",
]
),
html=temp("results/{date}/qc/fastp/{sample}.html"),
json=temp("results/{date}/qc/fastp/{sample}.fastp.json"),
trimmed=temp("results/{date}/qc/fastp-se/{sample}.fastq.gz"),
html=temp("results/{date}/qc/fastp-se/{sample}.html"),
json=temp("results/{date}/qc/fastp-se/{sample}.fastp.json"),
params:
adapters=get_adapters,
extra="--qualified_quality_phred {phred} --length_required {minlen}".format(
phred=(config["quality-criteria"]["min-PHRED"]),
minlen=(config["quality-criteria"]["min-length-reads"]),
),
log:
"logs/{date}/qc/fastp/{sample}.log",
"results/{date}/qc/fastp-se/{sample}.log",
threads: 2
wrapper:
"v3.3.3/bio/fastp"
Expand All @@ -62,12 +61,10 @@ rule fastqc:
rule multiqc:
input:
expand(
[
"results/{{date}}/qc/fastqc/{sample}_fastqc.zip",
"results/{{date}}/qc/fastp/{sample}.fastp.json",
],
"results/{{date}}/qc/fastqc/{sample}_fastqc.zip",
sample=get_samples(),
),
get_fastp_results,
output:
report(
"results/{date}/report/qc/multiqc.html",
Expand Down
5 changes: 1 addition & 4 deletions workflow/rules/report.smk
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
rule qc_diversity_summary:
input:
jsons=expand(
"results/{{date}}/qc/fastp/{sample}.fastp.json",
sample=get_samples(),
),
jsons=lambda wildcards: get_fastp_results(wildcards),
stats=expand(
"results/{{date}}/contamination/{sample}_stats.txt", sample=get_samples()
),
Expand Down
5 changes: 4 additions & 1 deletion workflow/rules/species_diversity.smk
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ rule kraken2:
outfile=temp("results/{date}/diversity/kraken_outfiles/{sample}_outfile.tsv"),
params:
db=lambda wildcards, input: Path(input.hfile).parent,
flag=lambda wildcards: "--paired"
if is_illumina(wildcards)
else ""
threads: 32
log:
"logs/{date}/kraken2_run/{sample}.log",
Expand All @@ -57,7 +60,7 @@ rule kraken2:
conda:
"../envs/kraken_based.yaml"
shell:
"kraken2 --db {params.db} --threads {threads} --quick --paired "
"kraken2 --db {params.db} --threads {threads} --quick {params.flag} "
"--output {output.outfile} --report {output.report} "
"--gzip-compressed {input.fastqs} > {log} 2>&1"

Expand Down
8 changes: 4 additions & 4 deletions workflow/scripts/create_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,10 +256,10 @@ def save_summary_csv(domain_abundance_df, human_cont_df, read_quality_df, outfil

df_all_for_csv = pd.concat([domain_abundance_for_csv, human_cont_for_csv], axis=1)

header = ["Human", "Bacteria", "Eukaryota", "Archaea", "Viruses"]
new_cols = [s + " (%)" for s in header]
df_all_for_csv = df_all_for_csv[header]
df_all_for_csv.columns = new_cols
# header = ["Human", "Bacteria", "Eukaryota", "Archaea", "Viruses"]
# new_cols = [s + " (%)" for s in header]
# df_all_for_csv = df_all_for_csv[header]
# df_all_for_csv.columns = new_cols

df_all_for_csv = df_all_for_csv.mul(100)
df_all_for_csv = df_all_for_csv.astype("float64").round(3)
Expand Down

0 comments on commit 67f112b

Please sign in to comment.