diff --git a/config/pep/samples.csv b/config/pep/samples.csv
index 0f54e8b..6c525a3 100644
--- a/config/pep/samples.csv
+++ b/config/pep/samples.csv
@@ -1 +1 @@
-sample_name,fq1,fq2
+sample_name,fq1,fq2,technology
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 5300f1a..f7f14c4 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -1,5 +1,7 @@
 import os
 
+ILLUMINA = "illumina"
+ONT = "ont"
 
 configfile: "config/config.yaml"
 
@@ -20,19 +22,30 @@ def get_samples():
     return list(pep.sample_table["sample_name"].values)
 
 
-def get_fastqs(wildcards):
-    return (
-        pep.sample_table.loc[wildcards.sample]["fq1"],
-        pep.sample_table.loc[wildcards.sample]["fq2"],
-    )
+def get_technology(wildcards, sample=None):
+    if sample is None:
+        sample = wildcards.sample
+
+    return pep.sample_table.loc[sample]["technology"]
+
+
+def is_ont(wildcards, sample=None):
+    if sample is None:
+        return get_technology(wildcards) == ONT
+    return get_technology(None, sample) == ONT
 
 
-def get_local_fastqs(wildcards):
-    path = get_data_path()
-    return (
-        "{data}{{date}}/{{sample}}_R1.fastq.gz".format(data=path),
-        "{data}{{date}}/{{sample}}_R2.fastq.gz".format(data=path),
-    )
+def is_illumina(wildcards, sample=None):
+    if sample is None:
+        return get_technology(wildcards) == ILLUMINA
+    return get_technology(None, sample) == ILLUMINA
+
+
+def get_fastqs(wildcards):
+    if is_illumina(wildcards):
+        return pep.sample_table.loc[wildcards.sample][["fq1", "fq2"]]
+    elif is_ont(wildcards):
+        return pep.sample_table.loc[wildcards.sample][["fq1"]]
 
 
 def get_adapters(wildcards):
@@ -40,10 +53,26 @@ def get_adapters(wildcards):
 
 
 def get_trimmed_fastqs(wildcards):
-    return [
-        "results/{date}/qc/fastp/{sample}.1.fastq.gz",
-        "results/{date}/qc/fastp/{sample}.2.fastq.gz",
-    ]
+    if is_illumina(wildcards):
+        return [
+            "results/{date}/qc/fastp-pe/{sample}.1.fastq.gz",
+            "results/{date}/qc/fastp-pe/{sample}.2.fastq.gz",
+        ]
+    elif is_ont(wildcards):
+        return ["results/{date}/qc/fastp-se/{sample}.fastq.gz"]
+
+
+def get_fastp_results(wildcards):
+    """Returns paths of files to aggregate the fastp results for the multiqc rule."""
+    # fastp reports exist for both Illumina (paired-end) and ONT (single-end) samples
+    files = []
+    samples = get_samples()
+    for sample in samples:
+        if is_illumina(None, sample):
+            files.append("results/{{date}}/qc/fastp-pe/{sample}.fastp.json".format(sample=sample))
+        elif is_ont(None, sample):
+            files.append("results/{{date}}/qc/fastp-se/{sample}.fastp.json".format(sample=sample))
+    return files
 
 
 def get_trimmed_fastq(wildcards):
diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk
index ffc22c3..3e03b18 100644
--- a/workflow/rules/qc.smk
+++ b/workflow/rules/qc.smk
@@ -1,36 +1,35 @@
-RAW_DATA_PATH = get_data_path()
-
-
-rule local_fastqs:
+rule fastp_pe:
     input:
-        fastqs=get_fastqs,
+        sample=get_fastqs,
     output:
-        raw1=temp(f"{RAW_DATA_PATH}{{date}}/{{sample}}_R1.fastq.gz"),
-        raw2=temp(f"{RAW_DATA_PATH}{{date}}/{{sample}}_R2.fastq.gz"),
+        trimmed=temp(
+            [
+                "results/{date}/qc/fastp-pe/{sample}.1.fastq.gz",
+                "results/{date}/qc/fastp-pe/{sample}.2.fastq.gz",
+            ]
+        ),
+        html=temp("results/{date}/qc/fastp-pe/{sample}.html"),
+        json=temp("results/{date}/qc/fastp-pe/{sample}.fastp.json"),
     params:
-        outdir=lambda wildcards, output: Path(output.raw1).parent,
+        adapters=get_adapters,
+        extra="--qualified_quality_phred {phred} --length_required {minlen}".format(
+            phred=(config["quality-criteria"]["min-PHRED"]),
+            minlen=(config["quality-criteria"]["min-length-reads"]),
+        ),
     log:
"logs/{date}/copy_data/{sample}.log", - conda: - "../envs/unix.yaml" - shell: - "(mkdir -p {params.outdir} && " - "cp -v {input.fastqs[0]} {output.raw1} && " - "cp -v {input.fastqs[1]} {output.raw2}) > {log} 2>&1" + "logs/{date}/fastp/fastp-pe/{sample}.log", + threads: 2 + wrapper: + "v3.3.3/bio/fastp" -rule fastp: +rule fastp_se: input: - sample=get_local_fastqs, + sample=get_fastqs, output: - trimmed=temp( - [ - "results/{date}/qc/fastp/{sample}.1.fastq.gz", - "results/{date}/qc/fastp/{sample}.2.fastq.gz", - ] - ), - html=temp("results/{date}/qc/fastp/{sample}.html"), - json=temp("results/{date}/qc/fastp/{sample}.fastp.json"), + trimmed=temp("results/{date}/qc/fastp-se/{sample}.fastq.gz"), + html=temp("results/{date}/qc/fastp-se/{sample}.html"), + json=temp("results/{date}/qc/fastp-se/{sample}.fastp.json"), params: adapters=get_adapters, extra="--qualified_quality_phred {phred} --length_required {minlen}".format( @@ -38,7 +37,7 @@ rule fastp: minlen=(config["quality-criteria"]["min-length-reads"]), ), log: - "logs/{date}/qc/fastp/{sample}.log", + "results/{date}/qc/fastp-se/{sample}.log", threads: 2 wrapper: "v3.3.3/bio/fastp" @@ -62,12 +61,10 @@ rule fastqc: rule multiqc: input: expand( - [ - "results/{{date}}/qc/fastqc/{sample}_fastqc.zip", - "results/{{date}}/qc/fastp/{sample}.fastp.json", - ], + "results/{{date}}/qc/fastqc/{sample}_fastqc.zip", sample=get_samples(), ), + get_fastp_results, output: report( "results/{date}/report/qc/multiqc.html", diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk index 5c5851c..ba7efe9 100644 --- a/workflow/rules/report.smk +++ b/workflow/rules/report.smk @@ -1,9 +1,6 @@ rule qc_diversity_summary: input: - jsons=expand( - "results/{{date}}/qc/fastp/{sample}.fastp.json", - sample=get_samples(), - ), + jsons=lambda wildcards: get_fastp_results(wildcards), stats=expand( "results/{{date}}/contamination/{sample}_stats.txt", sample=get_samples() ), diff --git a/workflow/rules/species_diversity.smk b/workflow/rules/species_diversity.smk index a39688d..e15ba7c 100644 --- a/workflow/rules/species_diversity.smk +++ b/workflow/rules/species_diversity.smk @@ -49,6 +49,9 @@ rule kraken2: outfile=temp("results/{date}/diversity/kraken_outfiles/{sample}_outfile.tsv"), params: db=lambda wildcards, input: Path(input.hfile).parent, + flag=lambda wildcards: "--paired" + if is_illumina(wildcards) + else "" threads: 32 log: "logs/{date}/kraken2_run/{sample}.log", @@ -57,7 +60,7 @@ rule kraken2: conda: "../envs/kraken_based.yaml" shell: - "kraken2 --db {params.db} --threads {threads} --quick --paired " + "kraken2 --db {params.db} --threads {threads} --quick {params.flag} " "--output {output.outfile} --report {output.report} " "--gzip-compressed {input.fastqs} > {log} 2>&1" diff --git a/workflow/scripts/create_plots.py b/workflow/scripts/create_plots.py index f630b46..0a35947 100644 --- a/workflow/scripts/create_plots.py +++ b/workflow/scripts/create_plots.py @@ -256,10 +256,10 @@ def save_summary_csv(domain_abundance_df, human_cont_df, read_quality_df, outfil df_all_for_csv = pd.concat([domain_abundance_for_csv, human_cont_for_csv], axis=1) - header = ["Human", "Bacteria", "Eukaryota", "Archaea", "Viruses"] - new_cols = [s + " (%)" for s in header] - df_all_for_csv = df_all_for_csv[header] - df_all_for_csv.columns = new_cols + # header = ["Human", "Bacteria", "Eukaryota", "Archaea", "Viruses"] + # new_cols = [s + " (%)" for s in header] + # df_all_for_csv = df_all_for_csv[header] + # df_all_for_csv.columns = new_cols df_all_for_csv = df_all_for_csv.mul(100) 
df_all_for_csv = df_all_for_csv.astype("float64").round(3)
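
Note on the new samples.csv schema: the added technology column is compared against the ILLUMINA/ONT constants in common.smk, so its values must be "illumina" or "ont". A hypothetical sheet under the new header might look as follows (sample names and fastq paths are illustrative only; fq2 can presumably be left empty for ONT samples, since get_fastqs selects only fq1 for them):

sample_name,fq1,fq2,technology
sample01,data/sample01_R1.fastq.gz,data/sample01_R2.fastq.gz,illumina
sample02,data/sample02.fastq.gz,,ont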