From 9c148c5e1d2e2a5e99d45ae345c5221aa81f043a Mon Sep 17 00:00:00 2001
From: Josefa Welling <82578997+josefawelling@users.noreply.github.com>
Date: Thu, 18 Jan 2024 14:08:55 +0100
Subject: [PATCH] feat: auto creation of sample sheet (#8)
* feat: auto creation of sample sheet
* formatting
---
README.md | 18 ++++++--
config/config.yaml | 17 ++++++-
config/pep/samples.csv | 3 --
workflow/Snakefile | 15 +++---
workflow/rules/preprocessing.smk | 12 +++++
workflow/scripts/create_sample_sheet.py | 61 +++++++++++++++++++------
6 files changed, 99 insertions(+), 27 deletions(-)
create mode 100644 workflow/rules/preprocessing.smk
diff --git a/README.md b/README.md
index 56b6d6b..f89bce1 100644
--- a/README.md
+++ b/README.md
@@ -50,12 +50,24 @@ To configure this workflow, modify `config/config.yaml` according to your needs,
#### Sample sheet
The sample sheet contains all samples to be analyzed.
-Samples to be analyzed must be added manually to the sample sheet.
+
+#### Auto creation
+
+You can choose to automatically create a sample sheet containing all samples in a specified directory (configured in `config/config.yaml`). Only `fastq.gz` files are taken into account. Additionally, there is the option to rename the sequencer's output FASTQ files during this step, e.g. from `sampleID_S40_L001_R1_001.fastq.gz` to `sampleID_R1.fastq.gz`.
+To create the sample sheet and provide it for the workflow, run:
+
+```sh
+ snakemake --cores all --use-conda create_sample_sheet
+```
+
+#### Manual creation or editing
+
+Samples to be analyzed can also be added manually to the sample sheet.
For each sample, a new line in `config/pep/samples.csv` with the following content has to be defined:
- **sample_name**: name or identifier of sample
-- **fq1**: path to read 1 in FASTQ format
-- **fq2**: path to read 2 in FASTQ format
+- **fq1**: path to read 1 in gzip-compressed FASTQ format
+- **fq2**: path to read 2 in gzip-compressed FASTQ format
### Step 4: Run workflow
diff --git a/config/config.yaml b/config/config.yaml
index 57115f2..8b702bd 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -1,14 +1,28 @@
pepfile: config/pep/config.yaml
+## this will be used as the name of the results folder and can be found in the report
run-date: "23_12_18"
+## adapter sequences used for trimming
adapter-seqs: "-a GCGAATTTCGACGATCGTTGCATTAACTCGCGAA -g AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"
+## Option for the auto-creation of the sample sheet
+sample-sheet:
+ # False or True : Should the sample sheet be auto-created?
+ auto-creation: True
+ # False or True : Should the sample fastqs be renamed?
+ # e.g. from sampleID_S40_L001_R1_001.fastq.gz to sampleID_R1.fastq.gz
+ rename-sample-files: False
+ # path to the fastq files of the samples for the sample sheet
+ data-path: "/groups/ds/metagenomes/run_folder/"
+
data-handling:
- # path to store data within the workflow
+ # path to store input fastq data within the workflow
data: "data/"
+ # path to store databases and reference genomes used within the workflow
resources: "resources/"
+## quality criteria used for filtering
quality-criteria:
# minimal length of acceptable reads
min-length-reads: 15
@@ -20,4 +34,5 @@ human-ref: "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_late
kraken:
download-path: "https://genome-idx.s3.amazonaws.com/kraken/k2_standard_08gb_20231009.tar.gz"
+## string term used for formatting output tables
tablular-config: '/>github<\/a>/a \\t\t\t\n\t\t\t
\n\t\t\t\t'
diff --git a/config/pep/samples.csv b/config/pep/samples.csv
index 9d937f3..0f54e8b 100644
--- a/config/pep/samples.csv
+++ b/config/pep/samples.csv
@@ -1,4 +1 @@
sample_name,fq1,fq2
-115_L001,/groups/ds/resistance_cefiderocol/data/illumina/115_L001_R1.fastq.gz,/groups/ds/resistance_cefiderocol/data/illumina/115_L001_R2.fastq.gz
-139_L001,/groups/ds/resistance_cefiderocol/data/illumina/139_L001_R1.fastq.gz,/groups/ds/resistance_cefiderocol/data/illumina/139_L001_R2.fastq.gz
-I15566-L1,/projects/pig-muenster/Metagenome_Kiel/fastq/I15566-L1_R1.fastq.gz,/projects/pig-muenster/Metagenome_Kiel/fastq/I15566-L1_R2.fastq.gz
\ No newline at end of file
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 05add77..6c2c831 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -17,6 +17,11 @@ include: "rules/species_diversity.smk"
include: "rules/report.smk"
+if config["sample-sheet"]["auto-creation"]:
+
+ include: "rules/preprocessing.smk"
+
+
DATE = get_run_date()
@@ -34,12 +39,10 @@ rule all:
onsuccess:
- shell("tar cpfz results/{DATE}/{DATE}_results.tar.gz results/{DATE}/report/")
- print(
- "Workflow finished without an error. You can find the results in {date}_results.tar.gz".format(
- date=DATE
- )
- )
+ print("Workflow finished without an error.")
+ if os.path.exists("results/{date}/report/{date}_report.zip".format(date=DATE)):
+ shell("tar cpfz results/{DATE}/{DATE}_results.tar.gz results/{DATE}/report/")
+ print("You can find the results in {date}_results.tar.gz".format(date=DATE))
onerror:
diff --git a/workflow/rules/preprocessing.smk b/workflow/rules/preprocessing.smk
new file mode 100644
index 0000000..9aded58
--- /dev/null
+++ b/workflow/rules/preprocessing.smk
@@ -0,0 +1,12 @@
+rule create_sample_sheet:
+ input:
+ "config/pep/samples.csv",
+ params:
+ inpath=config["sample-sheet"]["data-path"],
+ renaming=config["sample-sheet"]["rename-sample-files"],
+ log:
+ "logs/create_sample_sheet.log",
+ conda:
+ "../envs/python.yaml"
+ script:
+ "../scripts/create_sample_sheet.py"
diff --git a/workflow/scripts/create_sample_sheet.py b/workflow/scripts/create_sample_sheet.py
index 3ab7389..9f15b6b 100644
--- a/workflow/scripts/create_sample_sheet.py
+++ b/workflow/scripts/create_sample_sheet.py
@@ -1,27 +1,60 @@
import os
import re
-path="/groups/ds/metagenomes/231218_Miseq/"
-outfile="config/pep/samples_231218.csv"
+import sys
+
+## write to log file
+sys.stderr = open(snakemake.log[0], "w")
+
+inpath = snakemake.params.inpath
+renaming = snakemake.params.renaming
+sample_csv = snakemake.input[0]
def rename_fastqs(path):
- fastqs=os.listdir(path)
- samples=[]
+ samples = []
+
+ fastqs = [file for file in os.listdir(path) if file.endswith(".fastq.gz")]
+ if not fastqs:
+ print(
+ f"Error: There are no fastq files in the directory. Have you used the correct path: {path}?"
+ )
+ raise Exception(
+ f"There are no fastq files in the directory. Have you used the correct path: {path}?"
+ )
+
+ if renaming:
+ print(
+ "Renaming fastq files, e.g. from sampleID_S40_L001_R1_001.fastq.gz to sampleID_R1.fastq.gz"
+ )
+ else:
+ print("Fastq files will not be renamed")
+
for fastq in fastqs:
+ ## renaming from e.g. sampleID_S40_L001_R1_001.fastq.gz to sampleID_R1.fastq.gz
fastq_new = re.sub(r"_S\d{0,2}_L001", "", fastq)
fastq_new = re.sub(r"_001.fastq", ".fastq", fastq_new)
- sample=fastq_new.split("_")[0]
- if sample not in samples:
+
+ sample = (re.search("(.*)_R[1-2].fastq.gz", fastq_new)).group(1)
+ if sample not in samples and sample != "Undetermined":
samples.append(sample)
- os.system(f"mv {path}{fastq} {path}{fastq_new}")
- return(samples)
-def write_sample_sheet(samples,outfile):
- os.system(f"touch {outfile}")
- with open(outfile,"w") as sheet:
+ if renaming:
+ os.system(f"mv {path} {fastq} {path} {fastq_new}")
+
+ return samples
+
+
+def write_sample_sheet(samples, path, outfile):
+ # os.system(f"touch {outfile}")
+
+ with open(outfile, "w") as sheet:
sheet.write("sample_name,fq1,fq2\n")
+
for sample in samples:
- sheet.write(f"{sample},{path}{sample}_R1.fastq.gz,{path}{sample}_R2.fastq.gz\n")
+ sheet.write(
+ f"{sample},{path} {sample}_R1.fastq.gz,{path} {sample}_R2.fastq.gz\n"
+ )
+
-samples=rename_fastqs(path)
-write_sample_sheet(samples,outfile)
+samples = rename_fastqs(inpath)
+write_sample_sheet(samples, inpath, sample_csv)