From 9c148c5e1d2e2a5e99d45ae345c5221aa81f043a Mon Sep 17 00:00:00 2001
From: Josefa Welling <82578997+josefawelling@users.noreply.github.com>
Date: Thu, 18 Jan 2024 14:08:55 +0100
Subject: [PATCH] feat: auto creation of sample sheet (#8)

* feat: auto creation of sample sheet

* formatting
---
 README.md                               | 18 ++++++--
 config/config.yaml                      | 17 ++++++-
 config/pep/samples.csv                  |  3 --
 workflow/Snakefile                      | 15 +++---
 workflow/rules/preprocessing.smk        | 12 +++++
 workflow/scripts/create_sample_sheet.py | 61 +++++++++++++++++++------
 6 files changed, 99 insertions(+), 27 deletions(-)
 create mode 100644 workflow/rules/preprocessing.smk
diff --git a/README.md b/README.md
index 56b6d6b..f89bce1 100644
--- a/README.md
+++ b/README.md
@@ -50,12 +50,24 @@ To configure this workflow, modify `config/config.yaml` according to your needs,
 #### Sample sheet
 
 The sample sheet contains all samples to be analyzed.
-Samples to be analyzed must be added manually to the sample sheet.
+
+#### Auto creation
+
+You can choose to automatically create a sample sheet containing all samples in a specified directory (configured in the `sample-sheet` section of `config/config.yaml`). Only `fastq.gz` files are taken into account. Additionally, there is the option to rename the sequencer's output FASTQ files during this step, e.g. from `sampleID_S40_L001_R1_001.fastq.gz` to `sampleID_R1.fastq.gz`.    
+To create the sample sheet and provide it for the workflow, run:
+
+```sh
+    snakemake --cores all --use-conda create_sample_sheet
+```
+
+#### Manual creation or editing
+
+Samples to be analyzed can also be added manually to the sample sheet.
 For each sample, a new line in `config/pep/samples.csv` with the following content has to be defined:
 
 - **sample_name**: name or identifier of sample
-- **fq1**: path to read 1 in FASTQ format
-- **fq2**: path to read 2 in FASTQ format
+- **fq1**: path to read 1 in gzip FASTQ format
+- **fq2**: path to read 2 in gzip FASTQ format
 
 
 ### Step 4: Run workflow
diff --git a/config/config.yaml b/config/config.yaml
index 57115f2..8b702bd 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -1,14 +1,28 @@
 pepfile: config/pep/config.yaml
 
+## this will be used as name for the results folder and can be found in the report
 run-date: "23_12_18"
 
+## adapter sequences used for trimming
 adapter-seqs: "-a GCGAATTTCGACGATCGTTGCATTAACTCGCGAA -g AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"
 
+## Options for the auto-creation of the sample sheet
+sample-sheet:
+  # False or True : Should the sample sheet be auto-created?
+  auto-creation: True
+  # False or True : Should the sample fastqs be renamed?
+  # e.g. from sampleID_S40_L001_R1_001.fastq.gz to sampleID_R1.fastq.gz
+  rename-sample-files: False
+  # path to the fastq files of the samples for the sample sheet
+  data-path: "/groups/ds/metagenomes/run_folder/"
+
 data-handling:
-  # path to store data within the workflow
+  # path to store input fastq data within the workflow
   data: "data/"
+  # path to store databases and reference genomes used within the workflow
   resources: "resources/"
 
+## quality criteria used for filtering
 quality-criteria:
   # minimal length of acceptable reads
   min-length-reads: 15
@@ -20,4 +34,5 @@ human-ref: "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_late
 kraken:
   download-path: "https://genome-idx.s3.amazonaws.com/kraken/k2_standard_08gb_20231009.tar.gz"
 
+## string term used for formatting output tables
 tablular-config: '/>github<\/a>/a \\t\t\t</li>\n\t\t\t<li class="nav-item"> \n\t\t\t\t<a class="nav-link" href="#">'
diff --git a/config/pep/samples.csv b/config/pep/samples.csv
index 9d937f3..0f54e8b 100644
--- a/config/pep/samples.csv
+++ b/config/pep/samples.csv
@@ -1,4 +1 @@
 sample_name,fq1,fq2
-115_L001,/groups/ds/resistance_cefiderocol/data/illumina/115_L001_R1.fastq.gz,/groups/ds/resistance_cefiderocol/data/illumina/115_L001_R2.fastq.gz
-139_L001,/groups/ds/resistance_cefiderocol/data/illumina/139_L001_R1.fastq.gz,/groups/ds/resistance_cefiderocol/data/illumina/139_L001_R2.fastq.gz
-I15566-L1,/projects/pig-muenster/Metagenome_Kiel/fastq/I15566-L1_R1.fastq.gz,/projects/pig-muenster/Metagenome_Kiel/fastq/I15566-L1_R2.fastq.gz
\ No newline at end of file
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 05add77..6c2c831 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -17,6 +17,11 @@ include: "rules/species_diversity.smk"
 include: "rules/report.smk"
 
 
+if config["sample-sheet"]["auto-creation"]:
+
+    include: "rules/preprocessing.smk"
+
+
 DATE = get_run_date()
 
 
@@ -34,12 +39,10 @@ rule all:
 
 
 onsuccess:
-    shell("tar cpfz results/{DATE}/{DATE}_results.tar.gz results/{DATE}/report/")
-    print(
-        "Workflow finished without an error. You can find the results in {date}_results.tar.gz".format(
-            date=DATE
-        )
-    )
+    print("Workflow finished without an error.")
+    if os.path.exists("results/{date}/report/{date}_report.zip".format(date=DATE)):
+        shell("tar cpfz results/{DATE}/{DATE}_results.tar.gz results/{DATE}/report/")
+        print("You can find the results in {date}_results.tar.gz".format(date=DATE))
 
 
 onerror:
diff --git a/workflow/rules/preprocessing.smk b/workflow/rules/preprocessing.smk
new file mode 100644
index 0000000..9aded58
--- /dev/null
+++ b/workflow/rules/preprocessing.smk
@@ -0,0 +1,12 @@
+rule create_sample_sheet:
+    input:
+        "config/pep/samples.csv",
+    params:
+        inpath=config["sample-sheet"]["data-path"],
+        renaming=config["sample-sheet"]["rename-sample-files"],
+    log:
+        "logs/create_sample_sheet.log",
+    conda:
+        "../envs/python.yaml"
+    script:
+        "../scripts/create_sample_sheet.py"
diff --git a/workflow/scripts/create_sample_sheet.py b/workflow/scripts/create_sample_sheet.py
index 3ab7389..9f15b6b 100644
--- a/workflow/scripts/create_sample_sheet.py
+++ b/workflow/scripts/create_sample_sheet.py
@@ -1,27 +1,60 @@
 import os
 import re
-path="/groups/ds/metagenomes/231218_Miseq/"
-outfile="config/pep/samples_231218.csv"
+import sys
+
+## write to log file
+sys.stderr = open(snakemake.log[0], "w")
+
+inpath = snakemake.params.inpath
+renaming = snakemake.params.renaming
+sample_csv = snakemake.input[0]
 
 
 def rename_fastqs(path):
-    fastqs=os.listdir(path)
-    samples=[]
+    """Return the sample names of the ``.fastq.gz`` files in *path*; if the module-level ``renaming`` flag is set, also rename each file to its shortened name."""
+    samples = []
+    fastqs = [file for file in os.listdir(path) if file.endswith(".fastq.gz")]
+    if not fastqs:
+        print(
+            f"Error: There are no fastq files in the directory. Have you used the correct path: {path}?"
+        )
+        raise Exception(
+            f"There are no fastq files in the directory. Have you used the correct path: {path}?"
+        )
+
+    if renaming:
+        print(
+            "Renaming fastq files, e.g. from sampleID_S40_L001_R1_001.fastq.gz to sampleID_R1.fastq.gz"
+        )
+    else:
+        print("Fastq files will not be renamed")
+
     for fastq in fastqs:
+        ## shortened name, e.g. sampleID_S40_L001_R1_001.fastq.gz -> sampleID_R1.fastq.gz
         fastq_new = re.sub(r"_S\d{0,2}_L001", "", fastq)
         fastq_new = re.sub(r"_001.fastq", ".fastq", fastq_new)
-        sample=fastq_new.split("_")[0]
-        if sample not in samples:
+        match = re.search(r"(.*)_R[12]\.fastq\.gz$", fastq_new)
+        sample = match.group(1) if match else None  # None: no _R1/_R2 suffix, not a sample
+        if sample and sample not in samples and sample != "Undetermined":
             samples.append(sample)
-        os.system(f"mv {path}{fastq} {path}{fastq_new}")
-    return(samples)
 
-def write_sample_sheet(samples,outfile):
-    os.system(f"touch {outfile}")
-    with open(outfile,"w") as sheet:
+        if renaming and fastq_new != fastq:
+            os.rename(os.path.join(path, fastq), os.path.join(path, fastq_new))
+
+    return samples
+
+
+def write_sample_sheet(samples, path, outfile):
+    """Write a header plus one ``sample_name,fq1,fq2`` row per sample in *samples* to *outfile*, with the read files located in *path*."""
+
+    with open(outfile, "w") as sheet:
         sheet.write("sample_name,fq1,fq2\n")
+
         for sample in samples:
-            sheet.write(f"{sample},{path}{sample}_R1.fastq.gz,{path}{sample}_R2.fastq.gz\n")
+            fq1 = os.path.join(path, f"{sample}_R1.fastq.gz")
+            fq2 = os.path.join(path, f"{sample}_R2.fastq.gz")
+            sheet.write(f"{sample},{fq1},{fq2}\n")
+
 
-samples=rename_fastqs(path)
-write_sample_sheet(samples,outfile)
+samples = rename_fastqs(inpath)
+write_sample_sheet(samples, inpath, sample_csv)