From 9c148c5e1d2e2a5e99d45ae345c5221aa81f043a Mon Sep 17 00:00:00 2001 From: Josefa Welling <82578997+josefawelling@users.noreply.github.com> Date: Thu, 18 Jan 2024 14:08:55 +0100 Subject: [PATCH] feat: auto creation of sample sheet (#8) * feat: auto creation of sample sheet * formatting --- README.md | 18 ++++++-- config/config.yaml | 17 ++++++- config/pep/samples.csv | 3 -- workflow/Snakefile | 15 +++--- workflow/rules/preprocessing.smk | 12 +++++ workflow/scripts/create_sample_sheet.py | 61 +++++++++++++++++++------ 6 files changed, 99 insertions(+), 27 deletions(-) create mode 100644 workflow/rules/preprocessing.smk diff --git a/README.md b/README.md index 56b6d6b..f89bce1 100644 --- a/README.md +++ b/README.md @@ -50,12 +50,24 @@ To configure this workflow, modify `config/config.yaml` according to your needs, #### Sample sheet The sample sheet contains all samples to be analyzed. -Samples to be analyzed must be added manually to the sample sheet. + +#### Auto creation + +You can choose to automatically create a sample sheet with all samples in a specified directory (modifications in `config/config.yaml`). Only `fastq.gz` files are taken into account. Additionally, there is the option to rename the sequencer's output FASTQ files during this step, e.g. from `sampleID_S40_L001_R1_001.fastq.gz` to `sampleID_R1.fastq.gz`. +To create the sample sheet and provide it for the workflow, run: + +```sh + snakemake --cores all --use-conda create_sample_sheet +``` + +#### Manual creation or editing + +Samples to be analyzed can also be added manually to the sample sheet. 
For each sample, a new line in `config/pep/samples.csv` with the following content has to be defined: - **sample_name**: name or identifier of sample -- **fq1**: path to read 1 in FASTQ format -- **fq2**: path to read 2 in FASTQ format +- **fq1**: path to read 1 in gzipped FASTQ format +- **fq2**: path to read 2 in gzipped FASTQ format ### Step 4: Run workflow diff --git a/config/config.yaml b/config/config.yaml index 57115f2..8b702bd 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,14 +1,28 @@ pepfile: config/pep/config.yaml +## this will be used as name for the results folder and can be found in the report run-date: "23_12_18" +## adapter sequences used for trimming adapter-seqs: "-a GCGAATTTCGACGATCGTTGCATTAACTCGCGAA -g AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT" +## Options for the auto-creation of the sample sheet +sample-sheet: + # False or True : Should the sample sheet be auto-created? + auto-creation: True + # False or True : Should the sample fastqs be renamed? + # e.g. from sampleID_S40_L001_R1_001.fastq.gz to sampleID_R1.fastq.gz + rename-sample-files: False + # path to the fastq files of the samples for the sample sheet + data-path: "/groups/ds/metagenomes/run_folder/" + data-handling: - # path to store data within the workflow + # path to store input fastq data within the workflow data: "data/" + # path to store databases and reference genomes used within the workflow resources: "resources/" +## quality criteria used for filtering quality-criteria: # minimal length of acceptable reads min-length-reads: 15 @@ -20,4 +34,5 @@ human-ref: "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_late kraken: download-path: "https://genome-idx.s3.amazonaws.com/kraken/k2_standard_08gb_20231009.tar.gz" +## string term used for formatting output tables tablular-config: '/>github<\/a>/a \\t\t\t\n\t\t\t