Skip to content

Commit

Permalink
feat: auto creation of sample sheet (#8)
Browse files Browse the repository at this point in the history
* feat: auto creation of sample sheet

* formatting
  • Loading branch information
josefawelling authored Jan 18, 2024
1 parent 8218b99 commit 9c148c5
Show file tree
Hide file tree
Showing 6 changed files with 99 additions and 27 deletions.
18 changes: 15 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,24 @@ To configure this workflow, modify `config/config.yaml` according to your needs,
#### Sample sheet

The sample sheet contains all samples to be analyzed.
Samples to be analyzed must be added manually to the sample sheet.

#### Auto creation

You can choose to automatically create a sample sheet with all samples in a specified directory (set the options in `config/config.yaml`). Only `fastq.gz` files are taken into account. Additionally, there is the option to rename the sequencer's output FASTQ files during this step, e.g. from `sampleID_S40_L001_R1_001.fastq.gz` to `sampleID_R1.fastq.gz`.
To create the sample sheet and provide it for the workflow, run:

```sh
snakemake --cores all --use-conda create_sample_sheet
```

#### Manual creation or editing

Samples to be analyzed can also be added manually to the sample sheet.
For each sample, a new line in `config/pep/samples.csv` with the following content has to be defined:

- **sample_name**: name or identifier of sample
- **fq1**: path to read 1 in FASTQ format
- **fq2**: path to read 2 in FASTQ format
- **fq1**: path to read 1 in gzip FASTQ format
- **fq2**: path to read 2 in gzip FASTQ format


### Step 4: Run workflow
Expand Down
17 changes: 16 additions & 1 deletion config/config.yaml
Original file line number Diff line number Diff line change
@@ -1,14 +1,28 @@
pepfile: config/pep/config.yaml

## this will be used as name for the results folder and can be found in the report
run-date: "23_12_18"

## adapter sequences used for trimming
adapter-seqs: "-a GCGAATTTCGACGATCGTTGCATTAACTCGCGAA -g AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"

## Option for the auto-creation of the sample sheet
sample-sheet:
# False or True : Should the sample sheet be auto-created?
auto-creation: True
# False or True : Should the sample fastqs be renamed?
# e.g. from sampleID_S40_L001_R1_001.fastq.gz to sampleID_R1.fastq.gz
rename-sample-files: False
# path to the fastq files of the samples for the sample sheet
data-path: "/groups/ds/metagenomes/run_folder/"

data-handling:
# path to store data within the workflow
# path to store input fastq data within the workflow
data: "data/"
# path to store databases and reference genomes used within the workflow
resources: "resources/"

## quality criteria used for filtering
quality-criteria:
# minimal length of acceptable reads
min-length-reads: 15
Expand All @@ -20,4 +34,5 @@ human-ref: "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_late
kraken:
download-path: "https://genome-idx.s3.amazonaws.com/kraken/k2_standard_08gb_20231009.tar.gz"

## string term used for formatting output tables
tablular-config: '/>github<\/a>/a \\t\t\t</li>\n\t\t\t<li class="nav-item"> \n\t\t\t\t<a class="nav-link" href="#">'
3 changes: 0 additions & 3 deletions config/pep/samples.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1 @@
sample_name,fq1,fq2
115_L001,/groups/ds/resistance_cefiderocol/data/illumina/115_L001_R1.fastq.gz,/groups/ds/resistance_cefiderocol/data/illumina/115_L001_R2.fastq.gz
139_L001,/groups/ds/resistance_cefiderocol/data/illumina/139_L001_R1.fastq.gz,/groups/ds/resistance_cefiderocol/data/illumina/139_L001_R2.fastq.gz
I15566-L1,/projects/pig-muenster/Metagenome_Kiel/fastq/I15566-L1_R1.fastq.gz,/projects/pig-muenster/Metagenome_Kiel/fastq/I15566-L1_R2.fastq.gz
15 changes: 9 additions & 6 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ include: "rules/species_diversity.smk"
include: "rules/report.smk"


## Only load the sample sheet creation rule when auto-creation is switched on
## in the config (key: sample-sheet -> auto-creation).
if config["sample-sheet"]["auto-creation"]:

include: "rules/preprocessing.smk"


DATE = get_run_date()


Expand All @@ -34,12 +39,10 @@ rule all:


## Handler that reports the outcome and packs the report directory into a tarball.
## NOTE(review): this hunk appears to show the pre-change statements (unconditional
## tar + combined message) together with the post-change ones (message first, tar
## only if the report zip exists) - confirm against the actual Snakefile.
onsuccess:
## pack results/{DATE}/report/ into results/{DATE}/{DATE}_results.tar.gz
shell("tar cpfz results/{DATE}/{DATE}_results.tar.gz results/{DATE}/report/")
print(
"Workflow finished without an error. You can find the results in {date}_results.tar.gz".format(
date=DATE
)
)
print("Workflow finished without an error.")
## only create and announce the tarball when the report zip is present
if os.path.exists("results/{date}/report/{date}_report.zip".format(date=DATE)):
shell("tar cpfz results/{DATE}/{DATE}_results.tar.gz results/{DATE}/report/")
print("You can find the results in {date}_results.tar.gz".format(date=DATE))


onerror:
Expand Down
12 changes: 12 additions & 0 deletions workflow/rules/preprocessing.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
## Runs scripts/create_sample_sheet.py to build the sample sheet from the
## fastq.gz files found in the configured data path.
rule create_sample_sheet:
input:
## handed to the script as snakemake.input[0]; the script opens this path
## for writing, so the sample sheet is overwritten in place
"config/pep/samples.csv",
params:
## directory that is searched for fastq.gz files
inpath=config["sample-sheet"]["data-path"],
## True/False: rename the sequencer's fastq files before listing them
renaming=config["sample-sheet"]["rename-sample-files"],
log:
## the script redirects its stderr to this file
"logs/create_sample_sheet.log",
conda:
"../envs/python.yaml"
script:
"../scripts/create_sample_sheet.py"
61 changes: 47 additions & 14 deletions workflow/scripts/create_sample_sheet.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,60 @@
import os
import re
path="/groups/ds/metagenomes/231218_Miseq/"
outfile="config/pep/samples_231218.csv"
import sys

## Redirect stderr to the rule's log file so that tracebacks end up in the log.
## Only stderr is redirected; plain print() output still goes to stdout.
sys.stderr = open(snakemake.log[0], "w")

## Settings handed over by the Snakemake rule (params / input of the rule)
inpath = snakemake.params.inpath  # directory searched for fastq.gz files
renaming = snakemake.params.renaming  # whether the fastq files get renamed
sample_csv = snakemake.input[0]  # path the sample sheet is written to


def rename_fastqs(path, rename=None):
    """Collect the sample names of all paired-end ``fastq.gz`` files in *path*.

    Optionally renames the sequencer output on disk, e.g. from
    ``sampleID_S40_L001_R1_001.fastq.gz`` to ``sampleID_R1.fastq.gz``.

    Args:
        path: Directory that holds the ``*.fastq.gz`` files.
        rename: Whether to rename the files on disk. ``None`` (the default)
            falls back to the module-level ``renaming`` setting, so existing
            ``rename_fastqs(path)`` calls behave as before.

    Returns:
        The sample names in sorted file order, without duplicates and
        without the sequencer's ``Undetermined`` reads.

    Raises:
        Exception: If *path* contains no ``.fastq.gz`` file.
    """
    if rename is None:
        rename = renaming

    # Sort for a reproducible sample order; os.listdir() order is arbitrary.
    fastqs = sorted(
        name for name in os.listdir(path) if name.endswith(".fastq.gz")
    )
    if not fastqs:
        print(
            f"Error: There are no fastq files in the directory. Have you used the correct path: {path}?"
        )
        raise Exception(
            f"There are no fastq files in the directory. Have you used the correct path: {path}?"
        )

    if rename:
        print(
            "Renaming fastq files, e.g. from sampleID_S40_L001_R1_001.fastq.gz to sampleID_R1.fastq.gz"
        )
    else:
        print("Fastq files will not be renamed")

    samples = []
    seen = set()
    for fastq in fastqs:
        ## renaming from e.g. sampleID_S40_L001_R1_001.fastq.gz to sampleID_R1.fastq.gz
        # \d+ instead of \d{0,2}: sample numbers are not limited to two digits.
        fastq_new = re.sub(r"_S\d+_L001", "", fastq)
        fastq_new = re.sub(r"_001\.fastq", ".fastq", fastq_new)

        # Files that are not R1/R2 reads (e.g. index reads) used to crash the
        # script with an AttributeError on the failed match; skip them instead.
        match = re.search(r"(.*)_R[12]\.fastq\.gz$", fastq_new)
        if match is None:
            print(
                f"Warning: skipping {fastq}, it does not look like a paired-end R1/R2 fastq file",
                file=sys.stderr,
            )
            continue

        sample = match.group(1)
        if sample != "Undetermined" and sample not in seen:
            seen.add(sample)
            samples.append(sample)

        # os.rename with joined paths avoids the shell entirely, so file names
        # with spaces or shell metacharacters are handled correctly.
        if rename and fastq_new != fastq:
            os.rename(os.path.join(path, fastq), os.path.join(path, fastq_new))

    return samples


def write_sample_sheet(samples, path, outfile):
    """Write the paired-end sample sheet as CSV to *outfile*.

    The sheet starts with the header ``sample_name,fq1,fq2`` followed by one
    line per sample pointing at ``<path>/<sample>_R1.fastq.gz`` and
    ``<path>/<sample>_R2.fastq.gz``.

    Args:
        samples: Iterable of sample names.
        path: Directory that holds the FASTQ files; a trailing slash is
            optional because the paths are built with ``os.path.join``.
        outfile: Path of the CSV file to create; an existing file is
            overwritten.
    """
    lines = ["sample_name,fq1,fq2\n"]
    for sample in samples:
        # Join directory and file name without any separator other than the
        # path separator; a stray space here would break every FASTQ path.
        fq1 = os.path.join(path, f"{sample}_R1.fastq.gz")
        fq2 = os.path.join(path, f"{sample}_R2.fastq.gz")
        lines.append(f"{sample},{fq1},{fq2}\n")

    with open(outfile, "w") as sheet:
        sheet.writelines(lines)


## Collect the samples from the configured data directory (renaming the files
## if requested) and write them to the sample sheet given by the rule.
samples = rename_fastqs(inpath)
write_sample_sheet(samples, inpath, sample_csv)

0 comments on commit 9c148c5

Please sign in to comment.