IKIM-Essen · thomasbtf · Feb 16, 2022 · Jan 6, 2022 · Jan 6, 2022 · Jan 11, 2022
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -86,7 +86,7 @@ jobs:
           curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/B.1.1.7.reads.1.fastq.gz > .tests/data/B117.1.fastq.gz
           curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/B.1.1.7.reads.1.fastq.gz > .tests/data/B117.2.fastq.gz
           curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/ont_reads.fastq.gz > .tests/data/ont_reads.fastq.gz
-          curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR574/003/ERR5745913/ERR5745913.fastq.gz > .tests/data/ion_reads.fastq.gz
+          curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/ERR5745913.fastq.gz > .tests/data/ion_reads.fastq.gz
           echo sample_name,fq1,fq2,date,is_amplicon_data,technology > .tests/config/pep/samples.csv
           echo illumina-test,data/B117.1.fastq.gz,data/B117.2.fastq.gz,2022-01-01,$AMPLICON,illumina >> .tests/config/pep/samples.csv
           echo ont-test,data/ont_reads.fastq.gz,,2022-01-01,$AMPLICON,ont >> .tests/config/pep/samples.csv

diff --git a/.tests/config/config.yaml b/.tests/config/config.yaml
@@ -24,11 +24,21 @@ human-genome-download-path:
   - ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.28_GRCh38.p13/GCA_000001405.28_GRCh38.p13_genomic.fna.gz
 
 data-handling:
-  # path of incoming data
+  # flag for using the following data-handling structure
+  # True: data-handling structure is used as shown below
+  # False: only the sample sheet needs to be updated (manually)
+  use-data-handling: True
+  # flag for archiving data
+  # True: data is archived in path defined below
+  # False: data is not archived
+  archive-data: True
+  # path of incoming data, which is moved to the
+  # data directory by the preprocessing script
   incoming: ../incoming/
-  # path to store data in the workflow
+  # path to store data within the workflow
   data: data/
-  # path to archive data from incoming to
+  # path to which the incoming data and the results
+  # of the latest run are archived
   archive: ../archive/
 
 quality-criteria:

diff --git a/config/config.yaml b/config/config.yaml
@@ -12,6 +12,14 @@ human-genome-download-path:
   - ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.28_GRCh38.p13/GCA_000001405.28_GRCh38.p13_genomic.fna.gz
 
 data-handling:
+  # flag for using the following data-handling structure
+  # True: data-handling structure is used as shown below
+  # False: only the sample sheet needs to be updated (manually)
+  use-data-handling: True
+  # flag for archiving data
+  # True: data is archived in path defined below
+  # False: data is not archived
+  archive-data: True
   # path of incoming data, which is moved to the
   # data directory by the preprocessing script
   incoming: ../incoming/

diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -47,35 +47,46 @@ include: "rules/variant_filtration.smk"
 include: "rules/variant_report.smk"
 include: "rules/generate_output.smk"
 include: "rules/benchmarking.smk"
-include: "rules/preprocessing.smk"
+
+
+# only load the preprocessing rules when the data-handling structure is used
+if config["data-handling"]["use-data-handling"]:
+
+    include: "rules/preprocessing.smk"
+
+
 include: "rules/long_read.smk"
 include: "rules/lineage_variant_calling.smk"
 
 
-rule save_latest_run:
-    input:
-        expand(
-            "results/.indicators/{latest_run}.archived",
-            latest_run=get_latest_run_date(),
-        ),
-    output:
-        expand(
-            "".join(
-                (
-                    config["data-handling"]["archive"],
-                    "{latest_run}/results_{latest_run}.tar.gz",
-                )
+# only define the archiving rule when archiving is enabled in the config
+if config["data-handling"]["archive-data"]:
+
+    rule save_latest_run:
+        input:
+            expand(
+                "results/.indicators/{latest_run}.archived",
+                latest_run=get_latest_run_date(),
             ),
+        output:
+            expand(
+                "".join(
+                    (
+                        config["data-handling"]["archive"],
+                        "{latest_run}/results_{latest_run}.tar.gz",
+                    )
+                ),
+                latest_run=get_latest_run_date(),
+            ),
+        params:
             latest_run=get_latest_run_date(),
-        ),
-    params:
-        latest_run=get_latest_run_date(),
-    log:
-        expand("logs/save-run/{latest_run}.log", latest_run=get_latest_run_date()),
-    conda:
-        "envs/unix.yaml"
-    shell:
-        "tar -zcvf {output} results/{params.latest_run} 2> {log} 2>&1"
+        log:
+            expand("logs/save-run/{latest_run}.log", latest_run=get_latest_run_date()),
+        conda:
+            "envs/unix.yaml"
+        # send both stdout and stderr of tar to the log file
+        shell:
+            "tar -zcvf {output} results/{params.latest_run} > {log} 2>&1"
 
 
 checkpoint all:

diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
@@ -21,6 +21,12 @@ properties:
     description: download path of human genome reference
   data-handling:
     properties:
+      use-data-handling:
+        type: boolean
+        description: flag whether to use data reorganization and archiving or not
+      archive-data:
+        type: boolean
+        description: flag whether to archive data in the archive path or not
       incoming:
         type: string
         description: path of incoming data, which is moved to the data directory by the preprocessing script