Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: make incoming and archive structure data-handling optional #443

Merged
merged 9 commits into from
Feb 16, 2022
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ jobs:
curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/B.1.1.7.reads.1.fastq.gz > .tests/data/B117.1.fastq.gz
curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/B.1.1.7.reads.1.fastq.gz > .tests/data/B117.2.fastq.gz
curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/ont_reads.fastq.gz > .tests/data/ont_reads.fastq.gz
curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR574/003/ERR5745913/ERR5745913.fastq.gz > .tests/data/ion_reads.fastq.gz
curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/ERR5745913.fastq.gz > .tests/data/ion_reads.fastq.gz
echo sample_name,fq1,fq2,date,is_amplicon_data,technology > .tests/config/pep/samples.csv
echo illumina-test,data/B117.1.fastq.gz,data/B117.2.fastq.gz,2022-01-01,$AMPLICON,illumina >> .tests/config/pep/samples.csv
echo ont-test,data/ont_reads.fastq.gz,,2022-01-01,$AMPLICON,ont >> .tests/config/pep/samples.csv
Expand Down
16 changes: 13 additions & 3 deletions .tests/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,21 @@ human-genome-download-path:
- ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.28_GRCh38.p13/GCA_000001405.28_GRCh38.p13_genomic.fna.gz

data-handling:
# path of incoming data
# flag for using the data-handling structure defined below
# True: data-handling structure is used as shown below
# False: data-handling structure is not used; only the sample sheet needs to be updated (manually)
use-data-handling: True
# flag for archiving data
# True: data is archived in path defined below
# False: data is not archived
archive-data: True
# path of incoming data, which is moved to the
# data directory by the preprocessing script
incoming: ../incoming/
# path to store data in the workflow
# path to store data within the workflow
data: data/
# path to archive data from incoming to
# path to which the data from incoming and
# the results from the latest run are archived
archive: ../archive/

quality-criteria:
Expand Down
8 changes: 8 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,14 @@ human-genome-download-path:
- ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.28_GRCh38.p13/GCA_000001405.28_GRCh38.p13_genomic.fna.gz

data-handling:
# flag for using the data-handling structure defined below
# True: data-handling structure is used as shown below
# False: data-handling structure is not used; only the sample sheet needs to be updated (manually)
use-data-handling: True
# flag for archiving data
# True: data is archived in path defined below
# False: data is not archived
archive-data: True
# path of incoming data, which is moved to the
# data directory by the preprocessing script
incoming: ../incoming/
Expand Down
54 changes: 31 additions & 23 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -47,35 +47,43 @@ include: "rules/variant_filtration.smk"
include: "rules/variant_report.smk"
include: "rules/generate_output.smk"
include: "rules/benchmarking.smk"
include: "rules/preprocessing.smk"


if config["data-handling"]["use-data-handling"]:

include: "rules/preprocessing.smk"


include: "rules/long_read.smk"
include: "rules/lineage_variant_calling.smk"


rule save_latest_run:
input:
expand(
"results/.indicators/{latest_run}.archived",
latest_run=get_latest_run_date(),
),
output:
expand(
"".join(
(
config["data-handling"]["archive"],
"{latest_run}/results_{latest_run}.tar.gz",
)
if config["data-handling"]["archive-data"]:

rule save_latest_run:
input:
expand(
"results/.indicators/{latest_run}.archived",
latest_run=get_latest_run_date(),
),
output:
expand(
"".join(
(
config["data-handling"]["archive"],
"{latest_run}/results_{latest_run}.tar.gz",
)
),
latest_run=get_latest_run_date(),
),
params:
latest_run=get_latest_run_date(),
),
params:
latest_run=get_latest_run_date(),
log:
expand("logs/save-run/{latest_run}.log", latest_run=get_latest_run_date()),
conda:
"envs/unix.yaml"
shell:
"tar -zcvf {output} results/{params.latest_run} 2> {log} 2>&1"
log:
expand("logs/save-run/{latest_run}.log", latest_run=get_latest_run_date()),
conda:
"envs/unix.yaml"
shell:
"tar -zcvf {output} results/{params.latest_run} 2> {log} 2>&1"


checkpoint all:
Expand Down
3 changes: 3 additions & 0 deletions workflow/schemas/config.schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ properties:
description: download path of human genome reference
data-handling:
properties:
use-data-handling:
type: boolean
description: flag whether to use data reorganization and archiving or not
incoming:
type: string
description: path of incoming data, which is moved to the data directory by the preprocessing script
Expand Down