diff --git a/config/pep/samples.csv b/config/pep/samples.csv index 4173923ea..7b4b052c5 100644 --- a/config/pep/samples.csv +++ b/config/pep/samples.csv @@ -1,4 +1,4 @@ sample_name,fq1,fq2,date,is_amplicon_data,technology,include_in_high_genome_summary SAMPLE_NAME_1,PATH/TO/fq1,PATH/TO/fq2,SEQUENCING_DATE,0,illumina,1 # Required information for a sample sequencing on the Illumina platform SAMPLE_NAME_2,PATH/TO/fq,,SEQUENCING_DATE,1,ont,0 # Required information for a sample sequencing on the Oxford Nanopore platform -SAMPLE_NAME_3,PATH/TO/fq,,SEQUENCING_DATE,1,ion,0 # Required information for a sample sequencing on the Ion Torrent platform +SAMPLE_NAME_3,PATH/TO/fq,,SEQUENCING_DATE,1,ion,0 # Required information for a sample sequencing on the Ion Torrent platform \ No newline at end of file diff --git a/workflow/Snakefile b/workflow/Snakefile index 4abff6178..5ef4fa292 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -47,6 +47,7 @@ include: "rules/variant_filtration.smk" include: "rules/variant_report.smk" include: "rules/generate_output.smk" include: "rules/benchmarking.smk" +include: "rules/init.smk" if config["data-handling"]["use-data-handling"]: diff --git a/workflow/envs/pangolin.yaml b/workflow/envs/pangolin.yaml index 52f9310fd..087118014 100644 --- a/workflow/envs/pangolin.yaml +++ b/workflow/envs/pangolin.yaml @@ -3,5 +3,4 @@ channels: - bioconda - nodefaults dependencies: - - pangolin =4.1.2 - - tabulate <0.9 # TODO remove once pangolin 4.1.3 is available in bioconda + - pangolin =4.1.3 diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index d29581213..056a3235e 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -42,6 +42,14 @@ validate(config, "../schemas/config.schema.yaml") validate(pep.sample_table, "../schemas/samples.schema.yaml") +def get_accessions(): + amplicon_ref = config["preprocessing"]["amplicon-reference"] + lin_ref = list(config["strain-calling"]["lineage-references"].values()) + genomes = ["main", amplicon_ref] + for ref in lin_ref: + genomes.append(ref) + return genomes + def get_samples(): return list(pep.sample_table["sample_name"].values) diff --git a/workflow/rules/init.smk b/workflow/rules/init.smk new file mode 100644 index 000000000..7d6a297e2 --- /dev/null +++ b/workflow/rules/init.smk @@ -0,0 +1,43 @@ +include: "ref.smk" + +rule init: + input: "resources/strain-accessions.txt", + expand("resources/genomes/{accession}.fasta", accession=get_accessions()), + "resources/annotation.gff.gz", + "resources/protein_products.gff.gz", + "resources/annotation_known_variants.gff.gz", + "resources/minikraken-8GB/", + "resources/krona/", + "resources/genomes/human-genome.fna.gz", + expand("resources/genomes-renamed/{accession}.fasta", accession=config["strain-calling"]["lineage-references"].values()), + ancient("data/"), + ancient("../archive"), + ancient("../incoming"), + +rule make_directory_data: + output: + data=directory("data"), + log: + "logs/make_directory_data.log" + shell: + "for dir in {output}; do if [ ! -d ""$dir"" ];" + " then mkdir ""$dir""; fi done" + +rule make_directory_incoming: + output: + incoming=directory("../incoming/"), + log: + "logs/make_directory_incoming.log" + shell: + "for dir in {output}; do if [ ! -d ""$dir"" ];" + " then mkdir ""$dir""; fi done" + +rule make_directory_archive: + output: + archive=directory("../archive"), + log: + "logs/make_directory_archive.log" + shell: + "for dir in {output}; do if [ ! -d ""$dir"" ];" + " then mkdir ""$dir""; fi done" + \ No newline at end of file diff --git a/workflow/rules/read_clipping.smk b/workflow/rules/read_clipping.smk index 8d93acc74..e7fd00cfd 100644 --- a/workflow/rules/read_clipping.smk +++ b/workflow/rules/read_clipping.smk @@ -20,63 +20,48 @@ rule samtools_sort: "v1.15.1/bio/samtools/sort" -rule bed_to_bedpe: +rule bed_to_tsv: input: check_bed_for_URL(config["preprocessing"]["amplicon-primers"]), output: - "resources/primer.bedpe", + "resources/primer.tsv", log: "logs/bed-to-bedpe.log", conda: "../envs/python.yaml" script: - "../scripts/bed-to-bedpe.py" + "../scripts/bed-to-tsv.py" -rule bamclipper: +rule trim_primers_fgbio: input: bam="results/{date}/read-sorted/{read_type}~position/{sample}.initial.bam", bai="results/{date}/read-sorted/{read_type}~position/{sample}.initial.bam.bai", - bedpe="resources/primer.bedpe", - output: - temp( - "results/{date}/read-clipping/softclipped/{read_type}/{sample}/{sample}.initial.primerclipped.bam" + ref="resources/genomes/{reference}.fasta".format( + reference=config["preprocessing"]["amplicon-reference"] ), - params: - output_dir=get_output_dir, - cwd=lambda w: os.getcwd(), - bed_path=lambda w, input: os.path.join(os.getcwd(), input.bedpe), - bam=lambda w, input: os.path.basename(input.bam), + primers="resources/primer.tsv", + output: + "results/{date}/read-clipping/hardclipped/{read_type}/{sample}/{sample}.bam", log: - "logs/{date}/bamclipper/{read_type}/{sample}.log", + "logs/{date}/fgbio_primer_clipping/{read_type}/{sample}.log", conda: - "../envs/bamclipper.yaml" - threads: 6 + "../envs/fgbio.yaml" shell: - "(cp {input.bam} {params.output_dir} &&" - " cp {input.bai} {params.output_dir} &&" - " cd {params.output_dir} &&" - " bamclipper.sh -b {params.bam} -p {params.bed_path} -n {threads} -u 5 -d 5) " - " > {params.cwd}/{log} 2>&1" + "fgbio TrimPrimers -i {input.bam} -p {input.primers} -o {output} -H true -r {input.ref} > {log} 2>&1" -rule fgbio: +rule filter_bam_fgbio: input: - bam="results/{date}/read-clipping/softclipped/{read_type}/{sample}/{sample}.initial.primerclipped.bam", - bai="results/{date}/read-clipping/softclipped/{read_type}/{sample}/{sample}.initial.primerclipped.bam.bai", - ref="resources/genomes/{reference}.fasta".format( - reference=config["preprocessing"]["amplicon-reference"] - ), + bam="results/{date}/read-clipping/hardclipped/{read_type}/{sample}/{sample}.bam", output: - temp( - "results/{date}/read-clipping/hardclipped/{read_type}/{sample}/{sample}.bam" - ), + "results/{date}/read-clipping/hc_filtered/{read_type}/{sample}/{sample}.bam", log: - "logs/{date}/fgbio/{read_type}/{sample}.log", + "logs/{date}/fgbio_filter_bam/{read_type}/{sample}.log", conda: "../envs/fgbio.yaml" shell: - "fgbio --sam-validation-stringency=LENIENT ClipBam -i {input.bam} -o {output} -H true -r {input.ref} > {log} 2>&1" + "fgbio FilterBam -i {input.bam} -o {output} --min-insert-size 100 --remove-single-end-mappings > {log} 2>&1" rule samtools_fastq_pe: @@ -131,7 +116,7 @@ rule plot_primer_clipping: ), params: samples=lambda wildcards: get_samples_for_date(wildcards.date), - bedpe="resources/primer.bedpe", + bedpe="resources/primer.tsv", log: "logs/{date}/plot-primer-clipping.log", conda: diff --git a/workflow/scripts/bed-to-bedpe.py b/workflow/scripts/bed-to-tsv.py similarity index 85% rename from workflow/scripts/bed-to-bedpe.py rename to workflow/scripts/bed-to-tsv.py index 4a3b343df..098a18cb7 100644 --- a/workflow/scripts/bed-to-bedpe.py +++ b/workflow/scripts/bed-to-tsv.py @@ -23,10 +23,9 @@ df_sense["chrom"], df_sense["start"], df_sense["end"], - df_antisense["chrom"], df_antisense["start"], df_antisense["end"], ] -headers = ["chrom1", "start1", "end1", "chrom2", "start2", "end2"] +headers = ["chrom", "left_start", "left_end", "right_start", "right_end"] df_bedpe = pd.concat(data, axis=1, keys=headers) -df_bedpe.to_csv(snakemake.output[0], header=None, index=None, sep="\t", mode="a") +df_bedpe.to_csv(snakemake.output[0], index=None, sep="\t", mode="a") diff --git a/workflow/scripts/plot-primer-clipping.py b/workflow/scripts/plot-primer-clipping.py index c6dabb8e3..a99b55a8f 100644 --- a/workflow/scripts/plot-primer-clipping.py +++ b/workflow/scripts/plot-primer-clipping.py @@ -13,8 +13,10 @@ from intervaltree import IntervalTree # read primer bedpe to df -PRIMER = pd.read_csv(snakemake.params.get("bedpe", ""), delimiter="\t", header=None) -PRIMER.drop(PRIMER.columns[[0, 3]], axis=1, inplace=True) +PRIMER = pd.read_csv(snakemake.params.get("bedpe", ""), delimiter="\t", header=0) +print(PRIMER) +PRIMER.drop(PRIMER.columns[[0]], axis=1, inplace=True) +print(PRIMER) PRIMER.columns = ["p1_start", "p1_end", "p2_start", "p2_end"] # convert df to interval trees @@ -116,7 +118,7 @@ def count_intervals(file): "uncut primer within", "cut primer exact", "cut primer within", - "no mathing win", + "no matching win", ], } )