feat: Add first init rule #559

Open
wants to merge 15 commits into master
2 changes: 1 addition & 1 deletion config/pep/samples.csv
@@ -1,4 +1,4 @@
sample_name,fq1,fq2,date,is_amplicon_data,technology,include_in_high_genome_summary
SAMPLE_NAME_1,PATH/TO/fq1,PATH/TO/fq2,SEQUENCING_DATE,0,illumina,1 # Required information for a sample sequencing on the Illumina platform
SAMPLE_NAME_2,PATH/TO/fq,,SEQUENCING_DATE,1,ont,0 # Required information for a sample sequencing on the Oxford Nanopore platform
SAMPLE_NAME_3,PATH/TO/fq,,SEQUENCING_DATE,1,ion,0 # Required information for a sample sequencing on the Ion Torrent platform
SAMPLE_NAME_3,PATH/TO/fq,,SEQUENCING_DATE,1,ion,0 # Required information for a sample sequencing on the Ion Torrent platform
1 change: 1 addition & 0 deletions workflow/Snakefile
@@ -47,6 +47,7 @@ include: "rules/variant_filtration.smk"
include: "rules/variant_report.smk"
include: "rules/generate_output.smk"
include: "rules/benchmarking.smk"
include: "rules/init.smk"


if config["data-handling"]["use-data-handling"]:
3 changes: 1 addition & 2 deletions workflow/envs/pangolin.yaml
@@ -3,5 +3,4 @@ channels:
- bioconda
- nodefaults
dependencies:
- pangolin =4.1.2
- tabulate <0.9 # TODO remove once pangolin 4.1.3 is available in bioconda
- pangolin =4.1.3
8 changes: 8 additions & 0 deletions workflow/rules/common.smk
@@ -42,6 +42,14 @@ validate(config, "../schemas/config.schema.yaml")
validate(pep.sample_table, "../schemas/samples.schema.yaml")


def get_accessions():
    amplicon_ref = config["preprocessing"]["amplicon-reference"]
    lin_ref = list(config["strain-calling"]["lineage-references"].values())
    genomes = ["main", amplicon_ref]
    for ref in lin_ref:
        genomes.append(ref)
    return genomes

def get_samples():
    return list(pep.sample_table["sample_name"].values)

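For orientation, a minimal sketch of what get_accessions() returns, assuming a hypothetical config excerpt (the accession values below are invented; only the key layout, preprocessing/amplicon-reference and strain-calling/lineage-references, mirrors what the function reads, and the append loop is collapsed into a single concatenation):

# Illustrative only: config values are invented; the key layout mirrors common.smk.
config = {
    "preprocessing": {"amplicon-reference": "MN908947"},
    "strain-calling": {
        "lineage-references": {"b.1.1.7": "OU000001", "b.1.617.2": "OU000002"}
    },
}

def get_accessions():
    amplicon_ref = config["preprocessing"]["amplicon-reference"]
    lin_ref = list(config["strain-calling"]["lineage-references"].values())
    return ["main", amplicon_ref] + lin_ref

print(get_accessions())  # ['main', 'MN908947', 'OU000001', 'OU000002']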
43 changes: 43 additions & 0 deletions workflow/rules/init.smk
@@ -0,0 +1,43 @@
include: "ref.smk"

rule init:
input: "resources/strain-accessions.txt",
expand("resources/genomes/{accession}.fasta", accession=get_accessions()),
"resources/annotation.gff.gz",
"resources/protein_products.gff.gz",
"resources/annotation_known_variants.gff.gz",
"resources/minikraken-8GB/",
"resources/krona/",
"resources/genomes/human-genome.fna.gz",
expand("resources/genomes-renamed/{accession}.fasta", accession=config["strain-calling"]["lineage-references"].values()),
ancient("data/"),
ancient("../archive"),
ancient("../incoming"),

rule make_directory_data:
output:
data=directory("data"),
log:
"logs/make_directory_data.log"
shell:
"for dir in {output}; do if [ ! -d ""$dir"" ];"
" then mkdir ""$dir""; fi done"

rule make_directory_incoming:
output:
incoming=directory("../incoming/"),
log:
"logs/make_directory_incoming.log"
shell:
"for dir in {output}; do if [ ! -d ""$dir"" ];"
" then mkdir ""$dir""; fi done"

rule make_directory_archive:
output:
archive=directory("../archive"),
log:
"logs/make_directory_archive.log"
shell:
"for dir in {output}; do if [ ! -d ""$dir"" ];"
" then mkdir ""$dir""; fi done"

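As a side note, the three make_directory_* rules differ only in the directory they create; a possible consolidation is sketched below (not part of this PR; the rule name make_directories and the single shared log file are invented for illustration):

rule make_directories:
    output:
        data=directory("data"),
        incoming=directory("../incoming"),
        archive=directory("../archive"),
    log:
        "logs/make_directories.log",
    shell:
        "mkdir -p {output} > {log} 2>&1"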
51 changes: 18 additions & 33 deletions workflow/rules/read_clipping.smk
@@ -20,63 +20,48 @@ rule samtools_sort:
"v1.15.1/bio/samtools/sort"


rule bed_to_bedpe:
rule bed_to_tsv:
input:
check_bed_for_URL(config["preprocessing"]["amplicon-primers"]),
output:
"resources/primer.bedpe",
"resources/primer.tsv",
log:
"logs/bed-to-bedpe.log",
conda:
"../envs/python.yaml"
script:
"../scripts/bed-to-bedpe.py"
"../scripts/bed-to-tsv.py"


rule bamclipper:
rule trim_primers_fgbio:
input:
bam="results/{date}/read-sorted/{read_type}~position/{sample}.initial.bam",
bai="results/{date}/read-sorted/{read_type}~position/{sample}.initial.bam.bai",
bedpe="resources/primer.bedpe",
output:
temp(
"results/{date}/read-clipping/softclipped/{read_type}/{sample}/{sample}.initial.primerclipped.bam"
ref="resources/genomes/{reference}.fasta".format(
reference=config["preprocessing"]["amplicon-reference"]
),
params:
output_dir=get_output_dir,
cwd=lambda w: os.getcwd(),
bed_path=lambda w, input: os.path.join(os.getcwd(), input.bedpe),
bam=lambda w, input: os.path.basename(input.bam),
primers="resources/primer.tsv",
output:
"results/{date}/read-clipping/hardclipped/{read_type}/{sample}/{sample}.bam",
log:
"logs/{date}/bamclipper/{read_type}/{sample}.log",
"logs/{date}/fgbio_primer_clipping/{read_type}/{sample}.log",
conda:
"../envs/bamclipper.yaml"
threads: 6
"../envs/fgbio.yaml"
shell:
"(cp {input.bam} {params.output_dir} &&"
" cp {input.bai} {params.output_dir} &&"
" cd {params.output_dir} &&"
" bamclipper.sh -b {params.bam} -p {params.bed_path} -n {threads} -u 5 -d 5) "
" > {params.cwd}/{log} 2>&1"
"fgbio TrimPrimers -i {input.bam} -p {input.primers} -o {output} -H true -r {input.ref} > {log} 2>&1"


rule fgbio:
rule filter_bam_fgbio:
input:
bam="results/{date}/read-clipping/softclipped/{read_type}/{sample}/{sample}.initial.primerclipped.bam",
bai="results/{date}/read-clipping/softclipped/{read_type}/{sample}/{sample}.initial.primerclipped.bam.bai",
ref="resources/genomes/{reference}.fasta".format(
reference=config["preprocessing"]["amplicon-reference"]
),
bam="results/{date}/read-clipping/hardclipped/{read_type}/{sample}/{sample}.bam",
output:
temp(
"results/{date}/read-clipping/hardclipped/{read_type}/{sample}/{sample}.bam"
),
"results/{date}/read-clipping/hc_filtered/{read_type}/{sample}/{sample}.bam",
log:
"logs/{date}/fgbio/{read_type}/{sample}.log",
"logs/{date}/fgbio_filter_bam/{read_type}/{sample}.log",
conda:
"../envs/fgbio.yaml"
shell:
"fgbio --sam-validation-stringency=LENIENT ClipBam -i {input.bam} -o {output} -H true -r {input.ref} > {log} 2>&1"
"fgbio FilterBam -i {input.bam} -o {output} --min-insert-size 100 --remove-single-end-mappings > {log} 2>&1"


rule samtools_fastq_pe:
@@ -131,7 +116,7 @@ rule plot_primer_clipping:
),
params:
samples=lambda wildcards: get_samples_for_date(wildcards.date),
bedpe="resources/primer.bedpe",
bedpe="resources/primer.tsv",
log:
"logs/{date}/plot-primer-clipping.log",
conda:
workflow/scripts/bed-to-bedpe.py → workflow/scripts/bed-to-tsv.py
@@ -23,10 +23,9 @@
df_sense["chrom"],
df_sense["start"],
df_sense["end"],
df_antisense["chrom"],
df_antisense["start"],
df_antisense["end"],
]
headers = ["chrom1", "start1", "end1", "chrom2", "start2", "end2"]
headers = ["chrom", "left_start", "left_end", "right_start", "right_end"]
df_bedpe = pd.concat(data, axis=1, keys=headers)
df_bedpe.to_csv(snakemake.output[0], header=None, index=None, sep="\t", mode="a")
df_bedpe.to_csv(snakemake.output[0], index=None, sep="\t", mode="a")
8 changes: 5 additions & 3 deletions workflow/scripts/plot-primer-clipping.py
@@ -13,8 +13,10 @@
from intervaltree import IntervalTree

# read primer bedpe to df
PRIMER = pd.read_csv(snakemake.params.get("bedpe", ""), delimiter="\t", header=None)
PRIMER.drop(PRIMER.columns[[0, 3]], axis=1, inplace=True)
PRIMER = pd.read_csv(snakemake.params.get("bedpe", ""), delimiter="\t", header=0)
print(PRIMER)
PRIMER.drop(PRIMER.columns[[0]], axis=1, inplace=True)
print(PRIMER)
PRIMER.columns = ["p1_start", "p1_end", "p2_start", "p2_end"]

# convert df to interval trees
@@ -116,7 +118,7 @@ def count_intervals(file):
"uncut primer within",
"cut primer exact",
"cut primer within",
"no mathing win",
"no matching win",
],
}
)