From a951fcb3e6b4fe3c851bcdc297107a1e4f4bedb5 Mon Sep 17 00:00:00 2001 From: simakro <74452164+simakro@users.noreply.github.com> Date: Tue, 15 Feb 2022 10:02:00 +0100 Subject: [PATCH 01/53] wrote new script map_bin_amp_sampler.py for 'normalizing' reducing/coverage on each amplicon to a common cutoff and started modifying rule long_read.smk to do the reuired mapping and run the script --- workflow/rules/long_read.smk | 78 +++++++++++---- workflow/scripts/map_bin_amp_sampler.py | 124 ++++++++++++++++++++++++ 2 files changed, 182 insertions(+), 20 deletions(-) create mode 100644 workflow/scripts/map_bin_amp_sampler.py diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index ba864bb8a..7e8a61f49 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -19,17 +19,55 @@ rule nanoQC: "nanoQC {input} -o {params.outdir} > {log} 2>&1" -rule count_fastq_reads: +rule nanofilt: input: - get_reads_by_stage, + "results/{date}/trimmed/porechop/primer_clipped/{sample}.fastq", output: - temp("results/{date}/tables/fastq-read-counts/{stage}~{sample}.txt"), + get_fastqs, + # temp("results/{date}/trimmed/nanofilt/{sample}.fastq"), log: - "logs/{date}/count_reads/{stage}~{sample}.log", + "logs/{date}/nanofilt/{sample}.log", + params: + min_length=config["quality-criteria"]["ont"]["min-length-reads"], + min_PHRED=config["quality-criteria"]["ont"]["min-PHRED"], conda: - "../envs/unix.yaml" + "../envs/nanofilt.yaml" + shell: + "NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 700 {input} > {output} 2> {log}" + + +# rule count_fastq_reads: +# input: +# get_reads_by_stage, +# output: +# temp("results/{date}/tables/fastq-read-counts/{stage}~{sample}.txt"), +# log: +# "logs/{date}/count_reads/{stage}~{sample}.log", +# conda: +# "../envs/unix.yaml" +# shell: +# "echo $(( $(cat {input} | wc -l ) / 4)) > {output} 2> {log}" + +rule minimap_to_reference: + input: + reads=get_fastqs, + 
reference="resources/genomes/main.fasta", + output: + "results/{date}/minimappings/{sample}.paf" + conda: + "../envs/minimap2.yaml" shell: - "echo $(( $(cat {input} | wc -l ) / 4)) > {output} 2> {log}" + "minimap2 -x map-ont {reference} {reads} -o {output}" + + +rule amp_based_binning_downsampling: + input: + reads=get_fastqs, + mappings="results/{date}/minimappings/{sample}.paf" + output: + "results/{date}/normalize_reads/{sample}_ds.fastq" + + # Intermediate number of threads (4-8) achieve best speedup of a+btrimming. @@ -86,20 +124,20 @@ rule porechop_primer_trimming: """ -rule nanofilt: - input: - "results/{date}/trimmed/porechop/primer_clipped/{sample}.fastq", - output: - temp("results/{date}/trimmed/nanofilt/{sample}.fastq"), - log: - "logs/{date}/nanofilt/{sample}.log", - params: - min_length=config["quality-criteria"]["ont"]["min-length-reads"], - min_PHRED=config["quality-criteria"]["ont"]["min-PHRED"], - conda: - "../envs/nanofilt.yaml" - shell: - "NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 500 {input} > {output} 2> {log}" +# rule nanofilt: +# input: +# "results/{date}/trimmed/porechop/primer_clipped/{sample}.fastq", +# output: +# temp("results/{date}/trimmed/nanofilt/{sample}.fastq"), +# log: +# "logs/{date}/nanofilt/{sample}.log", +# params: +# min_length=config["quality-criteria"]["ont"]["min-length-reads"], +# min_PHRED=config["quality-criteria"]["ont"]["min-PHRED"], +# conda: +# "../envs/nanofilt.yaml" +# shell: +# "NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 500 {input} > {output} 2> {log}" rule canu_correct: diff --git a/workflow/scripts/map_bin_amp_sampler.py b/workflow/scripts/map_bin_amp_sampler.py new file mode 100644 index 000000000..9753d7ec0 --- /dev/null +++ b/workflow/scripts/map_bin_amp_sampler.py @@ -0,0 +1,124 @@ +import random + + +class Mapping: + def __init__(self, qname, qlen, qstart, qend, samestrand, tname, tlen, tstart, tend, matches, total_bp, qual, kwattr): 
+ self.qname = qname + self.qlen = int(qlen) + self.qstart = int(qstart) + self.qend = int(qend) + self.samestrand = samestrand + self.tname = tname + self.tlen = int(tlen) + self.tstart = int(tstart) + self.tend = int(tend) + self.matches = int(matches) + self.total_bp = int(total_bp) + self.qual = int(qual) + self.kwattr = kwattr + self.gen_kw_attr() + + def gen_kw_attr(self): + kwattr_dict = {kw.split(":")[0]: kw.split(":")[-1] for kw in self.kwattr} + for key in kwattr_dict: + self.key = kwattr_dict[key] + + +class Primer: + def __init__(self, contig, start, end, name, score, strand): + self.contig = contig + self.start = int(start) + self.end = int(end) + self.name = name + self.score = int(score) + self.strand = strand + self.type = "primary" + self.amp_no, self.pos = self.get_name_infos() + + + def get_name_infos(self): + ls = self.name.split("_") + if len(ls) == 4: + self.type = "alt" + self.alt_name = ls[3] + return ls[1], ls[2] + + +class Amp: + def __init__(self, amp_no, primers): + self.name = int(amp_no) + self.primers = primers + self.start = min([primer.start for primer in primers]) + self.end = max([primer.end for primer in primers]) + self.reads = list() + + + def random_sample(self, cutoff): + if len(self.reads) > cutoff: + self.selected = random.choices(self.reads, k=cutoff) + else: + self.selected = self.reads + + + +def create_primer_objs(primer_bed): + with open(primer_bed, "r") as bed: + primers = list() + for line in bed: + prim = Primer(*line.strip().split("\t")) + primers.append(prim) + return sorted(primers, key=lambda x: x.end) + + +def generate_amps(primers): + amp_nums = set([primer.amp_no for primer in primers]) + amps = list() + for num in amp_nums: + ao = Amp(num, [primer for primer in primers if primer.amp_no == num]) + amps.append(ao) + return sorted(amps, key=lambda x: x.name) + + +def create_read_mappings(mm2_paf): + with open(mm2_paf, "r") as paf: + mappings = list() + for line in paf: + if len(line.strip()) > 0: + ls = 
line.strip().split("\t") + mapping = Mapping(*ls[:12], ls[12:]) + mappings.append(mapping) + return sorted(mappings, key=lambda x: x.tend) + + +def bin_mappings(amp_bins, mappings): + done = list() + na = list() + while len(mappings) > 0: + if mappings[0].tend <= amp_bins[0].end + 5: + if mappings[0].tstart >= amp_bins[0].start - 5: + amp_bins[0].reads.append(mappings[0].qname) + mappings.pop(0) + else: + na.append(mappings[0].qname) + mappings.pop(0) + else: + done.append(amp_bins[0]) + amp_bins.pop(0) + + for bin in done: + bin.random_sample(10) + print(bin.name, len(bin.reads), "selected:", len(bin.selected)) + print("na", len(na)) + + +if __name__ == "__main__": + import sys + + mm2_paf = sys.argv[1] + primer_bed = sys.argv[2] + + primers = create_primer_objs(primer_bed) + amps = generate_amps(primers) + mappings = create_read_mappings(mm2_paf) + bin_mappings(amps, mappings) + \ No newline at end of file From b9bafe8e3101bedd92e68e023c0180824a034ef0 Mon Sep 17 00:00:00 2001 From: simakro <74452164+simakro@users.noreply.github.com> Date: Tue, 15 Feb 2022 17:14:47 +0100 Subject: [PATCH 02/53] added write out and convert of picked reads from source fastq to out-fasta --- ...n_amp_sampler.py => amp_covcap_sampler.py} | 31 ++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) rename workflow/scripts/{map_bin_amp_sampler.py => amp_covcap_sampler.py} (80%) diff --git a/workflow/scripts/map_bin_amp_sampler.py b/workflow/scripts/amp_covcap_sampler.py similarity index 80% rename from workflow/scripts/map_bin_amp_sampler.py rename to workflow/scripts/amp_covcap_sampler.py index 9753d7ec0..da51b210a 100644 --- a/workflow/scripts/map_bin_amp_sampler.py +++ b/workflow/scripts/amp_covcap_sampler.py @@ -91,7 +91,7 @@ def create_read_mappings(mm2_paf): def bin_mappings(amp_bins, mappings): - done = list() + binned = list() na = list() while len(mappings) > 0: if mappings[0].tend <= amp_bins[0].end + 5: @@ -102,23 +102,46 @@ def bin_mappings(amp_bins, mappings): 
na.append(mappings[0].qname) mappings.pop(0) else: - done.append(amp_bins[0]) + binned.append(amp_bins[0]) amp_bins.pop(0) - for bin in done: + for bin in binned: bin.random_sample(10) print(bin.name, len(bin.reads), "selected:", len(bin.selected)) print("na", len(na)) + return binned + +def write_capped_reads(binned, reads, out): + all_picks = ["@"+ name for amp in binned for name in amp.selected] + print(all_picks) + with open(reads, "r") as fq, open(out, "w") as fa: + for line in fq: + if line.startswith("@"): + readname = line.split(" ")[0] + print(readname) + if readname in all_picks: + readname = readname.replace("@", ">") + fa.write(readname + "\n") + fa.write(next(fq)) + if __name__ == "__main__": import sys mm2_paf = sys.argv[1] primer_bed = sys.argv[2] + reads = sys.argv[3] + out = sys.argv[3] + "_capped" + + # primer_bed = snakemake.output[0] + # mm2_paf = snakemake.input[1] + # reads = snakemake.input[2] + primers = create_primer_objs(primer_bed) amps = generate_amps(primers) mappings = create_read_mappings(mm2_paf) - bin_mappings(amps, mappings) + binned = bin_mappings(amps, mappings) + write_capped_reads(binned, reads, out) \ No newline at end of file From 420e3a7917c8ee2c5a9bd7809cb01d07c09b7c77 Mon Sep 17 00:00:00 2001 From: simakro <74452164+simakro@users.noreply.github.com> Date: Tue, 15 Feb 2022 17:23:14 +0100 Subject: [PATCH 03/53] change in- and output file-extensions for all rules downstream of cap_cov_amp from fastq to fasta; changed also in get_reads_by_stage --- workflow/rules/common.smk | 6 +++--- workflow/rules/long_read.smk | 28 +++++++++++++++------------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index d459a0d8d..53fba39df 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -1407,11 +1407,11 @@ def get_reads_by_stage(wildcards): if wildcards.stage == "raw": return get_fastqs(wildcards) elif wildcards.stage == "trimmed": - return 
"results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fastq" + return "results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta" elif wildcards.stage == "clipped": - return "results/{date}/trimmed/porechop/primer_clipped/{sample}.fastq" + return "results/{date}/trimmed/porechop/primer_clipped/{sample}.fasta" elif wildcards.stage == "filtered": - return "results/{date}/trimmed/nanofilt/{sample}.fastq" + return "results/{date}/trimmed/nanofilt/{sample}.fasta" def get_polished_sequence(wildcards): diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index 7e8a61f49..4db851698 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -33,7 +33,7 @@ rule nanofilt: conda: "../envs/nanofilt.yaml" shell: - "NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 700 {input} > {output} 2> {log}" + "NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 800 {input} > {output} 2> {log}" # --maxlength 700 # rule count_fastq_reads: @@ -60,23 +60,24 @@ rule minimap_to_reference: "minimap2 -x map-ont {reference} {reads} -o {output}" -rule amp_based_binning_downsampling: +rule cap_cov_amp: input: + primer=get_artic_primer, + mappings="results/{date}/minimappings/{sample}.paf", reads=get_fastqs, - mappings="results/{date}/minimappings/{sample}.paf" output: - "results/{date}/normalize_reads/{sample}_ds.fastq" + "results/{date}/normalize_reads/{sample}_cap.fasta" + script: + "../scripts/amp_covcap_sampler.py" - - # Intermediate number of threads (4-8) achieve best speedup of a+btrimming. # For large files 8 threads help accelerate some, small files are processed faster with 4 threads. 
rule porechop_adapter_barcode_trimming: input: - get_fastqs, + "results/{date}/normalize_reads/{sample}_cap.fasta" output: - temp("results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fastq"), + temp("results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta"), conda: "../envs/porechop.yaml" log: @@ -107,11 +108,11 @@ rule customize_primer_porechop: rule porechop_primer_trimming: input: fastq_in=( - "results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fastq" + "results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta" ), repl_flag="results/.indicators/replacement_notice.txt", output: - temp("results/{date}/trimmed/porechop/primer_clipped/{sample}.fastq"), + temp("results/{date}/trimmed/porechop/primer_clipped/{sample}.fasta"), conda: "../envs/primechop.yaml" log: @@ -126,9 +127,9 @@ rule porechop_primer_trimming: # rule nanofilt: # input: -# "results/{date}/trimmed/porechop/primer_clipped/{sample}.fastq", +# "results/{date}/trimmed/porechop/primer_clipped/{sample}.fasta", # output: -# temp("results/{date}/trimmed/nanofilt/{sample}.fastq"), +# temp("results/{date}/trimmed/nanofilt/{sample}.fasta"), # log: # "logs/{date}/nanofilt/{sample}.log", # params: @@ -142,7 +143,8 @@ rule porechop_primer_trimming: rule canu_correct: input: - "results/{date}/trimmed/nanofilt/{sample}.fastq", + # "results/{date}/trimmed/nanofilt/{sample}.fasta", + "results/{date}/trimmed/porechop/primer_clipped/{sample}.fasta" output: "results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", log: From d85397702248d550f988bfd87de98184233efff4 Mon Sep 17 00:00:00 2001 From: simakro Date: Tue, 15 Feb 2022 19:37:29 +0000 Subject: [PATCH 04/53] fixed several snakemake related bugs --- config/pep/samples.csv | 4 +- workflow/rules/long_read.smk | 28 +++++----- workflow/scripts/amp_covcap_sampler.py | 71 +++++++++++++++----------- 3 files changed, 59 insertions(+), 44 deletions(-) diff --git a/config/pep/samples.csv 
b/config/pep/samples.csv index 4173923ea..0416fdea8 100644 --- a/config/pep/samples.csv +++ b/config/pep/samples.csv @@ -1,4 +1,2 @@ sample_name,fq1,fq2,date,is_amplicon_data,technology,include_in_high_genome_summary -SAMPLE_NAME_1,PATH/TO/fq1,PATH/TO/fq2,SEQUENCING_DATE,0,illumina,1 # Required information for a sample sequencing on the Illumina platform -SAMPLE_NAME_2,PATH/TO/fq,,SEQUENCING_DATE,1,ont,0 # Required information for a sample sequencing on the Oxford Nanopore platform -SAMPLE_NAME_3,PATH/TO/fq,,SEQUENCING_DATE,1,ion,0 # Required information for a sample sequencing on the Ion Torrent platform +UnCoVar_RefDataSet_Batch2_BC03_cat,/home/simon/uncovar/data/newdate/UnCoVar_RefDataSet_Batch2_BC03_cat.fastq,,newdate,1,ont,0 diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index 4db851698..5b1e702aa 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -21,9 +21,11 @@ rule nanoQC: rule nanofilt: input: - "results/{date}/trimmed/porechop/primer_clipped/{sample}.fastq", - output: + # get_reads_by_stage, get_fastqs, + output: + "results/{date}/filtered/nanofilt/{sample}.fastq", + # "results/{date}/trimmed/porechop/primer_clipped/{sample}.fastq", # temp("results/{date}/trimmed/nanofilt/{sample}.fastq"), log: "logs/{date}/nanofilt/{sample}.log", @@ -33,7 +35,7 @@ rule nanofilt: conda: "../envs/nanofilt.yaml" shell: - "NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 800 {input} > {output} 2> {log}" # --maxlength 700 + "NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 800 {input} > {output} 2> {log}" # --maxlength 700 # rule count_fastq_reads: @@ -48,34 +50,36 @@ rule nanofilt: # shell: # "echo $(( $(cat {input} | wc -l ) / 4)) > {output} 2> {log}" + rule minimap_to_reference: input: - reads=get_fastqs, + reads="results/{date}/filtered/nanofilt/{sample}.fastq", reference="resources/genomes/main.fasta", output: - "results/{date}/minimappings/{sample}.paf" + 
"results/{date}/minimappings/{sample}.paf", conda: "../envs/minimap2.yaml" shell: - "minimap2 -x map-ont {reference} {reads} -o {output}" + "minimap2 -x map-ont {input.reference} {input.reads} -o {output}" rule cap_cov_amp: input: - primer=get_artic_primer, + primer="/home/simon/uncovar/.tests/resources/nCoV-2019.primer.bed", mappings="results/{date}/minimappings/{sample}.paf", - reads=get_fastqs, + reads="results/{date}/filtered/nanofilt/{sample}.fastq", + # reads=get_fastqs, output: - "results/{date}/normalize_reads/{sample}_cap.fasta" + "results/{date}/normalize_reads/{sample}_cap.fasta", script: "../scripts/amp_covcap_sampler.py" - + # Intermediate number of threads (4-8) achieve best speedup of a+btrimming. # For large files 8 threads help accelerate some, small files are processed faster with 4 threads. rule porechop_adapter_barcode_trimming: input: - "results/{date}/normalize_reads/{sample}_cap.fasta" + "results/{date}/normalize_reads/{sample}_cap.fasta", output: temp("results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta"), conda: @@ -144,7 +148,7 @@ rule porechop_primer_trimming: rule canu_correct: input: # "results/{date}/trimmed/nanofilt/{sample}.fasta", - "results/{date}/trimmed/porechop/primer_clipped/{sample}.fasta" + "results/{date}/trimmed/porechop/primer_clipped/{sample}.fasta", output: "results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", log: diff --git a/workflow/scripts/amp_covcap_sampler.py b/workflow/scripts/amp_covcap_sampler.py index da51b210a..b46b4823a 100644 --- a/workflow/scripts/amp_covcap_sampler.py +++ b/workflow/scripts/amp_covcap_sampler.py @@ -2,7 +2,22 @@ class Mapping: - def __init__(self, qname, qlen, qstart, qend, samestrand, tname, tlen, tstart, tend, matches, total_bp, qual, kwattr): + def __init__( + self, + qname, + qlen, + qstart, + qend, + samestrand, + tname, + tlen, + tstart, + tend, + matches, + total_bp, + qual, + kwattr, + ): self.qname = qname self.qlen = int(qlen) self.qstart = 
int(qstart) @@ -17,7 +32,7 @@ def __init__(self, qname, qlen, qstart, qend, samestrand, tname, tlen, tstart, t self.qual = int(qual) self.kwattr = kwattr self.gen_kw_attr() - + def gen_kw_attr(self): kwattr_dict = {kw.split(":")[0]: kw.split(":")[-1] for kw in self.kwattr} for key in kwattr_dict: @@ -34,7 +49,6 @@ def __init__(self, contig, start, end, name, score, strand): self.strand = strand self.type = "primary" self.amp_no, self.pos = self.get_name_infos() - def get_name_infos(self): ls = self.name.split("_") @@ -42,7 +56,7 @@ def get_name_infos(self): self.type = "alt" self.alt_name = ls[3] return ls[1], ls[2] - + class Amp: def __init__(self, amp_no, primers): @@ -51,14 +65,12 @@ def __init__(self, amp_no, primers): self.start = min([primer.start for primer in primers]) self.end = max([primer.end for primer in primers]) self.reads = list() - def random_sample(self, cutoff): if len(self.reads) > cutoff: self.selected = random.choices(self.reads, k=cutoff) else: self.selected = self.reads - def create_primer_objs(primer_bed): @@ -78,7 +90,7 @@ def generate_amps(primers): amps.append(ao) return sorted(amps, key=lambda x: x.name) - + def create_read_mappings(mm2_paf): with open(mm2_paf, "r") as paf: mappings = list() @@ -93,33 +105,35 @@ def create_read_mappings(mm2_paf): def bin_mappings(amp_bins, mappings): binned = list() na = list() - while len(mappings) > 0: - if mappings[0].tend <= amp_bins[0].end + 5: - if mappings[0].tstart >= amp_bins[0].start - 5: - amp_bins[0].reads.append(mappings[0].qname) - mappings.pop(0) + while len(amp_bins) > 0: + if len(mappings) > 0: + if mappings[0].tend <= amp_bins[0].end + 5: + if mappings[0].tstart >= amp_bins[0].start - 5: + amp_bins[0].reads.append(mappings[0].qname) + mappings.pop(0) + else: + na.append(mappings[0].qname) + mappings.pop(0) else: - na.append(mappings[0].qname) - mappings.pop(0) - else: - binned.append(amp_bins[0]) - amp_bins.pop(0) + binned.append(amp_bins[0]) + amp_bins.pop(0) for bin in binned: - 
bin.random_sample(10) + bin.random_sample(200) print(bin.name, len(bin.reads), "selected:", len(bin.selected)) print("na", len(na)) return binned + def write_capped_reads(binned, reads, out): - all_picks = ["@"+ name for amp in binned for name in amp.selected] + all_picks = ["@" + name for amp in binned for name in amp.selected] print(all_picks) with open(reads, "r") as fq, open(out, "w") as fa: for line in fq: if line.startswith("@"): readname = line.split(" ")[0] - print(readname) + # print(readname) if readname in all_picks: readname = readname.replace("@", ">") fa.write(readname + "\n") @@ -129,19 +143,18 @@ def write_capped_reads(binned, reads, out): if __name__ == "__main__": import sys - mm2_paf = sys.argv[1] - primer_bed = sys.argv[2] - reads = sys.argv[3] - out = sys.argv[3] + "_capped" - - # primer_bed = snakemake.output[0] - # mm2_paf = snakemake.input[1] - # reads = snakemake.input[2] + # mm2_paf = sys.argv[1] + # primer_bed = sys.argv[2] + # reads = sys.argv[3] + # out = reads + "_capped" + primer_bed = snakemake.input[0] + mm2_paf = snakemake.input[1] + reads = snakemake.input[2] + out = snakemake.output[0] primers = create_primer_objs(primer_bed) amps = generate_amps(primers) mappings = create_read_mappings(mm2_paf) binned = bin_mappings(amps, mappings) write_capped_reads(binned, reads, out) - \ No newline at end of file From c303aa8c45efa308170b664ef46add780dba84f0 Mon Sep 17 00:00:00 2001 From: simakro Date: Wed, 16 Feb 2022 12:05:09 +0000 Subject: [PATCH 05/53] multiple restorations to repair broken workflow --- workflow/envs/canu.yaml | 3 +- workflow/envs/seqtk.yaml | 5 + workflow/rules/long_read.smk | 224 +++++++++++++++++++------ workflow/scripts/amp_covcap_sampler.py | 2 - 4 files changed, 178 insertions(+), 56 deletions(-) create mode 100644 workflow/envs/seqtk.yaml diff --git a/workflow/envs/canu.yaml b/workflow/envs/canu.yaml index 76790e043..e685b6a23 100644 --- a/workflow/envs/canu.yaml +++ b/workflow/envs/canu.yaml @@ -2,5 +2,6 @@ 
channels: - bioconda - conda-forge dependencies: - - canu = 2.2 + # - canu = 2.2 + - canu = 2.1.1 - minimap2 = 2.22 diff --git a/workflow/envs/seqtk.yaml b/workflow/envs/seqtk.yaml new file mode 100644 index 000000000..123ad1616 --- /dev/null +++ b/workflow/envs/seqtk.yaml @@ -0,0 +1,5 @@ +channels: + - bioconda + - conda-forge +dependencies: + - seqtk = 1.3 diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index 5b1e702aa..aa7684d3e 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -38,22 +38,22 @@ rule nanofilt: "NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 800 {input} > {output} 2> {log}" # --maxlength 700 -# rule count_fastq_reads: -# input: -# get_reads_by_stage, -# output: -# temp("results/{date}/tables/fastq-read-counts/{stage}~{sample}.txt"), -# log: -# "logs/{date}/count_reads/{stage}~{sample}.log", -# conda: -# "../envs/unix.yaml" -# shell: -# "echo $(( $(cat {input} | wc -l ) / 4)) > {output} 2> {log}" +rule convert2fasta: + input: + "results/{date}/filtered/nanofilt/{sample}.fastq", + # "results/{barcode}/trim-filt/{barcode}_tf.fastq" + output: + "results/{date}/filtered/nanofilt/{sample}.fasta", + conda: + "../envs/seqtk.yaml" + shell: + "seqtk seq -A {input} > {output}" rule minimap_to_reference: input: - reads="results/{date}/filtered/nanofilt/{sample}.fastq", + # reads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", + reads="results/{date}/filtered/nanofilt/{sample}.fasta", reference="resources/genomes/main.fasta", output: "results/{date}/minimappings/{sample}.paf", @@ -75,6 +75,122 @@ rule cap_cov_amp: "../scripts/amp_covcap_sampler.py" +# rule canu_correct: +# input: +# "results/{date}/normalize_reads/{sample}_cap.fasta", +# # "results/{date}/filtered/nanofilt/{sample}.fasta", +# # "results/{date}/trimmed/porechop/primer_clipped/{sample}.fasta", +# output: +# "results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", +# log: +# 
"logs/{date}/canu/correct/{sample}.log", +# params: +# outdir=get_output_dir, +# concurrency=lambda w, threads: get_canu_concurrency(threads), +# # min_length=config["quality-criteria"]["ont"]["min-length-reads"], +# # for_testing=lambda w, threads: get_if_testing( +# # f"corThreads={threads} redMemory=6 oeaMemory=6" +# # ), +# conda: +# "../envs/canu.yaml" +# threads: 64 +# shell: +# """ +# canu -correct -nanopore {input} -p {wildcards.sample} -d {params.outdir} genomeSize=30k minOverlapLength=10 minReadLength=200 \ +# useGrid=false\ +# corMMapMerSize=10 corOutCoverage=50000 corMinCoverage=0 maxInputCoverage=20000 \ +# corOverlapper=minimap utgOverlapper=minimap obtOverlapper=minimap \ +# corConcurrency={params.concurrency} \ +# cormhapConcurrency={params.concurrency} cormhapThreads={params.concurrency} \ +# cormmapConcurrency={params.concurrency} cormmapThreads={params.concurrency} \ +# obtmmapConcurrency={params.concurrency} obtmmapThreads={params.concurrency} \ +# utgmmapConcurrency={params.concurrency} utgmmapThreads={params.concurrency} \ +# redConcurrency={params.concurrency} redThreads={params.concurrency} \ +# ovbConcurrency={params.concurrency} \ +# ovsConcurrency={params.concurrency} \ +# oeaConcurrency={params.concurrency} +# 2> {log} +# """ + + +rule canu_correct: + input: + "results/{date}/normalize_reads/{sample}_cap.fasta", + # "results/{date}/filtered/nanofilt/{sample}.fasta", + # "results/{date}/trimmed/porechop/primer_clipped/{sample}.fasta", + output: + "results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", + log: + "logs/{date}/canu/correct/{sample}.log", + params: + outdir=get_output_dir, + concurrency=lambda w, threads: get_canu_concurrency(threads), + min_length=config["quality-criteria"]["ont"]["min-length-reads"], + for_testing=lambda w, threads: get_if_testing( + f"corThreads={threads} redMemory=6 oeaMemory=6" + ), + conda: + "../envs/canu.yaml" + threads: 16 + shell: + """ + ( if [ -d {params.outdir} ]; then rm -Rf 
{params.outdir}; fi && + canu -correct -nanopore {input} -p {wildcards.sample} -d {params.outdir} genomeSize=30k minOverlapLength=10 minReadLength=200 \ + useGrid=false {params.for_testing} \ + corMMapMerSize=10 corOutCoverage=50000 corMinCoverage=0 maxInputCoverage=20000 \ + corOverlapper=minimap utgOverlapper=minimap obtOverlapper=minimap \ + corConcurrency={params.concurrency} \ + cormhapConcurrency={params.concurrency} cormhapThreads={params.concurrency} \ + cormmapConcurrency={params.concurrency} cormmapThreads={params.concurrency} \ + obtmmapConcurrency={params.concurrency} obtmmapThreads={params.concurrency} \ + utgmmapConcurrency={params.concurrency} utgmmapThreads={params.concurrency} \ + redConcurrency={params.concurrency} redThreads={params.concurrency} \ + ovbConcurrency={params.concurrency} \ + ovsConcurrency={params.concurrency} \ + oeaConcurrency={params.concurrency} + ) + 2> {log} + """ + + +# rule count_fastq_reads: +# input: +# get_reads_by_stage, +# output: +# temp("results/{date}/tables/fastq-read-counts/{stage}~{sample}.txt"), +# log: +# "logs/{date}/count_reads/{stage}~{sample}.log", +# conda: +# "../envs/unix.yaml" +# shell: +# "echo $(( $(cat {input} | wc -l ) / 4)) > {output} 2> {log}" + + +# rule minimap_to_reference: +# input: +# reads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", +# # reads="results/{date}/filtered/nanofilt/{sample}.fastq", +# reference="resources/genomes/main.fasta", +# output: +# "results/{date}/minimappings/{sample}.paf", +# conda: +# "../envs/minimap2.yaml" +# shell: +# "minimap2 -x map-ont {input.reference} {input.reads} -o {output}" + + +# rule cap_cov_amp: +# input: +# primer="/home/simon/uncovar/.tests/resources/nCoV-2019.primer.bed", +# mappings="results/{date}/minimappings/{sample}.paf", +# reads="results/{date}/filtered/nanofilt/{sample}.fastq", +# # reads=get_fastqs, +# output: +# "results/{date}/normalize_reads/{sample}_cap.fasta", +# script: +# "../scripts/amp_covcap_sampler.py" + + 
# Intermediate number of threads (4-8) achieve best speedup of a+btrimming. # For large files 8 threads help accelerate some, small files are processed faster with 4 threads. rule porechop_adapter_barcode_trimming: @@ -116,7 +232,7 @@ rule porechop_primer_trimming: ), repl_flag="results/.indicators/replacement_notice.txt", output: - temp("results/{date}/trimmed/porechop/primer_clipped/{sample}.fasta"), + "results/{date}/trimmed/porechop/primer_clipped/{sample}.fasta", conda: "../envs/primechop.yaml" log: @@ -145,52 +261,54 @@ rule porechop_primer_trimming: # "NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 500 {input} > {output} 2> {log}" -rule canu_correct: - input: - # "results/{date}/trimmed/nanofilt/{sample}.fasta", - "results/{date}/trimmed/porechop/primer_clipped/{sample}.fasta", - output: - "results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", - log: - "logs/{date}/canu/assemble/{sample}.log", - params: - outdir=get_output_dir, - concurrency=lambda w, threads: get_canu_concurrency(threads), - min_length=config["quality-criteria"]["ont"]["min-length-reads"], - for_testing=lambda w, threads: get_if_testing( - f"corThreads={threads} redMemory=6 oeaMemory=6" - ), - conda: - "../envs/canu.yaml" - threads: 16 - shell: - """ - ( if [ -d {params.outdir} ]; then rm -Rf {params.outdir}; fi && - canu -correct -nanopore {input} -p {wildcards.sample} -d {params.outdir} genomeSize=30k minOverlapLength=10 minReadLength=200 \ - useGrid=false {params.for_testing} \ - corMMapMerSize=10 corOutCoverage=50000 corMinCoverage=0 maxInputCoverage=20000 \ - corOverlapper=minimap utgOverlapper=minimap obtOverlapper=minimap \ - corConcurrency={params.concurrency} \ - cormhapConcurrency={params.concurrency} cormhapThreads={params.concurrency} \ - cormmapConcurrency={params.concurrency} cormmapThreads={params.concurrency} \ - obtmmapConcurrency={params.concurrency} obtmmapThreads={params.concurrency} \ - 
utgmmapConcurrency={params.concurrency} utgmmapThreads={params.concurrency} \ - redConcurrency={params.concurrency} redThreads={params.concurrency} \ - ovbConcurrency={params.concurrency} \ - ovsConcurrency={params.concurrency} \ - oeaConcurrency={params.concurrency} - ) - 2> {log} - """ +# rule canu_correct: +# input: +# # "results/{date}/trimmed/nanofilt/{sample}.fasta", +# "results/{date}/trimmed/porechop/primer_clipped/{sample}.fasta", +# output: +# "results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", +# log: +# "logs/{date}/canu/assemble/{sample}.log", +# params: +# outdir=get_output_dir, +# concurrency=lambda w, threads: get_canu_concurrency(threads), +# min_length=config["quality-criteria"]["ont"]["min-length-reads"], +# for_testing=lambda w, threads: get_if_testing( +# f"corThreads={threads} redMemory=6 oeaMemory=6" +# ), +# conda: +# "../envs/canu.yaml" +# threads: 16 +# shell: +# """ +# ( if [ -d {params.outdir} ]; then rm -Rf {params.outdir}; fi && +# canu -correct -nanopore {input} -p {wildcards.sample} -d {params.outdir} genomeSize=30k minOverlapLength=10 minReadLength=200 \ +# useGrid=false {params.for_testing} \ +# corMMapMerSize=10 corOutCoverage=50000 corMinCoverage=0 maxInputCoverage=20000 \ +# corOverlapper=minimap utgOverlapper=minimap obtOverlapper=minimap \ +# corConcurrency={params.concurrency} \ +# cormhapConcurrency={params.concurrency} cormhapThreads={params.concurrency} \ +# cormmapConcurrency={params.concurrency} cormmapThreads={params.concurrency} \ +# obtmmapConcurrency={params.concurrency} obtmmapThreads={params.concurrency} \ +# utgmmapConcurrency={params.concurrency} utgmmapThreads={params.concurrency} \ +# redConcurrency={params.concurrency} redThreads={params.concurrency} \ +# ovbConcurrency={params.concurrency} \ +# ovsConcurrency={params.concurrency} \ +# oeaConcurrency={params.concurrency} +# ) +# 2> {log} +# """ # rule medaka_consensus_reference: use rule assembly_polishing_ont as medaka_consensus_reference 
with: input: - fasta="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", + # Don´t ever use corrected reads as input for medaka, it is supposed to polish with rwa-reads! + # fasta="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", + fasta="results/{date}/filtered/nanofilt/{sample}.fasta", reference="resources/genomes/main.fasta", output: - temp("results/{date}/consensus/medaka/{sample}/{sample}.fasta"), + "results/{date}/consensus/medaka/{sample}/{sample}.fasta", # polish consensus @@ -200,7 +318,7 @@ rule bcftools_consensus_ont: bcf="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.bcf", # clonal vs. subclonal? bcfidx="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.bcf.csi", output: - temp("results/{date}/consensus/bcftools/{sample}.fasta"), + "results/{date}/consensus/bcftools/{sample}.fasta", log: "logs/{date}/bcftools-consensus-ont/{sample}.log", conda: diff --git a/workflow/scripts/amp_covcap_sampler.py b/workflow/scripts/amp_covcap_sampler.py index b46b4823a..116d24fac 100644 --- a/workflow/scripts/amp_covcap_sampler.py +++ b/workflow/scripts/amp_covcap_sampler.py @@ -128,12 +128,10 @@ def bin_mappings(amp_bins, mappings): def write_capped_reads(binned, reads, out): all_picks = ["@" + name for amp in binned for name in amp.selected] - print(all_picks) with open(reads, "r") as fq, open(out, "w") as fa: for line in fq: if line.startswith("@"): readname = line.split(" ")[0] - # print(readname) if readname in all_picks: readname = readname.replace("@", ">") fa.write(readname + "\n") From e77580ca25455cd97ce8099d4ec7760376790b2e Mon Sep 17 00:00:00 2001 From: simakro Date: Thu, 17 Feb 2022 16:04:38 +0000 Subject: [PATCH 06/53] established custom script for trimming of primers work-title: map-trim --- workflow/envs/canu.yaml | 2 +- workflow/rules/long_read.smk | 256 ++++++++++------------- workflow/scripts/amp_covcap_sampler.py | 77 +++++-- 
workflow/scripts/map_trim.py | 268 +++++++++++++++++++++++++ 4 files changed, 438 insertions(+), 165 deletions(-) create mode 100644 workflow/scripts/map_trim.py diff --git a/workflow/envs/canu.yaml b/workflow/envs/canu.yaml index e685b6a23..2199be94a 100644 --- a/workflow/envs/canu.yaml +++ b/workflow/envs/canu.yaml @@ -2,6 +2,6 @@ channels: - bioconda - conda-forge dependencies: - # - canu = 2.2 + # do not use canu 2.2 ! - canu = 2.1.1 - minimap2 = 2.22 diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index aa7684d3e..1aa07d4fa 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -24,9 +24,9 @@ rule nanofilt: # get_reads_by_stage, get_fastqs, output: - "results/{date}/filtered/nanofilt/{sample}.fastq", # "results/{date}/trimmed/porechop/primer_clipped/{sample}.fastq", # temp("results/{date}/trimmed/nanofilt/{sample}.fastq"), + "results/{date}/filtered/nanofilt/{sample}.fastq", log: "logs/{date}/nanofilt/{sample}.log", params: @@ -35,13 +35,13 @@ rule nanofilt: conda: "../envs/nanofilt.yaml" shell: - "NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 800 {input} > {output} 2> {log}" # --maxlength 700 + "NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 600 {input} > {output} 2> {log}" # --maxlength 700 rule convert2fasta: input: - "results/{date}/filtered/nanofilt/{sample}.fastq", # "results/{barcode}/trim-filt/{barcode}_tf.fastq" + "results/{date}/filtered/nanofilt/{sample}.fastq", output: "results/{date}/filtered/nanofilt/{sample}.fasta", conda: @@ -50,7 +50,7 @@ rule convert2fasta: "seqtk seq -A {input} > {output}" -rule minimap_to_reference: +rule map_for_capping: input: # reads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", reads="results/{date}/filtered/nanofilt/{sample}.fasta", @@ -60,64 +60,45 @@ rule minimap_to_reference: conda: "../envs/minimap2.yaml" shell: - "minimap2 -x map-ont {input.reference} {input.reads} -o {output}" 
+ "minimap2 -x map-ont {input.reference} {input.reads} -o {output} --secondary=no" rule cap_cov_amp: input: + # reads=get_fastqs, primer="/home/simon/uncovar/.tests/resources/nCoV-2019.primer.bed", mappings="results/{date}/minimappings/{sample}.paf", reads="results/{date}/filtered/nanofilt/{sample}.fastq", - # reads=get_fastqs, output: "results/{date}/normalize_reads/{sample}_cap.fasta", + "results/{date}/normalize_reads/{sample}_cap.json", script: "../scripts/amp_covcap_sampler.py" -# rule canu_correct: -# input: -# "results/{date}/normalize_reads/{sample}_cap.fasta", -# # "results/{date}/filtered/nanofilt/{sample}.fasta", -# # "results/{date}/trimmed/porechop/primer_clipped/{sample}.fasta", -# output: -# "results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", -# log: -# "logs/{date}/canu/correct/{sample}.log", -# params: -# outdir=get_output_dir, -# concurrency=lambda w, threads: get_canu_concurrency(threads), -# # min_length=config["quality-criteria"]["ont"]["min-length-reads"], -# # for_testing=lambda w, threads: get_if_testing( -# # f"corThreads={threads} redMemory=6 oeaMemory=6" -# # ), -# conda: -# "../envs/canu.yaml" -# threads: 64 -# shell: -# """ -# canu -correct -nanopore {input} -p {wildcards.sample} -d {params.outdir} genomeSize=30k minOverlapLength=10 minReadLength=200 \ -# useGrid=false\ -# corMMapMerSize=10 corOutCoverage=50000 corMinCoverage=0 maxInputCoverage=20000 \ -# corOverlapper=minimap utgOverlapper=minimap obtOverlapper=minimap \ -# corConcurrency={params.concurrency} \ -# cormhapConcurrency={params.concurrency} cormhapThreads={params.concurrency} \ -# cormmapConcurrency={params.concurrency} cormmapThreads={params.concurrency} \ -# obtmmapConcurrency={params.concurrency} obtmmapThreads={params.concurrency} \ -# utgmmapConcurrency={params.concurrency} utgmmapThreads={params.concurrency} \ -# redConcurrency={params.concurrency} redThreads={params.concurrency} \ -# ovbConcurrency={params.concurrency} \ -# 
ovsConcurrency={params.concurrency} \ -# oeaConcurrency={params.concurrency} -# 2> {log} -# """ +# Intermediate number of threads (4-8) achieve best speedup of a+btrimming. +# For large files 8 threads help accelerate some, small files are processed faster with 4 threads. +rule porechop_adapter_barcode_trimming: + input: + "results/{date}/normalize_reads/{sample}_cap.fasta", + output: + # temp("results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta"), + "results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta", + conda: + "../envs/porechop.yaml" + log: + "logs/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.log", + threads: 8 + shell: + "porechop -i {input} -o {output} -t {threads} -v 1 > {log} 2>&1" rule canu_correct: input: - "results/{date}/normalize_reads/{sample}_cap.fasta", + # "results/{date}/normalize_reads/{sample}_cap.fasta", # "results/{date}/filtered/nanofilt/{sample}.fasta", # "results/{date}/trimmed/porechop/primer_clipped/{sample}.fasta", + "results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta", output: "results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", log: @@ -153,73 +134,45 @@ rule canu_correct: """ -# rule count_fastq_reads: -# input: -# get_reads_by_stage, -# output: -# temp("results/{date}/tables/fastq-read-counts/{stage}~{sample}.txt"), -# log: -# "logs/{date}/count_reads/{stage}~{sample}.log", -# conda: -# "../envs/unix.yaml" -# shell: -# "echo $(( $(cat {input} | wc -l ) / 4)) > {output} 2> {log}" - - -# rule minimap_to_reference: -# input: -# reads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", -# # reads="results/{date}/filtered/nanofilt/{sample}.fastq", -# reference="resources/genomes/main.fasta", -# output: -# "results/{date}/minimappings/{sample}.paf", -# conda: -# "../envs/minimap2.yaml" -# shell: -# "minimap2 -x map-ont {input.reference} {input.reads} -o {output}" - - -# rule cap_cov_amp: -# input: -# 
primer="/home/simon/uncovar/.tests/resources/nCoV-2019.primer.bed", -# mappings="results/{date}/minimappings/{sample}.paf", -# reads="results/{date}/filtered/nanofilt/{sample}.fastq", -# # reads=get_fastqs, -# output: -# "results/{date}/normalize_reads/{sample}_cap.fasta", -# script: -# "../scripts/amp_covcap_sampler.py" - - -# Intermediate number of threads (4-8) achieve best speedup of a+btrimming. -# For large files 8 threads help accelerate some, small files are processed faster with 4 threads. -rule porechop_adapter_barcode_trimming: +rule map_for_trimming: input: - "results/{date}/normalize_reads/{sample}_cap.fasta", + # reads="results/{date}/filtered/nanofilt/{sample}.fasta", + reads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", + reference="resources/genomes/main.fasta", output: - temp("results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta"), + "results/{date}/minimappings/trimming/{sample}_trim.paf", conda: - "../envs/porechop.yaml" - log: - "logs/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.log", - threads: 8 + "../envs/minimap2.yaml" shell: - "porechop -i {input} -o {output} -t {threads} -v 1 > {log} 2>&1" + "minimap2 -x map-ont {input.reference} {input.reads} -o {output} --secondary=no" -rule customize_primer_porechop: +rule trim_primers: input: - get_artic_primer, + # reads=get_fastqs, + primer="/home/simon/uncovar/.tests/resources/nCoV-2019.primer.bed", + mappings="results/{date}/minimappings/trimming/{sample}.paf", + reads="results/{date}/filtered/nanofilt/{sample}.fastq", output: - "results/.indicators/replacement_notice.txt", - conda: - "../envs/primechop.yaml" - log: - "logs/customize_primer_porechop.log", - shell: - "(cp {input} $CONDA_PREFIX/lib/python3.6/site-packages/porechop/adapters.py && " - 'echo "replaced adpaters in adapter.py file in $CONDA_PREFIX/lib/python3.6/site-packages/porechop/adapters.py with ARTICv3 primers" > {output}) ' - "2> {log}" + 
"results/{date}/normalize_reads/{sample}_cap.fasta", + "results/{date}/normalize_reads/{sample}_cap.json", + script: + "../scripts/amp_covcap_sampler.py" + + +# rule customize_primer_porechop: +# input: +# get_artic_primer, +# output: +# "results/.indicators/replacement_notice.txt", +# conda: +# "../envs/primechop.yaml" +# log: +# "logs/customize_primer_porechop.log", +# shell: +# "(cp {input} $CONDA_PREFIX/lib/python3.6/site-packages/porechop/adapters.py && " +# 'echo "replaced adpaters in adapter.py file in $CONDA_PREFIX/lib/python3.6/site-packages/porechop/adapters.py with ARTICv3 primers" > {output}) ' +# "2> {log}" # Using a low number of threads (2-4) speed up primer-trimming significantly (>2x), even for large files, @@ -227,12 +180,11 @@ rule customize_primer_porechop: # to barcode+adapter-trimming. However, using only one thread is again very slow. rule porechop_primer_trimming: input: - fastq_in=( - "results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta" - ), + # fastq_in="results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta", + fastq_in="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", repl_flag="results/.indicators/replacement_notice.txt", output: - "results/{date}/trimmed/porechop/primer_clipped/{sample}.fasta", + "results/{date}/trimmed/porechop/primer_clipped/{sample}.corr.abpclip.fasta", conda: "../envs/primechop.yaml" log: @@ -245,58 +197,71 @@ rule porechop_primer_trimming: """ -# rule nanofilt: +# rule count_fastq_reads: # input: -# "results/{date}/trimmed/porechop/primer_clipped/{sample}.fasta", +# get_reads_by_stage, # output: -# temp("results/{date}/trimmed/nanofilt/{sample}.fasta"), +# temp("results/{date}/tables/fastq-read-counts/{stage}~{sample}.txt"), # log: -# "logs/{date}/nanofilt/{sample}.log", -# params: -# min_length=config["quality-criteria"]["ont"]["min-length-reads"], -# min_PHRED=config["quality-criteria"]["ont"]["min-PHRED"], +# 
"logs/{date}/count_reads/{stage}~{sample}.log", # conda: -# "../envs/nanofilt.yaml" +# "../envs/unix.yaml" # shell: -# "NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 500 {input} > {output} 2> {log}" +# "echo $(( $(cat {input} | wc -l ) / 4)) > {output} 2> {log}" -# rule canu_correct: +# # Intermediate number of threads (4-8) achieve best speedup of a+btrimming. +# # For large files 8 threads help accelerate some, small files are processed faster with 4 threads. +# rule porechop_adapter_barcode_trimming: # input: -# # "results/{date}/trimmed/nanofilt/{sample}.fasta", -# "results/{date}/trimmed/porechop/primer_clipped/{sample}.fasta", +# "results/{date}/normalize_reads/{sample}_cap.fasta", +# output: +# # temp("results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta"), +# "results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta" +# conda: +# "../envs/porechop.yaml" +# log: +# "logs/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.log", +# threads: 8 +# shell: +# "porechop -i {input} -o {output} -t {threads} -v 1 > {log} 2>&1" + + +# rule customize_primer_porechop: +# input: +# get_artic_primer, # output: -# "results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", +# "results/.indicators/replacement_notice.txt", +# conda: +# "../envs/primechop.yaml" # log: -# "logs/{date}/canu/assemble/{sample}.log", -# params: -# outdir=get_output_dir, -# concurrency=lambda w, threads: get_canu_concurrency(threads), -# min_length=config["quality-criteria"]["ont"]["min-length-reads"], -# for_testing=lambda w, threads: get_if_testing( -# f"corThreads={threads} redMemory=6 oeaMemory=6" +# "logs/customize_primer_porechop.log", +# shell: +# "(cp {input} $CONDA_PREFIX/lib/python3.6/site-packages/porechop/adapters.py && " +# 'echo "replaced adpaters in adapter.py file in $CONDA_PREFIX/lib/python3.6/site-packages/porechop/adapters.py with ARTICv3 primers" > {output}) ' +# "2> {log}" + + +# # Using a low 
number of threads (2-4) speed up primer-trimming significantly (>2x), even for large files, +# # presumably due to the much higher number of target-sequences for trimming as compared +# # to barcode+adapter-trimming. However, using only one thread is again very slow. +# rule porechop_primer_trimming: +# input: +# fastq_in=( +# "results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta" # ), +# repl_flag="results/.indicators/replacement_notice.txt", +# output: +# "results/{date}/trimmed/porechop/primer_clipped/{sample}.fasta", # conda: -# "../envs/canu.yaml" -# threads: 16 +# "../envs/primechop.yaml" +# log: +# "logs/{date}/trimmed/porechop/primer_clipped/{sample}.log", +# threads: 2 # shell: # """ -# ( if [ -d {params.outdir} ]; then rm -Rf {params.outdir}; fi && -# canu -correct -nanopore {input} -p {wildcards.sample} -d {params.outdir} genomeSize=30k minOverlapLength=10 minReadLength=200 \ -# useGrid=false {params.for_testing} \ -# corMMapMerSize=10 corOutCoverage=50000 corMinCoverage=0 maxInputCoverage=20000 \ -# corOverlapper=minimap utgOverlapper=minimap obtOverlapper=minimap \ -# corConcurrency={params.concurrency} \ -# cormhapConcurrency={params.concurrency} cormhapThreads={params.concurrency} \ -# cormmapConcurrency={params.concurrency} cormmapThreads={params.concurrency} \ -# obtmmapConcurrency={params.concurrency} obtmmapThreads={params.concurrency} \ -# utgmmapConcurrency={params.concurrency} utgmmapThreads={params.concurrency} \ -# redConcurrency={params.concurrency} redThreads={params.concurrency} \ -# ovbConcurrency={params.concurrency} \ -# ovsConcurrency={params.concurrency} \ -# oeaConcurrency={params.concurrency} -# ) -# 2> {log} +# (porechop -i {input.fastq_in} -o {output} --no_split --end_size 35 --extra_end_trim 0 -t {threads} -v 1) 2> {log} +# rm results/.indicators/replacement_notice.txt # """ @@ -305,7 +270,8 @@ use rule assembly_polishing_ont as medaka_consensus_reference with: input: # Don´t ever use corrected reads as input 
for medaka, it is supposed to polish with rwa-reads! # fasta="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", - fasta="results/{date}/filtered/nanofilt/{sample}.fasta", + # fasta="results/{date}/filtered/nanofilt/{sample}.fasta", + "results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta", reference="resources/genomes/main.fasta", output: "results/{date}/consensus/medaka/{sample}/{sample}.fasta", diff --git a/workflow/scripts/amp_covcap_sampler.py b/workflow/scripts/amp_covcap_sampler.py index 116d24fac..91bb986fb 100644 --- a/workflow/scripts/amp_covcap_sampler.py +++ b/workflow/scripts/amp_covcap_sampler.py @@ -1,4 +1,5 @@ import random +from collections import Counter, defaultdict class Mapping: @@ -36,7 +37,7 @@ def __init__( def gen_kw_attr(self): kwattr_dict = {kw.split(":")[0]: kw.split(":")[-1] for kw in self.kwattr} for key in kwattr_dict: - self.key = kwattr_dict[key] + self.__dict__[key] = kwattr_dict[key] class Primer: @@ -64,13 +65,21 @@ def __init__(self, amp_no, primers): self.primers = primers self.start = min([primer.start for primer in primers]) self.end = max([primer.end for primer in primers]) + self.max_len = self.end - self.start + self.fwp_boundary = max( + [prim for prim in primers if prim.pos == "LEFT"], key=lambda x: x.end + ).end + self.revp_boundary = min( + [prim for prim in primers if prim.pos == "RIGHT"], key=lambda x: x.start + ).start + self.read_names = list() self.reads = list() def random_sample(self, cutoff): - if len(self.reads) > cutoff: - self.selected = random.choices(self.reads, k=cutoff) + if len(self.read_names) > cutoff: + self.selected = random.choices(self.read_names, k=cutoff) else: - self.selected = self.reads + self.selected = self.read_names def create_primer_objs(primer_bed): @@ -87,18 +96,38 @@ def generate_amps(primers): amps = list() for num in amp_nums: ao = Amp(num, [primer for primer in primers if primer.amp_no == num]) + print(ao.name, ao.max_len) amps.append(ao) return 
sorted(amps, key=lambda x: x.name) +def filter_read_mappings(mappings): + mappings = [m for m in mappings if 300 < m.qlen < 600] + mappings = [m for m in mappings if m.qual == 60] + # mappings = [m for m in mappings if m.tp == "P"] + return mappings + + def create_read_mappings(mm2_paf): with open(mm2_paf, "r") as paf: - mappings = list() + map_dct = defaultdict(list) for line in paf: if len(line.strip()) > 0: ls = line.strip().split("\t") mapping = Mapping(*ls[:12], ls[12:]) - mappings.append(mapping) + map_dct[mapping.qname].append(mapping) + mult_maps = {n: ml for n, ml in map_dct.items() if len(ml) > 1} + mappings = [m for k, l in map_dct.items() for m in l] + mappings = filter_read_mappings(mappings) + mono_mappings = [m for m in mappings if m.qname not in mult_maps] + dual_mappings = {k: v for k, v in mult_maps.items() if len(v) == 2} + incl_max = [ + max(dual_mappings[mname], key=lambda x: x.matches) + for mname in dual_mappings + ] + incl_max = filter_read_mappings(incl_max) + mono_mappings.extend(incl_max) + mappings = mono_mappings return sorted(mappings, key=lambda x: x.tend) @@ -109,7 +138,7 @@ def bin_mappings(amp_bins, mappings): if len(mappings) > 0: if mappings[0].tend <= amp_bins[0].end + 5: if mappings[0].tstart >= amp_bins[0].start - 5: - amp_bins[0].reads.append(mappings[0].qname) + amp_bins[0].read_names.append(mappings[0].qname) mappings.pop(0) else: na.append(mappings[0].qname) @@ -117,18 +146,26 @@ def bin_mappings(amp_bins, mappings): else: binned.append(amp_bins[0]) amp_bins.pop(0) + else: + break for bin in binned: bin.random_sample(200) - print(bin.name, len(bin.reads), "selected:", len(bin.selected)) + print(bin.name, len(bin.read_names), "selected:", len(bin.selected)) print("na", len(na)) return binned -def write_capped_reads(binned, reads, out): +def write_capped_reads(binned, reads, fa_out, js_out): + # print("Writing json") + # bins_dct = {amp.name:amp.read_names for amp in binned} + # with open(js_out, "w") as js: + # 
json.dump(bins_dct, js) + + print("Writing fasta") all_picks = ["@" + name for amp in binned for name in amp.selected] - with open(reads, "r") as fq, open(out, "w") as fa: + with open(reads, "r") as fq, open(fa_out, "w") as fa: for line in fq: if line.startswith("@"): readname = line.split(" ")[0] @@ -141,18 +178,20 @@ def write_capped_reads(binned, reads, out): if __name__ == "__main__": import sys - # mm2_paf = sys.argv[1] - # primer_bed = sys.argv[2] - # reads = sys.argv[3] - # out = reads + "_capped" + mm2_paf = sys.argv[1] + primer_bed = sys.argv[2] + reads = sys.argv[3] + fa_out = reads + "_capped" + js_out = reads + "_capped.json" - primer_bed = snakemake.input[0] - mm2_paf = snakemake.input[1] - reads = snakemake.input[2] - out = snakemake.output[0] + # primer_bed = snakemake.input[0] + # mm2_paf = snakemake.input[1] + # reads = snakemake.input[2] + # fa_out = snakemake.output[0] + # js_out = snakemake.output[1] primers = create_primer_objs(primer_bed) amps = generate_amps(primers) mappings = create_read_mappings(mm2_paf) binned = bin_mappings(amps, mappings) - write_capped_reads(binned, reads, out) + write_capped_reads(binned, reads, fa_out, js_out) diff --git a/workflow/scripts/map_trim.py b/workflow/scripts/map_trim.py new file mode 100644 index 000000000..a739ac2b6 --- /dev/null +++ b/workflow/scripts/map_trim.py @@ -0,0 +1,268 @@ +# import gzip +from collections import defaultdict + + +class Read: + def __init__(self, header, seq): + self.header = header + self.name = self.header.split(">")[1] + self.seq = seq + # self.prim_clipped_seq = "" + + def clip_primers(self, fwp_boundary, revp_boundary, mapping): + qlen, qstart, qend = mapping.qlen, mapping.qstart, mapping.qend + tstart, tend = mapping.tstart, mapping.tend + + clip_left = 0 + if tstart <= fwp_boundary: + add_clip = fwp_boundary - tstart + clip_left = qstart + add_clip + else: + ldiff = tstart - fwp_boundary + if qstart >= ldiff: + clip_left = qstart - ldiff + + clip_right = qlen + if tend >= 
revp_boundary: + sub_clip = tend - revp_boundary + clip_right = qend - sub_clip + else: + rdiff = revp_boundary - tend + if qlen - qend >= rdiff: + clip_right = qend + rdiff + + self.seq = self.seq[clip_left:clip_right] + print(clip_left, qlen - clip_right) + return clip_left, qlen - clip_right + + +class Mapping: + def __init__( + self, + qname, + qlen, + qstart, + qend, + samestrand, + tname, + tlen, + tstart, + tend, + matches, + total_bp, + qual, + kwattr, + ): + self.qname = qname + self.qlen = int(qlen) + self.qstart = int(qstart) + self.qend = int(qend) + self.samestrand = samestrand + self.tname = tname + self.tlen = int(tlen) + self.tstart = int(tstart) + self.tend = int(tend) + self.matches = int(matches) + self.total_bp = int(total_bp) + self.qual = int(qual) + self.kwattr = kwattr + self.gen_kw_attr() + + def gen_kw_attr(self): + kwattr_dict = {kw.split(":")[0]: kw.split(":")[-1] for kw in self.kwattr} + for key in kwattr_dict: + self.__dict__[key] = kwattr_dict[key] + + +class Primer: + def __init__(self, contig, start, end, name, score, strand): + self.contig = contig + self.start = int(start) + self.end = int(end) + self.name = name + self.score = int(score) + self.strand = strand + self.type = "primary" + self.amp_no, self.pos = self.get_name_infos() + + def get_name_infos(self): + ls = self.name.split("_") + if len(ls) == 4: + self.type = "alt" + self.alt_name = ls[3] + return ls[1], ls[2] + + +class Amp: + def __init__(self, amp_no, primers): + self.name = int(amp_no) + self.primers = primers + self.start = min([primer.start for primer in primers]) + self.end = max([primer.end for primer in primers]) + self.max_len = self.end - self.start + self.fwp_boundary = max( + [prim for prim in primers if prim.pos == "LEFT"], key=lambda x: x.end + ).end + self.revp_boundary = min( + [prim for prim in primers if prim.pos == "RIGHT"], key=lambda x: x.start + ).start + # self.read_names = list() + self.mappings = dict() + self.reads = dict() + + def 
primer_clipping_all(self): + clip_ct_left = 0 + clip_ct_right = 0 + for read in self.reads: + try: + mapping = self.mappings[read] + left, right = self.reads[read].clip_primers( + self.fwp_boundary, self.revp_boundary, mapping + ) + clip_ct_left += left + clip_ct_right += right + except KeyError as e: + print(f"KeyError in primer_clipping_all: {e}") + return clip_ct_left, clip_ct_right + + +def create_primer_objs(primer_bed): + with open(primer_bed, "r") as bed: + primers = list() + for line in bed: + prim = Primer(*line.strip().split("\t")) + primers.append(prim) + return sorted(primers, key=lambda x: x.end) + + +def create_primer_objs(primer_bed): + with open(primer_bed, "r") as bed: + primers = list() + for line in bed: + prim = Primer(*line.strip().split("\t")) + primers.append(prim) + return sorted(primers, key=lambda x: x.end) + + +def generate_amps(primers): + amp_nums = set([primer.amp_no for primer in primers]) + amps = list() + for num in amp_nums: + ao = Amp(num, [primer for primer in primers if primer.amp_no == num]) + amps.append(ao) + return sorted(amps, key=lambda x: x.name) + + +def filter_read_mappings(mappings): + mappings = [m for m in mappings if 300 < m.qlen < 600] + mappings = [m for m in mappings if m.qual == 60] + mappings = [m for m in mappings if m.tp == "P"] + return mappings + + +def create_read_mappings(mm2_paf): + with open(mm2_paf, "r") as paf: + map_dct = defaultdict(list) + for line in paf: + if len(line.strip()) > 0: + ls = line.strip().split("\t") + mapping = Mapping(*ls[:12], ls[12:]) + map_dct[mapping.qname].append(mapping) + mult_maps = {n: ml for n, ml in map_dct.items() if len(ml) > 1} + mappings = [m for k, l in map_dct.items() for m in l] + mappings = filter_read_mappings(mappings) + mono_mappings = [m for m in mappings if m.qname not in mult_maps] + dual_mappings = {k: v for k, v in mult_maps.items() if len(v) == 2} + incl_max = [ + max(dual_mappings[mname], key=lambda x: x.matches) + for mname in dual_mappings + ] + 
incl_max = filter_read_mappings(incl_max) + mono_mappings.extend(incl_max) + mappings = mono_mappings + return sorted(mappings, key=lambda x: (x.tend, x.tstart)) + + +def bin_mappings(amp_bins, mappings): + binned = list() + na = list() + # print(amp_bins) + while len(amp_bins) > 0: + if len(mappings) > 0: + if mappings[0].tend <= amp_bins[0].end + 5: + if mappings[0].tstart >= amp_bins[0].start - 5: + amp_bins[0].mappings[mappings[0].qname] = mappings[0] + mappings.pop(0) + else: + na.append(mappings[0].qname) + mappings.pop(0) + else: + binned.append(amp_bins[0]) + amp_bins.pop(0) + else: + break + + # for bin in binned: + # print(bin.name, "reads:", len(bin.reads)) + # print("na", len(na)) + + return binned + + +def load_reads(read_fasta, amp_bins): + + reads = dict() + with open(read_fasta, "r") as rfa: + for line in rfa: + if line.startswith(">"): + # print(line) + header = line.strip().split(" ")[0] + seq = next(rfa) + read = Read(header, seq.strip()) + reads[read.name] = read + + # print(reads) + + for amp in amp_bins: + print("amp.mappings", len(amp.mappings)) + amp.reads = {k: v for k, v in reads.items() if k in amp.mappings} + print("amp.reads", len(amp.reads)) + # print(amp.reads) + + return amp_bins + + +def clip_and_write_out(amp_bins, clipped_out): + with open(clipped_out, "w") as out: + clip_ct = {"left": 0, "right": 0} + for amp in amp_bins: + left, right = amp.primer_clipping_all() + clip_ct["left"] += left + clip_ct["right"] += right + for read in amp.reads: + out.write(amp.reads[read].header + "\n") + out.write(amp.reads[read].seq + "\n") + print( + f"{clip_ct['left']} bases were clipped from the left/start of reads and " + f"{clip_ct['right']} bases were clipped from the right/end of reads" + ) + + +if __name__ == "__main__": + import sys + + mm2_paf = sys.argv[1] + primer_bed = sys.argv[2] + reads = sys.argv[3] + clipped_out = reads + "_primer_clipped" + + # primer_bed = snakemake.input[0] + # mm2_paf = snakemake.input[1] + # reads = 
snakemake.input[2] + + primers = create_primer_objs(primer_bed) + amps = generate_amps(primers) + mappings = create_read_mappings(mm2_paf) + amps_bin_maps = bin_mappings(amps, mappings) + amps_bin_reads = load_reads(reads, amps_bin_maps) + clip_and_write_out(amps_bin_reads, clipped_out) From 35ca8adfba5463f94108c60898edc4cf6bf03b09 Mon Sep 17 00:00:00 2001 From: simakro Date: Fri, 18 Feb 2022 09:28:15 +0000 Subject: [PATCH 07/53] improved coverage-capping amp_covcap_sampler.py and amplicon primer-trimming map_trim.py scripts, including reverse strand swap for trimming --- workflow/rules/long_read.smk | 54 +++++++++++++------------- workflow/scripts/amp_covcap_sampler.py | 27 +++++++------ workflow/scripts/map_trim.py | 28 ++++++++----- 3 files changed, 61 insertions(+), 48 deletions(-) diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index 1aa07d4fa..701b9ec0b 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -71,7 +71,6 @@ rule cap_cov_amp: reads="results/{date}/filtered/nanofilt/{sample}.fastq", output: "results/{date}/normalize_reads/{sample}_cap.fasta", - "results/{date}/normalize_reads/{sample}_cap.json", script: "../scripts/amp_covcap_sampler.py" @@ -140,24 +139,27 @@ rule map_for_trimming: reads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", reference="resources/genomes/main.fasta", output: - "results/{date}/minimappings/trimming/{sample}_trim.paf", + alignments="results/{date}/minimappings/trimming/{sample}_trim.paf", + dcreads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", conda: "../envs/minimap2.yaml" shell: - "minimap2 -x map-ont {input.reference} {input.reads} -o {output} --secondary=no" + """ + minimap2 -x map-ont {input.reference} {input.reads} -o {output.alignments} --secondary=no && + gzip -d {input.reads} + """ -rule trim_primers: +rule trim_primers_corrected: input: # reads=get_fastqs, 
primer="/home/simon/uncovar/.tests/resources/nCoV-2019.primer.bed", - mappings="results/{date}/minimappings/trimming/{sample}.paf", - reads="results/{date}/filtered/nanofilt/{sample}.fastq", + mappings="results/{date}/minimappings/trimming/{sample}_trim.paf", + reads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", output: - "results/{date}/normalize_reads/{sample}_cap.fasta", - "results/{date}/normalize_reads/{sample}_cap.json", + "results/{date}/corrected/{sample}/{sample}.correctedReads.primerclip.fasta", script: - "../scripts/amp_covcap_sampler.py" + "../scripts/map_trim.py" # rule customize_primer_porechop: @@ -178,23 +180,23 @@ rule trim_primers: # Using a low number of threads (2-4) speed up primer-trimming significantly (>2x), even for large files, # presumably due to the much higher number of target-sequences for trimming as compared # to barcode+adapter-trimming. However, using only one thread is again very slow. -rule porechop_primer_trimming: - input: - # fastq_in="results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta", - fastq_in="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", - repl_flag="results/.indicators/replacement_notice.txt", - output: - "results/{date}/trimmed/porechop/primer_clipped/{sample}.corr.abpclip.fasta", - conda: - "../envs/primechop.yaml" - log: - "logs/{date}/trimmed/porechop/primer_clipped/{sample}.log", - threads: 2 - shell: - """ - (porechop -i {input.fastq_in} -o {output} --no_split --end_size 35 --extra_end_trim 0 -t {threads} -v 1) 2> {log} - rm results/.indicators/replacement_notice.txt - """ +# rule porechop_primer_trimming: +# input: +# # fastq_in="results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta", +# fastq_in="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", +# repl_flag="results/.indicators/replacement_notice.txt", +# output: +# "results/{date}/trimmed/porechop/primer_clipped/{sample}.corr.abpclip.fasta", +# conda: +# 
"../envs/primechop.yaml" +# log: +# "logs/{date}/trimmed/porechop/primer_clipped/{sample}.log", +# threads: 2 +# shell: +# """ +# (porechop -i {input.fastq_in} -o {output} --no_split --end_size 35 --extra_end_trim 0 -t {threads} -v 1) 2> {log} +# rm results/.indicators/replacement_notice.txt +# """ # rule count_fastq_reads: diff --git a/workflow/scripts/amp_covcap_sampler.py b/workflow/scripts/amp_covcap_sampler.py index 91bb986fb..b01f85d81 100644 --- a/workflow/scripts/amp_covcap_sampler.py +++ b/workflow/scripts/amp_covcap_sampler.py @@ -1,3 +1,7 @@ +# Copyright 2022 Simon Magin. +# Licensed under the BSD 2-Clause License (https://opensource.org/licenses/BSD-2-Clause) +# This file may not be copied, modified, or distributed except according to those terms. + import random from collections import Counter, defaultdict @@ -157,7 +161,7 @@ def bin_mappings(amp_bins, mappings): return binned -def write_capped_reads(binned, reads, fa_out, js_out): +def write_capped_reads(binned, reads, fa_out): # print("Writing json") # bins_dct = {amp.name:amp.read_names for amp in binned} # with open(js_out, "w") as js: @@ -178,20 +182,19 @@ def write_capped_reads(binned, reads, fa_out, js_out): if __name__ == "__main__": import sys - mm2_paf = sys.argv[1] - primer_bed = sys.argv[2] - reads = sys.argv[3] - fa_out = reads + "_capped" - js_out = reads + "_capped.json" + # mm2_paf = sys.argv[1] + # primer_bed = sys.argv[2] + # reads = sys.argv[3] + # fa_out = reads + "_capped" + # js_out = reads + "_capped.json" - # primer_bed = snakemake.input[0] - # mm2_paf = snakemake.input[1] - # reads = snakemake.input[2] - # fa_out = snakemake.output[0] - # js_out = snakemake.output[1] + primer_bed = snakemake.input[0] + mm2_paf = snakemake.input[1] + reads = snakemake.input[2] + fa_out = snakemake.output[0] primers = create_primer_objs(primer_bed) amps = generate_amps(primers) mappings = create_read_mappings(mm2_paf) binned = bin_mappings(amps, mappings) - write_capped_reads(binned, reads, 
fa_out, js_out) + write_capped_reads(binned, reads, fa_out) diff --git a/workflow/scripts/map_trim.py b/workflow/scripts/map_trim.py index a739ac2b6..2ca3eb645 100644 --- a/workflow/scripts/map_trim.py +++ b/workflow/scripts/map_trim.py @@ -1,4 +1,7 @@ -# import gzip +# Copyright 2022 Simon Magin. +# Licensed under the BSD 2-Clause License (https://opensource.org/licenses/BSD-2-Clause) +# This file may not be copied, modified, or distributed except according to those terms. + from collections import defaultdict @@ -10,7 +13,8 @@ def __init__(self, header, seq): # self.prim_clipped_seq = "" def clip_primers(self, fwp_boundary, revp_boundary, mapping): - qlen, qstart, qend = mapping.qlen, mapping.qstart, mapping.qend + qlen, samestrand = mapping.qlen, mapping.samestrand + qstart, qend = mapping.qstart, mapping.qend tstart, tend = mapping.tstart, mapping.tend clip_left = 0 @@ -31,8 +35,11 @@ def clip_primers(self, fwp_boundary, revp_boundary, mapping): if qlen - qend >= rdiff: clip_right = qend + rdiff + if not samestrand: + clip_left, clip_right = clip_right, clip_left + self.seq = self.seq[clip_left:clip_right] - print(clip_left, qlen - clip_right) + # print(clip_left, qlen - clip_right) return clip_left, qlen - clip_right @@ -251,14 +258,15 @@ def clip_and_write_out(amp_bins, clipped_out): if __name__ == "__main__": import sys - mm2_paf = sys.argv[1] - primer_bed = sys.argv[2] - reads = sys.argv[3] - clipped_out = reads + "_primer_clipped" + # mm2_paf = sys.argv[1] + # primer_bed = sys.argv[2] + # reads = sys.argv[3] + # clipped_out = reads + "_primer_clipped" - # primer_bed = snakemake.input[0] - # mm2_paf = snakemake.input[1] - # reads = snakemake.input[2] + primer_bed = snakemake.input[0] + mm2_paf = snakemake.input[1] + reads = snakemake.input[2] + clipped_out = snakemake.output[0] primers = create_primer_objs(primer_bed) amps = generate_amps(primers) From 6412c6f0fffc1e159d2000b4b730e440dc259188 Mon Sep 17 00:00:00 2001 From: simakro Date: Fri, 18 Feb 2022 
11:34:57 +0000 Subject: [PATCH 08/53] added improved logic for clipping of '-' strand aligning reads --- workflow/scripts/map_trim.py | 78 +++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 32 deletions(-) diff --git a/workflow/scripts/map_trim.py b/workflow/scripts/map_trim.py index 2ca3eb645..ff2cc7e52 100644 --- a/workflow/scripts/map_trim.py +++ b/workflow/scripts/map_trim.py @@ -17,29 +17,46 @@ def clip_primers(self, fwp_boundary, revp_boundary, mapping): qstart, qend = mapping.qstart, mapping.qend tstart, tend = mapping.tstart, mapping.tend - clip_left = 0 - if tstart <= fwp_boundary: - add_clip = fwp_boundary - tstart - clip_left = qstart + add_clip - else: - ldiff = tstart - fwp_boundary - if qstart >= ldiff: - clip_left = qstart - ldiff - - clip_right = qlen - if tend >= revp_boundary: - sub_clip = tend - revp_boundary - clip_right = qend - sub_clip - else: - rdiff = revp_boundary - tend - if qlen - qend >= rdiff: - clip_right = qend + rdiff + if samestrand: + clip_left = 0 + if tstart <= fwp_boundary: + add_clip = fwp_boundary - tstart + clip_left = qstart + add_clip + else: + ldiff = tstart - fwp_boundary + if qstart >= ldiff: + clip_left = qstart - ldiff + + clip_right = qlen + if tend >= revp_boundary: + sub_clip = tend - revp_boundary + clip_right = qend - sub_clip + else: + rdiff = revp_boundary - tend + if qlen - qend >= rdiff: + clip_right = qend + rdiff - if not samestrand: - clip_left, clip_right = clip_right, clip_left + else: + clip_right = qlen + if tstart <= fwp_boundary: + add_clip = fwp_boundary - tstart + clip_right = qend - add_clip + else: + rdiff = tstart - fwp_boundary + if qlen - qend >= rdiff: + clip_right = qend + rdiff + + clip_left = 0 + if tend >= revp_boundary: + sub_clip = tend - revp_boundary + clip_left = qstart + sub_clip + else: + ldiff = revp_boundary - tend + if qstart >= ldiff: + clip_left = qstart - ldiff self.seq = self.seq[clip_left:clip_right] - # print(clip_left, qlen - clip_right) + return 
clip_left, qlen - clip_right @@ -64,7 +81,7 @@ def __init__( self.qlen = int(qlen) self.qstart = int(qstart) self.qend = int(qend) - self.samestrand = samestrand + self.samestrand = self.eval_strand(samestrand) self.tname = tname self.tlen = int(tlen) self.tstart = int(tstart) @@ -80,6 +97,12 @@ def gen_kw_attr(self): for key in kwattr_dict: self.__dict__[key] = kwattr_dict[key] + def eval_strand(self, strand_info): + if strand_info == "+": + return True + else: + return False + class Primer: def __init__(self, contig, start, end, name, score, strand): @@ -193,7 +216,6 @@ def create_read_mappings(mm2_paf): def bin_mappings(amp_bins, mappings): binned = list() na = list() - # print(amp_bins) while len(amp_bins) > 0: if len(mappings) > 0: if mappings[0].tend <= amp_bins[0].end + 5: @@ -208,11 +230,6 @@ def bin_mappings(amp_bins, mappings): amp_bins.pop(0) else: break - - # for bin in binned: - # print(bin.name, "reads:", len(bin.reads)) - # print("na", len(na)) - return binned @@ -228,13 +245,10 @@ def load_reads(read_fasta, amp_bins): read = Read(header, seq.strip()) reads[read.name] = read - # print(reads) - for amp in amp_bins: print("amp.mappings", len(amp.mappings)) amp.reads = {k: v for k, v in reads.items() if k in amp.mappings} print("amp.reads", len(amp.reads)) - # print(amp.reads) return amp_bins @@ -250,8 +264,8 @@ def clip_and_write_out(amp_bins, clipped_out): out.write(amp.reads[read].header + "\n") out.write(amp.reads[read].seq + "\n") print( - f"{clip_ct['left']} bases were clipped from the left/start of reads and " - f"{clip_ct['right']} bases were clipped from the right/end of reads" + f"{clip_ct['left']} bases were clipped from the fw/left-primer side of reads and " + f"{clip_ct['right']} bases were clipped from the rev/right-primer side of reads" ) From 85f5a58038507300022c9078d717bb3e322494f7 Mon Sep 17 00:00:00 2001 From: simakro Date: Sat, 19 Feb 2022 07:15:54 +0000 Subject: [PATCH 09/53] use only one map-trim to remove all, reinstated porechop; 
multiple adaptations and fixes in assembly.smk, common.smk, long_read.smk, qc.smk, amp_covcap_sampler.py, map_trim.py; repaired workflow and changed naming patterns and formatting for ont; medaka rule -v; get_reads_by_stage, get trimmed reads, species diversity_before_se, seqtk.yaml --- workflow/rules/assembly.smk | 4 +- workflow/rules/common.smk | 10 +- workflow/rules/long_read.smk | 225 +++++++++---------------- workflow/rules/qc.smk | 7 +- workflow/scripts/amp_covcap_sampler.py | 15 +- workflow/scripts/map_trim.py | 17 +- 6 files changed, 96 insertions(+), 182 deletions(-) diff --git a/workflow/rules/assembly.smk b/workflow/rules/assembly.smk index aa10715dc..7151f077a 100644 --- a/workflow/rules/assembly.smk +++ b/workflow/rules/assembly.smk @@ -197,9 +197,7 @@ rule assembly_polishing_ont: "../envs/medaka.yaml" threads: 4 shell: - "(medaka_consensus -v -f -i {input.fasta} -o {params.outdir} -d {input.reference} -t {threads} &&" - " mv {params.outdir}/consensus.fasta {output}) " - " > {log} 2>&1" + "medaka_consensus -f -i {input.fasta} -o {params.outdir} -d {input.reference} -t {threads} > {log} 2>&1" rule aggregate_polished_de_novo_sequences: diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 53fba39df..d4c85756e 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -1314,7 +1314,9 @@ def get_trimmed_reads(wildcards): "results/{{{{date}}}}/trimmed/fastp-pe/{{sample}}.{read}.fastq.gz", read=[1, 2], ), - ont_pattern="results/{{date}}/trimmed/porechop/adapter_barcode_trimming/{sample}.fastq.gz", + # ont_pattern="results/{{date}}/trimmed/porechop/adapter_barcode_trimming/{sample}.fastq.gz", + ont_pattern="results/{{date}}/corrected/trimmed/{sample}.corr.trim.fasta", + # ont_pattern="results/{date}/raw/trimmed/{sample}.raw.trim.fasta" ion_torrent_pattern="results/{{date}}/trimmed/fastp-se/{sample}.fastq.gz", ) @@ -1407,9 +1409,11 @@ def get_reads_by_stage(wildcards): if wildcards.stage == "raw": return get_fastqs(wildcards) 
elif wildcards.stage == "trimmed": - return "results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta" + # return "results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta" + return "results/{date}/corrected/trimmed/{sample}.corr.trim.fasta" elif wildcards.stage == "clipped": - return "results/{date}/trimmed/porechop/primer_clipped/{sample}.fasta" + # return "results/{date}/trimmed/porechop/primer_clipped/{sample}.fasta" + "results/{date}/raw/trimmed/{sample}.raw.clip.fasta" elif wildcards.stage == "filtered": return "results/{date}/trimmed/nanofilt/{sample}.fasta" diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index 701b9ec0b..a8bc84378 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -19,14 +19,24 @@ rule nanoQC: "nanoQC {input} -o {params.outdir} > {log} 2>&1" +rule count_fastq_reads: + input: + get_reads_by_stage, + output: + temp("results/{date}/tables/fastq-read-counts/{stage}~{sample}.txt"), + log: + "logs/{date}/count_reads/{stage}~{sample}.log", + conda: + "../envs/unix.yaml" + shell: + "echo $(( $(cat {input} | wc -l ) / 4)) > {output} 2> {log}" + + rule nanofilt: input: - # get_reads_by_stage, get_fastqs, output: - # "results/{date}/trimmed/porechop/primer_clipped/{sample}.fastq", - # temp("results/{date}/trimmed/nanofilt/{sample}.fastq"), - "results/{date}/filtered/nanofilt/{sample}.fastq", + temp("results/{date}/filtered/nanofilt/{sample}.fastq"), log: "logs/{date}/nanofilt/{sample}.log", params: @@ -40,7 +50,6 @@ rule nanofilt: rule convert2fasta: input: - # "results/{barcode}/trim-filt/{barcode}_tf.fastq" "results/{date}/filtered/nanofilt/{sample}.fastq", output: "results/{date}/filtered/nanofilt/{sample}.fasta", @@ -52,11 +61,10 @@ rule convert2fasta: rule map_for_capping: input: - # reads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", reads="results/{date}/filtered/nanofilt/{sample}.fasta", reference="resources/genomes/main.fasta", output: 
- "results/{date}/minimappings/{sample}.paf", + "results/{date}/minimappings/coverage/{sample}.paf", conda: "../envs/minimap2.yaml" shell: @@ -65,9 +73,8 @@ rule map_for_capping: rule cap_cov_amp: input: - # reads=get_fastqs, primer="/home/simon/uncovar/.tests/resources/nCoV-2019.primer.bed", - mappings="results/{date}/minimappings/{sample}.paf", + mappings="results/{date}/minimappings/coverage/{sample}.paf", reads="results/{date}/filtered/nanofilt/{sample}.fastq", output: "results/{date}/normalize_reads/{sample}_cap.fasta", @@ -75,29 +82,10 @@ rule cap_cov_amp: "../scripts/amp_covcap_sampler.py" -# Intermediate number of threads (4-8) achieve best speedup of a+btrimming. -# For large files 8 threads help accelerate some, small files are processed faster with 4 threads. -rule porechop_adapter_barcode_trimming: - input: - "results/{date}/normalize_reads/{sample}_cap.fasta", - output: - # temp("results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta"), - "results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta", - conda: - "../envs/porechop.yaml" - log: - "logs/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.log", - threads: 8 - shell: - "porechop -i {input} -o {output} -t {threads} -v 1 > {log} 2>&1" - - rule canu_correct: input: - # "results/{date}/normalize_reads/{sample}_cap.fasta", - # "results/{date}/filtered/nanofilt/{sample}.fasta", - # "results/{date}/trimmed/porechop/primer_clipped/{sample}.fasta", - "results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta", + # "results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta", + "results/{date}/normalize_reads/{sample}_cap.fasta", output: "results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", log: @@ -130,159 +118,102 @@ rule canu_correct: oeaConcurrency={params.concurrency} ) 2> {log} - """ + """ # 2>&1 -rule map_for_trimming: +# Intermediate number of threads (4-8) achieve best speedup of a+btrimming. 
+# For large files 8 threads help accelerate some, small files are processed faster with 4 threads. +rule porechop_adapter_barcode_trimming: input: - # reads="results/{date}/filtered/nanofilt/{sample}.fasta", - reads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", - reference="resources/genomes/main.fasta", + # "results/{date}/normalize_reads/{sample}_cap.fasta", + "results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", output: - alignments="results/{date}/minimappings/trimming/{sample}_trim.paf", - dcreads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", + temp("results/{date}/corrected/trimmed/{sample}.corr.trim.fasta"), + # "results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta", conda: - "../envs/minimap2.yaml" + "../envs/porechop.yaml" + log: + "logs/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.log", + threads: 8 shell: - """ - minimap2 -x map-ont {input.reference} {input.reads} -o {output.alignments} --secondary=no && - gzip -d {input.reads} - """ - - -rule trim_primers_corrected: - input: - # reads=get_fastqs, - primer="/home/simon/uncovar/.tests/resources/nCoV-2019.primer.bed", - mappings="results/{date}/minimappings/trimming/{sample}_trim.paf", - reads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", - output: - "results/{date}/corrected/{sample}/{sample}.correctedReads.primerclip.fasta", - script: - "../scripts/map_trim.py" - - -# rule customize_primer_porechop: -# input: -# get_artic_primer, -# output: -# "results/.indicators/replacement_notice.txt", -# conda: -# "../envs/primechop.yaml" -# log: -# "logs/customize_primer_porechop.log", -# shell: -# "(cp {input} $CONDA_PREFIX/lib/python3.6/site-packages/porechop/adapters.py && " -# 'echo "replaced adpaters in adapter.py file in $CONDA_PREFIX/lib/python3.6/site-packages/porechop/adapters.py with ARTICv3 primers" > {output}) ' -# "2> {log}" + "porechop -i {input} -o {output} -t {threads} -v 1 > {log} 2>&1" 
-# Using a low number of threads (2-4) speed up primer-trimming significantly (>2x), even for large files, -# presumably due to the much higher number of target-sequences for trimming as compared -# to barcode+adapter-trimming. However, using only one thread is again very slow. -# rule porechop_primer_trimming: +# rule map_corr: # input: -# # fastq_in="results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta", -# fastq_in="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", -# repl_flag="results/.indicators/replacement_notice.txt", +# reads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", +# reference="resources/genomes/main.fasta", # output: -# "results/{date}/trimmed/porechop/primer_clipped/{sample}.corr.abpclip.fasta", +# alignments="results/{date}/minimappings/corrected/{sample}.paf", +# dcreads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", # conda: -# "../envs/primechop.yaml" -# log: -# "logs/{date}/trimmed/porechop/primer_clipped/{sample}.log", -# threads: 2 +# "../envs/minimap2.yaml" # shell: # """ -# (porechop -i {input.fastq_in} -o {output} --no_split --end_size 35 --extra_end_trim 0 -t {threads} -v 1) 2> {log} -# rm results/.indicators/replacement_notice.txt +# minimap2 -x map-ont {input.reference} {input.reads} -o {output.alignments} --secondary=no && +# gzip -d {input.reads} # """ -# rule count_fastq_reads: +# rule trim_primers_corrected: # input: -# get_reads_by_stage, +# # reads=get_fastqs, +# primer="/home/simon/uncovar/.tests/resources/nCoV-2019.primer.bed", +# mappings="results/{date}/minimappings/corrected/{sample}.paf", +# reads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", # output: -# temp("results/{date}/tables/fastq-read-counts/{stage}~{sample}.txt"), -# log: -# "logs/{date}/count_reads/{stage}~{sample}.log", -# conda: -# "../envs/unix.yaml" -# shell: -# "echo $(( $(cat {input} | wc -l ) / 4)) > {output} 2> {log}" +# 
"results/{date}/corrected/{sample}/{sample}.correctedReads.primerclip.fasta", +# script: +# "../scripts/map_trim.py" -# # Intermediate number of threads (4-8) achieve best speedup of a+btrimming. -# # For large files 8 threads help accelerate some, small files are processed faster with 4 threads. -# rule porechop_adapter_barcode_trimming: -# input: -# "results/{date}/normalize_reads/{sample}_cap.fasta", -# output: -# # temp("results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta"), -# "results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta" -# conda: -# "../envs/porechop.yaml" -# log: -# "logs/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.log", -# threads: 8 -# shell: -# "porechop -i {input} -o {output} -t {threads} -v 1 > {log} 2>&1" - - -# rule customize_primer_porechop: -# input: -# get_artic_primer, -# output: -# "results/.indicators/replacement_notice.txt", -# conda: -# "../envs/primechop.yaml" -# log: -# "logs/customize_primer_porechop.log", -# shell: -# "(cp {input} $CONDA_PREFIX/lib/python3.6/site-packages/porechop/adapters.py && " -# 'echo "replaced adpaters in adapter.py file in $CONDA_PREFIX/lib/python3.6/site-packages/porechop/adapters.py with ARTICv3 primers" > {output}) ' -# "2> {log}" +rule map_raw: + input: + reads="results/{date}/normalize_reads/{sample}_cap.fasta", + reference="resources/genomes/main.fasta", + output: + alignments="results/{date}/minimappings/trim_raw/{sample}.paf", + # dcreads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", + conda: + "../envs/minimap2.yaml" + shell: + """ + minimap2 -x map-ont {input.reference} {input.reads} -o {output.alignments} --secondary=no + """ -# # Using a low number of threads (2-4) speed up primer-trimming significantly (>2x), even for large files, -# # presumably due to the much higher number of target-sequences for trimming as compared -# # to barcode+adapter-trimming. However, using only one thread is again very slow. 
-# rule porechop_primer_trimming: -# input: -# fastq_in=( -# "results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta" -# ), -# repl_flag="results/.indicators/replacement_notice.txt", -# output: -# "results/{date}/trimmed/porechop/primer_clipped/{sample}.fasta", -# conda: -# "../envs/primechop.yaml" -# log: -# "logs/{date}/trimmed/porechop/primer_clipped/{sample}.log", -# threads: 2 -# shell: -# """ -# (porechop -i {input.fastq_in} -o {output} --no_split --end_size 35 --extra_end_trim 0 -t {threads} -v 1) 2> {log} -# rm results/.indicators/replacement_notice.txt -# """ +rule trim_primers_raw: + input: + # reads=get_fastqs, + primer="/home/simon/uncovar/.tests/resources/nCoV-2019.primer.bed", + mappings="results/{date}/minimappings/trim_raw/{sample}.paf", + reads="results/{date}/normalize_reads/{sample}_cap.fasta", + output: + "results/{date}/raw/trimmed/{sample}.raw.clip.fasta", + script: + "../scripts/map_trim.py" # rule medaka_consensus_reference: use rule assembly_polishing_ont as medaka_consensus_reference with: input: - # Don´t ever use corrected reads as input for medaka, it is supposed to polish with rwa-reads! + # Don´t ever use corrected reads as input for medaka, it is supposed to polish with raw-reads! 
# fasta="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", # fasta="results/{date}/filtered/nanofilt/{sample}.fasta", - "results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta", + # fasta="results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta", + fasta="results/{date}/raw/trimmed/{sample}.raw.clip.fasta", reference="resources/genomes/main.fasta", output: - "results/{date}/consensus/medaka/{sample}/{sample}.fasta", + # "results/{date}/consensus/medaka/{sample}/{sample}.fasta", + "results/{date}/consensus/medaka/{sample}/consensus.fasta", + # "results/{date}/polishing/medaka/{sample}/{sample}.consensus.fasta", # polish consensus rule bcftools_consensus_ont: input: - fasta="results/{date}/consensus/medaka/{sample}/{sample}.fasta", + fasta="results/{date}/consensus/medaka/{sample}/consensus.fasta", + # fasta="results/{date}/consensus/medaka/{sample}/{sample}.fasta", bcf="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.bcf", # clonal vs. subclonal? 
bcfidx="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.bcf.csi", output: diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 93c85b0f2..cb3220eca 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -153,9 +153,10 @@ rule species_diversity_before_se: kraken_output=temp( "results/{date}/species-diversity/se/{sample}/{sample}.kraken" ), - report=( - "results/{date}/species-diversity/se/{sample}/{sample}.uncleaned.kreport2" - ), + # report=( + # "results/{date}/species-diversity/se/{sample}/{sample}.uncleaned.kreport2" + # ), + report="results/{date}/species-diversity/se/{sample}/{sample}.uncleaned.kreport2", log: "logs/{date}/kraken/se/{sample}.log", threads: 8 diff --git a/workflow/scripts/amp_covcap_sampler.py b/workflow/scripts/amp_covcap_sampler.py index b01f85d81..e9a8a5027 100644 --- a/workflow/scripts/amp_covcap_sampler.py +++ b/workflow/scripts/amp_covcap_sampler.py @@ -3,7 +3,7 @@ # This file may not be copied, modified, or distributed except according to those terms. 
import random -from collections import Counter, defaultdict +from collections import defaultdict class Mapping: @@ -100,7 +100,7 @@ def generate_amps(primers): amps = list() for num in amp_nums: ao = Amp(num, [primer for primer in primers if primer.amp_no == num]) - print(ao.name, ao.max_len) + # print(ao.name, ao.max_len) amps.append(ao) return sorted(amps, key=lambda x: x.name) @@ -108,7 +108,6 @@ def generate_amps(primers): def filter_read_mappings(mappings): mappings = [m for m in mappings if 300 < m.qlen < 600] mappings = [m for m in mappings if m.qual == 60] - # mappings = [m for m in mappings if m.tp == "P"] return mappings @@ -155,19 +154,13 @@ def bin_mappings(amp_bins, mappings): for bin in binned: bin.random_sample(200) - print(bin.name, len(bin.read_names), "selected:", len(bin.selected)) - print("na", len(na)) + # print(bin.name, len(bin.read_names), "selected:", len(bin.selected)) + # print("na", len(na)) return binned def write_capped_reads(binned, reads, fa_out): - # print("Writing json") - # bins_dct = {amp.name:amp.read_names for amp in binned} - # with open(js_out, "w") as js: - # json.dump(bins_dct, js) - - print("Writing fasta") all_picks = ["@" + name for amp in binned for name in amp.selected] with open(reads, "r") as fq, open(fa_out, "w") as fa: for line in fq: diff --git a/workflow/scripts/map_trim.py b/workflow/scripts/map_trim.py index ff2cc7e52..6b4f177c1 100644 --- a/workflow/scripts/map_trim.py +++ b/workflow/scripts/map_trim.py @@ -10,7 +10,6 @@ def __init__(self, header, seq): self.header = header self.name = self.header.split(">")[1] self.seq = seq - # self.prim_clipped_seq = "" def clip_primers(self, fwp_boundary, revp_boundary, mapping): qlen, samestrand = mapping.qlen, mapping.samestrand @@ -136,7 +135,6 @@ def __init__(self, amp_no, primers): self.revp_boundary = min( [prim for prim in primers if prim.pos == "RIGHT"], key=lambda x: x.start ).start - # self.read_names = list() self.mappings = dict() self.reads = dict() @@ -165,15 
+163,6 @@ def create_primer_objs(primer_bed): return sorted(primers, key=lambda x: x.end) -def create_primer_objs(primer_bed): - with open(primer_bed, "r") as bed: - primers = list() - for line in bed: - prim = Primer(*line.strip().split("\t")) - primers.append(prim) - return sorted(primers, key=lambda x: x.end) - - def generate_amps(primers): amp_nums = set([primer.amp_no for primer in primers]) amps = list() @@ -186,7 +175,6 @@ def generate_amps(primers): def filter_read_mappings(mappings): mappings = [m for m in mappings if 300 < m.qlen < 600] mappings = [m for m in mappings if m.qual == 60] - mappings = [m for m in mappings if m.tp == "P"] return mappings @@ -234,7 +222,6 @@ def bin_mappings(amp_bins, mappings): def load_reads(read_fasta, amp_bins): - reads = dict() with open(read_fasta, "r") as rfa: for line in rfa: @@ -246,9 +233,9 @@ def load_reads(read_fasta, amp_bins): reads[read.name] = read for amp in amp_bins: - print("amp.mappings", len(amp.mappings)) + # print("amp.mappings", len(amp.mappings)) amp.reads = {k: v for k, v in reads.items() if k in amp.mappings} - print("amp.reads", len(amp.reads)) + # print("amp.reads", len(amp.reads)) return amp_bins From 4372ab67ae778967127a748cbca59b3615934ec3 Mon Sep 17 00:00:00 2001 From: simakro Date: Fri, 25 Feb 2022 06:31:07 +0000 Subject: [PATCH 10/53] remove map-trim in order to reorganize --- workflow/rules/read_mapping.smk | 1 + workflow/scripts/map_trim.py | 277 -------------------------------- 2 files changed, 1 insertion(+), 277 deletions(-) delete mode 100644 workflow/scripts/map_trim.py diff --git a/workflow/rules/read_mapping.smk b/workflow/rules/read_mapping.smk index 4c9317923..de49b89d2 100644 --- a/workflow/rules/read_mapping.smk +++ b/workflow/rules/read_mapping.smk @@ -60,6 +60,7 @@ rule map_reads: threads: 8 wrapper: "0.69.0/bio/bwa/mem" + # "v1.1.0/bio/bwa/mem" rule mark_duplicates: diff --git a/workflow/scripts/map_trim.py b/workflow/scripts/map_trim.py deleted file mode 100644 index 
6b4f177c1..000000000 --- a/workflow/scripts/map_trim.py +++ /dev/null @@ -1,277 +0,0 @@ -# Copyright 2022 Simon Magin. -# Licensed under the BSD 2-Clause License (https://opensource.org/licenses/BSD-2-Clause) -# This file may not be copied, modified, or distributed except according to those terms. - -from collections import defaultdict - - -class Read: - def __init__(self, header, seq): - self.header = header - self.name = self.header.split(">")[1] - self.seq = seq - - def clip_primers(self, fwp_boundary, revp_boundary, mapping): - qlen, samestrand = mapping.qlen, mapping.samestrand - qstart, qend = mapping.qstart, mapping.qend - tstart, tend = mapping.tstart, mapping.tend - - if samestrand: - clip_left = 0 - if tstart <= fwp_boundary: - add_clip = fwp_boundary - tstart - clip_left = qstart + add_clip - else: - ldiff = tstart - fwp_boundary - if qstart >= ldiff: - clip_left = qstart - ldiff - - clip_right = qlen - if tend >= revp_boundary: - sub_clip = tend - revp_boundary - clip_right = qend - sub_clip - else: - rdiff = revp_boundary - tend - if qlen - qend >= rdiff: - clip_right = qend + rdiff - - else: - clip_right = qlen - if tstart <= fwp_boundary: - add_clip = fwp_boundary - tstart - clip_right = qend - add_clip - else: - rdiff = tstart - fwp_boundary - if qlen - qend >= rdiff: - clip_right = qend + rdiff - - clip_left = 0 - if tend >= revp_boundary: - sub_clip = tend - revp_boundary - clip_left = qstart + sub_clip - else: - ldiff = revp_boundary - tend - if qstart >= ldiff: - clip_left = qstart - ldiff - - self.seq = self.seq[clip_left:clip_right] - - return clip_left, qlen - clip_right - - -class Mapping: - def __init__( - self, - qname, - qlen, - qstart, - qend, - samestrand, - tname, - tlen, - tstart, - tend, - matches, - total_bp, - qual, - kwattr, - ): - self.qname = qname - self.qlen = int(qlen) - self.qstart = int(qstart) - self.qend = int(qend) - self.samestrand = self.eval_strand(samestrand) - self.tname = tname - self.tlen = int(tlen) - self.tstart 
= int(tstart) - self.tend = int(tend) - self.matches = int(matches) - self.total_bp = int(total_bp) - self.qual = int(qual) - self.kwattr = kwattr - self.gen_kw_attr() - - def gen_kw_attr(self): - kwattr_dict = {kw.split(":")[0]: kw.split(":")[-1] for kw in self.kwattr} - for key in kwattr_dict: - self.__dict__[key] = kwattr_dict[key] - - def eval_strand(self, strand_info): - if strand_info == "+": - return True - else: - return False - - -class Primer: - def __init__(self, contig, start, end, name, score, strand): - self.contig = contig - self.start = int(start) - self.end = int(end) - self.name = name - self.score = int(score) - self.strand = strand - self.type = "primary" - self.amp_no, self.pos = self.get_name_infos() - - def get_name_infos(self): - ls = self.name.split("_") - if len(ls) == 4: - self.type = "alt" - self.alt_name = ls[3] - return ls[1], ls[2] - - -class Amp: - def __init__(self, amp_no, primers): - self.name = int(amp_no) - self.primers = primers - self.start = min([primer.start for primer in primers]) - self.end = max([primer.end for primer in primers]) - self.max_len = self.end - self.start - self.fwp_boundary = max( - [prim for prim in primers if prim.pos == "LEFT"], key=lambda x: x.end - ).end - self.revp_boundary = min( - [prim for prim in primers if prim.pos == "RIGHT"], key=lambda x: x.start - ).start - self.mappings = dict() - self.reads = dict() - - def primer_clipping_all(self): - clip_ct_left = 0 - clip_ct_right = 0 - for read in self.reads: - try: - mapping = self.mappings[read] - left, right = self.reads[read].clip_primers( - self.fwp_boundary, self.revp_boundary, mapping - ) - clip_ct_left += left - clip_ct_right += right - except KeyError as e: - print(f"KeyError in primer_clipping_all: {e}") - return clip_ct_left, clip_ct_right - - -def create_primer_objs(primer_bed): - with open(primer_bed, "r") as bed: - primers = list() - for line in bed: - prim = Primer(*line.strip().split("\t")) - primers.append(prim) - return 
sorted(primers, key=lambda x: x.end) - - -def generate_amps(primers): - amp_nums = set([primer.amp_no for primer in primers]) - amps = list() - for num in amp_nums: - ao = Amp(num, [primer for primer in primers if primer.amp_no == num]) - amps.append(ao) - return sorted(amps, key=lambda x: x.name) - - -def filter_read_mappings(mappings): - mappings = [m for m in mappings if 300 < m.qlen < 600] - mappings = [m for m in mappings if m.qual == 60] - return mappings - - -def create_read_mappings(mm2_paf): - with open(mm2_paf, "r") as paf: - map_dct = defaultdict(list) - for line in paf: - if len(line.strip()) > 0: - ls = line.strip().split("\t") - mapping = Mapping(*ls[:12], ls[12:]) - map_dct[mapping.qname].append(mapping) - mult_maps = {n: ml for n, ml in map_dct.items() if len(ml) > 1} - mappings = [m for k, l in map_dct.items() for m in l] - mappings = filter_read_mappings(mappings) - mono_mappings = [m for m in mappings if m.qname not in mult_maps] - dual_mappings = {k: v for k, v in mult_maps.items() if len(v) == 2} - incl_max = [ - max(dual_mappings[mname], key=lambda x: x.matches) - for mname in dual_mappings - ] - incl_max = filter_read_mappings(incl_max) - mono_mappings.extend(incl_max) - mappings = mono_mappings - return sorted(mappings, key=lambda x: (x.tend, x.tstart)) - - -def bin_mappings(amp_bins, mappings): - binned = list() - na = list() - while len(amp_bins) > 0: - if len(mappings) > 0: - if mappings[0].tend <= amp_bins[0].end + 5: - if mappings[0].tstart >= amp_bins[0].start - 5: - amp_bins[0].mappings[mappings[0].qname] = mappings[0] - mappings.pop(0) - else: - na.append(mappings[0].qname) - mappings.pop(0) - else: - binned.append(amp_bins[0]) - amp_bins.pop(0) - else: - break - return binned - - -def load_reads(read_fasta, amp_bins): - reads = dict() - with open(read_fasta, "r") as rfa: - for line in rfa: - if line.startswith(">"): - # print(line) - header = line.strip().split(" ")[0] - seq = next(rfa) - read = Read(header, seq.strip()) - 
reads[read.name] = read - - for amp in amp_bins: - # print("amp.mappings", len(amp.mappings)) - amp.reads = {k: v for k, v in reads.items() if k in amp.mappings} - # print("amp.reads", len(amp.reads)) - - return amp_bins - - -def clip_and_write_out(amp_bins, clipped_out): - with open(clipped_out, "w") as out: - clip_ct = {"left": 0, "right": 0} - for amp in amp_bins: - left, right = amp.primer_clipping_all() - clip_ct["left"] += left - clip_ct["right"] += right - for read in amp.reads: - out.write(amp.reads[read].header + "\n") - out.write(amp.reads[read].seq + "\n") - print( - f"{clip_ct['left']} bases were clipped from the fw/left-primer side of reads and " - f"{clip_ct['right']} bases were clipped from the rev/right-primer side of reads" - ) - - -if __name__ == "__main__": - import sys - - # mm2_paf = sys.argv[1] - # primer_bed = sys.argv[2] - # reads = sys.argv[3] - # clipped_out = reads + "_primer_clipped" - - primer_bed = snakemake.input[0] - mm2_paf = snakemake.input[1] - reads = snakemake.input[2] - clipped_out = snakemake.output[0] - - primers = create_primer_objs(primer_bed) - amps = generate_amps(primers) - mappings = create_read_mappings(mm2_paf) - amps_bin_maps = bin_mappings(amps, mappings) - amps_bin_reads = load_reads(reads, amps_bin_maps) - clip_and_write_out(amps_bin_reads, clipped_out) From b62d62eb0ebdb6ec2d534007181ba8546a61985b Mon Sep 17 00:00:00 2001 From: simakro Date: Sun, 27 Feb 2022 23:41:43 +0000 Subject: [PATCH 11/53] perform all clipping and downsampling with notramp --- workflow/envs/notramp.yaml | 4 + workflow/rules/long_read.smk | 142 ++++++++--------------------------- 2 files changed, 35 insertions(+), 111 deletions(-) create mode 100644 workflow/envs/notramp.yaml diff --git a/workflow/envs/notramp.yaml b/workflow/envs/notramp.yaml new file mode 100644 index 000000000..88212d37e --- /dev/null +++ b/workflow/envs/notramp.yaml @@ -0,0 +1,4 @@ +channels: + - simakro +dependencies: + - notramp diff --git 
a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index a8bc84378..76a728918 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -45,49 +45,42 @@ rule nanofilt: conda: "../envs/nanofilt.yaml" shell: - "NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 600 {input} > {output} 2> {log}" # --maxlength 700 + "NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 700 {input} > {output} 2> {log}" -rule convert2fasta: +rule downsample_and_trim_raw: input: - "results/{date}/filtered/nanofilt/{sample}.fastq", + primer="/home/simon/uncovar/.tests/resources/nCoV-2019.primer.bed", + reads="results/{date}/filtered/nanofilt/{sample}.fastq", + ref_genome="resources/genomes/main.fasta", output: - "results/{date}/filtered/nanofilt/{sample}.fasta", + "results/{date}/norm_trim_raw_reads/{sample}/{sample}.cap.fasta", + "results/{date}/norm_trim_raw_reads/{sample}/{sample}.cap.clip.fasta", + params: + outdir="results/{date}/norm_trim_raw_reads/{sample}", + log: + "results/{date}/norm_trim_raw_reads/{sample}/notramp.log", conda: - "../envs/seqtk.yaml" + "../envs/notramp.yaml" shell: - "seqtk seq -A {input} > {output}" + "notramp -a -r {input.reads} -p {input.primer} -g {input.ref_genome} -o {params.outdir}" -rule map_for_capping: +# rule medaka_consensus_reference: +use rule assembly_polishing_ont as medaka_consensus_reference with: input: - reads="results/{date}/filtered/nanofilt/{sample}.fasta", + # Don´t ever use corrected reads as input for medaka, it is supposed to polish with raw-reads!
+ fasta="results/{date}/norm_trim_raw_reads/{sample}/{sample}.cap.clip.fasta", reference="resources/genomes/main.fasta", output: - "results/{date}/minimappings/coverage/{sample}.paf", - conda: - "../envs/minimap2.yaml" - shell: - "minimap2 -x map-ont {input.reference} {input.reads} -o {output} --secondary=no" - - -rule cap_cov_amp: - input: - primer="/home/simon/uncovar/.tests/resources/nCoV-2019.primer.bed", - mappings="results/{date}/minimappings/coverage/{sample}.paf", - reads="results/{date}/filtered/nanofilt/{sample}.fastq", - output: - "results/{date}/normalize_reads/{sample}_cap.fasta", - script: - "../scripts/amp_covcap_sampler.py" + "results/{date}/consensus/medaka/{sample}/consensus.fasta", rule canu_correct: input: - # "results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta", - "results/{date}/normalize_reads/{sample}_cap.fasta", + "results/{date}/norm_trim_raw_reads/{sample}/{sample}.cap.fasta", output: - "results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", + "results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", log: "logs/{date}/canu/correct/{sample}.log", params: @@ -116,104 +109,31 @@ rule canu_correct: ovbConcurrency={params.concurrency} \ ovsConcurrency={params.concurrency} \ oeaConcurrency={params.concurrency} - ) + && gzip -d {output}.gz) 2> {log} """ # 2>&1 -# Intermediate number of threads (4-8) achieve best speedup of a+btrimming. -# For large files 8 threads help accelerate some, small files are processed faster with 4 threads. 
-rule porechop_adapter_barcode_trimming: +rule clip_adbc_corrected: input: - # "results/{date}/normalize_reads/{sample}_cap.fasta", - "results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", + primer="/home/simon/uncovar/.tests/resources/nCoV-2019.primer.bed", + reads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", + ref_genome="resources/genomes/main.fasta", output: - temp("results/{date}/corrected/trimmed/{sample}.corr.trim.fasta"), - # "results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta", - conda: - "../envs/porechop.yaml" + "results/{date}/norm_trim_corr_reads/{sample}/{sample}.cap.clip.fasta", + params: + outdir="results/{date}/norm_trim_corr_reads/{sample}", log: - "logs/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.log", - threads: 8 - shell: - "porechop -i {input} -o {output} -t {threads} -v 1 > {log} 2>&1" - - -# rule map_corr: -# input: -# reads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", -# reference="resources/genomes/main.fasta", -# output: -# alignments="results/{date}/minimappings/corrected/{sample}.paf", -# dcreads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", -# conda: -# "../envs/minimap2.yaml" -# shell: -# """ -# minimap2 -x map-ont {input.reference} {input.reads} -o {output.alignments} --secondary=no && -# gzip -d {input.reads} -# """ - - -# rule trim_primers_corrected: -# input: -# # reads=get_fastqs, -# primer="/home/simon/uncovar/.tests/resources/nCoV-2019.primer.bed", -# mappings="results/{date}/minimappings/corrected/{sample}.paf", -# reads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", -# output: -# "results/{date}/corrected/{sample}/{sample}.correctedReads.primerclip.fasta", -# script: -# "../scripts/map_trim.py" - - -rule map_raw: - input: - reads="results/{date}/normalize_reads/{sample}_cap.fasta", - reference="resources/genomes/main.fasta", - output: - 
alignments="results/{date}/minimappings/trim_raw/{sample}.paf", - # dcreads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", + "results/{date}/norm_trim_corr_reads/{sample}/notramp.log", conda: - "../envs/minimap2.yaml" + "../envs/notramp.yaml" shell: - """ - minimap2 -x map-ont {input.reference} {input.reads} -o {output.alignments} --secondary=no - """ - - -rule trim_primers_raw: - input: - # reads=get_fastqs, - primer="/home/simon/uncovar/.tests/resources/nCoV-2019.primer.bed", - mappings="results/{date}/minimappings/trim_raw/{sample}.paf", - reads="results/{date}/normalize_reads/{sample}_cap.fasta", - output: - "results/{date}/raw/trimmed/{sample}.raw.clip.fasta", - script: - "../scripts/map_trim.py" - - -# rule medaka_consensus_reference: -use rule assembly_polishing_ont as medaka_consensus_reference with: - input: - # Don´t ever use corrected reads as input for medaka, it is supposed to polish with raw-reads! - # fasta="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", - # fasta="results/{date}/filtered/nanofilt/{sample}.fasta", - # fasta="results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta", - fasta="results/{date}/raw/trimmed/{sample}.raw.clip.fasta", - reference="resources/genomes/main.fasta", - output: - # "results/{date}/consensus/medaka/{sample}/{sample}.fasta", - "results/{date}/consensus/medaka/{sample}/consensus.fasta", - # "results/{date}/polishing/medaka/{sample}/{sample}.consensus.fasta", + "notramp -t --incl_prim -r {input.reads} -p {input.primer} -g {input.ref_genome} -o {params.outdir)" -# polish consensus rule bcftools_consensus_ont: input: fasta="results/{date}/consensus/medaka/{sample}/consensus.fasta", - # fasta="results/{date}/consensus/medaka/{sample}/{sample}.fasta", bcf="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.bcf", # clonal vs. subclonal? 
bcfidx="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.bcf.csi", output: From 931f4d6bb6acbed58b543d2702fd3440c2b1f760 Mon Sep 17 00:00:00 2001 From: simakro Date: Mon, 28 Feb 2022 00:14:54 +0000 Subject: [PATCH 12/53] multiple renamings and adaptions --- workflow/envs/notramp.yaml | 2 ++ workflow/rules/assembly.smk | 2 +- workflow/rules/common.smk | 14 ++++++++------ workflow/rules/long_read.smk | 10 +++++----- 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/workflow/envs/notramp.yaml b/workflow/envs/notramp.yaml index 88212d37e..4068d1ecc 100644 --- a/workflow/envs/notramp.yaml +++ b/workflow/envs/notramp.yaml @@ -1,4 +1,6 @@ channels: - simakro + - bioconda dependencies: - notramp + - minimap2 diff --git a/workflow/rules/assembly.smk b/workflow/rules/assembly.smk index 7151f077a..c13b64bf3 100644 --- a/workflow/rules/assembly.smk +++ b/workflow/rules/assembly.smk @@ -179,7 +179,7 @@ rule assembly_polishing_illumina: # polish ont de novo assembly rule assembly_polishing_ont: input: - fasta="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", + fasta="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", reference="results/{date}/contigs/ordered/{sample}.fasta", output: report( diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index d4c85756e..70b0717fc 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -407,7 +407,7 @@ def get_reads(wildcards): ) ont_pattern = expand( - "results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", + "results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", **wildcards, ) @@ -1315,7 +1315,7 @@ def get_trimmed_reads(wildcards): read=[1, 2], ), # ont_pattern="results/{{date}}/trimmed/porechop/adapter_barcode_trimming/{sample}.fastq.gz", - ont_pattern="results/{{date}}/corrected/trimmed/{sample}.corr.trim.fasta", + ont_pattern="results/{{date}}/corrected/{sample}/{sample}.correctedReads.fasta", # 
ont_pattern="results/{date}/raw/trimmed/{sample}.raw.trim.fasta" ion_torrent_pattern="results/{{date}}/trimmed/fastp-se/{sample}.fastq.gz", ) @@ -1409,11 +1409,13 @@ def get_reads_by_stage(wildcards): if wildcards.stage == "raw": return get_fastqs(wildcards) elif wildcards.stage == "trimmed": - # return "results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta" - return "results/{date}/corrected/trimmed/{sample}.corr.trim.fasta" + return "results/{date}/norm_trim_corr_reads/{sample}/{sample}.correctedReads.clip.fasta" + # return "results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta" + # return "results/{date}/corrected/trimmed/{sample}.corr.trim.fasta" elif wildcards.stage == "clipped": - # return "results/{date}/trimmed/porechop/primer_clipped/{sample}.fasta" - "results/{date}/raw/trimmed/{sample}.raw.clip.fasta" + return "results/{date}/norm_trim_raw_reads/{sample}/{sample}.cap.clip.fasta" + # return "results/{date}/trimmed/porechop/primer_clipped/{sample}.fasta" + # "results/{date}/raw/trimmed/{sample}.raw.clip.fasta" elif wildcards.stage == "filtered": return "results/{date}/trimmed/nanofilt/{sample}.fasta" diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index 76a728918..156cd4229 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -63,7 +63,7 @@ rule downsample_and_trim_raw: conda: "../envs/notramp.yaml" shell: - "notramp -a -r {input.reads} -p {input.primer} -g {input.ref_genome} -o {params.outdir)" + "notramp -a -r {input.reads} -p {input.primer} -g {input.ref_genome} -o {params.outdir}" # rule medaka_consensus_reference: @@ -108,8 +108,8 @@ rule canu_correct: redConcurrency={params.concurrency} redThreads={params.concurrency} \ ovbConcurrency={params.concurrency} \ ovsConcurrency={params.concurrency} \ - oeaConcurrency={params.concurrency} - && gzip -d {output}.gz) + oeaConcurrency={params.concurrency}) + gzip -d {output}.gz 2> {log} """ # 2>&1 @@ -120,7 +120,7 @@ rule 
clip_adbc_corrected: reads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", ref_genome="resources/genomes/main.fasta", output: - "results/{date}/norm_trim_corr_reads/{sample}/{sample}.cap.clip.fasta", + "results/{date}/norm_trim_corr_reads/{sample}/{sample}.clip.fasta", params: outdir="results/{date}/norm_trim_corr_reads/{sample}", log: @@ -128,7 +128,7 @@ rule clip_adbc_corrected: conda: "../envs/notramp.yaml" shell: - "notramp -t --incl_prim -r {input.reads} -p {input.primer} -g {input.ref_genome} -o {params.outdir)" + "notramp -t --incl_prim -r {input.reads} -p {input.primer} -g {input.ref_genome} -o {params.outdir}" rule bcftools_consensus_ont: From e17b32f09791b9df4b328b3e30faf48ed78f70ad Mon Sep 17 00:00:00 2001 From: simakro Date: Mon, 28 Feb 2022 00:19:23 +0000 Subject: [PATCH 13/53] cleanup --- workflow/rules/common.smk | 6 ------ workflow/rules/long_read.smk | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 70b0717fc..42310c406 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -1314,9 +1314,7 @@ def get_trimmed_reads(wildcards): "results/{{{{date}}}}/trimmed/fastp-pe/{{sample}}.{read}.fastq.gz", read=[1, 2], ), - # ont_pattern="results/{{date}}/trimmed/porechop/adapter_barcode_trimming/{sample}.fastq.gz", ont_pattern="results/{{date}}/corrected/{sample}/{sample}.correctedReads.fasta", - # ont_pattern="results/{date}/raw/trimmed/{sample}.raw.trim.fasta" ion_torrent_pattern="results/{{date}}/trimmed/fastp-se/{sample}.fastq.gz", ) @@ -1410,12 +1408,8 @@ def get_reads_by_stage(wildcards): return get_fastqs(wildcards) elif wildcards.stage == "trimmed": return "results/{date}/norm_trim_corr_reads/{sample}/{sample}.correctedReads.clip.fasta" - # return "results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fasta" - # return "results/{date}/corrected/trimmed/{sample}.corr.trim.fasta" elif wildcards.stage == "clipped": return 
"results/{date}/norm_trim_raw_reads/{sample}/{sample}.cap.clip.fasta" - # return "results/{date}/trimmed/porechop/primer_clipped/{sample}.fasta" - # "results/{date}/raw/trimmed/{sample}.raw.clip.fasta" elif wildcards.stage == "filtered": return "results/{date}/trimmed/nanofilt/{sample}.fasta" diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index 156cd4229..6c9f43dfb 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -111,7 +111,7 @@ rule canu_correct: oeaConcurrency={params.concurrency}) gzip -d {output}.gz 2> {log} - """ # 2>&1 + """ rule clip_adbc_corrected: From 1b2940bf7a6d6efbe7f59ef3275e307a49e50b1b Mon Sep 17 00:00:00 2001 From: simakro Date: Mon, 28 Feb 2022 10:44:24 +0000 Subject: [PATCH 14/53] adjusted output canu_correct --- workflow/rules/long_read.smk | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index 6c9f43dfb..1b1d0abec 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -80,7 +80,10 @@ rule canu_correct: input: "results/{date}/norm_trim_raw_reads/{sample}/{sample}.cap.fasta", output: - "results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", + temp(directory("results/{date}/corrected/{sample}/correction")), + temp(directory("results/{date}/corrected/{sample}/{sample}.seqStore")), + corr_gz="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", + corr_fa="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", log: "logs/{date}/canu/correct/{sample}.log", params: @@ -109,7 +112,7 @@ rule canu_correct: ovbConcurrency={params.concurrency} \ ovsConcurrency={params.concurrency} \ oeaConcurrency={params.concurrency}) - gzip -d {output}.gz + gzip -d {output.corr_gz} 2> {log} """ From 75bf4fb68afd68b6c05fd7e375f83dc45ee4678d Mon Sep 17 00:00:00 2001 From: simakro Date: Mon, 28 Feb 2022 10:58:06 +0000 Subject: [PATCH 15/53] linting --- 
workflow/rules/long_read.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index 1b1d0abec..803a4904a 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -50,7 +50,7 @@ rule nanofilt: rule downsample_and_trim_raw: input: - primer="/home/simon/uncovar/.tests/resources/nCoV-2019.primer.bed", + primer=".tests/resources/nCoV-2019.primer.bed", reads="results/{date}/filtered/nanofilt/{sample}.fastq", ref_genome="resources/genomes/main.fasta", output: @@ -119,7 +119,7 @@ rule canu_correct: rule clip_adbc_corrected: input: - primer="/home/simon/uncovar/.tests/resources/nCoV-2019.primer.bed", + primer=".tests/resources/nCoV-2019.primer.bed", reads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", ref_genome="resources/genomes/main.fasta", output: From b29b3b335fcf1bdefa25d5f610b1dda58f4b353a Mon Sep 17 00:00:00 2001 From: simakro Date: Mon, 28 Feb 2022 11:15:09 +0000 Subject: [PATCH 16/53] get_artic_primer --- workflow/rules/long_read.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index 803a4904a..20999e965 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -50,7 +50,7 @@ rule nanofilt: rule downsample_and_trim_raw: input: - primer=".tests/resources/nCoV-2019.primer.bed", + primer=get_artic_primer, reads="results/{date}/filtered/nanofilt/{sample}.fastq", ref_genome="resources/genomes/main.fasta", output: @@ -119,7 +119,7 @@ rule canu_correct: rule clip_adbc_corrected: input: - primer=".tests/resources/nCoV-2019.primer.bed", + primer=get_artic_primer, reads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", ref_genome="resources/genomes/main.fasta", output: From c8d18c1629b8441186a36653a1255d551992b05f Mon Sep 17 00:00:00 2001 From: simakro Date: Wed, 2 Mar 2022 05:39:35 +0000 Subject: [PATCH 17/53] increased notramp 
version; reroute stdout+stderr to log in canu_correct --- resources/ARTIC_v3_adapters.py | 1238 -------------------------------- workflow/envs/notramp.yaml | 2 +- workflow/rules/long_read.smk | 4 +- 3 files changed, 3 insertions(+), 1241 deletions(-) delete mode 100644 resources/ARTIC_v3_adapters.py diff --git a/resources/ARTIC_v3_adapters.py b/resources/ARTIC_v3_adapters.py deleted file mode 100644 index 749ea38df..000000000 --- a/resources/ARTIC_v3_adapters.py +++ /dev/null @@ -1,1238 +0,0 @@ -""" -Copyright 2017 Ryan Wick (rrwick@gmail.com) -https://github.com/rrwick/Porechop - -This module contains the class and sequences for known adapters used in Oxford Nanopore library -preparation kits. - -This file is part of Porechop. Porechop is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by the Free Software Foundation, -either version 3 of the License, or (at your option) any later version. Porechop is distributed in -the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -details. You should have received a copy of the GNU General Public License along with Porechop. If -not, see . 
-""" - - -class Adapter(object): - def __init__( - self, name, start_sequence=None, end_sequence=None, both_ends_sequence=None - ): - self.name = name - self.start_sequence = start_sequence if start_sequence else [] - self.end_sequence = end_sequence if end_sequence else [] - if both_ends_sequence: - self.start_sequence = both_ends_sequence - self.end_sequence = both_ends_sequence - self.best_start_score, self.best_end_score = 0.0, 0.0 - - def best_start_or_end_score(self): - return max(self.best_start_score, self.best_end_score) - - def is_barcode(self): - return self.name.startswith("Barcode ") - - def barcode_direction(self): - if "_rev" in self.start_sequence[0]: - return "reverse" - else: - return "forward" - - def get_barcode_name(self): - """ - Gets the barcode name for the output files. We want a concise name, so it looks at all - options and chooses the shortest. - """ - possible_names = [self.name] - if self.start_sequence: - possible_names.append(self.start_sequence[0]) - if self.end_sequence: - possible_names.append(self.end_sequence[0]) - barcode_name = sorted(possible_names, key=lambda x: len(x))[0] - return barcode_name.replace(" ", "_") - - -# INSTRUCTIONS FOR ADDING CUSTOM ADAPTERS -# --------------------------------------- -# If you need Porechop to remove adapters that aren't included, you can add your own my modifying -# the ADAPTERS list below. -# -# Here is the format for a normal adapter: -# Adapter('Adapter_set_name', -# start_sequence=('Start_adapter_name', 'AAAACCCCGGGGTTTTAAAACCCCGGGGTTTT'), -# end_sequence=('End_adapter_name', 'AACCGGTTAACCGGTTAACCGGTTAACCGGTT')) -# -# You can exclude start_sequence and end_sequence as appropriate. -# -# If you have custom Barcodes, make sure that the adapter set name starts with 'Barcode '. 
Also, -# remove the existing barcode sequences from this file to avoid conflicts: -# Adapter('Barcode 1', -# start_sequence=('Barcode_1_start', 'AAAAAAAACCCCCCCCGGGGGGGGTTTTTTTT'), -# end_sequence=('Barcode_1_end', 'AAAAAAAACCCCCCCCGGGGGGGGTTTTTTTT')), -# Adapter('Barcode 2', -# start_sequence=('Barcode_2_start', 'TTTTTTTTGGGGGGGGCCCCCCCCAAAAAAAA'), -# end_sequence=('Barcode_2_end', 'TTTTTTTTGGGGGGGGCCCCCCCCAAAAAAAA')) - - -ADAPTERS = [ - Adapter( - "nCoV-2019_1_LEFT", - start_sequence=("nCoV-2019_1_LEFT_fw", "ACCAACCAACTTTCGATCTCTTGT"), - end_sequence=("nCoV-2019_1_LEFT_revcomp", "ACAAGAGATCGAAAGTTGGTTGGT"), - ), - Adapter( - "nCoV-2019_1_RIGHT", - start_sequence=("nCoV-2019_1_RIGHT_fw", "CATCTTTAAGATGTTGACGTGCCTC"), - end_sequence=("nCoV-2019_1_RIGHT_revcomp", "GAGGCACGTCAACATCTTAAAGATG"), - ), - Adapter( - "nCoV-2019_2_LEFT", - start_sequence=("nCoV-2019_2_LEFT_fw", "CTGTTTTACAGGTTCGCGACGT"), - end_sequence=("nCoV-2019_2_LEFT_revcomp", "ACGTCGCGAACCTGTAAAACAG"), - ), - Adapter( - "nCoV-2019_2_RIGHT", - start_sequence=("nCoV-2019_2_RIGHT_fw", "TAAGGATCAGTGCCAAGCTCGT"), - end_sequence=("nCoV-2019_2_RIGHT_revcomp", "ACGAGCTTGGCACTGATCCTTA"), - ), - Adapter( - "nCoV-2019_3_LEFT", - start_sequence=("nCoV-2019_3_LEFT_fw", "CGGTAATAAAGGAGCTGGTGGC"), - end_sequence=("nCoV-2019_3_LEFT_revcomp", "GCCACCAGCTCCTTTATTACCG"), - ), - Adapter( - "nCoV-2019_3_RIGHT", - start_sequence=("nCoV-2019_3_RIGHT_fw", "AAGGTGTCTGCAATTCATAGCTCT"), - end_sequence=("nCoV-2019_3_RIGHT_revcomp", "AGAGCTATGAATTGCAGACACCTT"), - ), - Adapter( - "nCoV-2019_4_LEFT", - start_sequence=("nCoV-2019_4_LEFT_fw", "GGTGTATACTGCTGCCGTGAAC"), - end_sequence=("nCoV-2019_4_LEFT_revcomp", "GTTCACGGCAGCAGTATACACC"), - ), - Adapter( - "nCoV-2019_4_RIGHT", - start_sequence=("nCoV-2019_4_RIGHT_fw", "CACAAGTAGTGGCACCTTCTTTAGT"), - end_sequence=("nCoV-2019_4_RIGHT_revcomp", "ACTAAAGAAGGTGCCACTACTTGTG"), - ), - Adapter( - "nCoV-2019_5_LEFT", - start_sequence=("nCoV-2019_5_LEFT_fw", "TGGTGAAACTTCATGGCAGACG"), - 
end_sequence=("nCoV-2019_5_LEFT_revcomp", "CGTCTGCCATGAAGTTTCACCA"), - ), - Adapter( - "nCoV-2019_5_RIGHT", - start_sequence=("nCoV-2019_5_RIGHT_fw", "ATTGATGTTGACTTTCTCTTTTTGGAGT"), - end_sequence=("nCoV-2019_5_RIGHT_revcomp", "ACTCCAAAAAGAGAAAGTCAACATCAAT"), - ), - Adapter( - "nCoV-2019_6_LEFT", - start_sequence=("nCoV-2019_6_LEFT_fw", "GGTGTTGTTGGAGAAGGTTCCG"), - end_sequence=("nCoV-2019_6_LEFT_revcomp", "CGGAACCTTCTCCAACAACACC"), - ), - Adapter( - "nCoV-2019_6_RIGHT", - start_sequence=("nCoV-2019_6_RIGHT_fw", "TAGCGGCCTTCTGTAAAACACG"), - end_sequence=("nCoV-2019_6_RIGHT_revcomp", "CGTGTTTTACAGAAGGCCGCTA"), - ), - Adapter( - "nCoV-2019_7_LEFT", - start_sequence=("nCoV-2019_7_LEFT_fw", "ATCAGAGGCTGCTCGTGTTGTA"), - end_sequence=("nCoV-2019_7_LEFT_revcomp", "TACAACACGAGCAGCCTCTGAT"), - ), - Adapter( - "nCoV-2019_7_LEFT_alt0", - start_sequence=("nCoV-2019_7_LEFT_alt0_fw", "CATTTGCATCAGAGGCTGCTCG"), - end_sequence=("nCoV-2019_7_LEFT_alt0_revcomp", "CGAGCAGCCTCTGATGCAAATG"), - ), - Adapter( - "nCoV-2019_7_RIGHT", - start_sequence=("nCoV-2019_7_RIGHT_fw", "TGCACAGGTGACAATTTGTCCA"), - end_sequence=("nCoV-2019_7_RIGHT_revcomp", "TGGACAAATTGTCACCTGTGCA"), - ), - Adapter( - "nCoV-2019_7_RIGHT_alt5", - start_sequence=("nCoV-2019_7_RIGHT_alt5_fw", "AGGTGACAATTTGTCCACCGAC"), - end_sequence=("nCoV-2019_7_RIGHT_alt5_revcomp", "GTCGGTGGACAAATTGTCACCT"), - ), - Adapter( - "nCoV-2019_8_LEFT", - start_sequence=("nCoV-2019_8_LEFT_fw", "AGAGTTTCTTAGAGACGGTTGGGA"), - end_sequence=("nCoV-2019_8_LEFT_revcomp", "TCCCAACCGTCTCTAAGAAACTCT"), - ), - Adapter( - "nCoV-2019_8_RIGHT", - start_sequence=("nCoV-2019_8_RIGHT_fw", "GCTTCAACAGCTTCACTAGTAGGT"), - end_sequence=("nCoV-2019_8_RIGHT_revcomp", "ACCTACTAGTGAAGCTGTTGAAGC"), - ), - Adapter( - "nCoV-2019_9_LEFT", - start_sequence=("nCoV-2019_9_LEFT_fw", "TCCCACAGAAGTGTTAACAGAGGA"), - end_sequence=("nCoV-2019_9_LEFT_revcomp", "TCCTCTGTTAACACTTCTGTGGGA"), - ), - Adapter( - "nCoV-2019_9_LEFT_alt4", - start_sequence=("nCoV-2019_9_LEFT_alt4_fw", 
"TTCCCACAGAAGTGTTAACAGAGG"), - end_sequence=("nCoV-2019_9_LEFT_alt4_revcomp", "CCTCTGTTAACACTTCTGTGGGAA"), - ), - Adapter( - "nCoV-2019_9_RIGHT", - start_sequence=("nCoV-2019_9_RIGHT_fw", "ATGACAGCATCTGCCACAACAC"), - end_sequence=("nCoV-2019_9_RIGHT_revcomp", "GTGTTGTGGCAGATGCTGTCAT"), - ), - Adapter( - "nCoV-2019_9_RIGHT_alt2", - start_sequence=("nCoV-2019_9_RIGHT_alt2_fw", "GACAGCATCTGCCACAACACAG"), - end_sequence=("nCoV-2019_9_RIGHT_alt2_revcomp", "CTGTGTTGTGGCAGATGCTGTC"), - ), - Adapter( - "nCoV-2019_10_LEFT", - start_sequence=("nCoV-2019_10_LEFT_fw", "TGAGAAGTGCTCTGCCTATACAGT"), - end_sequence=("nCoV-2019_10_LEFT_revcomp", "ACTGTATAGGCAGAGCACTTCTCA"), - ), - Adapter( - "nCoV-2019_10_RIGHT", - start_sequence=("nCoV-2019_10_RIGHT_fw", "TCATCTAACCAATCTTCTTCTTGCTCT"), - end_sequence=("nCoV-2019_10_RIGHT_revcomp", "AGAGCAAGAAGAAGATTGGTTAGATGA"), - ), - Adapter( - "nCoV-2019_11_LEFT", - start_sequence=("nCoV-2019_11_LEFT_fw", "GGAATTTGGTGCCACTTCTGCT"), - end_sequence=("nCoV-2019_11_LEFT_revcomp", "AGCAGAAGTGGCACCAAATTCC"), - ), - Adapter( - "nCoV-2019_11_RIGHT", - start_sequence=("nCoV-2019_11_RIGHT_fw", "TCATCAGATTCAACTTGCATGGCA"), - end_sequence=("nCoV-2019_11_RIGHT_revcomp", "TGCCATGCAAGTTGAATCTGATGA"), - ), - Adapter( - "nCoV-2019_12_LEFT", - start_sequence=("nCoV-2019_12_LEFT_fw", "AAACATGGAGGAGGTGTTGCAG"), - end_sequence=("nCoV-2019_12_LEFT_revcomp", "CTGCAACACCTCCTCCATGTTT"), - ), - Adapter( - "nCoV-2019_12_RIGHT", - start_sequence=("nCoV-2019_12_RIGHT_fw", "TTCACTCTTCATTTCCAAAAAGCTTGA"), - end_sequence=("nCoV-2019_12_RIGHT_revcomp", "TCAAGCTTTTTGGAAATGAAGAGTGAA"), - ), - Adapter( - "nCoV-2019_13_LEFT", - start_sequence=("nCoV-2019_13_LEFT_fw", "TCGCACAAATGTCTACTTAGCTGT"), - end_sequence=("nCoV-2019_13_LEFT_revcomp", "ACAGCTAAGTAGACATTTGTGCGA"), - ), - Adapter( - "nCoV-2019_13_RIGHT", - start_sequence=("nCoV-2019_13_RIGHT_fw", "ACCACAGCAGTTAAAACACCCT"), - end_sequence=("nCoV-2019_13_RIGHT_revcomp", "AGGGTGTTTTAACTGCTGTGGT"), - ), - Adapter( - 
"nCoV-2019_14_LEFT", - start_sequence=("nCoV-2019_14_LEFT_fw", "CATCCAGATTCTGCCACTCTTGT"), - end_sequence=("nCoV-2019_14_LEFT_revcomp", "ACAAGAGTGGCAGAATCTGGATG"), - ), - Adapter( - "nCoV-2019_14_LEFT_alt4", - start_sequence=("nCoV-2019_14_LEFT_alt4_fw", "TGGCAATCTTCATCCAGATTCTGC"), - end_sequence=("nCoV-2019_14_LEFT_alt4_revcomp", "GCAGAATCTGGATGAAGATTGCCA"), - ), - Adapter( - "nCoV-2019_14_RIGHT", - start_sequence=("nCoV-2019_14_RIGHT_fw", "AGTTTCCACACAGACAGGCATT"), - end_sequence=("nCoV-2019_14_RIGHT_revcomp", "AATGCCTGTCTGTGTGGAAACT"), - ), - Adapter( - "nCoV-2019_14_RIGHT_alt2", - start_sequence=("nCoV-2019_14_RIGHT_alt2_fw", "TGCGTGTTTCTTCTGCATGTGC"), - end_sequence=("nCoV-2019_14_RIGHT_alt2_revcomp", "GCACATGCAGAAGAAACACGCA"), - ), - Adapter( - "nCoV-2019_15_LEFT", - start_sequence=("nCoV-2019_15_LEFT_fw", "ACAGTGCTTAAAAAGTGTAAAAGTGCC"), - end_sequence=("nCoV-2019_15_LEFT_revcomp", "GGCACTTTTACACTTTTTAAGCACTGT"), - ), - Adapter( - "nCoV-2019_15_LEFT_alt1", - start_sequence=("nCoV-2019_15_LEFT_alt1_fw", "AGTGCTTAAAAAGTGTAAAAGTGCCT"), - end_sequence=("nCoV-2019_15_LEFT_alt1_revcomp", "AGGCACTTTTACACTTTTTAAGCACT"), - ), - Adapter( - "nCoV-2019_15_RIGHT", - start_sequence=("nCoV-2019_15_RIGHT_fw", "AACAGAAACTGTAGCTGGCACT"), - end_sequence=("nCoV-2019_15_RIGHT_revcomp", "AGTGCCAGCTACAGTTTCTGTT"), - ), - Adapter( - "nCoV-2019_15_RIGHT_alt3", - start_sequence=("nCoV-2019_15_RIGHT_alt3_fw", "ACTGTAGCTGGCACTTTGAGAGA"), - end_sequence=("nCoV-2019_15_RIGHT_alt3_revcomp", "TCTCTCAAAGTGCCAGCTACAGT"), - ), - Adapter( - "nCoV-2019_16_LEFT", - start_sequence=("nCoV-2019_16_LEFT_fw", "AATTTGGAAGAAGCTGCTCGGT"), - end_sequence=("nCoV-2019_16_LEFT_revcomp", "ACCGAGCAGCTTCTTCCAAATT"), - ), - Adapter( - "nCoV-2019_16_RIGHT", - start_sequence=("nCoV-2019_16_RIGHT_fw", "CACAACTTGCGTGTGGAGGTTA"), - end_sequence=("nCoV-2019_16_RIGHT_revcomp", "TAACCTCCACACGCAAGTTGTG"), - ), - Adapter( - "nCoV-2019_17_LEFT", - start_sequence=("nCoV-2019_17_LEFT_fw", "CTTCTTTCTTTGAGAGAAGTGAGGACT"), - 
end_sequence=("nCoV-2019_17_LEFT_revcomp", "AGTCCTCACTTCTCTCAAAGAAAGAAG"), - ), - Adapter( - "nCoV-2019_17_RIGHT", - start_sequence=("nCoV-2019_17_RIGHT_fw", "TTTGTTGGAGTGTTAACAATGCAGT"), - end_sequence=("nCoV-2019_17_RIGHT_revcomp", "ACTGCATTGTTAACACTCCAACAAA"), - ), - Adapter( - "nCoV-2019_18_LEFT", - start_sequence=("nCoV-2019_18_LEFT_fw", "TGGAAATACCCACAAGTTAATGGTTTAAC"), - end_sequence=("nCoV-2019_18_LEFT_revcomp", "GTTAAACCATTAACTTGTGGGTATTTCCA"), - ), - Adapter( - "nCoV-2019_18_LEFT_alt2", - start_sequence=("nCoV-2019_18_LEFT_alt2_fw", "ACTTCTATTAAATGGGCAGATAACAACTGT"), - end_sequence=( - "nCoV-2019_18_LEFT_alt2_revcomp", - "ACAGTTGTTATCTGCCCATTTAATAGAAGT", - ), - ), - Adapter( - "nCoV-2019_18_RIGHT", - start_sequence=("nCoV-2019_18_RIGHT_fw", "AGCTTGTTTACCACACGTACAAGG"), - end_sequence=("nCoV-2019_18_RIGHT_revcomp", "CCTTGTACGTGTGGTAAACAAGCT"), - ), - Adapter( - "nCoV-2019_18_RIGHT_alt1", - start_sequence=("nCoV-2019_18_RIGHT_alt1_fw", "GCTTGTTTACCACACGTACAAGG"), - end_sequence=("nCoV-2019_18_RIGHT_alt1_revcomp", "CCTTGTACGTGTGGTAAACAAGC"), - ), - Adapter( - "nCoV-2019_19_LEFT", - start_sequence=("nCoV-2019_19_LEFT_fw", "GCTGTTATGTACATGGGCACACT"), - end_sequence=("nCoV-2019_19_LEFT_revcomp", "AGTGTGCCCATGTACATAACAGC"), - ), - Adapter( - "nCoV-2019_19_RIGHT", - start_sequence=("nCoV-2019_19_RIGHT_fw", "TGTCCAACTTAGGGTCAATTTCTGT"), - end_sequence=("nCoV-2019_19_RIGHT_revcomp", "ACAGAAATTGACCCTAAGTTGGACA"), - ), - Adapter( - "nCoV-2019_20_LEFT", - start_sequence=("nCoV-2019_20_LEFT_fw", "ACAAAGAAAACAGTTACACAACAACCA"), - end_sequence=("nCoV-2019_20_LEFT_revcomp", "TGGTTGTTGTGTAACTGTTTTCTTTGT"), - ), - Adapter( - "nCoV-2019_20_RIGHT", - start_sequence=("nCoV-2019_20_RIGHT_fw", "ACGTGGCTTTATTAGTTGCATTGTT"), - end_sequence=("nCoV-2019_20_RIGHT_revcomp", "AACAATGCAACTAATAAAGCCACGT"), - ), - Adapter( - "nCoV-2019_21_LEFT", - start_sequence=("nCoV-2019_21_LEFT_fw", "TGGCTATTGATTATAAACACTACACACCC"), - end_sequence=("nCoV-2019_21_LEFT_revcomp", 
"GGGTGTGTAGTGTTTATAATCAATAGCCA"), - ), - Adapter( - "nCoV-2019_21_LEFT_alt2", - start_sequence=("nCoV-2019_21_LEFT_alt2_fw", "GGCTATTGATTATAAACACTACACACCCT"), - end_sequence=( - "nCoV-2019_21_LEFT_alt2_revcomp", - "AGGGTGTGTAGTGTTTATAATCAATAGCC", - ), - ), - Adapter( - "nCoV-2019_21_RIGHT", - start_sequence=("nCoV-2019_21_RIGHT_fw", "TAGATCTGTGTGGCCAACCTCT"), - end_sequence=("nCoV-2019_21_RIGHT_revcomp", "AGAGGTTGGCCACACAGATCTA"), - ), - Adapter( - "nCoV-2019_21_RIGHT_alt0", - start_sequence=("nCoV-2019_21_RIGHT_alt0_fw", "GATCTGTGTGGCCAACCTCTTC"), - end_sequence=("nCoV-2019_21_RIGHT_alt0_revcomp", "GAAGAGGTTGGCCACACAGATC"), - ), - Adapter( - "nCoV-2019_22_LEFT", - start_sequence=("nCoV-2019_22_LEFT_fw", "ACTACCGAAGTTGTAGGAGACATTATACT"), - end_sequence=("nCoV-2019_22_LEFT_revcomp", "AGTATAATGTCTCCTACAACTTCGGTAGT"), - ), - Adapter( - "nCoV-2019_22_RIGHT", - start_sequence=("nCoV-2019_22_RIGHT_fw", "ACAGTATTCTTTGCTATAGTAGTCGGC"), - end_sequence=("nCoV-2019_22_RIGHT_revcomp", "GCCGACTACTATAGCAAAGAATACTGT"), - ), - Adapter( - "nCoV-2019_23_LEFT", - start_sequence=("nCoV-2019_23_LEFT_fw", "ACAACTACTAACATAGTTACACGGTGT"), - end_sequence=("nCoV-2019_23_LEFT_revcomp", "ACACCGTGTAACTATGTTAGTAGTTGT"), - ), - Adapter( - "nCoV-2019_23_RIGHT", - start_sequence=("nCoV-2019_23_RIGHT_fw", "ACCAGTACAGTAGGTTGCAATAGTG"), - end_sequence=("nCoV-2019_23_RIGHT_revcomp", "CACTATTGCAACCTACTGTACTGGT"), - ), - Adapter( - "nCoV-2019_24_LEFT", - start_sequence=("nCoV-2019_24_LEFT_fw", "AGGCATGCCTTCTTACTGTACTG"), - end_sequence=("nCoV-2019_24_LEFT_revcomp", "CAGTACAGTAAGAAGGCATGCCT"), - ), - Adapter( - "nCoV-2019_24_RIGHT", - start_sequence=("nCoV-2019_24_RIGHT_fw", "ACATTCTAACCATAGCTGAAATCGGG"), - end_sequence=("nCoV-2019_24_RIGHT_revcomp", "CCCGATTTCAGCTATGGTTAGAATGT"), - ), - Adapter( - "nCoV-2019_25_LEFT", - start_sequence=("nCoV-2019_25_LEFT_fw", "GCAATTGTTTTTCAGCTATTTTGCAGT"), - end_sequence=("nCoV-2019_25_LEFT_revcomp", "ACTGCAAAATAGCTGAAAAACAATTGC"), - ), - Adapter( - 
"nCoV-2019_25_RIGHT", - start_sequence=("nCoV-2019_25_RIGHT_fw", "ACTGTAGTGACAAGTCTCTCGCA"), - end_sequence=("nCoV-2019_25_RIGHT_revcomp", "TGCGAGAGACTTGTCACTACAGT"), - ), - Adapter( - "nCoV-2019_26_LEFT", - start_sequence=("nCoV-2019_26_LEFT_fw", "TTGTGATACATTCTGTGCTGGTAGT"), - end_sequence=("nCoV-2019_26_LEFT_revcomp", "ACTACCAGCACAGAATGTATCACAA"), - ), - Adapter( - "nCoV-2019_26_RIGHT", - start_sequence=("nCoV-2019_26_RIGHT_fw", "TCCGCACTATCACCAACATCAG"), - end_sequence=("nCoV-2019_26_RIGHT_revcomp", "CTGATGTTGGTGATAGTGCGGA"), - ), - Adapter( - "nCoV-2019_27_LEFT", - start_sequence=("nCoV-2019_27_LEFT_fw", "ACTACAGTCAGCTTATGTGTCAACC"), - end_sequence=("nCoV-2019_27_LEFT_revcomp", "GGTTGACACATAAGCTGACTGTAGT"), - ), - Adapter( - "nCoV-2019_27_RIGHT", - start_sequence=("nCoV-2019_27_RIGHT_fw", "AATACAAGCACCAAGGTCACGG"), - end_sequence=("nCoV-2019_27_RIGHT_revcomp", "CCGTGACCTTGGTGCTTGTATT"), - ), - Adapter( - "nCoV-2019_28_LEFT", - start_sequence=("nCoV-2019_28_LEFT_fw", "ACATAGAAGTTACTGGCGATAGTTGT"), - end_sequence=("nCoV-2019_28_LEFT_revcomp", "ACAACTATCGCCAGTAACTTCTATGT"), - ), - Adapter( - "nCoV-2019_28_RIGHT", - start_sequence=("nCoV-2019_28_RIGHT_fw", "TGTTTAGACATGACATGAACAGGTGT"), - end_sequence=("nCoV-2019_28_RIGHT_revcomp", "ACACCTGTTCATGTCATGTCTAAACA"), - ), - Adapter( - "nCoV-2019_29_LEFT", - start_sequence=("nCoV-2019_29_LEFT_fw", "ACTTGTGTTCCTTTTTGTTGCTGC"), - end_sequence=("nCoV-2019_29_LEFT_revcomp", "GCAGCAACAAAAAGGAACACAAGT"), - ), - Adapter( - "nCoV-2019_29_RIGHT", - start_sequence=("nCoV-2019_29_RIGHT_fw", "AGTGTACTCTATAAGTTTTGATGGTGTGT"), - end_sequence=("nCoV-2019_29_RIGHT_revcomp", "ACACACCATCAAAACTTATAGAGTACACT"), - ), - Adapter( - "nCoV-2019_30_LEFT", - start_sequence=("nCoV-2019_30_LEFT_fw", "GCACAACTAATGGTGACTTTTTGCA"), - end_sequence=("nCoV-2019_30_LEFT_revcomp", "TGCAAAAAGTCACCATTAGTTGTGC"), - ), - Adapter( - "nCoV-2019_30_RIGHT", - start_sequence=("nCoV-2019_30_RIGHT_fw", "ACCACTAGTAGATACACAAACACCAG"), - 
end_sequence=("nCoV-2019_30_RIGHT_revcomp", "CTGGTGTTTGTGTATCTACTAGTGGT"), - ), - Adapter( - "nCoV-2019_31_LEFT", - start_sequence=("nCoV-2019_31_LEFT_fw", "TTCTGAGTACTGTAGGCACGGC"), - end_sequence=("nCoV-2019_31_LEFT_revcomp", "GCCGTGCCTACAGTACTCAGAA"), - ), - Adapter( - "nCoV-2019_31_RIGHT", - start_sequence=("nCoV-2019_31_RIGHT_fw", "ACAGAATAAACACCAGGTAAGAATGAGT"), - end_sequence=("nCoV-2019_31_RIGHT_revcomp", "ACTCATTCTTACCTGGTGTTTATTCTGT"), - ), - Adapter( - "nCoV-2019_32_LEFT", - start_sequence=("nCoV-2019_32_LEFT_fw", "TGGTGAATACAGTCATGTAGTTGCC"), - end_sequence=("nCoV-2019_32_LEFT_revcomp", "GGCAACTACATGACTGTATTCACCA"), - ), - Adapter( - "nCoV-2019_32_RIGHT", - start_sequence=("nCoV-2019_32_RIGHT_fw", "AGCACATCACTACGCAACTTTAGA"), - end_sequence=("nCoV-2019_32_RIGHT_revcomp", "TCTAAAGTTGCGTAGTGATGTGCT"), - ), - Adapter( - "nCoV-2019_33_LEFT", - start_sequence=("nCoV-2019_33_LEFT_fw", "ACTTTTGAAGAAGCTGCGCTGT"), - end_sequence=("nCoV-2019_33_LEFT_revcomp", "ACAGCGCAGCTTCTTCAAAAGT"), - ), - Adapter( - "nCoV-2019_33_RIGHT", - start_sequence=("nCoV-2019_33_RIGHT_fw", "TGGACAGTAAACTACGTCATCAAGC"), - end_sequence=("nCoV-2019_33_RIGHT_revcomp", "GCTTGATGACGTAGTTTACTGTCCA"), - ), - Adapter( - "nCoV-2019_34_LEFT", - start_sequence=("nCoV-2019_34_LEFT_fw", "TCCCATCTGGTAAAGTTGAGGGT"), - end_sequence=("nCoV-2019_34_LEFT_revcomp", "ACCCTCAACTTTACCAGATGGGA"), - ), - Adapter( - "nCoV-2019_34_RIGHT", - start_sequence=("nCoV-2019_34_RIGHT_fw", "AGTGAAATTGGGCCTCATAGCA"), - end_sequence=("nCoV-2019_34_RIGHT_revcomp", "TGCTATGAGGCCCAATTTCACT"), - ), - Adapter( - "nCoV-2019_35_LEFT", - start_sequence=("nCoV-2019_35_LEFT_fw", "TGTTCGCATTCAACCAGGACAG"), - end_sequence=("nCoV-2019_35_LEFT_revcomp", "CTGTCCTGGTTGAATGCGAACA"), - ), - Adapter( - "nCoV-2019_35_RIGHT", - start_sequence=("nCoV-2019_35_RIGHT_fw", "ACTTCATAGCCACAAGGTTAAAGTCA"), - end_sequence=("nCoV-2019_35_RIGHT_revcomp", "TGACTTTAACCTTGTGGCTATGAAGT"), - ), - Adapter( - "nCoV-2019_36_LEFT", - 
start_sequence=("nCoV-2019_36_LEFT_fw", "TTAGCTTGGTTGTACGCTGCTG"), - end_sequence=("nCoV-2019_36_LEFT_revcomp", "CAGCAGCGTACAACCAAGCTAA"), - ), - Adapter( - "nCoV-2019_36_RIGHT", - start_sequence=("nCoV-2019_36_RIGHT_fw", "GAACAAAGACCATTGAGTACTCTGGA"), - end_sequence=("nCoV-2019_36_RIGHT_revcomp", "TCCAGAGTACTCAATGGTCTTTGTTC"), - ), - Adapter( - "nCoV-2019_37_LEFT", - start_sequence=("nCoV-2019_37_LEFT_fw", "ACACACCACTGGTTGTTACTCAC"), - end_sequence=("nCoV-2019_37_LEFT_revcomp", "GTGAGTAACAACCAGTGGTGTGT"), - ), - Adapter( - "nCoV-2019_37_RIGHT", - start_sequence=("nCoV-2019_37_RIGHT_fw", "GTCCACACTCTCCTAGCACCAT"), - end_sequence=("nCoV-2019_37_RIGHT_revcomp", "ATGGTGCTAGGAGAGTGTGGAC"), - ), - Adapter( - "nCoV-2019_38_LEFT", - start_sequence=("nCoV-2019_38_LEFT_fw", "ACTGTGTTATGTATGCATCAGCTGT"), - end_sequence=("nCoV-2019_38_LEFT_revcomp", "ACAGCTGATGCATACATAACACAGT"), - ), - Adapter( - "nCoV-2019_38_RIGHT", - start_sequence=("nCoV-2019_38_RIGHT_fw", "CACCAAGAGTCAGTCTAAAGTAGCG"), - end_sequence=("nCoV-2019_38_RIGHT_revcomp", "CGCTACTTTAGACTGACTCTTGGTG"), - ), - Adapter( - "nCoV-2019_39_LEFT", - start_sequence=("nCoV-2019_39_LEFT_fw", "AGTATTGCCCTATTTTCTTCATAACTGGT"), - end_sequence=("nCoV-2019_39_LEFT_revcomp", "ACCAGTTATGAAGAAAATAGGGCAATACT"), - ), - Adapter( - "nCoV-2019_39_RIGHT", - start_sequence=("nCoV-2019_39_RIGHT_fw", "TGTAACTGGACACATTGAGCCC"), - end_sequence=("nCoV-2019_39_RIGHT_revcomp", "GGGCTCAATGTGTCCAGTTACA"), - ), - Adapter( - "nCoV-2019_40_LEFT", - start_sequence=("nCoV-2019_40_LEFT_fw", "TGCACATCAGTAGTCTTACTCTCAGT"), - end_sequence=("nCoV-2019_40_LEFT_revcomp", "ACTGAGAGTAAGACTACTGATGTGCA"), - ), - Adapter( - "nCoV-2019_40_RIGHT", - start_sequence=("nCoV-2019_40_RIGHT_fw", "CATGGCTGCATCACGGTCAAAT"), - end_sequence=("nCoV-2019_40_RIGHT_revcomp", "ATTTGACCGTGATGCAGCCATG"), - ), - Adapter( - "nCoV-2019_41_LEFT", - start_sequence=("nCoV-2019_41_LEFT_fw", "GTTCCCTTCCATCATATGCAGCT"), - end_sequence=("nCoV-2019_41_LEFT_revcomp", "AGCTGCATATGATGGAAGGGAAC"), 
- ), - Adapter( - "nCoV-2019_41_RIGHT", - start_sequence=("nCoV-2019_41_RIGHT_fw", "TGGTATGACAACCATTAGTTTGGCT"), - end_sequence=("nCoV-2019_41_RIGHT_revcomp", "AGCCAAACTAATGGTTGTCATACCA"), - ), - Adapter( - "nCoV-2019_42_LEFT", - start_sequence=("nCoV-2019_42_LEFT_fw", "TGCAAGAGATGGTTGTGTTCCC"), - end_sequence=("nCoV-2019_42_LEFT_revcomp", "GGGAACACAACCATCTCTTGCA"), - ), - Adapter( - "nCoV-2019_42_RIGHT", - start_sequence=("nCoV-2019_42_RIGHT_fw", "CCTACCTCCCTTTGTTGTGTTGT"), - end_sequence=("nCoV-2019_42_RIGHT_revcomp", "ACAACACAACAAAGGGAGGTAGG"), - ), - Adapter( - "nCoV-2019_43_LEFT", - start_sequence=("nCoV-2019_43_LEFT_fw", "TACGACAGATGTCTTGTGCTGC"), - end_sequence=("nCoV-2019_43_LEFT_revcomp", "GCAGCACAAGACATCTGTCGTA"), - ), - Adapter( - "nCoV-2019_43_RIGHT", - start_sequence=("nCoV-2019_43_RIGHT_fw", "AGCAGCATCTACAGCAAAAGCA"), - end_sequence=("nCoV-2019_43_RIGHT_revcomp", "TGCTTTTGCTGTAGATGCTGCT"), - ), - Adapter( - "nCoV-2019_44_LEFT", - start_sequence=("nCoV-2019_44_LEFT_fw", "TGCCACAGTACGTCTACAAGCT"), - end_sequence=("nCoV-2019_44_LEFT_revcomp", "AGCTTGTAGACGTACTGTGGCA"), - ), - Adapter( - "nCoV-2019_44_LEFT_alt3", - start_sequence=("nCoV-2019_44_LEFT_alt3_fw", "CCACAGTACGTCTACAAGCTGG"), - end_sequence=("nCoV-2019_44_LEFT_alt3_revcomp", "CCAGCTTGTAGACGTACTGTGG"), - ), - Adapter( - "nCoV-2019_44_RIGHT", - start_sequence=("nCoV-2019_44_RIGHT_fw", "AACCTTTCCACATACCGCAGAC"), - end_sequence=("nCoV-2019_44_RIGHT_revcomp", "GTCTGCGGTATGTGGAAAGGTT"), - ), - Adapter( - "nCoV-2019_44_RIGHT_alt0", - start_sequence=("nCoV-2019_44_RIGHT_alt0_fw", "CGCAGACGGTACAGACTGTGTT"), - end_sequence=("nCoV-2019_44_RIGHT_alt0_revcomp", "AACACAGTCTGTACCGTCTGCG"), - ), - Adapter( - "nCoV-2019_45_LEFT", - start_sequence=("nCoV-2019_45_LEFT_fw", "TACCTACAACTTGTGCTAATGACCC"), - end_sequence=("nCoV-2019_45_LEFT_revcomp", "GGGTCATTAGCACAAGTTGTAGGTA"), - ), - Adapter( - "nCoV-2019_45_LEFT_alt2", - start_sequence=("nCoV-2019_45_LEFT_alt2_fw", "AGTATGTACAAATACCTACAACTTGTGCT"), - 
end_sequence=( - "nCoV-2019_45_LEFT_alt2_revcomp", - "AGCACAAGTTGTAGGTATTTGTACATACT", - ), - ), - Adapter( - "nCoV-2019_45_RIGHT", - start_sequence=("nCoV-2019_45_RIGHT_fw", "AAATTGTTTCTTCATGTTGGTAGTTAGAGA"), - end_sequence=("nCoV-2019_45_RIGHT_revcomp", "TCTCTAACTACCAACATGAAGAAACAATTT"), - ), - Adapter( - "nCoV-2019_45_RIGHT_alt7", - start_sequence=("nCoV-2019_45_RIGHT_alt7_fw", "TTCATGTTGGTAGTTAGAGAAAGTGTGTC"), - end_sequence=( - "nCoV-2019_45_RIGHT_alt7_revcomp", - "GACACACTTTCTCTAACTACCAACATGAA", - ), - ), - Adapter( - "nCoV-2019_46_LEFT", - start_sequence=("nCoV-2019_46_LEFT_fw", "TGTCGCTTCCAAGAAAAGGACG"), - end_sequence=("nCoV-2019_46_LEFT_revcomp", "CGTCCTTTTCTTGGAAGCGACA"), - ), - Adapter( - "nCoV-2019_46_LEFT_alt1", - start_sequence=("nCoV-2019_46_LEFT_alt1_fw", "CGCTTCCAAGAAAAGGACGAAGA"), - end_sequence=("nCoV-2019_46_LEFT_alt1_revcomp", "TCTTCGTCCTTTTCTTGGAAGCG"), - ), - Adapter( - "nCoV-2019_46_RIGHT", - start_sequence=("nCoV-2019_46_RIGHT_fw", "CACGTTCACCTAAGTTGGCGTA"), - end_sequence=("nCoV-2019_46_RIGHT_revcomp", "TACGCCAACTTAGGTGAACGTG"), - ), - Adapter( - "nCoV-2019_46_RIGHT_alt2", - start_sequence=("nCoV-2019_46_RIGHT_alt2_fw", "CACGTTCACCTAAGTTGGCGTAT"), - end_sequence=("nCoV-2019_46_RIGHT_alt2_revcomp", "ATACGCCAACTTAGGTGAACGTG"), - ), - Adapter( - "nCoV-2019_47_LEFT", - start_sequence=("nCoV-2019_47_LEFT_fw", "AGGACTGGTATGATTTTGTAGAAAACCC"), - end_sequence=("nCoV-2019_47_LEFT_revcomp", "GGGTTTTCTACAAAATCATACCAGTCCT"), - ), - Adapter( - "nCoV-2019_47_RIGHT", - start_sequence=("nCoV-2019_47_RIGHT_fw", "AATAACGGTCAAAGAGTTTTAACCTCTC"), - end_sequence=("nCoV-2019_47_RIGHT_revcomp", "GAGAGGTTAAAACTCTTTGACCGTTATT"), - ), - Adapter( - "nCoV-2019_48_LEFT", - start_sequence=("nCoV-2019_48_LEFT_fw", "TGTTGACACTGACTTAACAAAGCCT"), - end_sequence=("nCoV-2019_48_LEFT_revcomp", "AGGCTTTGTTAAGTCAGTGTCAACA"), - ), - Adapter( - "nCoV-2019_48_RIGHT", - start_sequence=("nCoV-2019_48_RIGHT_fw", "TAGATTACCAGAAGCAGCGTGC"), - end_sequence=("nCoV-2019_48_RIGHT_revcomp", 
"GCACGCTGCTTCTGGTAATCTA"), - ), - Adapter( - "nCoV-2019_49_LEFT", - start_sequence=("nCoV-2019_49_LEFT_fw", "AGGAATTACTTGTGTATGCTGCTGA"), - end_sequence=("nCoV-2019_49_LEFT_revcomp", "TCAGCAGCATACACAAGTAATTCCT"), - ), - Adapter( - "nCoV-2019_49_RIGHT", - start_sequence=("nCoV-2019_49_RIGHT_fw", "TGACGATGACTTGGTTAGCATTAATACA"), - end_sequence=("nCoV-2019_49_RIGHT_revcomp", "TGTATTAATGCTAACCAAGTCATCGTCA"), - ), - Adapter( - "nCoV-2019_50_LEFT", - start_sequence=("nCoV-2019_50_LEFT_fw", "GTTGATAAGTACTTTGATTGTTACGATGGT"), - end_sequence=("nCoV-2019_50_LEFT_revcomp", "ACCATCGTAACAATCAAAGTACTTATCAAC"), - ), - Adapter( - "nCoV-2019_50_RIGHT", - start_sequence=("nCoV-2019_50_RIGHT_fw", "TAACATGTTGTGCCAACCACCA"), - end_sequence=("nCoV-2019_50_RIGHT_revcomp", "TGGTGGTTGGCACAACATGTTA"), - ), - Adapter( - "nCoV-2019_51_LEFT", - start_sequence=("nCoV-2019_51_LEFT_fw", "TCAATAGCCGCCACTAGAGGAG"), - end_sequence=("nCoV-2019_51_LEFT_revcomp", "CTCCTCTAGTGGCGGCTATTGA"), - ), - Adapter( - "nCoV-2019_51_RIGHT", - start_sequence=("nCoV-2019_51_RIGHT_fw", "AGTGCATTAACATTGGCCGTGA"), - end_sequence=("nCoV-2019_51_RIGHT_revcomp", "TCACGGCCAATGTTAATGCACT"), - ), - Adapter( - "nCoV-2019_52_LEFT", - start_sequence=("nCoV-2019_52_LEFT_fw", "CATCAGGAGATGCCACAACTGC"), - end_sequence=("nCoV-2019_52_LEFT_revcomp", "GCAGTTGTGGCATCTCCTGATG"), - ), - Adapter( - "nCoV-2019_52_RIGHT", - start_sequence=("nCoV-2019_52_RIGHT_fw", "GTTGAGAGCAAAATTCATGAGGTCC"), - end_sequence=("nCoV-2019_52_RIGHT_revcomp", "GGACCTCATGAATTTTGCTCTCAAC"), - ), - Adapter( - "nCoV-2019_53_LEFT", - start_sequence=("nCoV-2019_53_LEFT_fw", "AGCAAAATGTTGGACTGAGACTGA"), - end_sequence=("nCoV-2019_53_LEFT_revcomp", "TCAGTCTCAGTCCAACATTTTGCT"), - ), - Adapter( - "nCoV-2019_53_RIGHT", - start_sequence=("nCoV-2019_53_RIGHT_fw", "AGCCTCATAAAACTCAGGTTCCC"), - end_sequence=("nCoV-2019_53_RIGHT_revcomp", "GGGAACCTGAGTTTTATGAGGCT"), - ), - Adapter( - "nCoV-2019_54_LEFT", - start_sequence=("nCoV-2019_54_LEFT_fw", "TGAGTTAACAGGACACATGTTAGACA"), 
- end_sequence=("nCoV-2019_54_LEFT_revcomp", "TGTCTAACATGTGTCCTGTTAACTCA"), - ), - Adapter( - "nCoV-2019_54_RIGHT", - start_sequence=("nCoV-2019_54_RIGHT_fw", "AACCAAAAACTTGTCCATTAGCACA"), - end_sequence=("nCoV-2019_54_RIGHT_revcomp", "TGTGCTAATGGACAAGTTTTTGGTT"), - ), - Adapter( - "nCoV-2019_55_LEFT", - start_sequence=("nCoV-2019_55_LEFT_fw", "ACTCAACTTTACTTAGGAGGTATGAGCT"), - end_sequence=("nCoV-2019_55_LEFT_revcomp", "AGCTCATACCTCCTAAGTAAAGTTGAGT"), - ), - Adapter( - "nCoV-2019_55_RIGHT", - start_sequence=("nCoV-2019_55_RIGHT_fw", "GGTGTACTCTCCTATTTGTACTTTACTGT"), - end_sequence=("nCoV-2019_55_RIGHT_revcomp", "ACAGTAAAGTACAAATAGGAGAGTACACC"), - ), - Adapter( - "nCoV-2019_56_LEFT", - start_sequence=("nCoV-2019_56_LEFT_fw", "ACCTAGACCACCACTTAACCGA"), - end_sequence=("nCoV-2019_56_LEFT_revcomp", "TCGGTTAAGTGGTGGTCTAGGT"), - ), - Adapter( - "nCoV-2019_56_RIGHT", - start_sequence=("nCoV-2019_56_RIGHT_fw", "ACACTATGCGAGCAGAAGGGTA"), - end_sequence=("nCoV-2019_56_RIGHT_revcomp", "TACCCTTCTGCTCGCATAGTGT"), - ), - Adapter( - "nCoV-2019_57_LEFT", - start_sequence=("nCoV-2019_57_LEFT_fw", "ATTCTACACTCCAGGGACCACC"), - end_sequence=("nCoV-2019_57_LEFT_revcomp", "GGTGGTCCCTGGAGTGTAGAAT"), - ), - Adapter( - "nCoV-2019_57_RIGHT", - start_sequence=("nCoV-2019_57_RIGHT_fw", "GTAATTGAGCAGGGTCGCCAAT"), - end_sequence=("nCoV-2019_57_RIGHT_revcomp", "ATTGGCGACCCTGCTCAATTAC"), - ), - Adapter( - "nCoV-2019_58_LEFT", - start_sequence=("nCoV-2019_58_LEFT_fw", "TGATTTGAGTGTTGTCAATGCCAGA"), - end_sequence=("nCoV-2019_58_LEFT_revcomp", "TCTGGCATTGACAACACTCAAATCA"), - ), - Adapter( - "nCoV-2019_58_RIGHT", - start_sequence=("nCoV-2019_58_RIGHT_fw", "CTTTTCTCCAAGCAGGGTTACGT"), - end_sequence=("nCoV-2019_58_RIGHT_revcomp", "ACGTAACCCTGCTTGGAGAAAAG"), - ), - Adapter( - "nCoV-2019_59_LEFT", - start_sequence=("nCoV-2019_59_LEFT_fw", "TCACGCATGATGTTTCATCTGCA"), - end_sequence=("nCoV-2019_59_LEFT_revcomp", "TGCAGATGAAACATCATGCGTGA"), - ), - Adapter( - "nCoV-2019_59_RIGHT", - 
start_sequence=("nCoV-2019_59_RIGHT_fw", "AAGAGTCCTGTTACATTTTCAGCTTG"), - end_sequence=("nCoV-2019_59_RIGHT_revcomp", "CAAGCTGAAAATGTAACAGGACTCTT"), - ), - Adapter( - "nCoV-2019_60_LEFT", - start_sequence=("nCoV-2019_60_LEFT_fw", "TGATAGAGACCTTTATGACAAGTTGCA"), - end_sequence=("nCoV-2019_60_LEFT_revcomp", "TGCAACTTGTCATAAAGGTCTCTATCA"), - ), - Adapter( - "nCoV-2019_60_RIGHT", - start_sequence=("nCoV-2019_60_RIGHT_fw", "GGTACCAACAGCTTCTCTAGTAGC"), - end_sequence=("nCoV-2019_60_RIGHT_revcomp", "GCTACTAGAGAAGCTGTTGGTACC"), - ), - Adapter( - "nCoV-2019_61_LEFT", - start_sequence=("nCoV-2019_61_LEFT_fw", "TGTTTATCACCCGCGAAGAAGC"), - end_sequence=("nCoV-2019_61_LEFT_revcomp", "GCTTCTTCGCGGGTGATAAACA"), - ), - Adapter( - "nCoV-2019_61_RIGHT", - start_sequence=("nCoV-2019_61_RIGHT_fw", "ATCACATAGACAACAGGTGCGC"), - end_sequence=("nCoV-2019_61_RIGHT_revcomp", "GCGCACCTGTTGTCTATGTGAT"), - ), - Adapter( - "nCoV-2019_62_LEFT", - start_sequence=("nCoV-2019_62_LEFT_fw", "GGCACATGGCTTTGAGTTGACA"), - end_sequence=("nCoV-2019_62_LEFT_revcomp", "TGTCAACTCAAAGCCATGTGCC"), - ), - Adapter( - "nCoV-2019_62_RIGHT", - start_sequence=("nCoV-2019_62_RIGHT_fw", "GTTGAACCTTTCTACAAGCCGC"), - end_sequence=("nCoV-2019_62_RIGHT_revcomp", "GCGGCTTGTAGAAAGGTTCAAC"), - ), - Adapter( - "nCoV-2019_63_LEFT", - start_sequence=("nCoV-2019_63_LEFT_fw", "TGTTAAGCGTGTTGACTGGACT"), - end_sequence=("nCoV-2019_63_LEFT_revcomp", "AGTCCAGTCAACACGCTTAACA"), - ), - Adapter( - "nCoV-2019_63_RIGHT", - start_sequence=("nCoV-2019_63_RIGHT_fw", "ACAAACTGCCACCATCACAACC"), - end_sequence=("nCoV-2019_63_RIGHT_revcomp", "GGTTGTGATGGTGGCAGTTTGT"), - ), - Adapter( - "nCoV-2019_64_LEFT", - start_sequence=("nCoV-2019_64_LEFT_fw", "TCGATAGATATCCTGCTAATTCCATTGT"), - end_sequence=("nCoV-2019_64_LEFT_revcomp", "ACAATGGAATTAGCAGGATATCTATCGA"), - ), - Adapter( - "nCoV-2019_64_RIGHT", - start_sequence=("nCoV-2019_64_RIGHT_fw", "AGTCTTGTAAAAGTGTTCCAGAGGT"), - end_sequence=("nCoV-2019_64_RIGHT_revcomp", "ACCTCTGGAACACTTTTACAAGACT"), - 
), - Adapter( - "nCoV-2019_65_LEFT", - start_sequence=("nCoV-2019_65_LEFT_fw", "GCTGGCTTTAGCTTGTGGGTTT"), - end_sequence=("nCoV-2019_65_LEFT_revcomp", "AAACCCACAAGCTAAAGCCAGC"), - ), - Adapter( - "nCoV-2019_65_RIGHT", - start_sequence=("nCoV-2019_65_RIGHT_fw", "TGTCAGTCATAGAACAAACACCAATAGT"), - end_sequence=("nCoV-2019_65_RIGHT_revcomp", "ACTATTGGTGTTTGTTCTATGACTGACA"), - ), - Adapter( - "nCoV-2019_66_LEFT", - start_sequence=("nCoV-2019_66_LEFT_fw", "GGGTGTGGACATTGCTGCTAAT"), - end_sequence=("nCoV-2019_66_LEFT_revcomp", "ATTAGCAGCAATGTCCACACCC"), - ), - Adapter( - "nCoV-2019_66_RIGHT", - start_sequence=("nCoV-2019_66_RIGHT_fw", "TCAATTTCCATTTGACTCCTGGGT"), - end_sequence=("nCoV-2019_66_RIGHT_revcomp", "ACCCAGGAGTCAAATGGAAATTGA"), - ), - Adapter( - "nCoV-2019_67_LEFT", - start_sequence=("nCoV-2019_67_LEFT_fw", "GTTGTCCAACAATTACCTGAAACTTACT"), - end_sequence=("nCoV-2019_67_LEFT_revcomp", "AGTAAGTTTCAGGTAATTGTTGGACAAC"), - ), - Adapter( - "nCoV-2019_67_RIGHT", - start_sequence=("nCoV-2019_67_RIGHT_fw", "CAACCTTAGAAACTACAGATAAATCTTGGG"), - end_sequence=("nCoV-2019_67_RIGHT_revcomp", "CCCAAGATTTATCTGTAGTTTCTAAGGTTG"), - ), - Adapter( - "nCoV-2019_68_LEFT", - start_sequence=("nCoV-2019_68_LEFT_fw", "ACAGGTTCATCTAAGTGTGTGTGT"), - end_sequence=("nCoV-2019_68_LEFT_revcomp", "ACACACACACTTAGATGAACCTGT"), - ), - Adapter( - "nCoV-2019_68_RIGHT", - start_sequence=("nCoV-2019_68_RIGHT_fw", "CTCCTTTATCAGAACCAGCACCA"), - end_sequence=("nCoV-2019_68_RIGHT_revcomp", "TGGTGCTGGTTCTGATAAAGGAG"), - ), - Adapter( - "nCoV-2019_69_LEFT", - start_sequence=("nCoV-2019_69_LEFT_fw", "TGTCGCAAAATATACTCAACTGTGTCA"), - end_sequence=("nCoV-2019_69_LEFT_revcomp", "TGACACAGTTGAGTATATTTTGCGACA"), - ), - Adapter( - "nCoV-2019_69_RIGHT", - start_sequence=("nCoV-2019_69_RIGHT_fw", "TCTTTATAGCCACGGAACCTCCA"), - end_sequence=("nCoV-2019_69_RIGHT_revcomp", "TGGAGGTTCCGTGGCTATAAAGA"), - ), - Adapter( - "nCoV-2019_70_LEFT", - start_sequence=("nCoV-2019_70_LEFT_fw", "ACAAAAGAAAATGACTCTAAAGAGGGTTT"), - 
end_sequence=("nCoV-2019_70_LEFT_revcomp", "AAACCCTCTTTAGAGTCATTTTCTTTTGT"), - ), - Adapter( - "nCoV-2019_70_RIGHT", - start_sequence=("nCoV-2019_70_RIGHT_fw", "TGACCTTCTTTTAAAGACATAACAGCAG"), - end_sequence=("nCoV-2019_70_RIGHT_revcomp", "CTGCTGTTATGTCTTTAAAAGAAGGTCA"), - ), - Adapter( - "nCoV-2019_71_LEFT", - start_sequence=("nCoV-2019_71_LEFT_fw", "ACAAATCCAATTCAGTTGTCTTCCTATTC"), - end_sequence=("nCoV-2019_71_LEFT_revcomp", "GAATAGGAAGACAACTGAATTGGATTTGT"), - ), - Adapter( - "nCoV-2019_71_RIGHT", - start_sequence=("nCoV-2019_71_RIGHT_fw", "TGGAAAAGAAAGGTAAGAACAAGTCCT"), - end_sequence=("nCoV-2019_71_RIGHT_revcomp", "AGGACTTGTTCTTACCTTTCTTTTCCA"), - ), - Adapter( - "nCoV-2019_72_LEFT", - start_sequence=("nCoV-2019_72_LEFT_fw", "ACACGTGGTGTTTATTACCCTGAC"), - end_sequence=("nCoV-2019_72_LEFT_revcomp", "GTCAGGGTAATAAACACCACGTGT"), - ), - Adapter( - "nCoV-2019_72_RIGHT", - start_sequence=("nCoV-2019_72_RIGHT_fw", "ACTCTGAACTCACTTTCCATCCAAC"), - end_sequence=("nCoV-2019_72_RIGHT_revcomp", "GTTGGATGGAAAGTGAGTTCAGAGT"), - ), - Adapter( - "nCoV-2019_73_LEFT", - start_sequence=("nCoV-2019_73_LEFT_fw", "CAATTTTGTAATGATCCATTTTTGGGTGT"), - end_sequence=("nCoV-2019_73_LEFT_revcomp", "ACACCCAAAAATGGATCATTACAAAATTG"), - ), - Adapter( - "nCoV-2019_73_RIGHT", - start_sequence=("nCoV-2019_73_RIGHT_fw", "CACCAGCTGTCCAACCTGAAGA"), - end_sequence=("nCoV-2019_73_RIGHT_revcomp", "TCTTCAGGTTGGACAGCTGGTG"), - ), - Adapter( - "nCoV-2019_74_LEFT", - start_sequence=("nCoV-2019_74_LEFT_fw", "ACATCACTAGGTTTCAAACTTTACTTGC"), - end_sequence=("nCoV-2019_74_LEFT_revcomp", "GCAAGTAAAGTTTGAAACCTAGTGATGT"), - ), - Adapter( - "nCoV-2019_74_RIGHT", - start_sequence=("nCoV-2019_74_RIGHT_fw", "GCAACACAGTTGCTGATTCTCTTC"), - end_sequence=("nCoV-2019_74_RIGHT_revcomp", "GAAGAGAATCAGCAACTGTGTTGC"), - ), - Adapter( - "nCoV-2019_75_LEFT", - start_sequence=("nCoV-2019_75_LEFT_fw", "AGAGTCCAACCAACAGAATCTATTGT"), - end_sequence=("nCoV-2019_75_LEFT_revcomp", "ACAATAGATTCTGTTGGTTGGACTCT"), - ), - Adapter( - 
"nCoV-2019_75_RIGHT", - start_sequence=("nCoV-2019_75_RIGHT_fw", "ACCACCAACCTTAGAATCAAGATTGT"), - end_sequence=("nCoV-2019_75_RIGHT_revcomp", "ACAATCTTGATTCTAAGGTTGGTGGT"), - ), - Adapter( - "nCoV-2019_76_LEFT", - start_sequence=("nCoV-2019_76_LEFT_fw", "AGGGCAAACTGGAAAGATTGCT"), - end_sequence=("nCoV-2019_76_LEFT_revcomp", "AGCAATCTTTCCAGTTTGCCCT"), - ), - Adapter( - "nCoV-2019_76_LEFT_alt3", - start_sequence=("nCoV-2019_76_LEFT_alt3_fw", "GGGCAAACTGGAAAGATTGCTGA"), - end_sequence=("nCoV-2019_76_LEFT_alt3_revcomp", "TCAGCAATCTTTCCAGTTTGCCC"), - ), - Adapter( - "nCoV-2019_76_RIGHT", - start_sequence=("nCoV-2019_76_RIGHT_fw", "ACACCTGTGCCTGTTAAACCAT"), - end_sequence=("nCoV-2019_76_RIGHT_revcomp", "ATGGTTTAACAGGCACAGGTGT"), - ), - Adapter( - "nCoV-2019_76_RIGHT_alt0", - start_sequence=("nCoV-2019_76_RIGHT_alt0_fw", "ACCTGTGCCTGTTAAACCATTGA"), - end_sequence=("nCoV-2019_76_RIGHT_alt0_revcomp", "TCAATGGTTTAACAGGCACAGGT"), - ), - Adapter( - "nCoV-2019_77_LEFT", - start_sequence=("nCoV-2019_77_LEFT_fw", "CCAGCAACTGTTTGTGGACCTA"), - end_sequence=("nCoV-2019_77_LEFT_revcomp", "TAGGTCCACAAACAGTTGCTGG"), - ), - Adapter( - "nCoV-2019_77_RIGHT", - start_sequence=("nCoV-2019_77_RIGHT_fw", "CAGCCCCTATTAAACAGCCTGC"), - end_sequence=("nCoV-2019_77_RIGHT_revcomp", "GCAGGCTGTTTAATAGGGGCTG"), - ), - Adapter( - "nCoV-2019_78_LEFT", - start_sequence=("nCoV-2019_78_LEFT_fw", "CAACTTACTCCTACTTGGCGTGT"), - end_sequence=("nCoV-2019_78_LEFT_revcomp", "ACACGCCAAGTAGGAGTAAGTTG"), - ), - Adapter( - "nCoV-2019_78_RIGHT", - start_sequence=("nCoV-2019_78_RIGHT_fw", "TGTGTACAAAAACTGCCATATTGCA"), - end_sequence=("nCoV-2019_78_RIGHT_revcomp", "TGCAATATGGCAGTTTTTGTACACA"), - ), - Adapter( - "nCoV-2019_79_LEFT", - start_sequence=("nCoV-2019_79_LEFT_fw", "GTGGTGATTCAACTGAATGCAGC"), - end_sequence=("nCoV-2019_79_LEFT_revcomp", "GCTGCATTCAGTTGAATCACCAC"), - ), - Adapter( - "nCoV-2019_79_RIGHT", - start_sequence=("nCoV-2019_79_RIGHT_fw", "CATTTCATCTGTGAGCAAAGGTGG"), - 
end_sequence=("nCoV-2019_79_RIGHT_revcomp", "CCACCTTTGCTCACAGATGAAATG"), - ), - Adapter( - "nCoV-2019_80_LEFT", - start_sequence=("nCoV-2019_80_LEFT_fw", "TTGCCTTGGTGATATTGCTGCT"), - end_sequence=("nCoV-2019_80_LEFT_revcomp", "AGCAGCAATATCACCAAGGCAA"), - ), - Adapter( - "nCoV-2019_80_RIGHT", - start_sequence=("nCoV-2019_80_RIGHT_fw", "TGGAGCTAAGTTGTTTAACAAGCG"), - end_sequence=("nCoV-2019_80_RIGHT_revcomp", "CGCTTGTTAAACAACTTAGCTCCA"), - ), - Adapter( - "nCoV-2019_81_LEFT", - start_sequence=("nCoV-2019_81_LEFT_fw", "GCACTTGGAAAACTTCAAGATGTGG"), - end_sequence=("nCoV-2019_81_LEFT_revcomp", "CCACATCTTGAAGTTTTCCAAGTGC"), - ), - Adapter( - "nCoV-2019_81_RIGHT", - start_sequence=("nCoV-2019_81_RIGHT_fw", "GTGAAGTTCTTTTCTTGTGCAGGG"), - end_sequence=("nCoV-2019_81_RIGHT_revcomp", "CCCTGCACAAGAAAAGAACTTCAC"), - ), - Adapter( - "nCoV-2019_82_LEFT", - start_sequence=("nCoV-2019_82_LEFT_fw", "GGGCTATCATCTTATGTCCTTCCCT"), - end_sequence=("nCoV-2019_82_LEFT_revcomp", "AGGGAAGGACATAAGATGATAGCCC"), - ), - Adapter( - "nCoV-2019_82_RIGHT", - start_sequence=("nCoV-2019_82_RIGHT_fw", "TGCCAGAGATGTCACCTAAATCAA"), - end_sequence=("nCoV-2019_82_RIGHT_revcomp", "TTGATTTAGGTGACATCTCTGGCA"), - ), - Adapter( - "nCoV-2019_83_LEFT", - start_sequence=("nCoV-2019_83_LEFT_fw", "TCCTTTGCAACCTGAATTAGACTCA"), - end_sequence=("nCoV-2019_83_LEFT_revcomp", "TGAGTCTAATTCAGGTTGCAAAGGA"), - ), - Adapter( - "nCoV-2019_83_RIGHT", - start_sequence=("nCoV-2019_83_RIGHT_fw", "TTTGACTCCTTTGAGCACTGGC"), - end_sequence=("nCoV-2019_83_RIGHT_revcomp", "GCCAGTGCTCAAAGGAGTCAAA"), - ), - Adapter( - "nCoV-2019_84_LEFT", - start_sequence=("nCoV-2019_84_LEFT_fw", "TGCTGTAGTTGTCTCAAGGGCT"), - end_sequence=("nCoV-2019_84_LEFT_revcomp", "AGCCCTTGAGACAACTACAGCA"), - ), - Adapter( - "nCoV-2019_84_RIGHT", - start_sequence=("nCoV-2019_84_RIGHT_fw", "AGGTGTGAGTAAACTGTTACAAACAAC"), - end_sequence=("nCoV-2019_84_RIGHT_revcomp", "GTTGTTTGTAACAGTTTACTCACACCT"), - ), - Adapter( - "nCoV-2019_85_LEFT", - 
start_sequence=("nCoV-2019_85_LEFT_fw", "ACTAGCACTCTCCAAGGGTGTT"), - end_sequence=("nCoV-2019_85_LEFT_revcomp", "AACACCCTTGGAGAGTGCTAGT"), - ), - Adapter( - "nCoV-2019_85_RIGHT", - start_sequence=("nCoV-2019_85_RIGHT_fw", "ACACAGTCTTTTACTCCAGATTCCC"), - end_sequence=("nCoV-2019_85_RIGHT_revcomp", "GGGAATCTGGAGTAAAAGACTGTGT"), - ), - Adapter( - "nCoV-2019_86_LEFT", - start_sequence=("nCoV-2019_86_LEFT_fw", "TCAGGTGATGGCACAACAAGTC"), - end_sequence=("nCoV-2019_86_LEFT_revcomp", "GACTTGTTGTGCCATCACCTGA"), - ), - Adapter( - "nCoV-2019_86_RIGHT", - start_sequence=("nCoV-2019_86_RIGHT_fw", "ACGAAAGCAAGAAAAAGAAGTACGC"), - end_sequence=("nCoV-2019_86_RIGHT_revcomp", "GCGTACTTCTTTTTCTTGCTTTCGT"), - ), - Adapter( - "nCoV-2019_87_LEFT", - start_sequence=("nCoV-2019_87_LEFT_fw", "CGACTACTAGCGTGCCTTTGTA"), - end_sequence=("nCoV-2019_87_LEFT_revcomp", "TACAAAGGCACGCTAGTAGTCG"), - ), - Adapter( - "nCoV-2019_87_RIGHT", - start_sequence=("nCoV-2019_87_RIGHT_fw", "ACTAGGTTCCATTGTTCAAGGAGC"), - end_sequence=("nCoV-2019_87_RIGHT_revcomp", "GCTCCTTGAACAATGGAACCTAGT"), - ), - Adapter( - "nCoV-2019_88_LEFT", - start_sequence=("nCoV-2019_88_LEFT_fw", "CCATGGCAGATTCCAACGGTAC"), - end_sequence=("nCoV-2019_88_LEFT_revcomp", "GTACCGTTGGAATCTGCCATGG"), - ), - Adapter( - "nCoV-2019_88_RIGHT", - start_sequence=("nCoV-2019_88_RIGHT_fw", "TGGTCAGAATAGTGCCATGGAGT"), - end_sequence=("nCoV-2019_88_RIGHT_revcomp", "ACTCCATGGCACTATTCTGACCA"), - ), - Adapter( - "nCoV-2019_89_LEFT", - start_sequence=("nCoV-2019_89_LEFT_fw", "GTACGCGTTCCATGTGGTCATT"), - end_sequence=("nCoV-2019_89_LEFT_revcomp", "AATGACCACATGGAACGCGTAC"), - ), - Adapter( - "nCoV-2019_89_LEFT_alt2", - start_sequence=("nCoV-2019_89_LEFT_alt2_fw", "CGCGTTCCATGTGGTCATTCAA"), - end_sequence=("nCoV-2019_89_LEFT_alt2_revcomp", "TTGAATGACCACATGGAACGCG"), - ), - Adapter( - "nCoV-2019_89_RIGHT", - start_sequence=("nCoV-2019_89_RIGHT_fw", "ACCTGAAAGTCAACGAGATGAAACA"), - end_sequence=("nCoV-2019_89_RIGHT_revcomp", "TGTTTCATCTCGTTGACTTTCAGGT"), - ), - 
Adapter( - "nCoV-2019_89_RIGHT_alt4", - start_sequence=("nCoV-2019_89_RIGHT_alt4_fw", "ACGAGATGAAACATCTGTTGTCACT"), - end_sequence=("nCoV-2019_89_RIGHT_alt4_revcomp", "AGTGACAACAGATGTTTCATCTCGT"), - ), - Adapter( - "nCoV-2019_90_LEFT", - start_sequence=("nCoV-2019_90_LEFT_fw", "ACACAGACCATTCCAGTAGCAGT"), - end_sequence=("nCoV-2019_90_LEFT_revcomp", "ACTGCTACTGGAATGGTCTGTGT"), - ), - Adapter( - "nCoV-2019_90_RIGHT", - start_sequence=("nCoV-2019_90_RIGHT_fw", "TGAAATGGTGAATTGCCCTCGT"), - end_sequence=("nCoV-2019_90_RIGHT_revcomp", "ACGAGGGCAATTCACCATTTCA"), - ), - Adapter( - "nCoV-2019_91_LEFT", - start_sequence=("nCoV-2019_91_LEFT_fw", "TCACTACCAAGAGTGTGTTAGAGGT"), - end_sequence=("nCoV-2019_91_LEFT_revcomp", "ACCTCTAACACACTCTTGGTAGTGA"), - ), - Adapter( - "nCoV-2019_91_RIGHT", - start_sequence=("nCoV-2019_91_RIGHT_fw", "TTCAAGTGAGAACCAAAAGATAATAAGCA"), - end_sequence=("nCoV-2019_91_RIGHT_revcomp", "TGCTTATTATCTTTTGGTTCTCACTTGAA"), - ), - Adapter( - "nCoV-2019_92_LEFT", - start_sequence=("nCoV-2019_92_LEFT_fw", "TTTGTGCTTTTTAGCCTTTCTGCT"), - end_sequence=("nCoV-2019_92_LEFT_revcomp", "AGCAGAAAGGCTAAAAAGCACAAA"), - ), - Adapter( - "nCoV-2019_92_RIGHT", - start_sequence=("nCoV-2019_92_RIGHT_fw", "AGGTTCCTGGCAATTAATTGTAAAAGG"), - end_sequence=("nCoV-2019_92_RIGHT_revcomp", "CCTTTTACAATTAATTGCCAGGAACCT"), - ), - Adapter( - "nCoV-2019_93_LEFT", - start_sequence=("nCoV-2019_93_LEFT_fw", "TGAGGCTGGTTCTAAATCACCCA"), - end_sequence=("nCoV-2019_93_LEFT_revcomp", "TGGGTGATTTAGAACCAGCCTCA"), - ), - Adapter( - "nCoV-2019_93_RIGHT", - start_sequence=("nCoV-2019_93_RIGHT_fw", "AGGTCTTCCTTGCCATGTTGAG"), - end_sequence=("nCoV-2019_93_RIGHT_revcomp", "CTCAACATGGCAAGGAAGACCT"), - ), - Adapter( - "nCoV-2019_94_LEFT", - start_sequence=("nCoV-2019_94_LEFT_fw", "GGCCCCAAGGTTTACCCAATAA"), - end_sequence=("nCoV-2019_94_LEFT_revcomp", "TTATTGGGTAAACCTTGGGGCC"), - ), - Adapter( - "nCoV-2019_94_RIGHT", - start_sequence=("nCoV-2019_94_RIGHT_fw", "TTTGGCAATGTTGTTCCTTGAGG"), - 
end_sequence=("nCoV-2019_94_RIGHT_revcomp", "CCTCAAGGAACAACATTGCCAAA"), - ), - Adapter( - "nCoV-2019_95_LEFT", - start_sequence=("nCoV-2019_95_LEFT_fw", "TGAGGGAGCCTTGAATACACCA"), - end_sequence=("nCoV-2019_95_LEFT_revcomp", "TGGTGTATTCAAGGCTCCCTCA"), - ), - Adapter( - "nCoV-2019_95_RIGHT", - start_sequence=("nCoV-2019_95_RIGHT_fw", "CAGTACGTTTTTGCCGAGGCTT"), - end_sequence=("nCoV-2019_95_RIGHT_revcomp", "AAGCCTCGGCAAAAACGTACTG"), - ), - Adapter( - "nCoV-2019_96_LEFT", - start_sequence=("nCoV-2019_96_LEFT_fw", "GCCAACAACAACAAGGCCAAAC"), - end_sequence=("nCoV-2019_96_LEFT_revcomp", "GTTTGGCCTTGTTGTTGTTGGC"), - ), - Adapter( - "nCoV-2019_96_RIGHT", - start_sequence=("nCoV-2019_96_RIGHT_fw", "TAGGCTCTGTTGGTGGGAATGT"), - end_sequence=("nCoV-2019_96_RIGHT_revcomp", "ACATTCCCACCAACAGAGCCTA"), - ), - Adapter( - "nCoV-2019_97_LEFT", - start_sequence=("nCoV-2019_97_LEFT_fw", "TGGATGACAAAGATCCAAATTTCAAAGA"), - end_sequence=("nCoV-2019_97_LEFT_revcomp", "TCTTTGAAATTTGGATCTTTGTCATCCA"), - ), - Adapter( - "nCoV-2019_97_RIGHT", - start_sequence=("nCoV-2019_97_RIGHT_fw", "ACACACTGATTAAAGATTGCTATGTGAG"), - end_sequence=("nCoV-2019_97_RIGHT_revcomp", "CTCACATAGCAATCTTTAATCAGTGTGT"), - ), - Adapter( - "nCoV-2019_98_LEFT", - start_sequence=("nCoV-2019_98_LEFT_fw", "AACAATTGCAACAATCCATGAGCA"), - end_sequence=("nCoV-2019_98_LEFT_revcomp", "TGCTCATGGATTGTTGCAATTGTT"), - ), - Adapter( - "nCoV-2019_98_RIGHT", - start_sequence=("nCoV-2019_98_RIGHT_fw", "TTCTCCTAAGAAGCTATTAAAATCACATGG"), - end_sequence=("nCoV-2019_98_RIGHT_revcomp", "CCATGTGATTTTAATAGCTTCTTAGGAGAA"), - ), -] - - -def make_full_native_barcode_adapter(barcode_num): - barcode = [ - x for x in ADAPTERS if x.name == "Barcode " + str(barcode_num) + " (reverse)" - ][0] - start_barcode_seq = barcode.start_sequence[1] - end_barcode_seq = barcode.end_sequence[1] - - start_full_seq = ( - "AATGTACTTCGTTCAGTTACGTATTGCTAAGGTTAA" + start_barcode_seq + "CAGCACCT" - ) - end_full_seq = "AGGTGCTG" + end_barcode_seq + 
"TTAACCTTAGCAATACGTAACTGAACGAAGT" - - return Adapter( - "Native barcoding " + str(barcode_num) + " (full sequence)", - start_sequence=("NB" + "%02d" % barcode_num + "_start", start_full_seq), - end_sequence=("NB" + "%02d" % barcode_num + "_end", end_full_seq), - ) - - -def make_old_full_rapid_barcode_adapter(barcode_num): # applies to SQK-RBK001 - barcode = [ - x for x in ADAPTERS if x.name == "Barcode " + str(barcode_num) + " (forward)" - ][0] - start_barcode_seq = barcode.start_sequence[1] - - start_full_seq = ( - "AATGTACTTCGTTCAGTTACG" - + "TATTGCT" - + start_barcode_seq - + "GTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA" - ) - - return Adapter( - "Rapid barcoding " + str(barcode_num) + " (full sequence, old)", - start_sequence=("RB" + "%02d" % barcode_num + "_full", start_full_seq), - ) - - -def make_new_full_rapid_barcode_adapter(barcode_num): # applies to SQK-RBK004 - barcode = [ - x for x in ADAPTERS if x.name == "Barcode " + str(barcode_num) + " (forward)" - ][0] - start_barcode_seq = barcode.start_sequence[1] - - start_full_seq = ( - "AATGTACTTCGTTCAGTTACG" - + "GCTTGGGTGTTTAACC" - + start_barcode_seq - + "GTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA" - ) - - return Adapter( - "Rapid barcoding " + str(barcode_num) + " (full sequence, new)", - start_sequence=("RB" + "%02d" % barcode_num + "_full", start_full_seq), - ) diff --git a/workflow/envs/notramp.yaml b/workflow/envs/notramp.yaml index 4068d1ecc..d6eddc01c 100644 --- a/workflow/envs/notramp.yaml +++ b/workflow/envs/notramp.yaml @@ -2,5 +2,5 @@ channels: - simakro - bioconda dependencies: - - notramp + - notramp = 1.0.3 - minimap2 diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index 20999e965..f3b805ae6 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -113,7 +113,7 @@ rule canu_correct: ovsConcurrency={params.concurrency} \ oeaConcurrency={params.concurrency}) gzip -d {output.corr_gz} - 2> {log} + > {log} 2>&1 """ @@ -123,7 +123,7 @@ 
rule clip_adbc_corrected: reads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", ref_genome="resources/genomes/main.fasta", output: - "results/{date}/norm_trim_corr_reads/{sample}/{sample}.clip.fasta", + "results/{date}/norm_trim_corr_reads/{sample}/{sample}.correctedReads.clip.fasta", params: outdir="results/{date}/norm_trim_corr_reads/{sample}", log: From 8c9165db1802161ad99bf18fb2f21b1ae2938719 Mon Sep 17 00:00:00 2001 From: simakro Date: Wed, 2 Mar 2022 06:18:43 +0000 Subject: [PATCH 18/53] restored get_artic_primers to get the bed file --- workflow/rules/common.smk | 2 +- workflow/rules/long_read.smk | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 42310c406..ff2647ae2 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -1301,7 +1301,7 @@ def get_artic_primer(wildcards): # TODO add more _adapters.py (not preferred) or # add a script to generate them from a link to a bed file. # The bed file can be found in the artic repo. 
Related to #356 - return "resources/ARTIC_v{}_adapters.py".format( + return "resources/ARTIC_v{}_nCoV-2019.primer.bed".format( config["preprocessing"]["artic-primer-version"] ) diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index f3b805ae6..5f71e7b8c 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -57,7 +57,8 @@ rule downsample_and_trim_raw: "results/{date}/norm_trim_raw_reads/{sample}/{sample}.cap.fasta", "results/{date}/norm_trim_raw_reads/{sample}/{sample}.cap.clip.fasta", params: - outdir="results/{date}/norm_trim_raw_reads/{sample}", + # outdir="results/{date}/norm_trim_raw_reads/{sample}", + outdir=lambda w, output: os.path.dirname(output[0]), log: "results/{date}/norm_trim_raw_reads/{sample}/notramp.log", conda: From 4651b2e69cce198611693323c72f2e747582f5d5 Mon Sep 17 00:00:00 2001 From: simakro Date: Wed, 2 Mar 2022 10:13:33 +0000 Subject: [PATCH 19/53] get_output_dir comments --- workflow/rules/long_read.smk | 8 ++++---- workflow/rules/read_mapping.smk | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index 5f71e7b8c..b00228cf0 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -57,8 +57,7 @@ rule downsample_and_trim_raw: "results/{date}/norm_trim_raw_reads/{sample}/{sample}.cap.fasta", "results/{date}/norm_trim_raw_reads/{sample}/{sample}.cap.clip.fasta", params: - # outdir="results/{date}/norm_trim_raw_reads/{sample}", - outdir=lambda w, output: os.path.dirname(output[0]), + outdir=get_output_dir, log: "results/{date}/norm_trim_raw_reads/{sample}/notramp.log", conda: @@ -113,7 +112,7 @@ rule canu_correct: ovbConcurrency={params.concurrency} \ ovsConcurrency={params.concurrency} \ oeaConcurrency={params.concurrency}) - gzip -d {output.corr_gz} + gzip -d {output.corr_gz} --keep > {log} 2>&1 """ @@ -126,7 +125,8 @@ rule clip_adbc_corrected: output: 
"results/{date}/norm_trim_corr_reads/{sample}/{sample}.correctedReads.clip.fasta", params: - outdir="results/{date}/norm_trim_corr_reads/{sample}", + # outdir="results/{date}/norm_trim_corr_reads/{sample}", + outdir=get_output_dir, log: "results/{date}/norm_trim_corr_reads/{sample}/notramp.log", conda: diff --git a/workflow/rules/read_mapping.smk b/workflow/rules/read_mapping.smk index 4f43d4251..d4193706e 100644 --- a/workflow/rules/read_mapping.smk +++ b/workflow/rules/read_mapping.smk @@ -59,8 +59,8 @@ rule map_reads: sort_order="coordinate", threads: 8 wrapper: + # "v1.2.0/bio/bwa-mem2/mem" "0.69.0/bio/bwa/mem" - # "v1.1.0/bio/bwa/mem" rule mark_duplicates: From 82ef67971f0fea0441d29f5669533ac8496d03ab Mon Sep 17 00:00:00 2001 From: simakro Date: Wed, 2 Mar 2022 10:18:32 +0000 Subject: [PATCH 20/53] restore sample-table --- config/pep/samples.csv | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/config/pep/samples.csv b/config/pep/samples.csv index 0416fdea8..4173923ea 100644 --- a/config/pep/samples.csv +++ b/config/pep/samples.csv @@ -1,2 +1,4 @@ sample_name,fq1,fq2,date,is_amplicon_data,technology,include_in_high_genome_summary -UnCoVar_RefDataSet_Batch2_BC03_cat,/home/simon/uncovar/data/newdate/UnCoVar_RefDataSet_Batch2_BC03_cat.fastq,,newdate,1,ont,0 +SAMPLE_NAME_1,PATH/TO/fq1,PATH/TO/fq2,SEQUENCING_DATE,0,illumina,1 # Required information for a sample sequencing on the Illumina platform +SAMPLE_NAME_2,PATH/TO/fq,,SEQUENCING_DATE,1,ont,0 # Required information for a sample sequencing on the Oxford Nanopore platform +SAMPLE_NAME_3,PATH/TO/fq,,SEQUENCING_DATE,1,ion,0 # Required information for a sample sequencing on the Ion Torrent platform From c7dc646ecadcb6f79ebacb8d04abb62726e3b9ee Mon Sep 17 00:00:00 2001 From: simakro Date: Wed, 2 Mar 2022 10:34:08 +0000 Subject: [PATCH 21/53] envs version nums --- workflow/envs/minimap2.yaml | 2 +- workflow/envs/notramp.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git 
a/workflow/envs/minimap2.yaml b/workflow/envs/minimap2.yaml index a44e24867..16f072c0b 100644 --- a/workflow/envs/minimap2.yaml +++ b/workflow/envs/minimap2.yaml @@ -2,4 +2,4 @@ channels: - bioconda - conda-forge dependencies: - - minimap2 =2.22 + - minimap2 = 2.22 diff --git a/workflow/envs/notramp.yaml b/workflow/envs/notramp.yaml index d6eddc01c..5c03528a3 100644 --- a/workflow/envs/notramp.yaml +++ b/workflow/envs/notramp.yaml @@ -2,5 +2,5 @@ channels: - simakro - bioconda dependencies: - - notramp = 1.0.3 - - minimap2 + - notramp =1.0.3 + - minimap2 =2.22 From 3b1f37217bb95df62f6483de01933954f57e6193 Mon Sep 17 00:00:00 2001 From: simakro Date: Wed, 2 Mar 2022 10:36:20 +0000 Subject: [PATCH 22/53] envs conda-forge --- workflow/envs/notramp.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/workflow/envs/notramp.yaml b/workflow/envs/notramp.yaml index 5c03528a3..66321b450 100644 --- a/workflow/envs/notramp.yaml +++ b/workflow/envs/notramp.yaml @@ -1,6 +1,7 @@ channels: - simakro - bioconda + - conda-forge dependencies: - notramp =1.0.3 - minimap2 =2.22 From 447a396da9c4df02a6ebd2b1f1ba15e4ec5beb2e Mon Sep 17 00:00:00 2001 From: simakro Date: Wed, 2 Mar 2022 10:52:14 +0000 Subject: [PATCH 23/53] resolving comments --- workflow/envs/porechop.yaml | 5 ----- workflow/envs/primechop.yaml | 6 ------ workflow/rules/assembly.smk | 2 +- workflow/rules/long_read.smk | 2 -- 4 files changed, 1 insertion(+), 14 deletions(-) delete mode 100644 workflow/envs/porechop.yaml delete mode 100644 workflow/envs/primechop.yaml diff --git a/workflow/envs/porechop.yaml b/workflow/envs/porechop.yaml deleted file mode 100644 index 4ef889415..000000000 --- a/workflow/envs/porechop.yaml +++ /dev/null @@ -1,5 +0,0 @@ -channels: - - bioconda - - conda-forge -dependencies: - - porechop =0.2 diff --git a/workflow/envs/primechop.yaml b/workflow/envs/primechop.yaml deleted file mode 100644 index 850a55b4e..000000000 --- a/workflow/envs/primechop.yaml +++ /dev/null @@ -1,6 +0,0 @@ -channels: 
- - bioconda - - conda-forge -dependencies: - - porechop =0.2 - - python =3.9 diff --git a/workflow/rules/assembly.smk b/workflow/rules/assembly.smk index c13b64bf3..c5040a490 100644 --- a/workflow/rules/assembly.smk +++ b/workflow/rules/assembly.smk @@ -179,7 +179,7 @@ rule assembly_polishing_illumina: # polish ont de novo assembly rule assembly_polishing_ont: input: - fasta="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", + fasta="results/{date}/norm_trim_raw_reads/{sample}/{sample}.cap.clip.fasta", reference="results/{date}/contigs/ordered/{sample}.fasta", output: report( diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index b00228cf0..2458eb515 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -66,7 +66,6 @@ rule downsample_and_trim_raw: "notramp -a -r {input.reads} -p {input.primer} -g {input.ref_genome} -o {params.outdir}" -# rule medaka_consensus_reference: use rule assembly_polishing_ont as medaka_consensus_reference with: input: # Don´t ever use corrected reads as input for medaka, it is supposed to polish with raw-reads! 
@@ -125,7 +124,6 @@ rule clip_adbc_corrected: output: "results/{date}/norm_trim_corr_reads/{sample}/{sample}.correctedReads.clip.fasta", params: - # outdir="results/{date}/norm_trim_corr_reads/{sample}", outdir=get_output_dir, log: "results/{date}/norm_trim_corr_reads/{sample}/notramp.log", From c23633650f7eb182fa3af9753d02ee2da71fa52c Mon Sep 17 00:00:00 2001 From: Thomas Battenfeld Date: Wed, 2 Mar 2022 16:53:30 +0100 Subject: [PATCH 24/53] updated primer paths --- workflow/rules/common.smk | 9 --------- workflow/rules/long_read.smk | 4 ++-- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index ff2647ae2..b73ba1132 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -1297,15 +1297,6 @@ def get_include_flag_for_date(wildcards): return [get_include_flag(sample) for sample in allsamplelist] -def get_artic_primer(wildcards): - # TODO add more _adapters.py (not preferred) or - # add a script to generate them from a link to a bed file. - # The bed file can be found in the artic repo. 
Related to #356 - return "resources/ARTIC_v{}_nCoV-2019.primer.bed".format( - config["preprocessing"]["artic-primer-version"] - ) - - def get_trimmed_reads(wildcards): """Returns paths of files of the trimmed reads for parsing by kraken.""" return get_list_of_expanded_patters_by_technology( diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index 2458eb515..5ceabce12 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -50,7 +50,7 @@ rule nanofilt: rule downsample_and_trim_raw: input: - primer=get_artic_primer, + primer=config["preprocessing"]["amplicon-primers"], reads="results/{date}/filtered/nanofilt/{sample}.fastq", ref_genome="resources/genomes/main.fasta", output: @@ -118,7 +118,7 @@ rule canu_correct: rule clip_adbc_corrected: input: - primer=get_artic_primer, + primer=config["preprocessing"]["amplicon-primers"], reads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", ref_genome="resources/genomes/main.fasta", output: From d91ac22585ea4220084fdbe52190affeaa404f05 Mon Sep 17 00:00:00 2001 From: Thomas Battenfeld Date: Wed, 2 Mar 2022 16:54:23 +0100 Subject: [PATCH 25/53] rmv amp_covcap_sampler.py --- workflow/scripts/amp_covcap_sampler.py | 193 ------------------------- 1 file changed, 193 deletions(-) delete mode 100644 workflow/scripts/amp_covcap_sampler.py diff --git a/workflow/scripts/amp_covcap_sampler.py b/workflow/scripts/amp_covcap_sampler.py deleted file mode 100644 index e9a8a5027..000000000 --- a/workflow/scripts/amp_covcap_sampler.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright 2022 Simon Magin. -# Licensed under the BSD 2-Clause License (https://opensource.org/licenses/BSD-2-Clause) -# This file may not be copied, modified, or distributed except according to those terms. 
- -import random -from collections import defaultdict - - -class Mapping: - def __init__( - self, - qname, - qlen, - qstart, - qend, - samestrand, - tname, - tlen, - tstart, - tend, - matches, - total_bp, - qual, - kwattr, - ): - self.qname = qname - self.qlen = int(qlen) - self.qstart = int(qstart) - self.qend = int(qend) - self.samestrand = samestrand - self.tname = tname - self.tlen = int(tlen) - self.tstart = int(tstart) - self.tend = int(tend) - self.matches = int(matches) - self.total_bp = int(total_bp) - self.qual = int(qual) - self.kwattr = kwattr - self.gen_kw_attr() - - def gen_kw_attr(self): - kwattr_dict = {kw.split(":")[0]: kw.split(":")[-1] for kw in self.kwattr} - for key in kwattr_dict: - self.__dict__[key] = kwattr_dict[key] - - -class Primer: - def __init__(self, contig, start, end, name, score, strand): - self.contig = contig - self.start = int(start) - self.end = int(end) - self.name = name - self.score = int(score) - self.strand = strand - self.type = "primary" - self.amp_no, self.pos = self.get_name_infos() - - def get_name_infos(self): - ls = self.name.split("_") - if len(ls) == 4: - self.type = "alt" - self.alt_name = ls[3] - return ls[1], ls[2] - - -class Amp: - def __init__(self, amp_no, primers): - self.name = int(amp_no) - self.primers = primers - self.start = min([primer.start for primer in primers]) - self.end = max([primer.end for primer in primers]) - self.max_len = self.end - self.start - self.fwp_boundary = max( - [prim for prim in primers if prim.pos == "LEFT"], key=lambda x: x.end - ).end - self.revp_boundary = min( - [prim for prim in primers if prim.pos == "RIGHT"], key=lambda x: x.start - ).start - self.read_names = list() - self.reads = list() - - def random_sample(self, cutoff): - if len(self.read_names) > cutoff: - self.selected = random.choices(self.read_names, k=cutoff) - else: - self.selected = self.read_names - - -def create_primer_objs(primer_bed): - with open(primer_bed, "r") as bed: - primers = list() - for line in 
bed: - prim = Primer(*line.strip().split("\t")) - primers.append(prim) - return sorted(primers, key=lambda x: x.end) - - -def generate_amps(primers): - amp_nums = set([primer.amp_no for primer in primers]) - amps = list() - for num in amp_nums: - ao = Amp(num, [primer for primer in primers if primer.amp_no == num]) - # print(ao.name, ao.max_len) - amps.append(ao) - return sorted(amps, key=lambda x: x.name) - - -def filter_read_mappings(mappings): - mappings = [m for m in mappings if 300 < m.qlen < 600] - mappings = [m for m in mappings if m.qual == 60] - return mappings - - -def create_read_mappings(mm2_paf): - with open(mm2_paf, "r") as paf: - map_dct = defaultdict(list) - for line in paf: - if len(line.strip()) > 0: - ls = line.strip().split("\t") - mapping = Mapping(*ls[:12], ls[12:]) - map_dct[mapping.qname].append(mapping) - mult_maps = {n: ml for n, ml in map_dct.items() if len(ml) > 1} - mappings = [m for k, l in map_dct.items() for m in l] - mappings = filter_read_mappings(mappings) - mono_mappings = [m for m in mappings if m.qname not in mult_maps] - dual_mappings = {k: v for k, v in mult_maps.items() if len(v) == 2} - incl_max = [ - max(dual_mappings[mname], key=lambda x: x.matches) - for mname in dual_mappings - ] - incl_max = filter_read_mappings(incl_max) - mono_mappings.extend(incl_max) - mappings = mono_mappings - return sorted(mappings, key=lambda x: x.tend) - - -def bin_mappings(amp_bins, mappings): - binned = list() - na = list() - while len(amp_bins) > 0: - if len(mappings) > 0: - if mappings[0].tend <= amp_bins[0].end + 5: - if mappings[0].tstart >= amp_bins[0].start - 5: - amp_bins[0].read_names.append(mappings[0].qname) - mappings.pop(0) - else: - na.append(mappings[0].qname) - mappings.pop(0) - else: - binned.append(amp_bins[0]) - amp_bins.pop(0) - else: - break - - for bin in binned: - bin.random_sample(200) - # print(bin.name, len(bin.read_names), "selected:", len(bin.selected)) - # print("na", len(na)) - - return binned - - -def 
write_capped_reads(binned, reads, fa_out): - all_picks = ["@" + name for amp in binned for name in amp.selected] - with open(reads, "r") as fq, open(fa_out, "w") as fa: - for line in fq: - if line.startswith("@"): - readname = line.split(" ")[0] - if readname in all_picks: - readname = readname.replace("@", ">") - fa.write(readname + "\n") - fa.write(next(fq)) - - -if __name__ == "__main__": - import sys - - # mm2_paf = sys.argv[1] - # primer_bed = sys.argv[2] - # reads = sys.argv[3] - # fa_out = reads + "_capped" - # js_out = reads + "_capped.json" - - primer_bed = snakemake.input[0] - mm2_paf = snakemake.input[1] - reads = snakemake.input[2] - fa_out = snakemake.output[0] - - primers = create_primer_objs(primer_bed) - amps = generate_amps(primers) - mappings = create_read_mappings(mm2_paf) - binned = bin_mappings(amps, mappings) - write_capped_reads(binned, reads, fa_out) From 1bd3e1dafb11e050fde2811c486e617607361195 Mon Sep 17 00:00:00 2001 From: simakro Date: Mon, 7 Mar 2022 06:41:44 +0000 Subject: [PATCH 26/53] canu_correct: moved corr_gz from output to parameters --- workflow/rules/long_read.smk | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index 2458eb515..7fc52f549 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -81,12 +81,13 @@ rule canu_correct: output: temp(directory("results/{date}/corrected/{sample}/correction")), temp(directory("results/{date}/corrected/{sample}/{sample}.seqStore")), - corr_gz="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", + # corr_gz="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", corr_fa="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", log: "logs/{date}/canu/correct/{sample}.log", params: outdir=get_output_dir, + corr_gz="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", concurrency=lambda w, threads: get_canu_concurrency(threads), 
min_length=config["quality-criteria"]["ont"]["min-length-reads"], for_testing=lambda w, threads: get_if_testing( @@ -111,7 +112,7 @@ rule canu_correct: ovbConcurrency={params.concurrency} \ ovsConcurrency={params.concurrency} \ oeaConcurrency={params.concurrency}) - gzip -d {output.corr_gz} --keep + gzip -d {params.corr_gz} --keep > {log} 2>&1 """ From 2ea26d7a0b644d91fbf0a63b110fc4246f236bb9 Mon Sep 17 00:00:00 2001 From: simakro Date: Mon, 7 Mar 2022 08:12:27 +0000 Subject: [PATCH 27/53] nanofilt accept de/compressed --- workflow/rules/long_read.smk | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index 2f838bbc0..8379fda43 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -45,7 +45,13 @@ rule nanofilt: conda: "../envs/nanofilt.yaml" shell: - "NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 700 {input} > {output} 2> {log}" + """ + if (file {input} | grep -q gzip) ; then + gunzip -c {input} | NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 700 > {output} 2> {log} + else + NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 700 {input} > {output} 2> {log} + fi + """ rule downsample_and_trim_raw: From b0ae175d4867b43f4b62964b9c9cbe47aab861c9 Mon Sep 17 00:00:00 2001 From: simakro Date: Mon, 7 Mar 2022 08:51:34 +0000 Subject: [PATCH 28/53] change gzip to compressed --- workflow/rules/long_read.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index 8379fda43..bd705d42d 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -46,7 +46,7 @@ rule nanofilt: "../envs/nanofilt.yaml" shell: """ - if (file {input} | grep -q gzip) ; then + if (file {input} | grep -q compressed) ; then gunzip -c {input} | NanoFilt --length {params.min_length} --quality {params.min_PHRED} 
--maxlength 700 > {output} 2> {log} else NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 700 {input} > {output} 2> {log} From 45a13ded5f2fda5be26247450dcc1293817fdae5 Mon Sep 17 00:00:00 2001 From: simakro Date: Mon, 7 Mar 2022 09:15:08 +0000 Subject: [PATCH 29/53] with corr_gz in output --- workflow/rules/long_read.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index bd705d42d..73e118d3d 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -87,7 +87,7 @@ rule canu_correct: output: temp(directory("results/{date}/corrected/{sample}/correction")), temp(directory("results/{date}/corrected/{sample}/{sample}.seqStore")), - # corr_gz="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", + corr_gz="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz", corr_fa="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", log: "logs/{date}/canu/correct/{sample}.log", From 78a5c4fe522e0694cb1a2dbc6d608f2df4bf0fb3 Mon Sep 17 00:00:00 2001 From: simakro Date: Mon, 7 Mar 2022 09:50:24 +0000 Subject: [PATCH 30/53] take out bcf rules --- workflow/rules/long_read.smk | 64 +++++++++++++++++------------------- 1 file changed, 31 insertions(+), 33 deletions(-) diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index 73e118d3d..c0f657a1a 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -140,36 +140,34 @@ rule clip_adbc_corrected: "notramp -t --incl_prim -r {input.reads} -p {input.primer} -g {input.ref_genome} -o {params.outdir}" -rule bcftools_consensus_ont: - input: - fasta="results/{date}/consensus/medaka/{sample}/consensus.fasta", - bcf="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.bcf", # clonal vs. subclonal? 
- bcfidx="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.bcf.csi", - output: - "results/{date}/consensus/bcftools/{sample}.fasta", - log: - "logs/{date}/bcftools-consensus-ont/{sample}.log", - conda: - "../envs/bcftools.yaml" - shell: - "bcftools consensus -f {input.fasta} {input.bcf} > {output} 2> {log}" - - -rule rename_consensus: - input: - "results/{date}/consensus/bcftools/{sample}.fasta", - output: - report( - "results/{date}/contigs/consensus/{sample}.fasta", - category="4. Sequences", - subcategory="3. Consensus Sequences", - caption="../report/assembly_consensus.rst", - ), - log: - "logs/{date}/rename-consensus-fasta/{sample}.log", - conda: - "../envs/unix.yaml" - shell: - "(cp {input} {output} && " - ' sed -i "1s/.*/>{wildcards.sample}/" {output})' - " 2> {log}" +# rule bcftools_consensus_ont: +# input: +# fasta="results/{date}/consensus/medaka/{sample}/consensus.fasta", +# bcf="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.bcf", # clonal vs. subclonal? +# bcfidx="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.bcf.csi", +# output: +# "results/{date}/consensus/bcftools/{sample}.fasta", +# log: +# "logs/{date}/bcftools-consensus-ont/{sample}.log", +# conda: +# "../envs/bcftools.yaml" +# shell: +# "bcftools consensus -f {input.fasta} {input.bcf} > {output} 2> {log}" +# rule rename_consensus: +# input: +# "results/{date}/consensus/bcftools/{sample}.fasta", +# output: +# report( +# "results/{date}/contigs/consensus/{sample}.fasta", +# category="4. Sequences", +# subcategory="3. 
Consensus Sequences", +# caption="../report/assembly_consensus.rst", +# ), +# log: +# "logs/{date}/rename-consensus-fasta/{sample}.log", +# conda: +# "../envs/unix.yaml" +# shell: +# "(cp {input} {output} && " +# ' sed -i "1s/.*/>{wildcards.sample}/" {output})' +# " 2> {log}" From 5bae64fb1c9fc925bf849479a4b531927998dcf2 Mon Sep 17 00:00:00 2001 From: simakro Date: Mon, 7 Mar 2022 11:02:46 +0000 Subject: [PATCH 31/53] orf --- workflow/rules/long_read.smk | 64 +++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index c0f657a1a..5f6753ffe 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -140,34 +140,36 @@ rule clip_adbc_corrected: "notramp -t --incl_prim -r {input.reads} -p {input.primer} -g {input.ref_genome} -o {params.outdir}" -# rule bcftools_consensus_ont: -# input: -# fasta="results/{date}/consensus/medaka/{sample}/consensus.fasta", -# bcf="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.bcf", # clonal vs. subclonal? -# bcfidx="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.bcf.csi", -# output: -# "results/{date}/consensus/bcftools/{sample}.fasta", -# log: -# "logs/{date}/bcftools-consensus-ont/{sample}.log", -# conda: -# "../envs/bcftools.yaml" -# shell: -# "bcftools consensus -f {input.fasta} {input.bcf} > {output} 2> {log}" -# rule rename_consensus: -# input: -# "results/{date}/consensus/bcftools/{sample}.fasta", -# output: -# report( -# "results/{date}/contigs/consensus/{sample}.fasta", -# category="4. Sequences", -# subcategory="3. 
Consensus Sequences", -# caption="../report/assembly_consensus.rst", -# ), -# log: -# "logs/{date}/rename-consensus-fasta/{sample}.log", -# conda: -# "../envs/unix.yaml" -# shell: -# "(cp {input} {output} && " -# ' sed -i "1s/.*/>{wildcards.sample}/" {output})' -# " 2> {log}" +rule bcftools_consensus_ont: + input: + fasta="results/{date}/consensus/medaka/{sample}/consensus.fasta", + bcf="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.orf.bcf", # clonal vs. subclonal? + bcfidx="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.orf.bcf.csi", + output: + "results/{date}/consensus/bcftools/{sample}.fasta", + log: + "logs/{date}/bcftools-consensus-ont/{sample}.log", + conda: + "../envs/bcftools.yaml" + shell: + "bcftools consensus -f {input.fasta} {input.bcf} > {output} 2> {log}" + + +rule rename_consensus: + input: + "results/{date}/consensus/bcftools/{sample}.fasta", + output: + report( + "results/{date}/contigs/consensus/{sample}.fasta", + category="4. Sequences", + subcategory="3. 
Consensus Sequences", + caption="../report/assembly_consensus.rst", + ), + log: + "logs/{date}/rename-consensus-fasta/{sample}.log", + conda: + "../envs/unix.yaml" + shell: + "(cp {input} {output} && " + ' sed -i "1s/.*/>{wildcards.sample}/" {output})' + " 2> {log}" From 6b862258b20f34c363b151b6fff694d24d9a4aa9 Mon Sep 17 00:00:00 2001 From: simakro Date: Mon, 7 Mar 2022 12:11:21 +0000 Subject: [PATCH 32/53] echo test --- .tests/config/config.yaml | 2 +- .tests/resources/primer.bedpe | 111 +++++++++++++++++++++++++++++++++- workflow/rules/long_read.smk | 4 +- 3 files changed, 112 insertions(+), 5 deletions(-) diff --git a/.tests/config/config.yaml b/.tests/config/config.yaml index 25a8ea6f4..92fae4b27 100644 --- a/.tests/config/config.yaml +++ b/.tests/config/config.yaml @@ -128,7 +128,7 @@ strain-calling: # paths to store genomes that are extracted from the full GISAID data extracted-strain-genomes: resources/genomes # flag for using all lineage reference from GISAIDS Epicov database. API key must be exported as env var GISAID_API_TOKEN. 
- use-gisaid: True + use-gisaid: False # GenBank accession for downloading lineage-references # only used, if use-gisaid flag is set to False lineage-references: diff --git a/.tests/resources/primer.bedpe b/.tests/resources/primer.bedpe index 27160a179..080cac138 100644 --- a/.tests/resources/primer.bedpe +++ b/.tests/resources/primer.bedpe @@ -1,2 +1,109 @@ -MN908947.3 25 45 MN908947.3 265 285 -MN908947.3 215 235 MN908947.3 485 505 +MN908947.3 30 54 MN908947.3 385 410 +MN908947.3 320 342 MN908947.3 704 726 +MN908947.3 642 664 MN908947.3 1004 1028 +MN908947.3 943 965 MN908947.3 1312 1337 +MN908947.3 1242 1264 MN908947.3 1623 1651 +MN908947.3 1573 1595 MN908947.3 1942 1964 +MN908947.3 1875 1897 MN908947.3 2247 2269 +MN908947.3 1868 1890 MN908947.3 2242 2264 +MN908947.3 2181 2205 MN908947.3 2568 2592 +MN908947.3 2505 2529 MN908947.3 2882 2904 +MN908947.3 2504 2528 MN908947.3 2880 2902 +MN908947.3 2826 2850 MN908947.3 3183 3210 +MN908947.3 3144 3166 MN908947.3 3507 3531 +MN908947.3 3460 3482 MN908947.3 3826 3853 +MN908947.3 3771 3795 MN908947.3 4142 4164 +MN908947.3 4054 4077 MN908947.3 4428 4450 +MN908947.3 4044 4068 MN908947.3 4402 4424 +MN908947.3 4294 4321 MN908947.3 4674 4696 +MN908947.3 4296 4322 MN908947.3 4666 4689 +MN908947.3 4636 4658 MN908947.3 4995 5017 +MN908947.3 4939 4966 MN908947.3 5296 5321 +MN908947.3 5230 5259 MN908947.3 5620 5644 +MN908947.3 5257 5287 MN908947.3 5620 5643 +MN908947.3 5563 5586 MN908947.3 5932 5957 +MN908947.3 5867 5894 MN908947.3 6247 6272 +MN908947.3 6167 6196 MN908947.3 6528 6550 +MN908947.3 6168 6197 MN908947.3 6526 6548 +MN908947.3 6466 6495 MN908947.3 6846 6873 +MN908947.3 6718 6745 MN908947.3 7092 7117 +MN908947.3 7035 7058 MN908947.3 7389 7415 +MN908947.3 7305 7332 MN908947.3 7671 7694 +MN908947.3 7626 7651 MN908947.3 7997 8019 +MN908947.3 7943 7968 MN908947.3 8319 8341 +MN908947.3 8249 8275 MN908947.3 8635 8661 +MN908947.3 8595 8619 MN908947.3 8954 8983 +MN908947.3 8888 8913 MN908947.3 9245 9271 +MN908947.3 9204 9226 
MN908947.3 9557 9585 +MN908947.3 9477 9502 MN908947.3 9834 9858 +MN908947.3 9784 9806 MN908947.3 10146 10171 +MN908947.3 10076 10099 MN908947.3 10437 10459 +MN908947.3 10362 10384 MN908947.3 10737 10763 +MN908947.3 10666 10688 MN908947.3 11048 11074 +MN908947.3 10999 11022 MN908947.3 11372 11394 +MN908947.3 11306 11331 MN908947.3 11668 11693 +MN908947.3 11555 11584 MN908947.3 11927 11949 +MN908947.3 11863 11889 MN908947.3 12234 12256 +MN908947.3 12110 12133 MN908947.3 12465 12490 +MN908947.3 12417 12439 MN908947.3 12779 12802 +MN908947.3 12710 12732 MN908947.3 13074 13096 +MN908947.3 13005 13027 MN908947.3 13378 13400 +MN908947.3 13007 13029 MN908947.3 13363 13385 +MN908947.3 13319 13344 MN908947.3 13669 13699 +MN908947.3 13307 13336 MN908947.3 13660 13689 +MN908947.3 13599 13621 MN908947.3 13962 13984 +MN908947.3 13602 13625 MN908947.3 13961 13984 +MN908947.3 13918 13946 MN908947.3 14271 14299 +MN908947.3 14207 14232 MN908947.3 14579 14601 +MN908947.3 14545 14570 MN908947.3 14898 14926 +MN908947.3 14865 14895 MN908947.3 15224 15246 +MN908947.3 15171 15193 MN908947.3 15538 15560 +MN908947.3 15481 15503 MN908947.3 15861 15886 +MN908947.3 15827 15851 MN908947.3 16186 16209 +MN908947.3 16118 16144 MN908947.3 16485 16510 +MN908947.3 16416 16444 MN908947.3 16804 16833 +MN908947.3 16748 16770 MN908947.3 17130 17152 +MN908947.3 17065 17087 MN908947.3 17430 17452 +MN908947.3 17381 17406 MN908947.3 17738 17761 +MN908947.3 17674 17697 MN908947.3 18036 18062 +MN908947.3 17966 17993 MN908947.3 18324 18348 +MN908947.3 18253 18275 MN908947.3 18650 18672 +MN908947.3 18596 18618 MN908947.3 18957 18979 +MN908947.3 18896 18918 MN908947.3 19275 19297 +MN908947.3 19204 19232 MN908947.3 19591 19616 +MN908947.3 19548 19570 MN908947.3 19911 19939 +MN908947.3 19844 19866 MN908947.3 20231 20255 +MN908947.3 20172 20200 MN908947.3 20542 20572 +MN908947.3 20472 20496 MN908947.3 20867 20890 +MN908947.3 20786 20813 MN908947.3 21146 21169 +MN908947.3 21075 21104 MN908947.3 21427 21455 
+MN908947.3 21357 21386 MN908947.3 21716 21743 +MN908947.3 21658 21682 MN908947.3 22013 22038 +MN908947.3 21961 21990 MN908947.3 22324 22346 +MN908947.3 22262 22290 MN908947.3 22626 22650 +MN908947.3 22516 22542 MN908947.3 22877 22903 +MN908947.3 22797 22819 MN908947.3 23192 23214 +MN908947.3 22798 22821 MN908947.3 23189 23212 +MN908947.3 23122 23144 MN908947.3 23500 23522 +MN908947.3 23443 23466 MN908947.3 23822 23847 +MN908947.3 23789 23812 MN908947.3 24145 24169 +MN908947.3 24078 24100 MN908947.3 24443 24467 +MN908947.3 24391 24416 MN908947.3 24765 24789 +MN908947.3 24696 24721 MN908947.3 25052 25076 +MN908947.3 24978 25003 MN908947.3 25347 25369 +MN908947.3 25279 25301 MN908947.3 25646 25673 +MN908947.3 25601 25623 MN908947.3 25969 25994 +MN908947.3 25902 25924 MN908947.3 26290 26315 +MN908947.3 26197 26219 MN908947.3 26566 26590 +MN908947.3 26520 26542 MN908947.3 26890 26913 +MN908947.3 26835 26857 MN908947.3 27202 27227 +MN908947.3 26838 26860 MN908947.3 27190 27215 +MN908947.3 27141 27164 MN908947.3 27511 27533 +MN908947.3 27446 27471 MN908947.3 27825 27854 +MN908947.3 27784 27808 MN908947.3 28145 28172 +MN908947.3 28081 28104 MN908947.3 28442 28464 +MN908947.3 28394 28416 MN908947.3 28756 28779 +MN908947.3 28677 28699 MN908947.3 29041 29063 +MN908947.3 28985 29007 MN908947.3 29356 29378 +MN908947.3 29288 29316 MN908947.3 29665 29693 +MN908947.3 29486 29510 MN908947.3 29836 29866 diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index 5f6753ffe..7a3839c3b 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -47,9 +47,9 @@ rule nanofilt: shell: """ if (file {input} | grep -q compressed) ; then - gunzip -c {input} | NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 700 > {output} 2> {log} + echo gzipped && gunzip -c {input} | NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 700 > {output} 2> {log} else - NanoFilt --length {params.min_length} --quality 
{params.min_PHRED} --maxlength 700 {input} > {output} 2> {log} + echo unzipped && NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 700 {input} > {output} 2> {log} fi """ From 1cd7d0e9df8c9c32ab281e8d81eb3f193c8bedb1 Mon Sep 17 00:00:00 2001 From: simakro Date: Mon, 7 Mar 2022 12:35:49 +0000 Subject: [PATCH 33/53] remove parenthesis --- workflow/rules/long_read.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index 7a3839c3b..1b4c4f2d0 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -46,7 +46,7 @@ rule nanofilt: "../envs/nanofilt.yaml" shell: """ - if (file {input} | grep -q compressed) ; then + if file {input} | grep -q compressed ; then echo gzipped && gunzip -c {input} | NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 700 > {output} 2> {log} else echo unzipped && NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 700 {input} > {output} 2> {log} From 388befbc32c381b9273ce94b3aec70128342499b Mon Sep 17 00:00:00 2001 From: simakro Date: Mon, 7 Mar 2022 13:25:32 +0000 Subject: [PATCH 34/53] added gzip to nanofilt env and replace gunzip with gzip in rule nanofilt --- workflow/envs/nanofilt.yaml | 1 + workflow/rules/long_read.smk | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/workflow/envs/nanofilt.yaml b/workflow/envs/nanofilt.yaml index 7792655f1..6b13383d1 100644 --- a/workflow/envs/nanofilt.yaml +++ b/workflow/envs/nanofilt.yaml @@ -3,3 +3,4 @@ channels: - conda-forge dependencies: - nanofilt =2.8 + - gzip diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index 1b4c4f2d0..84bb887bd 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -47,9 +47,9 @@ rule nanofilt: shell: """ if file {input} | grep -q compressed ; then - echo gzipped && gunzip -c {input} | NanoFilt --length {params.min_length} 
--quality {params.min_PHRED} --maxlength 700 > {output} 2> {log} + echo gzipped && gzip -d -c {input} | NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 700 > {output} 2> {log} else - echo unzipped && NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 700 {input} > {output} 2> {log} + echo unzipped && NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 700 {input} > {output} 2> {log} fi """ From b5c444384d45d3b3be43a8e02b8733123b3a5baf Mon Sep 17 00:00:00 2001 From: simakro Date: Mon, 7 Mar 2022 15:09:22 +0000 Subject: [PATCH 35/53] add file to nanofilt env --- workflow/envs/nanofilt.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/workflow/envs/nanofilt.yaml b/workflow/envs/nanofilt.yaml index 6b13383d1..8daa7c83f 100644 --- a/workflow/envs/nanofilt.yaml +++ b/workflow/envs/nanofilt.yaml @@ -4,3 +4,4 @@ channels: dependencies: - nanofilt =2.8 - gzip + - file From f9b14633277adf478f9bbe9653ed10688f660b18 Mon Sep 17 00:00:00 2001 From: simakro Date: Mon, 7 Mar 2022 16:16:22 +0000 Subject: [PATCH 36/53] set corMemory for testing to 6GB --- workflow/rules/long_read.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index 84bb887bd..9caca73bc 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -97,7 +97,7 @@ rule canu_correct: concurrency=lambda w, threads: get_canu_concurrency(threads), min_length=config["quality-criteria"]["ont"]["min-length-reads"], for_testing=lambda w, threads: get_if_testing( - f"corThreads={threads} redMemory=6 oeaMemory=6" + f"corThreads={threads} corMemory=6 redMemory=6 oeaMemory=6" ), conda: "../envs/canu.yaml" From f76aae4a87ccfcecab1427f2bde98e386fc8396c Mon Sep 17 00:00:00 2001 From: simakro Date: Tue, 8 Mar 2022 14:33:12 +0000 Subject: [PATCH 37/53] corona-spades in long_read.smk --- config/pep/samples.csv | 4 +--- workflow/rules/assembly.smk | 
2 +- workflow/rules/common.smk | 3 ++- workflow/rules/long_read.smk | 15 ++++++++++++++- workflow/rules/read_mapping.smk | 3 ++- 5 files changed, 20 insertions(+), 7 deletions(-) diff --git a/config/pep/samples.csv b/config/pep/samples.csv index 4173923ea..0416fdea8 100644 --- a/config/pep/samples.csv +++ b/config/pep/samples.csv @@ -1,4 +1,2 @@ sample_name,fq1,fq2,date,is_amplicon_data,technology,include_in_high_genome_summary -SAMPLE_NAME_1,PATH/TO/fq1,PATH/TO/fq2,SEQUENCING_DATE,0,illumina,1 # Required information for a sample sequencing on the Illumina platform -SAMPLE_NAME_2,PATH/TO/fq,,SEQUENCING_DATE,1,ont,0 # Required information for a sample sequencing on the Oxford Nanopore platform -SAMPLE_NAME_3,PATH/TO/fq,,SEQUENCING_DATE,1,ion,0 # Required information for a sample sequencing on the Ion Torrent platform +UnCoVar_RefDataSet_Batch2_BC03_cat,/home/simon/uncovar/data/newdate/UnCoVar_RefDataSet_Batch2_BC03_cat.fastq,,newdate,1,ont,0 diff --git a/workflow/rules/assembly.smk b/workflow/rules/assembly.smk index d705b28eb..62cc50298 100644 --- a/workflow/rules/assembly.smk +++ b/workflow/rules/assembly.smk @@ -183,7 +183,7 @@ rule assembly_polishing_ont: reference="results/{date}/contigs/ordered/{sample}.fasta", output: report( - "results/{date}/polishing/medaka/{sample}/{sample}.fasta", + "results/{date}/polishing/medaka/{sample}/consensus.fasta", category="4. Sequences", subcategory="1. 
De Novo Assembled Sequences", caption="../report/assembly_ont.rst", diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index a8750172d..b706a54f7 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -1410,7 +1410,8 @@ def get_polished_sequence(wildcards): return get_pattern_by_technology( wildcards, illumina_pattern="results/{date}/polishing/bcftools-illumina/{sample}.fasta", - ont_pattern="results/{date}/polishing/medaka/{sample}/{sample}.fasta", + # ont_pattern="results/{date}/polishing/medaka/{sample}/consensus.fasta", + ont_pattern="results/{date}/consensus/medaka/{sample}/consensus.fasta", ion_torrent_pattern="results/{date}/polishing/bcftools-illumina/{sample}.fasta", ) diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index 9caca73bc..706ffccd4 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -1,4 +1,4 @@ -# Copyright 2022 Thomas Battenfeld, Alexander Thomas, Johannes Köster. +# Copyright 2022 Thomas Battenfeld, Alexander Thomas, Johannes Köster, Simon Magin. # Licensed under the BSD 2-Clause License (https://opensource.org/licenses/BSD-2-Clause) # This file may not be copied, modified, or distributed # except according to those terms. 
@@ -140,6 +140,19 @@ rule clip_adbc_corrected: "notramp -t --incl_prim -r {input.reads} -p {input.primer} -g {input.ref_genome} -o {params.outdir}" +rule spades_assemble: + input: + "results/{date}/norm_trim_corr_reads/{sample}/{sample}.correctedReads.clip.fasta", + output: + outdir=directory("results/{date}/{sample}_spades_asm"), + outfile="results/{date}/{sample}_spades_asm/raw_contigs.fasta", + conda: + "../envs/spades.yaml" + threads: 6 + shell: + "spades.py --corona -s {input} -o {output.outdir}" + + rule bcftools_consensus_ont: input: fasta="results/{date}/consensus/medaka/{sample}/consensus.fasta", diff --git a/workflow/rules/read_mapping.smk b/workflow/rules/read_mapping.smk index d4193706e..f53d265ed 100644 --- a/workflow/rules/read_mapping.smk +++ b/workflow/rules/read_mapping.smk @@ -49,7 +49,8 @@ rule map_reads: reads=get_reads, idx=get_bwa_index, output: - temp("results/{date}/mapped/ref~{reference}/{sample}.bam"), + # temp("results/{date}/mapped/ref~{reference}/{sample}.bam"), + "results/{date}/mapped/ref~{reference}/{sample}.bam", log: "logs/{date}/bwa-mem/ref~{reference}/{sample}.log", params: From 2118d24f3af9e9c6ad666ab0ab98ebdbcc190998 Mon Sep 17 00:00:00 2001 From: simakro Date: Wed, 9 Mar 2022 07:02:17 +0000 Subject: [PATCH 38/53] solved all issues with wrong input-files for ont de-novo assembly --- workflow/rules/assembly.smk | 10 ++++++++-- workflow/rules/common.smk | 9 ++++++--- workflow/rules/long_read.smk | 9 ++++++--- workflow/rules/qc.smk | 9 +++++++-- 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/workflow/rules/assembly.smk b/workflow/rules/assembly.smk index 62cc50298..462eda45b 100644 --- a/workflow/rules/assembly.smk +++ b/workflow/rules/assembly.smk @@ -91,9 +91,15 @@ rule assembly_spades_pe: rule spades_assemble_se: input: - get_reads_after_qc, + # get_reads_after_qc, + # "results/newdate/norm_trim_corr_reads/UnCoVar_RefDataSet_Batch2_BC03_cat/UnCoVar_RefDataSet_Batch2_BC03_cat.correctedReads.clip.fasta" + # 
"results/newdate/nonhuman-reads/se/UnCoVar_RefDataSet_Batch2_BC03_cat.correctedReads.clip.fastq" + "results/newdate/nonhuman-reads/se/UnCoVar_RefDataSet_Batch2_BC03_cat.fastq", output: - temp("results/{date}/assembly/{sample}/spades-se/{sample}.contigs.fasta"), + # temp("results/{date}/assembly/{sample}/spades-se/{sample}.contigs.fasta"), + # "results/{date}/assembly/{sample}/spades-se/{sample}.correctedReads.clip.contigs.fasta", + # "results/{date}/assembly/{sample}/spades-se/{sample}.contigs.fasta", + "results/{date}/assembly/{sample}/spades-se/{sample}.contigs.fasta", log: "logs/{date}/spades/se/{sample}.log", conda: diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index b706a54f7..39743e377 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -407,7 +407,7 @@ def get_reads(wildcards): ) ont_pattern = expand( - "results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", + "results/{date}/corrected/{sample}/{sample}.correctedReads.clip.fasta", **wildcards, ) @@ -458,7 +458,9 @@ def get_reads_after_qc(wildcards, read="both"): **wildcards, ) ont_pattern = expand( - "results/{date}/nonhuman-reads/se/{sample}.fastq.gz", **wildcards + # "results/{date}/nonhuman-reads/se/{sample}.correctedReads.clip.fastq", **wildcards + "results/{date}/nonhuman-reads/se/{sample}.fastq", + **wildcards, ) ion_torrent_pattern = expand( "results/{date}/read-clipping/fastq/se/{sample}.fastq", **wildcards @@ -1398,7 +1400,8 @@ def get_reads_by_stage(wildcards): if wildcards.stage == "raw": return get_fastqs(wildcards) elif wildcards.stage == "trimmed": - return "results/{date}/norm_trim_corr_reads/{sample}/{sample}.correctedReads.clip.fasta" + # return "results/{date}/norm_trim_corr_reads/{sample}/{sample}.correctedReads.clip.fasta" + return "results/{date}/corrected/{sample}/{sample}.correctedReads.clip.fasta" elif wildcards.stage == "clipped": return "results/{date}/norm_trim_raw_reads/{sample}/{sample}.cap.clip.fasta" elif wildcards.stage == 
"filtered": diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index 706ffccd4..f4b25f68c 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -129,11 +129,13 @@ rule clip_adbc_corrected: reads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", ref_genome="resources/genomes/main.fasta", output: - "results/{date}/norm_trim_corr_reads/{sample}/{sample}.correctedReads.clip.fasta", + # "results/{date}/norm_trim_corr_reads/{sample}/{sample}.correctedReads.clip.fasta", + "results/{date}/corrected/{sample}/{sample}.correctedReads.clip.fasta", params: outdir=get_output_dir, log: - "results/{date}/norm_trim_corr_reads/{sample}/notramp.log", + # "results/{date}/norm_trim_corr_reads/{sample}/notramp.log", + "results/{date}/corrected/{sample}/notramp.log", conda: "../envs/notramp.yaml" shell: @@ -142,7 +144,8 @@ rule clip_adbc_corrected: rule spades_assemble: input: - "results/{date}/norm_trim_corr_reads/{sample}/{sample}.correctedReads.clip.fasta", + # "results/{date}/norm_trim_corr_reads/{sample}/{sample}.correctedReads.clip.fastq", + "results/{date}/corrected/{sample}/{sample}.correctedReads.clip.fastq", output: outdir=directory("results/{date}/{sample}_spades_asm"), outfile="results/{date}/{sample}_spades_asm/raw_contigs.fasta", diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index cb3220eca..7cca1066c 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -204,7 +204,8 @@ rule extract_reads_of_interest: bam="results/{date}/mapped/ref~main+human/{sample}.bam", index="results/{date}/mapped/ref~main+human/{sample}.bam.bai", output: - temp("results/{date}/mapped/ref~main+human/nonhuman/{sample}.bam"), + # temp("results/{date}/mapped/ref~main+human/nonhuman/{sample}.bam"), + "results/{date}/mapped/ref~main+human/nonhuman/{sample}.bam", log: "logs/{date}/extract_reads_of_interest/{sample}.log", params: @@ -237,7 +238,9 @@ rule order_nonhuman_reads_se: input: 
"results/{date}/mapped/ref~main+human/nonhuman/{sample}.bam", output: - fq=temp("results/{date}/nonhuman-reads/se/{sample}.fastq.gz"), + # fq=temp("results/{date}/nonhuman-reads/se/{sample}.fastq.gz"), + fq="results/{date}/nonhuman-reads/se/{sample}.fastq", + # fa="results/{date}/nonhuman-reads/se/{sample}.correctedReads.clip.fasta", bam_sorted=temp("results/{date}/nonhuman-reads/{sample}.sorted.bam"), log: "logs/{date}/order_nonhuman_reads/se/{sample}.log", @@ -247,7 +250,9 @@ rule order_nonhuman_reads_se: shell: "(samtools sort -@ {threads} -n {input} -o {output.bam_sorted}; " " samtools fastq -@ {threads} -0 {output.fq} {output.bam_sorted})" + " > {log} 2>&1" + # " samtools fasta -@ {threads} -0 {output.fa} {output.bam_sorted})" # analysis of species diversity present AFTER removing human contamination From 18263fff6949f21f568ac136986b43edd9ad1ac2 Mon Sep 17 00:00:00 2001 From: simakro Date: Wed, 9 Mar 2022 07:50:06 +0000 Subject: [PATCH 39/53] cleanup and temp-notes --- workflow/rules/assembly.smk | 7 +------ workflow/rules/common.smk | 6 +----- workflow/rules/long_read.smk | 16 ---------------- workflow/rules/qc.smk | 8 +------- workflow/rules/read_mapping.smk | 3 +-- 5 files changed, 4 insertions(+), 36 deletions(-) diff --git a/workflow/rules/assembly.smk b/workflow/rules/assembly.smk index 462eda45b..89fcab0e4 100644 --- a/workflow/rules/assembly.smk +++ b/workflow/rules/assembly.smk @@ -91,14 +91,9 @@ rule assembly_spades_pe: rule spades_assemble_se: input: - # get_reads_after_qc, - # "results/newdate/norm_trim_corr_reads/UnCoVar_RefDataSet_Batch2_BC03_cat/UnCoVar_RefDataSet_Batch2_BC03_cat.correctedReads.clip.fasta" - # "results/newdate/nonhuman-reads/se/UnCoVar_RefDataSet_Batch2_BC03_cat.correctedReads.clip.fastq" "results/newdate/nonhuman-reads/se/UnCoVar_RefDataSet_Batch2_BC03_cat.fastq", output: - # temp("results/{date}/assembly/{sample}/spades-se/{sample}.contigs.fasta"), - # 
"results/{date}/assembly/{sample}/spades-se/{sample}.correctedReads.clip.contigs.fasta", - # "results/{date}/assembly/{sample}/spades-se/{sample}.contigs.fasta", + # used to be temp() "results/{date}/assembly/{sample}/spades-se/{sample}.contigs.fasta", log: "logs/{date}/spades/se/{sample}.log", diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 39743e377..e128a6574 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -458,9 +458,7 @@ def get_reads_after_qc(wildcards, read="both"): **wildcards, ) ont_pattern = expand( - # "results/{date}/nonhuman-reads/se/{sample}.correctedReads.clip.fastq", **wildcards - "results/{date}/nonhuman-reads/se/{sample}.fastq", - **wildcards, + "results/{date}/nonhuman-reads/se/{sample}.fastq", **wildcards ) ion_torrent_pattern = expand( "results/{date}/read-clipping/fastq/se/{sample}.fastq", **wildcards @@ -1400,7 +1398,6 @@ def get_reads_by_stage(wildcards): if wildcards.stage == "raw": return get_fastqs(wildcards) elif wildcards.stage == "trimmed": - # return "results/{date}/norm_trim_corr_reads/{sample}/{sample}.correctedReads.clip.fasta" return "results/{date}/corrected/{sample}/{sample}.correctedReads.clip.fasta" elif wildcards.stage == "clipped": return "results/{date}/norm_trim_raw_reads/{sample}/{sample}.cap.clip.fasta" @@ -1413,7 +1410,6 @@ def get_polished_sequence(wildcards): return get_pattern_by_technology( wildcards, illumina_pattern="results/{date}/polishing/bcftools-illumina/{sample}.fasta", - # ont_pattern="results/{date}/polishing/medaka/{sample}/consensus.fasta", ont_pattern="results/{date}/consensus/medaka/{sample}/consensus.fasta", ion_torrent_pattern="results/{date}/polishing/bcftools-illumina/{sample}.fasta", ) diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index f4b25f68c..84981f19e 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -129,12 +129,10 @@ rule clip_adbc_corrected: 
reads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta", ref_genome="resources/genomes/main.fasta", output: - # "results/{date}/norm_trim_corr_reads/{sample}/{sample}.correctedReads.clip.fasta", "results/{date}/corrected/{sample}/{sample}.correctedReads.clip.fasta", params: outdir=get_output_dir, log: - # "results/{date}/norm_trim_corr_reads/{sample}/notramp.log", "results/{date}/corrected/{sample}/notramp.log", conda: "../envs/notramp.yaml" @@ -142,20 +140,6 @@ rule clip_adbc_corrected: "notramp -t --incl_prim -r {input.reads} -p {input.primer} -g {input.ref_genome} -o {params.outdir}" -rule spades_assemble: - input: - # "results/{date}/norm_trim_corr_reads/{sample}/{sample}.correctedReads.clip.fastq", - "results/{date}/corrected/{sample}/{sample}.correctedReads.clip.fastq", - output: - outdir=directory("results/{date}/{sample}_spades_asm"), - outfile="results/{date}/{sample}_spades_asm/raw_contigs.fasta", - conda: - "../envs/spades.yaml" - threads: 6 - shell: - "spades.py --corona -s {input} -o {output.outdir}" - - rule bcftools_consensus_ont: input: fasta="results/{date}/consensus/medaka/{sample}/consensus.fasta", diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 7cca1066c..b4e5a3918 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -153,9 +153,6 @@ rule species_diversity_before_se: kraken_output=temp( "results/{date}/species-diversity/se/{sample}/{sample}.kraken" ), - # report=( - # "results/{date}/species-diversity/se/{sample}/{sample}.uncleaned.kreport2" - # ), report="results/{date}/species-diversity/se/{sample}/{sample}.uncleaned.kreport2", log: "logs/{date}/kraken/se/{sample}.log", @@ -204,7 +201,6 @@ rule extract_reads_of_interest: bam="results/{date}/mapped/ref~main+human/{sample}.bam", index="results/{date}/mapped/ref~main+human/{sample}.bam.bai", output: - # temp("results/{date}/mapped/ref~main+human/nonhuman/{sample}.bam"), "results/{date}/mapped/ref~main+human/nonhuman/{sample}.bam", log: 
"logs/{date}/extract_reads_of_interest/{sample}.log", @@ -238,9 +234,7 @@ rule order_nonhuman_reads_se: input: "results/{date}/mapped/ref~main+human/nonhuman/{sample}.bam", output: - # fq=temp("results/{date}/nonhuman-reads/se/{sample}.fastq.gz"), fq="results/{date}/nonhuman-reads/se/{sample}.fastq", - # fa="results/{date}/nonhuman-reads/se/{sample}.correctedReads.clip.fasta", bam_sorted=temp("results/{date}/nonhuman-reads/{sample}.sorted.bam"), log: "logs/{date}/order_nonhuman_reads/se/{sample}.log", @@ -252,7 +246,6 @@ rule order_nonhuman_reads_se: " samtools fastq -@ {threads} -0 {output.fq} {output.bam_sorted})" " > {log} 2>&1" - # " samtools fasta -@ {threads} -0 {output.fa} {output.bam_sorted})" # analysis of species diversity present AFTER removing human contamination @@ -264,6 +257,7 @@ rule species_diversity_after_pe: kraken_output=temp( "results/{date}/species-diversity-nonhuman/pe/{sample}/{sample}.kraken" ), + # removed parethesis; used to be temp()? report="results/{date}/species-diversity-nonhuman/pe/{sample}/{sample}.cleaned.kreport2", log: "logs/{date}/kraken/{sample}_pe_nonhuman.log", diff --git a/workflow/rules/read_mapping.smk b/workflow/rules/read_mapping.smk index f53d265ed..08323285d 100644 --- a/workflow/rules/read_mapping.smk +++ b/workflow/rules/read_mapping.smk @@ -49,7 +49,7 @@ rule map_reads: reads=get_reads, idx=get_bwa_index, output: - # temp("results/{date}/mapped/ref~{reference}/{sample}.bam"), + # used to be temp() "results/{date}/mapped/ref~{reference}/{sample}.bam", log: "logs/{date}/bwa-mem/ref~{reference}/{sample}.log", @@ -60,7 +60,6 @@ rule map_reads: sort_order="coordinate", threads: 8 wrapper: - # "v1.2.0/bio/bwa-mem2/mem" "0.69.0/bio/bwa/mem" From 505c9f16b8f4ff51d2a8788acf2c6137329b549a Mon Sep 17 00:00:00 2001 From: simakro Date: Wed, 9 Mar 2022 08:36:40 +0000 Subject: [PATCH 40/53] restored sample-table, removed echos, removed hard-coded path --- config/pep/samples.csv | 4 +++- workflow/rules/assembly.smk | 3 ++- 
workflow/rules/long_read.smk | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/config/pep/samples.csv b/config/pep/samples.csv index 0416fdea8..4173923ea 100644 --- a/config/pep/samples.csv +++ b/config/pep/samples.csv @@ -1,2 +1,4 @@ sample_name,fq1,fq2,date,is_amplicon_data,technology,include_in_high_genome_summary -UnCoVar_RefDataSet_Batch2_BC03_cat,/home/simon/uncovar/data/newdate/UnCoVar_RefDataSet_Batch2_BC03_cat.fastq,,newdate,1,ont,0 +SAMPLE_NAME_1,PATH/TO/fq1,PATH/TO/fq2,SEQUENCING_DATE,0,illumina,1 # Required information for a sample sequencing on the Illumina platform +SAMPLE_NAME_2,PATH/TO/fq,,SEQUENCING_DATE,1,ont,0 # Required information for a sample sequencing on the Oxford Nanopore platform +SAMPLE_NAME_3,PATH/TO/fq,,SEQUENCING_DATE,1,ion,0 # Required information for a sample sequencing on the Ion Torrent platform diff --git a/workflow/rules/assembly.smk b/workflow/rules/assembly.smk index 89fcab0e4..a1338e30f 100644 --- a/workflow/rules/assembly.smk +++ b/workflow/rules/assembly.smk @@ -91,7 +91,8 @@ rule assembly_spades_pe: rule spades_assemble_se: input: - "results/newdate/nonhuman-reads/se/UnCoVar_RefDataSet_Batch2_BC03_cat.fastq", + # "results/newdate/nonhuman-reads/se/UnCoVar_RefDataSet_Batch2_BC03_cat.fastq", + "results/newdate/nonhuman-reads/se/{sample}.fastq", output: # used to be temp() "results/{date}/assembly/{sample}/spades-se/{sample}.contigs.fasta", diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index 84981f19e..cdf2a20e8 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -47,9 +47,9 @@ rule nanofilt: shell: """ if file {input} | grep -q compressed ; then - echo gzipped && gzip -d -c {input} | NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 700 > {output} 2> {log} + gzip -d -c {input} | NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 700 > {output} 2> {log} else - echo unzipped && NanoFilt --length 
{params.min_length} --quality {params.min_PHRED} --maxlength 700 {input} > {output} 2> {log} + NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 700 {input} > {output} 2> {log} fi """ From 2010f2866ffe47afffa33dd70adc9b9793e82208 Mon Sep 17 00:00:00 2001 From: simakro Date: Fri, 11 Mar 2022 12:31:38 +0000 Subject: [PATCH 41/53] resolve log issue --- workflow/envs/notramp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/envs/notramp.yaml b/workflow/envs/notramp.yaml index 66321b450..c0dc36cd4 100644 --- a/workflow/envs/notramp.yaml +++ b/workflow/envs/notramp.yaml @@ -3,5 +3,5 @@ channels: - bioconda - conda-forge dependencies: - - notramp =1.0.3 + - notramp =1.0.4 - minimap2 =2.22 From a441053aa0bc8cc0cad931263c9caa10967cc3e1 Mon Sep 17 00:00:00 2001 From: simakro Date: Fri, 11 Mar 2022 17:21:36 +0000 Subject: [PATCH 42/53] notramp env --- workflow/envs/notramp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/envs/notramp.yaml b/workflow/envs/notramp.yaml index c0dc36cd4..2a57c7ba6 100644 --- a/workflow/envs/notramp.yaml +++ b/workflow/envs/notramp.yaml @@ -3,5 +3,5 @@ channels: - bioconda - conda-forge dependencies: - - notramp =1.0.4 + - notramp =1.0.5 - minimap2 =2.22 From b958a7a7e519858301a78307db7a51e6497b889a Mon Sep 17 00:00:00 2001 From: Thomas Battenfeld Date: Mon, 14 Mar 2022 09:47:08 +0100 Subject: [PATCH 43/53] undo changes in test config --- .tests/config/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.tests/config/config.yaml b/.tests/config/config.yaml index 92fae4b27..4552e119b 100644 --- a/.tests/config/config.yaml +++ b/.tests/config/config.yaml @@ -13,7 +13,7 @@ testing: - MT470120 - MT451810 # flag for using genomes from genbank. Only for benchmarking purposes. - use-genbank: True + use-genbank: False # genome to use as reference. 
Must be a NCBI accession virus-reference-genome: From 41ae1eeb27870270e41e504c72e28bd6852d33d8 Mon Sep 17 00:00:00 2001 From: Thomas Battenfeld Date: Mon, 14 Mar 2022 09:47:41 +0100 Subject: [PATCH 44/53] undo changes in test primer --- .tests/resources/primer.bedpe | 111 +--------------------------------- 1 file changed, 2 insertions(+), 109 deletions(-) diff --git a/.tests/resources/primer.bedpe b/.tests/resources/primer.bedpe index 080cac138..27160a179 100644 --- a/.tests/resources/primer.bedpe +++ b/.tests/resources/primer.bedpe @@ -1,109 +1,2 @@ -MN908947.3 30 54 MN908947.3 385 410 -MN908947.3 320 342 MN908947.3 704 726 -MN908947.3 642 664 MN908947.3 1004 1028 -MN908947.3 943 965 MN908947.3 1312 1337 -MN908947.3 1242 1264 MN908947.3 1623 1651 -MN908947.3 1573 1595 MN908947.3 1942 1964 -MN908947.3 1875 1897 MN908947.3 2247 2269 -MN908947.3 1868 1890 MN908947.3 2242 2264 -MN908947.3 2181 2205 MN908947.3 2568 2592 -MN908947.3 2505 2529 MN908947.3 2882 2904 -MN908947.3 2504 2528 MN908947.3 2880 2902 -MN908947.3 2826 2850 MN908947.3 3183 3210 -MN908947.3 3144 3166 MN908947.3 3507 3531 -MN908947.3 3460 3482 MN908947.3 3826 3853 -MN908947.3 3771 3795 MN908947.3 4142 4164 -MN908947.3 4054 4077 MN908947.3 4428 4450 -MN908947.3 4044 4068 MN908947.3 4402 4424 -MN908947.3 4294 4321 MN908947.3 4674 4696 -MN908947.3 4296 4322 MN908947.3 4666 4689 -MN908947.3 4636 4658 MN908947.3 4995 5017 -MN908947.3 4939 4966 MN908947.3 5296 5321 -MN908947.3 5230 5259 MN908947.3 5620 5644 -MN908947.3 5257 5287 MN908947.3 5620 5643 -MN908947.3 5563 5586 MN908947.3 5932 5957 -MN908947.3 5867 5894 MN908947.3 6247 6272 -MN908947.3 6167 6196 MN908947.3 6528 6550 -MN908947.3 6168 6197 MN908947.3 6526 6548 -MN908947.3 6466 6495 MN908947.3 6846 6873 -MN908947.3 6718 6745 MN908947.3 7092 7117 -MN908947.3 7035 7058 MN908947.3 7389 7415 -MN908947.3 7305 7332 MN908947.3 7671 7694 -MN908947.3 7626 7651 MN908947.3 7997 8019 -MN908947.3 7943 7968 MN908947.3 8319 8341 -MN908947.3 8249 8275 MN908947.3 8635 
8661 -MN908947.3 8595 8619 MN908947.3 8954 8983 -MN908947.3 8888 8913 MN908947.3 9245 9271 -MN908947.3 9204 9226 MN908947.3 9557 9585 -MN908947.3 9477 9502 MN908947.3 9834 9858 -MN908947.3 9784 9806 MN908947.3 10146 10171 -MN908947.3 10076 10099 MN908947.3 10437 10459 -MN908947.3 10362 10384 MN908947.3 10737 10763 -MN908947.3 10666 10688 MN908947.3 11048 11074 -MN908947.3 10999 11022 MN908947.3 11372 11394 -MN908947.3 11306 11331 MN908947.3 11668 11693 -MN908947.3 11555 11584 MN908947.3 11927 11949 -MN908947.3 11863 11889 MN908947.3 12234 12256 -MN908947.3 12110 12133 MN908947.3 12465 12490 -MN908947.3 12417 12439 MN908947.3 12779 12802 -MN908947.3 12710 12732 MN908947.3 13074 13096 -MN908947.3 13005 13027 MN908947.3 13378 13400 -MN908947.3 13007 13029 MN908947.3 13363 13385 -MN908947.3 13319 13344 MN908947.3 13669 13699 -MN908947.3 13307 13336 MN908947.3 13660 13689 -MN908947.3 13599 13621 MN908947.3 13962 13984 -MN908947.3 13602 13625 MN908947.3 13961 13984 -MN908947.3 13918 13946 MN908947.3 14271 14299 -MN908947.3 14207 14232 MN908947.3 14579 14601 -MN908947.3 14545 14570 MN908947.3 14898 14926 -MN908947.3 14865 14895 MN908947.3 15224 15246 -MN908947.3 15171 15193 MN908947.3 15538 15560 -MN908947.3 15481 15503 MN908947.3 15861 15886 -MN908947.3 15827 15851 MN908947.3 16186 16209 -MN908947.3 16118 16144 MN908947.3 16485 16510 -MN908947.3 16416 16444 MN908947.3 16804 16833 -MN908947.3 16748 16770 MN908947.3 17130 17152 -MN908947.3 17065 17087 MN908947.3 17430 17452 -MN908947.3 17381 17406 MN908947.3 17738 17761 -MN908947.3 17674 17697 MN908947.3 18036 18062 -MN908947.3 17966 17993 MN908947.3 18324 18348 -MN908947.3 18253 18275 MN908947.3 18650 18672 -MN908947.3 18596 18618 MN908947.3 18957 18979 -MN908947.3 18896 18918 MN908947.3 19275 19297 -MN908947.3 19204 19232 MN908947.3 19591 19616 -MN908947.3 19548 19570 MN908947.3 19911 19939 -MN908947.3 19844 19866 MN908947.3 20231 20255 -MN908947.3 20172 20200 MN908947.3 20542 20572 -MN908947.3 20472 20496 MN908947.3 
20867 20890 -MN908947.3 20786 20813 MN908947.3 21146 21169 -MN908947.3 21075 21104 MN908947.3 21427 21455 -MN908947.3 21357 21386 MN908947.3 21716 21743 -MN908947.3 21658 21682 MN908947.3 22013 22038 -MN908947.3 21961 21990 MN908947.3 22324 22346 -MN908947.3 22262 22290 MN908947.3 22626 22650 -MN908947.3 22516 22542 MN908947.3 22877 22903 -MN908947.3 22797 22819 MN908947.3 23192 23214 -MN908947.3 22798 22821 MN908947.3 23189 23212 -MN908947.3 23122 23144 MN908947.3 23500 23522 -MN908947.3 23443 23466 MN908947.3 23822 23847 -MN908947.3 23789 23812 MN908947.3 24145 24169 -MN908947.3 24078 24100 MN908947.3 24443 24467 -MN908947.3 24391 24416 MN908947.3 24765 24789 -MN908947.3 24696 24721 MN908947.3 25052 25076 -MN908947.3 24978 25003 MN908947.3 25347 25369 -MN908947.3 25279 25301 MN908947.3 25646 25673 -MN908947.3 25601 25623 MN908947.3 25969 25994 -MN908947.3 25902 25924 MN908947.3 26290 26315 -MN908947.3 26197 26219 MN908947.3 26566 26590 -MN908947.3 26520 26542 MN908947.3 26890 26913 -MN908947.3 26835 26857 MN908947.3 27202 27227 -MN908947.3 26838 26860 MN908947.3 27190 27215 -MN908947.3 27141 27164 MN908947.3 27511 27533 -MN908947.3 27446 27471 MN908947.3 27825 27854 -MN908947.3 27784 27808 MN908947.3 28145 28172 -MN908947.3 28081 28104 MN908947.3 28442 28464 -MN908947.3 28394 28416 MN908947.3 28756 28779 -MN908947.3 28677 28699 MN908947.3 29041 29063 -MN908947.3 28985 29007 MN908947.3 29356 29378 -MN908947.3 29288 29316 MN908947.3 29665 29693 -MN908947.3 29486 29510 MN908947.3 29836 29866 +MN908947.3 25 45 MN908947.3 265 285 +MN908947.3 215 235 MN908947.3 485 505 From 50f76c669649994168f1e6d1a91f4759ad2ccc25 Mon Sep 17 00:00:00 2001 From: Thomas Battenfeld Date: Mon, 14 Mar 2022 09:50:10 +0100 Subject: [PATCH 45/53] add missing version in nanofilt.yaml --- workflow/envs/nanofilt.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/envs/nanofilt.yaml b/workflow/envs/nanofilt.yaml index 8daa7c83f..e845b6fba 100644 --- 
a/workflow/envs/nanofilt.yaml +++ b/workflow/envs/nanofilt.yaml @@ -3,5 +3,5 @@ channels: - conda-forge dependencies: - nanofilt =2.8 - - gzip - - file + - gzip =1.11 + - file =5.39 From e33bca10775b0b13259d78c7dabcb2c61e3442ac Mon Sep 17 00:00:00 2001 From: Thomas Battenfeld Date: Mon, 14 Mar 2022 09:55:06 +0100 Subject: [PATCH 46/53] add remvoed temp flags back in --- workflow/rules/assembly.smk | 3 +-- workflow/rules/qc.smk | 4 ++-- workflow/rules/read_mapping.smk | 3 +-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/workflow/rules/assembly.smk b/workflow/rules/assembly.smk index a1338e30f..784251861 100644 --- a/workflow/rules/assembly.smk +++ b/workflow/rules/assembly.smk @@ -94,8 +94,7 @@ rule spades_assemble_se: # "results/newdate/nonhuman-reads/se/UnCoVar_RefDataSet_Batch2_BC03_cat.fastq", "results/newdate/nonhuman-reads/se/{sample}.fastq", output: - # used to be temp() - "results/{date}/assembly/{sample}/spades-se/{sample}.contigs.fasta", + temp("results/{date}/assembly/{sample}/spades-se/{sample}.contigs.fasta"), log: "logs/{date}/spades/se/{sample}.log", conda: diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index b4e5a3918..ff584bfdf 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -201,7 +201,7 @@ rule extract_reads_of_interest: bam="results/{date}/mapped/ref~main+human/{sample}.bam", index="results/{date}/mapped/ref~main+human/{sample}.bam.bai", output: - "results/{date}/mapped/ref~main+human/nonhuman/{sample}.bam", + temp("results/{date}/mapped/ref~main+human/nonhuman/{sample}.bam"), log: "logs/{date}/extract_reads_of_interest/{sample}.log", params: @@ -234,7 +234,7 @@ rule order_nonhuman_reads_se: input: "results/{date}/mapped/ref~main+human/nonhuman/{sample}.bam", output: - fq="results/{date}/nonhuman-reads/se/{sample}.fastq", + fq=temp("results/{date}/nonhuman-reads/se/{sample}.fastq"), bam_sorted=temp("results/{date}/nonhuman-reads/{sample}.sorted.bam"), log: 
"logs/{date}/order_nonhuman_reads/se/{sample}.log", diff --git a/workflow/rules/read_mapping.smk b/workflow/rules/read_mapping.smk index 08323285d..b4b2243b2 100644 --- a/workflow/rules/read_mapping.smk +++ b/workflow/rules/read_mapping.smk @@ -49,8 +49,7 @@ rule map_reads: reads=get_reads, idx=get_bwa_index, output: - # used to be temp() - "results/{date}/mapped/ref~{reference}/{sample}.bam", + temp("results/{date}/mapped/ref~{reference}/{sample}.bam"), log: "logs/{date}/bwa-mem/ref~{reference}/{sample}.log", params: From cdfe62611b641bc7723bd7018170527a52eb9b11 Mon Sep 17 00:00:00 2001 From: Thomas Battenfeld Date: Mon, 14 Mar 2022 09:55:49 +0100 Subject: [PATCH 47/53] rmv whitespace --- workflow/rules/qc.smk | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index ff584bfdf..009075c01 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -244,8 +244,7 @@ rule order_nonhuman_reads_se: shell: "(samtools sort -@ {threads} -n {input} -o {output.bam_sorted}; " " samtools fastq -@ {threads} -0 {output.fq} {output.bam_sorted})" - - " > {log} 2>&1" + "> {log} 2>&1" # analysis of species diversity present AFTER removing human contamination From b1712f07fc8faf902b71803e512d4b54ce07f68b Mon Sep 17 00:00:00 2001 From: Thomas Battenfeld Date: Mon, 14 Mar 2022 10:40:35 +0100 Subject: [PATCH 48/53] update env file --- workflow/envs/canu.yaml | 4 ++-- workflow/envs/minimap2.yaml | 2 +- workflow/envs/seqtk.yaml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/workflow/envs/canu.yaml b/workflow/envs/canu.yaml index 2199be94a..a8fec4951 100644 --- a/workflow/envs/canu.yaml +++ b/workflow/envs/canu.yaml @@ -3,5 +3,5 @@ channels: - conda-forge dependencies: # do not use canu 2.2 ! 
- - canu = 2.1.1 - - minimap2 = 2.22 + - canu =2.1.1 + - minimap2 =2.22 diff --git a/workflow/envs/minimap2.yaml b/workflow/envs/minimap2.yaml index 16f072c0b..a44e24867 100644 --- a/workflow/envs/minimap2.yaml +++ b/workflow/envs/minimap2.yaml @@ -2,4 +2,4 @@ channels: - bioconda - conda-forge dependencies: - - minimap2 = 2.22 + - minimap2 =2.22 diff --git a/workflow/envs/seqtk.yaml b/workflow/envs/seqtk.yaml index 123ad1616..101cc3c9d 100644 --- a/workflow/envs/seqtk.yaml +++ b/workflow/envs/seqtk.yaml @@ -2,4 +2,4 @@ channels: - bioconda - conda-forge dependencies: - - seqtk = 1.3 + - seqtk =1.3 From 2c885c180c168a46646e9f696006f96cc5eaf848 Mon Sep 17 00:00:00 2001 From: Thomas Battenfeld Date: Mon, 14 Mar 2022 10:40:44 +0100 Subject: [PATCH 49/53] add canu logging --- workflow/rules/long_read.smk | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index cdf2a20e8..e10c9cf8d 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -103,24 +103,20 @@ rule canu_correct: "../envs/canu.yaml" threads: 16 shell: - """ - ( if [ -d {params.outdir} ]; then rm -Rf {params.outdir}; fi && - canu -correct -nanopore {input} -p {wildcards.sample} -d {params.outdir} genomeSize=30k minOverlapLength=10 minReadLength=200 \ - useGrid=false {params.for_testing} \ - corMMapMerSize=10 corOutCoverage=50000 corMinCoverage=0 maxInputCoverage=20000 \ - corOverlapper=minimap utgOverlapper=minimap obtOverlapper=minimap \ - corConcurrency={params.concurrency} \ - cormhapConcurrency={params.concurrency} cormhapThreads={params.concurrency} \ - cormmapConcurrency={params.concurrency} cormmapThreads={params.concurrency} \ - obtmmapConcurrency={params.concurrency} obtmmapThreads={params.concurrency} \ - utgmmapConcurrency={params.concurrency} utgmmapThreads={params.concurrency} \ - redConcurrency={params.concurrency} redThreads={params.concurrency} \ - 
ovbConcurrency={params.concurrency} \ - ovsConcurrency={params.concurrency} \ - oeaConcurrency={params.concurrency}) - gzip -d {params.corr_gz} --keep - > {log} 2>&1 - """ + "( if [ -d {params.outdir} ]; then rm -Rf {params.outdir}; fi && " + "canu -correct -nanopore {input} -p {wildcards.sample} -d {params.outdir} genomeSize=30k " + "minOverlapLength=10 minReadLength=200 useGrid=false {params.for_testing} " + "corMMapMerSize=10 corOutCoverage=50000 corMinCoverage=0 maxInputCoverage=20000 " + "corOverlapper=minimap utgOverlapper=minimap obtOverlapper=minimap " + "corConcurrency={params.concurrency} ovbConcurrency={params.concurrency} " + "cormhapConcurrency={params.concurrency} cormhapThreads={params.concurrency} " + "cormmapConcurrency={params.concurrency} cormmapThreads={params.concurrency} " + "obtmmapConcurrency={params.concurrency} obtmmapThreads={params.concurrency} " + "utgmmapConcurrency={params.concurrency} utgmmapThreads={params.concurrency} " + "redConcurrency={params.concurrency} redThreads={params.concurrency} " + "ovsConcurrency={params.concurrency} oeaConcurrency={params.concurrency} && " + "gzip -d {params.corr_gz} --keep)" + "> {log} 2>&1" rule clip_adbc_corrected: From 173747af409f3bab9433e292f372359659a010ed Mon Sep 17 00:00:00 2001 From: Thomas Battenfeld Date: Mon, 14 Mar 2022 10:42:17 +0100 Subject: [PATCH 50/53] update config --- .tests/config/config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.tests/config/config.yaml b/.tests/config/config.yaml index 4552e119b..25a8ea6f4 100644 --- a/.tests/config/config.yaml +++ b/.tests/config/config.yaml @@ -13,7 +13,7 @@ testing: - MT470120 - MT451810 # flag for using genomes from genbank. Only for benchmarking purposes. - use-genbank: False + use-genbank: True # genome to use as reference. 
Must be a NCBI accession virus-reference-genome: @@ -128,7 +128,7 @@ strain-calling: # paths to store genomes that are extracted from the full GISAID data extracted-strain-genomes: resources/genomes # flag for using all lineage reference from GISAIDS Epicov database. API key must be exported as env var GISAID_API_TOKEN. - use-gisaid: False + use-gisaid: True # GenBank accession for downloading lineage-references # only used, if use-gisaid flag is set to False lineage-references: From da021a20b57a4f33604661ec919e22c0b9bcc9f2 Mon Sep 17 00:00:00 2001 From: Thomas Battenfeld Date: Mon, 14 Mar 2022 10:50:31 +0100 Subject: [PATCH 51/53] add temp --- workflow/rules/long_read.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index e10c9cf8d..0090113a2 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -142,7 +142,7 @@ rule bcftools_consensus_ont: bcf="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.orf.bcf", # clonal vs. subclonal? 
bcfidx="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.orf.bcf.csi", output: - "results/{date}/consensus/bcftools/{sample}.fasta", + temp("results/{date}/consensus/bcftools/{sample}.fasta"), log: "logs/{date}/bcftools-consensus-ont/{sample}.log", conda: From 0a2b5fcb518fa482c2ea1974cf91c5e7daa0f4aa Mon Sep 17 00:00:00 2001 From: Thomas Battenfeld Date: Mon, 14 Mar 2022 19:45:11 +0100 Subject: [PATCH 52/53] rmv hardcoded date path --- workflow/rules/assembly.smk | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/workflow/rules/assembly.smk b/workflow/rules/assembly.smk index 784251861..62cc50298 100644 --- a/workflow/rules/assembly.smk +++ b/workflow/rules/assembly.smk @@ -91,8 +91,7 @@ rule assembly_spades_pe: rule spades_assemble_se: input: - # "results/newdate/nonhuman-reads/se/UnCoVar_RefDataSet_Batch2_BC03_cat.fastq", - "results/newdate/nonhuman-reads/se/{sample}.fastq", + get_reads_after_qc, output: temp("results/{date}/assembly/{sample}/spades-se/{sample}.contigs.fasta"), log: From d4965fd58724da555acf7add62d0e8b40256844c Mon Sep 17 00:00:00 2001 From: Thomas Battenfeld Date: Tue, 15 Mar 2022 18:32:19 +0100 Subject: [PATCH 53/53] add logs --- workflow/rules/long_read.smk | 4 ++-- workflow/rules/qc.smk | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk index 0090113a2..33e4cec03 100644 --- a/workflow/rules/long_read.smk +++ b/workflow/rules/long_read.smk @@ -69,7 +69,7 @@ rule downsample_and_trim_raw: conda: "../envs/notramp.yaml" shell: - "notramp -a -r {input.reads} -p {input.primer} -g {input.ref_genome} -o {params.outdir}" + "notramp -a -r {input.reads} -p {input.primer} -g {input.ref_genome} -o {params.outdir} 2> {log}" use rule assembly_polishing_ont as medaka_consensus_reference with: @@ -133,7 +133,7 @@ rule clip_adbc_corrected: conda: "../envs/notramp.yaml" shell: - "notramp -t --incl_prim -r {input.reads} -p {input.primer} 
-g {input.ref_genome} -o {params.outdir}" + "notramp -t --incl_prim -r {input.reads} -p {input.primer} -g {input.ref_genome} -o {params.outdir} 2> {log}" rule bcftools_consensus_ont: diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 009075c01..5e1e26231 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -256,7 +256,6 @@ rule species_diversity_after_pe: kraken_output=temp( "results/{date}/species-diversity-nonhuman/pe/{sample}/{sample}.kraken" ), - # removed parethesis; used to be temp()? report="results/{date}/species-diversity-nonhuman/pe/{sample}/{sample}.cleaned.kreport2", log: "logs/{date}/kraken/{sample}_pe_nonhuman.log",