diff --git a/.gitattributes b/.gitattributes index 2c6853f..94be2f7 100644 --- a/.gitattributes +++ b/.gitattributes @@ -2,3 +2,4 @@ *.css linguist-detectable=false *.js linguist-detectable=false *.py linguist-detectable=false +*.R linguist-detectable=false diff --git a/.gitignore b/.gitignore index 9d2b9f0..40fc71f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,13 +1,9 @@ -*/log* -anno* -*/rep* -NanoPreprocess/preproc_data -NanoPreprocess/wrong -real_data -NanoPreprocess/bin/guppy* -NanoPreprocess/bin/ont-guppy* -log -*/out* -work singularity -.nextflow* +mop_preprocess/bin/multi_to_single_fast5 +mop_preprocess/bin/read_fast5_basecaller.py +mop_preprocess/bin/ont* +mop_preprocess/bin/lib* +mop_preprocess/bin/MINIMAP2_LICENSE +mop_preprocess/bin/guppy* +*/work +*/.next* diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..73be20c --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "BioNextflow"] + path = BioNextflow + url = https://github.com/biocorecrg/BioNextflow diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index c513501..0000000 --- a/.travis.yml +++ /dev/null @@ -1,22 +0,0 @@ -install: travis_wait 30 mvn install -sudo: required -language: java -jdk: openjdk8 -services: docker - -install: - # Install Nextflow - - mkdir nextflow - - cd nextflow - - wget -qO- get.nextflow.io | bash - - sudo ln -s $PWD/nextflow /usr/local/bin/nextflow - -# Install Master Of Porse - - git clone https://github.com/biocorecrg/master_of_pores.git - - cd master_of_pores - - sh INSTALL.sh - -# Run the pipeline -script: - - cd NanoPreprocess - - nextflow run nanopreprocess.nf -with-docker -profile local diff --git a/BioNextflow b/BioNextflow new file mode 160000 index 0000000..20310e2 --- /dev/null +++ b/BioNextflow @@ -0,0 +1 @@ +Subproject commit 20310e21dbd99ea705d04aa2b0940a817b1cffde diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index 53d69ba..0000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,14 +0,0 @@ -## Version 1.1 -* Added a new module called NanoPreprocessSimple that starts from fastq files instead of fast5 files. It allows the analysis of multiple files at a time. 
-* Added support to vbz compressed fast5 https://github.com/nanoporetech/vbz_compression in NanoPreprocess, NanoMod and NanoTail -* NanoPreprocess now also outputs CRAM files and can do downsampling with the parameter --downsampling -* NanoPreprocess allows performing variant calling using medaka (BETA) -* NanoPreprocess allows performing demultiplexing with GUPPY -* Added plots for Epinano output in NanoMod -* Added a conversion of Tombo results to bed format in NanoMod -* Added an INSTALL.sh file that automatically retrieves guppy 3.4.5 from https://mirror.oxfordnanoportal.com/, places it in NanoPreprocess/bin and makes the required links -* Added profiles for running locally and on the CRG SGE cluster - - -## Version 1.0 -This is the original version published in the paper [MasterOfPores: A Workflow for the Analysis of Oxford Nanopore Direct RNA Sequencing Datasets](https://www.frontiersin.org/articles/10.3389/fgene.2020.00211/full) diff --git a/NanoMod/bin/epinano_paired.py b/NanoMod/bin/epinano_paired.py deleted file mode 100755 index b8c5cb0..0000000 --- a/NanoMod/bin/epinano_paired.py +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env python -# -- coding: utf-8 - - -import argparse, re -import numpy as np -from collections import Counter as cnt -from collections import defaultdict -import gzip - -parser = argparse.ArgumentParser() -parser.add_argument ('-k','--knockout', required=True, dest='kos', action='append', help='knockout sample epinano prediction results') -parser.add_argument ('-w','--wildtype', required=True, dest='wts', action='append', help='wildtype sample epinano prediction results') -parser.add_argument ('-c','--coverage', nargs = '?', const=5, default=5,type = int, help='minimum coverage to be considered as valid, default is 5') -parser.add_argument ('-o','--output', required = True, help= 'output file name') -parser.add_argument ('-m','--motif', type=str, default='[AGCTUagctu]',help='motif to be kept in the final results; default will keep all motifs; "-m [AG][AG]AC[ACT]" will keep RRACH motifs; ') -parser.add_argument ('-dk','--ko_duplicates', required = True, type=int, default=1, help='a site has to be at least predicted in this many samples to be used to determine final modification status; default is 1') -parser.add_argument ('-dw','--wt_duplicates', required = True, type=int, default=1, help='a site has to be at least predicted in this many samples to be used to determine final modification status; default is 1') -parser.add_argument ('-gz', '--gzipped', default=False, dest='isgzip', action='store_true', help='indicates if the input files are gzipped or not') -args = parser.parse_args() - -''' -python3 compare_wt_ko_predicitons.py -k ko.csv -k ko3.csv -k ko2.csv -w wt.csv -c 5 -o out -m [AG][AG]AC[ACT] -''' - -def read_result (cov, mtf, epinano_result_files, gzip): - ''' - input1: SVM.py predicted results - input2: minimum depth - input3: motif to be saved; default is RRACH - ''' - motif = dict() - info = defaultdict(list) - for f in epinano_result_files: - with fopen (f, gzip) as fh: - for l in fh: - if l.startswith('#'): - continue - ary = l.strip().split(',') - mapping_cov = float(ary[3].split(':')[2]) - if mapping_cov < cov: - continue - if not re.match (mtf,ary[0]): - continue - win,ref,kmer,probm = ary[1],ary[2],ary[0],float(ary[-2]) - k = ref + ',' + win.split(':')[2] - info[k].append ((mapping_cov,probm)) - motif[k] = kmer - return info, motif - -def fopen (file, isgzip): - fh = "" - if isgzip: - fh = gzip.open(file, 'rt', encoding='utf-8') - else: - fh = 
open (file) - return fh - - -def scoring (common_sites, sample_info, repeats): - ''' - score prediction results - if it is a one-to-one comparison: - use the probm as it is - else: - if (s1 ≥ 0.5 and s2 ≥ 0.5 and .. sn ≥ 0.5): - M = 1 - else: - M = (s1 + s2 + .. + sn)/n - if (Mwt/Mko) > 1.5 and Mwt > 0.5: - status = modified - else: - status = unmodified - ''' - scores = dict () - probs = defaultdict(list) - for s in common_sites: - probms = [x[1] for x in sample_info[s]] - if len (probms) < repeats: - continue - elif len (probms) == 1: - scores[s] = np.mean (probms) - else: - cond = [x > 0.5 for x in probms] - if all (cond): - scores[s] = 1 - else: - scores[s] = np.mean (probms) - probs[s] = probms - return scores,probs - -if __name__ == '__main__': - kos = args.kos - wts = args.wts - cov = args.coverage - ko_duplicates = args.ko_duplicates - wt_duplicates = args.wt_duplicates - isgzip = args.isgzip - motif = args.motif - out = open (args.output, 'w') - - head = "#chr,position,motif,modification_status\n" - out.write(head) - wt_inf, mtfs = read_result (cov,motif,wts,isgzip) - ko_inf, mtfs = read_result (cov,motif,kos,isgzip) - all_wt_sites_lst = [x for x in wt_inf] - all_ko_sites_lst = [x for x in ko_inf] - commonSites = set (all_wt_sites_lst).intersection (set(all_ko_sites_lst)) - wt_scores, wt_probs = scoring (commonSites, wt_inf, wt_duplicates) - ko_scores, ko_probs = scoring (commonSites, ko_inf, ko_duplicates) - for s in commonSites: - if not (s in wt_scores and s in ko_scores): - continue - if wt_scores[s]/ko_scores[s] > 1.5 and wt_scores[s] > 0.5: - out.write ('{},{},{}\n'.format(s,mtfs[s],"YES")) - else: - out.write ('{},{},{}\n'.format(s,mtfs[s],"NO")) - out.close() diff --git a/NanoMod/bin/epinano_scatterplot.R b/NanoMod/bin/epinano_scatterplot.R deleted file mode 100755 index bb48ef0..0000000 --- a/NanoMod/bin/epinano_scatterplot.R +++ /dev/null @@ -1,88 +0,0 @@ -#Scatter plots -#Rscript epinano_scatterplot.R input1 label1 input2 label2 feature -#Libraries needed -library(plyr) -library(ggplot2) -library(ggrepel) -library(MASS) -library(reshape2) - -# Reading arguments from command line -args = commandArgs(trailingOnly=TRUE) - -#Arguments -zinput1<-gzfile(args[1], "rt") -zinput2<-gzfile(args[3], "rt") - -input1 <- read.delim(zinput1,sep=",") #1st variable -label1 <- as.character(args[2]) #1st label -input2 <-read.delim(zinput2,sep=",") #2nd variable -label2 <- as.character(args[4]) #2nd label -feature<- as.character(args[5]) #Feature - - - -#Cleanup -cleanup <- function(input, label) { - #Filter low coverage positions - input <- subset(input, cov>30) - #Filter read starts - input <- subset(input, pos>20) - #Add summed errors column - input$sum <- input$mis + input$del + input$ins - #Add a column with position - input$position<- paste(input$X.Ref,input$pos) - #Change column names - input <- input[, c("X.Ref","pos","position", "base", feature)] - colnames(input)<- c("Chr","Position","chr_pos","base",feature ) - data_melted<- melt(data = input, id.vars = c("Chr", "Position", "chr_pos", "base")) - colnames(data_melted)[which(names(data_melted) == "value")] <- paste(label, "value", sep="_") - return(data_melted) -} - - -#Cleanup and process the data -data1 <- cleanup(input1, label1) -data2 <- cleanup(input2, label2) - -merged <- join(data1,data2, by="chr_pos") -merged$Chr <- NULL -merged$Position <- NULL -merged$base <- NULL -merged$variable <- NULL - - -plot<- function(data) -for (chr in unique(data$Chr)) { - subs <- subset(data, Chr==chr) - if(nrow(subs)>0){ - res<- rlm(subs[,c(paste(label1, "value", sep="_"))] ~ subs[,c(paste(label2, "value", sep="_"))]) #linear model - res_vec <- res$residuals # this contains residuals - threshold <- 5 * sd(res_vec) #The threshold - subs$score<- abs(subs[,c(paste(label1, "value", sep="_"))] - subs[,c(paste(label2, "value", sep="_"))]) - pdf(file=paste(chr,feature, label1, label2, "scatter.pdf", sep="_"),height=5,width=5,onefile=FALSE) - print(ggplot(subs, aes_string(x=paste(label1, "value", sep="_"), y=paste(label2, "value", sep="_"))) + - geom_point(size=2, color="grey")+ - geom_abline(slope=1, intercept=0,linetype="dashed")+ - geom_point(data=subset(subs, score>threshold), size=2, color="red")+ - geom_text_repel(data=subset(subs, score>threshold), aes(label=chr_pos), colour="black",segment.size = 0.4,segment.color = "grey50",size=5)+ - ggtitle(feature)+ - xlab(label1)+ - ylab(label2) + - theme_bw()+ - theme(axis.text.x = element_text(face="bold", color="black",size=11), - axis.text.y = element_text(face="bold", color="black", size=11), - plot.title = element_text(color="black", size=15, face="bold.italic",hjust = 0.5), - axis.title.x = element_text(color="black", size=15, face="bold"), - axis.title.y = element_text(color="black", size=15, face="bold"), - panel.background = element_blank(), - axis.line = element_line(colour = "black", size=0.5), - legend.title = element_text(color = "black", size = 15,face="bold"), - legend.text = element_text(color = "black", size=15), - panel.grid.major = element_blank(), panel.grid.minor = element_blank()) - ) - dev.off() - } - } - -plot(merged) diff --git a/NanoMod/bin/join.r b/NanoMod/bin/join.r deleted file mode 100755 index 2a8643b..0000000 --- a/NanoMod/bin/join.r +++ /dev/null @@ -1,60 +0,0 @@ -# R --slave --args epi tombo prefix < join.r - -args<-commandArgs(TRUE) -epi <- args[1] -tombo <- args[2] -prefix <- args[3] - -a<-read.table(epi, sep="\t", header=FALSE) -b<-read.table(tombo, sep="\t", header=FALSE) - -a$V2<-"epinano" -b$V2<-"tombo" - -c<-merge(a,b, all=T, by.x="V1", by.y="V1", sort=FALSE) -colnames(c)<-c("positions", "epinano", "tombo") - -c[c=="epinano"]<-1 -c[c=="tombo"]<-1 -c[is.na(c)]<-0 - -write.table(c, file = "RNA_modifications.txt", sep="\t", row.names = FALSE) - -library(VennDiagram) -library(RColorBrewer) -myCol <- c("#B3E2CD","#FDCDAC") - -venn.diagram( - x = list(a$V1, b$V1), - category.names = c("Epinano", "Tombo"), - filename = 'venn_diagram.png', - imagetype = "png", - output=TRUE, - height = 1024, - width = 1024 , - resolution = 300, - compression = "lzw", - main.pos = c(0.5,0.7), - - # Circles - lwd = 2, - lty = 'blank', - fill = myCol, - margin = 0.6, - main = prefix, - main.fontfamily = "sans", - - # Numbers - cex = .6, - fontface = "bold", - fontfamily = "sans", - - # Set names - cat.cex = 0.6, - cat.fontface = "bold", - cat.default.pos = "outer", - cat.pos = c(-135, 135), - cat.fontfamily = "sans" -) - - diff --git a/NanoMod/bin/tombo_2_bed.sh b/NanoMod/bin/tombo_2_bed.sh deleted file mode 100755 index 02d8dd8..0000000 --- a/NanoMod/bin/tombo_2_bed.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env bash -# -# This script converts the tombo output into a 6-field bed file - -# specify the name of the tombo input -if [ x"$1" == x ]; then - - echo "please specify the tombo output file" - - exit 1 - -fi - -if [ x"$2" == x ]; then - - echo "please specify the name of the output file" - - exit 1 - -fi - - -grep ">" $1 | awk '{print $1"\t"$5}' | sed s/">"//g | sed s/\:/\\t/g | awk '{num++; OFS="\t"; print $1,$2,$2, $4*100, "PRED_"num,$3}' > $2 diff --git 
a/NanoMod/bin/tombo_filter.py b/NanoMod/bin/tombo_filter.py deleted file mode 100755 index f0f08d1..0000000 --- a/NanoMod/bin/tombo_filter.py +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env python -import argparse, re -import numpy as np -from collections import Counter as cnt -from collections import defaultdict - -parser = argparse.ArgumentParser() -parser.add_argument ('-t','--tomboouputs', required=True, dest='tbs', action='append', help='knockout sample tombo prediction results') -parser.add_argument ('-p','--percentage', nargs = '?', const=0.5, default=0.5,type = float, help='threshold to filter for positives; (0,1]; default is 0.5') -parser.add_argument ('-o','--output', required = True, help= 'output file name') -parser.add_argument ('-m','--motif', type=str, default='[AGCTUagctu]',help='motif to be kept in the final results; default will keep all motifs; "-m [AG][AG]AC[ACT]" will keep RRACH motifs; ') -parser.add_argument ('-d','--num_duplicates', required = True, type=int, default=1, help='a site has to be at least predicted in this many samples to be used to determine final modification status; default is 1') - -args = parser.parse_args() - -def read_tombo_fasta (fasta, motif): - complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} - info = defaultdict (list) - for f in fasta: - with open (f, 'r') as fh: - for l in fh: - if l.startswith ('>'): - mtf = next (fh) - ary = l.strip().split() - id = ary[0][1:] - strand = id.split(':')[-1] - #id = ':'.join (id.split(':')[:2]) - prob = float (ary[-1]) - if strand == '-': - # Are we sure that is not already reverse complement? - mtf = "".join(complement.get(base,base) for base in reversed(mtf)) - id = ':'.join (id.split(':')[:2]) - id = id + ":" + mtf.rstrip() - if not re.match (motif, mtf): - continue - info[id].append ((mtf,prob)) - return info - -def score (sample_info, threshold, duplicates): - scores = dict() - for s in sample_info: - num = len (sample_info[s]) - if num < duplicates: - continue - probs = [x[1] for x in sample_info[s]] - cond = all ([x>threshold for x in probs]) - if (cond): - scores[s] = 1 - else: - scores[s] = np.mean (probs) - return scores - - -if __name__ == '__main__': - tbs = args.tbs - motif = args.motif - out = open (args.output,'w') - tombo_info = read_tombo_fasta (tbs,args.motif) ; print (tombo_info) - tombo_score = score (tombo_info, args.percentage, args.num_duplicates); print (tombo_score) - mod = '' - out.write ('#chr,position,motif,modification_status\n') - for s in tombo_score: - if tombo_score[s] > args.percentage: - mod = 'YES' - else: - mod = 'NO' - s = s.replace (':',',') - out.write ("{},{}\n".format(s,mod)) - out.close() diff --git a/NanoMod/comparison.tsv b/NanoMod/comparison.tsv deleted file mode 100644 index d473b1c..0000000 --- a/NanoMod/comparison.tsv +++ /dev/null @@ -1,2 +0,0 @@ -RNAAA023484_wt1 RNAAA024588_ko1 -RNAAB056712_wt2 RNAAB058843_ko2 diff --git a/NanoMod/config.yaml b/NanoMod/config.yaml deleted file mode 100644 index faa50a8..0000000 --- a/NanoMod/config.yaml +++ /dev/null @@ -1,26 +0,0 @@ -title: "Master of Pores" -subtitle: "" -intro_text: False - -report_header_info: - - Application Type: 'Nanopore sequencing' - -custom_logo: 'logo_small.png' -custom_logo_url: 'https://github.com/biocorecrg/nanopore_analysis' -custom_logo_title: 'Master of Pores' - -extra_fn_clean_trim: - - '_QC' - -table_columns_visible: - FastQC: - percent_duplicates: True - -module_order: - - fastqc: - name: 'FastQC' - info: 'This section of the report shows FastQC results on raw reads.'
- - - alnQC: - name: "alnQC" - info: 'This section of the report shows alnQC results on aligned reads' diff --git a/NanoMod/models/model2.1-mis3.del3.q3.poly.dump b/NanoMod/models/model2.1-mis3.del3.q3.poly.dump deleted file mode 100644 index 3692593..0000000 Binary files a/NanoMod/models/model2.1-mis3.del3.q3.poly.dump and /dev/null differ diff --git a/NanoMod/nanomod.nf b/NanoMod/nanomod.nf deleted file mode 100644 index 0288c3c..0000000 --- a/NanoMod/nanomod.nf +++ /dev/null @@ -1,444 +0,0 @@ -#!/usr/bin/env nextflow - - -/* - * Define the pipeline parameters - * - */ - -// Pipeline version -version = '0.1' - -params.help = false -params.resume = false - -log.info """ - -╔╦╗┌─┐┌─┐┌┬┐┌─┐┬─┐ ┌─┐┌─┐ ╔═╗╔═╗╦═╗╔═╗╔═╗ -║║║├─┤└─┐ │ ├┤ ├┬┘ │ │├┤ ╠═╝║ ║╠╦╝║╣ ╚═╗ -╩ ╩┴ ┴└─┘ ┴ └─┘┴└─ └─┘└ ╩ ╚═╝╩╚═╚═╝╚═╝ - -==================================================== -BIOCORE@CRG NanoDirectRNA. Detection of modification and polyA length (RNA) - N F ~ version ${version} -==================================================== - -***************** Input files ******************* -input_path : ${params.input_path} -comparison : ${params.comparison} - -********** reference has to be the genome ************* -reference : ${params.reference} -output : ${params.output} - -coverage : ${params.coverage} -************* tombo and epinano params **************** -tombo_opt : ${params.tombo_opt} - -epinano_opt : ${params.epinano_opt} - -email : ${params.email} -""" - -// Help and avoiding typos -if (params.help) exit 1 -if (params.resume) exit 1, "Are you making the classical --resume typo? Be careful!!!! ;)" - -// check input files -reference = file(params.reference) -if( !reference.exists() ) exit 1, "Missing reference file: ${reference}!" -//config_report = file("$baseDir/config.yaml") -//if( !config_report.exists() ) exit 1, "Missing config.yaml file!" -logo = file("$baseDir/../docs/logo_small.png") - -model_folder = file("$baseDir/models/") -if( !model_folder.exists() ) exit 1, "Missing folders with EpiNano's models!" -joinScript = file("$baseDir/bin/join.r") - -tombo_opt = params.tombo_opt -epinano_opt = params.epinano_opt - -epinanoScript = file("$baseDir/bin/epinano_scatterplot.R") - -// Output folders -outputtombo = "${params.output}/Tombo" -outputEpinano = "${params.output}/Epinano" -//outputReport = file("${outputMultiQC}/multiqc_report.html") - -/* -* move old multiQCreport - -if( outputReport.exists() ) { - log.info "Moving old report to multiqc_report.html multiqc_report.html.old" - outputReport.moveTo("${outputMultiQC}/multiqc_report.html.old") -} -*/ - -compfile = file(params.comparison) -if( !compfile.exists() ) exit 1, "Missing comparison file: ${compfile}. 
Specify path with --comparisons" - -/* - * Creates the channels with comparisons - */ - Channel - .from(compfile.readLines()) - .map { line -> - list = line.split("\t") - if (list[0]!= "") { - def sampleID = list[0] - def ctrlID = list[1] - [ sampleID, ctrlID ] - } - } - .into{ id_to_tombo_fast5; id_to_tombo_idx; id_to_epinano; id_to_epinano_plots; id_for_resquiggling} - - -/* - * Creates the channels with samples and KOs for EpiNano - */ - Channel - .from(compfile.readLines()) - .map { line -> - list = line.split("\t") - if (list[0]!= "") { - list[0] - } - } - .set{ samples_for_epinano_filtering } - - Channel - .from(compfile.readLines()) - .map { line -> - list = line.split("\t") - if (list[1]!= "") { - list[1] - } - } - .set{ ko_for_epinano_filtering } - -id_for_resquiggling.flatten().unique().map { - [it, file("${params.input_path}/${it}/fast5_files/*.fast5")] -}.transpose().set{fast5_for_resquiggling} - -id_to_epinano.flatten().unique().map { - [it, file("${params.input_path}/${it}/alignment/*.bam")] -}.transpose().set{bams_for_variant_calling} - - - -/* -* Perform resquiggling -*/ - -process resquiggling { - tag {"${idsample}-${fast5.simpleName}"} - label 'big_mem_cpus' - - input: - set idsample, file(fast5) from fast5_for_resquiggling - file(reference) - - output: - file ("*.resquiggle.failed.tsv") into failed_resquiggles - set idsample, file ("${idsample}-${fast5.simpleName}") into singlefast5 - set idsample, file (".*.tombo.index") into tombo_indexes - - script: - def infolder = "${idsample}-${fast5.simpleName}" - - """ - #from multifast5 to singlefast5 - mkdir ${infolder}; - multi_to_single_fast5 -i ./ -s ./ -t ${task.cpus}; - rm ./filename_mapping.txt; - mv ./*/*.fast5 ${infolder}; - # resquiggling - tombo resquiggle ${infolder} ${reference} --rna --processes ${task.cpus} --overwrite --failed-reads-filename ${infolder}.resquiggle.failed.tsv - """ -} - -/* -* Group data together with indexes -*/ -singlefast5.groupTuple().into{grouped_single_fast5_A; grouped_single_fast5_B} -id_to_tombo_fast5.combine(grouped_single_fast5_A, by: 0).map { - [ it[1], it[0], it[2] ] -}.combine(grouped_single_fast5_B, by: 0)map { - [ "${it[1]}--${it[0]}", it[2], it[3]] -}.set{fast5_for_tombo_modifications} - -tombo_indexes.groupTuple().into{grouped_indexes_A; grouped_indexes_B} -id_to_tombo_idx.combine(grouped_indexes_A, by: 0).map { - [ it[1], it[0], it[2] ] -}.combine(grouped_indexes_B, by: 0)map { - [ "${it[1]}--${it[0]}", it[2], it[3] ] -}.set{idx_for_tombo_modifications} - -fast5_for_tombo_modifications.combine(idx_for_tombo_modifications, by: 0).set{data_for_tombo_modifications} - -/* -* detect modification -*/ - -process getModifications { - label 'big_mem_cpus' - tag {"${combID}"} - publishDir outputtombo, pattern: "*.significant_regions.fasta", mode: 'copy' - publishDir outputtombo, pattern: "*.significant_regions.bed", mode: 'copy' - - input: - file(reference) - set val(combID), file(fast5s_A), file(fast5s_B), file(index_A), file(index_B) from data_for_tombo_modifications - - output: - file ("*.significant_regions.fasta") into sign_tombo_regions - file ("*.significant_regions.bed") - - script: - def reference_cmd = unzipFile(reference, "reference.fa") - def folder_names = "${combID}".split("--") - def folder_name_A = folder_names[0] - def folder_name_B = folder_names[1] - """ - ${reference_cmd} - mkdir ${folder_name_A} ${folder_name_B} - mv ${fast5s_A} ${folder_name_A} - mv ${index_A} ${folder_name_A} - mv ${fast5s_B} ${folder_name_B} - mv ${index_B} ${folder_name_B} - tombo 
detect_modifications model_sample_compare --minimum-test-reads ${params.coverage} --fast5-basedirs ${folder_name_A}/* --control-fast5-basedirs ${folder_name_B}/* --statistics-file-basename ${folder_name_A}_${folder_name_B}_model_sample_compare --rna --per-read-statistics-basename ${folder_name_A}_${folder_name_B}_per-read-statistics --processes ${task.cpus} - tombo text_output signif_sequence_context ${tombo_opt} --num-regions 1000000000 --statistics-filename ${folder_name_A}_${folder_name_B}_model_sample_compare.tombo.stats --genome-fasta reference.fa --fast5-basedirs ${folder_name_A} --sequences-filename ${folder_name_A}_${folder_name_B}.significant_regions.fasta - rm reference.fa - - tombo_2_bed.sh ${folder_name_A}_${folder_name_B}.significant_regions.fasta ${folder_name_A}_${folder_name_B}.significant_regions.bed - """ -} - - -/* -* Perform preprocessing for Epinano -*/ - -process index_reference { - - input: - file(reference) - - output: - set file("reference.fa"), file("*.dict"), file ("*.fai") into indexes - - script: - """ - if [ `echo ${reference} | grep ".gz"` ]; then - zcat ${reference} > reference.fa - else - ln -s ${reference} reference.fa - fi - \$PICARD CreateSequenceDictionary R=reference.fa O=reference.dict - samtools faidx reference.fa - """ -} - - -/* -* Calling variants for Epinano -*/ - -process CallVariantsForEpinano { - - tag {"${sampleID}"} - - input: - set val(sampleID), file(alnfile) from bams_for_variant_calling - set file(reference), file(dict_index), file(faiidx) from indexes - - output: - set sampleID, file("${sampleID}.tsv") into variants_for_frequency - - script: - """ - samtools view -h ${alnfile} -F 256 | \$SAM2TSV -R ${reference} | cut -f 3 --complement > ${sampleID}.tsv - """ -} - -/* -* Calc variant frequencies for Epinano -*/ - -process calcVarFrequenciesForEpinano { - publishDir outputEpinano, pattern: "*.csv.gz", mode: 'copy' - container "biocorecrg/mopepinano:0.1" - - tag {"${sampleID}"} - label 'big_mem_cpus' - - input: - set val(sampleID), file(tsvfile) from variants_for_frequency - - output: - set val(sampleID), file("*.tsv.per.site.var.csv.gz") into per_site_vars_A, per_site_vars_B - file("*.csv.gz") - - script: - """ - TSV_to_Variants_Freq.py3 -f ${tsvfile} -t ${task.cpus} - for i in *.csv; do gzip \$i; done - """ -} - -per_site_vars_A.combine(per_site_vars_B).map { - [ it[0], it[2], it[1], it[3] ] -}.set{per_site_combs} - -per_site_combs.join(id_to_epinano_plots, by:[0,1]) -.into{per_site_for_plotsA; per_site_for_plotsB; per_site_for_plotsC} - -process makeEpinanoPlotsMis { - publishDir outputEpinano, mode: 'copy' - container "biocorecrg/mopnanotail:0.3" - - tag {"${sampleIDA}--${sampleIDB}"} - - input: - set val(sampleIDA), val(sampleIDB), file(per_site_varA), file(per_site_varB) from per_site_for_plotsA - - output: - file("*.pdf") - - script: - """ - Rscript --vanilla ${epinanoScript} ${per_site_varA} ${sampleIDA} ${per_site_varB} ${sampleIDB} mis - """ -} - -process makeEpinanoPlotsIns { - publishDir outputEpinano, mode: 'copy' - container "biocorecrg/mopnanotail:0.3" - - tag {"${sampleIDA}--${sampleIDB}"} - - input: - set val(sampleIDA), val(sampleIDB), file(per_site_varA), file(per_site_varB) from per_site_for_plotsB - - output: - file("*.pdf") - - script: - """ - Rscript --vanilla ${epinanoScript} ${per_site_varA} ${sampleIDA} ${per_site_varB} ${sampleIDB} ins - """ -} - -process makeEpinanoPlotsDel { - publishDir outputEpinano, mode: 'copy' - container "biocorecrg/mopnanotail:0.3" - - tag {"${sampleIDA}--${sampleIDB}"} - - input: - set 
val(sampleIDA), val(sampleIDB), file(per_site_varA), file(per_site_varB) from per_site_for_plotsC - - output: - file("*.pdf") - - script: - """ - Rscript --vanilla ${epinanoScript} ${per_site_varA} ${sampleIDA} ${per_site_varB} ${sampleIDB} del - """ -} - - -//per_site_vars.map{ -// [ it[0], it[1].splitText( by: 1, keepHeader:true, file:true ) ] -//}.transpose().set{per_site_vars_splitted} - - - - - - - - - - - - - - - - - - - - - - - - -/* -* functions -*/ -// Get the name from the folder -def getFolderName(sample) { - folder_info = sample.toString().tokenize("/") - return folder_info[-2] -} - - - - -// make named pipe -def unzipBash(filename) { - cmd = filename.toString() - if (cmd[-3..-1] == ".gz") { - cmd = "<(zcat ${filename})" - } - return cmd -} - -def unzipFile(zipfile, filename) { - cmd = "cp ${zipfile} ${filename}" - filestring = zipfile.toString() - if (filestring[-3..-1] == ".gz") { - cmd = "zcat ${zipfile} > ${filename}" - } - return cmd -} - -/* -* Finish message -*/ -workflow.onComplete { - println "Pipeline BIOCORE@CRG Master of Pore completed!" - println "Started at $workflow.start" - println "Finished at $workflow.complete" - println "Time elapsed: $workflow.duration" - println "Execution status: ${ workflow.success ? 'OK' : 'failed' }" -} - -/* -* Mail notification -*/ - -if (params.email == "yourmail@yourdomain" || params.email == "") { - log.info 'Skipping the email\n' -} -else { - log.info "Sending the email to ${params.email}\n" - - workflow.onComplete { - - def msg = """\ - NanoMod module's execution summary - --------------------------- - Completed at: ${workflow.complete} - Duration : ${workflow.duration} - Success : ${workflow.success} - workDir : ${workflow.workDir} - exit status : ${workflow.exitStatus} - Error report: ${workflow.errorReport ?: '-'} - """ - .stripIndent() - - sendMail(to: params.email, subject: "Master of Pore execution", body: msg) - } -} diff --git a/NanoMod/nextflow.config b/NanoMod/nextflow.config deleted file mode 100644 index a6772c6..0000000 --- a/NanoMod/nextflow.config +++ /dev/null @@ -1,40 +0,0 @@ -includeConfig "$baseDir/params.config" -//includeConfig "../nextflow.global.config" -singularity.cacheDir = "$baseDir/../singularity" - - - -process { -errorStrategy = 'ignore' - memory='12G' - cache='lenient' - queue = 'biocore-el7,long-sl7,short-sl7' - container = 'biocorecrg/mopmod1:0.2' - containerOptions = { workflow.containerEngine == "docker" ? 
'-u $(id -u):$(id -g)': null} - withLabel: single_cpu_long { - cpus = 1 - memory = '30G' - time = '6h' - } - - withLabel: single_cpu { - cpus = 1 - memory = '30G' - } - withLabel: big_mem_cpus { - cpus = 8 - errorStrategy = 'retry' - memory = {30.GB + 10.GB * task.attempt} - maxRetries = 4 - } - withLabel: big_long_mem_cpus { - cpus = 8 - time = '48h' - errorStrategy = 'retry' - memory = {30.GB + 10.GB * task.attempt} - maxRetries = 4 - } - -} -//singularity.enabled = true diff --git a/NanoMod/params.config b/NanoMod/params.config deleted file mode 100644 index 8748598..0000000 --- a/NanoMod/params.config +++ /dev/null @@ -1,16 +0,0 @@ -params { - input_path = "$baseDir/../NanoPreprocess/output_sk1/" - comparison = "$baseDir/comparison.tsv" - - reference = "$baseDir/../anno2/Yeast_sk1.fa.gz" - - output = "$baseDir/output_gen" - coverage = 5 - - tombo_opt = "--num-bases 5" - - epinano_opt = "" - - email = "" -} - diff --git a/NanoPreprocess/bin/bam2stats.py b/NanoPreprocess/bin/bam2stats.py deleted file mode 100755 index 12eaad6..0000000 --- a/NanoPreprocess/bin/bam2stats.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python3 -# Report stats (mapped reads and identity to reference) from samtools stats -# for bam file(s) ignoring secondary, supplementary and qc failed alignments -# -# USAGE: bam2stats.py bam1 bam2 ... bamN - -import os, subprocess, sys - -def bam2stats(fn, flag=3840): - """Get stats from samtools stats""" - args = ["samtools", "stats", "-F%s"%flag, fn] - proc = subprocess.Popen(args, stdout=subprocess.PIPE) - k2v = {} - for l in proc.stdout: - l = l.decode("utf-8") - if l.startswith('SN'): - ldata = l[:-1].split()#; print(ldata) - kv = [[]] - for e in ldata[1:]: - kv[-1].append(e) - if e.endswith(':'): - kv[-1][-1] = kv[-1][-1][:-1] - kv.append([]) - k2v[" ".join(kv[0])] = kv[1] - # convert digits to int - for k in k2v: - if k2v[k][0].isdigit(): - k2v[k] = int(k2v[k][0]) - # report if no reads mapped - if not k2v['reads mapped']: - return "No reads mapped" - text = [] - text.append("{:,}\t{:.1f}%\t{:,}\t{:.1f}%".format(k2v['reads mapped'], 100*k2v['reads mapped']/k2v['sequences'], k2v['bases mapped (cigar)'], 100*k2v['bases mapped (cigar)']/k2v['total length'])) - text.append("{:,}\t{:,}".format(k2v['average length'], k2v['maximum length'])) - text.append("{:.2f}%".format(100-100*k2v['mismatches']/k2v['bases mapped (cigar)'], )) #"identity: %.2f%"%(100-k2v['mismatches']/k2v['bases mapped (cigar)'], )) - return "\t".join(text) - -for fn in sys.argv[1:]: - if os.path.isfile(fn): - sys.stdout.write("#File name\tMapped reads\tMap %\tBases\tBases %\tAvg read length\tMax read length\tidentity\n") - sys.stdout.write("%s\t%s\n"%(fn, bam2stats(fn))) - -''' -CHK 4691e107 9942d94c cd9ffd51 -# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.
-SN raw total sequences: 4000 -SN filtered sequences: 0 -SN sequences: 4000 -SN is sorted: 1 -SN 1st fragments: 4000 -SN last fragments: 0 -SN reads mapped: 1440 -SN reads mapped and paired: 0 # paired-end technology bit set + both mates mapped -SN reads unmapped: 2560 -SN reads properly paired: 0 # proper-pair bit set -SN reads paired: 0 # paired-end technology bit set -SN reads duplicated: 0 # PCR or optical duplicate bit set -SN reads MQ0: 726 # mapped and MQ=0 -SN reads QC failed: 0 -SN non-primary alignments: 6801 -SN total length: 136941 # ignores clipping -SN bases mapped: 109284 # ignores clipping -SN bases mapped (cigar): 108908 # more accurate -SN bases trimmed: 0 -SN bases duplicated: 0 -SN mismatches: 14898 # from NM fields -SN error rate: 1.367944e-01 # mismatches / bases mapped (cigar) -SN average length: 34 -SN maximum length: 401 -SN average quality: 3.5 -SN insert size average: 0.0 -SN insert size standard deviation: 0.0 -SN inward oriented pairs: 0 -SN outward oriented pairs: 0 -SN pairs with other orientation: 0 -SN pairs on different chromosomes: 0 - -''' - diff --git a/NanoPreprocess/bin/cmd_line_deeplexicon_caller_2019_09_12.py b/NanoPreprocess/bin/cmd_line_deeplexicon_caller_2019_09_12.py deleted file mode 100755 index 9e527c6..0000000 --- a/NanoPreprocess/bin/cmd_line_deeplexicon_caller_2019_09_12.py +++ /dev/null @@ -1,677 +0,0 @@ -#!/usr/bin/env python3 -# coding: utf-8 - -import warnings -warnings.filterwarnings("ignore", category=FutureWarning) -import argparse -import sys -# from __future__ import print_function -import os -from copy import deepcopy -import re -import csv -import time -import configparser -import h5py -import traceback -import math -import numpy as np -# from PIL import Image -import pyts -from pyts.image import MarkovTransitionField, GramianAngularField, RecurrencePlot -import tensorflow as tf -import keras -from keras.layers import Dense, Conv2D, BatchNormalization, Activation -from keras.layers import AveragePooling2D, Input, Flatten -from keras.optimizers import Adam -from keras.callbacks import ModelCheckpoint, LearningRateScheduler -from keras.callbacks import ReduceLROnPlateau -from keras.preprocessing.image import ImageDataGenerator -from keras.utils import multi_gpu_model -from keras.regularizers import l2 -from keras import backend as K -from keras.models import Model -import pandas as pd -from sklearn import datasets, linear_model -from sklearn.model_selection import train_test_split -from tensorflow.python.client import device_lib -from keras.models import load_model - -# import matplotlib.pyplot as plt - - -''' - - James M. Ferguson (j.ferguson@garvan.org.au) - Genomic Technologies - Garvan Institute - Copyright 2019 - - Tansel Ersevas .... 
- - script description - - ---------------------------------------------------------------------------- - version 0.0 - initial - - So a cutoff of: 0.4958776 for high accuracy - and another of 0.2943664 for high recovery - - TODO: - - - - ---------------------------------------------------------------------------- - MIT License - - Copyright (c) 2019 -------NAME---------- - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all - copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -''' - - - - -class MyParser(argparse.ArgumentParser): - def error(self, message): - sys.stderr.write('error: %s\n' % message) - self.print_help() - sys.exit(2) - -def print_verbose(message): - '''verbose printing''' - sys.stderr.write('info: %s\n' % message) - -def print_err(message): - '''error printing''' - sys.stderr.write('error: %s\n' % message) - -def read_config(filename): - config = configparser.ConfigParser() - config.read(filename) - return(config) - -def _get_available_devices(): - local_device_protos = device_lib.list_local_devices() - return [x.name for x in local_device_protos] - -def _check_available_devices(): - available_devices = _get_available_devices() - print_verbose(available_devices) - # Make sure requested GPUs are available or at least warn if they aren't - return(True) - -def read_model(model_name): - # model = load_model('saved_models/' + model_name) - model = load_model(model_name) # as a path - model.compile(loss='categorical_crossentropy', - optimizer=Adam(), - metrics=['accuracy']) - return(model) - -squiggle_max = 1199 -squiggle_min = 1 -input_cut = 72000 # potentially needs to be configurable -image_size = 224 -num_classes = 4 -window = 2000 - -def main(): - ''' - Main function - ''' - VERSION = "0.9.0" - - parser = MyParser( - description="DeePlexiCon - Demultiplex direct RNA reads") - #group = parser.add_mutually_exclusive_group() - parser.add_argument("-p", "--path", - help="Top path of fast5 files to dmux") - parser.add_argument("-t", "--type", default="multi", choices=["multi", "single"], - help="Multi or single fast5s") - parser.add_argument("-c", "--config", - help="config file") - # parser.add_argument("-g", "--gpu_list", default=1 - # help="list of gpus, 1, or [1,3,5], etc. 
of PCI_BUS_ID order") - # parser.add_argument("-o", "--output", - # help="Output directory") - # parser.add_argument("-s", "--threshold", type=float, default=0.7, - # help="confidence interval threshold") - parser.add_argument("-s", "--threshold", type=float, default=0.50, - help="probability threshold - 0.5 hi accuracy / 0.3 hi recovery") - # populate choices with models found in saved_models/ - # parser.add_argument("-m", "--model", default="4_bc_normal.h5", choices=["4_bc_normal.h5", "model2"], - parser.add_argument("-m", "--model", - help="Trained model name to use") - parser.add_argument("-b", "--batch_size", type=int, default=4000, - help="batch size - for single fast5s") - parser.add_argument("-V", "--version", - help="Prints version") - parser.add_argument("-v", "--verbose", action="store_true", - help="Verbose output") - - args = parser.parse_args() - # print help if no arguments given - if len(sys.argv) == 1: - parser.print_help(sys.stderr) - sys.exit(1) - - if args.verbose: - print_verbose("Verbose mode active - dumping info to stderr") - print_verbose("DeePlexiCon: {}".format(VERSION)) - print_verbose("arg list: {}".format(args)) - if tf.test.gpu_device_name(): - print_verbose('Default GPU Device: {}'.format(tf.test.gpu_device_name())) - else: - print_verbose("Please install GPU version of TF") - - - # Globals - if args.config: - config = read_config(args.config) #TODO check config read error - - squiggle_max = 1199 - squiggle_min = 1 - input_cut = 72000 #potenitall need to be configurable - image_size = 224 - num_classes = 4 - window = 2000 - # Devices - # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - - # os.environ["CUDA_VISIBLE_DEVICES"] = config[deeplexicon][gpu_list] if args.config else args.gpu_list - - # do check devices are available, else throw and error - - - # main logic - - # read model - - #TODO Check model errors - model = read_model(config[deeplexicon][trained_model]) if args.config else read_model(args.model) - labels = [] - images = [] - fast5s = {} - stats = "" - print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format("fast5", "ReadID", "Barcode", "Confidence Interval", "P_bc_1", "P_bc_2", "P_bc_3", "P_bc_4")) - # for file in input... - for dirpath, dirnames, files in os.walk(args.path): - for fast5 in files: - if fast5.endswith('.fast5'): - fast5_file = os.path.join(dirpath, fast5) - if args.type == "single": - #everthing below this, send off in batches of N=args.batch_size - # The signal extraction and segmentation can happen in the first step - # read fast5 files - readID, seg_signal = get_single_fast5_signal(fast5_file, window) - # segment - # array[name, signal.....] - # seg_signal = dRNA_segmenter(signal) - if not seg_signal: - print_err("Segment not found for:\t{}\t{}".format(fast5_file, readID)) - continue - # convert - sig = np.array(seg_signal, dtype=float) - # print(sig) - img = convert_to_image(sig) - # print(img) - labels.append(readID) # read ID? 
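# A short, illustrative sketch of the decision rule applied to each batch
# below (numbers invented, not taken from a real read): classify() keeps the
# argmax barcode only when its probability clears --threshold, and the
# printed "Confidence Interval" is the margin between the two best classes:
#   probs = [0.70, 0.20, 0.06, 0.04]   # softmax output for one read
#   margin = 0.70 - 0.20               # = 0.50, the reported confidence value
#   call = "bc_1" if probs[0] >= threshold else "unknown"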
- # print_verbose(labels) - fast5s[readID] = fast5 - # print_verbose(readID) - # print_verbose(fast5) - # print_verbose(fast5s) - images.append(img) - # sys.exit() - # classify - if len(labels) >= args.batch_size: - C = classify(model, labels, np.array(images), False, args.threshold) - # out = classify(model, readID, np.array([img]), False, 0.0) - # print_verbose(C) - # save to output - for readID, out, c, P in C: - prob = [round(float(i), 6) for i in P] - cm = round(float(c), 4) - if args.verbose: - print_verbose("cm is: {}".format(cm)) - if out == 0: - print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_1", cm, prob[0], prob[1], prob[2], prob[3])) - elif out == 1: - print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_2", cm, prob[0], prob[1], prob[2], prob[3])) - elif out == 2: - print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_3", cm, prob[0], prob[1], prob[2], prob[3])) - elif out == 3: - print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_4", cm, prob[0], prob[1], prob[2], prob[3])) - else: - print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "unknown", cm, prob[0], prob[1], prob[2], prob[3])) - labels = [] - images = [] - fast5s = {} - - - elif args.type == "multi": - #everthing below this, send off in batches of N=args.batch_size - # The signal extraction and segmentation can happen in the first step - # read fast5 files - seg_signal = get_multi_fast5_signal(fast5_file, window) - # segment - # array[name, signal.....] - # seg_signal = dRNA_segmenter(signal) - # print_verbose(list(seg_signal.keys())) - sig_count = 0 - for readID in seg_signal: - # convert - img = convert_to_image(np.array(seg_signal[readID], dtype=float)) - labels.append(readID) #change to readID? 
- images.append(img) - fast5s[readID] = fast5 - sig_count += 1 - if len(labels) >= args.batch_size: - C = classify(model, labels, np.array(images), False, args.threshold) - # out = classify(model, readID, np.array([img]), False, 0.0) - # print_verbose(C) - # save to output - for readID, out, c, P in C: - prob = [round(float(i), 6) for i in P] - cm = round(float(c), 4) - if args.verbose: - print_verbose("cm is: {}".format(cm)) - if out == 0: - print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_1", cm, prob[0], prob[1], prob[2], prob[3])) - elif out == 1: - print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_2", cm, prob[0], prob[1], prob[2], prob[3])) - elif out == 2: - print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_3", cm, prob[0], prob[1], prob[2], prob[3])) - elif out == 3: - print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_4", cm, prob[0], prob[1], prob[2], prob[3])) - else: - print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "unknown", cm, prob[0], prob[1], prob[2], prob[3])) - labels = [] - images = [] - fast5s = {} - elif args.verbose: - print_verbose("analysing sig_count: {}/{}".format(sig_count, len(seg_signal))) - else: - blah = 0 - #finish up - C = classify(model, labels, np.array(images), False, args.threshold) - # out = classify(model, readID, np.array([img]), False, 0.0) - # print_verbose(C) - # save to output - for readID, out, c, P in C: - prob = [round(float(i), 6) for i in P] - cm = round(float(c), 4) - if args.verbose: - print_verbose("cm is: {}".format(cm)) - if out == 0: - print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_1", cm, prob[0], prob[1], prob[2], prob[3])) - elif out == 1: - print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_2", cm, prob[0], prob[1], prob[2], prob[3])) - elif out == 2: - print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_3", cm, prob[0], prob[1], prob[2], prob[3])) - elif out == 3: - print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_4", cm, prob[0], prob[1], prob[2], prob[3])) - else: - print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "unknown", cm, prob[0], prob[1], prob[2], prob[3])) - labels = [] - images = [] - fast5s = {} - - - - - - - - # final report/stats - # print stats - -# file handling and segmentation - -def get_single_fast5_signal(read_filename, w): - ''' - open single fast5 file and extract information - ''' - - # get readID and signal - f5_dic = read_single_fast5(read_filename) - if not f5_dic: - print_err("Signal not extracted from: {}".format(read_filename)) - return 0, 0 - # segment on raw - readID = f5_dic['readID'] - signal = f5_dic['signal'] - seg = dRNA_segmenter(readID, signal, w) - if not seg: - print_verbose("No segment found - skipping: {}".format(readID)) - return 0, 0 - # convert to pA - pA_signal = convert_to_pA(f5_dic) - # return signal/signals - return readID, pA_signal[seg[0]:seg[1]] - - -def get_multi_fast5_signal(read_filename, w): - ''' - open multi fast5 files and extract information - ''' - pA_signals = {} - f5_dic = read_multi_fast5(read_filename) - # print_verbose(list(f5_dic.keys())) - seg = 0 - sig_count = 0 - for read in f5_dic: - sig_count += 1 - print_verbose("reading sig_count: {}/{}".format(sig_count, len(f5_dic))) - # get readID and signal - # print_verbose("read: {}".format(read)) - readID = f5_dic[read]['readID'] - # print_verbose("readID: {}".format(readID)) - signal = 
f5_dic[read]['signal'] - # print_verbose("sig: {}".format(signal[:5])) - # continue - - - - # segment on raw - seg = dRNA_segmenter(readID, signal, w) - # print_verbose("seg return: {}".format(seg)) - if not seg: - # print_verbose("No segment found - skipping: {}".format(readID)) - seg = 0 - continue - # convert to pA - pA_signal = convert_to_pA(f5_dic[read]) - # print_verbose("pA: {}".format(pA_signal[:5])) - pA_signals[readID] = pA_signal[seg[0]:seg[1]] - # return signal/signals - return pA_signals - - -def read_single_fast5(filename): - ''' - read single fast5 file and return data - ''' - f5_dic = {'signal': [], 'readID': '', 'digitisation': 0.0, - 'offset': 0.0, 'range': 0.0, 'sampling_rate': 0.0} - - # open fast5 file - try: - hdf = h5py.File(filename, 'r') - except: - traceback.print_exc() - print_err("extract_fast5():fast5 file failed to open: {}".format(filename)) - f5_dic = {} - return f5_dic - try: - c = list(hdf['Raw/Reads'].keys()) - for col in hdf['Raw/Reads/'][c[0]]['Signal'][()]: - f5_dic['signal'].append(int(col)) - - - f5_dic['readID'] = hdf['Raw/Reads/'][c[0]].attrs['read_id'].decode() - f5_dic['digitisation'] = hdf['UniqueGlobalKey/channel_id'].attrs['digitisation'] - f5_dic['offset'] = hdf['UniqueGlobalKey/channel_id'].attrs['offset'] - f5_dic['range'] = float("{0:.2f}".format(hdf['UniqueGlobalKey/channel_id'].attrs['range'])) - f5_dic['sampling_rate'] = hdf['UniqueGlobalKey/channel_id'].attrs['sampling_rate'] - - except: - traceback.print_exc() - print_err("extract_fast5():failed to extract events or fastq from: {}".format(filename)) - f5_dic = {} - - return f5_dic - - -def read_multi_fast5(filename): - ''' - read multifast5 file and return data - ''' - f5_dic = {} - # c = 0 - with h5py.File(filename, 'r') as hdf: - for read in list(hdf.keys()): - # c += 1 - # if c > 100: - # break - f5_dic[read] = {'signal': [], 'readID': '', 'digitisation': 0.0, - 'offset': 0.0, 'range': 0.0, 'sampling_rate': 0.0} - try: - for col in hdf[read]['Raw/Signal'][()]: - f5_dic[read]['signal'].append(int(col)) - - f5_dic[read]['readID'] = hdf[read]['Raw'].attrs['read_id'].decode() - f5_dic[read]['digitisation'] = hdf[read]['channel_id'].attrs['digitisation'] - f5_dic[read]['offset'] = hdf[read]['channel_id'].attrs['offset'] - f5_dic[read]['range'] = float("{0:.2f}".format(hdf[read]['channel_id'].attrs['range'])) - f5_dic[read]['sampling_rate'] = hdf[read]['channel_id'].attrs['sampling_rate'] - except: - traceback.print_exc() - print_err("extract_fast5():failed to read readID: {}".format(read)) - return f5_dic - - -def dRNA_segmenter(readID, signal, w): - ''' - segment signal/s and return coords of cuts - ''' - def _scale_outliers(squig): - ''' Scale outliers to within m stdevs of median ''' - k = (squig > 0) & (squig < 1200) - return squig[k] - - - sig = _scale_outliers(np.array(signal, dtype=int)) - - s = pd.Series(sig) - t = s.rolling(window=w).mean() - # This should be done better, or changed to median and benchmarked - # Currently trained on mean segmented data - mn = t.mean() - std = t.std() - # Trained on 0.5 - bot = mn - (std*0.5) - - - - # main algo - - begin = False - # max distance for merging 2 segs - seg_dist = 1500 - # max length of a seg - hi_thresh = 200000 - # min length of a seg - lo_thresh = 2000 - - start = 0 - end = 0 - segs = [] - count = -1 - for i in t: - count += 1 - if i < bot and not begin: - start = count - begin = True - elif i < bot: - end = count - elif i > bot and begin: - if segs and start - segs[-1][1] < seg_dist: - segs[-1][1] = end - else: - 
segs.append([start, end]) - start = 0 - end = 0 - begin = False - else: - continue - - offset = -1050 - buff = 150 - - # fig = plt.figure(1) - # #fig.subplots_adjust(hspace=0.1, wspace=0.01) - # ax = fig.add_subplot(111) - # - # print_verbose("segs: {}".format(segs)) - # # # Show segment lines - # for i, j in segs: - # ax.axvline(x=i+offset-buff, color='m') - # ax.axvline(x=j+offset+buff, color='m') - # - # ax.axhline(y=bot, color='r') - # plt.plot(sig, color='teal') - # plt.plot(t, color='k') - # plt.show() - # plt.clf() - - x, y = 0, 0 - - # print_verbose("segs before filter: {}".format(segs)) - for a, b in segs: - if b - a > hi_thresh: - continue - if b - a < lo_thresh: - continue - x, y = a, b - # print "{}\t{}\t{}\t{}".format(f5, read_dic[f5], x, y) - # print_verbose("xy before return: [{},{}]".format(x, y)) - return [x+offset-buff, y+offset+buff] - break - print_verbose("dRNA_segmenter: no seg found: {}".format(readID)) - return 0 - - -def convert_to_pA(d): - ''' - convert raw signal data to pA using digitisation, offset, and range - float raw_unit = range / digitisation; - for (int32_t j = 0; j < nsample; j++) { - rawptr[j] = (rawptr[j] + offset) * raw_unit; - } - ''' - digitisation = d['digitisation'] - range = d['range'] - offset = d['offset'] - raw_unit = range / digitisation - new_raw = [] - for i in d['signal']: - j = (i + offset) * raw_unit - new_raw.append("{0:.2f}".format(round(j,2))) - return new_raw - -# Python timeseries interface - - -def pyts_transform(transform, data, image_size, show=False, cmap='rainbow', img_index=0): - try: - t_start=time.time() - X_transform = transform.fit_transform(data) - # if args.verbose: - # print_verbose("{} {}".format(time.time() - t_start, 'seconds')) - if (show): - plt.figure(figsize=(4, 4)) - plt.grid(b=None) - plt.imshow(X_transform[0], cmap=cmap, origin='lower') - plt.savefig(transform.__class__.__name__ + "_image_" + str(img_index) + ".svg", format="svg") - plt.show() - return(X_transform) - except Exception as e: - print_err(str(e)) - return([]) - - -def mtf_transform(data, image_size=500, show=False, img_index=0): - transform = MarkovTransitionField(image_size) - return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='rainbow', img_index=img_index)) - -def rp_transform(data, image_size=500 ,show=False ,img_index=0): - # RP transformation - transform = RecurrencePlot(dimension=1, - threshold='percentage_points', - percentage=30) - return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='binary', img_index=img_index)) - -def gasf_transform(data, image_size=500, show=False, img_index=0): - # GAF transformation - transform = GramianAngularField(image_size, method='summation') - return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='rainbow', img_index=img_index)) - -def gadf_transform(data, image_size=500, show=False ,img_index=0): - # GAF transformation - transform = GramianAngularField(image_size, method='difference') - return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='rainbow', img_index=img_index)) - - -def labels_for(a_file_name): - segments=re.split(r'[_\-\.]+', a_file_name) - return(segments) - -def max_in_sequence(sequence): - return(max(np.amax([list(d.values()) for d in sequence]), 0.01)) - -def compress_squiggle(squiggle, compress_factor): - squiggle_len = len(squiggle) - rem = squiggle_len % compress_factor - if rem > 0: - return(np.mean(squiggle[0:squiggle_len - rem].reshape(-1,compress_factor), axis=1)) - 
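# A worked example of the mean-pooling above (illustrative numbers only):
# compress_squiggle(np.arange(1, 11), 3) has rem == 1, so it keeps the first
# nine samples, reshapes them to (3, 3) and averages each row:
#   [1..9] -> [[1,2,3],[4,5,6],[7,8,9]] -> [2.0, 5.0, 8.0]
# Note that when rem == 0 the fall-through below returns the squiggle
# uncompressed, as written.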
return(squiggle) - -def convert_to_image(signal): - transformed_squiggle = gasf_transform(signal.reshape(1,-1), image_size=image_size, show=False) - return(transformed_squiggle) - - -def confidence_margin(npa): - sorted = np.sort(npa)[::-1] #return sort in reverse, i.e. descending - # sorted = np.sort(npa) #return sort in reverse, i.e. descending - d = sorted[0] - sorted[1] - # if args.verbose: - # print_verbose("Sorted: {}".format(sorted)) - # print_verbose("conf interval: {}".format(d)) - # return(sorted[0][0] - sorted[0][1]) - return(d) - -def classify(model, labels, image, subtract_pixel_mean, threshold): - input_shape = image.shape[1:] - x = image.astype('float32') / 255 - - # If subtract pixel mean is enabled - if subtract_pixel_mean: - x_mean = np.mean(x, axis=0) - x -= x_mean - x=[x] - # print(x) - y = model.predict(x, verbose=0) - # print(y) - res = [] - for i in range(len(y)): - cm = confidence_margin(y[i]) - #print_verbose(y[i][np.argmax(y[i])]) - if y[i][np.argmax(y[i])] >= threshold: - # return(np.argmax(y)) - res.append([labels[i], np.argmax(y[i]), cm, y[i]]) - # return(None) - # return(np.argmax(y)) - else: - res.append([labels[i], None, cm, y[i]]) - return res - -if __name__ == '__main__': - main() diff --git a/NanoPreprocess/bin/deeplexicon.py b/NanoPreprocess/bin/deeplexicon.py deleted file mode 100755 index c58b3b6..0000000 --- a/NanoPreprocess/bin/deeplexicon.py +++ /dev/null @@ -1,622 +0,0 @@ -#!/usr/bin/env python3 -# coding: utf-8 - -import warnings -warnings.filterwarnings("ignore", category=FutureWarning) -import argparse -import sys -import os -from copy import deepcopy -import re -import csv -import time -import configparser -import h5py -import traceback -import math -import numpy as np -# from PIL import Image -import pyts -from pyts.image import MarkovTransitionField, GramianAngularField, RecurrencePlot -import tensorflow as tf -import keras -from keras.layers import Dense, Conv2D, BatchNormalization, Activation -from keras.layers import AveragePooling2D, Input, Flatten -from keras.optimizers import Adam -from keras.callbacks import ModelCheckpoint, LearningRateScheduler -from keras.callbacks import ReduceLROnPlateau -from keras.preprocessing.image import ImageDataGenerator -from keras.utils import multi_gpu_model -from keras.regularizers import l2 -from keras import backend as K -from keras.models import Model -import pandas as pd -from sklearn import datasets, linear_model -from sklearn.model_selection import train_test_split -from tensorflow.python.client import device_lib -from keras.models import load_model - - - -''' - - James M. Ferguson (j.ferguson@garvan.org.au) - Genomic Technologies - Garvan Institute - Copyright 2019 - - Tansel Ersevas (t.ersevas@garvan.org.au) - - script description - - - - ---------------------------------------------------------------------------- - version 0.0.0 - initial - version 0.8.0 - CPU version Done - version 0.9.0 - Fixed segment offset - version 0.9.1 - added segment and squiggle output - version 0.9.2 - separate segment output and code clean up - version 1.0.0 - initial release - - So a cutoff of: 0.4958776 for high accuracy - and another of 0.2943664 for high recovery - - TODO: - - Remove leftover libraries - - remove debug plots - - Remove redundant code - - create log files with information - - take in fastq for dmux splitting - - take in paf or bam for training splitting - - - ---------------------------------------------------------------------------- - MIT License - - Copyright (c) 2019 James M. 
Ferguson - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all - copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -''' - - - - -class MyParser(argparse.ArgumentParser): - def error(self, message): - sys.stderr.write('error: %s\n' % message) - self.print_help() - sys.exit(2) - -def print_verbose(message): - '''verbose printing''' - sys.stderr.write('info: %s\n' % message) - -def print_err(message): - '''error printing''' - sys.stderr.write('error: %s\n' % message) - -def read_config(filename): - config = configparser.ConfigParser() - config.read(filename) - return(config) - -def _get_available_devices(): - local_device_protos = device_lib.list_local_devices() - return [x.name for x in local_device_protos] - -def _check_available_devices(): - available_devices = _get_available_devices() - print_verbose(available_devices) - # Make sure requested GPUs are available or at least warn if they aren't - return(True) - -def read_model(model_name): - # model = load_model('saved_models/' + model_name) - model = load_model(model_name) # as a path - model.compile(loss='categorical_crossentropy', - optimizer=Adam(), - metrics=['accuracy']) - return(model) - -squiggle_max = 1199 -squiggle_min = 1 -input_cut = 72000 # potentially needs to be configurable -image_size = 224 -num_classes = 4 -window = 2000 - -def main(): - ''' - Main function - ''' - VERSION = "1.0.0" - - parser = MyParser( - description="DeePlexiCon - Demultiplex direct RNA reads") - #group = parser.add_mutually_exclusive_group() - parser.add_argument("-p", "--path", - help="Top path of fast5 files to dmux") - parser.add_argument("-f", "--form", default="multi", choices=["multi", "single"], - help="Multi or single fast5s") - parser.add_argument("-c", "--config", - help="config file") - # parser.add_argument("-g", "--gpu_list", default=1 - # help="list of gpus, 1, or [1,3,5], etc. 
of PCI_BUS_ID order") - # parser.add_argument("-o", "--output", - # help="Output directory") - parser.add_argument("-s", "--threshold", type=float, default=0.50, - help="probability threshold - 0.5 hi accuracy / 0.3 hi recovery") - # populate choices with models found in saved_models/ - # parser.add_argument("-m", "--model", default="4_bc_normal.h5", choices=["4_bc_normal.h5", "model2"], - parser.add_argument("-m", "--model", - help="Trained model name to use") - parser.add_argument("--squiggle", - help="dump squiggle data into this .tsv file") - parser.add_argument("--segment", - help="dump segment data into this .tsv file") - parser.add_argument("-b", "--batch_size", type=int, default=4000, - help="batch size - for single fast5s") - parser.add_argument("-V", "--version", - help="Prints version") - parser.add_argument("-v", "--verbose", action="store_true", - help="Verbose output") - - args = parser.parse_args() - # print help if no arguments given - if len(sys.argv) == 1: - parser.print_help(sys.stderr) - sys.exit(1) - - if args.verbose: - print_verbose("Verbose mode active - dumping info to stderr") - print_verbose("DeePlexiCon: {}".format(VERSION)) - print_verbose("arg list: {}".format(args)) - if tf.test.gpu_device_name(): - print_verbose('Default GPU Device: {}'.format(tf.test.gpu_device_name())) - else: - print_verbose("Please install GPU version of TF") - - if args.squiggle: - squig_file = args.squiggle - with open(squig_file, 'a') as f: - f.write("{}\t{}\n".format("ReadID", "signal_pA")) - else: - squig_file = '' - - if args.segment: - seg_file = args.segment - with open(seg_file, 'a') as f: - f.write("{}\t{}\t{}\n".format("ReadID", "start", "stop")) - else: - seg_file = '' - - # Globals - if args.config: - config = read_config(args.config) #TODO check config read error - - # gpu settings - # Devices - # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - - # os.environ["CUDA_VISIBLE_DEVICES"] = config[deeplexicon][gpu_list] if args.config else args.gpu_list - - # do check devices are available, else throw and error - - - # main logic - - # read model - - model = read_model(config[deeplexicon][trained_model]) if args.config else read_model(args.model) - barcode_out = {0: "bc_1", - 1: "bc_2", - 2: "bc_3", - 3: "bc_4", - None: "unknown" - } - labels = [] - images = [] - fast5s = {} - stats = "" - seg_dic = {} - print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format("fast5", "ReadID", "Barcode", "Confidence Interval", "P_bc_1", "P_bc_2", "P_bc_3", "P_bc_4")) - # for file in input... 
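The classification performed in the loop below reduces to a margin test on the 4-class softmax output. As a minimal stand-alone sketch of that decision rule (the example probability vector is illustrative; the cutoffs are the ones quoted in the header, 0.4958776 for high accuracy and 0.2943664 for high recovery):

import numpy as np

def call_barcode(probs, threshold=0.50):
    # mirror classify(): report the top barcode index only when its probability
    # clears the threshold, plus the margin between the top two classes
    order = np.sort(probs)[::-1]
    margin = order[0] - order[1]
    best = int(np.argmax(probs))
    return (best if probs[best] >= threshold else None), margin

# illustrative input; None maps to "unknown" in barcode_out
print(call_barcode(np.array([0.91, 0.05, 0.02, 0.02]), threshold=0.4958776))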
- for dirpath, dirnames, files in os.walk(args.path): - for fast5 in files: - if fast5.endswith('.fast5'): - fast5_file = os.path.join(dirpath, fast5) - if args.form == "single": - #everything below this, send off in batches of N=args.batch_size - # The signal extraction and segmentation can happen in the first step - # read fast5 files - readID, seg_signal = get_single_fast5_signal(fast5_file, window, squig_file, seg_file) - if not seg_signal: - print_err("Segment not found for:\t{}\t{}".format(fast5_file, readID)) - continue - # convert - sig = np.array(seg_signal, dtype=float) - img = convert_to_image(sig) - labels.append(readID) - fast5s[readID] = fast5 - images.append(img) - # classify - if len(labels) >= args.batch_size: - C = classify(model, labels, np.array(images), False, args.threshold) - # save to output - for readID, out, c, P in C: - prob = [round(float(i), 6) for i in P] - cm = round(float(c), 4) - if args.verbose: - print_verbose("cm is: {}".format(cm)) - print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, barcode_out[out], cm, prob[0], prob[1], prob[2], prob[3])) - labels = [] - images = [] - fast5s = {} - - - elif args.form == "multi": - #everything below this, send off in batches of N=args.batch_size - # The signal extraction and segmentation can happen in the first step - # read fast5 files - seg_signal = get_multi_fast5_signal(fast5_file, window, squig_file, seg_file) - sig_count = 0 - for readID in seg_signal: - # convert - img = convert_to_image(np.array(seg_signal[readID], dtype=float)) - labels.append(readID) - images.append(img) - fast5s[readID] = fast5 - sig_count += 1 - if len(labels) >= args.batch_size: - C = classify(model, labels, np.array(images), False, args.threshold) - # save to output - for readID, out, c, P in C: - prob = [round(float(i), 6) for i in P] - cm = round(float(c), 4) - if args.verbose: - print_verbose("cm is: {}".format(cm)) - print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, barcode_out[out], cm, prob[0], prob[1], prob[2], prob[3])) - labels = [] - images = [] - fast5s = {} - elif args.verbose: - print_verbose("analysing sig_count: {}/{}".format(sig_count, len(seg_signal))) - else: - pass # clean - #finish up - C = classify(model, labels, np.array(images), False, args.threshold) - # save to output - for readID, out, c, P in C: - prob = [round(float(i), 6) for i in P] - cm = round(float(c), 4) - if args.verbose: - print_verbose("cm is: {}".format(cm)) - print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, barcode_out[out], cm, prob[0], prob[1], prob[2], prob[3])) - labels = [] - images = [] - fast5s = {} - - - # final report/stats - # print stats - -# file handling and segmentation - -def get_single_fast5_signal(read_filename, w, squig_file, seg_file): - ''' - open single fast5 file and extract information - ''' - - # get readID and signal - f5_dic = read_single_fast5(read_filename) - if not f5_dic: - print_err("Signal not extracted from: {}".format(read_filename)) - return 0, 0 - # segment on raw - readID = f5_dic['readID'] - signal = f5_dic['signal'] - seg = dRNA_segmenter(readID, signal, w) - if not seg: - print_verbose("No segment found - skipping: {}".format(readID)) - return 0, 0 - # convert to pA - pA_signal = convert_to_pA(f5_dic) - if squig_file: - with open(squig_file, 'a') as f: - f.write("{}\t{}\n".format(readID, "\t".join(pA_signal))) - if seg_file: - with open(seg_file, 'a') as f: - f.write("{}\t{}\t{}\n".format(readID, seg[0], seg[1])) - # return signal/signals - return 
readID, pA_signal[seg[0]:seg[1]] - - -def get_multi_fast5_signal(read_filename, w, squig_file, seg_file): - ''' - open multi fast5 files and extract information - ''' - pA_signals = {} - seg_dic = {} - f5_dic = read_multi_fast5(read_filename) - seg = 0 - sig_count = 0 - for read in f5_dic: - sig_count += 1 - print_verbose("reading sig_count: {}/{}".format(sig_count, len(f5_dic))) - # get readID and signal - readID = f5_dic[read]['readID'] - signal = f5_dic[read]['signal'] - - # segment on raw - seg = dRNA_segmenter(readID, signal, w) - if not seg: - seg = 0 - continue - # convert to pA - pA_signal = convert_to_pA(f5_dic[read]) - if squig_file: - with open(squig_file, 'a') as f: - f.write("{}\t{}\n".format(readID, "\t".join(pA_signal))) - if seg_file: - with open(seg_file, 'a') as f: - f.write("{}\t{}\t{}\n".format(readID, seg[0], seg[1])) - pA_signals[readID] = pA_signal[seg[0]:seg[1]] - seg_dic[readID] = seg - # return signal/signals - return pA_signals - - -def read_single_fast5(filename): - ''' - read single fast5 file and return data - ''' - f5_dic = {'signal': [], 'readID': '', 'digitisation': 0.0, - 'offset': 0.0, 'range': 0.0, 'sampling_rate': 0.0} - - # open fast5 file - try: - hdf = h5py.File(filename, 'r') - except: - traceback.print_exc() - print_err("extract_fast5():fast5 file failed to open: {}".format(filename)) - f5_dic = {} - return f5_dic - try: - c = list(hdf['Raw/Reads'].keys()) - for col in hdf['Raw/Reads/'][c[0]]['Signal'][()]: - f5_dic['signal'].append(int(col)) - - f5_dic['readID'] = hdf['Raw/Reads/'][c[0]].attrs['read_id'].decode() - f5_dic['digitisation'] = hdf['UniqueGlobalKey/channel_id'].attrs['digitisation'] - f5_dic['offset'] = hdf['UniqueGlobalKey/channel_id'].attrs['offset'] - f5_dic['range'] = float("{0:.2f}".format(hdf['UniqueGlobalKey/channel_id'].attrs['range'])) - f5_dic['sampling_rate'] = hdf['UniqueGlobalKey/channel_id'].attrs['sampling_rate'] - - except: - traceback.print_exc() - print_err("extract_fast5():failed to extract events or fastq from: {}".format(filename)) - f5_dic = {} - - return f5_dic - - -def read_multi_fast5(filename): - ''' - read multifast5 file and return data - ''' - f5_dic = {} - with h5py.File(filename, 'r') as hdf: - for read in list(hdf.keys()): - f5_dic[read] = {'signal': [], 'readID': '', 'digitisation': 0.0, - 'offset': 0.0, 'range': 0.0, 'sampling_rate': 0.0} - try: - for col in hdf[read]['Raw/Signal'][()]: - f5_dic[read]['signal'].append(int(col)) - - f5_dic[read]['readID'] = hdf[read]['Raw'].attrs['read_id'].decode() - f5_dic[read]['digitisation'] = hdf[read]['channel_id'].attrs['digitisation'] - f5_dic[read]['offset'] = hdf[read]['channel_id'].attrs['offset'] - f5_dic[read]['range'] = float("{0:.2f}".format(hdf[read]['channel_id'].attrs['range'])) - f5_dic[read]['sampling_rate'] = hdf[read]['channel_id'].attrs['sampling_rate'] - except: - traceback.print_exc() - print_err("extract_fast5():failed to read readID: {}".format(read)) - return f5_dic - - -def dRNA_segmenter(readID, signal, w): - ''' - segment signal/s and return coords of cuts - ''' - def _scale_outliers(squig): - ''' Scale outliers to within m stdevs of median ''' - k = (squig > 0) & (squig < 1200) - return squig[k] - - - sig = _scale_outliers(np.array(signal, dtype=int)) - - s = pd.Series(sig) - t = s.rolling(window=w).mean() - # This should be done better, or changed to median and benchmarked - # Currently trained on mean segmented data - mn = t.mean() - std = t.std() - # Trained on 0.5 - bot = mn - (std*0.5) - - # main algo - - begin = False - # max 
distance for merging 2 segs - seg_dist = 1500 - # max length of a seg - hi_thresh = 200000 - # min length of a seg - lo_thresh = 2000 - - start = 0 - end = 0 - segs = [] - count = -1 - for i in t: - count += 1 - if i < bot and not begin: - start = count - begin = True - elif i < bot: - end = count - elif i > bot and begin: - if segs and start - segs[-1][1] < seg_dist: - segs[-1][1] = end - else: - segs.append([start, end]) - start = 0 - end = 0 - begin = False - else: - continue - - # offset = -1050 - # buff = 150 - offset = -1000 - buff = 0 - - x, y = 0, 0 - - for a, b in segs: - if b - a > hi_thresh: - continue - if b - a < lo_thresh: - continue - x, y = a, b - - # to be modified in next major re-training - return [x+offset-buff, y+offset+buff] - break - print_verbose("dRNA_segmenter: no seg found: {}".format(readID)) - return 0 - - -def convert_to_pA(d): - ''' - convert raw signal data to pA using digitisation, offset, and range - float raw_unit = range / digitisation; - for (int32_t j = 0; j < nsample; j++) { - rawptr[j] = (rawptr[j] + offset) * raw_unit; - } - ''' - digitisation = d['digitisation'] - rng = d['range'] - offset = d['offset'] - raw_unit = rng / digitisation - new_raw = [] - for i in d['signal']: - j = (i + offset) * raw_unit - new_raw.append("{0:.2f}".format(round(j,2))) - return new_raw
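The conversion above is the standard linear raw-to-pA transform driven by the channel metadata stored in the fast5 file. A minimal stand-alone sketch of the same formula (the channel values below are illustrative, not taken from a real run):

def raw_to_pA(raw_samples, digitisation, offset, sig_range):
    # pA = (raw + offset) * (range / digitisation), as in convert_to_pA() above
    raw_unit = sig_range / digitisation
    return [round((r + offset) * raw_unit, 2) for r in raw_samples]

# illustrative channel metadata only
print(raw_to_pA([480, 512, 505], digitisation=8192.0, offset=6.0, sig_range=1467.61))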
 - -def pyts_transform(transform, data, image_size, show=False, cmap='rainbow', img_index=0): - try: - t_start=time.time() - X_transform = transform.fit_transform(data) - if (show): - plt.figure(figsize=(4, 4)) - plt.grid(b=None) - plt.imshow(X_transform[0], cmap=cmap, origin='lower') - plt.savefig(transform.__class__.__name__ + "_image_" + str(img_index) + ".svg", format="svg") - plt.show() - return(X_transform) - except Exception as e: - print_err(str(e)) - return([]) - - -def mtf_transform(data, image_size=500, show=False, img_index=0): - transform = MarkovTransitionField(image_size) - return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='rainbow', img_index=img_index)) - -def rp_transform(data, image_size=500 ,show=False ,img_index=0): - # RP transformation - transform = RecurrencePlot(dimension=1, - threshold='percentage_points', - percentage=30) - return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='binary', img_index=img_index)) - -def gasf_transform(data, image_size=500, show=False, img_index=0): - # GASF transformation - transform = GramianAngularField(image_size, method='summation') - return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='rainbow', img_index=img_index)) - -def gadf_transform(data, image_size=500, show=False ,img_index=0): - # GADF transformation - transform = GramianAngularField(image_size, method='difference') - return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='rainbow', img_index=img_index)) - - -def labels_for(a_file_name): - segments=re.split(r'[_\-\.]+', a_file_name) - return(segments) - -def max_in_sequence(sequence): - return(max(np.amax([list(d.values()) for d in sequence]), 0.01)) - -def compress_squiggle(squiggle, compress_factor): - squiggle_len = len(squiggle) - rem = squiggle_len % compress_factor - if rem > 0: - return(np.mean(squiggle[0:squiggle_len - rem].reshape(-1,compress_factor), axis=1)) - return(squiggle) - -def convert_to_image(signal): - transformed_squiggle = gasf_transform(signal.reshape(1,-1), image_size=image_size, show=False) - return(transformed_squiggle) - - -def confidence_margin(npa): - sorted = np.sort(npa)[::-1] #return sort in reverse, i.e. descending - # sorted = np.sort(npa) #return sort in reverse, i.e. descending - d = sorted[0] - sorted[1] - return(d) - -def classify(model, labels, image, subtract_pixel_mean, threshold): - input_shape = image.shape[1:] - # x = image.astype('float32') / 255 - x = image.astype('float32') + 1 - x = x / 2 - - # If subtract pixel mean is enabled - if subtract_pixel_mean: - x_mean = np.mean(x, axis=0) - x -= x_mean - x=[x] - y = model.predict(x, verbose=0) - res = [] - for i in range(len(y)): - cm = confidence_margin(y[i]) - if y[i][np.argmax(y[i])] >= threshold: - res.append([labels[i], np.argmax(y[i]), cm, y[i]]) - else: - res.append([labels[i], None, cm, y[i]]) - return res - -if __name__ == '__main__': - main() diff --git a/NanoPreprocess/bin/extract_sequence_from_fastq.py b/NanoPreprocess/bin/extract_sequence_from_fastq.py deleted file mode 100755 index 27f8406..0000000 --- a/NanoPreprocess/bin/extract_sequence_from_fastq.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python -import sys -import gzip -import pprint -import os - -usage = ''' -created by Huanle and Luca for Master of Pores! :) -python extract_sequence_from_fastq.py <demux_tsv> <fastq_file> -''' - -if len (sys.argv) < 3: - print (usage) - exit(1) - -def fopen (f): - if f.endswith('.gz'): - return (gzip.open(f,'rt')) - else: - return (open (f,'rt')) - - -IDs = dict () -if True: - fh = fopen (sys.argv[1]) - for l in fh: - ary = l.strip().split() - IDs[ary[1]] = ary[2] -fh.close() - -outprefix = os.path.splitext(sys.argv[2])[0] -outext = os.path.splitext(sys.argv[2])[1] -if (outext == '.gz'): - outprefix = os.path.splitext(outprefix)[0] - -fh = fopen (sys.argv[2]) -for l in fh: - rd = l.strip().split()[0] - rd = rd.replace('@','') - seq = next(fh).strip() - inf = next (fh).strip() - q = next(fh).strip() - if rd in IDs: - outfile = outprefix + "." 
+ IDs[rd] + ".fastq" - fw = open(outfile,"a+") - rd = '@'+rd - string = "{0}\n{1}\n{2}\n{3}\n".format(rd,seq,inf,q) - fw.write(string) - #print (string) -fh.close() diff --git a/NanoPreprocess/bin/fast5_to_fastq.py b/NanoPreprocess/bin/fast5_to_fastq.py deleted file mode 100755 index a80ce10..0000000 --- a/NanoPreprocess/bin/fast5_to_fastq.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python -import sys, warnings,os -import numpy as np -import re -import gzip -import h5py, ont_fast5_api -from ont_fast5_api.fast5_interface import get_fast5_file - -usage = ''' -_USAGE_: python fast5_to_fastq.py fast5file -_INPUT_: both single- and multi- fast5 files can be processed -_BUGS_: report to elzedliu@gmail.com -''' - -if len(sys.argv) < 2: - print (usage); exit(0) - -f5file = sys.argv[1] -hdf5 = h5py.File(f5file,'r') -f5 = get_fast5_file (f5file,mode='r') -reads_in_f5 = f5.get_read_ids() -number_of_reads = len( reads_in_f5) - - -fq_out = open ('.'.join(f5file.split('.')[:-1])+'.fastq','w') - -def fastq_from_fast5 (h5f): - ''' - for single fast5 file, it is hdf5 file open with h5py - for multi- fast5 file, it is hdf5['read_id'] for single read - ''' - k = 'Analyses/Basecall_1D_000/BaseCalled_template/Fastq' - if k in h5f: - fastq = h5f[k].value.decode() - return fastq - else: - return None - -if number_of_reads == 1: - fq = fastq_from_fast5 (hdf5) - if fq : - fq_out.write (fq) - else: - sys.stderr.write("no sequence basecalled for {} \n".format(f5file)) -elif number_of_reads > 1: - for rd in hdf5.keys(): - fq = fastq_from_fast5 (hdf5[rd]) - if fq: - fq = fq.replace('_Basecall_1D_template','') - fq_out.write(fq) - else: - sys.stderr.write("no basecall for read {} \n" .format(rd)) -fq_out.close() diff --git a/NanoPreprocess/bin/fast5_type.py b/NanoPreprocess/bin/fast5_type.py deleted file mode 100755 index 7fa9abc..0000000 --- a/NanoPreprocess/bin/fast5_type.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python - -import sys - -from ont_fast5_api.multi_fast5 import MultiFast5File -from ont_fast5_api.fast5_info import _clean - - -__author__ = 'Huanle.Liu@crg.eu' -__version__ = '0.2' -__email__ = 'same as author' - -usage = ''' - python fast5_type.py fast5file - return: - 0: single read fast5 - 1: multi-reads fast5 -''' - -if len (sys.argv) !=2: - print (usage, file=sys.stderr) - sys.exit() - - -def check_file_type(f5_file): - try: - return 1 if _clean(f5_file.handle.attrs['file_type']).startswith('multi') else 0 - except KeyError: - if len(f5_file.handle) == 0 : - return 1 - if len([read for read in f5_file.handle if read.startswith('read_')]) !=0 : - return 1 - if 'UniqueGlobalKey' in f5_file.handle: - return 0 - raise TypeError('file can not be identified as single- or multi- read.\n' 'File path: {}'.format(f5_file.filename)) - - -filepath = sys.argv[1] -f5_file = MultiFast5File (filepath, mode='r') -filetype = check_file_type (f5_file) -#filetype = 1 if filetype.startswith ('multi') or else 0 -print (filetype) - - - - - 
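The heuristic in fast5_type.py can also be exercised with plain h5py, since it only inspects the top-level group layout. A simplified sketch of the same check ('example.fast5' is a placeholder path; the group names are the ones read_single_fast5/read_multi_fast5 rely on elsewhere in this diff):

import h5py

def is_multi_fast5(path):
    # multi-read fast5s carry top-level 'read_*' groups;
    # single-read fast5s carry 'UniqueGlobalKey' (and 'Raw/Reads')
    with h5py.File(path, 'r') as hdf:
        keys = list(hdf.keys())
        if any(k.startswith('read_') for k in keys):
            return True
        if 'UniqueGlobalKey' in keys:
            return False
    raise TypeError('cannot identify {} as single- or multi-read fast5'.format(path))

print(1 if is_multi_fast5('example.fast5') else 0)  # same 0/1 convention as fast5_type.py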
diff --git a/NanoPreprocess/bin/reorganize.sh b/NanoPreprocess/bin/reorganize.sh deleted file mode 100755 index ece54cc..0000000 --- a/NanoPreprocess/bin/reorganize.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env bash - -if [ x"$1" == x ]; then - - echo "please specify an output folder" - - exit 1 - -fi - -for i in $1/fast5_files/* ; do mkdir -p $1/`basename $i`/fast5_files; mv $i/* $1/`basename $i`/fast5_files; done -for i in $1/fast5_files/* ; do mkdir -p $1/`basename $i`/fastq_files; mv $1/fastq_files/*`basename $i`.fq.gz $1/`basename $i`/fastq_files; done -for i in $1/fast5_files/* ; do mkdir -p $1/`basename $i`/counts; mv $1/counts/*`basename $i`.count $1/`basename $i`/counts; done -for i in $1/fast5_files/* ; do mkdir -p $1/`basename $i`/assigned; mv $1/assigned/*`basename $i`.assigned $1/`basename $i`/assigned; done -for i in $1/fast5_files/* ; do mkdir -p $1/`basename $i`/alignment; mv $1/alignment/*`basename $i`*.bam $1/`basename $i`/alignment; done -rmdir $1/alignment $1/assigned $1/counts $1/fastq_files; for i in $1/fast5_files/*; do rmdir $i; done; rmdir $1/fast5_files - - diff --git a/NanoPreprocess/config.yaml b/NanoPreprocess/config.yaml deleted file mode 100644 index 8dddb62..0000000 --- a/NanoPreprocess/config.yaml +++ /dev/null @@ -1,33 +0,0 @@ -title: "Master of Pores" -subtitle: "" -intro_text: False - -report_header_info: - - Application Type: 'Nanopore sequencing' - -custom_logo: 'logo_small.png' -custom_logo_url: 'https://github.com/biocorecrg/nanopore_analysis' -custom_logo_title: 'Master of Pores' - -extra_fn_clean_trim: - - '_QC' - - '.count' - -read_count_multiplier: 0.001 -read_count_prefix: 'K' -read_count_desc: 'thousands' - -table_columns_visible: - FastQC: - percent_duplicates: True - -top_modules: - - fastqc: - name: 'FastQC' - info: 'This section of the report shows FastQC results on raw reads.' - - alnQC: - name: "alnQC" - info: 'This section of the report shows alnQC results on aligned reads.' - - minionqc - - RNA201120181_REP2_bc_1_stats: - info: 'Nanoplot stats' diff --git a/NanoPreprocess/deeplexicon/pAmps-final-actrun_newdata_nanopore_UResNet20v2_model.030.h5 b/NanoPreprocess/deeplexicon/pAmps-final-actrun_newdata_nanopore_UResNet20v2_model.030.h5 deleted file mode 100644 index 924fdc4..0000000 Binary files a/NanoPreprocess/deeplexicon/pAmps-final-actrun_newdata_nanopore_UResNet20v2_model.030.h5 and /dev/null differ diff --git a/NanoPreprocess/deeplexicon/pAmps-rep2-4-train1_newdata_nanopore_UResNet20v2_model.039.h5 b/NanoPreprocess/deeplexicon/pAmps-rep2-4-train1_newdata_nanopore_UResNet20v2_model.039.h5 deleted file mode 100644 index b87fdf5..0000000 Binary files a/NanoPreprocess/deeplexicon/pAmps-rep2-4-train1_newdata_nanopore_UResNet20v2_model.039.h5 and /dev/null differ diff --git a/NanoPreprocess/deeplexicon/resnet20-final.h5 b/NanoPreprocess/deeplexicon/resnet20-final.h5 deleted file mode 100644 index 2d7fbbf..0000000 Binary files a/NanoPreprocess/deeplexicon/resnet20-final.h5 and /dev/null differ diff --git a/NanoPreprocess/nanopreprocess.nf b/NanoPreprocess/nanopreprocess.nf deleted file mode 100644 index dfaba39..0000000 --- a/NanoPreprocess/nanopreprocess.nf +++ /dev/null @@ -1,845 +0,0 @@ -#!/usr/bin/env nextflow - - -/* - * Define the pipeline parameters - * - */ - -// Pipeline version -version = '0.1' - -params.help = false -params.resume = false - -log.info """ - -╔╦╗┌─┐┌─┐┌┬┐┌─┐┬─┐ ┌─┐┌─┐ ╔═╗╔═╗╦═╗╔═╗╔═╗ -║║║├─┤└─┐ │ ├┤ ├┬┘ │ │├┤ ╠═╝║ ║╠╦╝║╣ ╚═╗ -╩ ╩┴ ┴└─┘ ┴ └─┘┴└─ └─┘└ ╩ ╚═╝╩╚═╚═╝╚═╝ - -==================================================== -BIOCORE@CRG Preprocessing of Nanopore direct RNA - N F ~ version ${version} -==================================================== - -kit : ${params.kit} -flowcell : ${params.flowcell} -fast5 : ${params.fast5} -reference : ${params.reference} -annotation : ${params.annotation} - -ref_type : ${params.ref_type} -seq_type : ${params.seq_type} - -output : ${params.output} -qualityqc : ${params.qualityqc} -granularity : ${params.granularity} - -basecaller : ${params.basecaller} -basecaller_opt : ${params.basecaller_opt} -GPU : ${params.GPU} -demultiplexing : 
${params.demultiplexing} -demultiplexing_opt : ${params.demultiplexing_opt} -demulti_fast5 : ${params.demulti_fast5} - -filter : ${params.filter} -filter_opt : ${params.filter_opt} -mapper : ${params.mapper} -mapper_opt : ${params.mapper_opt} -map_type : ${params.map_type} - -counter : ${params.counter} -counter_opt : ${params.counter_opt} - -downsampling : ${params.downsampling} - -variant_caller : ${params.variant_caller} -variant_opt : ${params.variant_opt} - -email : ${params.email} -""" - -// Help and avoiding typos -if (params.help) exit 1 -if (params.resume) exit 1, "Are you making the classical --resume typo? Be careful!!!! ;)" -if (params.granularity == "") params.granularity = 1000000000 - -// check multi5 and GPU usage. GPU maybe can be removed as param if there is a way to detect it -if (params.GPU != "ON" && params.GPU != "OFF") exit 1, "Please specify ON or OFF in GPU processors are available" - -// check sequence type parameter -//if (params.seq_type != "gDNA" && params.seq_type != "cDNA" && params.seq_type != "RNA") exit 1, "Please specify the sequence type as RNA, cDNA or gDNA" -//if (params.seq_type == "gDNA") { -// log.info "seqType is gDNA: map_type is set to 'unspliced'" -// params.map_type = "unspliced" -//} else { -// log.info "seqType is ${params.seq_type}: map_type is ${params.map_type}" - if (params.map_type != "unspliced" && params.map_type != "spliced") exit 1, "Mapping type NOT supported! Please choose either 'spliced' or 'unspliced'" -//} - -// check input files -reference = file(params.reference) -if( !reference.exists() ) exit 1, "Missing reference file: ${reference}!" -config_report = file("$baseDir/config.yaml") -if( !config_report.exists() ) exit 1, "Missing config.yaml file!" -logo = file("$baseDir/../docs/logo_small.png") -deeplexicon_folder = file("$baseDir/deeplexicon/") - - -basecaller = params.basecaller -basecaller_opt = params.basecaller_opt -demultiplexer = params.demultiplexing -demultiplexer_opt = params.demultiplexing_opt -mapper = params.mapper -mapper_opt = params.mapper_opt -counter_opt = params.counter_opt - - -// Output folders -outputFastq = "${params.output}/fastq_files" -outputFast5 = "${params.output}/fast5_files" -outputQual = "${params.output}/QC_files" -outputMultiQC = "${params.output}/report" -outputMapping = "${params.output}/alignment" -outputCRAM = "${params.output}/cram_files" -outputCounts = "${params.output}/counts" -outputVars = "${params.output}/variants" -outputAssigned = "${params.output}/assigned" -outputReport = file("${outputMultiQC}/multiqc_report.html") - -/* -* move old multiQCreport -*/ -if( outputReport.exists() ) { - log.info "Moving old report to multiqc_report.html multiqc_report.html.old" - outputReport.moveTo("${outputMultiQC}/multiqc_report.html.old") -} - -/* - * Creates the channels that emits fast5 files - */ -Channel - .fromPath( params.fast5) - .ifEmpty { error "Cannot find any file matching: ${params.fast5}" } - .into {fast5_4_name; fast5_4_testing; fast5_4_granularity; fast5_4_variant} - -/* - * Get the name from the folder - */ -folder_info = params.fast5.tokenize("/") -folder_name = folder_info[-2] - -// Check config file for consistency -//if (demultiplexer == "guppy" && params.barcodekit == "") -// exit 1, "Demultiplexing with guppy needs the definition of the barcodekit parameter. Exiting" -//if (basecaller != "guppy" && demultiplexer == "guppy") -// exit 1, "Demultiplexing with guppy can be performed ONLY when the basecaller is guppy too. 
Exiting" -//if (basecaller == "guppy" && demultiplexer == "guppy") -// log.info "Performing basecalling and demultiplexing at the same time with Guppy." - -/* -* This is default value in case guppy will be used for RNA demultiplexing -*/ -params.barcodekit = "" -if (demultiplexer == "") { - demultiplexer = "OFF" -} - -//if (demultiplexer != "OFF" && demultiplexer != "deeplexicon") -//exit 1, "Demultiplexing of RNA can be performed only with deeplexicon. Current value is ${demultiplexer}" - -if (params.GPU == "YES" && basecaller != "guppy") - exit 1, "GPU can be used only with GUPPY basecaller!" - -if (params.ref_type == "genome") { - if (params.annotation != "") { - annotation = file(params.annotation) - if( !annotation.exists() ) exit 1, "Missing annotation file: ${params.annotation}!" - } -} - - -process testInput { - tag {"${fast5}"} - - input: - file(fast5) from fast5_4_testing.first() - - output: - stdout into multi5_type - - script: - """ - fast5_type.py ${fast5} - """ -} - -multi5_type.map { it.trim().toInteger() }.into{multi5_type_for_msg; multi5_type_for_bc; multi5_type_for_granularity; multi5_type_for_demultiplexing} -multi5_type_for_msg.map{it == 0 ? "Single Fast5 files detected!": "MultiFast5 files detected!" }.println() - -// if you are using GPU analyse the whole dataset, otherwise make batch of 4,000 sequences if they are single fast5 -// or single batches of multi fast5 sequences -multi5_type_for_granularity.merge(fast5_4_granularity.collect()).map{ - (params.GPU == "YES" ? params.granularity : (it[0] == 0 ? it[1..-1].collate(4000) : it[1..-1].collate(1)) ) -}.flatMap().set{fast5_batches} - -// create a map id batch -> list of files -def num_batch = -1 -fast5_batches.map { - num_batch++ - [num_batch, it] -}.into{ fast5_4_basecall; fast5_4_demulti} - -/* -* Perform base calling using albacore or guppy on raw fas5 files -*/ - -process logBaseCalling { - label (params.GPU == "ON" ? 'basecall_gpus': 'basecall_cpus') - echo true - - script: - if (basecaller == "albacore") { - """ - echo "no" - #export PYTHONPATH=$baseDir/bin/albacore:\$PYTHONPATH - #read_fast5_basecaller.py - """ - } else if (basecaller == "guppy"){ - """ - echo '*********************************' - guppy_basecaller --version - guppy_basecaller --print_workflows | grep ${params.flowcell} | grep ${params.kit} - echo '*********************************' - """ - } -} - -process baseCalling { - tag {"${basecaller}-${folder_name}-${idfile}"} - - label (params.GPU == "ON" ? 
'basecall_gpus': 'basecall_cpus') - - // move basecalled output fast5 files - if(demultiplexer != "deeplexicon") { - publishDir outputFast5, pattern: "*_out/workspace/*.fast5", mode: 'move', saveAs: { file -> "${file.split('\\/')[-1]}" } - } - - input: - set idfile, file(fast5) from fast5_4_basecall - val (multi5) from multi5_type_for_bc - - output: - file ("${idfile}_out/workspace/*.fast5") optional true into fast5_files_for_demultiplexing - set idfile, file ("${idfile}.*.gz") into fastq_files_for_demultiplexing, demulti_log - file ("${idfile}_out/sequencing_summary.txt") into seq_summaries optional true - - script: - // conversion command if input is RNA - have to check if this is really needed - def RNA_conv_cmd = "" - def demulti_cmd = "" - def infolder = "./" - if (params.seq_type == "RNA") { RNA_conv_cmd = " | awk '{if (NR%4==2) gsub(\"U\",\"T\"); print}' " } - if (basecaller == "albacore") { - // in case input files are multi fast5, convert them to single fast5 since albacore cannot handle multi fast5 - if (multi5 == 1) { - demulti_cmd = "mkdir demulti_tmp; mkdir demulti; multi_to_single_fast5 -i ${infolder} -s demulti_tmp -t ${task.cpus}; mv demulti_tmp/*/*.fast5 demulti; rm -fr demulti_tmp" - infolder = "demulti" - } - """ - ${demulti_cmd} - export PYTHONPATH=$baseDir/bin/albacore:\$PYTHONPATH - read_fast5_basecaller.py ${basecaller_opt} --flowcell \"${params.flowcell}\" --kit \"${params.kit}\" --output_format fastq,fast5 \ - --worker_threads ${task.cpus} -s ./${idfile}_out --disable_filtering --input ${infolder}; - cat ${idfile}_out/workspace/*.fastq ${RNA_conv_cmd} >> ${idfile}.fastq - rm ${idfile}_out/workspace/*.fastq - gzip ${idfile}.fastq - mkdir single_basecallings - mv ${idfile}_out/workspace/*/*.fast5 single_basecallings - mkdir temp_multi - single_to_multi_fast5 -i single_basecallings -s temp_multi -t ${task.cpus} - mv temp_multi/batch_0.fast5 ./${idfile}_out/workspace/batch_${idfile}.fast5 - if [ -d demulti ]; then rm -fr demulti; fi - rm -fr single_basecallings temp_multi - """ - } else if (basecaller == "guppy"){ - def multi_cmd = "" - def gpu_cmd = "" - def gpu_prefix = "" - if (params.GPU == "ON") { - gpu_prefix = 'export LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/.singularity.d/libs"' - //gpu_cmd = '-x "cuda:0"' - gpu_cmd = '-x auto' - } - // in case input files are single fast5, group them into multi fast5 at the end - if (multi5 == 0) { - multi_cmd = "mkdir single_basecallings temp_multi; mv *_out/workspace/*.fast5 single_basecallings; single_to_multi_fast5 -i single_basecallings -s temp_multi -t ${task.cpus}; mv temp_multi/batch_0.fast5 ./${idfile}_out/workspace/batch_${idfile}.fast5; rm -fr temp_multi single_basecallings" - } - // Different command line in case guppy is also demultiplexer - if (demultiplexer == "guppy") { - """ - ${gpu_prefix} - guppy_basecaller ${gpu_cmd} ${basecaller_opt} ${demultiplexer_opt} --flowcell ${params.flowcell} --kit ${params.kit} --num_barcode_threads ${task.cpus} --barcode_kits ${params.barcodekit} --trim_barcodes --fast5_out -i ${infolder} --save_path ./${idfile}_out --cpu_threads_per_caller 1 --gpu_runners_per_device 1 --num_callers ${task.cpus} - cd ${idfile}_out; - if [ -d barcode01 ]; then - for d in barcode*; do echo \$d; cat \$d/*.fastq ${RNA_conv_cmd} > ../${idfile}.\$d.fastq; done; - fi - cat unclassified/*.fastq ${RNA_conv_cmd} > ../${idfile}.unclassified.fastq; cd ../ - for i in *.fastq; do gzip \$i; done - ${multi_cmd} - """ - } - else if (demultiplexer == "guppy-readucks") { - """ - 
${gpu_prefix} - guppy_basecaller ${gpu_cmd} ${basecaller_opt} ${demultiplexer_opt} --flowcell ${params.flowcell} --kit ${params.kit} --num_barcode_threads ${task.cpus} --barcode_kits ${params.barcodekit} --trim_barcodes --fast5_out -i ${infolder} --save_path ./${idfile}_out --gpu_runners_per_device 1 --cpu_threads_per_caller 1 --num_callers ${task.cpus} - cd ${idfile}_out; - if [ -d barcode01 ]; then - for d in barcode*; do echo \$d; cat \$d/*.fastq ${RNA_conv_cmd} > ../${idfile}.\$d.fastq; done; - fi - cat unclassified/*.fastq ${RNA_conv_cmd} > ../${idfile}.unclassified.fastq; cd ../ - for i in *.fastq; do gzip \$i; done - ${multi_cmd} - """ - } - else { - """ - ${gpu_prefix} - guppy_basecaller ${gpu_cmd} --flowcell ${params.flowcell} --kit ${params.kit} --fast5_out ${basecaller_opt} -i ${infolder} --save_path ./${idfile}_out --cpu_threads_per_caller 1 --gpu_runners_per_device 1 --num_callers ${task.cpus} - cat ${idfile}_out/*.fastq ${RNA_conv_cmd} >> ${idfile}.fastq - rm ${idfile}_out/*.fastq - gzip ${idfile}.fastq - ${multi_cmd} - """ - } - } else if (demultiplexer == "OFF") { - """ - fast5_to_fastq.py ${fast5}; mv *.fastq ${idfile}.fastq; gzip ${idfile}.fastq - """ - } -} - -//demulti_log.println() - -/* -* Perform demultiplexing (optional) using porechop on basecalled reads -*/ -if(demultiplexer == "deeplexicon") { - process demultiplexing_with_deeplexicon { - label 'demulti' - tag {"${demultiplexer}-${idfile}"} - - input: - set idfile, file(fast5) from fast5_4_demulti - val (multi5) from multi5_type_for_bc - file(deeplexicon_folder) - - output: - set idfile, file ("${idfile}_demux.tsv") into demux_for_fastq_extraction - file ("${idfile}_demux.tsv") into demux_for_fast5_extraction - - script: - def model = '' - def executable = 'deeplexicon_sub.py dmux -f multi' - if (multi5 == 0){ - executable = 'deeplexicon_sub.py dmux -f single' - } - if (demultiplexer_opt.contains("pAmps-final-actrun_newdata_nanopore_UResNet20v2_model.030.h5")){ - executable = 'cmd_line_deeplexicon_caller_2019_09_12.py' - } - """ - ln -s ${deeplexicon_folder}/* . 
- ${executable} -p ./ ${demultiplexer_opt} > ${idfile}_demux.tsv - #${executable} -p ./ ${demultiplexer_opt} -b 4000 -v > ${idfile}_demux.tsv - """ - } - - process extracting_demultiplexed_fastq { - label 'basecall_cpus' - tag {"${demultiplexer}"} - - input: - set idfile, file(demux), file(fastq) from demux_for_fastq_extraction.join(fastq_files_for_demultiplexing) - - output: - set idfile, file ("*.fastq.gz") into fastq_for_filtering - - script: - """ - extract_sequence_from_fastq.py ${demux} ${fastq} - for i in *.fastq; do gzip \$i; done - """ - } - - process extracting_demultiplexed_fast5 { - - //label 'basecall_cpus' - tag { demultiplexer } - publishDir outputFast5, mode: 'copy' - - when: - params.demulti_fast5 == "ON" - - input: - file("demux_*") from demux_for_fast5_extraction.collect() - file("*") from fast5_files_for_demultiplexing.collect() - - output: - file("*") - - script: - """ - cat demux_* | grep -v ReadID >> dem.files - awk '{print \$2 > \$3".list" }' dem.files - for i in *.list; do mkdir `basename \$i .list`; fast5_subset --input ./ --save_path `basename \$i .list`/ --read_id_list \$i --batch_size 4000 -t ${task.cpus}; done - rm *.list - rm */filename_mapping.txt - rm dem.files - """ - } - - -} else { - fastq_files_for_demultiplexing.set{ fastq_for_filtering} -} - -/* -* Perform filtering (optional) using nanofilt on fastq files -*/ -if (params.filter == "nanofilt") { - process filtering { - label 'big_cpus' - tag {"${params.filter}-${fastq_file}".replace('.fastq.gz', '')} - - input: - set idfile, file(fastq_file) from fastq_for_filtering.transpose() - - output: - set idfile, file("*-filt.fastq.gz") into fastq_for_next_step - - script: - output = "${fastq_file}".replace(".fastq.gz", "-filt.fastq.gz") - """ - zcat ${fastq_file} | NanoFilt ${params.filter_opt} | gzip > ${output} - """ - } -} else { - fastq_for_filtering.transpose().set{fastq_for_next_step} -} - - -fastq_for_next_step.map{ - filepath=it[1] - if (demultiplexer != "OFF") { - fileparts = filepath.getName().tokenize(".") - ["${folder_name}.${fileparts[-3]}", filepath] - } else { - ["${folder_name}", filepath] - } -}.groupTuple().set{fastq_files_for_grouping} - -/* -* Concatenate FastQ files -*/ -process concatenateFastQFiles { - tag {idfile} - - publishDir outputFastq, pattern: "*.fq.gz", mode: 'copy' - - input: - set idfile, file(fastq_files) from fastq_files_for_grouping - - output: - set idfile, file("${idfile}.fq.gz") into fastq_files_for_fastqc, fastq_files_for_mapping, fastq_files_for_variants - - script: - """ - cat *.fastq.gz >> ${idfile}.fq.gz - """ -} - -/* -* Perform QC on fast5 files -*/ - -process QC { - tag {folder_name} - label 'big_cpus' - publishDir outputQual, mode: 'copy' - errorStrategy 'ignore' - - input: - file("summaries_*") from seq_summaries.collect() - - output: - file ("${folder_name}_QC") into QC_folders - file ("final_summary.stats") into stat_for_variants - - script: - """ - if [ -f "summaries_" ]; then - ln -s summaries_ final_summary.stats - else - head -n 1 summaries_1 > final_summary.stats - for i in summaries_*; do grep -v "filename" \$i >> final_summary.stats; done - fi - MinIONQC.R -i final_summary.stats -o ${folder_name}_QC -q ${params.qualityqc} -p ${task.cpus} - """ -} - -/* -* Perform fastQC on fastq files -*/ - -process fastQC { - tag {idfile} - label 'big_cpus' - - publishDir outputQual, pattern: "*_fastqc.html", mode: 'copy' - - input: - set idfile, file(fastq_file) from fastq_files_for_fastqc - - output: - file ("*_fastqc.*") into fastqc_for_multiqc - - script: - """ - 
fastqc ${fastq_file} -t ${task.cpus} - """ -} - -/* -* Perform mapping and sorting -*/ -process mapping { - tag {"${mapper}-${idfile}"} - publishDir outputMapping, mode: 'copy' - label 'big_mem_cpus' - - input: - file(reference) - set idfile, file (fastq_file) from fastq_files_for_mapping - - output: - set idfile, file("${idfile}.${mapper}.sorted.bam") optional true into aligned_reads, aligned_reads_for_QC, aligned_reads_for_QC2, aligned_reads_for_counts - set idfile, mapper, file("${idfile}.${mapper}.sorted.bam"), file("${idfile}.${mapper}.sorted.bam.bai") optional true into aligned_reads_for_crams - set idfile, file("${idfile}.${mapper}.sorted.bam"), file("${idfile}.${mapper}.sorted.bam.bai") optional true into aligned_reads_for_vars - file("${idfile}.${mapper}.sorted.bam*") optional true - - script: - if (mapper == "minimap2") { - def mappars = (params.map_type == "spliced") ? "-ax splice -k14" : "-ax map-ont" - mappars += " ${mapper_opt} " - """ - minimap2 -t ${task.cpus} ${mappars} -uf ${reference} ${fastq_file} > reads.mapped.sam - samtools view -@ ${task.cpus} -F4 -hSb reads.mapped.sam > reads.mapped.bam - rm reads.mapped.sam - samtools sort -@ ${task.cpus} -o ${idfile}.${mapper}.sorted.bam reads.mapped.bam - samtools index -@ ${task.cpus} ${idfile}.${mapper}.sorted.bam - rm reads.mapped.bam - """ - } - else if (mapper == "graphmap2"){ - def mappars = (params.map_type == "spliced") ? "-x rnaseq" : "" - mappars += " ${mapper_opt} " - """ - graphmap2 align -t ${task.cpus} -r ${reference} ${mappars} -d ${fastq_file} | samtools view -@ ${task.cpus} -F4 -hSb - > reads.mapped.bam - samtools sort -@ ${task.cpus} -o ${idfile}.${mapper}.sorted.bam reads.mapped.bam - samtools index -@ ${task.cpus} ${idfile}.${mapper}.sorted.bam - rm reads.mapped.bam - """ - } - else if (mapper == "graphmap"){ - """ - graphmap align -t ${task.cpus} ${mapper_opt} -r ${reference} -d ${fastq_file} | samtools view -@ ${task.cpus} -F4 -hSb - > reads.mapped.bam - samtools sort -@ ${task.cpus} -o ${idfile}.${mapper}.sorted.bam reads.mapped.bam - samtools index -@ ${task.cpus} ${idfile}.${mapper}.sorted.bam - rm reads.mapped.bam - """ - } - else { - """ - echo "nothing to do!" 
- """ - } -} - -/* -* Perform mapping and sorting - -process cram_conversion { - tag {"${mapper}-${idfile}"} - publishDir outputCRAM, mode: 'copy' - label 'big_mem_cpus' - - input: - file(reference) - set idfile, val(mapper), file(aln), file(index) from aligned_reads_for_crams - - output: - file("${idfile}.${mapper}.sorted.cram*") optional true - script: - def downcmd = "" - def input = aln - def cleancmd = "" - gzipcmd = unzipCmd(reference, "myreference.fasta") - gzipclean = "rm myreference.fasta" - if (params.downsampling != "") { - def perc = params.downsampling/100 - downcmd = "samtools view -@ ${task.cpus} -bs ${perc} ${aln} > subsample.bam" - input = "subsample.bam" - cleancmd = "rm subsample.bam" - } - """ - ${downcmd} - ${gzipcmd} - samtools view -@ ${task.cpus} -C ${input} -T myreference.fasta -o ${idfile}.${mapper}.sorted.cram - samtools index -@ ${task.cpus} ${idfile}.${mapper}.sorted.cram - ${cleancmd} - ${gzipclean} - """ -} -*/ - - -/* -* Perform counting (optional) -*/ - -if ( params.counter == "YES") { - process counting { - tag {"${idfile}"} - publishDir outputCounts, pattern: "*.count", mode: 'copy' - publishDir outputAssigned, pattern: "*.assigned", mode: 'copy' - - input: - set idfile, file(bamfile) from aligned_reads_for_counts - - output: - file("${idfile}.count") into read_counts - file("${idfile}.stats") optional true into count_stats - file("${idfile}.assigned") optional true - script: - if (params.ref_type == "transcriptome") { - """ - NanoCount -i ${bamfile} ${counter_opt} -o ${idfile}.count ${counter_opt}; - awk '{sum+=\$3}END{print FILENAME"\t"sum}' ${idfile}.count |sed s@.count@@g > ${idfile}.stats - samtools view -F 256 ${bamfile} |cut -f 1,3 > ${idfile}.assigned - """ - } else if (params.ref_type == "genome") { - def anno = unzipBash("${params.annotation}") - """ - samtools view ${bamfile} |htseq-count -f sam - ${anno} ${counter_opt} -o ${idfile}.sam > ${idfile}.count - awk '{gsub(/XF:Z:/,"",\$NF); print \$1"\t"\$NF}' ${idfile}.sam |grep -v '__' > ${idfile}.assigned - rm ${idfile}.sam - """ - } - } - - /* - * Join alnQC - */ - process joinCountQCs { - - input: - file "*" from count_stats.collect() - - output: - file("counts_mqc.txt") into count_repo_for_multiQC - - script: - """ - echo '# id: NanoCount - # plot_type: \'table\' - # section_name: Read counts - File name \'Counts\' ' > counts_mqc.txt - cat *.stats >> counts_mqc.txt - """ - } -} else { - read_counts = Channel.empty() - count_repo_for_multiQC = Channel.empty() -} - - -/* -* Perform alnQC -*/ -process alnQC { - tag {bamid} - - input: - set bamid, file(bamfile) from aligned_reads_for_QC - - output: - file "${bamid}.stat" into single_alnQC_outs - - script: - """ - bam2stats.py ${bamfile} > ${bamid}.stat - """ -} - -/* -* Join alnQC -*/ -process joinAlnQCs { - - input: - file "alnqc_*" from single_alnQC_outs.collect() - - output: - file("alnQC_mqc.txt") into alnQC_for_multiQC - - script: - """ - echo '# id: alnQC -# plot_type: \'table\' -# section_name: \'Alignment QC\' ' > alnQC_mqc.txt - cat alnqc_* | head -n 1| sed s@#@@g >> alnQC_mqc.txt - cat alnqc_* | grep -v "#" >> alnQC_mqc.txt - """ -} - -/* -* Perform alnQC2 -*/ - -process alnQC2 { - publishDir outputQual, pattern: "*_plot/*", mode: 'copy' - label 'big_cpus' - errorStrategy 'ignore' - tag {bamid} - - input: - set bamid, file(bamfile) from aligned_reads_for_QC2 - - output: - file("*_plot/*") optional true - file("${bamid}_stats_mqc.png") optional true into qc2_for_multiqc - - script: - """ - NanoPlot --bam ${bamfile} -o ${bamid}_plot 
--maxlength 5000 -t ${task.cpus} - mkdir tmp_dir - cp ${bamid}_plot/PercentIdentityvsAverageBaseQuality_kde.png tmp_dir - cp ${bamid}_plot/LengthvsQualityScatterPlot_dot.png tmp_dir - cp ${bamid}_plot/HistogramReadlength.png tmp_dir - cp ${bamid}_plot/Weighted_HistogramReadlength.png tmp_dir - gm montage tmp_dir/*.png -tile 2x2 -geometry 800x800 ${bamid}_stats_mqc.png - rm -fr tmp_dir - """ -} - -QC_folders.mix(fastqc_for_multiqc,qc2_for_multiqc,read_counts,count_repo_for_multiQC,alnQC_for_multiQC).set{files_for_report} - -/* -* Perform viral variant call (experimental) -*/ - -if ( params.variant_caller == "YES" && params.seq_type != "RNA") { - - process variant_calling { - tag {"${idfile}"} - publishDir outputVars, pattern: "*.vcf", mode: 'copy' - label 'big_cpus' - - input: - set idfile, file(bamfile), file(bai) from aligned_reads_for_vars - file(reference) - - output: - file("*.vcf") - - script: - gzipcmd = unzipCmd(reference, "myreference.fasta", "yes") - gzipclean = "rm myreference.fasta" - """ - ${gzipcmd} - medaka_variant ${params.variant_opt} -i ${bamfile} -f myreference.fasta -d -t ${task.cpus} -o ./out - mv `ls -t out/round_*.vcf| head -n1 ` . - ${gzipclean} - """ - } - } - - - - -/* -* Perform multiQC report -*/ -process multiQC { - publishDir outputMultiQC, mode: 'copy' - - input: - file(logo) - file(config_report) - file("*") from files_for_report.collect() - - output: - file("multiqc_report.html") into multiQC - - script: - """ - multiqc -c ${config_report} . - """ -} - - -if (params.email == "yourmail@yourdomain" || params.email == "") { - log.info 'Skipping the email\n' -} -else { - log.info "Sending the email to ${params.email}\n" - - workflow.onComplete { - - def msg = """\ - Pipeline execution summary - --------------------------- - Completed at: ${workflow.complete} - Duration : ${workflow.duration} - Success : ${workflow.success} - workDir : ${workflow.workDir} - exit status : ${workflow.exitStatus} - Error report: ${workflow.errorReport ?: '-'} - """ - .stripIndent() - - sendMail(to: params.email, subject: "Master of Pore execution", body: msg, attach: "${outputMultiQC}/multiqc_report.html") - } -} - -workflow.onComplete { - println "Pipeline BIOCORE@CRG Master of Pore completed!" - println "Started at $workflow.start" - println "Finished at $workflow.complete" - println "Time elapsed: $workflow.duration" - println "Execution status: ${ workflow.success ? 
'OK' : 'failed' }" -} - -// make named pipe -def unzipCmd(filename, unzippedname, copy="") { - def cmd = "ln -s ${filename} ${unzippedname}" - if (copy!="") { - cmd = "cp ${filename} ${unzippedname}" - } - def ext = filename.getExtension() - if (ext == "gz") { - cmd = "zcat ${filename} > ${unzippedname}" - } - return cmd -} - - -// make named pipe -def unzipBash(filename) { - def cmd = filename.toString() - if (cmd[-3..-1] == ".gz") { - cmd = "<(zcat ${filename})" - } - return cmd -} - diff --git a/NanoPreprocess/nextflow.config b/NanoPreprocess/nextflow.config deleted file mode 100644 index 8032053..0000000 --- a/NanoPreprocess/nextflow.config +++ /dev/null @@ -1,5 +0,0 @@ -includeConfig "$baseDir/params.config" -includeConfig "$baseDir/../nextflow.global.config" -singularity.cacheDir = "$baseDir/../singularity" - -//singularity.enabled = true diff --git a/NanoPreprocess/params.config b/NanoPreprocess/params.config deleted file mode 100644 index 6424d81..0000000 --- a/NanoPreprocess/params.config +++ /dev/null @@ -1,37 +0,0 @@ -params { - kit = "SQK-RNA001" - flowcell = "FLO-MIN106" - fast5 = "$baseDir/../data/multifast/*.fast5" - reference = "$baseDir/../anno/curlcake_constructs.fasta.gz" - annotation = "" - ref_type = "transcriptome" - - seq_type = "cDNA" - output = "$baseDir/multifast" - qualityqc = 5 - granularity = "" - - basecaller = "guppy" - basecaller_opt = "" - GPU = "OFF" - demultiplexing = "" - demultiplexing_opt = "-m pAmps-final-actrun_newdata_nanopore_UResNet20v2_model.030.h5" - demulti_fast5 = "OFF" - - filter = "" - filter_opt = "" - - mapper = "graphmap2" - mapper_opt = "" - map_type = "spliced" - - counter = "YES" - counter_opt = "" - - variant_caller = "NO" - variant_opt = "" - - downsampling = "" - - email = "" -} diff --git a/NanoPreprocess/preprocessing.png b/NanoPreprocess/preprocessing.png deleted file mode 100644 index 884482a..0000000 Binary files a/NanoPreprocess/preprocessing.png and /dev/null differ diff --git a/NanoPreprocessSimple/bin/bam2stats.py b/NanoPreprocessSimple/bin/bam2stats.py deleted file mode 100755 index 12eaad6..0000000 --- a/NanoPreprocessSimple/bin/bam2stats.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python3 -# Report stats (mapped reads and identity to reference) from samtools stats -# for bam file(s) ignoring secondary, suplementary and qc failed alignments -# -# USAGE: bam2stats.py bam1 bam2 ... 
bamN - -import os, subprocess, sys - -def bam2stats(fn, flag=3840): - """Get stats from samtools stats""" - args = ["samtools", "stats", "-F%s"%flag, fn] - proc = subprocess.Popen(args, stdout=subprocess.PIPE) - k2v = {} - for l in proc.stdout: - l = l.decode("utf-8") - if l.startswith('SN'): - ldata = l[:-1].split()#; print(ldata) - kv = [[]] - for e in ldata[1:]: - kv[-1].append(e) - if e.endswith(':'): - kv[-1][-1] = kv[-1][-1][:-1] - kv.append([]) - k2v[" ".join(kv[0])] = kv[1] - # convert digits to int - for k in k2v: - if k2v[k][0].isdigit(): - k2v[k] = int(k2v[k][0]) - # report if no reads mapped - if not k2v['reads mapped']: - return "No reads mapped" - text = [] - text.append("{:,}\t{:.1f}%\t{:,}\t{:.1f}%".format(k2v['reads mapped'], 100*k2v['reads mapped']/k2v['sequences'], k2v['bases mapped (cigar)'], 100*k2v['bases mapped (cigar)']/k2v['total length'])) - text.append("{:,}\t{:,}".format(k2v['average length'], k2v['maximum length'])) - text.append("{:.2f}%".format(100-100*k2v['mismatches']/k2v['bases mapped (cigar)'], )) #"identity: %.2f%"%(100-k2v['mismatches']/k2v['bases mapped (cigar)'], )) - return "\t".join(text) - -for fn in sys.argv[1:]: - if os.path.isfile(fn): - sys.stdout.write("#File name\tMapped reads\tMap %\tBases\tBases %\tAvg read length\tMax read length\tidentity\n") - sys.stdout.write("%s\t%s\n"%(fn, bam2stats(fn))) - -''' -CHK 4691e107 9942d94c cd9ffd51 -# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part. -SN raw total sequences: 4000 -SN filtered sequences: 0 -SN sequences: 4000 -SN is sorted: 1 -SN 1st fragments: 4000 -SN last fragments: 0 -SN reads mapped: 1440 -SN reads mapped and paired: 0 # paired-end technology bit set + both mates mapped -SN reads unmapped: 2560 -SN reads properly paired: 0 # proper-pair bit set -SN reads paired: 0 # paired-end technology bit set -SN reads duplicated: 0 # PCR or optical duplicate bit set -SN reads MQ0: 726 # mapped and MQ=0 -SN reads QC failed: 0 -SN non-primary alignments: 6801 -SN total length: 136941 # ignores clipping -SN bases mapped: 109284 # ignores clipping -SN bases mapped (cigar): 108908 # more accurate -SN bases trimmed: 0 -SN bases duplicated: 0 -SN mismatches: 14898 # from NM fields -SN error rate: 1.367944e-01 # mismatches / bases mapped (cigar) -SN average length: 34 -SN maximum length: 401 -SN average quality: 3.5 -SN insert size average: 0.0 -SN insert size standard deviation: 0.0 -SN inward oriented pairs: 0 -SN outward oriented pairs: 0 -SN pairs with other orientation: 0 -SN pairs on different chromosomes: 0 - -''' - diff --git a/NanoPreprocessSimple/bin/cmd_line_deeplexicon_caller_2019_09_12.py b/NanoPreprocessSimple/bin/cmd_line_deeplexicon_caller_2019_09_12.py deleted file mode 100755 index 9e527c6..0000000 --- a/NanoPreprocessSimple/bin/cmd_line_deeplexicon_caller_2019_09_12.py +++ /dev/null @@ -1,677 +0,0 @@ -#!/usr/bin/env python3 -# coding: utf-8 - -import warnings -warnings.filterwarnings("ignore", category=FutureWarning) -import argparse -import sys -# from __future__ import print_function -import os -from copy import deepcopy -import re -import csv -import time -import configparser -import h5py -import traceback -import math -import numpy as np -# from PIL import Image -import pyts -from pyts.image import MarkovTransitionField, GramianAngularField, RecurrencePlot -import tensorflow as tf -import keras -from keras.layers import Dense, Conv2D, BatchNormalization, Activation -from keras.layers import AveragePooling2D, Input, Flatten -from keras.optimizers import Adam -from 
keras.callbacks import ModelCheckpoint, LearningRateScheduler -from keras.callbacks import ReduceLROnPlateau -from keras.preprocessing.image import ImageDataGenerator -from keras.utils import multi_gpu_model -from keras.regularizers import l2 -from keras import backend as K -from keras.models import Model -import pandas as pd -from sklearn import datasets, linear_model -from sklearn.model_selection import train_test_split -from tensorflow.python.client import device_lib -from keras.models import load_model - -# import matplotlib.pyplot as plt - - -''' - - James M. Ferguson (j.ferguson@garvan.org.au) - Genomic Technologies - Garvan Institute - Copyright 2019 - - Tansel Ersevas .... - - script description - - ---------------------------------------------------------------------------- - version 0.0 - initial - - So a cutoff of: 0.4958776 for high accuracy - and another of 0.2943664 for high recovery - - TODO: - - - - ---------------------------------------------------------------------------- - MIT License - - Copyright (c) 2019 -------NAME---------- - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all - copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. 
-''' - - - - -class MyParser(argparse.ArgumentParser): - def error(self, message): - sys.stderr.write('error: %s\n' % message) - self.print_help() - sys.exit(2) - -def print_verbose(message): - '''verbose printing''' - sys.stderr.write('info: %s\n' % message) - -def print_err(message): - '''error printing''' - sys.stderr.write('error: %s\n' % message) - -def read_config(filename): - config = configparser.ConfigParser() - config.read(filename) - return(config) - -def _get_available_devices(): - local_device_protos = device_lib.list_local_devices() - return [x.name for x in local_device_protos] - -def _check_available_devices(): - available_devices = _get_available_devices() - print_verbose(available_devices) - # Make sure requested GPUs are available or at least warn if they aren't - return(TRUE) - -def read_model(model_name): - # model = load_model('saved_models/' + model_name) - model = load_model(model_name) # as a path - model.compile(loss='categorical_crossentropy', - optimizer=Adam(), - metrics=['accuracy']) - return(model) - -squiggle_max = 1199 -squiggle_min = 1 -input_cut = 72000 #potenitall need to be configurable -image_size = 224 -num_classes = 4 -window = 2000 - -def main(): - ''' - Main function - ''' - VERSION = "0.9.0" - - parser = MyParser( - description="DeePlexiCon - Demultiplex direct RNA reads") - #group = parser.add_mutually_exclusive_group() - parser.add_argument("-p", "--path", - help="Top path of fast5 files to dmux") - parser.add_argument("-t", "--type", default="multi", choices=["multi", "single"], - help="Multi or single fast5s") - parser.add_argument("-c", "--config", - help="config file") - # parser.add_argument("-g", "--gpu_list", default=1 - # help="list of gpus, 1, or [1,3,5], etc. of PCI_BUS_ID order") - # parser.add_argument("-o", "--output", - # help="Output directory") - # parser.add_argument("-s", "--threshold", type=float, default=0.7, - # help="confidence interval threshold") - parser.add_argument("-s", "--threshold", type=float, default=0.50, - help="probability threshold - 0.5 hi accuracy / 0.3 hi recovery") - # populate choices with models found in saved_models/ - # parser.add_argument("-m", "--model", default="4_bc_normal.h5", choices=["4_bc_normal.h5", "model2"], - parser.add_argument("-m", "--model", - help="Trained model name to use") - parser.add_argument("-b", "--batch_size", type=int, default=4000, - help="batch size - for single fast5s") - parser.add_argument("-V", "--version", - help="Prints version") - parser.add_argument("-v", "--verbose", action="store_true", - help="Verbose output") - - args = parser.parse_args() - # print help if no arguments given - if len(sys.argv) == 1: - parser.print_help(sys.stderr) - sys.exit(1) - - if args.verbose: - print_verbose("Verbose mode active - dumping info to stderr") - print_verbose("DeePlexiCon: {}".format(VERSION)) - print_verbose("arg list: {}".format(args)) - if tf.test.gpu_device_name(): - print_verbose('Default GPU Device: {}'.format(tf.test.gpu_device_name())) - else: - print_verbose("Please install GPU version of TF") - - - # Globals - if args.config: - config = read_config(args.config) #TODO check config read error - - squiggle_max = 1199 - squiggle_min = 1 - input_cut = 72000 #potenitall need to be configurable - image_size = 224 - num_classes = 4 - window = 2000 - # Devices - # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - - # os.environ["CUDA_VISIBLE_DEVICES"] = config[deeplexicon][gpu_list] if args.config else args.gpu_list - - # do check devices are available, else throw and error - 
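That pending device check could look like the following sketch, reusing the device_lib helper already imported at the top of this script (the required-type default and the warn-instead-of-exit behaviour are assumptions, since the TODO does not specify them):

import sys
from tensorflow.python.client import device_lib

def check_available_devices(required='GPU'):
    # list local devices (as in _get_available_devices above) and warn
    # when no device of the required type is present
    types = [d.device_type for d in device_lib.list_local_devices()]
    if required not in types:
        sys.stderr.write('warning: no {} device available\n'.format(required))
        return False
    return True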
-
-
-    # main logic
-
-    # read model
-
-    #TODO Check model errors
-    model = read_model(config['deeplexicon']['trained_model']) if args.config else read_model(args.model)
-    labels = []
-    images = []
-    fast5s = {}
-    stats = ""
-    print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format("fast5", "ReadID", "Barcode", "Confidence Interval", "P_bc_1", "P_bc_2", "P_bc_3", "P_bc_4"))
-    # for file in input...
-    for dirpath, dirnames, files in os.walk(args.path):
-        for fast5 in files:
-            if fast5.endswith('.fast5'):
-                fast5_file = os.path.join(dirpath, fast5)
-                if args.type == "single":
-                    # everything below this, send off in batches of N=args.batch_size
-                    # The signal extraction and segmentation can happen in the first step
-                    # read fast5 files
-                    readID, seg_signal = get_single_fast5_signal(fast5_file, window)
-                    # segment
-                    # array[name, signal.....]
-                    # seg_signal = dRNA_segmenter(signal)
-                    if not seg_signal:
-                        print_err("Segment not found for:\t{}\t{}".format(fast5_file, readID))
-                        continue
-                    # convert
-                    sig = np.array(seg_signal, dtype=float)
-                    # print(sig)
-                    img = convert_to_image(sig)
-                    # print(img)
-                    labels.append(readID) # read ID?
-                    # print_verbose(labels)
-                    fast5s[readID] = fast5
-                    # print_verbose(readID)
-                    # print_verbose(fast5)
-                    # print_verbose(fast5s)
-                    images.append(img)
-                    # sys.exit()
-                    # classify
-                    if len(labels) >= args.batch_size:
-                        C = classify(model, labels, np.array(images), False, args.threshold)
-                        # out = classify(model, readID, np.array([img]), False, 0.0)
-                        # print_verbose(C)
-                        # save to output
-                        for readID, out, c, P in C:
-                            prob = [round(float(i), 6) for i in P]
-                            cm = round(float(c), 4)
-                            if args.verbose:
-                                print_verbose("cm is: {}".format(cm))
-                            if out == 0:
-                                print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_1", cm, prob[0], prob[1], prob[2], prob[3]))
-                            elif out == 1:
-                                print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_2", cm, prob[0], prob[1], prob[2], prob[3]))
-                            elif out == 2:
-                                print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_3", cm, prob[0], prob[1], prob[2], prob[3]))
-                            elif out == 3:
-                                print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_4", cm, prob[0], prob[1], prob[2], prob[3]))
-                            else:
-                                print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "unknown", cm, prob[0], prob[1], prob[2], prob[3]))
-                        labels = []
-                        images = []
-                        fast5s = {}
-
-
-                elif args.type == "multi":
-                    # everything below this, send off in batches of N=args.batch_size
-                    # The signal extraction and segmentation can happen in the first step
-                    # read fast5 files
-                    seg_signal = get_multi_fast5_signal(fast5_file, window)
-                    # segment
-                    # array[name, signal.....]
-                    # seg_signal = dRNA_segmenter(signal)
-                    # print_verbose(list(seg_signal.keys()))
-                    sig_count = 0
-                    for readID in seg_signal:
-                        # convert
-                        img = convert_to_image(np.array(seg_signal[readID], dtype=float))
-                        labels.append(readID) #change to readID?
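                        # (the repeated 5-way if/elif chains that turn `out` into a
                        # barcode string in this file are collapsed into a lookup
                        # table in the v1.0.0 copy further down in this diff:
                        # barcode_out = {0: "bc_1", ..., None: "unknown"},
                        # printed as barcode_out[out])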
-                        images.append(img)
-                        fast5s[readID] = fast5
-                        sig_count += 1
-                        if len(labels) >= args.batch_size:
-                            C = classify(model, labels, np.array(images), False, args.threshold)
-                            # out = classify(model, readID, np.array([img]), False, 0.0)
-                            # print_verbose(C)
-                            # save to output
-                            for readID, out, c, P in C:
-                                prob = [round(float(i), 6) for i in P]
-                                cm = round(float(c), 4)
-                                if args.verbose:
-                                    print_verbose("cm is: {}".format(cm))
-                                if out == 0:
-                                    print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_1", cm, prob[0], prob[1], prob[2], prob[3]))
-                                elif out == 1:
-                                    print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_2", cm, prob[0], prob[1], prob[2], prob[3]))
-                                elif out == 2:
-                                    print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_3", cm, prob[0], prob[1], prob[2], prob[3]))
-                                elif out == 3:
-                                    print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_4", cm, prob[0], prob[1], prob[2], prob[3]))
-                                else:
-                                    print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "unknown", cm, prob[0], prob[1], prob[2], prob[3]))
-                            labels = []
-                            images = []
-                            fast5s = {}
-                        elif args.verbose:
-                            print_verbose("analysing sig_count: {}/{}".format(sig_count, len(seg_signal)))
-                        else:
-                            blah = 0
-    #finish up
-    C = classify(model, labels, np.array(images), False, args.threshold)
-    # out = classify(model, readID, np.array([img]), False, 0.0)
-    # print_verbose(C)
-    # save to output
-    for readID, out, c, P in C:
-        prob = [round(float(i), 6) for i in P]
-        cm = round(float(c), 4)
-        if args.verbose:
-            print_verbose("cm is: {}".format(cm))
-        if out == 0:
-            print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_1", cm, prob[0], prob[1], prob[2], prob[3]))
-        elif out == 1:
-            print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_2", cm, prob[0], prob[1], prob[2], prob[3]))
-        elif out == 2:
-            print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_3", cm, prob[0], prob[1], prob[2], prob[3]))
-        elif out == 3:
-            print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_4", cm, prob[0], prob[1], prob[2], prob[3]))
-        else:
-            print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "unknown", cm, prob[0], prob[1], prob[2], prob[3]))
-    labels = []
-    images = []
-    fast5s = {}
-
-
-
-
-
-
-
-    # final report/stats
-    # print stats
-
-# file handling and segmentation
-
-def get_single_fast5_signal(read_filename, w):
-    '''
-    open single fast5 file and extract information
-    '''
-
-    # get readID and signal
-    f5_dic = read_single_fast5(read_filename)
-    if not f5_dic:
-        print_err("Signal not extracted from: {}".format(read_filename))
-        return 0, 0
-    # segment on raw
-    readID = f5_dic['readID']
-    signal = f5_dic['signal']
-    seg = dRNA_segmenter(readID, signal, w)
-    if not seg:
-        print_verbose("No segment found - skipping: {}".format(readID))
-        return 0, 0
-    # convert to pA
-    pA_signal = convert_to_pA(f5_dic)
-    # return signal/signals
-    return readID, pA_signal[seg[0]:seg[1]]
-
-
-def get_multi_fast5_signal(read_filename, w):
-    '''
-    open multi fast5 files and extract information
-    '''
-    pA_signals = {}
-    f5_dic = read_multi_fast5(read_filename)
-    # print_verbose(list(f5_dic.keys()))
-    seg = 0
-    sig_count = 0
-    for read in f5_dic:
-        sig_count += 1
-        print_verbose("reading sig_count: {}/{}".format(sig_count, len(f5_dic)))
-        # get readID and signal
-        # print_verbose("read: {}".format(read))
-        readID = f5_dic[read]['readID']
-        # print_verbose("readID: {}".format(readID))
-        signal =
f5_dic[read]['signal'] - # print_verbose("sig: {}".format(signal[:5])) - # continue - - - - # segment on raw - seg = dRNA_segmenter(readID, signal, w) - # print_verbose("seg return: {}".format(seg)) - if not seg: - # print_verbose("No segment found - skipping: {}".format(readID)) - seg = 0 - continue - # convert to pA - pA_signal = convert_to_pA(f5_dic[read]) - # print_verbose("pA: {}".format(pA_signal[:5])) - pA_signals[readID] = pA_signal[seg[0]:seg[1]] - # return signal/signals - return pA_signals - - -def read_single_fast5(filename): - ''' - read single fast5 file and return data - ''' - f5_dic = {'signal': [], 'readID': '', 'digitisation': 0.0, - 'offset': 0.0, 'range': 0.0, 'sampling_rate': 0.0} - - # open fast5 file - try: - hdf = h5py.File(filename, 'r') - except: - traceback.print_exc() - print_err("extract_fast5():fast5 file failed to open: {}".format(filename)) - f5_dic = {} - return f5_dic - try: - c = list(hdf['Raw/Reads'].keys()) - for col in hdf['Raw/Reads/'][c[0]]['Signal'][()]: - f5_dic['signal'].append(int(col)) - - - f5_dic['readID'] = hdf['Raw/Reads/'][c[0]].attrs['read_id'].decode() - f5_dic['digitisation'] = hdf['UniqueGlobalKey/channel_id'].attrs['digitisation'] - f5_dic['offset'] = hdf['UniqueGlobalKey/channel_id'].attrs['offset'] - f5_dic['range'] = float("{0:.2f}".format(hdf['UniqueGlobalKey/channel_id'].attrs['range'])) - f5_dic['sampling_rate'] = hdf['UniqueGlobalKey/channel_id'].attrs['sampling_rate'] - - except: - traceback.print_exc() - print_err("extract_fast5():failed to extract events or fastq from: {}".format(filename)) - f5_dic = {} - - return f5_dic - - -def read_multi_fast5(filename): - ''' - read multifast5 file and return data - ''' - f5_dic = {} - # c = 0 - with h5py.File(filename, 'r') as hdf: - for read in list(hdf.keys()): - # c += 1 - # if c > 100: - # break - f5_dic[read] = {'signal': [], 'readID': '', 'digitisation': 0.0, - 'offset': 0.0, 'range': 0.0, 'sampling_rate': 0.0} - try: - for col in hdf[read]['Raw/Signal'][()]: - f5_dic[read]['signal'].append(int(col)) - - f5_dic[read]['readID'] = hdf[read]['Raw'].attrs['read_id'].decode() - f5_dic[read]['digitisation'] = hdf[read]['channel_id'].attrs['digitisation'] - f5_dic[read]['offset'] = hdf[read]['channel_id'].attrs['offset'] - f5_dic[read]['range'] = float("{0:.2f}".format(hdf[read]['channel_id'].attrs['range'])) - f5_dic[read]['sampling_rate'] = hdf[read]['channel_id'].attrs['sampling_rate'] - except: - traceback.print_exc() - print_err("extract_fast5():failed to read readID: {}".format(read)) - return f5_dic - - -def dRNA_segmenter(readID, signal, w): - ''' - segment signal/s and return coords of cuts - ''' - def _scale_outliers(squig): - ''' Scale outliers to within m stdevs of median ''' - k = (squig > 0) & (squig < 1200) - return squig[k] - - - sig = _scale_outliers(np.array(signal, dtype=int)) - - s = pd.Series(sig) - t = s.rolling(window=w).mean() - # This should be done better, or changed to median and benchmarked - # Currently trained on mean segmented data - mn = t.mean() - std = t.std() - # Trained on 0.5 - bot = mn - (std*0.5) - - - - # main algo - - begin = False - # max distance for merging 2 segs - seg_dist = 1500 - # max length of a seg - hi_thresh = 200000 - # min length of a seg - lo_thresh = 2000 - - start = 0 - end = 0 - segs = [] - count = -1 - for i in t: - count += 1 - if i < bot and not begin: - start = count - begin = True - elif i < bot: - end = count - elif i > bot and begin: - if segs and start - segs[-1][1] < seg_dist: - segs[-1][1] = end - else: - 
                segs.append([start, end])
-            start = 0
-            end = 0
-            begin = False
-        else:
-            continue
-
-    offset = -1050
-    buff = 150
-
-    # fig = plt.figure(1)
-    # #fig.subplots_adjust(hspace=0.1, wspace=0.01)
-    # ax = fig.add_subplot(111)
-    #
-    # print_verbose("segs: {}".format(segs))
-    # # # Show segment lines
-    # for i, j in segs:
-    #     ax.axvline(x=i+offset-buff, color='m')
-    #     ax.axvline(x=j+offset+buff, color='m')
-    #
-    # ax.axhline(y=bot, color='r')
-    # plt.plot(sig, color='teal')
-    # plt.plot(t, color='k')
-    # plt.show()
-    # plt.clf()
-
-    x, y = 0, 0
-
-    # print_verbose("segs before filter: {}".format(segs))
-    for a, b in segs:
-        if b - a > hi_thresh:
-            continue
-        if b - a < lo_thresh:
-            continue
-        x, y = a, b
-        # print "{}\t{}\t{}\t{}".format(f5, read_dic[f5], x, y)
-        # print_verbose("xy before return: [{},{}]".format(x, y))
-        return [x+offset-buff, y+offset+buff]
-        break
-    print_verbose("dRNA_segmenter: no seg found: {}".format(readID))
-    return 0
-
-
-def convert_to_pA(d):
-    '''
-    convert raw signal data to pA using digitisation, offset, and range
-    float raw_unit = range / digitisation;
-    for (int32_t j = 0; j < nsample; j++) {
-        rawptr[j] = (rawptr[j] + offset) * raw_unit;
-    }
-    '''
-    digitisation = d['digitisation']
-    range = d['range']
-    offset = d['offset']
-    raw_unit = range / digitisation
-    new_raw = []
-    for i in d['signal']:
-        j = (i + offset) * raw_unit
-        new_raw.append("{0:.2f}".format(round(j,2)))
-    return new_raw
-
-# Python timeseries interface
-
-
-def pyts_transform(transform, data, image_size, show=False, cmap='rainbow', img_index=0):
-    try:
-        t_start=time.time()
-        X_transform = transform.fit_transform(data)
-        # if args.verbose:
-        #     print_verbose("{} {}".format(time.time() - t_start, 'seconds'))
-        if (show):
-            plt.figure(figsize=(4, 4))
-            plt.grid(b=None)
-            plt.imshow(X_transform[0], cmap=cmap, origin='lower')
-            plt.savefig(transform.__class__.__name__ + "_image_" + str(img_index) + ".svg", format="svg")
-            plt.show()
-        return(X_transform)
-    except Exception as e:
-        print_err(str(e))
-        return([])
-
-
-def mtf_transform(data, image_size=500, show=False, img_index=0):
-    transform = MarkovTransitionField(image_size)
-    return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='rainbow', img_index=img_index))
-
-def rp_transform(data, image_size=500 ,show=False ,img_index=0):
-    # RP transformation
-    transform = RecurrencePlot(dimension=1,
-                               threshold='percentage_points',
-                               percentage=30)
-    return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='binary', img_index=img_index))
-
-def gasf_transform(data, image_size=500, show=False, img_index=0):
-    # GAF transformation
-    transform = GramianAngularField(image_size, method='summation')
-    return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='rainbow', img_index=img_index))
-
-def gadf_transform(data, image_size=500, show=False ,img_index=0):
-    # GAF transformation
-    transform = GramianAngularField(image_size, method='difference')
-    return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='rainbow', img_index=img_index))
-
-
-def labels_for(a_file_name):
-    segments=re.split(r'[_\-\.]+', a_file_name)
-    return(segments)
-
-def max_in_sequence(sequence):
-    return(max(np.amax([list(d.values()) for d in sequence]), 0.01))
-
-def compress_squiggle(squiggle, compress_factor):
-    squiggle_len = len(squiggle)
-    rem = squiggle_len % compress_factor
-    if rem > 0:
-        return(np.mean(squiggle[0:squiggle_len - rem].reshape(-1,compress_factor), axis=1))
-
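#   note: compress_squiggle mean-pools the signal in blocks of compress_factor,
#   dropping the remainder; e.g. with compress_factor=2, [1, 2, 3, 4, 5] keeps
#   [1, 2, 3, 4] and becomes [1.5, 3.5]. An evenly divisible squiggle falls
#   through to the plain return below uncompressed, which looks unintended,
#   though the function appears unused in this script.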
return(squiggle) - -def convert_to_image(signal): - transformed_squiggle = gasf_transform(signal.reshape(1,-1), image_size=image_size, show=False) - return(transformed_squiggle) - - -def confidence_margin(npa): - sorted = np.sort(npa)[::-1] #return sort in reverse, i.e. descending - # sorted = np.sort(npa) #return sort in reverse, i.e. descending - d = sorted[0] - sorted[1] - # if args.verbose: - # print_verbose("Sorted: {}".format(sorted)) - # print_verbose("conf interval: {}".format(d)) - # return(sorted[0][0] - sorted[0][1]) - return(d) - -def classify(model, labels, image, subtract_pixel_mean, threshold): - input_shape = image.shape[1:] - x = image.astype('float32') / 255 - - # If subtract pixel mean is enabled - if subtract_pixel_mean: - x_mean = np.mean(x, axis=0) - x -= x_mean - x=[x] - # print(x) - y = model.predict(x, verbose=0) - # print(y) - res = [] - for i in range(len(y)): - cm = confidence_margin(y[i]) - #print_verbose(y[i][np.argmax(y[i])]) - if y[i][np.argmax(y[i])] >= threshold: - # return(np.argmax(y)) - res.append([labels[i], np.argmax(y[i]), cm, y[i]]) - # return(None) - # return(np.argmax(y)) - else: - res.append([labels[i], None, cm, y[i]]) - return res - -if __name__ == '__main__': - main() diff --git a/NanoPreprocessSimple/bin/deeplexicon.py b/NanoPreprocessSimple/bin/deeplexicon.py deleted file mode 100755 index c58b3b6..0000000 --- a/NanoPreprocessSimple/bin/deeplexicon.py +++ /dev/null @@ -1,622 +0,0 @@ -#!/usr/bin/env python3 -# coding: utf-8 - -import warnings -warnings.filterwarnings("ignore", category=FutureWarning) -import argparse -import sys -import os -from copy import deepcopy -import re -import csv -import time -import configparser -import h5py -import traceback -import math -import numpy as np -# from PIL import Image -import pyts -from pyts.image import MarkovTransitionField, GramianAngularField, RecurrencePlot -import tensorflow as tf -import keras -from keras.layers import Dense, Conv2D, BatchNormalization, Activation -from keras.layers import AveragePooling2D, Input, Flatten -from keras.optimizers import Adam -from keras.callbacks import ModelCheckpoint, LearningRateScheduler -from keras.callbacks import ReduceLROnPlateau -from keras.preprocessing.image import ImageDataGenerator -from keras.utils import multi_gpu_model -from keras.regularizers import l2 -from keras import backend as K -from keras.models import Model -import pandas as pd -from sklearn import datasets, linear_model -from sklearn.model_selection import train_test_split -from tensorflow.python.client import device_lib -from keras.models import load_model - - - -''' - - James M. 
Ferguson (j.ferguson@garvan.org.au) - Genomic Technologies - Garvan Institute - Copyright 2019 - - Tansel Ersevas (t.ersevas@garvan.org.au) - - script description - - - - ---------------------------------------------------------------------------- - version 0.0.0 - initial - version 0.8.0 - CPU version Done - version 0.9.0 - Fixed segment offset - version 0.9.1 - added segment and squiggle output - version 0.9.2 - separate segment output and code clean up - version 1.0.0 - initial release - - So a cutoff of: 0.4958776 for high accuracy - and another of 0.2943664 for high recovery - - TODO: - - Remove leftover libraries - - remove debug plots - - Remove redundant code - - create log files with information - - take in fastq for dmux splitting - - take in paf or bam for training splitting - - - ---------------------------------------------------------------------------- - MIT License - - Copyright (c) 2019 James M. Ferguson - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all - copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. 
-''' - - - - -class MyParser(argparse.ArgumentParser): - def error(self, message): - sys.stderr.write('error: %s\n' % message) - self.print_help() - sys.exit(2) - -def print_verbose(message): - '''verbose printing''' - sys.stderr.write('info: %s\n' % message) - -def print_err(message): - '''error printing''' - sys.stderr.write('error: %s\n' % message) - -def read_config(filename): - config = configparser.ConfigParser() - config.read(filename) - return(config) - -def _get_available_devices(): - local_device_protos = device_lib.list_local_devices() - return [x.name for x in local_device_protos] - -def _check_available_devices(): - available_devices = _get_available_devices() - print_verbose(available_devices) - # Make sure requested GPUs are available or at least warn if they aren't - return(TRUE) - -def read_model(model_name): - # model = load_model('saved_models/' + model_name) - model = load_model(model_name) # as a path - model.compile(loss='categorical_crossentropy', - optimizer=Adam(), - metrics=['accuracy']) - return(model) - -squiggle_max = 1199 -squiggle_min = 1 -input_cut = 72000 #potenitall need to be configurable -image_size = 224 -num_classes = 4 -window = 2000 - -def main(): - ''' - Main function - ''' - VERSION = "1.0.0" - - parser = MyParser( - description="DeePlexiCon - Demultiplex direct RNA reads") - #group = parser.add_mutually_exclusive_group() - parser.add_argument("-p", "--path", - help="Top path of fast5 files to dmux") - parser.add_argument("-f", "--form", default="multi", choices=["multi", "single"], - help="Multi or single fast5s") - parser.add_argument("-c", "--config", - help="config file") - # parser.add_argument("-g", "--gpu_list", default=1 - # help="list of gpus, 1, or [1,3,5], etc. of PCI_BUS_ID order") - # parser.add_argument("-o", "--output", - # help="Output directory") - parser.add_argument("-s", "--threshold", type=float, default=0.50, - help="probability threshold - 0.5 hi accuracy / 0.3 hi recovery") - # populate choices with models found in saved_models/ - # parser.add_argument("-m", "--model", default="4_bc_normal.h5", choices=["4_bc_normal.h5", "model2"], - parser.add_argument("-m", "--model", - help="Trained model name to use") - parser.add_argument("--squiggle", - help="dump squiggle data into this .tsv file") - parser.add_argument("--segment", - help="dump segment data into this .tsv file") - parser.add_argument("-b", "--batch_size", type=int, default=4000, - help="batch size - for single fast5s") - parser.add_argument("-V", "--version", - help="Prints version") - parser.add_argument("-v", "--verbose", action="store_true", - help="Verbose output") - - args = parser.parse_args() - # print help if no arguments given - if len(sys.argv) == 1: - parser.print_help(sys.stderr) - sys.exit(1) - - if args.verbose: - print_verbose("Verbose mode active - dumping info to stderr") - print_verbose("DeePlexiCon: {}".format(VERSION)) - print_verbose("arg list: {}".format(args)) - if tf.test.gpu_device_name(): - print_verbose('Default GPU Device: {}'.format(tf.test.gpu_device_name())) - else: - print_verbose("Please install GPU version of TF") - - if args.squiggle: - squig_file = args.squiggle - with open(squig_file, 'a') as f: - f.write("{}\t{}\n".format("ReadID", "signal_pA")) - else: - squig_file = '' - - if args.segment: - seg_file = args.segment - with open(seg_file, 'a') as f: - f.write("{}\t{}\t{}\n".format("ReadID", "start", "stop")) - else: - seg_file = '' - - # Globals - if args.config: - config = read_config(args.config) #TODO check config read error - 
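    # the TODO above is real: configparser.read() silently returns an empty parser
    # for a missing file; a guarded sketch (the 'deeplexicon'/'trained_model'
    # section and key are the ones this script reads further down):
    #
    # def read_config_checked(filename):
    #     config = configparser.ConfigParser()
    #     if not config.read(filename):  # read() returns the list of files parsed
    #         print_err("config file not found or unreadable: {}".format(filename))
    #         sys.exit(1)
    #     if not config.has_option('deeplexicon', 'trained_model'):
    #         print_err("config is missing [deeplexicon] -> trained_model")
    #         sys.exit(1)
    #     return config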
-
-    # gpu settings
-    # Devices
-    # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-
-    # os.environ["CUDA_VISIBLE_DEVICES"] = config['deeplexicon']['gpu_list'] if args.config else args.gpu_list
-
-    # do check devices are available, else throw an error
-
-
-    # main logic
-
-    # read model
-
-    model = read_model(config['deeplexicon']['trained_model']) if args.config else read_model(args.model)
-    barcode_out = {0: "bc_1",
-                   1: "bc_2",
-                   2: "bc_3",
-                   3: "bc_4",
-                   None: "unknown"
-                  }
-    labels = []
-    images = []
-    fast5s = {}
-    stats = ""
-    seg_dic = {}
-    print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format("fast5", "ReadID", "Barcode", "Confidence Interval", "P_bc_1", "P_bc_2", "P_bc_3", "P_bc_4"))
-    # for file in input...
-    for dirpath, dirnames, files in os.walk(args.path):
-        for fast5 in files:
-            if fast5.endswith('.fast5'):
-                fast5_file = os.path.join(dirpath, fast5)
-                if args.form == "single":
-                    # everything below this, send off in batches of N=args.batch_size
-                    # The signal extraction and segmentation can happen in the first step
-                    # read fast5 files
-                    readID, seg_signal = get_single_fast5_signal(fast5_file, window, squig_file, seg_file)
-                    if not seg_signal:
-                        print_err("Segment not found for:\t{}\t{}".format(fast5_file, readID))
-                        continue
-                    # convert
-                    sig = np.array(seg_signal, dtype=float)
-                    img = convert_to_image(sig)
-                    labels.append(readID)
-                    fast5s[readID] = fast5
-                    images.append(img)
-                    # classify
-                    if len(labels) >= args.batch_size:
-                        C = classify(model, labels, np.array(images), False, args.threshold)
-                        # save to output
-                        for readID, out, c, P in C:
-                            prob = [round(float(i), 6) for i in P]
-                            cm = round(float(c), 4)
-                            if args.verbose:
-                                print_verbose("cm is: {}".format(cm))
-                            print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, barcode_out[out], cm, prob[0], prob[1], prob[2], prob[3]))
-                        labels = []
-                        images = []
-                        fast5s = {}
-
-
-                elif args.form == "multi":
-                    # everything below this, send off in batches of N=args.batch_size
-                    # The signal extraction and segmentation can happen in the first step
-                    # read fast5 files
-                    seg_signal = get_multi_fast5_signal(fast5_file, window, squig_file, seg_file)
-                    sig_count = 0
-                    for readID in seg_signal:
-                        # convert
-                        img = convert_to_image(np.array(seg_signal[readID], dtype=float))
-                        labels.append(readID)
-                        images.append(img)
-                        fast5s[readID] = fast5
-                        sig_count += 1
-                        if len(labels) >= args.batch_size:
-                            C = classify(model, labels, np.array(images), False, args.threshold)
-                            # save to output
-                            for readID, out, c, P in C:
-                                prob = [round(float(i), 6) for i in P]
-                                cm = round(float(c), 4)
-                                if args.verbose:
-                                    print_verbose("cm is: {}".format(cm))
-                                print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, barcode_out[out], cm, prob[0], prob[1], prob[2], prob[3]))
-                            labels = []
-                            images = []
-                            fast5s = {}
-                        elif args.verbose:
-                            print_verbose("analysing sig_count: {}/{}".format(sig_count, len(seg_signal)))
-                        else:
-                            blah = 0 # clean
-    #finish up
-    C = classify(model, labels, np.array(images), False, args.threshold)
-    # save to output
-    for readID, out, c, P in C:
-        prob = [round(float(i), 6) for i in P]
-        cm = round(float(c), 4)
-        if args.verbose:
-            print_verbose("cm is: {}".format(cm))
-        print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, barcode_out[out], cm, prob[0], prob[1], prob[2], prob[3]))
-    labels = []
-    images = []
-    fast5s = {}
-
-
-    # final report/stats
-    # print stats
-
-# file handling and segmentation
-
-def get_single_fast5_signal(read_filename, w, squig_file, seg_file):
-    '''
-    open single fast5 file and
extract information - ''' - - # get readID and signal - f5_dic = read_single_fast5(read_filename) - if not f5_dic: - print_err("Signal not extracted from: {}".format(read_filename)) - return 0, 0 - # segment on raw - readID = f5_dic['readID'] - signal = f5_dic['signal'] - seg = dRNA_segmenter(readID, signal, w) - if not seg: - print_verbose("No segment found - skipping: {}".format(readID)) - return 0, 0 - # convert to pA - pA_signal = convert_to_pA(f5_dic) - if squig_file: - with open(squig_file, 'a') as f: - f.write("{}\t{}\n".format(readID, "\t".join(pA_signal))) - if seg_file: - with open(seg_file, 'a') as f: - f.write("{}\t{}\t{}\n".format(readID, seg[0], seg[1])) - # return signal/signals - return readID, pA_signal[seg[0]:seg[1]] - - -def get_multi_fast5_signal(read_filename, w, squig_file, seg_file): - ''' - open multi fast5 files and extract information - ''' - pA_signals = {} - seg_dic = {} - f5_dic = read_multi_fast5(read_filename) - seg = 0 - sig_count = 0 - for read in f5_dic: - sig_count += 1 - print_verbose("reading sig_count: {}/{}".format(sig_count, len(f5_dic))) - # get readID and signal - readID = f5_dic[read]['readID'] - signal = f5_dic[read]['signal'] - - # segment on raw - seg = dRNA_segmenter(readID, signal, w) - if not seg: - seg = 0 - continue - # convert to pA - pA_signal = convert_to_pA(f5_dic[read]) - if squig_file: - with open(squig_file, 'a') as f: - f.write("{}\t{}\n".format(readID, "\t".join(pA_signal))) - if seg_file: - with open(seg_file, 'a') as f: - f.write("{}\t{}\t{}\n".format(readID, seg[0], seg[1])) - pA_signals[readID] = pA_signal[seg[0]:seg[1]] - seg_dic[readID] = seg - # return signal/signals - return pA_signals - - -def read_single_fast5(filename): - ''' - read single fast5 file and return data - ''' - f5_dic = {'signal': [], 'readID': '', 'digitisation': 0.0, - 'offset': 0.0, 'range': 0.0, 'sampling_rate': 0.0} - - # open fast5 file - try: - hdf = h5py.File(filename, 'r') - except: - traceback.print_exc() - print_err("extract_fast5():fast5 file failed to open: {}".format(filename)) - f5_dic = {} - return f5_dic - try: - c = list(hdf['Raw/Reads'].keys()) - for col in hdf['Raw/Reads/'][c[0]]['Signal'][()]: - f5_dic['signal'].append(int(col)) - - f5_dic['readID'] = hdf['Raw/Reads/'][c[0]].attrs['read_id'].decode() - f5_dic['digitisation'] = hdf['UniqueGlobalKey/channel_id'].attrs['digitisation'] - f5_dic['offset'] = hdf['UniqueGlobalKey/channel_id'].attrs['offset'] - f5_dic['range'] = float("{0:.2f}".format(hdf['UniqueGlobalKey/channel_id'].attrs['range'])) - f5_dic['sampling_rate'] = hdf['UniqueGlobalKey/channel_id'].attrs['sampling_rate'] - - except: - traceback.print_exc() - print_err("extract_fast5():failed to extract events or fastq from: {}".format(filename)) - f5_dic = {} - - return f5_dic - - -def read_multi_fast5(filename): - ''' - read multifast5 file and return data - ''' - f5_dic = {} - with h5py.File(filename, 'r') as hdf: - for read in list(hdf.keys()): - f5_dic[read] = {'signal': [], 'readID': '', 'digitisation': 0.0, - 'offset': 0.0, 'range': 0.0, 'sampling_rate': 0.0} - try: - for col in hdf[read]['Raw/Signal'][()]: - f5_dic[read]['signal'].append(int(col)) - - f5_dic[read]['readID'] = hdf[read]['Raw'].attrs['read_id'].decode() - f5_dic[read]['digitisation'] = hdf[read]['channel_id'].attrs['digitisation'] - f5_dic[read]['offset'] = hdf[read]['channel_id'].attrs['offset'] - f5_dic[read]['range'] = float("{0:.2f}".format(hdf[read]['channel_id'].attrs['range'])) - f5_dic[read]['sampling_rate'] = 
hdf[read]['channel_id'].attrs['sampling_rate']
-            except:
-                traceback.print_exc()
-                print_err("extract_fast5():failed to read readID: {}".format(read))
-    return f5_dic
-
-
-def dRNA_segmenter(readID, signal, w):
-    '''
-    segment signal/s and return coords of cuts
-    '''
-    def _scale_outliers(squig):
-        ''' Scale outliers to within m stdevs of median '''
-        k = (squig > 0) & (squig < 1200)
-        return squig[k]
-
-
-    sig = _scale_outliers(np.array(signal, dtype=int))
-
-    s = pd.Series(sig)
-    t = s.rolling(window=w).mean()
-    # This should be done better, or changed to median and benchmarked
-    # Currently trained on mean segmented data
-    mn = t.mean()
-    std = t.std()
-    # Trained on 0.5
-    bot = mn - (std*0.5)
-
-    # main algo
-
-    begin = False
-    # max distance for merging 2 segs
-    seg_dist = 1500
-    # max length of a seg
-    hi_thresh = 200000
-    # min length of a seg
-    lo_thresh = 2000
-
-    start = 0
-    end = 0
-    segs = []
-    count = -1
-    for i in t:
-        count += 1
-        if i < bot and not begin:
-            start = count
-            begin = True
-        elif i < bot:
-            end = count
-        elif i > bot and begin:
-            if segs and start - segs[-1][1] < seg_dist:
-                segs[-1][1] = end
-            else:
-                segs.append([start, end])
-            start = 0
-            end = 0
-            begin = False
-        else:
-            continue
-
-    # offset = -1050
-    # buff = 150
-    offset = -1000
-    buff = 0
-
-    x, y = 0, 0
-
-    for a, b in segs:
-        if b - a > hi_thresh:
-            continue
-        if b - a < lo_thresh:
-            continue
-        x, y = a, b
-
-        # to be modified in next major re-training
-        return [x+offset-buff, y+offset+buff]
-        break
-    print_verbose("dRNA_segmenter: no seg found: {}".format(readID))
-    return 0
-
-
-def convert_to_pA(d):
-    '''
-    convert raw signal data to pA using digitisation, offset, and range
-    float raw_unit = range / digitisation;
-    for (int32_t j = 0; j < nsample; j++) {
-        rawptr[j] = (rawptr[j] + offset) * raw_unit;
-    }
-    '''
-    digitisation = d['digitisation']
-    range = d['range']
-    offset = d['offset']
-    raw_unit = range / digitisation
-    new_raw = []
-    for i in d['signal']:
-        j = (i + offset) * raw_unit
-        new_raw.append("{0:.2f}".format(round(j,2)))
-    return new_raw
-
-
-def pyts_transform(transform, data, image_size, show=False, cmap='rainbow', img_index=0):
-    try:
-        t_start=time.time()
-        X_transform = transform.fit_transform(data)
-        if (show):
-            plt.figure(figsize=(4, 4))
-            plt.grid(b=None)
-            plt.imshow(X_transform[0], cmap=cmap, origin='lower')
-            plt.savefig(transform.__class__.__name__ + "_image_" + str(img_index) + ".svg", format="svg")
-            plt.show()
-        return(X_transform)
-    except Exception as e:
-        print_err(str(e))
-        return([])
-
-
-def mtf_transform(data, image_size=500, show=False, img_index=0):
-    transform = MarkovTransitionField(image_size)
-    return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='rainbow', img_index=img_index))
-
-def rp_transform(data, image_size=500 ,show=False ,img_index=0):
-    # RP transformation
-    transform = RecurrencePlot(dimension=1,
-                               threshold='percentage_points',
-                               percentage=30)
-    return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='binary', img_index=img_index))
-
-def gasf_transform(data, image_size=500, show=False, img_index=0):
-    # GAF transformation
-    transform = GramianAngularField(image_size, method='summation')
-    return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='rainbow', img_index=img_index))
-
-def gadf_transform(data, image_size=500, show=False ,img_index=0):
-    # GAF transformation
-    transform = GramianAngularField(image_size, method='difference')
-    return(pyts_transform(transform,
data, image_size=image_size, show=show, cmap='rainbow', img_index=img_index)) - - -def labels_for(a_file_name): - segments=re.split(r'[_\-\.]+', a_file_name) - return(segments) - -def max_in_sequence(sequence): - return(max(np.amax([list(d.values()) for d in sequence]), 0.01)) - -def compress_squiggle(squiggle, compress_factor): - squiggle_len = len(squiggle) - rem = squiggle_len % compress_factor - if rem > 0: - return(np.mean(squiggle[0:squiggle_len - rem].reshape(-1,compress_factor), axis=1)) - return(squiggle) - -def convert_to_image(signal): - transformed_squiggle = gasf_transform(signal.reshape(1,-1), image_size=image_size, show=False) - return(transformed_squiggle) - - -def confidence_margin(npa): - sorted = np.sort(npa)[::-1] #return sort in reverse, i.e. descending - # sorted = np.sort(npa) #return sort in reverse, i.e. descending - d = sorted[0] - sorted[1] - return(d) - -def classify(model, labels, image, subtract_pixel_mean, threshold): - input_shape = image.shape[1:] - # x = image.astype('float32') / 255 - x = image.astype('float32') + 1 - x = x / 2 - - # If subtract pixel mean is enabled - if subtract_pixel_mean: - x_mean = np.mean(x, axis=0) - x -= x_mean - x=[x] - y = model.predict(x, verbose=0) - res = [] - for i in range(len(y)): - cm = confidence_margin(y[i]) - if y[i][np.argmax(y[i])] >= threshold: - res.append([labels[i], np.argmax(y[i]), cm, y[i]]) - else: - res.append([labels[i], None, cm, y[i]]) - return res - -if __name__ == '__main__': - main() diff --git a/NanoPreprocessSimple/bin/extract_sequence_from_fastq.py b/NanoPreprocessSimple/bin/extract_sequence_from_fastq.py deleted file mode 100755 index 27f8406..0000000 --- a/NanoPreprocessSimple/bin/extract_sequence_from_fastq.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python -import sys -import gzip -import pprint -import os - -usage = ''' -created by Huanle and Luca for Master of Pores! :) -python extract_sequence_from_fastq.py
-'''
-
-if len (sys.argv) < 3:
-    print (usage)
-    exit(1)
-
-def fopen (f):
-    if f.endswith('.gz'):
-        return (gzip.open(f,'rt'))
-    else:
-        return (open (f,'rt'))
-
-
-IDs = dict ()
-if True:
-    fh = fopen (sys.argv[1])
-    for l in fh:
-        ary = l.strip().split()
-        IDs[ary[1]] = ary[2]
-fh.close()
-
-outprefix = os.path.splitext(sys.argv[2])[0]
-outext = os.path.splitext(sys.argv[2])[1]
-if (outext == '.gz'):
-    outprefix = os.path.splitext(outprefix)[0]
-
-fh = fopen (sys.argv[2])
-for l in fh:
-    rd = l.strip().split()[0]
-    rd = rd.replace('@','')
-    seq = next(fh).strip()
-    inf = next (fh).strip()
-    q = next(fh).strip()
-    if rd in IDs:
-        outfile = outprefix + "." + IDs[rd] + ".fastq"
-        fw = open(outfile,"a+")
-        rd = '@'+rd
-        string = "{0}\n{1}\n{2}\n{3}\n".format(rd,seq,inf,q)
-        fw.write(string)
-        #print (string)
-fh.close()
diff --git a/NanoPreprocessSimple/bin/fast5_to_fastq.py b/NanoPreprocessSimple/bin/fast5_to_fastq.py
deleted file mode 100755
index a80ce10..0000000
--- a/NanoPreprocessSimple/bin/fast5_to_fastq.py
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/usr/bin/env python
-import sys, warnings,os
-import numpy as np
-import re
-import gzip
-import h5py, ont_fast5_api
-from ont_fast5_api.fast5_interface import get_fast5_file
-
-usage = '''
-_USAGE_: python fast5_to_fastq.py fast5file
-_INPUT_: both single- and multi- fast5 files can be processed
-_BUGS_: report to elzedliu@gmail.com
-'''
-
-if len(sys.argv) < 2:
-    print (usage); exit(0)
-
-f5file = sys.argv[1]
-hdf5 = h5py.File(f5file,'r')
-f5 = get_fast5_file (f5file,mode='r')
-reads_in_f5 = f5.get_read_ids()
-number_of_reads = len( reads_in_f5)
-
-
-fq_out = open ('.'.join(f5file.split('.')[:-1])+'.fastq','w')
-
-def fastq_from_fast5 (h5f):
-    '''
-    for single fast5 file, it is hdf5 file open with h5py
-    for multi- fast5 file, it is hdf5['read_id'] for single read
-    '''
-    k = 'Analyses/Basecall_1D_000/BaseCalled_template/Fastq'
-    if k in h5f:
-        fastq = h5f[k][()].decode()  # h5py 3.x removed Dataset.value
-        return fastq
-    else:
-        return None
-
-if number_of_reads == 1:
-    fq = fastq_from_fast5 (hdf5)
-    if fq :
-        fq_out.write (fq)
-    else:
-        sys.stderr.write("no sequence basecalled for {} \n".format(f5file))
-elif number_of_reads > 1:
-    for rd in hdf5.keys():
-        fq = fastq_from_fast5 (hdf5[rd])
-        if fq:
-            fq = fq.replace('_Basecall_1D_template','')
-            fq_out.write(fq)
-        else:
-            sys.stderr.write("no basecall for read {} \n" .format(rd))
-fq_out.close()
diff --git a/NanoPreprocessSimple/bin/fast5_type.py b/NanoPreprocessSimple/bin/fast5_type.py
deleted file mode 100755
index 626c1ed..0000000
--- a/NanoPreprocessSimple/bin/fast5_type.py
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/env python
-import sys, warnings,os
-import h5py, ont_fast5_api
-from ont_fast5_api.fast5_interface import get_fast5_file
-
-__author__ = 'Huanle.Liu@crg.eu'
-__version__ = '0.1'
-__email__ = 'same as author'
-
-usage = '''
-    python fast5_type.py fast5file
-    return:
-        0: single read fast5
-        1: multi-reads fast5
-'''
-
-if len (sys.argv) < 2:
-    print (usage)
-    exit(0)
-
-f5file = sys.argv[1]
-hdf5 = h5py.File(f5file,'r')
-f5 = get_fast5_file (f5file,mode='r')
-reads_in_f5 = f5.get_read_ids()
-number_of_reads = len( reads_in_f5)
-if number_of_reads == 1:
-    print (0)
-elif number_of_reads > 1:
-    print (1)
diff --git a/NanoPreprocessSimple/bin/reorganize.sh b/NanoPreprocessSimple/bin/reorganize.sh
deleted file mode 100755
index ece54cc..0000000
--- a/NanoPreprocessSimple/bin/reorganize.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/env bash
-
-if [ x"$1" == x ]; then
-
-    echo "please specify an output folder"
-
-    exit 1
-
-fi
-
-for i in $1/fast5_files/* ; do mkdir -p $1/`basename $i`/fast5_files; mv $i/* $1/`basename $i`/fast5_files; done
-for i in $1/fast5_files/* ; do mkdir -p $1/`basename $i`/fastq_files; mv $1/fastq_files/*`basename $i`.fq.gz $1/`basename $i`/fastq_files; done
-for i in $1/fast5_files/* ; do mkdir -p $1/`basename $i`/counts; mv $1/counts/*`basename $i`.count $1/`basename $i`/counts; done
-for i in $1/fast5_files/* ; do mkdir -p $1/`basename $i`/assigned; mv $1/assigned/*`basename $i`.assigned $1/`basename $i`/assigned; done
-for i in $1/fast5_files/* ; do mkdir -p $1/`basename $i`/alignment; mv $1/alignment/*`basename $i`*.bam $1/`basename $i`/alignment; done
-rmdir $1/alignment $1/assigned $1/counts $1/fastq_files; for i in $1/fast5_files/*; do rmdir $i; done; rmdir $1/fast5_files
-
-
diff --git a/NanoPreprocessSimple/config.yaml b/NanoPreprocessSimple/config.yaml
deleted file mode 120000
index 7adc0cc..0000000
--- a/NanoPreprocessSimple/config.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../NanoPreprocess/config.yaml
\ No newline at end of file
diff --git a/NanoPreprocessSimple/deeplexicon b/NanoPreprocessSimple/deeplexicon
deleted file mode 120000
index aa6ad45..0000000
--- a/NanoPreprocessSimple/deeplexicon
+++ /dev/null
@@ -1 +0,0 @@
-../NanoPreprocess/deeplexicon/
\ No newline at end of file
diff --git a/NanoPreprocessSimple/nanopreprocess_s.nf b/NanoPreprocessSimple/nanopreprocess_s.nf
deleted file mode 100644
index 7cdf982..0000000
--- a/NanoPreprocessSimple/nanopreprocess_s.nf
+++ /dev/null
@@ -1,651 +0,0 @@
-#!/usr/bin/env nextflow
-
-
-/*
- * Define the pipeline parameters
- *
- */
-
-// Pipeline version
-version = '0.1'
-
-params.help = false
-params.resume = false
-
-log.info """
-
-╔╦╗┌─┐┌─┐┌┬┐┌─┐┬─┐ ┌─┐┌─┐ ╔═╗╔═╗╦═╗╔═╗╔═╗
-║║║├─┤└─┐ │ ├┤ ├┬┘ │ │├┤ ╠═╝║ ║╠╦╝║╣ ╚═╗
-╩ ╩┴ ┴└─┘ ┴ └─┘┴└─ └─┘└ ╩ ╚═╝╩╚═╚═╝╚═╝
-
-====================================================
-BIOCORE@CRG Preprocessing of Nanopore direct RNA
- N F ~ version ${version}
-====================================================
-
-fast5                     : ${params.fast5}
-fastq                     : ${params.fastq}
-reference                 : ${params.reference}
-annotation                : ${params.annotation}
-
-ref_type                  : ${params.ref_type}
-seq_type                  : ${params.seq_type}
-
-output                    : ${params.output}
-qualityqc                 : ${params.qualityqc}
-granularity               : ${params.granularity}
-
-GPU                       : ${params.GPU}
-demultiplexing            : ${params.demultiplexing}
-demultiplexing_opt        : ${params.demultiplexing_opt}
-
-filter                    : ${params.filter}
-filter_opt                : ${params.filter_opt}
-mapper                    : ${params.mapper}
-mapper_opt                : ${params.mapper_opt}
-map_type                  : ${params.map_type}
-
-counter                   : ${params.counter}
-counter_opt               : ${params.counter_opt}
-
-downsampling              : ${params.downsampling}
-
-variant_caller            : ${params.variant_caller}
-variant_opt               : ${params.variant_opt}
-
-email                     : ${params.email}
-"""
-
-// Help and avoiding typos
-if (params.help) exit 1
-if (params.resume) exit 1, "Are you making the classical --resume typo? Be careful!!!! ;)"
-if (params.granularity == "") params.granularity = 1000000000
-
-// check multi5 and GPU usage. GPU maybe can be removed as param if there is a way to detect it
-if (params.GPU != "ON" && params.GPU != "OFF") exit 1, "Please specify ON or OFF depending on whether GPU processors are available"
-
-if (params.map_type != "unspliced" && params.map_type != "spliced") exit 1, "Mapping type NOT supported! Please choose either 'spliced' or 'unspliced'"
-
-// check input files
-reference = file(params.reference)
-if( !reference.exists() ) exit 1, "Missing reference file: ${reference}!"
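// note: params.granularity defaults to 1000000000 above so that one batch can
// swallow the whole dataset; downstream (see fast5_batches below) the run is
// otherwise chunked into batches of 4000 single fast5s (or 1 multifast5).
// The GPU branch there tests params.GPU == "YES", which can never match the
// ON/OFF values validated above -- it was likely meant to test "ON".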
-config_report = file("$baseDir/config.yaml") -if( !config_report.exists() ) exit 1, "Missing config.yaml file!" -logo = file("$baseDir/../docs/logo_small.png") -deeplexicon_folder = file("$baseDir/deeplexicon/") - - -demultiplexer = params.demultiplexing -demultiplexer_opt = params.demultiplexing_opt -mapper = params.mapper -mapper_opt = params.mapper_opt -counter_opt = params.counter_opt - -// Output folders - -outputFastq = "fastq_files" -outputFast5 = "fast5_files" -outputQual = "QC_files" -outputMultiQC = "report" -outputMapping = "alignment" -outputCRAM = "cram_files" -outputCounts = "counts" -outputVars = "variants" -outputAssigned = "assigned" - -/* -* move old multiQCreport -*/ -outputReport = file("${outputMultiQC}/multiqc_report.html") - -if( outputReport.exists() ) { - log.info "Moving old report to multiqc_report.html multiqc_report.html.old" - outputReport.moveTo("${outputMultiQC}/multiqc_report.html.old") -} - - -/* - * Creates the channels that emits fast5 files - */ -if(params.fast5 == "") { - Channel - .empty() - .into {fast5_4_testing; fast5_4_granularity} -} else { - Channel - .fromPath(params.fast5) - .ifEmpty(){ error "Cannot find any file matching: ${params.fast5}" } - .into {fast5_4_testing; fast5_4_granularity} -} -/* - * Creates the channels that emits fastq files - */ -Channel - .fromFilePairs( params.fastq, size: 1) - .ifEmpty(){ error "Cannot find any file matching: ${params.fastq}" } - .set {fastq_files_for_demultiplexing} - - -/* - * Get the name from the folder -if (params.fast5 != "") { - folder_info = params.fast5.tokenize("/") - folder_name = folder_info[-3] -} else { - folder_info = params.fastq.tokenize("/") - folder_name = folder_info[-2] -} - */ - -/* -* This is default value in case guppy will be used for RNA demultiplexing -*/ -if (demultiplexer == "") { - demultiplexer = "OFF" -} - -if (demultiplexer != "OFF" && demultiplexer != "deeplexicon") -exit 1, "Demultiplexing of RNA can be performed only with deeplexicon. Current value is ${demultiplexer}" - -if (params.ref_type == "genome") { - if (params.annotation != "") { - annotation = file(params.annotation) - if( !annotation.exists() ) exit 1, "Missing annotation file: ${params.annotation}!" - } -} - - -process testInput { - tag {"${fast5}"} - - input: - file(fast5) from fast5_4_testing.first() - - output: - stdout into multi5_type - - script: - """ - fast5_type.py ${fast5} - """ -} - -multi5_type.map { it.trim().toInteger() }.into{multi5_type_for_msg; multi5_type_for_bc; multi5_type_for_granularity; multi5_type_for_demultiplexing} -multi5_type_for_msg.map{it == 0 ? "Single Fast5 files detected!": "MultiFast5 files detected!" }.println() - -// if you are using GPU analyse the whole dataset, otherwise make batch of 4,000 sequences if they are single fast5 -// or single batches of multi fast5 sequences -multi5_type_for_granularity.merge(fast5_4_granularity.collect()).map{ - (params.GPU == "YES" ? params.granularity : (it[0] == 0 ? 
it[1..-1].collate(4000) : it[1..-1].collate(1)) ) -}.flatMap().set{fast5_batches} - -// create a map id batch -> list of files -def num_batch = -1 -fast5_batches.map { - num_batch++ - [num_batch, it] -}.set{fast5_4_demulti} - -/* -* Perform demultiplexing (optional) using deeplexicon on basecalled reads -*/ -if(demultiplexer == "deeplexicon") { - process demultiplexing_with_deeplexicon { - label 'demulti' - tag {"${demultiplexer}-${idfile}"} - - input: - set idfile, file(fast5) from fast5_4_demulti - val (multi5) from multi5_type_for_bc - file(deeplexicon_folder) - - output: - //set idfile, file ("${idfile}_demux.tsv") into demux_for_fastq_extraction - //file ("${idfile}_demux.tsv") into demux_for_fast5_extraction - file ("${idfile}_demux.tsv") into demux_for_collect - - script: - def model = '' - def executable = 'deeplexicon.py -f multi' - if (multi5 == 0){ - executable = 'deeplexicon.py -f single' - } - if (demultiplexer_opt.contains("pAmps-final-actrun_newdata_nanopore_UResNet20v2_model.030.h5")){ - executable = 'cmd_line_deeplexicon_caller_2019_09_12.py' - } - """ - ln -s ${deeplexicon_folder}/* . - ${executable} -p ./ ${demultiplexer_opt} -b 4000 -v > ${idfile}_demux.tsv - """ - } - - process collecting_deeplexicon { - - input: - file("demux_*") from demux_for_collect.collect() - - output: - file ("all_demux.txt") into demux_for_fastq_extraction - - script: - """ - awk '{if (NR==1) {print \$0} else if (\$0 !~ /Confidence/) {print \$0}}' demux_* > all_demux.txt - """ - - } - - process extracting_demultiplexed_fastq { - label 'basecall_cpus' - tag {"${idfile}"} - - input: - file(demux) from demux_for_fastq_extraction - set idfile, file(fastq) from fastq_files_for_demultiplexing - - output: - set idfile, file("*.fastq.gz") into fastq_for_filtering - - script: - """ - extract_sequence_from_fastq.py ${demux} ${fastq} - for i in *.fastq; do gzip \$i; done - """ - } -} else { - fastq_files_for_demultiplexing.set{ fastq_for_filtering} -} - - -/* -* Perform filtering (optional) using nanofilt on fastq files -*/ -if (params.filter == "nanofilt") { - process filtering { - label 'big_cpus' - tag {"${params.filter}-${fastq_file}".replace('.fastq.gz', '')} - - input: - set idfile, file(fastq_file) from fastq_for_filtering.transpose() - - output: - set idfile, file("*-filt.fastq.gz") into fastq_for_next_step - - script: - output = "${fastq_file}".replace(".fastq.gz", "-filt.fastq.gz") - """ - zcat ${fastq_file} | NanoFilt ${params.filter_opt} | gzip > ${output} - """ - } -} else { - fastq_for_filtering.transpose().set{ fastq_for_next_step} -} - - -// check this -fastq_for_next_step.map{ - filepath=it[1] - file_parts = "${filepath}".tokenize("/") - def folder_name = file_parts[-2] - if (demultiplexer != "OFF") { - fileparts = filepath.getName().tokenize(".") - ["${folder_name}.${fileparts[-3]}", filepath] - } else { - ["${folder_name}", filepath] - } -}.groupTuple().into{fastq_files_for_fastqc; fastq_files_for_mapping} - - -/* -* Perform fastQC on fastq files -*/ - -process fastQC { - tag {idfile} - label 'big_cpus' - - publishDir "${params.output}/${idfile}/${outputQual}/", pattern: "*_fastqc.html", mode: 'copy' - - input: - set idfile, file(fastq_file) from fastq_files_for_fastqc - - output: - file ("*_fastqc.*") into fastqc_for_multiqc - - script: - """ - fastqc ${fastq_file} -t ${task.cpus} - """ -} - -process mapping { - tag {"${mapper}-${idfile}"} - publishDir "${params.output}/${idfile}/${outputMapping}/", pattern: "*.${mapper}.sorted.bam*", mode: 'copy' - label 'big_mem_cpus' - - input: - 
file(reference) - set idfile, file (fastq_file) from fastq_files_for_mapping - - output: - set idfile, file("${idfile}.${mapper}.sorted.bam") optional true into aligned_reads, aligned_reads_for_QC, aligned_reads_for_QC2, aligned_reads_for_counts - set idfile, mapper, file("${idfile}.${mapper}.sorted.bam"), file("${idfile}.${mapper}.sorted.bam.bai") optional true into aligned_reads_for_crams - set idfile, file("${idfile}.${mapper}.sorted.bam"), file("${idfile}.${mapper}.sorted.bam.bai") optional true into aligned_reads_for_vars - file("${idfile}.${mapper}.sorted.bam*") optional true - - script: - if (mapper == "minimap2") { - def mappars = (params.map_type == "spliced") ? "-ax splice -k14" : "-ax map-ont" - mappars += " ${mapper_opt} " - """ - minimap2 -t ${task.cpus} ${mappars} -uf ${reference} ${fastq_file} > reads.mapped.sam - samtools view -@ ${task.cpus} -F4 -hSb reads.mapped.sam > reads.mapped.bam - rm reads.mapped.sam - samtools sort -@ ${task.cpus} -o ${idfile}.${mapper}.sorted.bam reads.mapped.bam - samtools index -@ ${task.cpus} ${idfile}.${mapper}.sorted.bam - rm reads.mapped.bam - """ - } - else if (mapper == "graphmap2"){ - def mappars = (params.map_type == "spliced") ? "-x rnaseq" : "" - mappars += " ${mapper_opt} " - """ - graphmap2 align -t ${task.cpus} -r ${reference} ${mappars} -d ${fastq_file} | samtools view -@ ${task.cpus} -F4 -hSb - > reads.mapped.bam - samtools sort -@ ${task.cpus} -o ${idfile}.${mapper}.sorted.bam reads.mapped.bam - samtools index -@ ${task.cpus} ${idfile}.${mapper}.sorted.bam - rm reads.mapped.bam - """ - } - else if (mapper == "graphmap"){ - """ - graphmap align -t ${task.cpus} ${mapper_opt} -r ${reference} -d ${fastq_file} | samtools view -@ ${task.cpus} -F4 -hSb - > reads.mapped.bam - samtools sort -@ ${task.cpus} -o ${idfile}.${mapper}.sorted.bam reads.mapped.bam - samtools index -@ ${task.cpus} ${idfile}.${mapper}.sorted.bam - rm reads.mapped.bam - """ - } - else { - """ - echo "nothing to do!" 
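    # note: the mapper branches above differ only in preset flags --
    #   minimap2, spliced:    minimap2 -ax splice -k14 <mapper_opt> -uf <ref> <fastq>
    #   minimap2, unspliced:  minimap2 -ax map-ont <mapper_opt> -uf <ref> <fastq>
    #   graphmap2, spliced:   adds -x rnaseq
    # and all pipe through samtools view -F4 to drop unmapped reads before
    # sorting and indexing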
- """ - } -} - -/* -* Perform mapping and sorting -*/ -process cram_conversion { - tag {"${mapper}-${idfile}"} - publishDir "${params.output}/${idfile}/${outputCRAM}/", mode: 'copy' - label 'big_mem_cpus' - - input: - file(reference) - set idfile, val(mapper), file(aln), file(index) from aligned_reads_for_crams - - output: - file("${idfile}.${mapper}.sorted.cram*") optional true - script: - def downcmd = "" - def input = aln - def cleancmd = "" - gzipcmd = unzipCmd(reference, "myreference.fasta") - gzipclean = "rm myreference.fasta" - if (params.downsampling != "") { - def perc = params.downsampling/100 - downcmd = "samtools view -@ ${task.cpus} -bs ${perc} ${aln} > subsample.bam" - input = "subsample.bam" - cleancmd = "rm subsample.bam" - } - """ - ${downcmd} - ${gzipcmd} - samtools view -@ ${task.cpus} -C ${input} -T myreference.fasta -o ${idfile}.${mapper}.sorted.cram - samtools index -@ ${task.cpus} ${idfile}.${mapper}.sorted.cram - ${cleancmd} - ${gzipclean} - """ -} - -/* -* Perform counting (optional) -*/ - -if ( params.counter == "YES") { - process counting { - tag {"${idfile}"} - publishDir "${params.output}/${idfile}/${outputCounts}/", pattern: "*.count", mode: 'copy' - publishDir "${params.output}/${idfile}/${outputAssigned}/", pattern: "*.assigned", mode: 'copy' - - input: - set idfile, file(bamfile) from aligned_reads_for_counts - - output: - file("${idfile}.count") into read_counts - file("${idfile}.stats") optional true into count_stats - file("${idfile}.assigned") optional true - script: - if (params.ref_type == "transcriptome") { - """ - NanoCount -i ${bamfile} -o ${idfile}.count ${counter_opt}; - awk '{sum+=\$3}END{print FILENAME"\t"sum}' ${idfile}.count |sed s@.count@@g > ${idfile}.stats - samtools view -F 256 ${bamfile} |cut -f 1,3 > ${idfile}.assigned - """ - } else if (params.ref_type == "genome") { - def anno = unzipBash("${params.annotation}") - """ - samtools view ${bamfile} |htseq-count ${counter_opt} -f sam - ${anno} -o ${idfile}.sam > ${idfile}.count - awk '{gsub(/XF:Z:/,"",\$NF); print \$1"\t"\$NF}' ${idfile}.sam |grep -v '__' > ${idfile}.assigned - rm ${idfile}.sam - """ - } - } - - /* - * Join alnQC - */ - process joinCountQCs { - - input: - file "*" from count_stats.collect() - - output: - file("counts_mqc.txt") into count_repo_for_multiQC - - script: - """ - echo '# id: NanoCount - # plot_type: \'table\' - # section_name: Read counts - File name \'Counts\' ' > counts_mqc.txt - cat *.stats >> counts_mqc.txt - """ - } -} else { - read_counts = Channel.empty() - count_repo_for_multiQC = Channel.empty() -} - - -/* -* Perform alnQC -*/ -process alnQC { - tag {bamid} - - input: - set bamid, file(bamfile) from aligned_reads_for_QC - - output: - file "${bamid}.stat" into single_alnQC_outs - - script: - """ - bam2stats.py ${bamfile} > ${bamid}.stat - """ -} - -/* -* Join alnQC -*/ -process joinAlnQCs { - - input: - file "alnqc_*" from single_alnQC_outs.collect() - - output: - file("alnQC_mqc.txt") into alnQC_for_multiQC - - script: - """ - echo '# id: alnQC -# plot_type: \'table\' -# section_name: \'Alignment QC\' ' > alnQC_mqc.txt - cat alnqc_* | head -n 1| sed s@#@@g >> alnQC_mqc.txt - cat alnqc_* | grep -v "#" >> alnQC_mqc.txt - """ -} - -/* -* Perform alnQC2 -*/ - -process alnQC2 { - publishDir "${params.output}/${bamid}/${outputQual}/", pattern: "*_plot/*", mode: 'copy' - label 'big_cpus' - errorStrategy 'ignore' - tag {bamid} - - input: - set bamid, file(bamfile) from aligned_reads_for_QC2 - - output: - file("*_plot/*") optional true - 
file("${bamid}_stats_mqc.png") optional true into qc2_for_multiqc
-
-    script:
-    """
-    NanoPlot --bam ${bamfile} -o ${bamid}_plot --maxlength 5000 -t ${task.cpus}
-    mkdir tmp_dir
-    cp ${bamid}_plot/PercentIdentityvsAverageBaseQuality_kde.png tmp_dir
-    cp ${bamid}_plot/LengthvsQualityScatterPlot_dot.png tmp_dir
-    cp ${bamid}_plot/HistogramReadlength.png tmp_dir
-    cp ${bamid}_plot/Weighted_HistogramReadlength.png tmp_dir
-    gm montage tmp_dir/*.png -tile 2x2 -geometry 800x800 ${bamid}_stats_mqc.png
-    rm -fr tmp_dir
-    """
-}
-
-fastqc_for_multiqc.mix(qc2_for_multiqc,read_counts,count_repo_for_multiQC,alnQC_for_multiQC).set{files_for_report}
-
-/*
-* Perform viral variant call (experimental)
-*/
-
-if ( params.variant_caller == "YES" && params.seq_type != "RNA") {
-
-    process variant_calling {
-        tag {"${idfile}"}
-        publishDir "${params.output}/${idfile}/${outputVars}/", pattern: "*.vcf", mode: 'copy'
-        label 'big_cpus'
-        errorStrategy 'ignore'
-
-        input:
-        set idfile, file(bamfile), file(bai) from aligned_reads_for_vars
-        file(reference)
-
-        output:
-        file("*.vcf")
-
-        script:
-        gzipcmd = unzipCmd(reference, "myreference.fasta", "yes")
-        gzipclean = "rm myreference.fasta"
-        """
-        ${gzipcmd}
-        medaka consensus ${bamfile} outputs.hdf
-        medaka variant myreference.fasta outputs.hdf ${idfile}.vcf
-        medaka tools annotate ${idfile}.vcf myreference.fasta ${bamfile} ${idfile}.annot.vcf
-
-        #medaka_variant ${params.variant_opt} -i ${bamfile} -f myreference.fasta -d -t ${task.cpus} -o ./out
-        #mv `ls -t out/round_*.vcf| head -n1 ` .
-        ${gzipclean}
-        """
-    }
-}
-
-
-// unzip, copy or link the reference file
-def unzipCmd(filename, unzippedname, copy="") {
-    def cmd = "ln -s ${filename} ${unzippedname}"
-    if (copy!="") {
-        cmd = "cp ${filename} ${unzippedname}"
-    }
-    def ext = filename.getExtension()
-    if (ext == "gz") {
-        cmd = "zcat ${filename} > ${unzippedname}"
-    }
-    return cmd
-}
-
-// make named pipe
-def unzipBash(filename) {
-    def cmd = filename.toString()
-    if (cmd[-3..-1] == ".gz") {
-        cmd = "<(zcat ${filename})"
-    }
-    return cmd
-}
-
-/*
-* Perform multiQC report
-*/
-process multiQC {
-    publishDir "${params.output}/${outputMultiQC}/", mode: 'copy'
-
-    input:
-    file(logo)
-    file(config_report)
-    file("*") from files_for_report.collect()
-
-    output:
-    file("multiqc_report.html") into multiQC
-
-    script:
-    """
-    multiqc -c ${config_report} .
-    """
-}
-
-
-if (params.email == "yourmail@yourdomain" || params.email == "") {
-    log.info 'Skipping the email\n'
-}
-else {
-    log.info "Sending the email to ${params.email}\n"
-
-    workflow.onComplete {
-
-        def msg = """\
-            Pipeline execution summary
-            ---------------------------
-            Completed at: ${workflow.complete}
-            Duration    : ${workflow.duration}
-            Success     : ${workflow.success}
-            workDir     : ${workflow.workDir}
-            exit status : ${workflow.exitStatus}
-            Error report: ${workflow.errorReport ?: '-'}
-            """
-            .stripIndent()
-
-        sendMail(to: params.email, subject: "Master of Pore execution", body: msg, attach: "${outputMultiQC}/multiqc_report.html")
-    }
-}
-
-workflow.onComplete {
-    println "Pipeline BIOCORE@CRG Master of Pore completed!"
-    println "Started at  $workflow.start"
-    println "Finished at $workflow.complete"
-    println "Time elapsed: $workflow.duration"
-    println "Execution status: ${ workflow.success ?
'OK' : 'failed' }" -} - - - diff --git a/NanoPreprocessSimple/nextflow.config b/NanoPreprocessSimple/nextflow.config deleted file mode 120000 index 227f6b9..0000000 --- a/NanoPreprocessSimple/nextflow.config +++ /dev/null @@ -1 +0,0 @@ -../NanoPreprocess/nextflow.config \ No newline at end of file diff --git a/NanoPreprocessSimple/params.config b/NanoPreprocessSimple/params.config deleted file mode 100644 index 8016d1a..0000000 --- a/NanoPreprocessSimple/params.config +++ /dev/null @@ -1,33 +0,0 @@ -params { - fast5 = "$baseDir/../NanoPreprocess/multifast/fast5_files/*.fast5" - fastq = "$baseDir/../NanoPreprocess/multifast/fastq_files/*.fq.gz" - reference = "$baseDir/../anno/curlcake_constructs.fasta.gz" - annotation = "" - ref_type = "transcriptome" - - seq_type = "cDNA" - output = "$baseDir/output" - qualityqc = 5 - granularity = "" - - GPU = "OFF" - demultiplexing = "deeplexicon" - demultiplexing_opt = "-s 0.9 -m resnet20-final.h5" - demulti_fast5 = "OFF" - - filter = "" - filter_opt = "" - - mapper = "minimap2" - mapper_opt = "-uf -k14" - map_type = "unspliced" - - variant_caller = "YES" - variant_opt = "" - - downsampling = "" - - counter = "YES" - counter_opt = "" - email = "" -} diff --git a/NanoTail/bin/join.r b/NanoTail/bin/join.r deleted file mode 100644 index 38809e6..0000000 --- a/NanoTail/bin/join.r +++ /dev/null @@ -1,43 +0,0 @@ -# R --slave --args findr nanopol assigned prefix < join.r - -args<-commandArgs(TRUE) -findr <- args[1] -nanopol <- args[2] -assigned <- args[3] -prefix <- args[4] - -a<-read.table(findr, sep="\t", header=FALSE) -b<-read.table(nanopol, sep="\t", header=FALSE) -g<-read.table(assigned, sep="\t", header=FALSE) - -c<-merge(a,b, all.x=T, by.x="V1", by.y="V1", sort=FALSE) -d<-merge(c,g, all.x=T, by.x="V1", by.y="V1", sort=FALSE) - -colnames(d)<-c("Read name", "Tailfindr", "Nanopolish", "Gene Name") - -e<-na.omit(d) -e[,2]<-log(e[,2]+1,base=2) -e[,3]<-log(e[,3]+1,base=2) - -write.table(d, file = paste0(prefix, "_joined.txt"), sep="\t", row.names = FALSE) - - -x<-e[,2] -y<-e[,3] - -data.lm<-lm(y ~ x) -rsquared<-round(summary(data.lm)$r.squared, 2) - -rsquare=bquote(R^2 == .(rsquared)) - -subtitle=expression('PolyA tail length estimation log'[2]*'(n+1)') - -png(paste0(prefix, "_corr.png"), width = 1600, height = 1200) -op <- par(mar=c(8, 8, 8, 8) + 0.1) -plot(x, y, xlab="TailfindR", ylab="Nanopolish", ylim=c(0,10), xlim=c(0,10), cex=.2, cex.lab=2, cex.sub=2,, cex.main=3, cex.axis=2, pch=19, col = rgb(red = 0, green = 0, blue = 0, alpha = 0.5), frame=FALSE, main=prefix) -mtext(side=3, line=0, at=6.5, adj=1, cex=2, subtitle) -mtext(side=3, line=-65, at=10, adj=1, cex=2, rsquare) - -par(op) -abline(data.lm, col = "blue") -dev.off() diff --git a/NanoTail/config.yaml b/NanoTail/config.yaml deleted file mode 100644 index faa50a8..0000000 --- a/NanoTail/config.yaml +++ /dev/null @@ -1,26 +0,0 @@ -title: "Master of Pores" -subtitle: "" -intro_text: False - -report_header_info: - - Application Type: 'Nanopore sequencing' - -custom_logo: 'logo_small.png' -custom_logo_url: 'https://github.com/biocorecrg/nanopore_analysis' -custom_logo_title: 'Master of Pores' - -extra_fn_clean_trim: - - '_QC' - -table_columns_visible: - FastQC: - percent_duplicates: True - -module_order: - - fastqc: - name: 'FastQC' - info: 'This section of the report shows FastQC results on raw reads.' 
diff --git a/NanoTail/config.yaml b/NanoTail/config.yaml
deleted file mode 100644
index faa50a8..0000000
--- a/NanoTail/config.yaml
+++ /dev/null
@@ -1,26 +0,0 @@
-title: "Master of Pores"
-subtitle: ""
-intro_text: False
-
-report_header_info:
-    - Application Type: 'Nanopore sequencing'
-
-custom_logo: 'logo_small.png'
-custom_logo_url: 'https://github.com/biocorecrg/nanopore_analysis'
-custom_logo_title: 'Master of Pores'
-
-extra_fn_clean_trim:
-    - '_QC'
-
-table_columns_visible:
-    FastQC:
-        percent_duplicates: True
-
-module_order:
-    - fastqc:
-        name: 'FastQC'
-        info: 'This section of the report shows FastQC results on raw reads.'
-
-    - alnQC:
-        name: "Alignment QC"
-        info: 'This section of the report shows QC results on aligned reads.'
diff --git a/NanoTail/nanotail.nf b/NanoTail/nanotail.nf
deleted file mode 100644
index de71c85..0000000
--- a/NanoTail/nanotail.nf
+++ /dev/null
@@ -1,307 +0,0 @@
-#!/usr/bin/env nextflow
-
-/*
- * Define the pipeline parameters
- */
-
-// Pipeline version
-version = '0.1'
-
-params.help = false
-params.resume = false
-
-log.info """
-
-╔╦╗┌─┐┌─┐┌┬┐┌─┐┬─┐ ┌─┐┌─┐ ╔═╗╔═╗╦═╗╔═╗╔═╗
-║║║├─┤└─┐ │ ├┤ ├┬┘ │ │├┤ ╠═╝║ ║╠╦╝║╣ ╚═╗
-╩ ╩┴ ┴└─┘ ┴ └─┘┴└─ └─┘└ ╩ ╚═╝╩╚═╚═╝╚═╝
-
-====================================================
-BIOCORE@CRG NanoTail. Detection of polyA length (RNA)
- N F ~ version ${version}
-====================================================
-
-***************** Input files *********************
-input_folders  : ${params.input_folders}
-
-******* reference has to be the genome **********
-reference      : ${params.reference}
-output         : ${params.output}
-
-********* nanopolish and tailfindr cmd options **********
-nanopolish_opt : ${params.nanopolish_opt}
-tailfindr_opt  : ${params.tailfindr_opt}
-
-email          : ${params.email}
-
-"""
-
-// Help and avoiding typos
-if (params.help) exit 1
-if (params.resume) exit 1, "Are you making the classical --resume typo? Be careful!!!! ;)"
-
-// check input files
-reference = file(params.reference)
-if( !reference.exists() ) exit 1, "Missing reference file: ${reference}!"
-config_report = file("$baseDir/config.yaml")
-if( !config_report.exists() ) exit 1, "Missing config.yaml file!"
-logo = file("$baseDir/../docs/logo_small.png")
-joinScript = file("$baseDir/bin/join.r")
-
-nanopolish_opt = params.nanopolish_opt
-tailfindr_opt = params.tailfindr_opt
-
-// Output folders
-outputTailfindr = "${params.output}/Tailfindr"
-outputNanoPolish = "${params.output}/NanoPolish"
-outputFinal = "${params.output}/PolyA_final"
-
-/*
- * Create the channels that emit the input data
- */
-Channel
-    .fromFilePairs( params.input_folders, type: 'dir', size: 1)
-    .ifEmpty { error "Cannot find any folder matching: ${params.input_folders}" }
-    .into{folders_for_fastq; folders_for_analysis; folders_for_genes; folders_for_bam_filtering}
-
-// emit one [sample, fast5] pair per fast5 file; ciccio is a spare, unused channel
-folders_for_analysis.map {
-    [ it[0], file("${it[1][0]}/fast5_files/*.fast5") ]
-}.transpose().into{data_for_tailfindr; data_for_nanopolish_raw; ciccio}
-
-folders_for_bam_filtering.map {
-    [ it[0], file("${it[1][0]}/alignment/*.bam") ]
-}.set{data_for_bam_filtering}
-
-folders_for_fastq.map {
-    [ it[0], file("${it[1][0]}/fastq_files/*") ]
-}.set{fastq_data}
-
-folders_for_genes.map {
-    [ it[0], file("${it[1][0]}/assigned/*") ]
-}.set{genes_for_final}
-
-/*
-* Check the reference file and decompress it if needed
-*/
-process check_reference {
-
-    input:
-    file(reference)
-
-    output:
-    file("reference_sequences.fa") into checked_ref
-
-    script:
-    """
-    if [ `echo ${reference} | grep ".gz"` ]; then
-        zcat ${reference} > reference_sequences.fa
-    else
-        ln -s ${reference} reference_sequences.fa
-    fi
-    """
-}
-
-/*
-* Estimate polyA tail size with tailfindr
-*/
-process tailfindr {
-    tag "${sampleID}-${fast5.simpleName}"
-    label 'big_mem_cpus_time'
-
-    input:
-    set val(sampleID), file(fast5) from data_for_tailfindr
-
-    output:
-    set val(sampleID), file("*_findr.csv") into tailfindr_res
-
-    script:
-    def fast5_index = fast5.getSimpleName()
-    // avoid an empty argument in find_tails() when tailfindr_opt is blank
-    def extra_opt = params.tailfindr_opt ? "${params.tailfindr_opt}, " : ""
-    """
-    R --slave -e "library(tailfindr); find_tails(fast5_dir = './', save_dir = './', ${extra_opt}csv_filename = \'${sampleID}-${fast5_index}_findr.csv\', num_cores = ${task.cpus})"
-    """
-}
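Because the input channel is transposed, each tailfindr task stages exactly one fast5 file in its working directory. Expanded by hand, the command the process runs looks roughly like this (hypothetical sample and batch names, empty tailfindr_opt, 8 CPUs):

    R --slave -e "library(tailfindr); find_tails(fast5_dir = './', save_dir = './', csv_filename = 'RNA_wt1-batch0_findr.csv', num_cores = 8)"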
-
-/*
-* Filter bams
-*/
-process filter_bam {
-    tag { sampleID }
-    label 'big_mem_cpus'
-
-    input:
-    file(reference)
-    set val(sampleID), file(alignment) from data_for_bam_filtering
-
-    output:
-    set val(sampleID), file("${sampleID}_filt.bam"), file("${sampleID}_filt.bam.bai") into filt_bam_for_nanopolish, filt_bam_for_intersect
-
-    script:
-    """
-    # keep only mapped, primary alignments: flag 260 = 4 (unmapped) + 256 (secondary)
-    samtools view -@ ${task.cpus} -bF 260 ${alignment} > ${sampleID}_filt.bam
-    samtools index ${sampleID}_filt.bam
-    """
-}
-
-
-data_for_nanopolish_raw.combine(filt_bam_for_nanopolish, by:0).combine(fastq_data, by:0)
-    .set{data_for_nanopolish}
-
-
-/*
-* Estimate polyA tail size with nanopolish
-*/
-process tail_nanopolish {
-    tag "${sampleID}-${fast5.simpleName}"
-    label 'big_mem_cpus_time_np'
-
-    input:
-    file(checked_ref)
-    set val(sampleID), file(fast5), file(alignment), file(alnindex), file(fastq) from data_for_nanopolish
-
-    output:
-    set val(sampleID), file("*.polya.estimation.tsv") into nanopol_res
-
-    script:
-    def fast5_index = fast5.getSimpleName()
-    """
-    # index reads
-    nanopolish index -d ./ ${fastq}
-    # polyA length estimation
-    nanopolish polya -r ${fastq} ${params.nanopolish_opt} -g ${checked_ref} -t ${task.cpus} -b ${alignment} > ${sampleID}-${fast5_index}.polya.estimation.tsv
-    """
-}
-
-process collect_nanopolish_results {
-    publishDir outputNanoPolish, pattern: "*.polya.estimation.tsv", mode: 'copy'
-    tag { sampleID }
-
-    input:
-    set val(sampleID), file("nanopol_*") from nanopol_res.groupTuple()
-
-    output:
-    set val(sampleID), file("${sampleID}.nanopol.len") into nanopol_len
-    file ("*.polya.estimation.tsv")
-
-    script:
-    """
-    # merge the per-fast5 tables, keeping a single header and dropping failed reads
-    cat nanopol_* | awk '!(NR>1 && /leader_start/)' | grep -v "READ_FAILED_LOAD" > ${sampleID}.polya.estimation.tsv
-    # keep read name and tail length for reads that pass QC
-    awk -F"\t" '{if (\$10=="PASS") print \$1"\t"\$9}' ${sampleID}.polya.estimation.tsv > ${sampleID}.nanopol.len
-    """
-}
-
-process collect_tailfindr_results {
-    publishDir outputTailfindr, pattern: "*_findr.csv", mode: 'copy'
-    tag { sampleID }
-
-    input:
-    set val(sampleID), file("tailfin_*") from tailfindr_res.groupTuple()
-
-    output:
-    set val(sampleID), file("${sampleID}.findr.len") into tailfindr_len
-    file ("*_findr.csv")
-
-    script:
-    """
-    # merge the per-fast5 tables, keeping a single header
-    cat tailfin_* | awk '!(NR>1 && /tail_start/)' > ${sampleID}_findr.csv
-    # keep read name and tail length whenever a length was estimated
-    awk -F"," '{if (\$5!="" && \$1!="read_id") print \$1"\t"\$5}' ${sampleID}_findr.csv > ${sampleID}.findr.len
-    """
-}
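Both collectors merge the per-fast5 tables with the same awk idiom: print every line except later copies of the header, which are recognisable by a column name such as leader_start or tail_start. The same pattern on two hypothetical nanopolish outputs:

    # keep the first header only, then drop reads nanopolish could not load
    cat batch0.polya.estimation.tsv batch1.polya.estimation.tsv \
        | awk '!(NR > 1 && /leader_start/)' \
        | grep -v "READ_FAILED_LOAD" > merged.polya.estimation.tsv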
-
-
-
-/*
-* Join the tailfindr and nanopolish results
-*/
-process join_results {
-    container "biocorecrg/mopnanotail:0.2"
-
-    publishDir outputFinal, mode: 'copy'
-    tag { sampleID }
-
-    input:
-    set val(sampleID), file(nanopol), file(tailfindr), file(genes) from nanopol_len.join(tailfindr_len).join(genes_for_final)
-    file(joinScript)
-
-    output:
-    file("${sampleID}_*")
-
-    script:
-    """
-    Rscript --vanilla ${joinScript} ${tailfindr} ${nanopol} ${genes} ${sampleID}
-    """
-}
-
-/*
-* Merge the results
-*/
-
-
-/*
-* functions
-*/
-// Get the name from the folder
-def getFolderName(sample) {
-    folder_info = sample.toString().tokenize("/")
-    return folder_info[-2]
-}
-
-// make a named pipe (bash process substitution) for gzipped files
-def unzipBash(filename) {
-    cmd = filename.toString()
-    if (cmd[-3..-1] == ".gz") {
-        cmd = "<(zcat ${filename})"
-    }
-    return cmd
-}
-
-
-/*
-* Finish message
-*/
-workflow.onComplete {
-    println "Pipeline BIOCORE@CRG Master of Pores completed!"
-    println "Started at $workflow.start"
-    println "Finished at $workflow.complete"
-    println "Time elapsed: $workflow.duration"
-    println "Execution status: ${ workflow.success ? 'OK' : 'failed' }"
-}
-
-/*
-* Mail notification
-*/
-
-if (params.email == "yourname@yourdomain" || params.email == "") {
-    log.info 'Skipping the email\n'
-}
-else {
-    log.info "Sending the email to ${params.email}\n"
-
-    workflow.onComplete {
-
-        def msg = """\
-            NanoTail module's execution summary
-            ---------------------------
-            Completed at: ${workflow.complete}
-            Duration    : ${workflow.duration}
-            Success     : ${workflow.success}
-            workDir     : ${workflow.workDir}
-            exit status : ${workflow.exitStatus}
-            Error report: ${workflow.errorReport ?: '-'}
-            """
-            .stripIndent()
-
-        sendMail(to: params.email, subject: "Master of Pores execution", body: msg)
-    }
-}
-
diff --git a/NanoTail/nextflow.config b/NanoTail/nextflow.config
deleted file mode 100644
index 210b3e6..0000000
--- a/NanoTail/nextflow.config
+++ /dev/null
@@ -1,68 +0,0 @@
-env {
-    PYTHONNOUSERSITE = 1
-}
-
-includeConfig "$baseDir/params.config"
-//includeConfig "../nextflow.global.config"
-singularity.cacheDir = "$baseDir/../singularity"
-
-singularity.autoMounts = true
-
-
-profiles {
-    uge {
-        process {
-            executor = 'sge'
-            penv = 'smp'
-            container = 'biocorecrg/mopmod1:0.2'
-            withLabel: big_mem_cpus {
-                queue = 'long-sl7,biocore-el7,short-sl7'
-                cpus = 8
-                memory = '20G'
-            }
-
-            withLabel: big_mem_cpus_time_np {
-                container = "biocorecrg/mopnanopolish:0.2"
-                queue = 'biocore-el7,long-sl7'
-                cpus = 8
-                memory = '60G'
-                time = '96h'
-            }
-
-            withLabel: big_mem_cpus_time {
-                container = "biocorecrg/mopnanotail:0.2"
-                queue = 'biocore-el7,long-sl7'
-                cpus = 8
-                memory = '60G'
-                time = '96h'
-            }
-        }
-    }
-    slurm {
-        process {
-            executor = 'slurm'
-            container = 'biocorecrg/mopmod1:0.2'
-            withLabel: big_mem_cpus {
-                cpus = 8
-                memory = '20G'
-            }
-
-            withLabel: big_mem_cpus_time_np {
-                container = "biocorecrg/mopnanopolish:0.2"
-                cpus = 8
-                memory = '60G'
-                time = '96h'
-            }
-
-            withLabel: big_mem_cpus_time {
-                container = "biocorecrg/mopnanotail:0.2"
-                cpus = 8
-                memory = '60G'
-                time = '96h'
-            }
-        }
-    }
-}
-
diff --git a/NanoTail/params.config b/NanoTail/params.config
deleted file mode 100644
index c92374a..0000000
--- a/NanoTail/params.config
+++ /dev/null
@@ -1,11 +0,0 @@
-params {
-
-    input_folders  = "$baseDir/../NanoPreprocess/output/RNA*"
-    nanopolish_opt = ""
-    tailfindr_opt  = ""
-
-    reference      = "$baseDir/../anno2/genome.fa.gz"
-    output         = "$baseDir/outputRNA"
-
-    email          = "yourname@yourdomain"
-}
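With params.config filled in, the module was launched like any other Nextflow pipeline. A plausible invocation (not taken verbatim from the repository) using the uge profile defined above and the Singularity images cached by the config:

    cd NanoTail
    nextflow run nanotail.nf -with-singularity -profile uge -bg > log.txt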
diff --git a/config.yaml b/config.yaml
deleted file mode 100644
index 541e130..0000000
--- a/config.yaml
+++ /dev/null
@@ -1,30 +0,0 @@
-title: "Master of Pores"
-subtitle: ""
-intro_text: False
-
-report_header_info:
-    - Application Type: 'Nanopore sequencing'
-
-custom_logo: 'logo_small.png'
-custom_logo_url: 'https://github.com/biocorecrg/nanopore_analysis'
-custom_logo_title: 'Master of Pores'
-
-extra_fn_clean_trim:
-    - '_QC'
-
-table_columns_visible:
-    FastQC:
-        percent_duplicates: True
-
-module_order:
-    - fastqc:
-        name: 'FastQC'
-        info: 'This section of the report shows FastQC results on raw reads.'
-
-    - alnQC:
-        name: "Alignment QC"
-        info: 'This section of the report shows QC results on aligned reads.'
-
-    - nanoplot:
-        name: "NanoPlot"
-        info: 'This section of the report shows per-sample statistics computed by NanoPlot.'
diff --git a/example_output/multiqc_report_example.html b/example_output/multiqc_report_example.html
deleted file mode 100644
index 3b598b4..0000000
--- a/example_output/multiqc_report_example.html
+++ /dev/null
@@ -1,6365 +0,0 @@
[6,365 lines of the bundled static example report ("Master of Pores: MultiQC Report") are deleted here. The removed HTML is page scaffolding only: the MultiQC toolbox (highlight / rename / show-hide / export widgets), a header noting the report was generated on 2020-01-15 with MultiQC v1.8, General Statistics for sample RNAAA023484_wt1 (5.5% duplicates, 54% GC, 1,197.5 K seqs), the standard FastQC sections, an Alignment QC table (613,214 mapped reads, 100% mapped, 533 bp average read length, 89.70% identity) and an embedded RNAAA023484_wt1_stats_mqc.png montage.]