diff --git a/.gitattributes b/.gitattributes
index 2c6853f..94be2f7 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -2,3 +2,4 @@
*.css linguist-detectable=false
*.js linguist-detectable=false
*.py linguist-detectable=false
+*.R linguist-detectable=false
diff --git a/.gitignore b/.gitignore
index 9d2b9f0..40fc71f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,13 +1,9 @@
-*/log*
-anno*
-*/rep*
-NanoPreprocess/preproc_data
-NanoPreprocess/wrong
-real_data
-NanoPreprocess/bin/guppy*
-NanoPreprocess/bin/ont-guppy*
-log
-*/out*
-work
singularity
-.nextflow*
+mop_preprocess/bin/multi_to_single_fast5
+mop_preprocess/bin/read_fast5_basecaller.py
+mop_preprocess/bin/ont*
+mop_preprocess/bin/lib*
+mop_preprocess/bin/MINIMAP2_LICENSE
+mop_preprocess/bin/guppy*
+*/work
+*/.next*
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..73be20c
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "BioNextflow"]
+ path = BioNextflow
+ url = https://github.com/biocorecrg/BioNextflow
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index c513501..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,22 +0,0 @@
-sudo: required
-language: java
-jdk: openjdk8
-services: docker
-
-install:
- # Install Nextflow
- - mkdir nextflow
- - cd nextflow
- - wget -qO- get.nextflow.io | bash
- - sudo ln -s $PWD/nextflow /usr/local/bin/nextflow
-
-# Install Master of Pores
- - git clone https://github.com/biocorecrg/master_of_pores.git
- - cd master_of_pores
- - sh INSTALL.sh
-
-# Run the pipeline
-script:
- - cd NanoPreprocess
- - nextflow run nanopreprocess.nf -with-docker -profile local
diff --git a/BioNextflow b/BioNextflow
new file mode 160000
index 0000000..20310e2
--- /dev/null
+++ b/BioNextflow
@@ -0,0 +1 @@
+Subproject commit 20310e21dbd99ea705d04aa2b0940a817b1cffde
diff --git a/CHANGELOG.md b/CHANGELOG.md
deleted file mode 100644
index 53d69ba..0000000
--- a/CHANGELOG.md
+++ /dev/null
@@ -1,14 +0,0 @@
-## Version 1.1
-* Added a new module called NanoPreprocessSimple that starts from fastq files instead of fast5 files. It allows the analysis of multiple files at a time.
-* Added support for vbz-compressed fast5 files (https://github.com/nanoporetech/vbz_compression) in NanoPreprocess, NanoMod and NanoTail
-* NanoPreprocess now also outputs CRAM files and can perform downsampling via the --downsampling parameter
-* NanoPreprocess allows performing variant calling using medaka (BETA)
-* NanoPreprocess allows performing demultiplexing with GUPPY
-* Added plots for Epinano output in NanoMod
-* Added conversion of Tombo results to bed format in NanoMod
-* Added an INSTALL.sh file that automatically retrieves guppy 3.4.5 from https://mirror.oxfordnanoportal.com/, places it in NanoPreprocess/bin and makes the required links
-* Added profiles for running locally and on the CRG SGE cluster
-
-
-## Version 1.0
-This is the original version published in the paper [MasterOfPores: A Workflow for the Analysis of Oxford Nanopore Direct RNA Sequencing Datasets](https://www.frontiersin.org/articles/10.3389/fgene.2020.00211/full)
diff --git a/NanoMod/bin/epinano_paired.py b/NanoMod/bin/epinano_paired.py
deleted file mode 100755
index b8c5cb0..0000000
--- a/NanoMod/bin/epinano_paired.py
+++ /dev/null
@@ -1,117 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import argparse, re
-import numpy as np
-from collections import Counter as cnt
-from collections import defaultdict
-import gzip
-
-parser = argparse.ArgumentParser()
-parser.add_argument ('-k','--knockout', required=True, dest='kos', action='append', help='knockout sample epinano prediction results')
-parser.add_argument ('-w','--wildtype', required=True, dest='wts', action='append', help='wildtype sample epinano prediction results')
-parser.add_argument ('-c','--coverage', nargs = '?', const=5, default=5,type = int, help='minimum coverage to be considered as valid, default is 5')
-parser.add_argument ('-o','--output', required = True, help= 'output file name')
-parser.add_argument ('-m','--motif', type=str, default='[AGCTUagctu]',help='motif to be kept in the final results; default will keep all motifs; "-m [AG][AG]AC[ACT]" will keep RRACH motifs; ')
-parser.add_argument ('-dk','--ko_duplicates', required = True, type=int, default=1, help='a site has to be predicted in at least this many KO samples to be used to determine the final modification status; default is 1')
-parser.add_argument ('-dw','--wt_duplicates', required = True, type=int, default=1, help='a site has to be predicted in at least this many WT samples to be used to determine the final modification status; default is 1')
-parser.add_argument ('-gz', '--gzipped', default=False, dest='isgzip', action='store_true', help='indicates if the input files are gzipped or not')
-args = parser.parse_args()
-
-'''
-python3 epinano_paired.py -k ko.csv -k ko3.csv -k ko2.csv -w wt.csv -c 5 -dk 1 -dw 1 -o out -m [AG][AG]AC[ACT]
-'''
-
-def read_result (cov, mtf, epinano_result_files, gzip):
- '''
-    cov: minimum per-site coverage for a prediction to be kept
-    mtf: motif to keep; e.g. [AG][AG]AC[ACT] keeps RRACH motifs
-    epinano_result_files: Epinano (SVM.py) per-site prediction CSVs
-    gzip: whether the input files are gzip-compressed
- '''
- motif = dict()
- info = defaultdict(list)
- for f in epinano_result_files:
- with fopen (f, gzip) as fh:
- for l in fh:
- if l.startswith('#'):
- continue
- ary = l.strip().split(',')
- mapping_cov = float(ary[3].split(':')[2])
- if mapping_cov < cov:
- continue
- if not re.match (mtf,ary[0]):
- continue
- win,ref,kmer,probm = ary[1],ary[2],ary[0],float(ary[-2])
- k = ref + ',' + win.split(':')[2]
- info[k].append ((mapping_cov,probm))
- motif[k] = kmer
- return info, motif
-
-def fopen (file, isgzip):
- fh = ""
- if isgzip:
- fh = gzip.open(file, 'rt', encoding='utf-8')
- else:
- fh = open (file)
- return fh
-
-
-def scoring (common_sites, sample_info, repeats):
- '''
- score prediction results
- if it is one on one comparison:
- use the probm as it is
- else:
- if (s1 ≥ 0.5 and s2 ≥ 0.5 and .. sn ≥ 0.5):
- M = 1
- else:
- M = (s1 + s2 + sn)/n
- if (Mwt/Mko) > 1.5 and Mwt > 0.5:
- status = modified
- else:
- status = unmodified
- '''
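-    # Illustrative example (not from the source): with repeats=2, probm values
-    # [0.7, 0.9] are all above 0.5, so the site scores 1; [0.7, 0.3] scores
-    # mean([0.7, 0.3]) = 0.5.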
- scores = dict ()
- probs = defaultdict(list)
- for s in common_sites:
- probms = [x[1] for x in sample_info[s]]
- if len (probms) < repeats:
- continue
- elif len (probms) == 1:
- scores[s] = np.mean (probms)
- else:
- cond = [x > 0.5 for x in probms]
- if all (cond):
- scores[s] = 1
- else:
- scores[s] = np.mean (probms)
- probs[s] = probms
- return scores,probs
-
-if __name__ == '__main__':
- kos = args.kos
- wts = args.wts
- cov = args.coverage
- ko_duplicates = args.ko_duplicates
- wt_duplicates = args.wt_duplicates
- isgzip = args.isgzip
- motif = args.motif
- out = open (args.output, 'w')
-
- head = "#chr,position,motif,modification_status\n"
- out.write(head)
- wt_inf, mtfs = read_result (cov,motif,wts,isgzip)
- ko_inf, mtfs = read_result (cov,motif,kos,isgzip)
- all_wt_sites_lst = [x for x in wt_inf]
- all_ko_sites_lst = [x for x in ko_inf]
- commonSites = set (all_wt_sites_lst).intersection (set(all_ko_sites_lst))
- wt_scores, wt_probs = scoring (commonSites, wt_inf, wt_duplicates)
- ko_scores, ko_probs = scoring (commonSites, ko_inf, ko_duplicates)
- for s in commonSites:
- if not (s in wt_scores and s in ko_scores):
- continue
- if wt_scores[s]/ko_scores[s] > 1.5 and wt_scores[s] > 0.5:
- out.write ('{},{},{}\n'.format(s,mtfs[s],"YES"))
- else:
- out.write ('{},{},{}\n'.format(s,mtfs[s],"NO"))
- out.close()
diff --git a/NanoMod/bin/epinano_scatterplot.R b/NanoMod/bin/epinano_scatterplot.R
deleted file mode 100755
index bb48ef0..0000000
--- a/NanoMod/bin/epinano_scatterplot.R
+++ /dev/null
@@ -1,88 +0,0 @@
-#Scatter plots
-#Rscript epinano_scatterplot.R input1 label1 input2 label2 feature
-#Libraries needed
-library(plyr)
-library(ggplot2)
-library(ggrepel)
-library(MASS)
-library(reshape2)
-
-# Reading arguments from command line
-args = commandArgs(trailingOnly=TRUE)
-
-#Arguments
-zinput1<-gzfile(args[1], "rt")
-zinput2<-gzfile(args[3], "rt")
-
-input1 <- read.delim(zinput1,sep=",") #1st variable
-label1 <- as.character(args[2]) #1st label
-input2 <-read.delim(zinput2,sep=",") #2nd variable
-label2 <- as.character(args[4]) #2nd label
-feature<- as.character(args[5]) #Feature
-
-
-
-#Cleanup
-cleanup <- function(input, label) {
- #Filter low coverage reads
- input <- subset(input, cov>30)
- #Filter read starts
- input <- subset(input, pos>20)
- #Add summed errors column
- input$sum <- input$mis + input$del + input$ins
- #Add a column with position
- input$position<- paste(input$X.Ref,input$pos)
- #Change column names
- input <- input[, c("X.Ref","pos","position", "base", feature)]
- colnames(input)<- c("Chr","Position","chr_pos","base",feature )
- data_melted<- melt(data = input, id.vars = c("Chr", "Position", "chr_pos", "base"))
- colnames(data_melted)[which(names(data_melted) == "value")] <- paste(label, "value", sep="_")
- return(data_melted)
-}
-
-
-#Cleanup and process the data
-data1 <- cleanup(input1, label1)
-data2 <- cleanup(input2, label2)
-
-merged <- join(data1,data2, by="chr_pos")
-merged$Chr <- NULL
-merged$Position <- NULL
-merged$base <- NULL
-merged$variable <- NULL
-
-
-plot<- function(data)
-for (chr in unique(data$Chr)) {
- subs <- subset(data, Chr==chr)
- if(nrow(subs)>0){
- res<- rlm(subs[,c(paste(label1, "value", sep="_"))] ~ subs[,c(paste(label2, "value", sep="_"))]) #linear model
- res_vec <- res$residuals#this contains residuals
- threshold <- 5 * sd(res_vec) #The threshold
- subs$score<- abs(subs[,c(paste(label1, "value", sep="_"))] - subs[,c(paste(label2, "value", sep="_"))])
- pdf(file=paste(chr,feature, label1, label2, "scatter.pdf", sep="_"),height=5,width=5,onefile=FALSE)
- print(ggplot(subs, aes_string(x=paste(label1, "value", sep="_"), y=paste(label2, "value", sep="_"))) +
- geom_point(size=2, color="grey")+
- geom_abline(slope=1, intercept=0,linetype="dashed")+
- geom_point(data=subset(subs, score>threshold), size=2, color="red")+
- geom_text_repel(data=subset(subs, score>threshold), aes(label=chr_pos), colour="black",segment.size = 0.4,segment.color = "grey50",size=5)+
- ggtitle(feature)+
- xlab(label1)+
- ylab(label2) +
- theme_bw()+
- theme(axis.text.x = element_text(face="bold", color="black",size=11),
- axis.text.y = element_text(face="bold", color="black", size=11),
- plot.title = element_text(color="black", size=15, face="bold.italic",hjust = 0.5),
- axis.title.x = element_text(color="black", size=15, face="bold"),
- axis.title.y = element_text(color="black", size=15, face="bold"),
- panel.background = element_blank(),
- axis.line = element_line(colour = "black", size=0.5),
- legend.title = element_text(color = "black", size = 15,face="bold"),
- legend.text = element_text(color = "black", size=15),
- panel.grid.major = element_blank(), panel.grid.minor = element_blank())
- )
- dev.off()
- }
- }
-
-plot(merged)
diff --git a/NanoMod/bin/join.r b/NanoMod/bin/join.r
deleted file mode 100755
index 2a8643b..0000000
--- a/NanoMod/bin/join.r
+++ /dev/null
@@ -1,60 +0,0 @@
-# R --slave --args epi tombo prefix < join.r
-
-args<-commandArgs(TRUE)
-epi <- args[1]
-tombo <- args[2]
-prefix <- args[3]
-
-a<-read.table(epi, sep="\t", header=FALSE)
-b<-read.table(tombo, sep="\t", header=FALSE)
-
-a$V2<-"epinano"
-b$V2<-"tombo"
-
-c<-merge(a,b, all=T, by.x="V1", by.y="V1", sort=FALSE)
-colnames(c)<-c("positions", "epinano", "tombo")
-
-c[c=="epinano"]<-1
-c[c=="tombo"]<-1
-c[is.na(c)]<-0
-
-write.table(c, file = "RNA_modifications.txt", sep="\t", row.names = FALSE)
-
-library(VennDiagram)
-library(RColorBrewer)
-myCol <- c("#B3E2CD","#FDCDAC")
-
-venn.diagram(
- x = list(a$V1, b$V1),
- category.names = c("Epinano", "Tombo"),
- filename = 'venn_diagram.png',
- imagetype = "png",
- output=TRUE,
- height = 1024,
- width = 1024 ,
- resolution = 300,
- compression = "lzw",
- main.pos = c(0.5,0.7),
-
- # Circles
- lwd = 2,
- lty = 'blank',
- fill = myCol,
- margin = 0.6,
- main = prefix,
- main.fontfamily = "sans",
-
- # Numbers
- cex = .6,
- fontface = "bold",
- fontfamily = "sans",
-
- # Set names
- cat.cex = 0.6,
- cat.fontface = "bold",
- cat.default.pos = "outer",
- cat.pos = c(-135, 135),
- cat.fontfamily = "sans"
-)
-
-
diff --git a/NanoMod/bin/tombo_2_bed.sh b/NanoMod/bin/tombo_2_bed.sh
deleted file mode 100755
index 02d8dd8..0000000
--- a/NanoMod/bin/tombo_2_bed.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/usr/bin/env bash
-#
-# This script converts the tombo output into a 6-field bed file
-
-# specify the name of the tombo input
-if [ x"$1" == x ]; then
-
- echo "please specify the tombo output file"
-
- exit 1
-
-fi
-
-if [ x"$2" == x ]; then
-
- echo "please specify the name of the output file"
-
- exit 1
-
-fi
-
-
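-# Illustrative example, assuming tombo emits fasta headers of the form
-# ">chrI:12345:+ ... 0.87" with the statistic in field 5: the pipeline below
-# then emits the tab-separated bed line "chrI 12345 12345 87 PRED_1 +".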
-grep ">" $1 | awk '{print $1"\t"$5}' | sed s/">"//g | sed s/\:/\\t/g | awk '{num++; OFS="\t"; print $1,$2,$2, $4*100, "PRED_"num,$3}' > $2
diff --git a/NanoMod/bin/tombo_filter.py b/NanoMod/bin/tombo_filter.py
deleted file mode 100755
index f0f08d1..0000000
--- a/NanoMod/bin/tombo_filter.py
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/usr/bin/env python
-import argparse, re
-import numpy as np
-from collections import Counter as cnt
-from collections import defaultdict
-
-parser = argparse.ArgumentParser()
-parser.add_argument ('-t','--tomboouputs', required=True, dest='tbs', action='append', help='tombo prediction results (significant_regions fasta files)')
-parser.add_argument ('-p','--percentage', nargs = '?', const=0.5, default=0.5,type = float, help='threshold to filter for positives; (0,1]; default is 0.5')
-parser.add_argument ('-o','--output', required = True, help= 'output file name')
-parser.add_argument ('-m','--motif', type=str, default='[AGCTUagctu]',help='motif to be kept in the final results; default will keep all motifs; "-m [AG][AG]AC[ACT]" will keep RRACH motifs; ')
-parser.add_argument ('-d','--num_duplicates', required = True, type=int, default=1, help='a site has to be predicted in at least this many samples to be used to determine the final modification status; default is 1')
-
-args = parser.parse_args()
-
-def read_tombo_fasta (fasta, motif):
- complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
- info = defaultdict (list)
- for f in fasta:
- with open (f, 'r') as fh:
- for l in fh:
- if l.startswith ('>'):
- mtf = next (fh)
- ary = l.strip().split()
- id = ary[0][1:]
- strand = id.split(':')[-1]
- #id = ':'.join (id.split(':')[:2])
- prob = float (ary[-1])
- if strand == '-':
- # Are we sure that is not already reverse complement?
- mtf = "".join(complement.get(base,base) for base in reversed(mtf))
- id = ':'.join (id.split(':')[:2])
- id = id + ":" + mtf.rstrip()
- if not re.match (motif, mtf):
- continue
- info[id].append ((mtf,prob))
- return info
-
-def score (sample_info, threshold, duplicates):
- scores = dict()
- for s in sample_info:
- num = len (sample_info[s])
- if num < duplicates:
- continue
- probs = [x[1] for x in sample_info[s]]
- cond = all ([x>threshold for x in probs])
- if (cond):
- scores[s] = 1
- else:
- scores[s] = np.mean (probs)
- return scores
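-
-# Illustrative example: with num_duplicates=2 and per-site probabilities
-# [0.8, 0.6], all values exceed the default 0.5 threshold, so the site scores 1;
-# [0.8, 0.4] instead scores mean([0.8, 0.4]) = 0.6.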
-
-
-if __name__ == '__main__':
- tbs = args.tbs
- motif = args.motif
- out = open (args.output,'w')
- tombo_info = read_tombo_fasta (tbs,args.motif) ; print (tombo_info)
- tombo_score = score (tombo_info, args.percentage, args.num_duplicates); print (tombo_score)
- mod = ''
- out.write ('#chr,position,motif,modification_status\n')
- for s in tombo_score:
- if tombo_score[s] > args.percentage:
- mod = 'YES'
- else:
- mod = 'NO'
- s = s.replace (':',',')
- out.write ("{},{}\n".format(s,mod))
- out.close()
diff --git a/NanoMod/comparison.tsv b/NanoMod/comparison.tsv
deleted file mode 100644
index d473b1c..0000000
--- a/NanoMod/comparison.tsv
+++ /dev/null
@@ -1,2 +0,0 @@
-RNAAA023484_wt1 RNAAA024588_ko1
-RNAAB056712_wt2 RNAAB058843_ko2
diff --git a/NanoMod/config.yaml b/NanoMod/config.yaml
deleted file mode 100644
index faa50a8..0000000
--- a/NanoMod/config.yaml
+++ /dev/null
@@ -1,26 +0,0 @@
-title: "Master of Pores"
-subtitle: ""
-intro_text: False
-
-report_header_info:
- - Application Type: 'Nanopore sequencing'
-
-custom_logo: 'logo_small.png'
-custom_logo_url: 'https://github.com/biocorecrg/nanopore_analysis'
-custom_logo_title: 'Master of Pores'
-
-extra_fn_clean_trim:
- - '_QC'
-
-table_columns_visible:
- FastQC:
- percent_duplicates: True
-
-module_order:
- - fastqc:
- name: 'FastQC'
- info: 'This section of the report shows FastQC results on raw reads.'
-
- - alnQC:
- name: "alnQC"
- info: 'This section of the report shows alnQC results on aligned reads'
diff --git a/NanoMod/models/model2.1-mis3.del3.q3.poly.dump b/NanoMod/models/model2.1-mis3.del3.q3.poly.dump
deleted file mode 100644
index 3692593..0000000
Binary files a/NanoMod/models/model2.1-mis3.del3.q3.poly.dump and /dev/null differ
diff --git a/NanoMod/nanomod.nf b/NanoMod/nanomod.nf
deleted file mode 100644
index 0288c3c..0000000
--- a/NanoMod/nanomod.nf
+++ /dev/null
@@ -1,444 +0,0 @@
-#!/usr/bin/env nextflow
-
-
-/*
- * Define the pipeline parameters
- *
- */
-
-// Pipeline version
-version = '0.1'
-
-params.help = false
-params.resume = false
-
-log.info """
-
-╔╦╗┌─┐┌─┐┌┬┐┌─┐┬─┐ ┌─┐┌─┐ ╔═╗╔═╗╦═╗╔═╗╔═╗
-║║║├─┤└─┐ │ ├┤ ├┬┘ │ │├┤ ╠═╝║ ║╠╦╝║╣ ╚═╗
-╩ ╩┴ ┴└─┘ ┴ └─┘┴└─ └─┘└ ╩ ╚═╝╩╚═╚═╝╚═╝
-
-====================================================
-BIOCORE@CRG NanoMod. Detection of RNA modifications - N F  ~  version ${version}
-====================================================
-
-***************** Input files *******************
-input_path : ${params.input_path}
-comparison : ${params.comparison}
-
-********** reference has to be the genome *************
-reference : ${params.reference}
-output : ${params.output}
-
-coverage : ${params.coverage}
-************* tombo and epinano params ****************
-tombo_opt : ${params.tombo_opt}
-
-epinano_opt : ${params.epinano_opt}
-
-email : ${params.email}
-"""
-
-// Help and avoiding typos
-if (params.help) exit 1
-if (params.resume) exit 1, "Are you making the classic --resume typo? Nextflow's flag is -resume, with a single dash! ;)"
-
-// check input files
-reference = file(params.reference)
-if( !reference.exists() ) exit 1, "Missing reference file: ${reference}!"
-//config_report = file("$baseDir/config.yaml")
-//if( !config_report.exists() ) exit 1, "Missing config.yaml file!"
-logo = file("$baseDir/../docs/logo_small.png")
-
-model_folder = file("$baseDir/models/")
-if( !model_folder.exists() ) exit 1, "Missing folders with EpiNano's models!"
-joinScript = file("$baseDir/bin/join.r")
-
-tombo_opt = params.tombo_opt
-epinano_opt = params.epinano_opt
-
-epinanoScript = file("$baseDir/bin/epinano_scatterplot.R")
-
-// Output folders
-outputtombo = "${params.output}/Tombo"
-outputEpinano = "${params.output}/Epinano"
-//outputReport = file("${outputMultiQC}/multiqc_report.html")
-
-/*
-* move old multiQCreport
-
-if( outputReport.exists() ) {
- log.info "Moving old report to multiqc_report.html multiqc_report.html.old"
- outputReport.moveTo("${outputMultiQC}/multiqc_report.html.old")
-}
-*/
-
-compfile = file(params.comparison)
-if( !compfile.exists() ) exit 1, "Missing comparison file: ${compfile}. Specify its path with --comparison"
-
-/*
- * Creates the channels with comparisons
- */
- Channel
- .from(compfile.readLines())
- .map { line ->
- list = line.split("\t")
- if (list[0]!= "") {
- def sampleID = list[0]
- def ctrlID = list[1]
- [ sampleID, ctrlID ]
- }
- }
- .into{ id_to_tombo_fast5; id_to_tombo_idx; id_to_epinano; id_to_epinano_plots; id_for_resquiggling}
-
-
-/*
- * Creates the channels with samples and KOs for EpiNano
- */
- Channel
- .from(compfile.readLines())
- .map { line ->
- list = line.split("\t")
- if (list[0]!= "") {
- list[0]
- }
- }
- .set{ samples_for_epinano_filtering }
-
- Channel
- .from(compfile.readLines())
- .map { line ->
- list = line.split("\t")
- if (list[1]!= "") {
- list[1]
- }
- }
- .set{ ko_for_epinano_filtering }
-
-id_for_resquiggling.flatten().unique().map {
- [it, file("${params.input_path}/${it}/fast5_files/*.fast5")]
-}.transpose().set{fast5_for_resquiggling}
-
-id_to_epinano.flatten().unique().map {
- [it, file("${params.input_path}/${it}/alignment/*.bam")]
-}.transpose().set{bams_for_variant_calling}
-
-
-
-/*
-* Perform resquiggling
-*/
-
-process resquiggling {
- tag {"${idsample}-${fast5.simpleName}"}
- label 'big_mem_cpus'
-
- input:
- set idsample, file(fast5) from fast5_for_resquiggling
- file(reference)
-
- output:
- file ("*.resquiggle.failed.tsv") into failed_resquiggles
- set idsample, file ("${idsample}-${fast5.simpleName}") into singlefast5
- set idsample, file (".*.tombo.index") into tombo_indexes
-
- script:
- def infolder = "${idsample}-${fast5.simpleName}"
-
- """
- #from multifast5 to singlefast5
- mkdir ${infolder};
- multi_to_single_fast5 -i ./ -s ./ -t ${task.cpus};
- rm ./filename_mapping.txt;
- mv ./*/*.fast5 ${infolder};
- # resquiggling
- tombo resquiggle ${infolder} ${reference} --rna --processes ${task.cpus} --overwrite --failed-reads-filename ${infolder}.resquiggle.failed.tsv
- """
-}
-
-/*
-* Group data together with indexes
-*/
-singlefast5.groupTuple().into{grouped_single_fast5_A; grouped_single_fast5_B}
-id_to_tombo_fast5.combine(grouped_single_fast5_A, by: 0).map {
- [ it[1], it[0], it[2] ]
-}.combine(grouped_single_fast5_B, by: 0).map {
- [ "${it[1]}--${it[0]}", it[2], it[3]]
-}.set{fast5_for_tombo_modifications}
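-// fast5_for_tombo_modifications emits tuples of the shape
-// [ "sampleID--controlID", sample_fast5_dirs, control_fast5_dirs ].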
-
-tombo_indexes.groupTuple().into{grouped_indexes_A; grouped_indexes_B}
-id_to_tombo_idx.combine(grouped_indexes_A, by: 0).map {
- [ it[1], it[0], it[2] ]
-}.combine(grouped_indexes_B, by: 0).map {
- [ "${it[1]}--${it[0]}", it[2], it[3] ]
-}.set{idx_for_tombo_modifications}
-
-fast5_for_tombo_modifications.combine(idx_for_tombo_modifications, by: 0).set{data_for_tombo_modifications}
-
-/*
-* detect modification
-*/
-
-process getModifications {
- label 'big_mem_cpus'
- tag {"${combID}"}
- publishDir outputtombo, pattern: "*.significant_regions.fasta", mode: 'copy'
- publishDir outputtombo, pattern: "*.significant_regions.bed", mode: 'copy'
-
- input:
- file(reference)
- set val(combID), file(fast5s_A), file(fast5s_B), file(index_A), file(index_B) from data_for_tombo_modifications
-
- output:
- file ("*.significant_regions.fasta") into sign_tombo_regions
- file ("*.significant_regions.bed")
-
- script:
- def reference_cmd = unzipFile(reference, "reference.fa")
- def folder_names = "${combID}".split("--")
- def folder_name_A = folder_names[0]
- def folder_name_B = folder_names[1]
- """
- ${reference_cmd}
- mkdir ${folder_name_A} ${folder_name_B}
- mv ${fast5s_A} ${folder_name_A}
- mv ${index_A} ${folder_name_A}
- mv ${fast5s_B} ${folder_name_B}
- mv ${index_B} ${folder_name_B}
- tombo detect_modifications model_sample_compare --minimum-test-reads ${params.coverage} --fast5-basedirs ${folder_name_A}/* --control-fast5-basedirs ${folder_name_B}/* --statistics-file-basename ${folder_name_A}_${folder_name_B}_model_sample_compare --rna --per-read-statistics-basename ${folder_name_A}_${folder_name_B}_per-read-statistics --processes ${task.cpus}
- tombo text_output signif_sequence_context ${tombo_opt} --num-regions 1000000000 --statistics-filename ${folder_name_A}_${folder_name_B}_model_sample_compare.tombo.stats --genome-fasta reference.fa --fast5-basedirs ${folder_name_A} --sequences-filename ${folder_name_A}_${folder_name_B}.significant_regions.fasta
- rm reference.fa
-
- tombo_2_bed.sh ${folder_name_A}_${folder_name_B}.significant_regions.fasta ${folder_name_A}_${folder_name_B}.significant_regions.bed
- """
-}
-
-
-/*
-* Perform preprocessing for Epinano
-*/
-
-process index_reference {
-
- input:
- file(reference)
-
- output:
- set file("reference.fa"), file("*.dict"), file ("*.fai") into indexes
-
- script:
- """
- if [ `echo ${reference} | grep ".gz"` ]; then
- zcat ${reference} > reference.fa
- else
- ln -s ${reference} reference.fa
- fi
- \$PICARD CreateSequenceDictionary R=reference.fa O=reference.dict
- samtools faidx reference.fa
- """
-}
-
-
-/*
-* Calling variants for Epinano
-*/
-
-process CallVariantsForEpinano {
-
- tag {"${sampleID}"}
-
- input:
- set val(sampleID), file(alnfile) from bams_for_variant_calling
- set file(reference), file(dict_index), file(faiidx) from indexes
-
- output:
- set sampleID, file("${sampleID}.tsv") into variants_for_frequency
-
- script:
- """
- samtools view -h ${alnfile} -F 256 | \$SAM2TSV -R ${reference} | cut -f 3 --complement > ${sampleID}.tsv
- """
-}
-
-/*
-* Calc variant frequencies for Epinano
-*/
-
-process calcVarFrequenciesForEpinano {
- publishDir outputEpinano, pattern: "*.csv.gz", mode: 'copy'
- container "biocorecrg/mopepinano:0.1"
-
- tag {"${sampleID}"}
- label 'big_mem_cpus'
-
- input:
- set val(sampleID), file(tsvfile) from variants_for_frequency
-
- output:
- set val(sampleID), file("*.tsv.per.site.var.csv.gz") into per_site_vars_A, per_site_vars_B
- file("*.csv.gz")
-
- script:
- """
- TSV_to_Variants_Freq.py3 -f ${tsvfile} -t ${task.cpus}
- for i in *.csv; do gzip \$i; done
- """
-}
-
-per_site_vars_A.combine(per_site_vars_B).map {
- [ it[0], it[2], it[1], it[3] ]
-}.set{per_site_combs}
-
-per_site_combs.join(id_to_epinano_plots, by:[0,1])
-.into{per_site_for_plotsA; per_site_for_plotsB; per_site_for_plotsC}
-
-process makeEpinanoPlotsMis {
- publishDir outputEpinano, mode: 'copy'
- container "biocorecrg/mopnanotail:0.3"
-
- tag {"${sampleIDA}--${sampleIDB}"}
-
- input:
- set val(sampleIDA), val(sampleIDB), file(per_site_varA), file(per_site_varB) from per_site_for_plotsA
-
- output:
- file("*.pdf")
-
- script:
- """
- Rscript --vanilla ${epinanoScript} ${per_site_varA} ${sampleIDA} ${per_site_varB} ${sampleIDB} mis
- """
-}
-
-process makeEpinanoPlotsIns {
- publishDir outputEpinano, mode: 'copy'
- container "biocorecrg/mopnanotail:0.3"
-
- tag {"${sampleIDA}--${sampleIDB}"}
-
- input:
- set val(sampleIDA), val(sampleIDB), file(per_site_varA), file(per_site_varB) from per_site_for_plotsB
-
- output:
- file("*.pdf")
-
- script:
- """
- Rscript --vanilla ${epinanoScript} ${per_site_varA} ${sampleIDA} ${per_site_varB} ${sampleIDB} ins
- """
-}
-
-process makeEpinanoPlotsDel {
- publishDir outputEpinano, mode: 'copy'
- container "biocorecrg/mopnanotail:0.3"
-
- tag {"${sampleIDA}--${sampleIDB}"}
-
- input:
- set val(sampleIDA), val(sampleIDB), file(per_site_varA), file(per_site_varB) from per_site_for_plotsC
-
- output:
- file("*.pdf")
-
- script:
- """
- Rscript --vanilla ${epinanoScript} ${per_site_varA} ${sampleIDA} ${per_site_varB} ${sampleIDB} del
- """
-}
-
-
-//per_site_vars.map{
-// [ it[0], it[1].splitText( by: 1, keepHeader:true, file:true ) ]
-//}.transpose().set{per_site_vars_splitted}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-/*
-* functions
-*/
-// Get the name from the folder
-def getFolderName(sample) {
- folder_info = sample.toString().tokenize("/")
- return folder_info[-2]
-}
-
-
-
-
-// make named pipe
-def unzipBash(filename) {
- cmd = filename.toString()
- if (cmd[-3..-1] == ".gz") {
- cmd = "<(zcat ${filename})"
- }
- return cmd
-}
-
-def unzipFile(zipfile, filename) {
- cmd = "cp ${zipfile} ${filename}"
- filestring = zipfile.toString()
- if (filestring[-3..-1] == ".gz") {
- cmd = "zcat ${zipfile} > ${filename}"
- }
- return cmd
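-
-// For example, unzipFile(file("genome.fa.gz"), "reference.fa") returns
-// "zcat genome.fa.gz > reference.fa", while an uncompressed input falls back
-// to a plain cp command (illustrative file names).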
-}
-
-/*
-* Finish message
-*/
-workflow.onComplete {
- println "Pipeline BIOCORE@CRG Master of Pore completed!"
- println "Started at $workflow.start"
- println "Finished at $workflow.complete"
- println "Time elapsed: $workflow.duration"
- println "Execution status: ${ workflow.success ? 'OK' : 'failed' }"
-}
-
-/*
-* Mail notification
-*/
-
-if (params.email == "yourmail@yourdomain" || params.email == "") {
- log.info 'Skipping the email\n'
-}
-else {
- log.info "Sending the email to ${params.email}\n"
-
- workflow.onComplete {
-
- def msg = """\
- NanoMod module's execution summary
- ---------------------------
- Completed at: ${workflow.complete}
- Duration : ${workflow.duration}
- Success : ${workflow.success}
- workDir : ${workflow.workDir}
- exit status : ${workflow.exitStatus}
- Error report: ${workflow.errorReport ?: '-'}
- """
- .stripIndent()
-
-        sendMail(to: params.email, subject: "Master of Pores execution", body: msg)
- }
-}
diff --git a/NanoMod/nextflow.config b/NanoMod/nextflow.config
deleted file mode 100644
index a6772c6..0000000
--- a/NanoMod/nextflow.config
+++ /dev/null
@@ -1,40 +0,0 @@
-includeConfig "$baseDir/params.config"
-//includeConfig "../nextflow.global.config"
-singularity.cacheDir = "$baseDir/../singularity"
-
-
-
-process {
-errorStrategy = 'ignore'
- memory='12G'
- cache='lenient'
- queue = 'biocore-el7,long-sl7,short-sl7'
- container = 'biocorecrg/mopmod1:0.2'
- containerOptions = { workflow.containerEngine == "docker" ? '-u $(id -u):$(id -g)': null}
- withLabel: single_cpu_long {
- cpus = 1
- memory = '30G'
- time = '6h'
- }
-
- withLabel: single_cpu {
- cpus = 1
- memory = '30G'
- }
- withLabel: big_mem_cpus {
- cpus = 8
- errorStrategy = 'retry'
- memory = {30.GB + 10.GB * task.attempt}
- maxRetries = 4
- }
- withLabel: big_long_mem_cpus {
-       cpus = 8
-       time = '48h'
- errorStrategy = 'retry'
- memory = {30.GB + 10.GB * task.attempt}
- maxRetries = 4
- }
-
-}
-//singularity.enabled = true
diff --git a/NanoMod/params.config b/NanoMod/params.config
deleted file mode 100644
index 8748598..0000000
--- a/NanoMod/params.config
+++ /dev/null
@@ -1,16 +0,0 @@
-params {
- input_path = "$baseDir/../NanoPreprocess/output_sk1/"
- comparison = "$baseDir/comparison.tsv"
-
- reference = "$baseDir/../anno2/Yeast_sk1.fa.gz"
-
- output = "$baseDir/output_gen"
- coverage = 5
-
- tombo_opt = "--num-bases 5"
-
- epinano_opt = ""
-
- email = ""
-}
-
diff --git a/NanoPreprocess/bin/bam2stats.py b/NanoPreprocess/bin/bam2stats.py
deleted file mode 100755
index 12eaad6..0000000
--- a/NanoPreprocess/bin/bam2stats.py
+++ /dev/null
@@ -1,79 +0,0 @@
-#!/usr/bin/env python3
-# Report stats (mapped reads and identity to reference) from samtools stats
-# for bam file(s), ignoring secondary, supplementary and QC-failed alignments
-#
-# USAGE: bam2stats.py bam1 bam2 ... bamN
-
-import os, subprocess, sys
-
-def bam2stats(fn, flag=3840):
- """Get stats from samtools stats"""
- args = ["samtools", "stats", "-F%s"%flag, fn]
- proc = subprocess.Popen(args, stdout=subprocess.PIPE)
- k2v = {}
- for l in proc.stdout:
- l = l.decode("utf-8")
- if l.startswith('SN'):
- ldata = l[:-1].split()#; print(ldata)
- kv = [[]]
- for e in ldata[1:]:
- kv[-1].append(e)
- if e.endswith(':'):
- kv[-1][-1] = kv[-1][-1][:-1]
- kv.append([])
- k2v[" ".join(kv[0])] = kv[1]
- # convert digits to int
- for k in k2v:
- if k2v[k][0].isdigit():
- k2v[k] = int(k2v[k][0])
- # report if no reads mapped
- if not k2v['reads mapped']:
- return "No reads mapped"
- text = []
- text.append("{:,}\t{:.1f}%\t{:,}\t{:.1f}%".format(k2v['reads mapped'], 100*k2v['reads mapped']/k2v['sequences'], k2v['bases mapped (cigar)'], 100*k2v['bases mapped (cigar)']/k2v['total length']))
- text.append("{:,}\t{:,}".format(k2v['average length'], k2v['maximum length']))
- text.append("{:.2f}%".format(100-100*k2v['mismatches']/k2v['bases mapped (cigar)'], )) #"identity: %.2f%"%(100-k2v['mismatches']/k2v['bases mapped (cigar)'], ))
- return "\t".join(text)
-
-# print the header once, then one stats line per bam file
-sys.stdout.write("#File name\tMapped reads\tMap %\tBases\tBases %\tAvg read length\tMax read length\tidentity\n")
-for fn in sys.argv[1:]:
-    if os.path.isfile(fn):
-        sys.stdout.write("%s\t%s\n"%(fn, bam2stats(fn)))
-
-'''
-CHK 4691e107 9942d94c cd9ffd51
-# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.
-SN raw total sequences: 4000
-SN filtered sequences: 0
-SN sequences: 4000
-SN is sorted: 1
-SN 1st fragments: 4000
-SN last fragments: 0
-SN reads mapped: 1440
-SN reads mapped and paired: 0 # paired-end technology bit set + both mates mapped
-SN reads unmapped: 2560
-SN reads properly paired: 0 # proper-pair bit set
-SN reads paired: 0 # paired-end technology bit set
-SN reads duplicated: 0 # PCR or optical duplicate bit set
-SN reads MQ0: 726 # mapped and MQ=0
-SN reads QC failed: 0
-SN non-primary alignments: 6801
-SN total length: 136941 # ignores clipping
-SN bases mapped: 109284 # ignores clipping
-SN bases mapped (cigar): 108908 # more accurate
-SN bases trimmed: 0
-SN bases duplicated: 0
-SN mismatches: 14898 # from NM fields
-SN error rate: 1.367944e-01 # mismatches / bases mapped (cigar)
-SN average length: 34
-SN maximum length: 401
-SN average quality: 3.5
-SN insert size average: 0.0
-SN insert size standard deviation: 0.0
-SN inward oriented pairs: 0
-SN outward oriented pairs: 0
-SN pairs with other orientation: 0
-SN pairs on different chromosomes: 0
-
-'''
-
diff --git a/NanoPreprocess/bin/cmd_line_deeplexicon_caller_2019_09_12.py b/NanoPreprocess/bin/cmd_line_deeplexicon_caller_2019_09_12.py
deleted file mode 100755
index 9e527c6..0000000
--- a/NanoPreprocess/bin/cmd_line_deeplexicon_caller_2019_09_12.py
+++ /dev/null
@@ -1,677 +0,0 @@
-#!/usr/bin/env python3
-# coding: utf-8
-
-import warnings
-warnings.filterwarnings("ignore", category=FutureWarning)
-import argparse
-import sys
-# from __future__ import print_function
-import os
-from copy import deepcopy
-import re
-import csv
-import time
-import configparser
-import h5py
-import traceback
-import math
-import numpy as np
-# from PIL import Image
-import pyts
-from pyts.image import MarkovTransitionField, GramianAngularField, RecurrencePlot
-import tensorflow as tf
-import keras
-from keras.layers import Dense, Conv2D, BatchNormalization, Activation
-from keras.layers import AveragePooling2D, Input, Flatten
-from keras.optimizers import Adam
-from keras.callbacks import ModelCheckpoint, LearningRateScheduler
-from keras.callbacks import ReduceLROnPlateau
-from keras.preprocessing.image import ImageDataGenerator
-from keras.utils import multi_gpu_model
-from keras.regularizers import l2
-from keras import backend as K
-from keras.models import Model
-import pandas as pd
-from sklearn import datasets, linear_model
-from sklearn.model_selection import train_test_split
-from tensorflow.python.client import device_lib
-from keras.models import load_model
-
-# import matplotlib.pyplot as plt
-
-
-'''
-
- James M. Ferguson (j.ferguson@garvan.org.au)
- Genomic Technologies
- Garvan Institute
- Copyright 2019
-
- Tansel Ersevas ....
-
- script description
-
- ----------------------------------------------------------------------------
- version 0.0 - initial
-
- So a cutoff of: 0.4958776 for high accuracy
- and another of 0.2943664 for high recovery
-
- TODO:
- -
-
- ----------------------------------------------------------------------------
- MIT License
-
- Copyright (c) 2019 -------NAME----------
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-'''
-
-
-
-
-class MyParser(argparse.ArgumentParser):
- def error(self, message):
- sys.stderr.write('error: %s\n' % message)
- self.print_help()
- sys.exit(2)
-
-def print_verbose(message):
- '''verbose printing'''
- sys.stderr.write('info: %s\n' % message)
-
-def print_err(message):
- '''error printing'''
- sys.stderr.write('error: %s\n' % message)
-
-def read_config(filename):
- config = configparser.ConfigParser()
- config.read(filename)
- return(config)
-
-def _get_available_devices():
- local_device_protos = device_lib.list_local_devices()
- return [x.name for x in local_device_protos]
-
-def _check_available_devices():
- available_devices = _get_available_devices()
- print_verbose(available_devices)
- # Make sure requested GPUs are available or at least warn if they aren't
-    return(True)
-
-def read_model(model_name):
- # model = load_model('saved_models/' + model_name)
- model = load_model(model_name) # as a path
- model.compile(loss='categorical_crossentropy',
- optimizer=Adam(),
- metrics=['accuracy'])
- return(model)
-
-squiggle_max = 1199
-squiggle_min = 1
-input_cut = 72000 # potentially needs to be configurable
-image_size = 224
-num_classes = 4
-window = 2000
-
-def main():
- '''
- Main function
- '''
- VERSION = "0.9.0"
-
- parser = MyParser(
- description="DeePlexiCon - Demultiplex direct RNA reads")
- #group = parser.add_mutually_exclusive_group()
- parser.add_argument("-p", "--path",
- help="Top path of fast5 files to dmux")
- parser.add_argument("-t", "--type", default="multi", choices=["multi", "single"],
- help="Multi or single fast5s")
- parser.add_argument("-c", "--config",
- help="config file")
- # parser.add_argument("-g", "--gpu_list", default=1
- # help="list of gpus, 1, or [1,3,5], etc. of PCI_BUS_ID order")
- # parser.add_argument("-o", "--output",
- # help="Output directory")
- # parser.add_argument("-s", "--threshold", type=float, default=0.7,
- # help="confidence interval threshold")
- parser.add_argument("-s", "--threshold", type=float, default=0.50,
- help="probability threshold - 0.5 hi accuracy / 0.3 hi recovery")
- # populate choices with models found in saved_models/
- # parser.add_argument("-m", "--model", default="4_bc_normal.h5", choices=["4_bc_normal.h5", "model2"],
- parser.add_argument("-m", "--model",
- help="Trained model name to use")
- parser.add_argument("-b", "--batch_size", type=int, default=4000,
- help="batch size - for single fast5s")
- parser.add_argument("-V", "--version",
- help="Prints version")
- parser.add_argument("-v", "--verbose", action="store_true",
- help="Verbose output")
-
- args = parser.parse_args()
- # print help if no arguments given
- if len(sys.argv) == 1:
- parser.print_help(sys.stderr)
- sys.exit(1)
-
- if args.verbose:
- print_verbose("Verbose mode active - dumping info to stderr")
- print_verbose("DeePlexiCon: {}".format(VERSION))
- print_verbose("arg list: {}".format(args))
- if tf.test.gpu_device_name():
- print_verbose('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
- else:
- print_verbose("Please install GPU version of TF")
-
-
- # Globals
- if args.config:
- config = read_config(args.config) #TODO check config read error
-
- squiggle_max = 1199
- squiggle_min = 1
-    input_cut = 72000 # potentially needs to be configurable
- image_size = 224
- num_classes = 4
- window = 2000
- # Devices
- # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-
- # os.environ["CUDA_VISIBLE_DEVICES"] = config[deeplexicon][gpu_list] if args.config else args.gpu_list
-
- # do check devices are available, else throw and error
-
-
- # main logic
-
- # read model
-
- #TODO Check model errors
-    model = read_model(config['deeplexicon']['trained_model']) if args.config else read_model(args.model)
- labels = []
- images = []
- fast5s = {}
- stats = ""
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format("fast5", "ReadID", "Barcode", "Confidence Interval", "P_bc_1", "P_bc_2", "P_bc_3", "P_bc_4"))
- # for file in input...
- for dirpath, dirnames, files in os.walk(args.path):
- for fast5 in files:
- if fast5.endswith('.fast5'):
- fast5_file = os.path.join(dirpath, fast5)
- if args.type == "single":
-                    # everything below this is sent off in batches of N=args.batch_size
- # The signal extraction and segmentation can happen in the first step
- # read fast5 files
- readID, seg_signal = get_single_fast5_signal(fast5_file, window)
- # segment
- # array[name, signal.....]
- # seg_signal = dRNA_segmenter(signal)
- if not seg_signal:
- print_err("Segment not found for:\t{}\t{}".format(fast5_file, readID))
- continue
- # convert
- sig = np.array(seg_signal, dtype=float)
- # print(sig)
- img = convert_to_image(sig)
- # print(img)
-                    labels.append(readID)
- # print_verbose(labels)
- fast5s[readID] = fast5
- # print_verbose(readID)
- # print_verbose(fast5)
- # print_verbose(fast5s)
- images.append(img)
- # sys.exit()
- # classify
- if len(labels) >= args.batch_size:
- C = classify(model, labels, np.array(images), False, args.threshold)
- # out = classify(model, readID, np.array([img]), False, 0.0)
- # print_verbose(C)
- # save to output
- for readID, out, c, P in C:
- prob = [round(float(i), 6) for i in P]
- cm = round(float(c), 4)
- if args.verbose:
- print_verbose("cm is: {}".format(cm))
- if out == 0:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_1", cm, prob[0], prob[1], prob[2], prob[3]))
- elif out == 1:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_2", cm, prob[0], prob[1], prob[2], prob[3]))
- elif out == 2:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_3", cm, prob[0], prob[1], prob[2], prob[3]))
- elif out == 3:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_4", cm, prob[0], prob[1], prob[2], prob[3]))
- else:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "unknown", cm, prob[0], prob[1], prob[2], prob[3]))
- labels = []
- images = []
- fast5s = {}
-
-
- elif args.type == "multi":
-                    # everything below this is sent off in batches of N=args.batch_size
- # The signal extraction and segmentation can happen in the first step
- # read fast5 files
- seg_signal = get_multi_fast5_signal(fast5_file, window)
- # segment
- # array[name, signal.....]
- # seg_signal = dRNA_segmenter(signal)
- # print_verbose(list(seg_signal.keys()))
- sig_count = 0
- for readID in seg_signal:
- # convert
- img = convert_to_image(np.array(seg_signal[readID], dtype=float))
-                        labels.append(readID)
- images.append(img)
- fast5s[readID] = fast5
- sig_count += 1
- if len(labels) >= args.batch_size:
- C = classify(model, labels, np.array(images), False, args.threshold)
- # out = classify(model, readID, np.array([img]), False, 0.0)
- # print_verbose(C)
- # save to output
- for readID, out, c, P in C:
- prob = [round(float(i), 6) for i in P]
- cm = round(float(c), 4)
- if args.verbose:
- print_verbose("cm is: {}".format(cm))
- if out == 0:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_1", cm, prob[0], prob[1], prob[2], prob[3]))
- elif out == 1:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_2", cm, prob[0], prob[1], prob[2], prob[3]))
- elif out == 2:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_3", cm, prob[0], prob[1], prob[2], prob[3]))
- elif out == 3:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_4", cm, prob[0], prob[1], prob[2], prob[3]))
- else:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "unknown", cm, prob[0], prob[1], prob[2], prob[3]))
- labels = []
- images = []
- fast5s = {}
- elif args.verbose:
- print_verbose("analysing sig_count: {}/{}".format(sig_count, len(seg_signal)))
- else:
-                    pass
- #finish up
- C = classify(model, labels, np.array(images), False, args.threshold)
- # out = classify(model, readID, np.array([img]), False, 0.0)
- # print_verbose(C)
- # save to output
- for readID, out, c, P in C:
- prob = [round(float(i), 6) for i in P]
- cm = round(float(c), 4)
- if args.verbose:
- print_verbose("cm is: {}".format(cm))
- if out == 0:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_1", cm, prob[0], prob[1], prob[2], prob[3]))
- elif out == 1:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_2", cm, prob[0], prob[1], prob[2], prob[3]))
- elif out == 2:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_3", cm, prob[0], prob[1], prob[2], prob[3]))
- elif out == 3:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_4", cm, prob[0], prob[1], prob[2], prob[3]))
- else:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "unknown", cm, prob[0], prob[1], prob[2], prob[3]))
- labels = []
- images = []
- fast5s = {}
-
-
-
-
-
-
-
- # final report/stats
- # print stats
-
-# file handling and segmentation
-
-def get_single_fast5_signal(read_filename, w):
- '''
-    open a single fast5 file and extract information
- '''
-
- # get readID and signal
- f5_dic = read_single_fast5(read_filename)
- if not f5_dic:
- print_err("Signal not extracted from: {}".format(read_filename))
- return 0, 0
- # segment on raw
- readID = f5_dic['readID']
- signal = f5_dic['signal']
-    seg = dRNA_segmenter(readID, signal, w)
- if not seg:
- print_verbose("No segment found - skipping: {}".format(readID))
- return 0, 0
- # convert to pA
- pA_signal = convert_to_pA(f5_dic)
- # return signal/signals
- return readID, pA_signal[seg[0]:seg[1]]
-
-
-def get_multi_fast5_signal(read_filename, w):
- '''
-    open a multi-read fast5 file and extract information
- '''
- pA_signals = {}
- f5_dic = read_multi_fast5(read_filename)
- # print_verbose(list(f5_dic.keys()))
- seg = 0
- sig_count = 0
- for read in f5_dic:
- sig_count += 1
- print_verbose("reading sig_count: {}/{}".format(sig_count, len(f5_dic)))
- # get readID and signal
- # print_verbose("read: {}".format(read))
- readID = f5_dic[read]['readID']
- # print_verbose("readID: {}".format(readID))
- signal = f5_dic[read]['signal']
- # print_verbose("sig: {}".format(signal[:5]))
- # continue
-
-
-
- # segment on raw
- seg = dRNA_segmenter(readID, signal, w)
- # print_verbose("seg return: {}".format(seg))
- if not seg:
- # print_verbose("No segment found - skipping: {}".format(readID))
- seg = 0
- continue
- # convert to pA
- pA_signal = convert_to_pA(f5_dic[read])
- # print_verbose("pA: {}".format(pA_signal[:5]))
- pA_signals[readID] = pA_signal[seg[0]:seg[1]]
- # return signal/signals
- return pA_signals
-
-
-def read_single_fast5(filename):
- '''
- read single fast5 file and return data
- '''
- f5_dic = {'signal': [], 'readID': '', 'digitisation': 0.0,
- 'offset': 0.0, 'range': 0.0, 'sampling_rate': 0.0}
-
- # open fast5 file
- try:
- hdf = h5py.File(filename, 'r')
- except:
- traceback.print_exc()
- print_err("extract_fast5():fast5 file failed to open: {}".format(filename))
- f5_dic = {}
- return f5_dic
- try:
- c = list(hdf['Raw/Reads'].keys())
- for col in hdf['Raw/Reads/'][c[0]]['Signal'][()]:
- f5_dic['signal'].append(int(col))
-
-
- f5_dic['readID'] = hdf['Raw/Reads/'][c[0]].attrs['read_id'].decode()
- f5_dic['digitisation'] = hdf['UniqueGlobalKey/channel_id'].attrs['digitisation']
- f5_dic['offset'] = hdf['UniqueGlobalKey/channel_id'].attrs['offset']
- f5_dic['range'] = float("{0:.2f}".format(hdf['UniqueGlobalKey/channel_id'].attrs['range']))
- f5_dic['sampling_rate'] = hdf['UniqueGlobalKey/channel_id'].attrs['sampling_rate']
-
- except:
- traceback.print_exc()
- print_err("extract_fast5():failed to extract events or fastq from: {}".format(filename))
- f5_dic = {}
-
- return f5_dic
-
-
-def read_multi_fast5(filename):
- '''
- read multifast5 file and return data
- '''
- f5_dic = {}
- # c = 0
- with h5py.File(filename, 'r') as hdf:
- for read in list(hdf.keys()):
- # c += 1
- # if c > 100:
- # break
- f5_dic[read] = {'signal': [], 'readID': '', 'digitisation': 0.0,
- 'offset': 0.0, 'range': 0.0, 'sampling_rate': 0.0}
- try:
- for col in hdf[read]['Raw/Signal'][()]:
- f5_dic[read]['signal'].append(int(col))
-
- f5_dic[read]['readID'] = hdf[read]['Raw'].attrs['read_id'].decode()
- f5_dic[read]['digitisation'] = hdf[read]['channel_id'].attrs['digitisation']
- f5_dic[read]['offset'] = hdf[read]['channel_id'].attrs['offset']
- f5_dic[read]['range'] = float("{0:.2f}".format(hdf[read]['channel_id'].attrs['range']))
- f5_dic[read]['sampling_rate'] = hdf[read]['channel_id'].attrs['sampling_rate']
- except:
- traceback.print_exc()
- print_err("extract_fast5():failed to read readID: {}".format(read))
- return f5_dic
-
-
-def dRNA_segmenter(readID, signal, w):
- '''
- segment signal/s and return coords of cuts
- '''
- def _scale_outliers(squig):
-        ''' Drop raw samples outside the valid (0, 1200) signal range '''
- k = (squig > 0) & (squig < 1200)
- return squig[k]
-
-
- sig = _scale_outliers(np.array(signal, dtype=int))
-
- s = pd.Series(sig)
- t = s.rolling(window=w).mean()
- # This should be done better, or changed to median and benchmarked
- # Currently trained on mean segmented data
- mn = t.mean()
- std = t.std()
- # Trained on 0.5
- bot = mn - (std*0.5)
-
-
-
- # main algo
-
- begin = False
- # max distance for merging 2 segs
- seg_dist = 1500
- # max length of a seg
- hi_thresh = 200000
- # min length of a seg
- lo_thresh = 2000
-
- start = 0
- end = 0
- segs = []
- count = -1
- for i in t:
- count += 1
- if i < bot and not begin:
- start = count
- begin = True
- elif i < bot:
- end = count
- elif i > bot and begin:
- if segs and start - segs[-1][1] < seg_dist:
- segs[-1][1] = end
- else:
- segs.append([start, end])
- start = 0
- end = 0
- begin = False
- else:
- continue
-
- offset = -1050
- buff = 150
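-
-    # Assumption: the fixed -1050 offset roughly compensates for the lag of the
-    # w-sample rolling mean (w=2000 by default), and `buff` pads each side of
-    # the reported segment.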
-
- # fig = plt.figure(1)
- # #fig.subplots_adjust(hspace=0.1, wspace=0.01)
- # ax = fig.add_subplot(111)
- #
- # print_verbose("segs: {}".format(segs))
- # # # Show segment lines
- # for i, j in segs:
- # ax.axvline(x=i+offset-buff, color='m')
- # ax.axvline(x=j+offset+buff, color='m')
- #
- # ax.axhline(y=bot, color='r')
- # plt.plot(sig, color='teal')
- # plt.plot(t, color='k')
- # plt.show()
- # plt.clf()
-
- x, y = 0, 0
-
- # print_verbose("segs befor filter: {}".format(segs))
- for a, b in segs:
- if b - a > hi_thresh:
- continue
- if b - a < lo_thresh:
- continue
- x, y = a, b
- # print "{}\t{}\t{}\t{}".format(f5, read_dic[f5], x, y)
- # print_verbose("xy before return: [{},{}]".format(x, y))
- return [x+offset-buff, y+offset+buff]
- break
- print_verbose("dRNA_segmenter: no seg found: {}".format(readID))
- return 0
-
-
-def convert_to_pA(d):
- '''
- convert raw signal data to pA using digitisation, offset, and range
- float raw_unit = range / digitisation;
- for (int32_t j = 0; j < nsample; j++) {
- rawptr[j] = (rawptr[j] + offset) * raw_unit;
- }
- '''
- digitisation = d['digitisation']
- range = d['range']
- offset = d['offset']
- raw_unit = range / digitisation
- new_raw = []
- for i in d['signal']:
- j = (i + offset) * raw_unit
- new_raw.append("{0:.2f}".format(round(j,2)))
- return new_raw
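-
-# Illustrative numbers: with digitisation 8192, range 1467.61 and offset 10,
-# a raw sample of 500 converts to (500 + 10) * (1467.61 / 8192) ~= 91.37 pA.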
-
-# Python timeseries interface
-
-
-def pyts_transform(transform, data, image_size, show=False, cmap='rainbow', img_index=0):
- try:
- t_start=time.time()
- X_transform = transform.fit_transform(data)
- # if args.verbose:
- # print_verbose("{} {}".format(time.time() - t_start, 'seconds'))
- if (show):
- plt.figure(figsize=(4, 4))
- plt.grid(b=None)
-            plt.imshow(X_transform[0], cmap=cmap, origin='lower')
- plt.savefig(transform.__class__.__name__ + "_image_" + str(img_index) + ".svg", format="svg")
- plt.show()
- return(X_transform)
- except Exception as e:
- print_err(str(e))
- return([])
-
-
-def mtf_transform(data, image_size=500, show=False, img_index=0):
- transform = MarkovTransitionField(image_size)
- return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='rainbow', img_index=img_index))
-
-def rp_transform(data, image_size=500 ,show=False ,img_index=0):
-    # RP transformation
- transform = RecurrencePlot(dimension=1,
- threshold='percentage_points',
- percentage=30)
- return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='binary', img_index=img_index))
-
-def gasf_transform(data, image_size=500, show=False, img_index=0):
- # GAF transformation
- transform = GramianAngularField(image_size, method='summation')
- return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='rainbow', img_index=img_index))
-
-def gadf_transform(data, image_size=500, show=False ,img_index=0):
- # GAF transformation
- transform = GramianAngularField(image_size, method='difference')
- return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='rainbow', img_index=img_index))
-
-
-def labels_for(a_file_name):
- segments=re.split(r'[_\-\.]+', a_file_name)
- return(segments)
-
-def max_in_sequence(sequence):
- return(max(np.amax([list(d.values()) for d in sequence]), 0.01))
-
-def compress_squiggle(squiggle, compress_factor):
- squiggle_len = len(squiggle)
- rem = squiggle_len % compress_factor
- if rem > 0:
- return(np.mean(squiggle[0:squiggle_len - rem].reshape(-1,compress_factor), axis=1))
- return(squiggle)
-
-def convert_to_image(signal):
- transformed_squiggle = gasf_transform(signal.reshape(1,-1), image_size=image_size, show=False)
- return(transformed_squiggle)
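-
-# convert_to_image() renders the 1-D squiggle as a 224x224 Gramian Angular
-# Summation Field image, the representation fed to the CNN classifier.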
-
-
-def confidence_margin(npa):
- sorted = np.sort(npa)[::-1] #return sort in reverse, i.e. descending
- # sorted = np.sort(npa) #return sort in reverse, i.e. descending
- d = sorted[0] - sorted[1]
- # if args.verbose:
- # print_verbose("Sorted: {}".format(sorted))
- # print_verbose("conf interval: {}".format(d))
- # return(sorted[0][0] - sorted[0][1])
- return(d)
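-
-# Illustrative: confidence_margin(np.array([0.90, 0.05, 0.03, 0.02])) returns
-# 0.90 - 0.05 = 0.85, the gap between the two highest class probabilities.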
-
-def classify(model, labels, image, subtract_pixel_mean, threshold):
- input_shape = image.shape[1:]
- x = image.astype('float32') / 255
-
- # If subtract pixel mean is enabled
- if subtract_pixel_mean:
- x_mean = np.mean(x, axis=0)
- x -= x_mean
- x=[x]
- # print(x)
- y = model.predict(x, verbose=0)
- # print(y)
- res = []
- for i in range(len(y)):
- cm = confidence_margin(y[i])
- #print_verbose(y[i][np.argmax(y[i])])
- if y[i][np.argmax(y[i])] >= threshold:
- # return(np.argmax(y))
- res.append([labels[i], np.argmax(y[i]), cm, y[i]])
- # return(None)
- # return(np.argmax(y))
- else:
- res.append([labels[i], None, cm, y[i]])
- return res
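-
-# Note: predictions whose top probability falls below `threshold` are returned
-# with barcode index None, which main() reports as "unknown".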
-
-if __name__ == '__main__':
- main()
diff --git a/NanoPreprocess/bin/deeplexicon.py b/NanoPreprocess/bin/deeplexicon.py
deleted file mode 100755
index c58b3b6..0000000
--- a/NanoPreprocess/bin/deeplexicon.py
+++ /dev/null
@@ -1,622 +0,0 @@
-#!/usr/bin/env python3
-# coding: utf-8
-
-import warnings
-warnings.filterwarnings("ignore", category=FutureWarning)
-import argparse
-import sys
-import os
-from copy import deepcopy
-import re
-import csv
-import time
-import configparser
-import h5py
-import traceback
-import math
-import numpy as np
-# from PIL import Image
-import pyts
-from pyts.image import MarkovTransitionField, GramianAngularField, RecurrencePlot
-import tensorflow as tf
-import keras
-from keras.layers import Dense, Conv2D, BatchNormalization, Activation
-from keras.layers import AveragePooling2D, Input, Flatten
-from keras.optimizers import Adam
-from keras.callbacks import ModelCheckpoint, LearningRateScheduler
-from keras.callbacks import ReduceLROnPlateau
-from keras.preprocessing.image import ImageDataGenerator
-from keras.utils import multi_gpu_model
-from keras.regularizers import l2
-from keras import backend as K
-from keras.models import Model
-import pandas as pd
-from sklearn import datasets, linear_model
-from sklearn.model_selection import train_test_split
-from tensorflow.python.client import device_lib
-from keras.models import load_model
-
-
-
-'''
-
- James M. Ferguson (j.ferguson@garvan.org.au)
- Genomic Technologies
- Garvan Institute
- Copyright 2019
-
- Tansel Ersevas (t.ersevas@garvan.org.au)
-
- script description
-
-
-
- ----------------------------------------------------------------------------
- version 0.0.0 - initial
- version 0.8.0 - CPU version Done
- version 0.9.0 - Fixed segment offset
- version 0.9.1 - added segment and squiggle output
- version 0.9.2 - separate segment output and code clean up
- version 1.0.0 - initial release
-
- So a cutoff of: 0.4958776 for high accuracy
- and another of 0.2943664 for high recovery
-
- TODO:
- - Remove leftover libraries
- - remove debug plots
- - Remove redundant code
- - create log files with information
- - take in fastq for dmux splitting
- - take in paf or bam for training splitting
-
-
- ----------------------------------------------------------------------------
- MIT License
-
- Copyright (c) 2019 James M. Ferguson
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-'''
-
-
-
-
-class MyParser(argparse.ArgumentParser):
- def error(self, message):
- sys.stderr.write('error: %s\n' % message)
- self.print_help()
- sys.exit(2)
-
-def print_verbose(message):
- '''verbose printing'''
- sys.stderr.write('info: %s\n' % message)
-
-def print_err(message):
- '''error printing'''
- sys.stderr.write('error: %s\n' % message)
-
-def read_config(filename):
- config = configparser.ConfigParser()
- config.read(filename)
- return(config)
-
-def _get_available_devices():
- local_device_protos = device_lib.list_local_devices()
- return [x.name for x in local_device_protos]
-
-def _check_available_devices():
- available_devices = _get_available_devices()
- print_verbose(available_devices)
- # Make sure requested GPUs are available or at least warn if they aren't
-    return(True)
-
-def read_model(model_name):
- # model = load_model('saved_models/' + model_name)
- model = load_model(model_name) # as a path
- model.compile(loss='categorical_crossentropy',
- optimizer=Adam(),
- metrics=['accuracy'])
- return(model)
-
-squiggle_max = 1199
-squiggle_min = 1
-input_cut = 72000  # potentially needs to be configurable
-image_size = 224
-num_classes = 4
-window = 2000
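-# These module-level defaults are shared by the functions below: the 224x224
-# transform images and 4 barcode classes match the UResNet20v2 models shipped
-# in NanoPreprocess/deeplexicon/, and window is the rolling-mean width (in raw
-# samples) used by dRNA_segmenter.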
-
-def main():
- '''
- Main function
- '''
- VERSION = "1.0.0"
-
- parser = MyParser(
- description="DeePlexiCon - Demultiplex direct RNA reads")
- #group = parser.add_mutually_exclusive_group()
- parser.add_argument("-p", "--path",
- help="Top path of fast5 files to dmux")
- parser.add_argument("-f", "--form", default="multi", choices=["multi", "single"],
- help="Multi or single fast5s")
- parser.add_argument("-c", "--config",
- help="config file")
- # parser.add_argument("-g", "--gpu_list", default=1
- # help="list of gpus, 1, or [1,3,5], etc. of PCI_BUS_ID order")
- # parser.add_argument("-o", "--output",
- # help="Output directory")
- parser.add_argument("-s", "--threshold", type=float, default=0.50,
- help="probability threshold - 0.5 hi accuracy / 0.3 hi recovery")
- # populate choices with models found in saved_models/
- # parser.add_argument("-m", "--model", default="4_bc_normal.h5", choices=["4_bc_normal.h5", "model2"],
- parser.add_argument("-m", "--model",
- help="Trained model name to use")
- parser.add_argument("--squiggle",
- help="dump squiggle data into this .tsv file")
- parser.add_argument("--segment",
- help="dump segment data into this .tsv file")
- parser.add_argument("-b", "--batch_size", type=int, default=4000,
- help="batch size - for single fast5s")
- parser.add_argument("-V", "--version",
- help="Prints version")
- parser.add_argument("-v", "--verbose", action="store_true",
- help="Verbose output")
-
- args = parser.parse_args()
- # print help if no arguments given
- if len(sys.argv) == 1:
- parser.print_help(sys.stderr)
- sys.exit(1)
-
- if args.verbose:
- print_verbose("Verbose mode active - dumping info to stderr")
- print_verbose("DeePlexiCon: {}".format(VERSION))
- print_verbose("arg list: {}".format(args))
- if tf.test.gpu_device_name():
- print_verbose('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
- else:
- print_verbose("Please install GPU version of TF")
-
- if args.squiggle:
- squig_file = args.squiggle
- with open(squig_file, 'a') as f:
- f.write("{}\t{}\n".format("ReadID", "signal_pA"))
- else:
- squig_file = ''
-
- if args.segment:
- seg_file = args.segment
- with open(seg_file, 'a') as f:
- f.write("{}\t{}\t{}\n".format("ReadID", "start", "stop"))
- else:
- seg_file = ''
-
- # Globals
- if args.config:
- config = read_config(args.config) #TODO check config read error
-
- # gpu settings
- # Devices
- # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-
- # os.environ["CUDA_VISIBLE_DEVICES"] = config[deeplexicon][gpu_list] if args.config else args.gpu_list
-
-    # TODO: check that the requested devices are available, else throw an error
-
-
- # main logic
-
- # read model
-
-    model = read_model(config['deeplexicon']['trained_model']) if args.config else read_model(args.model)
- barcode_out = {0: "bc_1",
- 1: "bc_2",
- 2: "bc_3",
- 3: "bc_4",
- None: "unknown"
- }
- labels = []
- images = []
- fast5s = {}
- stats = ""
- seg_dic = {}
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format("fast5", "ReadID", "Barcode", "Confidence Interval", "P_bc_1", "P_bc_2", "P_bc_3", "P_bc_4"))
- # for file in input...
- for dirpath, dirnames, files in os.walk(args.path):
- for fast5 in files:
- if fast5.endswith('.fast5'):
- fast5_file = os.path.join(dirpath, fast5)
- if args.form == "single":
-                    # everything below this is sent off in batches of N=args.batch_size
- # The signal extraction and segmentation can happen in the first step
- # read fast5 files
- readID, seg_signal = get_single_fast5_signal(fast5_file, window, squig_file, seg_file)
- if not seg_signal:
- print_err("Segment not found for:\t{}\t{}".format(fast5_file, readID))
- continue
- # convert
- sig = np.array(seg_signal, dtype=float)
- img = convert_to_image(sig)
- labels.append(readID)
- fast5s[readID] = fast5
- images.append(img)
- # classify
- if len(labels) >= args.batch_size:
- C = classify(model, labels, np.array(images), False, args.threshold)
- # save to output
- for readID, out, c, P in C:
- prob = [round(float(i), 6) for i in P]
- cm = round(float(c), 4)
- if args.verbose:
- print_verbose("cm is: {}".format(cm))
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, barcode_out[out], cm, prob[0], prob[1], prob[2], prob[3]))
- labels = []
- images = []
- fast5s = {}
-
-
- elif args.form == "multi":
-                    # everything below this is sent off in batches of N=args.batch_size
- # The signal extraction and segmentation can happen in the first step
- # read fast5 files
- seg_signal = get_multi_fast5_signal(fast5_file, window, squig_file, seg_file)
- sig_count = 0
- for readID in seg_signal:
- # convert
- img = convert_to_image(np.array(seg_signal[readID], dtype=float))
- labels.append(readID)
- images.append(img)
- fast5s[readID] = fast5
- sig_count += 1
- if len(labels) >= args.batch_size:
- C = classify(model, labels, np.array(images), False, args.threshold)
- # save to output
- for readID, out, c, P in C:
- prob = [round(float(i), 6) for i in P]
- cm = round(float(c), 4)
- if args.verbose:
- print_verbose("cm is: {}".format(cm))
-                                print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, barcode_out[out], cm, prob[0], prob[1], prob[2], prob[3]))
- labels = []
- images = []
- fast5s = {}
- elif args.verbose:
- print_verbose("analysing sig_count: {}/{}".format(sig_count, len(seg_signal)))
- else:
-                            pass  # batch not full yet; keep accumulating reads
- #finish up
- C = classify(model, labels, np.array(images), False, args.threshold)
- # save to output
- for readID, out, c, P in C:
- prob = [round(float(i), 6) for i in P]
- cm = round(float(c), 4)
- if args.verbose:
- print_verbose("cm is: {}".format(cm))
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, barcode_out[out], cm, prob[0], prob[1], prob[2], prob[3]))
- labels = []
- images = []
- fast5s = {}
-
-
- # final report/stats
- # print stats
-
-# file handling and segmentation
-
-def get_single_fast5_signal(read_filename, w, squig_file, seg_file):
- '''
-    open a single fast5 file and extract its information
- '''
-
- # get readID and signal
- f5_dic = read_single_fast5(read_filename)
- if not f5_dic:
- print_err("Signal not extracted from: {}".format(read_filename))
- return 0, 0
- # segment on raw
- readID = f5_dic['readID']
- signal = f5_dic['signal']
- seg = dRNA_segmenter(readID, signal, w)
- if not seg:
- print_verbose("No segment found - skipping: {}".format(readID))
- return 0, 0
- # convert to pA
- pA_signal = convert_to_pA(f5_dic)
- if squig_file:
- with open(squig_file, 'a') as f:
- f.write("{}\t{}\n".format(readID, "\t".join(pA_signal)))
- if seg_file:
- with open(seg_file, 'a') as f:
- f.write("{}\t{}\t{}\n".format(readID, seg[0], seg[1]))
- # return signal/signals
- return readID, pA_signal[seg[0]:seg[1]]
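-# Returns (readID, pA_signal[start:stop]) on success, or (0, 0) when the
-# signal cannot be extracted or no segment is found (callers test the second
-# element for truthiness).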
-
-
-def get_multi_fast5_signal(read_filename, w, squig_file, seg_file):
- '''
- open multi fast5 files and extract information
- '''
- pA_signals = {}
- seg_dic = {}
- f5_dic = read_multi_fast5(read_filename)
- seg = 0
- sig_count = 0
- for read in f5_dic:
- sig_count += 1
- print_verbose("reading sig_count: {}/{}".format(sig_count, len(f5_dic)))
- # get readID and signal
- readID = f5_dic[read]['readID']
- signal = f5_dic[read]['signal']
-
- # segment on raw
- seg = dRNA_segmenter(readID, signal, w)
- if not seg:
- seg = 0
- continue
- # convert to pA
- pA_signal = convert_to_pA(f5_dic[read])
- if squig_file:
- with open(squig_file, 'a') as f:
- f.write("{}\t{}\n".format(readID, "\t".join(pA_signal)))
- if seg_file:
- with open(seg_file, 'a') as f:
- f.write("{}\t{}\t{}\n".format(readID, seg[0], seg[1]))
- pA_signals[readID] = pA_signal[seg[0]:seg[1]]
- seg_dic[readID] = seg
- # return signal/signals
- return pA_signals
-
-
-def read_single_fast5(filename):
- '''
- read single fast5 file and return data
- '''
- f5_dic = {'signal': [], 'readID': '', 'digitisation': 0.0,
- 'offset': 0.0, 'range': 0.0, 'sampling_rate': 0.0}
-
- # open fast5 file
- try:
- hdf = h5py.File(filename, 'r')
- except:
- traceback.print_exc()
- print_err("extract_fast5():fast5 file failed to open: {}".format(filename))
- f5_dic = {}
- return f5_dic
- try:
- c = list(hdf['Raw/Reads'].keys())
- for col in hdf['Raw/Reads/'][c[0]]['Signal'][()]:
- f5_dic['signal'].append(int(col))
-
- f5_dic['readID'] = hdf['Raw/Reads/'][c[0]].attrs['read_id'].decode()
- f5_dic['digitisation'] = hdf['UniqueGlobalKey/channel_id'].attrs['digitisation']
- f5_dic['offset'] = hdf['UniqueGlobalKey/channel_id'].attrs['offset']
- f5_dic['range'] = float("{0:.2f}".format(hdf['UniqueGlobalKey/channel_id'].attrs['range']))
- f5_dic['sampling_rate'] = hdf['UniqueGlobalKey/channel_id'].attrs['sampling_rate']
-
- except:
- traceback.print_exc()
- print_err("extract_fast5():failed to extract events or fastq from: {}".format(filename))
- f5_dic = {}
-
- return f5_dic
-
-
-def read_multi_fast5(filename):
- '''
- read multifast5 file and return data
- '''
- f5_dic = {}
- with h5py.File(filename, 'r') as hdf:
- for read in list(hdf.keys()):
- f5_dic[read] = {'signal': [], 'readID': '', 'digitisation': 0.0,
- 'offset': 0.0, 'range': 0.0, 'sampling_rate': 0.0}
- try:
- for col in hdf[read]['Raw/Signal'][()]:
- f5_dic[read]['signal'].append(int(col))
-
- f5_dic[read]['readID'] = hdf[read]['Raw'].attrs['read_id'].decode()
- f5_dic[read]['digitisation'] = hdf[read]['channel_id'].attrs['digitisation']
- f5_dic[read]['offset'] = hdf[read]['channel_id'].attrs['offset']
- f5_dic[read]['range'] = float("{0:.2f}".format(hdf[read]['channel_id'].attrs['range']))
- f5_dic[read]['sampling_rate'] = hdf[read]['channel_id'].attrs['sampling_rate']
- except:
- traceback.print_exc()
- print_err("extract_fast5():failed to read readID: {}".format(read))
- return f5_dic
-
-
-def dRNA_segmenter(readID, signal, w):
- '''
- segment signal/s and return coords of cuts
- '''
- def _scale_outliers(squig):
-        ''' Drop outlier samples outside the plausible raw range (0, 1200) '''
- k = (squig > 0) & (squig < 1200)
- return squig[k]
-
-
- sig = _scale_outliers(np.array(signal, dtype=int))
-
- s = pd.Series(sig)
- t = s.rolling(window=w).mean()
- # This should be done better, or changed to median and benchmarked
- # Currently trained on mean segmented data
- mn = t.mean()
- std = t.std()
- # Trained on 0.5
- bot = mn - (std*0.5)
-
- # main algo
-
- begin = False
- # max distance for merging 2 segs
- seg_dist = 1500
- # max length of a seg
- hi_thresh = 200000
- # min length of a seg
- lo_thresh = 2000
-
- start = 0
- end = 0
- segs = []
- count = -1
- for i in t:
- count += 1
- if i < bot and not begin:
- start = count
- begin = True
- elif i < bot:
- end = count
- elif i > bot and begin:
- if segs and start - segs[-1][1] < seg_dist:
- segs[-1][1] = end
- else:
- segs.append([start, end])
- start = 0
- end = 0
- begin = False
- else:
- continue
-
- # offset = -1050
- # buff = 150
- offset = -1000
- buff = 0
-
- x, y = 0, 0
-
- for a, b in segs:
- if b - a > hi_thresh:
- continue
- if b - a < lo_thresh:
- continue
- x, y = a, b
-
- # to be modified in next major re-training
- return [x+offset-buff, y+offset+buff]
- break
- print_verbose("dRNA_segmenter: no seg found: {}".format(readID))
- return 0
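-# Scale note (an assumption, not read from the fast5): at the ~3 kHz sampling
-# rate typical of direct RNA runs, window=2000 samples is ~0.7 s of signal,
-# and lo_thresh/hi_thresh bound accepted segments to roughly 0.7-66 s.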
-
-
-def convert_to_pA(d):
- '''
- convert raw signal data to pA using digitisation, offset, and range
- float raw_unit = range / digitisation;
- for (int32_t j = 0; j < nsample; j++) {
- rawptr[j] = (rawptr[j] + offset) * raw_unit;
- }
- '''
-    digitisation = d['digitisation']
-    sig_range = d['range']
-    offset = d['offset']
-    raw_unit = sig_range / digitisation
- new_raw = []
- for i in d['signal']:
- j = (i + offset) * raw_unit
- new_raw.append("{0:.2f}".format(round(j,2)))
- return new_raw
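-# Worked example with illustrative channel values: digitisation=8192,
-# range=1402.88, offset=10 give raw_unit ~= 0.1713, so a raw sample of 500
-# becomes (500 + 10) * 0.1713 ~= 87.34 pA.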
-
-
-def pyts_transform(transform, data, image_size, show=False, cmap='rainbow', img_index=0):
-    try:
-        t_start = time.time()
-        X_transform = transform.fit_transform(data)
-        if show:
-            import matplotlib.pyplot as plt  # only needed for debug plotting
-            plt.figure(figsize=(4, 4))
-            plt.grid(b=None)
-            plt.imshow(X_transform[0], cmap=cmap, origin='lower')
-            plt.savefig(transform.__class__.__name__ + "_image_" + str(img_index) + ".svg", format="svg")
-            plt.show()
-        return(X_transform)
-    except Exception as e:
-        print_err(str(e))
-        return([])
-
-
-def mtf_transform(data, image_size=500, show=False, img_index=0):
- transform = MarkovTransitionField(image_size)
- return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='rainbow', img_index=img_index))
-
-def rp_transform(data, image_size=500 ,show=False ,img_index=0):
-    # RP transformation
- transform = RecurrencePlot(dimension=1,
- threshold='percentage_points',
- percentage=30)
- return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='binary', img_index=img_index))
-
-def gasf_transform(data, image_size=500, show=False, img_index=0):
- # GAF transformation
- transform = GramianAngularField(image_size, method='summation')
- return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='rainbow', img_index=img_index))
-
-def gadf_transform(data, image_size=500, show=False ,img_index=0):
- # GAF transformation
- transform = GramianAngularField(image_size, method='difference')
- return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='rainbow', img_index=img_index))
-
-
-def labels_for(a_file_name):
- segments=re.split(r'[_\-\.]+', a_file_name)
- return(segments)
-
-def max_in_sequence(sequence):
- return(max(np.amax([list(d.values()) for d in sequence]), 0.01))
-
-def compress_squiggle(squiggle, compress_factor):
-    # mean-pool the squiggle in blocks of compress_factor samples, dropping
-    # any trailing remainder that does not fill a complete block
-    squiggle_len = len(squiggle)
-    rem = squiggle_len % compress_factor
-    if rem > 0:
-        squiggle = squiggle[0:squiggle_len - rem]
-    return(np.mean(squiggle.reshape(-1, compress_factor), axis=1))
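-# e.g. compress_squiggle(np.arange(10.0), 3) -> array([1., 4., 7.]): nine of
-# the ten samples form three blocks of three, and the remainder is dropped.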
-
-def convert_to_image(signal):
- transformed_squiggle = gasf_transform(signal.reshape(1,-1), image_size=image_size, show=False)
- return(transformed_squiggle)
-
-
-def confidence_margin(npa):
-    probs = np.sort(npa)[::-1]  # sort descending (avoids shadowing builtin sorted)
-    d = probs[0] - probs[1]
-    return(d)
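-# e.g. confidence_margin(np.array([0.70, 0.20, 0.05, 0.05])) -> 0.50, the gap
-# between the best and second-best class probabilities.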
-
-def classify(model, labels, image, subtract_pixel_mean, threshold):
- input_shape = image.shape[1:]
- # x = image.astype('float32') / 255
- x = image.astype('float32') + 1
- x = x / 2
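-    # GASF pixel values lie in [-1, 1]; the +1 and /2 above rescale them to
-    # [0, 1] before prediction (hence no /255 as for 8-bit images).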
-
- # If subtract pixel mean is enabled
- if subtract_pixel_mean:
- x_mean = np.mean(x, axis=0)
- x -= x_mean
-    x = [x]
- y = model.predict(x, verbose=0)
- res = []
- for i in range(len(y)):
- cm = confidence_margin(y[i])
- if y[i][np.argmax(y[i])] >= threshold:
- res.append([labels[i], np.argmax(y[i]), cm, y[i]])
- else:
- res.append([labels[i], None, cm, y[i]])
- return res
-
-if __name__ == '__main__':
- main()
diff --git a/NanoPreprocess/bin/extract_sequence_from_fastq.py b/NanoPreprocess/bin/extract_sequence_from_fastq.py
deleted file mode 100755
index 27f8406..0000000
--- a/NanoPreprocess/bin/extract_sequence_from_fastq.py
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/usr/bin/env python
-import sys
-import gzip
-import pprint
-import os
-
-usage = '''
-created by Huanle and Luca for Master of Pores! :)
-python extract_sequence_from_fastq.py <deeplexicon_demux.tsv> <reads.fastq[.gz]>
-'''
-
-if len (sys.argv) < 3:
- print (usage)
- exit(1)
-
-def fopen (f):
- if f.endswith('.gz'):
- return (gzip.open(f,'rt'))
- else:
- return (open (f,'rt'))
-
-
-IDs = dict()
-fh = fopen(sys.argv[1])
-for l in fh:
-    ary = l.strip().split()
-    IDs[ary[1]] = ary[2]
-fh.close()
-
-outprefix = os.path.splitext(sys.argv[2])[0]
-outext = os.path.splitext(sys.argv[2])[1]
-if (outext == '.gz'):
- outprefix = os.path.splitext(outprefix)[0]
-
-fh = fopen (sys.argv[2])
-for l in fh:
- rd = l.strip().split()[0]
- rd = rd.replace('@','')
- seq = next(fh).strip()
- inf = next (fh).strip()
- q = next(fh).strip()
- if rd in IDs:
- outfile = outprefix + "." + IDs[rd] + ".fastq"
- fw = open(outfile,"a+")
- rd = '@'+rd
- string = "{0}\n{1}\n{2}\n{3}\n".format(rd,seq,inf,q)
- fw.write(string)
- #print (string)
-fh.close()
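-# e.g. a demux row whose second and third columns are "read123" and "bc_2"
-# routes read123 from reads.fastq.gz into reads.bc_2.fastq (both the .gz and
-# .fastq extensions are stripped from the output prefix).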
diff --git a/NanoPreprocess/bin/fast5_to_fastq.py b/NanoPreprocess/bin/fast5_to_fastq.py
deleted file mode 100755
index a80ce10..0000000
--- a/NanoPreprocess/bin/fast5_to_fastq.py
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/usr/bin/env python
-import sys, warnings,os
-import numpy as np
-import re
-import gzip
-import h5py, ont_fast5_api
-from ont_fast5_api.fast5_interface import get_fast5_file
-
-usage = '''
-_USAGE_: python fast5_to_fastq.py fast5file
-_INPUT_: both single- and multi- fast5 files can be processed
-_BUGS_: report to elzedliu@gmail.com
-'''
-
-if len(sys.argv) < 2:
-    print(usage); exit(1)
-
-f5file = sys.argv[1]
-hdf5 = h5py.File(f5file,'r')
-f5 = get_fast5_file (f5file,mode='r')
-reads_in_f5 = f5.get_read_ids()
-number_of_reads = len( reads_in_f5)
-
-
-fq_out = open ('.'.join(f5file.split('.')[:-1])+'.fastq','w')
-
-def fastq_from_fast5 (h5f):
- '''
- for single fast5 file, it is hdf5 file open with h5py
- for multi- fast5 file, it is hdf5['read_id'] for single read
- '''
- k = 'Analyses/Basecall_1D_000/BaseCalled_template/Fastq'
- if k in h5f:
-        fastq = h5f[k][()].decode()  # [()] replaces the .value accessor removed in h5py 3
- return fastq
- else:
- return None
-
-if number_of_reads == 1:
- fq = fastq_from_fast5 (hdf5)
- if fq :
- fq_out.write (fq)
- else:
- sys.stderr.write("no sequence basecalled for {} \n".format(f5file))
-elif number_of_reads > 1:
- for rd in hdf5.keys():
- fq = fastq_from_fast5 (hdf5[rd])
- if fq:
- fq = fq.replace('_Basecall_1D_template','')
- fq_out.write(fq)
- else:
- sys.stderr.write("no basecall for read {} \n" .format(rd))
-fq_out.close()
diff --git a/NanoPreprocess/bin/fast5_type.py b/NanoPreprocess/bin/fast5_type.py
deleted file mode 100755
index 7fa9abc..0000000
--- a/NanoPreprocess/bin/fast5_type.py
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-
-from ont_fast5_api.multi_fast5 import MultiFast5File
-from ont_fast5_api.fast5_info import _clean
-
-
-__author__ = 'Huanle.Liu@crg.eu'
-__version__ = '0.2'
-__email__ = 'same as author'
-
-usage = '''
- python fast5_type.py fast5file
- return:
- 0: single read fast5
- 1: multi-reads fast5
-'''
-
-if len (sys.argv) !=2:
- print (usage, file=sys.stderr)
- sys.exit()
-
-
-def check_file_type(f5_file):
- try:
- return 1 if _clean(f5_file.handle.attrs['file_type']).startswith('multi') else 0
- except KeyError:
- if len(f5_file.handle) == 0 :
- return 1
- if len([read for read in f5_file.handle if read.startswith('read_')]) !=0 :
- return 1
- if 'UniqueGlobalKey' in f5_file.handle:
- return 0
-        raise TypeError('file cannot be identified as single- or multi-read.\n' 'File path: {}'.format(f5_file.filename))
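-# Fallback logic when the file_type attribute is absent: top-level read_*
-# groups (or an empty file) mark a multi-read fast5, while a UniqueGlobalKey
-# group marks the legacy single-read layout.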
-
-
-filepath = sys.argv[1]
-f5_file = MultiFast5File (filepath, mode='r')
-filetype = check_file_type (f5_file)
-print (filetype)
-
-
-
-
-
-
diff --git a/NanoPreprocess/bin/reorganize.sh b/NanoPreprocess/bin/reorganize.sh
deleted file mode 100755
index ece54cc..0000000
--- a/NanoPreprocess/bin/reorganize.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/env bash
-
-if [ x"$1" == x ]; then
-    echo "please specify an output folder"
-    exit 1
-fi
-
-for i in $1/fast5_files/* ; do mkdir -p $1/`basename $i`/fast5_files; mv $i/* $1/`basename $i`/fast5_files; done
-for i in $1/fast5_files/* ; do mkdir -p $1/`basename $i`/fastq_files; mv $1/fastq_files/*`basename $i`.fq.gz $1/`basename $i`/fastq_files; done
-for i in $1/fast5_files/* ; do mkdir -p $1/`basename $i`/counts; mv $1/counts/*`basename $i`.count $1/`basename $i`/counts; done
-for i in $1/fast5_files/* ; do mkdir -p $1/`basename $i`/assigned; mv $1/assigned/*`basename $i`.assigned $1/`basename $i`/assigned; done
-for i in $1/fast5_files/* ; do mkdir -p $1/`basename $i`/alignment; mv $1/alignment/*`basename $i`*.bam $1/`basename $i`/alignment; done
-rmdir $1/alignment $1/assigned $1/counts $1/fastq_files; for i in $1/fast5_files/*; do rmdir $i; done; rmdir $1/fast5_files
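-# Resulting layout (illustrative): <output>/<batch>/fast5_files, fastq_files,
-# counts, assigned and alignment, one folder per former fast5_files/<batch>.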
-
-
diff --git a/NanoPreprocess/config.yaml b/NanoPreprocess/config.yaml
deleted file mode 100644
index 8dddb62..0000000
--- a/NanoPreprocess/config.yaml
+++ /dev/null
@@ -1,33 +0,0 @@
-title: "Master of Pores"
-subtitle: ""
-intro_text: False
-
-report_header_info:
- - Application Type: 'Nanopore sequencing'
-
-custom_logo: 'logo_small.png'
-custom_logo_url: 'https://github.com/biocorecrg/nanopore_analysis'
-custom_logo_title: 'Master of Pores'
-
-extra_fn_clean_trim:
- - '_QC'
- - '.count'
-
-read_count_multiplier: 0.001
-read_count_prefix: 'K'
-read_count_desc: 'thousands'
-
-table_columns_visible:
- FastQC:
- percent_duplicates: True
-
-top_modules:
- - fastqc:
- name: 'FastQC'
- info: 'This section of the report shows FastQC results on raw reads.'
- - alnQC:
- name: "alnQC"
- info: 'This section of the report shows alnQC results on aligned reads'
- - minionqc
- - RNA201120181_REP2_bc_1_stats:
- info: 'Nanoplot stats'
diff --git a/NanoPreprocess/deeplexicon/pAmps-final-actrun_newdata_nanopore_UResNet20v2_model.030.h5 b/NanoPreprocess/deeplexicon/pAmps-final-actrun_newdata_nanopore_UResNet20v2_model.030.h5
deleted file mode 100644
index 924fdc4..0000000
Binary files a/NanoPreprocess/deeplexicon/pAmps-final-actrun_newdata_nanopore_UResNet20v2_model.030.h5 and /dev/null differ
diff --git a/NanoPreprocess/deeplexicon/pAmps-rep2-4-train1_newdata_nanopore_UResNet20v2_model.039.h5 b/NanoPreprocess/deeplexicon/pAmps-rep2-4-train1_newdata_nanopore_UResNet20v2_model.039.h5
deleted file mode 100644
index b87fdf5..0000000
Binary files a/NanoPreprocess/deeplexicon/pAmps-rep2-4-train1_newdata_nanopore_UResNet20v2_model.039.h5 and /dev/null differ
diff --git a/NanoPreprocess/deeplexicon/resnet20-final.h5 b/NanoPreprocess/deeplexicon/resnet20-final.h5
deleted file mode 100644
index 2d7fbbf..0000000
Binary files a/NanoPreprocess/deeplexicon/resnet20-final.h5 and /dev/null differ
diff --git a/NanoPreprocess/nanopreprocess.nf b/NanoPreprocess/nanopreprocess.nf
deleted file mode 100644
index dfaba39..0000000
--- a/NanoPreprocess/nanopreprocess.nf
+++ /dev/null
@@ -1,845 +0,0 @@
-#!/usr/bin/env nextflow
-
-
-/*
- * Define the pipeline parameters
- *
- */
-
-// Pipeline version
-version = '0.1'
-
-params.help = false
-params.resume = false
-
-log.info """
-
-╔╦╗┌─┐┌─┐┌┬┐┌─┐┬─┐ ┌─┐┌─┐ ╔═╗╔═╗╦═╗╔═╗╔═╗
-║║║├─┤└─┐ │ ├┤ ├┬┘ │ │├┤ ╠═╝║ ║╠╦╝║╣ ╚═╗
-╩ ╩┴ ┴└─┘ ┴ └─┘┴└─ └─┘└ ╩ ╚═╝╩╚═╚═╝╚═╝
-
-====================================================
-BIOCORE@CRG Preprocessing of Nanopore direct RNA - N F ~ version ${version}
-====================================================
-
-kit : ${params.kit}
-flowcell : ${params.flowcell}
-fast5 : ${params.fast5}
-reference : ${params.reference}
-annotation : ${params.annotation}
-
-ref_type : ${params.ref_type}
-seq_type : ${params.seq_type}
-
-output : ${params.output}
-qualityqc : ${params.qualityqc}
-granularity : ${params.granularity}
-
-basecaller : ${params.basecaller}
-basecaller_opt : ${params.basecaller_opt}
-GPU : ${params.GPU}
-demultiplexing : ${params.demultiplexing}
-demultiplexing_opt : ${params.demultiplexing_opt}
-demulti_fast5 : ${params.demulti_fast5}
-
-filter : ${params.filter}
-filter_opt : ${params.filter_opt}
-mapper : ${params.mapper}
-mapper_opt : ${params.mapper_opt}
-map_type : ${params.map_type}
-
-counter : ${params.counter}
-counter_opt : ${params.counter_opt}
-
-downsampling : ${params.downsampling}
-
-variant_caller : ${params.variant_caller}
-variant_opt : ${params.variant_opt}
-
-email : ${params.email}
-"""
-
-// Help and avoiding typos
-if (params.help) exit 1
-if (params.resume) exit 1, "Are you making the classical --resume typo? Be careful!!!! ;)"
-if (params.granularity == "") params.granularity = 1000000000
-
-// check multi5 and GPU usage. GPU maybe can be removed as param if there is a way to detect it
-if (params.GPU != "ON" && params.GPU != "OFF") exit 1, "Please specify ON or OFF for the GPU parameter, depending on whether GPU processors are available"
-
-// check sequence type parameter
-//if (params.seq_type != "gDNA" && params.seq_type != "cDNA" && params.seq_type != "RNA") exit 1, "Please specify the sequence type as RNA, cDNA or gDNA"
-//if (params.seq_type == "gDNA") {
-// log.info "seqType is gDNA: map_type is set to 'unspliced'"
-// params.map_type = "unspliced"
-//} else {
-// log.info "seqType is ${params.seq_type}: map_type is ${params.map_type}"
- if (params.map_type != "unspliced" && params.map_type != "spliced") exit 1, "Mapping type NOT supported! Please choose either 'spliced' or 'unspliced'"
-//}
-
-// check input files
-reference = file(params.reference)
-if( !reference.exists() ) exit 1, "Missing reference file: ${reference}!"
-config_report = file("$baseDir/config.yaml")
-if( !config_report.exists() ) exit 1, "Missing config.yaml file!"
-logo = file("$baseDir/../docs/logo_small.png")
-deeplexicon_folder = file("$baseDir/deeplexicon/")
-
-
-basecaller = params.basecaller
-basecaller_opt = params.basecaller_opt
-demultiplexer = params.demultiplexing
-demultiplexer_opt = params.demultiplexing_opt
-mapper = params.mapper
-mapper_opt = params.mapper_opt
-counter_opt = params.counter_opt
-
-
-// Output folders
-outputFastq = "${params.output}/fastq_files"
-outputFast5 = "${params.output}/fast5_files"
-outputQual = "${params.output}/QC_files"
-outputMultiQC = "${params.output}/report"
-outputMapping = "${params.output}/alignment"
-outputCRAM = "${params.output}/cram_files"
-outputCounts = "${params.output}/counts"
-outputVars = "${params.output}/variants"
-outputAssigned = "${params.output}/assigned"
-outputReport = file("${outputMultiQC}/multiqc_report.html")
-
-/*
-* move old multiQCreport
-*/
-if( outputReport.exists() ) {
-    log.info "Moving old report multiqc_report.html to multiqc_report.html.old"
- outputReport.moveTo("${outputMultiQC}/multiqc_report.html.old")
-}
-
-/*
- * Creates the channels that emits fast5 files
- */
-Channel
- .fromPath( params.fast5)
- .ifEmpty { error "Cannot find any file matching: ${params.fast5}" }
- .into {fast5_4_name; fast5_4_testing; fast5_4_granularity; fast5_4_variant}
-
-/*
- * Get the name from the folder
- */
-folder_info = params.fast5.tokenize("/")
-folder_name = folder_info[-2]
-
-// Check config file for consistency
-//if (demultiplexer == "guppy" && params.barcodekit == "")
-// exit 1, "Demultiplexing with guppy needs the definition of the barcodekit parameter. Exiting"
-//if (basecaller != "guppy" && demultiplexer == "guppy")
-// exit 1, "Demultiplexing with guppy can be performed ONLY when the basecaller is guppy too. Exiting"
-//if (basecaller == "guppy" && demultiplexer == "guppy")
-// log.info "Performing basecalling and demultiplexing at the same time with Guppy."
-
-/*
-* This is default value in case guppy will be used for RNA demultiplexing
-*/
-params.barcodekit = ""
-if (demultiplexer == "") {
- demultiplexer = "OFF"
-}
-
-//if (demultiplexer != "OFF" && demultiplexer != "deeplexicon")
-//exit 1, "Demultiplexing of RNA can be performed only with deeplexicon. Current value is ${demultiplexer}"
-
-if (params.GPU == "ON" && basecaller != "guppy")
-    exit 1, "GPU can be used only with GUPPY basecaller!"
-
-if (params.ref_type == "genome") {
- if (params.annotation != "") {
- annotation = file(params.annotation)
- if( !annotation.exists() ) exit 1, "Missing annotation file: ${params.annotation}!"
- }
-}
-
-
-process testInput {
- tag {"${fast5}"}
-
- input:
- file(fast5) from fast5_4_testing.first()
-
- output:
- stdout into multi5_type
-
- script:
- """
- fast5_type.py ${fast5}
- """
-}
-
-multi5_type.map { it.trim().toInteger() }.into{multi5_type_for_msg; multi5_type_for_bc; multi5_type_for_granularity; multi5_type_for_demultiplexing}
-multi5_type_for_msg.map{it == 0 ? "Single Fast5 files detected!": "MultiFast5 files detected!" }.println()
-
-// if GPU is ON, analyse the whole dataset in one batch (granularity defaults to a huge number);
-// otherwise make batches of 4,000 sequences for single fast5 input, or one file per batch for multi fast5 input
-multi5_type_for_granularity.merge(fast5_4_granularity.collect()).map{
-    (params.GPU == "ON" ? it[1..-1].collate(params.granularity) : (it[0] == 0 ? it[1..-1].collate(4000) : it[1..-1].collate(1)) )
-}.flatMap().set{fast5_batches}
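-// e.g. 10,000 single fast5 files on CPU become three batches of
-// 4000 + 4000 + 2000 files, while multi-fast5 inputs go one file per batch
-// and a GPU run keeps the whole dataset in a single batch.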
-
-// create a map id batch -> list of files
-def num_batch = -1
-fast5_batches.map {
- num_batch++
- [num_batch, it]
-}.into{ fast5_4_basecall; fast5_4_demulti}
-
-/*
-* Perform base calling using albacore or guppy on raw fast5 files
-*/
-
-process logBaseCalling {
- label (params.GPU == "ON" ? 'basecall_gpus': 'basecall_cpus')
- echo true
-
- script:
- if (basecaller == "albacore") {
- """
- echo "no"
- #export PYTHONPATH=$baseDir/bin/albacore:\$PYTHONPATH
- #read_fast5_basecaller.py
- """
- } else if (basecaller == "guppy"){
- """
- echo '*********************************'
- guppy_basecaller --version
- guppy_basecaller --print_workflows | grep ${params.flowcell} | grep ${params.kit}
- echo '*********************************'
- """
- }
-}
-
-process baseCalling {
- tag {"${basecaller}-${folder_name}-${idfile}"}
-
- label (params.GPU == "ON" ? 'basecall_gpus': 'basecall_cpus')
-
- // move basecalled output fast5 files
- if(demultiplexer != "deeplexicon") {
- publishDir outputFast5, pattern: "*_out/workspace/*.fast5", mode: 'move', saveAs: { file -> "${file.split('\\/')[-1]}" }
- }
-
- input:
- set idfile, file(fast5) from fast5_4_basecall
- val (multi5) from multi5_type_for_bc
-
- output:
- file ("${idfile}_out/workspace/*.fast5") optional true into fast5_files_for_demultiplexing
- set idfile, file ("${idfile}.*.gz") into fastq_files_for_demultiplexing, demulti_log
- file ("${idfile}_out/sequencing_summary.txt") into seq_summaries optional true
-
- script:
-    // convert U to T in the fastq when input is RNA (TODO: check whether this is still needed)
- def RNA_conv_cmd = ""
- def demulti_cmd = ""
- def infolder = "./"
- if (params.seq_type == "RNA") { RNA_conv_cmd = " | awk '{if (NR%4==2) gsub(\"U\",\"T\"); print}' " }
- if (basecaller == "albacore") {
- // in case input files are multi fast5 convert them in single fast5 since albacore is not able to deal with multi fast5
- if (multi5 == 1) {
- demulti_cmd = "mkdir demulti_tmp; mkdir demulti; multi_to_single_fast5 -i ${infolder} -s demulti_tmp -t ${task.cpus}; mv demulti_tmp/*/*.fast5 demulti; rm -fr demulti_tmp"
- infolder = "demulti"
- }
- """
- ${demulti_cmd}
- export PYTHONPATH=$baseDir/bin/albacore:\$PYTHONPATH
- read_fast5_basecaller.py ${basecaller_opt} --flowcell \"${params.flowcell}\" --kit \"${params.kit}\" --output_format fastq,fast5 \
- --worker_threads ${task.cpus} -s ./${idfile}_out --disable_filtering --input ${infolder};
- cat ${idfile}_out/workspace/*.fastq ${RNA_conv_cmd} >> ${idfile}.fastq
- rm ${idfile}_out/workspace/*.fastq
- gzip ${idfile}.fastq
- mkdir single_basecallings
- mv ${idfile}_out/workspace/*/*.fast5 single_basecallings
- mkdir temp_multi
- single_to_multi_fast5 -i single_basecallings -s temp_multi -t ${task.cpus}
- mv temp_multi/batch_0.fast5 ./${idfile}_out/workspace/batch_${idfile}.fast5
-    if [ -d demulti ]; then rm -fr demulti; fi
- rm -fr single_basecallings temp_multi
- """
- } else if (basecaller == "guppy"){
- def multi_cmd = ""
- def gpu_cmd = ""
- def gpu_prefix = ""
- if (params.GPU == "ON") {
- gpu_prefix = 'export LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/.singularity.d/libs"'
- //gpu_cmd = '-x "cuda:0"'
- gpu_cmd = '-x auto'
- }
- // in case input files are single fast5 group them in multifast5 at the end
- if (multi5 == 0) {
- multi_cmd = "mkdir single_basecallings temp_multi; mv *_out/workspace/*.fast5 single_basecallings; single_to_multi_fast5 -i single_basecallings -s temp_multi -t ${task.cpus}; mv temp_multi/batch_0.fast5 ./${idfile}_out/workspace/batch_${idfile}.fast5; rm -fr temp_multi single_basecallings"
- }
- // Different command line in case guppy is also demultiplexer
- if (demultiplexer == "guppy") {
- """
- ${gpu_prefix}
- guppy_basecaller ${gpu_cmd} ${basecaller_opt} ${demultiplexer_opt} --flowcell ${params.flowcell} --kit ${params.kit} --num_barcode_threads ${task.cpus} --barcode_kits ${params.barcodekit} --trim_barcodes --fast5_out -i ${infolder} --save_path ./${idfile}_out --cpu_threads_per_caller 1 --gpu_runners_per_device 1 --num_callers ${task.cpus}
- cd ${idfile}_out;
- if [ -d barcode01 ]; then
- for d in barcode*; do echo \$d; cat \$d/*.fastq ${RNA_conv_cmd} > ../${idfile}.\$d.fastq; done;
- fi
- cat unclassified/*.fastq ${RNA_conv_cmd} > ../${idfile}.unclassified.fastq; cd ../
- for i in *.fastq; do gzip \$i; done
- ${multi_cmd}
- """
- }
- else if (demultiplexer == "guppy-readucks") {
- """
- ${gpu_prefix}
- guppy_basecaller ${gpu_cmd} ${basecaller_opt} ${demultiplexer_opt} --flowcell ${params.flowcell} --kit ${params.kit} --num_barcode_threads ${task.cpus} --barcode_kits ${params.barcodekit} --trim_barcodes --fast5_out -i ${infolder} --save_path ./${idfile}_out --gpu_runners_per_device 1 --cpu_threads_per_caller 1 --num_callers ${task.cpus}
- cd ${idfile}_out;
- if [ -d barcode01 ]; then
- for d in barcode*; do echo \$d; cat \$d/*.fastq ${RNA_conv_cmd} > ../${idfile}.\$d.fastq; done;
- fi
- cat unclassified/*.fastq ${RNA_conv_cmd} > ../${idfile}.unclassified.fastq; cd ../
- for i in *.fastq; do gzip \$i; done
- ${multi_cmd}
- """
- }
- else {
- """
- ${gpu_prefix}
- guppy_basecaller ${gpu_cmd} --flowcell ${params.flowcell} --kit ${params.kit} --fast5_out ${basecaller_opt} -i ${infolder} --save_path ./${idfile}_out --cpu_threads_per_caller 1 --gpu_runners_per_device 1 --num_callers ${task.cpus}
- cat ${idfile}_out/*.fastq ${RNA_conv_cmd} >> ${idfile}.fastq
- rm ${idfile}_out/*.fastq
- gzip ${idfile}.fastq
- ${multi_cmd}
- """
- }
- } else if (demultiplexer == "OFF") {
- """
- fast5_to_fastq.py ${fast5}; mv *.fastq ${idfile}.fastq; gzip ${idfile}.fastq
- """
- }
-}
-
-//demulti_log.println()
-
-/*
-* Perform demultiplexing (optional) using porechop on basecalled reads
-*/
-if(demultiplexer == "deeplexicon") {
- process demultiplexing_with_deeplexicon {
- label 'demulti'
- tag {"${demultiplexer}-${idfile}"}
-
- input:
- set idfile, file(fast5) from fast5_4_demulti
-    val (multi5) from multi5_type_for_demultiplexing
- file(deeplexicon_folder)
-
- output:
- set idfile, file ("${idfile}_demux.tsv") into demux_for_fastq_extraction
- file ("${idfile}_demux.tsv") into demux_for_fast5_extraction
-
- script:
- def model = ''
- def executable = 'deeplexicon_sub.py dmux -f multi'
- if (multi5 == 0){
- executable = 'deeplexicon_sub.py dmux -f single'
- }
- if (demultiplexer_opt.contains("pAmps-final-actrun_newdata_nanopore_UResNet20v2_model.030.h5")){
- executable = 'cmd_line_deeplexicon_caller_2019_09_12.py'
- }
- """
- ln -s ${deeplexicon_folder}/* .
- ${executable} -p ./ ${demultiplexer_opt} > ${idfile}_demux.tsv
- #${executable} -p ./ ${demultiplexer_opt} -b 4000 -v > ${idfile}_demux.tsv
- """
- }
-
- process extracting_demultiplexed_fastq {
- label 'basecall_cpus'
- tag {"${demultiplexer}"}
-
- input:
- set idfile, file(demux), file(fastq) from demux_for_fastq_extraction.join(fastq_files_for_demultiplexing)
-
- output:
- set idfile, file ("*.fastq.gz") into fastq_for_filtering
-
- script:
- """
- extract_sequence_from_fastq.py ${demux} ${fastq}
- for i in *.fastq; do gzip \$i; done
- """
- }
-
- process extracting_demultiplexed_fast5 {
-
- //label 'basecall_cpus'
- tag { demultiplexer }
- publishDir outputFast5, mode: 'copy'
-
- when:
- params.demulti_fast5 == "ON"
-
- input:
- file("demux_*") from demux_for_fast5_extraction.collect()
- file("*") from fast5_files_for_demultiplexing.collect()
-
- output:
- file("*")
-
- script:
- """
- cat demux_* | grep -v ReadID >> dem.files
- awk '{print \$2 > \$3".list" }' dem.files
- for i in *.list; do mkdir `basename \$i .list`; fast5_subset --input ./ --save_path `basename \$i .list`/ --read_id_list \$i --batch_size 4000 -t ${task.cpus}; done
- rm *.list
- rm */filename_mapping.txt
- rm dem.files
- """
- }
-
-
-} else {
- fastq_files_for_demultiplexing.set{ fastq_for_filtering}
-}
-
-/*
-* Perform filtering (optional) using nanofilt on fastq files
-*/
-if (params.filter == "nanofilt") {
- process filtering {
- label 'big_cpus'
- tag {"${params.filter}-${fastq_file}".replace('.fastq.gz', '')}
-
- input:
- set idfile, file(fastq_file) from fastq_for_filtering.transpose()
-
- output:
- set idfile, file("*-filt.fastq.gz") into fastq_for_next_step
-
- script:
- output = "${fastq_file}".replace(".fastq.gz", "-filt.fastq.gz")
- """
- zcat ${fastq_file} | NanoFilt ${params.filter_opt} | gzip > ${output}
- """
- }
-} else {
- fastq_for_filtering.transpose().set{fastq_for_next_step}
-}
-
-
-fastq_for_next_step.map{
- filepath=it[1]
- if (demultiplexer != "OFF") {
- fileparts = filepath.getName().tokenize(".")
- ["${folder_name}.${fileparts[-3]}", filepath]
- } else {
- ["${folder_name}", filepath]
- }
-}.groupTuple().set{fastq_files_for_grouping}
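-// e.g. a demultiplexed file "3.bc_2.fastq.gz" from input folder "RNA_run1" is
-// grouped under the key "RNA_run1.bc_2"; with demultiplexing OFF every file
-// collapses into the single "RNA_run1" group.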
-
-/*
-* Concatenate FastQ files
-*/
-process concatenateFastQFiles {
- tag {idfile}
-
- publishDir outputFastq, pattern: "*.fq.gz", mode: 'copy'
-
- input:
- set idfile, file(fastq_files) from fastq_files_for_grouping
-
- output:
- set idfile, file("${idfile}.fq.gz") into fastq_files_for_fastqc, fastq_files_for_mapping, fastq_files_for_variants
-
- script:
- """
- cat *.fastq.gz >> ${idfile}.fq.gz
- """
-}
-
-/*
-* Perform QC on fast5 files
-*/
-
-process QC {
- tag {folder_name}
- label 'big_cpus'
- publishDir outputQual, mode: 'copy'
- errorStrategy 'ignore'
-
- input:
- file("summaries_*") from seq_summaries.collect()
-
- output:
- file ("${folder_name}_QC") into QC_folders
- file ("final_summary.stats") into stat_for_variants
-
- script:
- """
- if [ -f "summaries_" ]; then
- ln -s summaries_ final_summary.stats
- else
- head -n 1 summaries_1 > final_summary.stats
- for i in summaries_*; do grep -v "filename" \$i >> final_summary.stats; done
- fi
- MinIONQC.R -i final_summary.stats -o ${folder_name}_QC -q ${params.qualityqc} -p ${task.cpus}
- """
-}
-
-/*
-* Perform fastQC on fastq files
-*/
-
-process fastQC {
- tag {idfile}
- label 'big_cpus'
-
- publishDir outputQual, pattern: "*_fastqc.html", mode: 'copy'
-
- input:
- set idfile, file(fastq_file) from fastq_files_for_fastqc
-
- output:
- file ("*_fastqc.*") into fastqc_for_multiqc
-
- script:
- """
- fastqc ${fastq_file} -t ${task.cpus}
- """
-}
-
-/*
-* Perform mapping and sorting
-*/
-process mapping {
- tag {"${mapper}-${idfile}"}
- publishDir outputMapping, mode: 'copy'
- label 'big_mem_cpus'
-
- input:
- file(reference)
- set idfile, file (fastq_file) from fastq_files_for_mapping
-
- output:
- set idfile, file("${idfile}.${mapper}.sorted.bam") optional true into aligned_reads, aligned_reads_for_QC, aligned_reads_for_QC2, aligned_reads_for_counts
- set idfile, mapper, file("${idfile}.${mapper}.sorted.bam"), file("${idfile}.${mapper}.sorted.bam.bai") optional true into aligned_reads_for_crams
- set idfile, file("${idfile}.${mapper}.sorted.bam"), file("${idfile}.${mapper}.sorted.bam.bai") optional true into aligned_reads_for_vars
- file("${idfile}.${mapper}.sorted.bam*") optional true
-
- script:
- if (mapper == "minimap2") {
- def mappars = (params.map_type == "spliced") ? "-ax splice -k14" : "-ax map-ont"
- mappars += " ${mapper_opt} "
- """
- minimap2 -t ${task.cpus} ${mappars} -uf ${reference} ${fastq_file} > reads.mapped.sam
- samtools view -@ ${task.cpus} -F4 -hSb reads.mapped.sam > reads.mapped.bam
- rm reads.mapped.sam
- samtools sort -@ ${task.cpus} -o ${idfile}.${mapper}.sorted.bam reads.mapped.bam
- samtools index -@ ${task.cpus} ${idfile}.${mapper}.sorted.bam
- rm reads.mapped.bam
- """
- }
- else if (mapper == "graphmap2"){
- def mappars = (params.map_type == "spliced") ? "-x rnaseq" : ""
- mappars += " ${mapper_opt} "
- """
- graphmap2 align -t ${task.cpus} -r ${reference} ${mappars} -d ${fastq_file} | samtools view -@ ${task.cpus} -F4 -hSb - > reads.mapped.bam
- samtools sort -@ ${task.cpus} -o ${idfile}.${mapper}.sorted.bam reads.mapped.bam
- samtools index -@ ${task.cpus} ${idfile}.${mapper}.sorted.bam
- rm reads.mapped.bam
- """
- }
- else if (mapper == "graphmap"){
- """
- graphmap align -t ${task.cpus} ${mapper_opt} -r ${reference} -d ${fastq_file} | samtools view -@ ${task.cpus} -F4 -hSb - > reads.mapped.bam
- samtools sort -@ ${task.cpus} -o ${idfile}.${mapper}.sorted.bam reads.mapped.bam
- samtools index -@ ${task.cpus} ${idfile}.${mapper}.sorted.bam
- rm reads.mapped.bam
- """
- }
- else {
- """
- echo "nothing to do!"
- """
- }
-}
-
-/*
-* Convert BAM alignments to CRAM with optional downsampling (process currently disabled)
-
-process cram_conversion {
- tag {"${mapper}-${idfile}"}
- publishDir outputCRAM, mode: 'copy'
- label 'big_mem_cpus'
-
- input:
- file(reference)
- set idfile, val(mapper), file(aln), file(index) from aligned_reads_for_crams
-
- output:
- file("${idfile}.${mapper}.sorted.cram*") optional true
- script:
- def downcmd = ""
- def input = aln
- def cleancmd = ""
- gzipcmd = unzipCmd(reference, "myreference.fasta")
- gzipclean = "rm myreference.fasta"
- if (params.downsampling != "") {
- def perc = params.downsampling/100
- downcmd = "samtools view -@ ${task.cpus} -bs ${perc} ${aln} > subsample.bam"
- input = "subsample.bam"
- cleancmd = "rm subsample.bam"
- }
- """
- ${downcmd}
- ${gzipcmd}
- samtools view -@ ${task.cpus} -C ${input} -T myreference.fasta -o ${idfile}.${mapper}.sorted.cram
- samtools index -@ ${task.cpus} ${idfile}.${mapper}.sorted.cram
- ${cleancmd}
- ${gzipclean}
- """
-}
-*/
-
-
-/*
-* Perform counting (optional)
-*/
-
-if ( params.counter == "YES") {
- process counting {
- tag {"${idfile}"}
- publishDir outputCounts, pattern: "*.count", mode: 'copy'
- publishDir outputAssigned, pattern: "*.assigned", mode: 'copy'
-
- input:
- set idfile, file(bamfile) from aligned_reads_for_counts
-
- output:
- file("${idfile}.count") into read_counts
- file("${idfile}.stats") optional true into count_stats
- file("${idfile}.assigned") optional true
- script:
- if (params.ref_type == "transcriptome") {
- """
-            NanoCount -i ${bamfile} ${counter_opt} -o ${idfile}.count;
- awk '{sum+=\$3}END{print FILENAME"\t"sum}' ${idfile}.count |sed s@.count@@g > ${idfile}.stats
- samtools view -F 256 ${bamfile} |cut -f 1,3 > ${idfile}.assigned
- """
- } else if (params.ref_type == "genome") {
- def anno = unzipBash("${params.annotation}")
- """
- samtools view ${bamfile} |htseq-count -f sam - ${anno} ${counter_opt} -o ${idfile}.sam > ${idfile}.count
- awk '{gsub(/XF:Z:/,"",\$NF); print \$1"\t"\$NF}' ${idfile}.sam |grep -v '__' > ${idfile}.assigned
- rm ${idfile}.sam
- """
- }
- }
-
- /*
- * Join alnQC
- */
- process joinCountQCs {
-
- input:
- file "*" from count_stats.collect()
-
- output:
- file("counts_mqc.txt") into count_repo_for_multiQC
-
- script:
- """
- echo '# id: NanoCount
- # plot_type: \'table\'
- # section_name: Read counts
- File name \'Counts\' ' > counts_mqc.txt
- cat *.stats >> counts_mqc.txt
- """
- }
-} else {
- read_counts = Channel.empty()
- count_repo_for_multiQC = Channel.empty()
-}
-
-
-/*
-* Perform alnQC
-*/
-process alnQC {
- tag {bamid}
-
- input:
- set bamid, file(bamfile) from aligned_reads_for_QC
-
- output:
- file "${bamid}.stat" into single_alnQC_outs
-
- script:
- """
- bam2stats.py ${bamfile} > ${bamid}.stat
- """
-}
-
-/*
-* Join alnQC
-*/
-process joinAlnQCs {
-
- input:
- file "alnqc_*" from single_alnQC_outs.collect()
-
- output:
- file("alnQC_mqc.txt") into alnQC_for_multiQC
-
- script:
- """
- echo '# id: alnQC
-# plot_type: \'table\'
-# section_name: \'Alignment QC\' ' > alnQC_mqc.txt
- cat alnqc_* | head -n 1| sed s@#@@g >> alnQC_mqc.txt
- cat alnqc_* | grep -v "#" >> alnQC_mqc.txt
- """
-}
-
-/*
-* Perform alnQC2
-*/
-
-process alnQC2 {
- publishDir outputQual, pattern: "*_plot/*", mode: 'copy'
- label 'big_cpus'
- errorStrategy 'ignore'
- tag {bamid}
-
- input:
- set bamid, file(bamfile) from aligned_reads_for_QC2
-
- output:
- file("*_plot/*") optional true
- file("${bamid}_stats_mqc.png") optional true into qc2_for_multiqc
-
- script:
- """
- NanoPlot --bam ${bamfile} -o ${bamid}_plot --maxlength 5000 -t ${task.cpus}
- mkdir tmp_dir
- cp ${bamid}_plot/PercentIdentityvsAverageBaseQuality_kde.png tmp_dir
- cp ${bamid}_plot/LengthvsQualityScatterPlot_dot.png tmp_dir
- cp ${bamid}_plot/HistogramReadlength.png tmp_dir
- cp ${bamid}_plot/Weighted_HistogramReadlength.png tmp_dir
- gm montage tmp_dir/*.png -tile 2x2 -geometry 800x800 ${bamid}_stats_mqc.png
- rm -fr tmp_dir
- """
-}
-
-QC_folders.mix(fastqc_for_multiqc,qc2_for_multiqc,read_counts,count_repo_for_multiQC,alnQC_for_multiQC).set{files_for_report}
-
-/*
-* Perform viral variant call (experimental)
-*/
-
-if ( params.variant_caller == "YES" && params.seq_type != "RNA") {
-
- process variant_calling {
- tag {"${idfile}"}
- publishDir outputVars, pattern: "*.vcf", mode: 'copy'
- label 'big_cpus'
-
- input:
- set idfile, file(bamfile), file(bai) from aligned_reads_for_vars
- file(reference)
-
- output:
- file("*.vcf")
-
- script:
- gzipcmd = unzipCmd(reference, "myreference.fasta", "yes")
- gzipclean = "rm myreference.fasta"
- """
- ${gzipcmd}
- medaka_variant ${params.variant_opt} -i ${bamfile} -f myreference.fasta -d -t ${task.cpus} -o ./out
- mv `ls -t out/round_*.vcf| head -n1 ` .
- ${gzipclean}
- """
- }
- }
-
-
-
-
-/*
-* Perform multiQC report
-*/
-process multiQC {
- publishDir outputMultiQC, mode: 'copy'
-
- input:
- file(logo)
- file(config_report)
- file("*") from files_for_report.collect()
-
- output:
- file("multiqc_report.html") into multiQC
-
- script:
- """
- multiqc -c ${config_report} .
- """
-}
-
-
-if (params.email == "yourmail@yourdomain" || params.email == "") {
- log.info 'Skipping the email\n'
-}
-else {
- log.info "Sending the email to ${params.email}\n"
-
- workflow.onComplete {
-
- def msg = """\
- Pipeline execution summary
- ---------------------------
- Completed at: ${workflow.complete}
- Duration : ${workflow.duration}
- Success : ${workflow.success}
- workDir : ${workflow.workDir}
- exit status : ${workflow.exitStatus}
- Error report: ${workflow.errorReport ?: '-'}
- """
- .stripIndent()
-
-        sendMail(to: params.email, subject: "Master of Pores execution", body: msg, attach: "${outputMultiQC}/multiqc_report.html")
- }
-}
-
-workflow.onComplete {
-    println "Pipeline BIOCORE@CRG Master of Pores completed!"
- println "Started at $workflow.start"
- println "Finished at $workflow.complete"
- println "Time elapsed: $workflow.duration"
- println "Execution status: ${ workflow.success ? 'OK' : 'failed' }"
-}
-
-// link, copy, or decompress the input into the requested file name, depending on its extension
-def unzipCmd(filename, unzippedname, copy="") {
- def cmd = "ln -s ${filename} ${unzippedname}"
- if (copy!="") {
- cmd = "cp ${filename} ${unzippedname}"
- }
- def ext = filename.getExtension()
- if (ext == "gz") {
- cmd = "zcat ${filename} > ${unzippedname}"
- }
- return cmd
-}
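-// e.g. unzipCmd("ref.fasta.gz", "myreference.fasta") yields
-// "zcat ref.fasta.gz > myreference.fasta", while an uncompressed reference is
-// simply symlinked (or copied when the third argument is non-empty).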
-
-
-// wrap gzipped files in a bash process substitution (a named pipe)
-def unzipBash(filename) {
- def cmd = filename.toString()
- if (cmd[-3..-1] == ".gz") {
- cmd = "<(zcat ${filename})"
- }
- return cmd
-}
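-// e.g. unzipBash("anno.gtf.gz") yields "<(zcat anno.gtf.gz)", so htseq-count
-// can read the decompressed stream without a temporary file; plain paths are
-// returned unchanged.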
-
diff --git a/NanoPreprocess/nextflow.config b/NanoPreprocess/nextflow.config
deleted file mode 100644
index 8032053..0000000
--- a/NanoPreprocess/nextflow.config
+++ /dev/null
@@ -1,5 +0,0 @@
-includeConfig "$baseDir/params.config"
-includeConfig "$baseDir/../nextflow.global.config"
-singularity.cacheDir = "$baseDir/../singularity"
-
-//singularity.enabled = true
diff --git a/NanoPreprocess/params.config b/NanoPreprocess/params.config
deleted file mode 100644
index 6424d81..0000000
--- a/NanoPreprocess/params.config
+++ /dev/null
@@ -1,37 +0,0 @@
-params {
- kit = "SQK-RNA001"
- flowcell = "FLO-MIN106"
- fast5 = "$baseDir/../data/multifast/*.fast5"
- reference = "$baseDir/../anno/curlcake_constructs.fasta.gz"
- annotation = ""
- ref_type = "transcriptome"
-
- seq_type = "cDNA"
- output = "$baseDir/multifast"
- qualityqc = 5
- granularity = ""
-
- basecaller = "guppy"
- basecaller_opt = ""
- GPU = "OFF"
- demultiplexing = ""
- demultiplexing_opt = "-m pAmps-final-actrun_newdata_nanopore_UResNet20v2_model.030.h5"
- demulti_fast5 = "OFF"
-
- filter = ""
- filter_opt = ""
-
- mapper = "graphmap2"
- mapper_opt = ""
- map_type = "spliced"
-
- counter = "YES"
- counter_opt = ""
-
- variant_caller = "NO"
- variant_opt = ""
-
- downsampling = ""
-
- email = ""
-}
diff --git a/NanoPreprocess/preprocessing.png b/NanoPreprocess/preprocessing.png
deleted file mode 100644
index 884482a..0000000
Binary files a/NanoPreprocess/preprocessing.png and /dev/null differ
diff --git a/NanoPreprocessSimple/bin/bam2stats.py b/NanoPreprocessSimple/bin/bam2stats.py
deleted file mode 100755
index 12eaad6..0000000
--- a/NanoPreprocessSimple/bin/bam2stats.py
+++ /dev/null
@@ -1,79 +0,0 @@
-#!/usr/bin/env python3
-# Report stats (mapped reads and identity to reference) from samtools stats
-# for bam file(s) ignoring secondary, supplementary and qc failed alignments
-#
-# USAGE: bam2stats.py bam1 bam2 ... bamN
-
-import os, subprocess, sys
-
-def bam2stats(fn, flag=3840):
- """Get stats from samtools stats"""
- args = ["samtools", "stats", "-F%s"%flag, fn]
- proc = subprocess.Popen(args, stdout=subprocess.PIPE)
- k2v = {}
- for l in proc.stdout:
- l = l.decode("utf-8")
- if l.startswith('SN'):
-            ldata = l[:-1].split()
- kv = [[]]
- for e in ldata[1:]:
- kv[-1].append(e)
- if e.endswith(':'):
- kv[-1][-1] = kv[-1][-1][:-1]
- kv.append([])
- k2v[" ".join(kv[0])] = kv[1]
- # convert digits to int
- for k in k2v:
- if k2v[k][0].isdigit():
- k2v[k] = int(k2v[k][0])
- # report if no reads mapped
- if not k2v['reads mapped']:
- return "No reads mapped"
- text = []
- text.append("{:,}\t{:.1f}%\t{:,}\t{:.1f}%".format(k2v['reads mapped'], 100*k2v['reads mapped']/k2v['sequences'], k2v['bases mapped (cigar)'], 100*k2v['bases mapped (cigar)']/k2v['total length']))
- text.append("{:,}\t{:,}".format(k2v['average length'], k2v['maximum length']))
-    text.append("{:.2f}%".format(100 - 100*k2v['mismatches']/k2v['bases mapped (cigar)']))
- return "\t".join(text)
-
-sys.stdout.write("#File name\tMapped reads\tMap %\tBases\tBases %\tAvg read length\tMax read length\tidentity\n")
-for fn in sys.argv[1:]:
-    if os.path.isfile(fn):
-        sys.stdout.write("%s\t%s\n"%(fn, bam2stats(fn)))
-
-'''
-CHK 4691e107 9942d94c cd9ffd51
-# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.
-SN raw total sequences: 4000
-SN filtered sequences: 0
-SN sequences: 4000
-SN is sorted: 1
-SN 1st fragments: 4000
-SN last fragments: 0
-SN reads mapped: 1440
-SN reads mapped and paired: 0 # paired-end technology bit set + both mates mapped
-SN reads unmapped: 2560
-SN reads properly paired: 0 # proper-pair bit set
-SN reads paired: 0 # paired-end technology bit set
-SN reads duplicated: 0 # PCR or optical duplicate bit set
-SN reads MQ0: 726 # mapped and MQ=0
-SN reads QC failed: 0
-SN non-primary alignments: 6801
-SN total length: 136941 # ignores clipping
-SN bases mapped: 109284 # ignores clipping
-SN bases mapped (cigar): 108908 # more accurate
-SN bases trimmed: 0
-SN bases duplicated: 0
-SN mismatches: 14898 # from NM fields
-SN error rate: 1.367944e-01 # mismatches / bases mapped (cigar)
-SN average length: 34
-SN maximum length: 401
-SN average quality: 3.5
-SN insert size average: 0.0
-SN insert size standard deviation: 0.0
-SN inward oriented pairs: 0
-SN outward oriented pairs: 0
-SN pairs with other orientation: 0
-SN pairs on different chromosomes: 0
-
-'''
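-# Cross-check against the sample stats above: identity = 100 - 100*14898/108908
-# ~= 86.32%, consistent with the reported error rate of 1.367944e-01 (13.68%).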
-
diff --git a/NanoPreprocessSimple/bin/cmd_line_deeplexicon_caller_2019_09_12.py b/NanoPreprocessSimple/bin/cmd_line_deeplexicon_caller_2019_09_12.py
deleted file mode 100755
index 9e527c6..0000000
--- a/NanoPreprocessSimple/bin/cmd_line_deeplexicon_caller_2019_09_12.py
+++ /dev/null
@@ -1,677 +0,0 @@
-#!/usr/bin/env python3
-# coding: utf-8
-
-import warnings
-warnings.filterwarnings("ignore", category=FutureWarning)
-import argparse
-import sys
-# from __future__ import print_function
-import os
-from copy import deepcopy
-import re
-import csv
-import time
-import configparser
-import h5py
-import traceback
-import math
-import numpy as np
-# from PIL import Image
-import pyts
-from pyts.image import MarkovTransitionField, GramianAngularField, RecurrencePlot
-import tensorflow as tf
-import keras
-from keras.layers import Dense, Conv2D, BatchNormalization, Activation
-from keras.layers import AveragePooling2D, Input, Flatten
-from keras.optimizers import Adam
-from keras.callbacks import ModelCheckpoint, LearningRateScheduler
-from keras.callbacks import ReduceLROnPlateau
-from keras.preprocessing.image import ImageDataGenerator
-from keras.utils import multi_gpu_model
-from keras.regularizers import l2
-from keras import backend as K
-from keras.models import Model
-import pandas as pd
-from sklearn import datasets, linear_model
-from sklearn.model_selection import train_test_split
-from tensorflow.python.client import device_lib
-from keras.models import load_model
-
-# import matplotlib.pyplot as plt
-
-
-'''
-
- James M. Ferguson (j.ferguson@garvan.org.au)
- Genomic Technologies
- Garvan Institute
- Copyright 2019
-
- Tansel Ersevas ....
-
- script description
-
- ----------------------------------------------------------------------------
- version 0.0 - initial
-
- So a cutoff of: 0.4958776 for high accuracy
- and another of 0.2943664 for high recovery
-
- TODO:
- -
-
- ----------------------------------------------------------------------------
- MIT License
-
- Copyright (c) 2019 -------NAME----------
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-'''
-
-
-
-
-class MyParser(argparse.ArgumentParser):
- def error(self, message):
- sys.stderr.write('error: %s\n' % message)
- self.print_help()
- sys.exit(2)
-
-def print_verbose(message):
- '''verbose printing'''
- sys.stderr.write('info: %s\n' % message)
-
-def print_err(message):
- '''error printing'''
- sys.stderr.write('error: %s\n' % message)
-
-def read_config(filename):
- config = configparser.ConfigParser()
- config.read(filename)
- return(config)
-
-def _get_available_devices():
- local_device_protos = device_lib.list_local_devices()
- return [x.name for x in local_device_protos]
-
-def _check_available_devices():
- available_devices = _get_available_devices()
- print_verbose(available_devices)
- # Make sure requested GPUs are available or at least warn if they aren't
-    return(True)
-
-def read_model(model_name):
- # model = load_model('saved_models/' + model_name)
- model = load_model(model_name) # as a path
- model.compile(loss='categorical_crossentropy',
- optimizer=Adam(),
- metrics=['accuracy'])
- return(model)
-
-squiggle_max = 1199
-squiggle_min = 1
-input_cut = 72000  # potentially needs to be configurable
-image_size = 224
-num_classes = 4
-window = 2000
-
-def main():
- '''
- Main function
- '''
- VERSION = "0.9.0"
-
- parser = MyParser(
- description="DeePlexiCon - Demultiplex direct RNA reads")
- #group = parser.add_mutually_exclusive_group()
- parser.add_argument("-p", "--path",
- help="Top path of fast5 files to dmux")
- parser.add_argument("-t", "--type", default="multi", choices=["multi", "single"],
- help="Multi or single fast5s")
- parser.add_argument("-c", "--config",
- help="config file")
- # parser.add_argument("-g", "--gpu_list", default=1
- # help="list of gpus, 1, or [1,3,5], etc. of PCI_BUS_ID order")
- # parser.add_argument("-o", "--output",
- # help="Output directory")
- # parser.add_argument("-s", "--threshold", type=float, default=0.7,
- # help="confidence interval threshold")
- parser.add_argument("-s", "--threshold", type=float, default=0.50,
- help="probability threshold - 0.5 hi accuracy / 0.3 hi recovery")
- # populate choices with models found in saved_models/
- # parser.add_argument("-m", "--model", default="4_bc_normal.h5", choices=["4_bc_normal.h5", "model2"],
- parser.add_argument("-m", "--model",
- help="Trained model name to use")
- parser.add_argument("-b", "--batch_size", type=int, default=4000,
- help="batch size - for single fast5s")
- parser.add_argument("-V", "--version",
- help="Prints version")
- parser.add_argument("-v", "--verbose", action="store_true",
- help="Verbose output")
-
- args = parser.parse_args()
- # print help if no arguments given
- if len(sys.argv) == 1:
- parser.print_help(sys.stderr)
- sys.exit(1)
-
- if args.verbose:
- print_verbose("Verbose mode active - dumping info to stderr")
- print_verbose("DeePlexiCon: {}".format(VERSION))
- print_verbose("arg list: {}".format(args))
- if tf.test.gpu_device_name():
- print_verbose('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
- else:
- print_verbose("Please install GPU version of TF")
-
-
- # Globals
- if args.config:
- config = read_config(args.config) #TODO check config read error
-
- squiggle_max = 1199
- squiggle_min = 1
- input_cut = 72000 # potentially needs to be configurable
- image_size = 224
- num_classes = 4
- window = 2000
- # Devices
- # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-
- # os.environ["CUDA_VISIBLE_DEVICES"] = config[deeplexicon][gpu_list] if args.config else args.gpu_list
-
- # do check devices are available, else throw an error
-
-
- # main logic
-
- # read model
-
- #TODO Check model errors
- model = read_model(config['deeplexicon']['trained_model']) if args.config else read_model(args.model)
- labels = []
- images = []
- fast5s = {}
- stats = ""
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format("fast5", "ReadID", "Barcode", "Confidence Interval", "P_bc_1", "P_bc_2", "P_bc_3", "P_bc_4"))
- # for file in input...
- for dirpath, dirnames, files in os.walk(args.path):
- for fast5 in files:
- if fast5.endswith('.fast5'):
- fast5_file = os.path.join(dirpath, fast5)
- if args.type == "single":
- # everything below this, send off in batches of N=args.batch_size
- # The signal extraction and segmentation can happen in the first step
- # read fast5 files
- readID, seg_signal = get_single_fast5_signal(fast5_file, window)
- # segment
- # array[name, signal.....]
- # seg_signal = dRNA_segmenter(signal)
- if not seg_signal:
- print_err("Segment not found for:\t{}\t{}".format(fast5_file, readID))
- continue
- # convert
- sig = np.array(seg_signal, dtype=float)
- # print(sig)
- img = convert_to_image(sig)
- # print(img)
- labels.append(readID) # read ID?
- # print_verbose(labels)
- fast5s[readID] = fast5
- # print_verbose(readID)
- # print_verbose(fast5)
- # print_verbose(fast5s)
- images.append(img)
- # sys.exit()
- # classify
- if len(labels) >= args.batch_size:
- C = classify(model, labels, np.array(images), False, args.threshold)
- # out = classify(model, readID, np.array([img]), False, 0.0)
- # print_verbose(C)
- # save to output
- for readID, out, c, P in C:
- prob = [round(float(i), 6) for i in P]
- cm = round(float(c), 4)
- if args.verbose:
- print_verbose("cm is: {}".format(cm))
- if out == 0:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_1", cm, prob[0], prob[1], prob[2], prob[3]))
- elif out == 1:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_2", cm, prob[0], prob[1], prob[2], prob[3]))
- elif out == 2:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_3", cm, prob[0], prob[1], prob[2], prob[3]))
- elif out == 3:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_4", cm, prob[0], prob[1], prob[2], prob[3]))
- else:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "unknown", cm, prob[0], prob[1], prob[2], prob[3]))
- labels = []
- images = []
- fast5s = {}
-
-
- elif args.type == "multi":
- # everything below this, send off in batches of N=args.batch_size
- # The signal extraction and segmentation can happen in the first step
- # read fast5 files
- seg_signal = get_multi_fast5_signal(fast5_file, window)
- # segment
- # array[name, signal.....]
- # seg_signal = dRNA_segmenter(signal)
- # print_verbose(list(seg_signal.keys()))
- sig_count = 0
- for readID in seg_signal:
- # convert
- img = convert_to_image(np.array(seg_signal[readID], dtype=float))
- labels.append(readID) #change to readID?
- images.append(img)
- fast5s[readID] = fast5
- sig_count += 1
- if len(labels) >= args.batch_size:
- C = classify(model, labels, np.array(images), False, args.threshold)
- # out = classify(model, readID, np.array([img]), False, 0.0)
- # print_verbose(C)
- # save to output
- for readID, out, c, P in C:
- prob = [round(float(i), 6) for i in P]
- cm = round(float(c), 4)
- if args.verbose:
- print_verbose("cm is: {}".format(cm))
- if out == 0:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_1", cm, prob[0], prob[1], prob[2], prob[3]))
- elif out == 1:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_2", cm, prob[0], prob[1], prob[2], prob[3]))
- elif out == 2:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_3", cm, prob[0], prob[1], prob[2], prob[3]))
- elif out == 3:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_4", cm, prob[0], prob[1], prob[2], prob[3]))
- else:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "unknown", cm, prob[0], prob[1], prob[2], prob[3]))
- labels = []
- images = []
- fast5s = {}
- elif args.verbose:
- print_verbose("analysing sig_count: {}/{}".format(sig_count, len(seg_signal)))
- else:
- pass
- #finish up
- C = classify(model, labels, np.array(images), False, args.threshold)
- # out = classify(model, readID, np.array([img]), False, 0.0)
- # print_verbose(C)
- # save to output
- for readID, out, c, P in C:
- prob = [round(float(i), 6) for i in P]
- cm = round(float(c), 4)
- if args.verbose:
- print_verbose("cm is: {}".format(cm))
- if out == 0:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_1", cm, prob[0], prob[1], prob[2], prob[3]))
- elif out == 1:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_2", cm, prob[0], prob[1], prob[2], prob[3]))
- elif out == 2:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_3", cm, prob[0], prob[1], prob[2], prob[3]))
- elif out == 3:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "bc_4", cm, prob[0], prob[1], prob[2], prob[3]))
- else:
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, "unknown", cm, prob[0], prob[1], prob[2], prob[3]))
- labels = []
- images = []
- fast5s = {}
-
-
-
- # final report/stats
- # print stats
-
-# file handling and segmentation
-
-def get_single_fast5_signal(read_filename, w):
- '''
- open single fast5 file and extract information
- '''
-
- # get readID and signal
- f5_dic = read_single_fast5(read_filename)
- if not f5_dic:
- print_err("Signal not extracted from: {}".format(read_filename))
- return 0, 0
- # segment on raw
- readID = f5_dic['readID']
- signal = f5_dic['signal']
- seg = dRNA_segmenter(readID, signal, w)
- if not seg:
- print_verbose("No segment found - skipping: {}".format(readID))
- return 0, 0
- # convert to pA
- pA_signal = convert_to_pA(f5_dic)
- # return signal/signals
- return readID, pA_signal[seg[0]:seg[1]]
-
-
-def get_multi_fast5_signal(read_filename, w):
- '''
- open multi fast5 files and extract information
- '''
- pA_signals = {}
- f5_dic = read_multi_fast5(read_filename)
- # print_verbose(list(f5_dic.keys()))
- seg = 0
- sig_count = 0
- for read in f5_dic:
- sig_count += 1
- print_verbose("reading sig_count: {}/{}".format(sig_count, len(f5_dic)))
- # get readID and signal
- # print_verbose("read: {}".format(read))
- readID = f5_dic[read]['readID']
- # print_verbose("readID: {}".format(readID))
- signal = f5_dic[read]['signal']
- # print_verbose("sig: {}".format(signal[:5]))
- # continue
-
-
-
- # segment on raw
- seg = dRNA_segmenter(readID, signal, w)
- # print_verbose("seg return: {}".format(seg))
- if not seg:
- # print_verbose("No segment found - skipping: {}".format(readID))
- seg = 0
- continue
- # convert to pA
- pA_signal = convert_to_pA(f5_dic[read])
- # print_verbose("pA: {}".format(pA_signal[:5]))
- pA_signals[readID] = pA_signal[seg[0]:seg[1]]
- # return signal/signals
- return pA_signals
-
-
-def read_single_fast5(filename):
- '''
- read single fast5 file and return data
- '''
- f5_dic = {'signal': [], 'readID': '', 'digitisation': 0.0,
- 'offset': 0.0, 'range': 0.0, 'sampling_rate': 0.0}
-
- # open fast5 file
- try:
- hdf = h5py.File(filename, 'r')
- except:
- traceback.print_exc()
- print_err("extract_fast5():fast5 file failed to open: {}".format(filename))
- f5_dic = {}
- return f5_dic
- try:
- c = list(hdf['Raw/Reads'].keys())
- for col in hdf['Raw/Reads/'][c[0]]['Signal'][()]:
- f5_dic['signal'].append(int(col))
-
-
- f5_dic['readID'] = hdf['Raw/Reads/'][c[0]].attrs['read_id'].decode()
- f5_dic['digitisation'] = hdf['UniqueGlobalKey/channel_id'].attrs['digitisation']
- f5_dic['offset'] = hdf['UniqueGlobalKey/channel_id'].attrs['offset']
- f5_dic['range'] = float("{0:.2f}".format(hdf['UniqueGlobalKey/channel_id'].attrs['range']))
- f5_dic['sampling_rate'] = hdf['UniqueGlobalKey/channel_id'].attrs['sampling_rate']
-
- except:
- traceback.print_exc()
- print_err("extract_fast5():failed to extract events or fastq from: {}".format(filename))
- f5_dic = {}
-
- return f5_dic
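For orientation, here is a minimal sketch of the single-read fast5 layout that read_single_fast5() walks, assuming the older single-read schema used above ("example.fast5" is a hypothetical path):

    import h5py

    with h5py.File("example.fast5", "r") as hdf:
        read = list(hdf["Raw/Reads"].keys())[0]        # e.g. "Read_123"
        signal = hdf["Raw/Reads"][read]["Signal"][()]  # raw DAC values
        meta = hdf["UniqueGlobalKey/channel_id"].attrs
        print(len(signal), meta["digitisation"], meta["offset"], meta["range"])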
-
-
-def read_multi_fast5(filename):
- '''
- read multifast5 file and return data
- '''
- f5_dic = {}
- # c = 0
- with h5py.File(filename, 'r') as hdf:
- for read in list(hdf.keys()):
- # c += 1
- # if c > 100:
- # break
- f5_dic[read] = {'signal': [], 'readID': '', 'digitisation': 0.0,
- 'offset': 0.0, 'range': 0.0, 'sampling_rate': 0.0}
- try:
- for col in hdf[read]['Raw/Signal'][()]:
- f5_dic[read]['signal'].append(int(col))
-
- f5_dic[read]['readID'] = hdf[read]['Raw'].attrs['read_id'].decode()
- f5_dic[read]['digitisation'] = hdf[read]['channel_id'].attrs['digitisation']
- f5_dic[read]['offset'] = hdf[read]['channel_id'].attrs['offset']
- f5_dic[read]['range'] = float("{0:.2f}".format(hdf[read]['channel_id'].attrs['range']))
- f5_dic[read]['sampling_rate'] = hdf[read]['channel_id'].attrs['sampling_rate']
- except:
- traceback.print_exc()
- print_err("extract_fast5():failed to read readID: {}".format(read))
- return f5_dic
-
-
-def dRNA_segmenter(readID, signal, w):
- '''
- segment signal/s and return coords of cuts
- '''
- def _scale_outliers(squig):
- ''' Drop raw values outside the (0, 1200) signal range '''
- k = (squig > 0) & (squig < 1200)
- return squig[k]
-
-
- sig = _scale_outliers(np.array(signal, dtype=int))
-
- s = pd.Series(sig)
- t = s.rolling(window=w).mean()
- # This should be done better, or changed to median and benchmarked
- # Currently trained on mean segmented data
- mn = t.mean()
- std = t.std()
- # Trained on 0.5
- bot = mn - (std*0.5)
-
-
-
- # main algo
-
- begin = False
- # max distance for merging 2 segs
- seg_dist = 1500
- # max length of a seg
- hi_thresh = 200000
- # min length of a seg
- lo_thresh = 2000
-
- start = 0
- end = 0
- segs = []
- count = -1
- for i in t:
- count += 1
- if i < bot and not begin:
- start = count
- begin = True
- elif i < bot:
- end = count
- elif i > bot and begin:
- if segs and start - segs[-1][1] < seg_dist:
- segs[-1][1] = end
- else:
- segs.append([start, end])
- start = 0
- end = 0
- begin = False
- else:
- continue
-
- offset = -1050
- buff = 150
-
- # fig = plt.figure(1)
- # #fig.subplots_adjust(hspace=0.1, wspace=0.01)
- # ax = fig.add_subplot(111)
- #
- # print_verbose("segs: {}".format(segs))
- # # # Show segment lines
- # for i, j in segs:
- # ax.axvline(x=i+offset-buff, color='m')
- # ax.axvline(x=j+offset+buff, color='m')
- #
- # ax.axhline(y=bot, color='r')
- # plt.plot(sig, color='teal')
- # plt.plot(t, color='k')
- # plt.show()
- # plt.clf()
-
- x, y = 0, 0
-
- # print_verbose("segs befor filter: {}".format(segs))
- for a, b in segs:
- if b - a > hi_thresh:
- continue
- if b - a < lo_thresh:
- continue
- x, y = a, b
- # print "{}\t{}\t{}\t{}".format(f5, read_dic[f5], x, y)
- # print_verbose("xy before return: [{},{}]".format(x, y))
- return [x+offset-buff, y+offset+buff]
- print_verbose("dRNA_segmenter: no seg found: {}".format(readID))
- return 0
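The segmenter's heuristic is easiest to see on synthetic data: the rolling mean dips below mean - 0.5*std across the low-current stretch, and that dip becomes the candidate segment. A toy sketch with made-up numbers, not real squiggle data:

    import numpy as np
    import pandas as pd

    sig = np.concatenate([np.full(5000, 500.0),
                          np.full(3000, 300.0),   # low-current stretch
                          np.full(5000, 500.0)])
    t = pd.Series(sig).rolling(window=2000).mean()
    bot = t.mean() - 0.5 * t.std()
    idx = np.flatnonzero((t < bot).to_numpy())
    print(idx[0], idx[-1])  # rough bounds, before the offset/buffer shift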
-
-
-def convert_to_pA(d):
- '''
- convert raw signal data to pA using digitisation, offset, and range
- float raw_unit = range / digitisation;
- for (int32_t j = 0; j < nsample; j++) {
- rawptr[j] = (rawptr[j] + offset) * raw_unit;
- }
- '''
- digitisation = d['digitisation']
- range = d['range']
- offset = d['offset']
- raw_unit = range / digitisation
- new_raw = []
- for i in d['signal']:
- j = (i + offset) * raw_unit
- new_raw.append("{0:.2f}".format(round(j,2)))
- return new_raw
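A worked instance of the (raw + offset) * range / digitisation formula above, with illustrative (not real) channel calibration values:

    digitisation, sig_range, offset = 8192.0, 1467.61, 10.0
    raw_unit = sig_range / digitisation   # pA per DAC step
    raw_sample = 523
    print(round((raw_sample + offset) * raw_unit, 2))  # -> 95.49 pA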
-
-# Python timeseries interface
-
-
-def pyts_transform(transform, data, image_size, show=False, cmap='rainbow', img_index=0):
- try:
- t_start=time.time()
- X_transform = transform.fit_transform(data)
- # if args.verbose:
- # print_verbose("{} {}".format(time.time() - t_start, 'seconds'))
- if (show):
- plt.figure(figsize=(4, 4))
- plt.grid(b=None)
- plt.imshow(X_transform[0], cmap=cmap, origin='lower')
- plt.savefig(transform.__class__.__name__ + "_image_" + str(img_index) + ".svg", format="svg")
- plt.show()
- return(X_transform)
- except Exception as e:
- print_err(str(e))
- return([])
-
-
-def mtf_transform(data, image_size=500, show=False, img_index=0):
- transform = MarkovTransitionField(image_size)
- return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='rainbow', img_index=img_index))
-
-def rp_transform(data, image_size=500 ,show=False ,img_index=0):
- # RP transformation
- transform = RecurrencePlot(dimension=1,
- threshold='percentage_points',
- percentage=30)
- return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='binary', img_index=img_index))
-
-def gasf_transform(data, image_size=500, show=False, img_index=0):
- # GAF transformation
- transform = GramianAngularField(image_size, method='summation')
- return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='rainbow', img_index=img_index))
-
-def gadf_transform(data, image_size=500, show=False ,img_index=0):
- # GAF transformation
- transform = GramianAngularField(image_size, method='difference')
- return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='rainbow', img_index=img_index))
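Of these wrappers, only the GASF variant is used downstream (via convert_to_image). A minimal stand-alone sketch of that path, assuming pyts's GramianAngularField API:

    import numpy as np
    from pyts.image import GramianAngularField

    squiggle = np.random.uniform(80, 160, 4000)        # fake pA trace
    gasf = GramianAngularField(224, method='summation')
    img = gasf.fit_transform(squiggle.reshape(1, -1))
    print(img.shape)                                   # (1, 224, 224)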
-
-
-def labels_for(a_file_name):
- segments=re.split(r'[_\-\.]+', a_file_name)
- return(segments)
-
-def max_in_sequence(sequence):
- return(max(np.amax([list(d.values()) for d in sequence]), 0.01))
-
-def compress_squiggle(squiggle, compress_factor):
- squiggle_len = len(squiggle)
- rem = squiggle_len % compress_factor
- if rem > 0:
- return(np.mean(squiggle[0:squiggle_len - rem].reshape(-1,compress_factor), axis=1))
- return(squiggle)
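compress_squiggle() mean-pools the signal by compress_factor, truncating any remainder; note that it only pools when a remainder exists, so an evenly divisible squiggle is returned unchanged, which looks unintentional. For the remainder case:

    import numpy as np

    squiggle = np.arange(10.0)                 # 10 samples, factor 3 -> 9 used
    pooled = np.mean(squiggle[:9].reshape(-1, 3), axis=1)
    print(pooled)                              # [1. 4. 7.]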
-
-def convert_to_image(signal):
- transformed_squiggle = gasf_transform(signal.reshape(1,-1), image_size=image_size, show=False)
- return(transformed_squiggle)
-
-
-def confidence_margin(npa):
- sorted = np.sort(npa)[::-1] #return sort in reverse, i.e. descending
- # sorted = np.sort(npa) #return sort in reverse, i.e. descending
- d = sorted[0] - sorted[1]
- # if args.verbose:
- # print_verbose("Sorted: {}".format(sorted))
- # print_verbose("conf interval: {}".format(d))
- # return(sorted[0][0] - sorted[0][1])
- return(d)
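confidence_margin() is simply the gap between the two highest class probabilities, which the script reports in its output as the "Confidence Interval"; for example:

    import numpy as np

    probs = np.array([0.62, 0.21, 0.12, 0.05])  # hypothetical P_bc_1..P_bc_4
    top = np.sort(probs)[::-1]
    print(round(top[0] - top[1], 2))            # 0.41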
-
-def classify(model, labels, image, subtract_pixel_mean, threshold):
- input_shape = image.shape[1:]
- x = image.astype('float32') / 255
-
- # If subtract pixel mean is enabled
- if subtract_pixel_mean:
- x_mean = np.mean(x, axis=0)
- x -= x_mean
- x=[x]
- # print(x)
- y = model.predict(x, verbose=0)
- # print(y)
- res = []
- for i in range(len(y)):
- cm = confidence_margin(y[i])
- #print_verbose(y[i][np.argmax(y[i])])
- if y[i][np.argmax(y[i])] >= threshold:
- # return(np.argmax(y))
- res.append([labels[i], np.argmax(y[i]), cm, y[i]])
- # return(None)
- # return(np.argmax(y))
- else:
- res.append([labels[i], None, cm, y[i]])
- return res
-
-if __name__ == '__main__':
- main()
diff --git a/NanoPreprocessSimple/bin/deeplexicon.py b/NanoPreprocessSimple/bin/deeplexicon.py
deleted file mode 100755
index c58b3b6..0000000
--- a/NanoPreprocessSimple/bin/deeplexicon.py
+++ /dev/null
@@ -1,622 +0,0 @@
-#!/usr/bin/env python3
-# coding: utf-8
-
-import warnings
-warnings.filterwarnings("ignore", category=FutureWarning)
-import argparse
-import sys
-import os
-from copy import deepcopy
-import re
-import csv
-import time
-import configparser
-import h5py
-import traceback
-import math
-import numpy as np
- # from PIL import Image
- import matplotlib.pyplot as plt  # used by pyts_transform() when show=True
-import pyts
-from pyts.image import MarkovTransitionField, GramianAngularField, RecurrencePlot
-import tensorflow as tf
-import keras
-from keras.layers import Dense, Conv2D, BatchNormalization, Activation
-from keras.layers import AveragePooling2D, Input, Flatten
-from keras.optimizers import Adam
-from keras.callbacks import ModelCheckpoint, LearningRateScheduler
-from keras.callbacks import ReduceLROnPlateau
-from keras.preprocessing.image import ImageDataGenerator
-from keras.utils import multi_gpu_model
-from keras.regularizers import l2
-from keras import backend as K
-from keras.models import Model
-import pandas as pd
-from sklearn import datasets, linear_model
-from sklearn.model_selection import train_test_split
-from tensorflow.python.client import device_lib
-from keras.models import load_model
-
-
-
-'''
-
- James M. Ferguson (j.ferguson@garvan.org.au)
- Genomic Technologies
- Garvan Institute
- Copyright 2019
-
- Tansel Ersevas (t.ersevas@garvan.org.au)
-
- Demultiplex Oxford Nanopore direct RNA reads by classifying barcode squiggles
-
-
-
- ----------------------------------------------------------------------------
- version 0.0.0 - initial
- version 0.8.0 - CPU version Done
- version 0.9.0 - Fixed segment offset
- version 0.9.1 - added segment and squiggle output
- version 0.9.2 - separate segment output and code clean up
- version 1.0.0 - initial release
-
- So a cutoff of: 0.4958776 for high accuracy
- and another of 0.2943664 for high recovery
-
- TODO:
- - Remove leftover libraries
- - remove debug plots
- - Remove redundant code
- - create log files with information
- - take in fastq for dmux splitting
- - take in paf or bam for training splitting
-
-
- ----------------------------------------------------------------------------
- MIT License
-
- Copyright (c) 2019 James M. Ferguson
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-'''
-
-
-
-
-class MyParser(argparse.ArgumentParser):
- def error(self, message):
- sys.stderr.write('error: %s\n' % message)
- self.print_help()
- sys.exit(2)
-
-def print_verbose(message):
- '''verbose printing'''
- sys.stderr.write('info: %s\n' % message)
-
-def print_err(message):
- '''error printing'''
- sys.stderr.write('error: %s\n' % message)
-
-def read_config(filename):
- config = configparser.ConfigParser()
- config.read(filename)
- return(config)
-
-def _get_available_devices():
- local_device_protos = device_lib.list_local_devices()
- return [x.name for x in local_device_protos]
-
-def _check_available_devices():
- available_devices = _get_available_devices()
- print_verbose(available_devices)
- # Make sure requested GPUs are available or at least warn if they aren't
- return(True)
-
-def read_model(model_name):
- # model = load_model('saved_models/' + model_name)
- model = load_model(model_name) # as a path
- model.compile(loss='categorical_crossentropy',
- optimizer=Adam(),
- metrics=['accuracy'])
- return(model)
-
-squiggle_max = 1199
-squiggle_min = 1
-input_cut = 72000 # potentially needs to be configurable
-image_size = 224
-num_classes = 4
-window = 2000
-
-def main():
- '''
- Main function
- '''
- VERSION = "1.0.0"
-
- parser = MyParser(
- description="DeePlexiCon - Demultiplex direct RNA reads")
- #group = parser.add_mutually_exclusive_group()
- parser.add_argument("-p", "--path",
- help="Top path of fast5 files to dmux")
- parser.add_argument("-f", "--form", default="multi", choices=["multi", "single"],
- help="Multi or single fast5s")
- parser.add_argument("-c", "--config",
- help="config file")
- # parser.add_argument("-g", "--gpu_list", default=1
- # help="list of gpus, 1, or [1,3,5], etc. of PCI_BUS_ID order")
- # parser.add_argument("-o", "--output",
- # help="Output directory")
- parser.add_argument("-s", "--threshold", type=float, default=0.50,
- help="probability threshold - 0.5 hi accuracy / 0.3 hi recovery")
- # populate choices with models found in saved_models/
- # parser.add_argument("-m", "--model", default="4_bc_normal.h5", choices=["4_bc_normal.h5", "model2"],
- parser.add_argument("-m", "--model",
- help="Trained model name to use")
- parser.add_argument("--squiggle",
- help="dump squiggle data into this .tsv file")
- parser.add_argument("--segment",
- help="dump segment data into this .tsv file")
- parser.add_argument("-b", "--batch_size", type=int, default=4000,
- help="batch size - for single fast5s")
- parser.add_argument("-V", "--version",
- help="Prints version")
- parser.add_argument("-v", "--verbose", action="store_true",
- help="Verbose output")
-
- args = parser.parse_args()
- # print help if no arguments given
- if len(sys.argv) == 1:
- parser.print_help(sys.stderr)
- sys.exit(1)
-
- if args.verbose:
- print_verbose("Verbose mode active - dumping info to stderr")
- print_verbose("DeePlexiCon: {}".format(VERSION))
- print_verbose("arg list: {}".format(args))
- if tf.test.gpu_device_name():
- print_verbose('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
- else:
- print_verbose("Please install GPU version of TF")
-
- if args.squiggle:
- squig_file = args.squiggle
- with open(squig_file, 'a') as f:
- f.write("{}\t{}\n".format("ReadID", "signal_pA"))
- else:
- squig_file = ''
-
- if args.segment:
- seg_file = args.segment
- with open(seg_file, 'a') as f:
- f.write("{}\t{}\t{}\n".format("ReadID", "start", "stop"))
- else:
- seg_file = ''
-
- # Globals
- if args.config:
- config = read_config(args.config) #TODO check config read error
-
- # gpu settings
- # Devices
- # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-
- # os.environ["CUDA_VISIBLE_DEVICES"] = config[deeplexicon][gpu_list] if args.config else args.gpu_list
-
- # do check devices are available, else throw an error
-
-
- # main logic
-
- # read model
-
- model = read_model(config['deeplexicon']['trained_model']) if args.config else read_model(args.model)
- barcode_out = {0: "bc_1",
- 1: "bc_2",
- 2: "bc_3",
- 3: "bc_4",
- None: "unknown"
- }
- labels = []
- images = []
- fast5s = {}
- stats = ""
- seg_dic = {}
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format("fast5", "ReadID", "Barcode", "Confidence Interval", "P_bc_1", "P_bc_2", "P_bc_3", "P_bc_4"))
- # for file in input...
- for dirpath, dirnames, files in os.walk(args.path):
- for fast5 in files:
- if fast5.endswith('.fast5'):
- fast5_file = os.path.join(dirpath, fast5)
- if args.form == "single":
- # everything below this, send off in batches of N=args.batch_size
- # The signal extraction and segmentation can happen in the first step
- # read fast5 files
- readID, seg_signal = get_single_fast5_signal(fast5_file, window, squig_file, seg_file)
- if not seg_signal:
- print_err("Segment not found for:\t{}\t{}".format(fast5_file, readID))
- continue
- # convert
- sig = np.array(seg_signal, dtype=float)
- img = convert_to_image(sig)
- labels.append(readID)
- fast5s[readID] = fast5
- images.append(img)
- # classify
- if len(labels) >= args.batch_size:
- C = classify(model, labels, np.array(images), False, args.threshold)
- # save to output
- for readID, out, c, P in C:
- prob = [round(float(i), 6) for i in P]
- cm = round(float(c), 4)
- if args.verbose:
- print_verbose("cm is: {}".format(cm))
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, barcode_out[out], cm, prob[0], prob[1], prob[2], prob[3]))
- labels = []
- images = []
- fast5s = {}
-
-
- elif args.form == "multi":
- # everything below this, send off in batches of N=args.batch_size
- # The signal extraction and segmentation can happen in the first step
- # read fast5 files
- seg_signal = get_multi_fast5_signal(fast5_file, window, squig_file, seg_file)
- sig_count = 0
- for readID in seg_signal:
- # convert
- img = convert_to_image(np.array(seg_signal[readID], dtype=float))
- labels.append(readID)
- images.append(img)
- fast5s[readID] = fast5
- sig_count += 1
- if len(labels) >= args.batch_size:
- C = classify(model, labels, np.array(images), False, args.threshold)
- # save to output
- for readID, out, c, P in C:
- prob = [round(float(i), 6) for i in P]
- cm = round(float(c), 4)
- if args.verbose:
- print_verbose("cm is: {}".format(cm))
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, barcde_out[out], cm, prob[0], prob[1], prob[2], prob[3]))
- labels = []
- images = []
- fast5s = {}
- elif args.verbose:
- print_verbose("analysing sig_count: {}/{}".format(sig_count, len(seg_signal)))
- else:
- pass
- #finish up
- C = classify(model, labels, np.array(images), False, args.threshold)
- # save to output
- for readID, out, c, P in C:
- prob = [round(float(i), 6) for i in P]
- cm = round(float(c), 4)
- if args.verbose:
- print_verbose("cm is: {}".format(cm))
- print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fast5s[readID], readID, barcode_out[out], cm, prob[0], prob[1], prob[2], prob[3]))
- labels = []
- images = []
- fast5s = {}
-
-
- # final report/stats
- # print stats
-
-# file handling and segmentation
-
-def get_single_fast5_signal(read_filename, w, squig_file, seg_file):
- '''
- open single fast5 file and extract information
- '''
-
- # get readID and signal
- f5_dic = read_single_fast5(read_filename)
- if not f5_dic:
- print_err("Signal not extracted from: {}".format(read_filename))
- return 0, 0
- # segment on raw
- readID = f5_dic['readID']
- signal = f5_dic['signal']
- seg = dRNA_segmenter(readID, signal, w)
- if not seg:
- print_verbose("No segment found - skipping: {}".format(readID))
- return 0, 0
- # convert to pA
- pA_signal = convert_to_pA(f5_dic)
- if squig_file:
- with open(squig_file, 'a') as f:
- f.write("{}\t{}\n".format(readID, "\t".join(pA_signal)))
- if seg_file:
- with open(seg_file, 'a') as f:
- f.write("{}\t{}\t{}\n".format(readID, seg[0], seg[1]))
- # return signal/signals
- return readID, pA_signal[seg[0]:seg[1]]
-
-
-def get_multi_fast5_signal(read_filename, w, squig_file, seg_file):
- '''
- open multi fast5 files and extract information
- '''
- pA_signals = {}
- seg_dic = {}
- f5_dic = read_multi_fast5(read_filename)
- seg = 0
- sig_count = 0
- for read in f5_dic:
- sig_count += 1
- print_verbose("reading sig_count: {}/{}".format(sig_count, len(f5_dic)))
- # get readID and signal
- readID = f5_dic[read]['readID']
- signal = f5_dic[read]['signal']
-
- # segment on raw
- seg = dRNA_segmenter(readID, signal, w)
- if not seg:
- seg = 0
- continue
- # convert to pA
- pA_signal = convert_to_pA(f5_dic[read])
- if squig_file:
- with open(squig_file, 'a') as f:
- f.write("{}\t{}\n".format(readID, "\t".join(pA_signal)))
- if seg_file:
- with open(seg_file, 'a') as f:
- f.write("{}\t{}\t{}\n".format(readID, seg[0], seg[1]))
- pA_signals[readID] = pA_signal[seg[0]:seg[1]]
- seg_dic[readID] = seg
- # return signal/signals
- return pA_signals
-
-
-def read_single_fast5(filename):
- '''
- read single fast5 file and return data
- '''
- f5_dic = {'signal': [], 'readID': '', 'digitisation': 0.0,
- 'offset': 0.0, 'range': 0.0, 'sampling_rate': 0.0}
-
- # open fast5 file
- try:
- hdf = h5py.File(filename, 'r')
- except:
- traceback.print_exc()
- print_err("extract_fast5():fast5 file failed to open: {}".format(filename))
- f5_dic = {}
- return f5_dic
- try:
- c = list(hdf['Raw/Reads'].keys())
- for col in hdf['Raw/Reads/'][c[0]]['Signal'][()]:
- f5_dic['signal'].append(int(col))
-
- f5_dic['readID'] = hdf['Raw/Reads/'][c[0]].attrs['read_id'].decode()
- f5_dic['digitisation'] = hdf['UniqueGlobalKey/channel_id'].attrs['digitisation']
- f5_dic['offset'] = hdf['UniqueGlobalKey/channel_id'].attrs['offset']
- f5_dic['range'] = float("{0:.2f}".format(hdf['UniqueGlobalKey/channel_id'].attrs['range']))
- f5_dic['sampling_rate'] = hdf['UniqueGlobalKey/channel_id'].attrs['sampling_rate']
-
- except:
- traceback.print_exc()
- print_err("extract_fast5():failed to extract events or fastq from: {}".format(filename))
- f5_dic = {}
-
- return f5_dic
-
-
-def read_multi_fast5(filename):
- '''
- read multifast5 file and return data
- '''
- f5_dic = {}
- with h5py.File(filename, 'r') as hdf:
- for read in list(hdf.keys()):
- f5_dic[read] = {'signal': [], 'readID': '', 'digitisation': 0.0,
- 'offset': 0.0, 'range': 0.0, 'sampling_rate': 0.0}
- try:
- for col in hdf[read]['Raw/Signal'][()]:
- f5_dic[read]['signal'].append(int(col))
-
- f5_dic[read]['readID'] = hdf[read]['Raw'].attrs['read_id'].decode()
- f5_dic[read]['digitisation'] = hdf[read]['channel_id'].attrs['digitisation']
- f5_dic[read]['offset'] = hdf[read]['channel_id'].attrs['offset']
- f5_dic[read]['range'] = float("{0:.2f}".format(hdf[read]['channel_id'].attrs['range']))
- f5_dic[read]['sampling_rate'] = hdf[read]['channel_id'].attrs['sampling_rate']
- except:
- traceback.print_exc()
- print_err("extract_fast5():failed to read readID: {}".format(read))
- return f5_dic
-
-
-def dRNA_segmenter(readID, signal, w):
- '''
- segment signal/s and return coords of cuts
- '''
- def _scale_outliers(squig):
- ''' Drop raw values outside the (0, 1200) signal range '''
- k = (squig > 0) & (squig < 1200)
- return squig[k]
-
-
- sig = _scale_outliers(np.array(signal, dtype=int))
-
- s = pd.Series(sig)
- t = s.rolling(window=w).mean()
- # This should be done better, or changed to median and benchmarked
- # Currently trained on mean segmented data
- mn = t.mean()
- std = t.std()
- # Trained on 0.5
- bot = mn - (std*0.5)
-
- # main algo
-
- begin = False
- # max distance for merging 2 segs
- seg_dist = 1500
- # max length of a seg
- hi_thresh = 200000
- # min length of a seg
- lo_thresh = 2000
-
- start = 0
- end = 0
- segs = []
- count = -1
- for i in t:
- count += 1
- if i < bot and not begin:
- start = count
- begin = True
- elif i < bot:
- end = count
- elif i > bot and begin:
- if segs and start - segs[-1][1] < seg_dist:
- segs[-1][1] = end
- else:
- segs.append([start, end])
- start = 0
- end = 0
- begin = False
- else:
- continue
-
- # offset = -1050
- # buff = 150
- offset = -1000
- buff = 0
-
- x, y = 0, 0
-
- for a, b in segs:
- if b - a > hi_thresh:
- continue
- if b - a < lo_thresh:
- continue
- x, y = a, b
-
- # to be modified in next major re-training
- return [x+offset-buff, y+offset+buff]
- print_verbose("dRNA_segmenter: no seg found: {}".format(readID))
- return 0
-
-
-def convert_to_pA(d):
- '''
- convert raw signal data to pA using digitisation, offset, and range
- float raw_unit = range / digitisation;
- for (int32_t j = 0; j < nsample; j++) {
- rawptr[j] = (rawptr[j] + offset) * raw_unit;
- }
- '''
- digitisation = d['digitisation']
- range = d['range']
- offset = d['offset']
- raw_unit = range / digitisation
- new_raw = []
- for i in d['signal']:
- j = (i + offset) * raw_unit
- new_raw.append("{0:.2f}".format(round(j,2)))
- return new_raw
-
-
-def pyts_transform(transform, data, image_size, show=False, cmap='rainbow', img_index=0):
- try:
- t_start=time.time()
- X_transform = transform.fit_transform(data)
- if (show):
- plt.figure(figsize=(4, 4))
- plt.grid(b=None)
- plt.imshow(X_transform[0], cmap=cmap, origin='lower')
- plt.savefig(transform.__class__.__name__ + "_image_" + str(img_index) + ".svg", format="svg")
- plt.show()
- return(X_transform)
- except Exception as e:
- print_err(str(e))
- return([])
-
-
-def mtf_transform(data, image_size=500, show=False, img_index=0):
- transform = MarkovTransitionField(image_size)
- return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='rainbow', img_index=img_index))
-
-def rp_transform(data, image_size=500 ,show=False ,img_index=0):
- # RP transformation
- transform = RecurrencePlot(dimension=1,
- threshold='percentage_points',
- percentage=30)
- return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='binary', img_index=img_index))
-
-def gasf_transform(data, image_size=500, show=False, img_index=0):
- # GAF transformation
- transform = GramianAngularField(image_size, method='summation')
- return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='rainbow', img_index=img_index))
-
-def gadf_transform(data, image_size=500, show=False ,img_index=0):
- # GAF transformation
- transform = GramianAngularField(image_size, method='difference')
- return(pyts_transform(transform, data, image_size=image_size, show=show, cmap='rainbow', img_index=img_index))
-
-
-def labels_for(a_file_name):
- segments=re.split(r'[_\-\.]+', a_file_name)
- return(segments)
-
-def max_in_sequence(sequence):
- return(max(np.amax([list(d.values()) for d in sequence]), 0.01))
-
-def compress_squiggle(squiggle, compress_factor):
- squiggle_len = len(squiggle)
- rem = squiggle_len % compress_factor
- if rem > 0:
- return(np.mean(squiggle[0:squiggle_len - rem].reshape(-1,compress_factor), axis=1))
- return(squiggle)
-
-def convert_to_image(signal):
- transformed_squiggle = gasf_transform(signal.reshape(1,-1), image_size=image_size, show=False)
- return(transformed_squiggle)
-
-
-def confidence_margin(npa):
- sorted = np.sort(npa)[::-1] #return sort in reverse, i.e. descending
- # sorted = np.sort(npa) #return sort in reverse, i.e. descending
- d = sorted[0] - sorted[1]
- return(d)
-
-def classify(model, labels, image, subtract_pixel_mean, threshold):
- input_shape = image.shape[1:]
- # x = image.astype('float32') / 255
- x = image.astype('float32') + 1
- x = x / 2
-
- # If subtract pixel mean is enabled
- if subtract_pixel_mean:
- x_mean = np.mean(x, axis=0)
- x -= x_mean
- x=[x]
- y = model.predict(x, verbose=0)
- res = []
- for i in range(len(y)):
- cm = confidence_margin(y[i])
- if y[i][np.argmax(y[i])] >= threshold:
- res.append([labels[i], np.argmax(y[i]), cm, y[i]])
- else:
- res.append([labels[i], None, cm, y[i]])
- return res
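One substantive change from the copy above is the normalisation: GASF pixel values lie in [-1, 1] by construction, so (x + 1) / 2 maps them onto [0, 1], whereas the earlier x / 255 implicitly assumed 8-bit image input.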
-
-if __name__ == '__main__':
- main()
diff --git a/NanoPreprocessSimple/bin/extract_sequence_from_fastq.py b/NanoPreprocessSimple/bin/extract_sequence_from_fastq.py
deleted file mode 100755
index 27f8406..0000000
--- a/NanoPreprocessSimple/bin/extract_sequence_from_fastq.py
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/usr/bin/env python
-import sys
-import gzip
-import pprint
-import os
-
-usage = '''
-created by Huanle and Luca for Master of Pores! :)
-python extract_sequence_from_fastq.py <demux.tsv> <reads.fastq[.gz]>
-'''
-
-if len (sys.argv) < 3:
- print (usage)
- exit(1)
-
-def fopen (f):
- if f.endswith('.gz'):
- return (gzip.open(f,'rt'))
- else:
- return (open (f,'rt'))
-
-
-IDs = dict ()
-fh = fopen (sys.argv[1])
-for l in fh:
- ary = l.strip().split()
- IDs[ary[1]] = ary[2]
-fh.close()
-
-outprefix = os.path.splitext(sys.argv[2])[0]
-outext = os.path.splitext(sys.argv[2])[1]
-if (outext == '.gz'):
- outprefix = os.path.splitext(outprefix)[0]
-
-fh = fopen (sys.argv[2])
-for l in fh:
- rd = l.strip().split()[0]
- rd = rd.replace('@','')
- seq = next(fh).strip()
- inf = next (fh).strip()
- q = next(fh).strip()
- if rd in IDs:
- outfile = outprefix + "." + IDs[rd] + ".fastq"
- fw = open(outfile,"a+")
- rd = '@'+rd
- string = "{0}\n{1}\n{2}\n{3}\n".format(rd,seq,inf,q)
- fw.write(string)
- fw.close()
- #print (string)
-fh.close()
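A hypothetical stand-alone invocation would be `python extract_sequence_from_fastq.py all_demux.txt sample.fastq.gz`, which appends each matching read to `sample.<barcode>.fastq`, one file per barcode label in the third column of the demux table.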
diff --git a/NanoPreprocessSimple/bin/fast5_to_fastq.py b/NanoPreprocessSimple/bin/fast5_to_fastq.py
deleted file mode 100755
index a80ce10..0000000
--- a/NanoPreprocessSimple/bin/fast5_to_fastq.py
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/usr/bin/env python
-import sys, warnings,os
-import numpy as np
-import re
-import gzip
-import h5py, ont_fast5_api
-from ont_fast5_api.fast5_interface import get_fast5_file
-
-usage = '''
-_USAGE_: python fast5_to_fastq.py fast5file
-_INPUT_: both single- and multi- fast5 files can be processed
-_BUGS_: report to elzedliu@gmail.com
-'''
-
-if len(sys.argv) < 2:
- print (usage); exit(0)
-
-f5file = sys.argv[1]
-hdf5 = h5py.File(f5file,'r')
-f5 = get_fast5_file (f5file,mode='r')
-reads_in_f5 = f5.get_read_ids()
-number_of_reads = len( reads_in_f5)
-
-
-fq_out = open ('.'.join(f5file.split('.')[:-1])+'.fastq','w')
-
-def fastq_from_fast5 (h5f):
- '''
- for a single-read fast5 file, pass the hdf5 file opened with h5py
- for a multi-read fast5 file, pass hdf5['read_id'] for a single read
- '''
- k = 'Analyses/Basecall_1D_000/BaseCalled_template/Fastq'
- if k in h5f:
- fastq = h5f[k].value.decode()
- return fastq
- else:
- return None
-
-if number_of_reads == 1:
- fq = fastq_from_fast5 (hdf5)
- if fq :
- fq_out.write (fq)
- else:
- sys.stderr.write("no sequence basecalled for {} \n".format(f5file))
-elif number_of_reads > 1:
- for rd in hdf5.keys():
- fq = fastq_from_fast5 (hdf5[rd])
- if fq:
- fq = fq.replace('_Basecall_1D_template','')
- fq_out.write(fq)
- else:
- sys.stderr.write("no basecall for read {} \n" .format(rd))
-fq_out.close()
diff --git a/NanoPreprocessSimple/bin/fast5_type.py b/NanoPreprocessSimple/bin/fast5_type.py
deleted file mode 100755
index 626c1ed..0000000
--- a/NanoPreprocessSimple/bin/fast5_type.py
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/env python
-import sys, warnings,os
-import h5py, ont_fast5_api
-from ont_fast5_api.fast5_interface import get_fast5_file
-
-__author__ = 'Huanle.Liu@crg.eu'
-__version__ = '0.1'
-__email__ = 'same as author'
-
-usage = '''
- python fast5_type.py fast5file
- return:
- 0: single read fast5
- 1: multi-reads fast5
-'''
-
-if len (sys.argv) < 2:
- print (usage)
- exit(0)
-
-f5file = sys.argv[1]
-hdf5 = h5py.File(f5file,'r')
-f5 = get_fast5_file (f5file,mode='r')
-reads_in_f5 = f5.get_read_ids()
-number_of_reads = len( reads_in_f5)
-if number_of_reads == 1:
- print (0)
-elif number_of_reads > 1:
- print (1)
diff --git a/NanoPreprocessSimple/bin/reorganize.sh b/NanoPreprocessSimple/bin/reorganize.sh
deleted file mode 100755
index ece54cc..0000000
--- a/NanoPreprocessSimple/bin/reorganize.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/usr/bin/env bash
-
-if [ x"$1" == x ]; then
-
- "please specify an output folder"
-
- exit 1
-
-fi
-
-for i in $1/fast5_files/* ; do mkdir -p $1/`basename $i`/fast5_files; mv $i/* $1/`basename $i`/fast5_files; done
-for i in $1/fast5_files/* ; do mkdir -p $1/`basename $i`/fastq_files; mv $1/fastq_files/*`basename $i`.fq.gz $1/`basename $i`/fastq_files; done
-for i in $1/fast5_files/* ; do mkdir -p $1/`basename $i`/counts; mv $1/counts/*`basename $i`.count $1/`basename $i`/counts; done
-for i in $1/fast5_files/* ; do mkdir -p $1/`basename $i`/assigned; mv $1/assigned/*`basename $i`.assigned $1/`basename $i`/assigned; done
-for i in $1/fast5_files/* ; do mkdir -p $1/`basename $i`/alignment; mv $1/alignment/*`basename $i`*.bam $1/`basename $i`/alignment; done
-rmdir $1/alignment $1/assigned $1/counts $1/fastq_files; for i in $1/fast5_files/*; do rmdir $i; done; rmdir $1/fast5_files
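The net effect, assuming deeplexicon barcode subfolders under fast5_files/, is to pivot the per-type layout (fast5_files/bc_1, fastq_files/*bc_1.fq.gz, counts/, assigned/, alignment/) into a per-barcode layout (bc_1/fast5_files, bc_1/fastq_files, bc_1/counts, bc_1/assigned, bc_1/alignment), after which the now-empty per-type folders are removed.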
-
-
diff --git a/NanoPreprocessSimple/config.yaml b/NanoPreprocessSimple/config.yaml
deleted file mode 120000
index 7adc0cc..0000000
--- a/NanoPreprocessSimple/config.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../NanoPreprocess/config.yaml
\ No newline at end of file
diff --git a/NanoPreprocessSimple/deeplexicon b/NanoPreprocessSimple/deeplexicon
deleted file mode 120000
index aa6ad45..0000000
--- a/NanoPreprocessSimple/deeplexicon
+++ /dev/null
@@ -1 +0,0 @@
-../NanoPreprocess/deeplexicon/
\ No newline at end of file
diff --git a/NanoPreprocessSimple/nanopreprocess_s.nf b/NanoPreprocessSimple/nanopreprocess_s.nf
deleted file mode 100644
index 7cdf982..0000000
--- a/NanoPreprocessSimple/nanopreprocess_s.nf
+++ /dev/null
@@ -1,651 +0,0 @@
-#!/usr/bin/env nextflow
-
-
-/*
- * Define the pipeline parameters
- *
- */
-
-// Pipeline version
-version = '0.1'
-
-params.help = false
-params.resume = false
-
-log.info """
-
-╔╦╗┌─┐┌─┐┌┬┐┌─┐┬─┐ ┌─┐┌─┐ ╔═╗╔═╗╦═╗╔═╗╔═╗
-║║║├─┤└─┐ │ ├┤ ├┬┘ │ │├┤ ╠═╝║ ║╠╦╝║╣ ╚═╗
-╩ ╩┴ ┴└─┘ ┴ └─┘┴└─ └─┘└ ╩ ╚═╝╩╚═╚═╝╚═╝
-
-====================================================
-BIOCORE@CRG Preprocessing of Nanopore direct RNA - N F ~ version ${version}
-====================================================
-
-fast5 : ${params.fast5}
-fastq : ${params.fastq}
-reference : ${params.reference}
-annotation : ${params.annotation}
-
-ref_type : ${params.ref_type}
-seq_type : ${params.seq_type}
-
-output : ${params.output}
-qualityqc : ${params.qualityqc}
-granularity : ${params.granularity}
-
-GPU : ${params.GPU}
-demultiplexing : ${params.demultiplexing}
-demultiplexing_opt : ${params.demultiplexing_opt}
-
-filter : ${params.filter}
-filter_opt : ${params.filter_opt}
-mapper : ${params.mapper}
-mapper_opt : ${params.mapper_opt}
-map_type : ${params.map_type}
-
-counter : ${params.counter}
-counter_opt : ${params.counter_opt}
-
-downsampling : ${params.downsampling}
-
-variant_caller : ${params.variant_caller}
-variant_opt : ${params.variant_opt}
-
-email : ${params.email}
-"""
-
-// Help and avoiding typos
-if (params.help) exit 1
-if (params.resume) exit 1, "Are you making the classical --resume typo? Be careful!!!! ;)"
-if (params.granularity == "") params.granularity = 1000000000
-
-// check multi5 and GPU usage. GPU maybe can be removed as param if there is a way to detect it
-if (params.GPU != "ON" && params.GPU != "OFF") exit 1, "Please specify ON or OFF in GPU processors are available"
-
-if (params.map_type != "unspliced" && params.map_type != "spliced") exit 1, "Mapping type NOT supported! Please choose either 'spliced' or 'unspliced'"
-
-// check input files
-reference = file(params.reference)
-if( !reference.exists() ) exit 1, "Missing reference file: ${reference}!"
-config_report = file("$baseDir/config.yaml")
-if( !config_report.exists() ) exit 1, "Missing config.yaml file!"
-logo = file("$baseDir/../docs/logo_small.png")
-deeplexicon_folder = file("$baseDir/deeplexicon/")
-
-
-demultiplexer = params.demultiplexing
-demultiplexer_opt = params.demultiplexing_opt
-mapper = params.mapper
-mapper_opt = params.mapper_opt
-counter_opt = params.counter_opt
-
-// Output folders
-
-outputFastq = "fastq_files"
-outputFast5 = "fast5_files"
-outputQual = "QC_files"
-outputMultiQC = "report"
-outputMapping = "alignment"
-outputCRAM = "cram_files"
-outputCounts = "counts"
-outputVars = "variants"
-outputAssigned = "assigned"
-
-/*
-* move old multiQCreport
-*/
-outputReport = file("${outputMultiQC}/multiqc_report.html")
-
-if( outputReport.exists() ) {
- log.info "Moving old report to multiqc_report.html multiqc_report.html.old"
- outputReport.moveTo("${outputMultiQC}/multiqc_report.html.old")
-}
-
-
-/*
- * Creates the channels that emits fast5 files
- */
-if(params.fast5 == "") {
- Channel
- .empty()
- .into {fast5_4_testing; fast5_4_granularity}
-} else {
- Channel
- .fromPath(params.fast5)
- .ifEmpty(){ error "Cannot find any file matching: ${params.fast5}" }
- .into {fast5_4_testing; fast5_4_granularity}
-}
-/*
- * Creates the channels that emits fastq files
- */
-Channel
- .fromFilePairs( params.fastq, size: 1)
- .ifEmpty(){ error "Cannot find any file matching: ${params.fastq}" }
- .set {fastq_files_for_demultiplexing}
-
-
-/*
- * Get the name from the folder
-if (params.fast5 != "") {
- folder_info = params.fast5.tokenize("/")
- folder_name = folder_info[-3]
-} else {
- folder_info = params.fastq.tokenize("/")
- folder_name = folder_info[-2]
-}
- */
-
-/*
-* This is default value in case guppy will be used for RNA demultiplexing
-*/
-if (demultiplexer == "") {
- demultiplexer = "OFF"
-}
-
-if (demultiplexer != "OFF" && demultiplexer != "deeplexicon")
-exit 1, "Demultiplexing of RNA can be performed only with deeplexicon. Current value is ${demultiplexer}"
-
-if (params.ref_type == "genome") {
- if (params.annotation != "") {
- annotation = file(params.annotation)
- if( !annotation.exists() ) exit 1, "Missing annotation file: ${params.annotation}!"
- }
-}
-
-
-process testInput {
- tag {"${fast5}"}
-
- input:
- file(fast5) from fast5_4_testing.first()
-
- output:
- stdout into multi5_type
-
- script:
- """
- fast5_type.py ${fast5}
- """
-}
-
-multi5_type.map { it.trim().toInteger() }.into{multi5_type_for_msg; multi5_type_for_bc; multi5_type_for_granularity; multi5_type_for_demultiplexing}
-multi5_type_for_msg.map{it == 0 ? "Single Fast5 files detected!": "MultiFast5 files detected!" }.println()
-
-// if you are using GPU analyse the whole dataset, otherwise make batch of 4,000 sequences if they are single fast5
-// or single batches of multi fast5 sequences
-multi5_type_for_granularity.merge(fast5_4_granularity.collect()).map{
- (params.GPU == "YES" ? params.granularity : (it[0] == 0 ? it[1..-1].collate(4000) : it[1..-1].collate(1)) )
-}.flatMap().set{fast5_batches}
-
-// create a map id batch -> list of files
-def num_batch = -1
-fast5_batches.map {
- num_batch++
- [num_batch, it]
-}.set{fast5_4_demulti}
-
-/*
-* Perform demultiplexing (optional) using deeplexicon on basecalled reads
-*/
-if(demultiplexer == "deeplexicon") {
- process demultiplexing_with_deeplexicon {
- label 'demulti'
- tag {"${demultiplexer}-${idfile}"}
-
- input:
- set idfile, file(fast5) from fast5_4_demulti
- val (multi5) from multi5_type_for_bc
- file(deeplexicon_folder)
-
- output:
- //set idfile, file ("${idfile}_demux.tsv") into demux_for_fastq_extraction
- //file ("${idfile}_demux.tsv") into demux_for_fast5_extraction
- file ("${idfile}_demux.tsv") into demux_for_collect
-
- script:
- def model = ''
- def executable = 'deeplexicon.py -f multi'
- if (multi5 == 0){
- executable = 'deeplexicon.py -f single'
- }
- if (demultiplexer_opt.contains("pAmps-final-actrun_newdata_nanopore_UResNet20v2_model.030.h5")){
- executable = 'cmd_line_deeplexicon_caller_2019_09_12.py'
- }
- """
- ln -s ${deeplexicon_folder}/* .
- ${executable} -p ./ ${demultiplexer_opt} -b 4000 -v > ${idfile}_demux.tsv
- """
- }
-
- process collecting_deeplexicon {
-
- input:
- file("demux_*") from demux_for_collect.collect()
-
- output:
- file ("all_demux.txt") into demux_for_fastq_extraction
-
- script:
- """
- awk '{if (NR==1) {print \$0} else if (\$0 !~ /Confidence/) {print \$0}}' demux_* > all_demux.txt
- """
-
- }
-
- process extracting_demultiplexed_fastq {
- label 'basecall_cpus'
- tag {"${idfile}"}
-
- input:
- file(demux) from demux_for_fastq_extraction
- set idfile, file(fastq) from fastq_files_for_demultiplexing
-
- output:
- set idfile, file("*.fastq.gz") into fastq_for_filtering
-
- script:
- """
- extract_sequence_from_fastq.py ${demux} ${fastq}
- for i in *.fastq; do gzip \$i; done
- """
- }
-} else {
- fastq_files_for_demultiplexing.set{ fastq_for_filtering}
-}
-
-
-/*
-* Perform filtering (optional) using nanofilt on fastq files
-*/
-if (params.filter == "nanofilt") {
- process filtering {
- label 'big_cpus'
- tag {"${params.filter}-${fastq_file}".replace('.fastq.gz', '')}
-
- input:
- set idfile, file(fastq_file) from fastq_for_filtering.transpose()
-
- output:
- set idfile, file("*-filt.fastq.gz") into fastq_for_next_step
-
- script:
- output = "${fastq_file}".replace(".fastq.gz", "-filt.fastq.gz")
- """
- zcat ${fastq_file} | NanoFilt ${params.filter_opt} | gzip > ${output}
- """
- }
-} else {
- fastq_for_filtering.transpose().set{ fastq_for_next_step}
-}
-
-
-// check this
-fastq_for_next_step.map{
- filepath=it[1]
- file_parts = "${filepath}".tokenize("/")
- def folder_name = file_parts[-2]
- if (demultiplexer != "OFF") {
- fileparts = filepath.getName().tokenize(".")
- ["${folder_name}.${fileparts[-3]}", filepath]
- } else {
- ["${folder_name}", filepath]
- }
-}.groupTuple().into{fastq_files_for_fastqc; fastq_files_for_mapping}
-
-
-/*
-* Perform fastQC on fastq files
-*/
-
-process fastQC {
- tag {idfile}
- label 'big_cpus'
-
- publishDir "${params.output}/${idfile}/${outputQual}/", pattern: "*_fastqc.html", mode: 'copy'
-
- input:
- set idfile, file(fastq_file) from fastq_files_for_fastqc
-
- output:
- file ("*_fastqc.*") into fastqc_for_multiqc
-
- script:
- """
- fastqc ${fastq_file} -t ${task.cpus}
- """
-}
-
-process mapping {
- tag {"${mapper}-${idfile}"}
- publishDir "${params.output}/${idfile}/${outputMapping}/", pattern: "*.${mapper}.sorted.bam*", mode: 'copy'
- label 'big_mem_cpus'
-
- input:
- file(reference)
- set idfile, file (fastq_file) from fastq_files_for_mapping
-
- output:
- set idfile, file("${idfile}.${mapper}.sorted.bam") optional true into aligned_reads, aligned_reads_for_QC, aligned_reads_for_QC2, aligned_reads_for_counts
- set idfile, mapper, file("${idfile}.${mapper}.sorted.bam"), file("${idfile}.${mapper}.sorted.bam.bai") optional true into aligned_reads_for_crams
- set idfile, file("${idfile}.${mapper}.sorted.bam"), file("${idfile}.${mapper}.sorted.bam.bai") optional true into aligned_reads_for_vars
- file("${idfile}.${mapper}.sorted.bam*") optional true
-
- script:
- if (mapper == "minimap2") {
- def mappars = (params.map_type == "spliced") ? "-ax splice -k14" : "-ax map-ont"
- mappars += " ${mapper_opt} "
- """
- minimap2 -t ${task.cpus} ${mappars} -uf ${reference} ${fastq_file} > reads.mapped.sam
- samtools view -@ ${task.cpus} -F4 -hSb reads.mapped.sam > reads.mapped.bam
- rm reads.mapped.sam
- samtools sort -@ ${task.cpus} -o ${idfile}.${mapper}.sorted.bam reads.mapped.bam
- samtools index -@ ${task.cpus} ${idfile}.${mapper}.sorted.bam
- rm reads.mapped.bam
- """
- }
- else if (mapper == "graphmap2"){
- def mappars = (params.map_type == "spliced") ? "-x rnaseq" : ""
- mappars += " ${mapper_opt} "
- """
- graphmap2 align -t ${task.cpus} -r ${reference} ${mappars} -d ${fastq_file} | samtools view -@ ${task.cpus} -F4 -hSb - > reads.mapped.bam
- samtools sort -@ ${task.cpus} -o ${idfile}.${mapper}.sorted.bam reads.mapped.bam
- samtools index -@ ${task.cpus} ${idfile}.${mapper}.sorted.bam
- rm reads.mapped.bam
- """
- }
- else if (mapper == "graphmap"){
- """
- graphmap align -t ${task.cpus} ${mapper_opt} -r ${reference} -d ${fastq_file} | samtools view -@ ${task.cpus} -F4 -hSb - > reads.mapped.bam
- samtools sort -@ ${task.cpus} -o ${idfile}.${mapper}.sorted.bam reads.mapped.bam
- samtools index -@ ${task.cpus} ${idfile}.${mapper}.sorted.bam
- rm reads.mapped.bam
- """
- }
- else {
- """
- echo "nothing to do!"
- """
- }
-}
-
-/*
-* Perform mapping and sorting
-*/
-process cram_conversion {
- tag {"${mapper}-${idfile}"}
- publishDir "${params.output}/${idfile}/${outputCRAM}/", mode: 'copy'
- label 'big_mem_cpus'
-
- input:
- file(reference)
- set idfile, val(mapper), file(aln), file(index) from aligned_reads_for_crams
-
- output:
- file("${idfile}.${mapper}.sorted.cram*") optional true
- script:
- def downcmd = ""
- def input = aln
- def cleancmd = ""
- gzipcmd = unzipCmd(reference, "myreference.fasta")
- gzipclean = "rm myreference.fasta"
- if (params.downsampling != "") {
- def perc = params.downsampling/100
- downcmd = "samtools view -@ ${task.cpus} -bs ${perc} ${aln} > subsample.bam"
- input = "subsample.bam"
- cleancmd = "rm subsample.bam"
- }
- """
- ${downcmd}
- ${gzipcmd}
- samtools view -@ ${task.cpus} -C ${input} -T myreference.fasta -o ${idfile}.${mapper}.sorted.cram
- samtools index -@ ${task.cpus} ${idfile}.${mapper}.sorted.cram
- ${cleancmd}
- ${gzipclean}
- """
-}
-
-/*
-* Perform counting (optional)
-*/
-
-if ( params.counter == "YES") {
- process counting {
- tag {"${idfile}"}
- publishDir "${params.output}/${idfile}/${outputCounts}/", pattern: "*.count", mode: 'copy'
- publishDir "${params.output}/${idfile}/${outputAssigned}/", pattern: "*.assigned", mode: 'copy'
-
- input:
- set idfile, file(bamfile) from aligned_reads_for_counts
-
- output:
- file("${idfile}.count") into read_counts
- file("${idfile}.stats") optional true into count_stats
- file("${idfile}.assigned") optional true
- script:
- if (params.ref_type == "transcriptome") {
- """
- NanoCount -i ${bamfile} -o ${idfile}.count ${counter_opt};
- awk '{sum+=\$3}END{print FILENAME"\t"sum}' ${idfile}.count |sed s@.count@@g > ${idfile}.stats
- samtools view -F 256 ${bamfile} |cut -f 1,3 > ${idfile}.assigned
- """
- } else if (params.ref_type == "genome") {
- def anno = unzipBash("${params.annotation}")
- """
- samtools view ${bamfile} |htseq-count ${counter_opt} -f sam - ${anno} -o ${idfile}.sam > ${idfile}.count
- awk '{gsub(/XF:Z:/,"",\$NF); print \$1"\t"\$NF}' ${idfile}.sam |grep -v '__' > ${idfile}.assigned
- rm ${idfile}.sam
- """
- }
- }
-
- /*
- * Join alnQC
- */
- process joinCountQCs {
-
- input:
- file "*" from count_stats.collect()
-
- output:
- file("counts_mqc.txt") into count_repo_for_multiQC
-
- script:
- """
- echo '# id: NanoCount
- # plot_type: \'table\'
- # section_name: Read counts
- File name \'Counts\' ' > counts_mqc.txt
- cat *.stats >> counts_mqc.txt
- """
- }
-} else {
- read_counts = Channel.empty()
- count_repo_for_multiQC = Channel.empty()
-}
-
-
-/*
-* Perform alnQC
-*/
-process alnQC {
- tag {bamid}
-
- input:
- set bamid, file(bamfile) from aligned_reads_for_QC
-
- output:
- file "${bamid}.stat" into single_alnQC_outs
-
- script:
- """
- bam2stats.py ${bamfile} > ${bamid}.stat
- """
-}
-
-/*
-* Join alnQC
-*/
-process joinAlnQCs {
-
- input:
- file "alnqc_*" from single_alnQC_outs.collect()
-
- output:
- file("alnQC_mqc.txt") into alnQC_for_multiQC
-
- script:
- """
- echo '# id: alnQC
-# plot_type: \'table\'
-# section_name: \'Alignment QC\' ' > alnQC_mqc.txt
- cat alnqc_* | head -n 1| sed s@#@@g >> alnQC_mqc.txt
- cat alnqc_* | grep -v "#" >> alnQC_mqc.txt
- """
-}
-
-/*
-* Perform alnQC2
-*/
-
-process alnQC2 {
- publishDir "${params.output}/${bamid}/${outputQual}/", pattern: "*_plot/*", mode: 'copy'
- label 'big_cpus'
- errorStrategy 'ignore'
- tag {bamid}
-
- input:
- set bamid, file(bamfile) from aligned_reads_for_QC2
-
- output:
- file("*_plot/*") optional true
- file("${bamid}_stats_mqc.png") optional true into qc2_for_multiqc
-
- script:
- """
- NanoPlot --bam ${bamfile} -o ${bamid}_plot --maxlength 5000 -t ${task.cpus}
- mkdir tmp_dir
- cp ${bamid}_plot/PercentIdentityvsAverageBaseQuality_kde.png tmp_dir
- cp ${bamid}_plot/LengthvsQualityScatterPlot_dot.png tmp_dir
- cp ${bamid}_plot/HistogramReadlength.png tmp_dir
- cp ${bamid}_plot/Weighted_HistogramReadlength.png tmp_dir
- gm montage tmp_dir/*.png -tile 2x2 -geometry 800x800 ${bamid}_stats_mqc.png
- rm -fr tmp_dir
- """
-}
-
-fastqc_for_multiqc.mix(qc2_for_multiqc,read_counts,count_repo_for_multiQC,alnQC_for_multiQC).set{files_for_report}
-
-/*
-* Perform viral variant call (experimental)
-*/
-
-if ( params.variant_caller == "YES" && params.seq_type != "RNA") {
-
- process variant_calling {
- tag {"${idfile}"}
- publishDir "${params.output}/${idfile}/${outputVars}/", pattern: "*.vcf", mode: 'copy'
- label 'big_cpus'
- errorStrategy 'ignore'
-
- input:
- set idfile, file(bamfile), file(bai) from aligned_reads_for_vars
- file(reference)
-
- output:
- file("*.vcf")
-
- script:
- gzipcmd = unzipCmd(reference, "myreference.fasta", "yes")
- gzipclean = "rm myreference.fasta"
- """
- ${gzipcmd}
- medaka consensus ${bamfile} outputs.hdf
- medaka variant myreference.fasta outputs.hdf ${idfile}.vcf
- medaka tools annotate ${idfile}.vcf myreference.fasta ${bamfile} ${idfile}.annot.vcf
-
- #medaka_variant ${params.variant_opt} -i ${bamfile} -f myreference.fasta -d -t ${task.cpus} -o ./out
- #mv `ls -t out/round_*.vcf| head -n1 ` .
- ${gzipclean}
- """
- }
- }
-
-
-// build the shell command that materialises the (possibly gzipped) file under a new name
-def unzipCmd(filename, unzippedname, copy="") {
- def cmd = "ln -s ${filename} ${unzippedname}"
- if (copy!="") {
- cmd = "cp ${filename} ${unzippedname}"
- }
- def ext = filename.getExtension()
- if (ext == "gz") {
- cmd = "zcat ${filename} > ${unzippedname}"
- }
- return cmd
-}
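For illustration, what `unzipCmd` returns for a few hypothetical inputs inside a Nextflow script (paths abbreviated; note the gz branch overrides the copy flag):

    println unzipCmd(file('ref.fa'),    'myreference.fasta')        // ln -s ref.fa myreference.fasta
    println unzipCmd(file('ref.fa'),    'myreference.fasta', 'yes') // cp ref.fa myreference.fasta
    println unzipCmd(file('ref.fa.gz'), 'myreference.fasta')        // zcat ref.fa.gz > myreference.fasta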
-
-// wrap a gzipped file in a bash process substitution (a named pipe)
-def unzipBash(filename) {
- def cmd = filename.toString()
- if (cmd[-3..-1] == ".gz") {
- cmd = "<(zcat ${filename})"
- }
- return cmd
-}
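`unzipBash`, by contrast, writes nothing to disk: for a gzipped input it returns a bash process substitution, so a tool that expects a file argument (the htseq-count annotation above) reads the decompressed stream directly. For example:

    println unzipBash('anno.gtf')      // anno.gtf
    println unzipBash('anno.gtf.gz')   // <(zcat anno.gtf.gz)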
-
-/*
-* Perform multiQC report
-*/
-process multiQC {
- publishDir "${params.output}/${outputMultiQC}/", mode: 'copy'
-
- input:
- file(logo)
- file(config_report)
- file("*") from files_for_report.collect()
-
- output:
- file("multiqc_report.html") into multiQC
-
- script:
- """
- multiqc -c ${config_report} .
- """
-}
-
-
-if (params.email == "yourmail@yourdomain" || params.email == "") {
- log.info 'Skipping the email\n'
-}
-else {
- log.info "Sending the email to ${params.email}\n"
-
- workflow.onComplete {
-
- def msg = """\
- Pipeline execution summary
- ---------------------------
- Completed at: ${workflow.complete}
- Duration : ${workflow.duration}
- Success : ${workflow.success}
- workDir : ${workflow.workDir}
- exit status : ${workflow.exitStatus}
- Error report: ${workflow.errorReport ?: '-'}
- """
- .stripIndent()
-
- sendMail(to: params.email, subject: "Master of Pores execution", body: msg, attach: "${outputMultiQC}/multiqc_report.html")
- }
-}
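`sendMail` needs a mail transport: Nextflow falls back to the local `sendmail` command if nothing is configured, otherwise the SMTP details go in the `mail` scope of `nextflow.config`. A minimal sketch with hypothetical credentials:

    mail {
        smtp.host     = 'smtp.example.com'   // hypothetical server
        smtp.port     = 587
        smtp.user     = 'user@example.com'
        smtp.password = 'secret'
    }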
-
-workflow.onComplete {
- println "Pipeline BIOCORE@CRG Master of Pores completed!"
- println "Started at $workflow.start"
- println "Finished at $workflow.complete"
- println "Time elapsed: $workflow.duration"
- println "Execution status: ${ workflow.success ? 'OK' : 'failed' }"
-}
-
-
-
diff --git a/NanoPreprocessSimple/nextflow.config b/NanoPreprocessSimple/nextflow.config
deleted file mode 120000
index 227f6b9..0000000
--- a/NanoPreprocessSimple/nextflow.config
+++ /dev/null
@@ -1 +0,0 @@
-../NanoPreprocess/nextflow.config
\ No newline at end of file
diff --git a/NanoPreprocessSimple/params.config b/NanoPreprocessSimple/params.config
deleted file mode 100644
index 8016d1a..0000000
--- a/NanoPreprocessSimple/params.config
+++ /dev/null
@@ -1,33 +0,0 @@
-params {
- fast5 = "$baseDir/../NanoPreprocess/multifast/fast5_files/*.fast5"
- fastq = "$baseDir/../NanoPreprocess/multifast/fastq_files/*.fq.gz"
- reference = "$baseDir/../anno/curlcake_constructs.fasta.gz"
- annotation = ""
- ref_type = "transcriptome"
-
- seq_type = "cDNA"
- output = "$baseDir/output"
- qualityqc = 5
- granularity = ""
-
- GPU = "OFF"
- demultiplexing = "deeplexicon"
- demultiplexing_opt = "-s 0.9 -m resnet20-final.h5"
- demulti_fast5 = "OFF"
-
- filter = ""
- filter_opt = ""
-
- mapper = "minimap2"
- mapper_opt = "-uf -k14"
- map_type = "unspliced"
-
- variant_caller = "YES"
- variant_opt = ""
-
- downsampling = ""
-
- counter = "YES"
- counter_opt = ""
- email = ""
-}
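Any key in this `params` block can be overridden at launch time with a matching double-dash flag, so the defaults above are only a starting point. For example (hypothetical script name and values):

    nextflow run nanopreprocess_simple.nf -with-docker \
        --fastq "/data/run1/*.fq.gz" --counter "NO" --mapper_opt "-ax splice -k14"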
diff --git a/NanoTail/bin/join.r b/NanoTail/bin/join.r
deleted file mode 100644
index 38809e6..0000000
--- a/NanoTail/bin/join.r
+++ /dev/null
@@ -1,43 +0,0 @@
-# R --slave --args findr nanopol assigned prefix < join.r
-
-args<-commandArgs(TRUE)
-findr <- args[1]
-nanopol <- args[2]
-assigned <- args[3]
-prefix <- args[4]
-
-a<-read.table(findr, sep="\t", header=FALSE)
-b<-read.table(nanopol, sep="\t", header=FALSE)
-g<-read.table(assigned, sep="\t", header=FALSE)
-
-c<-merge(a,b, all.x=T, by.x="V1", by.y="V1", sort=FALSE)
-d<-merge(c,g, all.x=T, by.x="V1", by.y="V1", sort=FALSE)
-
-colnames(d)<-c("Read name", "Tailfindr", "Nanopolish", "Gene Name")
-
-e<-na.omit(d)
-e[,2]<-log(e[,2]+1,base=2)
-e[,3]<-log(e[,3]+1,base=2)
-
-write.table(d, file = paste0(prefix, "_joined.txt"), sep="\t", row.names = FALSE)
-
-
-x<-e[,2]
-y<-e[,3]
-
-data.lm<-lm(y ~ x)
-rsquared<-round(summary(data.lm)$r.squared, 2)
-
-rsquare=bquote(R^2 == .(rsquared))
-
-subtitle=expression('PolyA tail length estimation log'[2]*'(n+1)')
-
-png(paste0(prefix, "_corr.png"), width = 1600, height = 1200)
-op <- par(mar=c(8, 8, 8, 8) + 0.1)
-plot(x, y, xlab="TailfindR", ylab="Nanopolish", ylim=c(0,10), xlim=c(0,10), cex=.2, cex.lab=2, cex.sub=2, cex.main=3, cex.axis=2, pch=19, col = rgb(red = 0, green = 0, blue = 0, alpha = 0.5), frame=FALSE, main=prefix)
-mtext(side=3, line=0, at=6.5, adj=1, cex=2, subtitle)
-mtext(side=3, line=-65, at=10, adj=1, cex=2, rsquare)
-
-par(op)
-abline(data.lm, col = "blue")
-dev.off()
diff --git a/NanoTail/config.yaml b/NanoTail/config.yaml
deleted file mode 100644
index faa50a8..0000000
--- a/NanoTail/config.yaml
+++ /dev/null
@@ -1,26 +0,0 @@
-title: "Master of Pores"
-subtitle: ""
-intro_text: False
-
-report_header_info:
- - Application Type: 'Nanopore sequencing'
-
-custom_logo: 'logo_small.png'
-custom_logo_url: 'https://github.com/biocorecrg/nanopore_analysis'
-custom_logo_title: 'Master of Pores'
-
-extra_fn_clean_trim:
- - '_QC'
-
-table_columns_visible:
- FastQC:
- percent_duplicates: True
-
-module_order:
- - fastqc:
- name: 'FastQC'
- info: 'This section of the report shows FastQC results on raw reads.'
-
- - alnQC:
- name: "alnQC"
- info: 'This section of the report shows alnQC results on aligned reads'
diff --git a/NanoTail/nanotail.nf b/NanoTail/nanotail.nf
deleted file mode 100644
index de71c85..0000000
--- a/NanoTail/nanotail.nf
+++ /dev/null
@@ -1,307 +0,0 @@
-#!/usr/bin/env nextflow
-
-
-/*
- * Define the pipeline parameters
- *
- */
-
-// Pipeline version
-version = '0.1'
-
-params.help = false
-params.resume = false
-
-log.info """
-
-╔╦╗┌─┐┌─┐┌┬┐┌─┐┬─┐ ┌─┐┌─┐ ╔═╗╔═╗╦═╗╔═╗╔═╗
-║║║├─┤└─┐ │ ├┤ ├┬┘ │ │├┤ ╠═╝║ ║╠╦╝║╣ ╚═╗
-╩ ╩┴ ┴└─┘ ┴ └─┘┴└─ └─┘└ ╩ ╚═╝╩╚═╚═╝╚═╝
-
-====================================================
-BIOCORE@CRG NanoTail. Detection of polyA length (RNA) - N F ~ version ${version}
-====================================================
-
-***************** Input files *********************
-input_folders : ${params.input_folders}
-
-******* reference has to be the genome **********
-reference : ${params.reference}
-output : ${params.output}
-
-********* nanopolish and tailfindr cmd options **********
-nanopolish_opt : ${params.nanopolish_opt}
-tailfindr_opt : ${params.tailfindr_opt}
-
-email : ${params.email}
-
-"""
-
-// Help and avoiding typos
-if (params.help) exit 1
-if (params.resume) exit 1, "Are you making the classic --resume typo? Be careful!!! ;)"
-
-// check input files
-reference = file(params.reference)
-if( !reference.exists() ) exit 1, "Missing reference file: ${reference}!"
-config_report = file("$baseDir/config.yaml")
-if( !config_report.exists() ) exit 1, "Missing config.yaml file!"
-logo = file("$baseDir/../docs/logo_small.png")
-joinScript = file("$baseDir/bin/join.r")
-
-
-nanopolish_opt = params.nanopolish_opt
-tailfindr_opt = params.tailfindr_opt
-
-// Output folders
-outputTailfindr = "${params.output}/Tailfindr"
-outputNanoPolish = "${params.output}/NanoPolish"
-outputFinal = "${params.output}/PolyA_final"
-
-/*
- * Creates the channels that emits input data
- */
-Channel
- .fromFilePairs( params.input_folders, type: 'dir', size: 1)
- .ifEmpty { error "Cannot find any folder matching: ${params.input_folders}" }
- .into{folders_for_fastq; folders_for_analysis; folders_for_genes; folders_for_bam_filtering}
-
-folders_for_analysis.map {
- [ it[0], file("${it[1][0]}/fast5_files/*.fast5") ]
- }.transpose().into{data_for_tailfindr; data_for_nanopolish_raw}
-
-folders_for_bam_filtering.map {
- [ it[0], file("${it[1][0]}/alignment/*.bam") ]
- }.set{data_for_bam_filtering}
-
-folders_for_fastq.map {
- [ it[0], file("${it[1][0]}/fastq_files/*") ]
- }.set{fastq_data}
-
-folders_for_genes.map {
- [ it[0], file("${it[1][0]}/assigned/*") ]
- }.set{genes_for_final}
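`fromFilePairs` with `type: 'dir', size: 1` emits one `[sampleID, [sample_dir]]` tuple per matched folder; each `map` above then globs one subfolder of the NanoPreprocess output, and `transpose()` fans a fast5 list out to one tuple per file. A runnable sketch of the fast5 branch (same output layout assumed):

    Channel
        .fromFilePairs( params.input_folders, type: 'dir', size: 1 )
        .map { [ it[0], file("${it[1][0]}/fast5_files/*.fast5") ] }
        .transpose()
        .view()   // one [sampleID, fast5] tuple per file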
-
-
-/*
-* Check reference file
-*/
-process check_reference {
-
- input:
- file(reference)
-
- output:
- file("reference_sequences.fa") into checked_ref
-
- script:
- """
- if [ `echo ${reference} | grep ".gz"` ]; then
- zcat ${reference} > reference_sequences.fa
- else ln -s ${reference} reference_sequences.fa
- fi
- """
-}
-
-/*
-* Estimate polyA tail size with tailfindr
-*/
-
-process tailfindr {
- tag "${sampleID}-${fast5.simpleName}"
- label 'big_mem_cpus_time'
-
- input:
- set val(sampleID), file(fast5) from data_for_tailfindr
-
- output:
- set val(sampleID), file("*_findr.csv") into tailfindr_res
-
- script:
- def fast5_index=fast5.getSimpleName()
- """
- R --slave -e "library(tailfindr); find_tails(fast5_dir = './' , save_dir = './', ${params.tailfindr_opt}, csv_filename = \'${sampleID}-${fast5_index}_findr.csv\', num_cores = ${task.cpus})"
- """
-}
-
-/*
-* Filter bams
-*/
-process filter_bam {
- tag { sampleID }
- label 'big_mem_cpus'
-
- input:
- file(reference)
- set val(sampleID), file(alignment) from data_for_bam_filtering
-
- output:
- set val(sampleID), file("${sampleID}_filt.bam"), file("${sampleID}_filt.bam.bai") into filt_bam_for_nanopolish, filt_bam_for_intersect
-
- script:
- """
- # keep only mapped reads and drop secondary alignments (SAM flags 4 + 256 = 260)
- samtools view -@ ${task.cpus} -bF 260 ${alignment} > ${sampleID}_filt.bam
- samtools index ${sampleID}_filt.bam
- """
-}
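The `-F 260` filter drops reads carrying SAM flag bits 4 (unmapped) or 256 (secondary alignment) and keeps everything else, including supplementary records (2048). A two-line Groovy sanity check of the arithmetic:

    int UNMAPPED = 0x4, SECONDARY = 0x100
    assert (UNMAPPED | SECONDARY) == 260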
-
-
-data_for_nanopolish_raw.combine(filt_bam_for_nanopolish, by:0).combine(fastq_data, by:0)
-.set{data_for_nanopolish}
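`combine(..., by: 0)` keys both channels on the sample ID, pairing every per-fast5 tuple with that sample's single filtered BAM and then its fastq. A minimal sketch with hypothetical values:

    fast5s = Channel.from( ['s1', 'a.fast5'], ['s1', 'b.fast5'] )
    bams   = Channel.from( ['s1', 's1_filt.bam'] )
    fast5s.combine(bams, by: 0).view()
    // [s1, a.fast5, s1_filt.bam]
    // [s1, b.fast5, s1_filt.bam]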
-
-
-/*
-* Estimate polyA tail size with nanopolish
-*/
-process tail_nanopolish {
- tag "${sampleID}-${fast5.simpleName}"
- label 'big_mem_cpus_time_np'
-
- input:
- file(checked_ref)
- set val(sampleID), file(fast5), file(alignment), file(alnindex), file(fastq) from data_for_nanopolish
-
- output:
- set val(sampleID), file("*.polya.estimation.tsv") into nanopol_res
-
- script:
- def fast5_index=fast5.getSimpleName()
- """
- #index reads
- nanopolish index -d ./ ${fastq}
- # polya length estimation
- nanopolish polya -r ${fastq} ${params.nanopolish_opt} -g ${checked_ref} -t ${task.cpus} -b ${alignment} > ${sampleID}-${fast5_index}.polya.estimation.tsv
- """
-}
-
-process collect_nanopolish_results {
- publishDir outputNanoPolish, pattern: "*.polya.estimation.tsv", mode: 'copy'
- tag { sampleID }
-
- input:
- set val(sampleID), file("nanopol_*") from nanopol_res.groupTuple()
-
- output:
- set val(sampleID), file("${sampleID}.nanopol.len") into nanopol_len
- file ("*.polya.estimation.tsv")
-
- script:
- """
- cat nanopol_* | awk '!(NR>1 && /leader_start/)' | grep -v "READ_FAILED_LOAD" >> ${sampleID}.polya.estimation.tsv
- awk -F"\t" '{if (\$10=="PASS") print \$1"\t"\$9}' ${sampleID}.polya.estimation.tsv > ${sampleID}.nanopol.len
- """
-
-}
-
-process collect_tailfindr_results {
- publishDir outputTailfindr, pattern: "*_findr.csv", mode: 'copy'
- tag { sampleID }
-
- input:
- set val(sampleID), file("tailfin_*") from tailfindr_res.groupTuple()
-
- output:
- set val(sampleID), file("${sampleID}.findr.len") into tailfindr_len
- file ("*_findr.csv")
-
- script:
- """
- cat tailfin_* | awk '!(NR>1 && /tail_start/)' >> ${sampleID}_findr.csv
- awk -F"," '{if (\$5!="" && \$1!="read_id") print \$1"\t"\$5}' ${sampleID}_findr.csv > ${sampleID}.findr.len
- """
-
-
-}
-
-
-
-
-/*
-* Join the tailfindr and nanopolish per-read estimates and plot their correlation
-*/
-process join_results {
- container "biocorecrg/mopnanotail:0.2"
-
- publishDir outputFinal, mode: 'copy'
- tag { sampleID }
-
- input:
- set val(sampleID), file(nanopol), file(tailfindr), file(genes) from nanopol_len.join(tailfindr_len).join(genes_for_final)
- file(joinScript)
-
- output:
- file("${sampleID}_*")
-
- script:
- """
- Rscript --vanilla ${joinScript} ${tailfindr} ${nanopol} ${genes} ${sampleID}
- """
-
-}
-
-
-
-
-
-/*
-* functions
-*/
-// Get the name from the folder
-def getFolderName(sample) {
- folder_info = sample.toString().tokenize("/")
- return folder_info[-2]
-}
-
-// make named pipe
-def unzipBash(filename) {
- def cmd = filename.toString()
- if (cmd[-3..-1] == ".gz") {
- cmd = "<(zcat ${filename})"
- }
- return cmd
-}
-
-
-
-/*
-* Finish message
-*/
-workflow.onComplete {
- println "Pipeline BIOCORE@CRG Master of Pores completed!"
- println "Started at $workflow.start"
- println "Finished at $workflow.complete"
- println "Time elapsed: $workflow.duration"
- println "Execution status: ${ workflow.success ? 'OK' : 'failed' }"
-}
-
-/*
-* Mail notification
-*/
-
-if (params.email == "yourmail@yourdomain" || params.email == "") {
- log.info 'Skipping the email\n'
-}
-else {
- log.info "Sending the email to ${params.email}\n"
-
- workflow.onComplete {
-
- def msg = """\
- NanoTail module's execution summary
- ---------------------------
- Completed at: ${workflow.complete}
- Duration : ${workflow.duration}
- Success : ${workflow.success}
- workDir : ${workflow.workDir}
- exit status : ${workflow.exitStatus}
- Error report: ${workflow.errorReport ?: '-'}
- """
- .stripIndent()
-
- sendMail(to: params.email, subject: "Master of Pores execution", body: msg)
- }
-}
-
diff --git a/NanoTail/nextflow.config b/NanoTail/nextflow.config
deleted file mode 100644
index 210b3e6..0000000
--- a/NanoTail/nextflow.config
+++ /dev/null
@@ -1,68 +0,0 @@
-env {
- PYTHONNOUSERSITE = 1
-}
-
-includeConfig "$baseDir/params.config"
-//includeConfig "../nextflow.global.config"
-singularity.cacheDir = "$baseDir/../singularity"
-
-singularity.autoMounts = true
-
-
-profiles {
- uge {
- executor = 'sge'
- penv = 'smp'
- process {
- container = 'biocorecrg/mopmod1:0.2'
- withLabel: big_mem_cpus {
- queue = 'long-sl7,biocore-el7,short-sl7'
- cpus = 8
- memory = '20G'
- }
-
- withLabel: big_mem_cpus_time_np {
- container = "biocorecrg/mopnanopolish:0.2"
- queue = 'biocore-el7,long-sl7'
- cpus = 8
- memory = '60G'
- time = '96h'
- }
-
- withLabel: big_mem_cpus_time {
- container = "biocorecrg/mopnanotail:0.2"
- queue = 'biocore-el7,long-sl7'
- cpus = 8
- memory = '60G'
- time = '96h'
- }
- }
- }
- slurm {
- executor = 'slurm'
- process {
- container = 'biocorecrg/mopmod1:0.2'
- withLabel: big_mem_cpus {
- cpus = 8
- memory = '20G'
- }
-
- withLabel: big_mem_cpus_time_np {
- container = "biocorecrg/mopnanopolish:0.2"
- cpus = 8
- memory = '60G'
- time = '96h'
- }
-
- withLabel: big_mem_cpus_time {
- container = "biocorecrg/mopnanotail:0.2"
- cpus = 8
- memory = '60G'
- time = '96h'
- }
-
- }
- }
-}
-
-
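The `withLabel` selectors are what tie these settings to the pipeline code: any process declaring the matching `label` inherits the container, queue and resource values, so `tailfindr` (labelled `big_mem_cpus_time`) runs in the mopnanotail image with 8 CPUs, 60 GB of memory and a 96 h limit. A minimal sketch of the mechanism, with a hypothetical process:

    process toy {
        label 'big_mem_cpus_time'   // picks up container, cpus, memory and time from the profile

        script:
        """
        echo running with ${task.cpus} cpus
        """
    }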
diff --git a/NanoTail/params.config b/NanoTail/params.config
deleted file mode 100644
index c92374a..0000000
--- a/NanoTail/params.config
+++ /dev/null
@@ -1,11 +0,0 @@
-params {
-
- input_folders = "$baseDir/../NanoPreprocess/output/RNA*"
- nanopolish_opt = ""
- tailfindr_opt = ""
-
- reference = "$baseDir/../anno2/genome.fa.gz"
- output = "$baseDir/outputRNA"
-
- email = "yourname@yourdomain"
-}
diff --git a/config.yaml b/config.yaml
deleted file mode 100644
index 541e130..0000000
--- a/config.yaml
+++ /dev/null
@@ -1,30 +0,0 @@
-title: "Master of Pores"
-subtitle: ""
-intro_text: False
-
-report_header_info:
- - Application Type: 'Nanopore sequencing'
-
-custom_logo: 'logo_small.png'
-custom_logo_url: 'https://github.com/biocorecrg/nanopore_analysis'
-custom_logo_title: 'Master of Pores'
-
-extra_fn_clean_trim:
- - '_QC'
-
-table_columns_visible:
- FastQC:
- percent_duplicates: True
-
-module_order:
- - fastqc:
- name: 'FastQC'
- info: 'This section of the report shows FastQC results on raw reads.'
-
- - alnQC:
- name: "Alignment QC"
- info: 'This section of the report shows QC results on aligned reads'
-
- - nanoplot:
- name: "Test"
- info: "info"
diff --git a/example_output/multiqc_report_example.html b/example_output/multiqc_report_example.html
deleted file mode 100644
index 3b598b4..0000000
--- a/example_output/multiqc_report_example.html
+++ /dev/null
@@ -1,6365 +0,0 @@
-[~6,300 lines of generated HTML removed: the bundled example MultiQC report ("Master of Pores: MultiQC Report", generated 2020-01-15), containing the General Statistics table, the standard FastQC sections, an Alignment QC table, an embedded NanoPlot stats montage (RNAAA023484_wt1_stats_mqc.png), and MultiQC's interactive toolbox markup]