Merge pull request #49 from tyler5huang/master

SMuRFv2.0.5
skandlab · Nov 2, 2020 · c391d5d · c391d5d
2 parents df90a1f + 5303011
commit c391d5d
Show file tree

Hide file tree

Showing 6 changed files with 113 additions and 24 deletions.
diff --git a/smurf/DESCRIPTION b/smurf/DESCRIPTION
@@ -7,4 +7,4 @@ Depends: R (>= 3.3.1)
 License: MIT
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 6.1.1.9000
+RoxygenNote: 7.1.0
diff --git a/smurf/R/CDSannotation_snv.R b/smurf/R/CDSannotation_snv.R
@@ -205,6 +205,8 @@ CDSannotation_snv = function(x, tbi, predicted, build, change.build){
           print("reading mutect2")
           # mut.mutect2=suppressWarnings(annotate.vcf(x[[1]],build,mut.mutect2))
           mut.mutect2=suppressWarnings(annotate.vcf(tbi[[1]],build,mut.mutect2))
+          # mut.mutect2=suppressWarnings(annotate.vcf(tbi[[1]],build=seqinfo(scanVcfHeader(x[[1]])),mut.mutect2))
+
         }
 
         if (length(mut.strelka2)!=0){

diff --git a/smurf/R/indelRFpredict.R b/smurf/R/indelRFpredict.R
@@ -101,9 +101,32 @@ indelRFpredict = function(parsevcf, indel.cutoff, fixed.indel.cutoff=F){
 
      print("Warning: There are no predicted indel calls in this sample. Re-examine myresults$smurf_indel$parse_indel and set a lower indel.cutoff.")
 
+     # Generate stats
+     stats <- matrix(,nrow = 11, ncol = 1)
+     stats <- as.data.frame(stats)
+     colnames(stats) <- c("Passed_Calls")
+     rownames(stats) <- c("Strelka2", "Mutect2", "FreeBayes", "VarDict", "VarScan", "Atleast1", "Atleast2", "Atleast3", "Atleast4", "All5", "SMuRF_INDEL")
+
+     counts <- apply(indel_parse[, c("FILTER_Strelka2","FILTER_Mutect2","FILTER_Freebayes","FILTER_Vardict","FILTER_Varscan")], 1, function(x) length(which(x=="TRUE")))
+
+     stats$Passed_Calls[1] <- length(which((indel_parse$FILTER_Strelka2==TRUE)))
+     stats$Passed_Calls[2] <- length(which((indel_parse$FILTER_Mutect2==TRUE)))
+     stats$Passed_Calls[3] <- length(which((indel_parse$FILTER_Freebayes==TRUE)))
+     stats$Passed_Calls[4] <- length(which((indel_parse$FILTER_Vardict==TRUE)))
+     stats$Passed_Calls[5] <- length(which((indel_parse$FILTER_Varscan==TRUE)))
+
+     stats$Passed_Calls[6] <- length(which(counts>=1))
+     stats$Passed_Calls[7] <- length(which(counts>=2))
+     stats$Passed_Calls[8] <- length(which(counts>=3))
+     stats$Passed_Calls[9] <- length(which(counts>=4))
+     stats$Passed_Calls[10] <- length(which(counts>=5))
+
+     stats$Passed_Calls[11] <- 0
+
+     stats<-as.matrix(stats)
      parse<-as.matrix(indel_parse)
 
-     return(list(parse_indel=parse))
+     return(list(stats_indel=stats, parse_indel=parse))
    } 
 
 

diff --git a/smurf/R/parsevcf_allfeaturesall.R b/smurf/R/parsevcf_allfeaturesall.R
@@ -314,10 +314,10 @@ parsevcf_allfeaturesall = function(x, tbi, roi=F, roi.dir=NULL, t.label=NULL){
   end.time=Sys.time() 
   print(end.time-start.time)
 
-  print("merge 4 vcfs and meta data")
+  print("merge 5 vcfs and meta data")
   start.time=Sys.time()
 
-  ## merge 4 vcfs and meta data
+  ## merge 5 vcfs and meta data
   names_m2= paste(names(mcols(gr_m2)[,-1]), "_Mutect2", sep="")
   names_f= paste(names(mcols(gr_f)[,-1]), "_Freebayes", sep="")
   names_vs= paste(names(mcols(gr_vs)[,-1]), "_Varscan", sep="")
@@ -820,10 +820,10 @@ parsevcf_allfeaturesall = function(x, tbi, roi=F, roi.dir=NULL, t.label=NULL){
   end.time=Sys.time() 
   print(end.time-start.time)
 
-  print("merge 4 vcfs and meta data")
+  print("merge 5 vcfs and meta data")
   start.time=Sys.time()
 
-  ## merge 4 vcfs and meta data
+  ## merge 5 vcfs and meta data
   names_m2= paste(names(mcols(gr_m2)[,-1]), "_Mutect2", sep="")
   names_f= paste(names(mcols(gr_f)[,-1]), "_Freebayes", sep="")
   names_vs= paste(names(mcols(gr_vs)[,-1]), "_Varscan", sep="")

diff --git a/smurf/R/smurf.R b/smurf/R/smurf.R
@@ -1,15 +1,15 @@
 #' SMuRF v2.0
 #'
-#' Somatic mutation consensus calling based on four callers:
-#' MuTect2, Freebayes, VarDict, VarScan
+#' Somatic mutation consensus calling based on five callers:
+#' MuTect2, Freebayes, VarDict, VarScan and Strelka2
 #' using a RandomForest model to consolidate a list of high accuracy calls.
 #'
 #' @note
 #' Input files containing variant calls should be ".vcf.gz" format of each caller.
 #' Supported for R >= 3.3.1, Java version 7 (up to 11) is supported. 
 #' 
 #' @param directory Choose directory where the Variant Caller Format(VCF) files are located. 
-#' Alternatively, provide a list object containing the path to the 4 VCF files labelled: mutect, freebayes, vardict and varscan.
+#' Alternatively, provide a list object containing the path to the 5 VCF files labeled: mutect, freebayes, vardict, varscan and strelka.
 #'
 #' @param mode Choose "snv", "indel" or "combined" (snv+indel). 
 #' The appropriate parsing and prediction model will be performed
@@ -31,12 +31,15 @@
 #' 
 #' @param output.dir Path to output directory (if saving files as .txt)
 #' 
-#' @param parse.dir Specify if changing SMuRF default cutoffs.
+#' @param parse.dir Specify if changing SMuRF default cutoffs in the future.
 #' Path to the location of existing snv-parse.txt and indel-parse.txt files generated by SMuRF. 
 #' 
 #' @param change.build TRUE or FALSE. For conversion of your genomic coordinates.
 #' Default option is disabled to retain your original 'build' specified above. 
 #' 
+#' @param find.build Default as FALSE. Additional genome build check for the annotation step.
+#' Retrieves the reference genome from the Strelka2 vcf header info to cross check with the build stated by the user. 
+#' 
 #' @param t.label (optional) Provide the sample name for your tumour sample to ease 
 #' the identification of the normal and tumour sample names in your vcf.
 #' See examples below
@@ -87,23 +90,24 @@
 #'                   model="combined",build='hg19')
 #'                                    
 #' @export
-#' 
 smurf = function(directory=NULL, mode=NULL, nthreads = -1,
                  annotation=F, output.dir=NULL, parse.dir=NULL,
                  snv.cutoff = 'default', indel.cutoff = 'default',
-                 build=NULL, change.build=F, t.label=NULL, re.tabIndex=F,
+                 build=NULL, change.build=F, find.build=F,
+                 t.label=NULL, re.tabIndex=F,
                  check.packages=T, file.exclude=NULL){
 
   #SMuRF version announcement
-  print("SMuRFv2.0.3 (22nd July 2020)")
+  print("SMuRFv2.0.5 (2nd Nov 2020)")
 
 
   if(is.null(directory)){
     # stop('directory path not specified')
     return(write("smurf(directory=NULL, mode=NULL, nthreads = -1,
                  annotation=F, output.dir=NULL,  parse.dir=NULL,
                  snv.cutoff = 'default', indel.cutoff = 'default',
-                 build=NULL, change.build=F, t.label=NULL, re.tabIndex=F,
+                 build=NULL, change.build=F, find.build=F,
+                 t.label=NULL, re.tabIndex=F,
                  check.packages=T, file.exclude=NULL)", stdout()))
   }
 
@@ -177,7 +181,7 @@ smurf = function(directory=NULL, mode=NULL, nthreads = -1,
       }
       if(length(dir.name)!=1){
         print(dir.name)
-        warning('More than one file is selected. Use file.key to specify unique file name.')
+        warning('More than one file is selected. Use file.exclude to specify unique file name.')
       } 
       return(dir.name)
     }
@@ -204,15 +208,15 @@ smurf = function(directory=NULL, mode=NULL, nthreads = -1,
     # }
 
     x<-list(mutect2,freebayes,varscan,vardict,strelka2)
-    
+
   } else if (class(directory)=='list'){
     x = directory
     if (length(x)!=5){
       stop('Input file check failed. Directory contains incorrect number of vcf files.')
     }
   }
 
-    
+
   if(re.tabIndex == F) {
 
       mutect2.tbi <- Sys.glob(paste0(directory,"/*mutect*.vcf.gz.tbi"))
@@ -339,6 +343,43 @@ smurf = function(directory=NULL, mode=NULL, nthreads = -1,
     suppressWarnings(h2o.init(nthreads = nthreads))
 
 
+    #check/match ref genome build
+
+    print('Genome build stated in SMuRF:')
+    print(build)
+    if (build=='hg19') {
+      g.build = 'GRCh37'
+    } else {
+      g.build = build
+    }
+    print('Ref genome used in vcf:')
+    genomebuild = suppressWarnings(scanVcfHeader(x[[5]])@header@listData$reference$Value)
+    match.build = grep(g.build,genomebuild)
+    print(genomebuild)
+
+    if ( annotation==T && (is.null(genomebuild) || length(match.build)==0) ) {
+      print('Warning: build provided does not match ref genome used in vcf. SMuRF CDS annotation may not run properly if genome build is incorrect.')
+    } 
+    if (!is.null(genomebuild) && length(match.build)==0 && find.build==T && annotation==T){
+      find.hg19 = grep('GRCh37',genomebuild)
+      find.hg38 = grep('hg38',genomebuild)
+      if ( length(find.hg19)==0 && length(find.hg38)!=0 && build=='hg19' ) {
+        print('Changing build variable provided')
+        print('hg19 -> hg38')
+        build='hg38'
+      } else if ( length(find.hg19)!=0 && length(find.hg38)==0 && build=='hg38') {
+        print('Changing build variable provided')
+        print('hg38 -> hg19')
+        build='hg19'
+      }
+    } else if (is.null(genomebuild) && find.build==T && annotation==T){
+      print('Changing build variable provided')
+      warning('ref genome used in vcf not found. find.build==T skipped.')
+    }
+
+    print(paste0('Final genome build used for analysis: ',build))
+
+
     #Retrieving files from the directory
 
     write("Accessing files:", stdout())
@@ -373,15 +414,15 @@ smurf = function(directory=NULL, mode=NULL, nthreads = -1,
 
         } else if (annotation == T) {
 
-          if (nrow(myresults$smurf_snv$predicted_snv)>1) {
+          if (nrow(snvpredict$predicted_snv)>1 && !is.null(snvpredict$predicted_snv)) {
             print("SNV annotation")
             snvannotation<-CDSannotation_snv(x,myresults$smurf_snv$predicted_snv,build=build,change.build=change.build)
           } else {
             print("No snv predictions. Skipping snv annotation step.")
             snvannotation=NULL
           }
 
-          if (nrow(myresults$smurf_indel$predicted_indel)>1) {
+          if (nrow(indelpredict$predicted_indel)>1 && !is.null(indelpredict$predicted_indel)) {
             print("Indel annotation")
             indelannotation<-CDSannotation_indel(x,myresults$smurf_indel$predicted_indel,build=build,change.build=change.build)
           } else {
@@ -435,7 +476,7 @@ smurf = function(directory=NULL, mode=NULL, nthreads = -1,
 
       } else if (annotation == T) {
 
-        if (length(snvpredict)>1) {
+        if (nrow(snvpredict$predicted_snv)>1 && !is.null(snvpredict$predicted_snv)) {
           print("SNV annotation")
           snvannotation<-CDSannotation_snv(x,tbi,snvpredict,build=build,change.build=change.build)
         } else {
@@ -484,7 +525,7 @@ smurf = function(directory=NULL, mode=NULL, nthreads = -1,
 
       } else if (annotation == T) {
 
-        if (length(indelpredict)>1) {
+        if (nrow(indelpredict$predicted_indel)>1 && !is.null(indelpredict$predicted_indel)) {
           print("Indel annotation")
           indelannotation<-CDSannotation_indel(x,tbi,indelpredict,build=build,change.build=change.build)
         } else {
@@ -537,15 +578,15 @@ smurf = function(directory=NULL, mode=NULL, nthreads = -1,
 
       } else if (annotation == T) {
 
-        if (length(snvpredict)>1) {
+        if (nrow(snvpredict$predicted_snv)>1 && !is.null(snvpredict$predicted_snv)) {
           print("SNV annotation")
           snvannotation<-CDSannotation_snv(x,tbi,snvpredict,build=build,change.build=change.build)
         } else {
           print("No snv predictions. Skipping snv annotation step.")
           snvannotation=NULL
         }
 
-        if (length(indelpredict)>1) {
+        if (nrow(indelpredict$predicted_indel)>1 && !is.null(indelpredict$predicted_indel)) {
           print("Indel annotation")
           indelannotation<-CDSannotation_indel(x,tbi,indelpredict,build=build,change.build=change.build)
         } else {

diff --git a/smurf/R/snvRFpredict.R b/smurf/R/snvRFpredict.R
@@ -99,9 +99,32 @@ snvRFpredict = function(parsevcf, snv.cutoff, fixed.snv.cutoff=F){
 
     print("Warning: There are no predicted SNV calls in this sample. Re-examine myresults$smurf_snv$parse_snv and set a lower snv.cutoff.")
 
+    # Generate stats
+    stats <- matrix(,nrow = 11, ncol = 1)
+    stats <- as.data.frame(stats)
+    colnames(stats) <- c("Passed_Calls")
+    rownames(stats) <- c("Strelka2", "Mutect2", "FreeBayes", "VarDict", "VarScan", "Atleast1", "Atleast2", "Atleast3", "Atleast4", "All5", "SMuRF_SNV")
+
+    counts <- apply(snv_parse[, c("FILTER_Strelka2","FILTER_Mutect2","FILTER_Freebayes","FILTER_Vardict","FILTER_Varscan")], 1, function(x) length(which(x=="TRUE")))
+
+    stats$Passed_Calls[1] <- length(which((snv_parse$FILTER_Strelka2==TRUE)))
+    stats$Passed_Calls[2] <- length(which((snv_parse$FILTER_Mutect2==TRUE)))
+    stats$Passed_Calls[3] <- length(which((snv_parse$FILTER_Freebayes==TRUE)))
+    stats$Passed_Calls[4] <- length(which((snv_parse$FILTER_Vardict==TRUE)))
+    stats$Passed_Calls[5] <- length(which((snv_parse$FILTER_Varscan==TRUE)))
+
+    stats$Passed_Calls[6] <- length(which(counts>=1))
+    stats$Passed_Calls[7] <- length(which(counts>=2))
+    stats$Passed_Calls[8] <- length(which(counts>=3))
+    stats$Passed_Calls[9] <- length(which(counts>=4))
+    stats$Passed_Calls[10] <- length(which(counts>=5))
+
+    stats$Passed_Calls[11] <- 0
+
+    stats<-as.matrix(stats)
     parse<-as.matrix(snv_parse)
 
-    return(list(parse_snv=parse))
+    return(list(stats_snv=stats, parse_snv=parse))
   }
 
 }