Skip to content

Commit

Permalink
Merge pull request #49 from tyler5huang/master
Browse files Browse the repository at this point in the history
SMuRFv2.0.5
  • Loading branch information
tyler5huang authored Nov 2, 2020
2 parents df90a1f + 5303011 commit c391d5d
Show file tree
Hide file tree
Showing 6 changed files with 113 additions and 24 deletions.
2 changes: 1 addition & 1 deletion smurf/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ Depends: R (>= 3.3.1)
License: MIT
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.1.1.9000
RoxygenNote: 7.1.0
2 changes: 2 additions & 0 deletions smurf/R/CDSannotation_snv.R
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,8 @@ CDSannotation_snv = function(x, tbi, predicted, build, change.build){
print("reading mutect2")
# mut.mutect2=suppressWarnings(annotate.vcf(x[[1]],build,mut.mutect2))
mut.mutect2=suppressWarnings(annotate.vcf(tbi[[1]],build,mut.mutect2))
# mut.mutect2=suppressWarnings(annotate.vcf(tbi[[1]],build=seqinfo(scanVcfHeader(x[[1]])),mut.mutect2))

}

if (length(mut.strelka2)!=0){
Expand Down
25 changes: 24 additions & 1 deletion smurf/R/indelRFpredict.R
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,32 @@ indelRFpredict = function(parsevcf, indel.cutoff, fixed.indel.cutoff=F){

print("Warning: There are no predicted indel calls in this sample. Re-examine myresults$smurf_indel$parse_indel and set a lower indel.cutoff.")

# Generate stats
stats <- matrix(,nrow = 11, ncol = 1)
stats <- as.data.frame(stats)
colnames(stats) <- c("Passed_Calls")
rownames(stats) <- c("Strelka2", "Mutect2", "FreeBayes", "VarDict", "VarScan", "Atleast1", "Atleast2", "Atleast3", "Atleast4", "All5", "SMuRF_INDEL")

counts <- apply(indel_parse[, c("FILTER_Strelka2","FILTER_Mutect2","FILTER_Freebayes","FILTER_Vardict","FILTER_Varscan")], 1, function(x) length(which(x=="TRUE")))

stats$Passed_Calls[1] <- length(which((indel_parse$FILTER_Strelka2==TRUE)))
stats$Passed_Calls[2] <- length(which((indel_parse$FILTER_Mutect2==TRUE)))
stats$Passed_Calls[3] <- length(which((indel_parse$FILTER_Freebayes==TRUE)))
stats$Passed_Calls[4] <- length(which((indel_parse$FILTER_Vardict==TRUE)))
stats$Passed_Calls[5] <- length(which((indel_parse$FILTER_Varscan==TRUE)))

stats$Passed_Calls[6] <- length(which(counts>=1))
stats$Passed_Calls[7] <- length(which(counts>=2))
stats$Passed_Calls[8] <- length(which(counts>=3))
stats$Passed_Calls[9] <- length(which(counts>=4))
stats$Passed_Calls[10] <- length(which(counts>=5))

stats$Passed_Calls[11] <- 0

stats<-as.matrix(stats)
parse<-as.matrix(indel_parse)

return(list(parse_indel=parse))
return(list(stats_indel=stats, parse_indel=parse))
}


Expand Down
8 changes: 4 additions & 4 deletions smurf/R/parsevcf_allfeaturesall.R
Original file line number Diff line number Diff line change
Expand Up @@ -314,10 +314,10 @@ parsevcf_allfeaturesall = function(x, tbi, roi=F, roi.dir=NULL, t.label=NULL){
end.time=Sys.time()
print(end.time-start.time)

print("merge 4 vcfs and meta data")
print("merge 5 vcfs and meta data")
start.time=Sys.time()

## merge 4 vcfs and meta data
## merge 5 vcfs and meta data
names_m2= paste(names(mcols(gr_m2)[,-1]), "_Mutect2", sep="")
names_f= paste(names(mcols(gr_f)[,-1]), "_Freebayes", sep="")
names_vs= paste(names(mcols(gr_vs)[,-1]), "_Varscan", sep="")
Expand Down Expand Up @@ -820,10 +820,10 @@ parsevcf_allfeaturesall = function(x, tbi, roi=F, roi.dir=NULL, t.label=NULL){
end.time=Sys.time()
print(end.time-start.time)

print("merge 4 vcfs and meta data")
print("merge 5 vcfs and meta data")
start.time=Sys.time()

## merge 4 vcfs and meta data
## merge 5 vcfs and meta data
names_m2= paste(names(mcols(gr_m2)[,-1]), "_Mutect2", sep="")
names_f= paste(names(mcols(gr_f)[,-1]), "_Freebayes", sep="")
names_vs= paste(names(mcols(gr_vs)[,-1]), "_Varscan", sep="")
Expand Down
75 changes: 58 additions & 17 deletions smurf/R/smurf.R
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
#' SMuRF v2.0
#'
#' Somatic mutation consensus calling based on four callers:
#' MuTect2, Freebayes, VarDict, VarScan
#' Somatic mutation consensus calling based on five callers:
#' MuTect2, Freebayes, VarDict, VarScan and Strelka2
#' using a RandomForest model to consolidate a list of high accuracy calls.
#'
#' @note
#' Input files containing variant calls should be ".vcf.gz" format of each caller.
#' Supported for R >= 3.3.1, Java version 7 (up to 11) is supported.
#'
#' @param directory Choose directory where the Variant Caller Format(VCF) files are located.
#' Alternatively, provide a list object containing the path to the 4 VCF files labelled: mutect, freebayes, vardict and varscan.
#' Alternatively, provide a list object containing the path to the 5 VCF files labeled: mutect, freebayes, vardict, varscan and strelka.
#'
#' @param mode Choose "snv", "indel" or "combined" (snv+indel).
#' The appropriate parsing and prediction model will be performed
Expand All @@ -31,12 +31,15 @@
#'
#' @param output.dir Path to output directory (if saving files as .txt)
#'
#' @param parse.dir Specify if changing SMuRF default cutoffs.
#' @param parse.dir Specify if changing SMuRF default cutoffs in the future.
#' Path to the location of existing snv-parse.txt and indel-parse.txt files generated by SMuRF.
#'
#' @param change.build TRUE or FALSE. For conversion of your genomic coordinates.
#' Default option is disabled to retain your original 'build' specified above.
#'
#' @param find.build Default as FALSE. Additional genome build check for the annotation step.
#' Retrieves the reference genome from the Strelka2 vcf header info to cross check with the build stated by the user.
#'
#' @param t.label (optional) Provide the sample name for your tumour sample to ease
#' the identification of the normal and tumour sample names in your vcf.
#' See examples below
Expand Down Expand Up @@ -87,23 +90,24 @@
#' model="combined",build='hg19')
#'
#' @export
#'
smurf = function(directory=NULL, mode=NULL, nthreads = -1,
annotation=F, output.dir=NULL, parse.dir=NULL,
snv.cutoff = 'default', indel.cutoff = 'default',
build=NULL, change.build=F, t.label=NULL, re.tabIndex=F,
build=NULL, change.build=F, find.build=F,
t.label=NULL, re.tabIndex=F,
check.packages=T, file.exclude=NULL){

#SMuRF version announcement
print("SMuRFv2.0.3 (22nd July 2020)")
print("SMuRFv2.0.5 (2nd Nov 2020)")


if(is.null(directory)){
# stop('directory path not specified')
return(write("smurf(directory=NULL, mode=NULL, nthreads = -1,
annotation=F, output.dir=NULL, parse.dir=NULL,
snv.cutoff = 'default', indel.cutoff = 'default',
build=NULL, change.build=F, t.label=NULL, re.tabIndex=F,
build=NULL, change.build=F, find.build=F,
t.label=NULL, re.tabIndex=F,
check.packages=T, file.exclude=NULL)", stdout()))
}

Expand Down Expand Up @@ -177,7 +181,7 @@ smurf = function(directory=NULL, mode=NULL, nthreads = -1,
}
if(length(dir.name)!=1){
print(dir.name)
warning('More than one file is selected. Use file.key to specify unique file name.')
warning('More than one file is selected. Use file.exclude to specify unique file name.')
}
return(dir.name)
}
Expand All @@ -204,15 +208,15 @@ smurf = function(directory=NULL, mode=NULL, nthreads = -1,
# }

x<-list(mutect2,freebayes,varscan,vardict,strelka2)

} else if (class(directory)=='list'){
x = directory
if (length(x)!=5){
stop('Input file check failed. Directory contains incorrect number of vcf files.')
}
}


if(re.tabIndex == F) {

mutect2.tbi <- Sys.glob(paste0(directory,"/*mutect*.vcf.gz.tbi"))
Expand Down Expand Up @@ -339,6 +343,43 @@ smurf = function(directory=NULL, mode=NULL, nthreads = -1,
suppressWarnings(h2o.init(nthreads = nthreads))


#check/match ref genome build

print('Genome build stated in SMuRF:')
print(build)
if (build=='hg19') {
g.build = 'GRCh37'
} else {
g.build = build
}
print('Ref genome used in vcf:')
genomebuild = suppressWarnings(scanVcfHeader(x[[5]])@header@listData$reference$Value)
match.build = grep(g.build,genomebuild)
print(genomebuild)

if ( annotation==T && (is.null(genomebuild) || length(match.build)==0) ) {
print('Warning: build provided does not match ref genome used in vcf. SMuRF CDS annotation may not run properly if genome build is incorrect.')
}
if (!is.null(genomebuild) && length(match.build)==0 && find.build==T && annotation==T){
find.hg19 = grep('GRCh37',genomebuild)
find.hg38 = grep('hg38',genomebuild)
if ( length(find.hg19)==0 && length(find.hg38)!=0 && build=='hg19' ) {
print('Changing build variable provided')
print('hg19 -> hg38')
build='hg38'
} else if ( length(find.hg19)!=0 && length(find.hg38)==0 && build=='hg38') {
print('Changing build variable provided')
print('hg38 -> hg19')
build='hg19'
}
} else if (is.null(genomebuild) && find.build==T && annotation==T){
print('Changing build variable provided')
warning('ref genome used in vcf not found. find.build==T skipped.')
}

print(paste0('Final genome build used for analysis: ',build))


#Retrieving files from the directory

write("Accessing files:", stdout())
Expand Down Expand Up @@ -373,15 +414,15 @@ smurf = function(directory=NULL, mode=NULL, nthreads = -1,

} else if (annotation == T) {

if (nrow(myresults$smurf_snv$predicted_snv)>1) {
if (nrow(snvpredict$predicted_snv)>1 && !is.null(snvpredict$predicted_snv)) {
print("SNV annotation")
snvannotation<-CDSannotation_snv(x,myresults$smurf_snv$predicted_snv,build=build,change.build=change.build)
} else {
print("No snv predictions. Skipping snv annotation step.")
snvannotation=NULL
}

if (nrow(myresults$smurf_indel$predicted_indel)>1) {
if (nrow(indelpredict$predicted_indel)>1 && !is.null(indelpredict$predicted_indel)) {
print("Indel annotation")
indelannotation<-CDSannotation_indel(x,myresults$smurf_indel$predicted_indel,build=build,change.build=change.build)
} else {
Expand Down Expand Up @@ -435,7 +476,7 @@ smurf = function(directory=NULL, mode=NULL, nthreads = -1,

} else if (annotation == T) {

if (length(snvpredict)>1) {
if (nrow(snvpredict$predicted_snv)>1 && !is.null(snvpredict$predicted_snv)) {
print("SNV annotation")
snvannotation<-CDSannotation_snv(x,tbi,snvpredict,build=build,change.build=change.build)
} else {
Expand Down Expand Up @@ -484,7 +525,7 @@ smurf = function(directory=NULL, mode=NULL, nthreads = -1,

} else if (annotation == T) {

if (length(indelpredict)>1) {
if (nrow(indelpredict$predicted_indel)>1 && !is.null(indelpredict$predicted_indel)) {
print("Indel annotation")
indelannotation<-CDSannotation_indel(x,tbi,indelpredict,build=build,change.build=change.build)
} else {
Expand Down Expand Up @@ -537,15 +578,15 @@ smurf = function(directory=NULL, mode=NULL, nthreads = -1,

} else if (annotation == T) {

if (length(snvpredict)>1) {
if (nrow(snvpredict$predicted_snv)>1 && !is.null(snvpredict$predicted_snv)) {
print("SNV annotation")
snvannotation<-CDSannotation_snv(x,tbi,snvpredict,build=build,change.build=change.build)
} else {
print("No snv predictions. Skipping snv annotation step.")
snvannotation=NULL
}

if (length(indelpredict)>1) {
if (nrow(indelpredict$predicted_indel)>1 && !is.null(indelpredict$predicted_indel)) {
print("Indel annotation")
indelannotation<-CDSannotation_indel(x,tbi,indelpredict,build=build,change.build=change.build)
} else {
Expand Down
25 changes: 24 additions & 1 deletion smurf/R/snvRFpredict.R
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,32 @@ snvRFpredict = function(parsevcf, snv.cutoff, fixed.snv.cutoff=F){

print("Warning: There are no predicted SNV calls in this sample. Re-examine myresults$smurf_snv$parse_snv and set a lower snv.cutoff.")

# Generate stats
stats <- matrix(,nrow = 11, ncol = 1)
stats <- as.data.frame(stats)
colnames(stats) <- c("Passed_Calls")
rownames(stats) <- c("Strelka2", "Mutect2", "FreeBayes", "VarDict", "VarScan", "Atleast1", "Atleast2", "Atleast3", "Atleast4", "All5", "SMuRF_SNV")

counts <- apply(snv_parse[, c("FILTER_Strelka2","FILTER_Mutect2","FILTER_Freebayes","FILTER_Vardict","FILTER_Varscan")], 1, function(x) length(which(x=="TRUE")))

stats$Passed_Calls[1] <- length(which((snv_parse$FILTER_Strelka2==TRUE)))
stats$Passed_Calls[2] <- length(which((snv_parse$FILTER_Mutect2==TRUE)))
stats$Passed_Calls[3] <- length(which((snv_parse$FILTER_Freebayes==TRUE)))
stats$Passed_Calls[4] <- length(which((snv_parse$FILTER_Vardict==TRUE)))
stats$Passed_Calls[5] <- length(which((snv_parse$FILTER_Varscan==TRUE)))

stats$Passed_Calls[6] <- length(which(counts>=1))
stats$Passed_Calls[7] <- length(which(counts>=2))
stats$Passed_Calls[8] <- length(which(counts>=3))
stats$Passed_Calls[9] <- length(which(counts>=4))
stats$Passed_Calls[10] <- length(which(counts>=5))

stats$Passed_Calls[11] <- 0

stats<-as.matrix(stats)
parse<-as.matrix(snv_parse)

return(list(parse_snv=parse))
return(list(stats_snv=stats, parse_snv=parse))
}

}

0 comments on commit c391d5d

Please sign in to comment.