-
Notifications
You must be signed in to change notification settings - Fork 0
/
FinalPipeline.R
133 lines (78 loc) · 3.06 KB
/
FinalPipeline.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
library(devtools)
Sys.setenv(JAVA_HOME='C:\\Program Files\\Java\\jre1.8.0_131')
library(rmcfs)
library(dplyr)
library(R.ROSETTA)
library(VisuNet)
##### FUNCTION DEFINITIONS #####
#' Load a tab-separated table whose first row holds the column names.
#'
#' The file is read with `header = FALSE`, so the header row is read as an
#' ordinary data row. This is the workaround for the bug where T and F
#' proteins are interpreted as booleans. The first row is then removed from
#' the data and re-attached as the column names.
#'
#' @param filename Path to the tab-separated input file.
#' @return A data frame with one column per field of the first row and one
#'   row per remaining line of the file. Row names are not renumbered after
#'   the header row is dropped.
load_protein_IS <- function(filename) {
  data <- read.table(filename, sep = "\t", header = FALSE)
  # drop = FALSE keeps a data frame even when the file has a single column;
  # without it the subset collapses to a vector and colnames<- fails.
  first_row <- data[1, , drop = FALSE]
  data <- data[-1, , drop = FALSE]
  colnames(data) <- as.character(unlist(first_row))
  # Remove factor levels that only existed because of the header row
  # (relevant when the columns were read as factors).
  droplevels(data)
}
#' Return the MCFS result, preferring an in-memory copy over the cached file.
#'
#' Takes no arguments; it reads two variables from the calling environment
#' chain: `mcfs_result` (an already computed result) and `mcfs_result_file`
#' (path of the RDS cache).
#'
#' @return `mcfs_result` if a variable of that name exists; otherwise the
#'   object stored in `mcfs_result_file` if that file exists; otherwise
#'   `NULL` (invisibly).
load_mcfs_results <- function() {
  # An already loaded/computed result wins over the file on disk.
  if (exists("mcfs_result")) {
    return(mcfs_result)
  }
  # Check the same variable that is read below; the previous check used a
  # differently named, undefined variable and so could never succeed.
  if (file.exists(mcfs_result_file)) {
    return(readRDS(mcfs_result_file))
  }
  invisible(NULL)
}
##### FILE LOADING #####
# Input table and output directory.
filename <- "data/Project5.csv"
output_dir <- "output"
# Optional cache locations; uncomment the ones you want to use.
#mcfs_result_file <- "temp/mcfs_run.rds" #Uncomment if using saved mcfs files
#features_filename <- "temp/most_sig_names.rds" #Uncomment if using saved features file

# Read the table; the first row of the file supplies the column names.
data <- load_protein_IS(filename)

# Data metrics: dimensions, full attribute list, and the column names.
dim(data)
attributes(data)
attr <- names(data)
#print(paste("Attributes per tree:", length(attr)*proj_size))

# Class counts of the Host column, to check whether the dataset is balanced.
table(data$Host)
##### MONTE CARLO FEATURE SELECTION #####
# Parameters passed to rmcfs::mcfs()
n_projections <- 10000
proj_size <- "auto"
cutoff_method <- "kmeans"
cutoff_pe <- 20
splits <- 5
splitset_size <- 0.66
n_threads <- 8

# The cache path is only defined in the file-loading section when its line is
# uncommented. Fall back to the same default here so that saveRDS() below
# cannot fail on a missing variable after the feature selection has finished.
if (!exists("mcfs_result_file")) {
  mcfs_result_file <- "temp/mcfs_run.rds"
}

# Host is the decision attribute; every other column is a candidate feature.
mcfs_result <- mcfs(
  Host ~ ., data,
  projections = n_projections,
  projectionSize = proj_size,
  splits = splits,
  splitSetSize = splitset_size,
  cutoffMethod = cutoff_method,
  cutoffPermutations = cutoff_pe,
  threadsNumber = n_threads
)
head(mcfs_result$RI)
plot(mcfs_result, type = "distances")

# Cache the results (create the target directory first if it is missing):
dir.create(dirname(mcfs_result_file), showWarnings = FALSE, recursive = TRUE)
saveRDS(mcfs_result, mcfs_result_file)
# Returns the in-memory result when mcfs_result exists; reads the cache file
# only when the mcfs() step above has been skipped.
mcfs_result <- load_mcfs_results()

# Get most significant names: the first cutoff_value rows of the RI table.
n_sig <- as.integer(mcfs_result$cutoff_value)
if (length(n_sig) != 1L || is.na(n_sig) || n_sig < 1L) {
  # 1:0 would silently select row 1, so fail loudly instead.
  stop("MCFS returned no usable cutoff_value; no significant features to extract.",
       call. = FALSE)
}
most_sig <- mcfs_result$RI[seq_len(min(n_sig, nrow(mcfs_result$RI))), ]
# Coerce to character so the names can be used for column selection even if
# the attribute column is stored as a factor.
most_sig_names <- as.character(most_sig$attribute)
#saveRDS(most_sig_names, features_filename)
#most_sig_names <- readRDS(features_filename)
# Significant Feature Extraction
# Keep only the selected feature columns plus the Host decision column.
# all_of() marks most_sig_names as an external vector of column names, so it
# can never be confused with a column of `data`, and a missing name is an
# error instead of being silently ignored.
rule_df <- select(data, all_of(as.character(most_sig_names)), Host)
#write.csv(rule_df,file = "sig_feat_table.csv") #Uncomment to save the file as csv
##### ROSETTA RULE BASED CLASSIFICATION #####
# Model choices.
#   reducer:    Johnson, Genetic
#   classifier: StandardVoter, ObjectTrackingVoter, NaiveBayesClassifier
reducer <- "Johnson"
classifier <- "StandardVoter"
# Optional reducer parameter lists; uncomment if required.
#JohnsonParam <- list(Modulo=TRUE, BRT=FALSE, BRTprec=0.9, Precompute=FALSE, Approximate=TRUE, Fraction=0.95) #Uncomment if required
#GeneticParam <- list(Modulo=TRUE, BRT=FALSE, BRTprec=0.9, Precompute=FALSE, Approximate=TRUE, Fraction=0.95, Algorithm="Simple") #Uncomment if required

# Run Rosetta on the reduced table with ROC output enabled for class "Human".
rules <- rosetta(
  rule_df,
  roc = TRUE,
  discrete = TRUE,
  clroc = "Human",
  classifier = classifier,
  reducer = reducer
)

# Rule table of the model, shown in the rule viewer.
ruletable <- rules$main
viewRules(ruletable)

# Rule quality
rules$quality

# Performance visualisation: ROC statistics and the mean ROC curve.
rules$ROC.stats
plotMeanROC(rules)

##### VISUNET VISUALISATION #####
# Visualise the rule table with VisuNet.
visunet(ruletable)