40 update save every 1k #2
Merged: 2 commits, Sep 7, 2021
Changes from all commits
1 change: 1 addition & 0 deletions .github/.gitignore
@@ -0,0 +1 @@
desktop.ini
1 change: 1 addition & 0 deletions .github/workflows/.gitignore
@@ -0,0 +1 @@
desktop.ini
4 changes: 3 additions & 1 deletion .gitignore
@@ -3,4 +3,6 @@
.RData
.Ruserdata
output
*.lnk
*.lnk
*/desktop.ini
desktop.ini
1 change: 1 addition & 0 deletions R/.gitignore
@@ -0,0 +1 @@
desktop.ini
111 changes: 66 additions & 45 deletions R/28_generateRandVectors.R
@@ -3,7 +3,7 @@
# Project: n-TARP clustering
# https://github.com/SEED-research-lab/n-tarp
#
# Copyright 2017-2020 Taylor Williams
# Copyright 2017-2021 Taylor Williams
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -22,65 +22,86 @@
# Authors: Taylor Williams
# Affiliation: Purdue University
#
# Input stack:
# CLEAN PROBABILITY MATRIX (feature vector) file

#
# Description:
#
# Package dependancies:
# Package dependencies:
#
# Changelog:
# 2019.09.13. forked from other SEED lab projects
# 2020.02.05. allow user to choose number of random vectors
# 2021.07.30. Continuing effort to save progress every thousand vectors
# 2021.07.2x. commented out the file loading code--a bug with grepl is suspected. Workaround: manually load the 3 RData files
#
# Feature wishlist: (*: planned but not complete)
# *
# Feature wishlist: (*: planned; /: started; x: complete)
# [*] re-enable file loading
## ===================================================== ##


## Clean the environment ##########

######### Clean the environment ##########
# rm(list=ls())
# ## Clean the environment except required variables
varsToRetain <- c("filenameFV", "dataFolderPath")
rm(list=setdiff(ls(), varsToRetain))

## Required libraries ##########
require("readr")
require("tcltk")
require("tidyr")
require("dplyr")
require("tibble")
require("beepr")

#Read data from files ####
## Check for pre-defined starting directory and course prefix ####
if(!exists("filenamePrefix")) filenamePrefix <- NULL
if(!exists("dataFolderPath")) dataFolderPath <- NULL
if(!exists("filenameFV")) filenameFV <- NULL


## get data file locations from user ####
#Locate the CLEAN probability matrix (feature vector) file
if(!exists("filenameFV")){
#read the CLEAN probability matrix (feature vector) file
prompt <- "*****Select the CLEAN PROBABILITY MATRIX (feature vector) file*****\n (The file picker window may have opened in the background. Check behind this window if you do not see it.)\n"
cat("\n", prompt)
filenameFV <- tcltk::tk_choose.files(caption = prompt,
default = file.path(getwd(), "output", ""),
filter = matrix(c("CSV", ".csv",
"RData", ".RData",
"All files", ".*"),
3, 2, byrow = TRUE),
multi = FALSE)
}

#load in the data based on the type of data file provided
if(grepl(x = filenameFV, pattern = "\\.RData$")){
load(file = filenameFV)
probMatrix <- stu_LO_FV
}else if(grepl(x = filenameFV, pattern = "\\.(csv|CSV)$")){
probMatrix <- read_csv(file = filenameFV)
}else {
message("Invalid Data Filetype.")
break
}
######### Internal functions ##########



######### Setup ##########
#load required packages
require(tidyverse)
require(readxl)
require(beepr)

#Load functions


######### Read Data ##########

## WORKAROUND
probMatrix <- stu_LO_FV #run this and insert the appropriate clean feature vector (aka, probability matrix)

# ## Check for pre-defined starting directory and course prefix ####
# if(!exists("filenamePrefix")) filenamePrefix <- NULL
# if(!exists("dataFolderPath")) dataFolderPath <- NULL
# if(!exists("filenameFV")) filenameFV <- NULL
#
#
# ## get data file locations from user ####
# #Locate the CLEAN probability matrix (feature vector) file
# if(!exists("filenameFV")){
# #read the CLEAN probability matrix (feature vector) file
# prompt <- "*****Select the CLEAN PROBABILITY MATRIX (feature vector) file*****\n (The file picker window may have opened in the background. Check behind this window if you do not see it.)\n"
# cat("\n", prompt)
# filenameFV <- tcltk::tk_choose.files(caption = prompt,
# default = file.path(getwd(), "output", ""),
# filter = matrix(c("CSV", ".csv",
# "RData", ".RData",
# "All files", ".*"),
# 3, 2, byrow = TRUE),
# multi = FALSE)
# }
#
# #load in the data based on the type of data file provided
# if(grepl(x = filenameFV, pattern = "\\.RData$")){
# load(file = filenameFV)
# probMatrix <- stu_LO_FV
# }else if(grepl(x = filenameFV, pattern = "\\.(csv|CSV)$")){
# probMatrix <- read_csv(file = filenameFV)
# }else {
# message("Invalid Data Filetype.")
# break
# }




######### Main ##########


##Generate random vectors ####
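The generation loop of R/28_generateRandVectors.R is collapsed in this diff, but the changelog entry above describes the intended behavior: saving progress every thousand vectors. A minimal sketch of that periodic-checkpoint pattern follows; nVectors, vecLength, and the file naming are illustrative assumptions, not the script's actual code.

# Illustrative sketch of saving progress every 1k vectors; not the script's actual code.
nVectors  <- 5000                        # assumed total number of random vectors
vecLength <- 100                         # assumed length of each vector
randVectors <- matrix(NA_real_, nrow = vecLength, ncol = nVectors)
dir.create("output", showWarnings = FALSE)

for (i in seq_len(nVectors)) {
  randVectors[, i] <- rnorm(vecLength)   # generate one random vector

  if (i %% 1000 == 0) {                  # checkpoint every thousand vectors
    save(randVectors,
         file = file.path("output", paste0("28_randVectors-", i, ".RData")),
         compress = TRUE)
  }
}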
151 changes: 111 additions & 40 deletions R/40_findBestThreshold.R
@@ -50,6 +50,11 @@ source(file.path(getwd(), "R", "functions", "ExtractRVnumsAndNames.R"))
source(file.path(getwd(), "R", "functions", "DisplayPercentComplete.R"))
source(file.path(getwd(), "R", "functions", "file-structure-functions.R"))


#start a timer to track how long the script takes to execute
start40 <- proc.time() #save the time


# ## Read data from file(s) ####
# #read the PROJECTIONS data file
# prompt <- "*****Select the PROJECTIONS data file*****\n (The file picker window may have opened in the background. Check behind this window if you do not see it.)\n"
@@ -80,6 +85,9 @@ source(file.path(getwd(), "R", "functions", "file-structure-functions.R"))


#Read data from files ####
cat("Loading data from files\n")
proc.time() - start40

## Check for pre-defined starting directory and course prefix ####
if(!exists("filenamePrefix")) filenamePrefix <- NULL
if(!exists("dataFolderPath")) dataFolderPath <- NULL
@@ -112,35 +120,81 @@ if(grepl(x = filenameProj, pattern = "\\.RData$")){
break
}

cat("Loading data complete\n\n")
proc.time() - start40





#get the string values for the random vectors
numsAndNames <- ExtractRVnumsAndNames(RP_names = names(projection))

#split the returned list into two seperate variables. Convert to matricies for use in the next step
RV_nums <- as.matrix(numsAndNames$nums)
RV_names <- as.matrix(numsAndNames$names)

## Extract string values for the random vectors ####
cat("Extracting string values for the random vectors\n")
proc.time() - start40
# check if already complete and saved
fileExistCheck <- FileExistCheck_workingDir(filename = "40_RVnumsAndNames.RData", subDir = "output")
if(fileExistCheck != FALSE){
load(fileExistCheck)
}else{

#get the string values for the random vectors
numsAndNames <- ExtractRVnumsAndNames(RP_names = names(projection))

#split the returned list into two separate variables. Convert to matrices for use in the next step
RV_nums <- as.matrix(numsAndNames$nums)
RV_names <- as.matrix(numsAndNames$names)

#save String extraction to a RData file
save(numsAndNames, file = file.path("output", "40_RVnumsAndNames.RData"),
precheck = TRUE, compress = TRUE)
}
cat("String extraction complete\n\n")
proc.time() - start40


## Sort all of the projection columns ####
cat("Sorting projection columns\n")
# check if the projections have already been sorted and saved
fileExistCheck <- FileExistCheck_workingDir(filename = "40_projectionSort.RData", subDir = "output")
if(fileExistCheck != FALSE){
load(fileExistCheck)
}else{
#duplicate the projection data frame to store results of sorted projections
projectionSort <- projection
#replace the row names with an ascending integer
# (after sorting, the rows will no longer refer to a single user's projection)
rownames(projectionSort) <- c(1:nrow(projectionSort))

#sort each column
for(i in 1:ncol(projection))
{
projectionSort[,i] <- projection[order(projection[i]),i]


#| print completion progress to console ####
#during first iteration, create progress status variables for main processing loop
if(i==1){
iCount <- 0 #loop counter for completion updates
pct <- 0 #percentage complete tracker
}
#print function
updateVars <- DisplayPercentComplete(projectionSort, iCount, pct, displayText = "Sorting projections: ")
#update status variables
iCount <- updateVars$iCount
pct <- updateVars$pct
#print update
cat(updateVars$toPrint)
}

#save sorted projections to a RData file
save(projectionSort, file = file.path("output", "40_projectionSort.RData"),
precheck = TRUE, compress = TRUE)
}
cat("Sorting projection columns complete\n\n")
proc.time() - start40

#duplicate the projection data frame to store results of sorted projections
projectionSort <- projection
#replace the row names with an assending integer
# (after sorting, the rows will no longer refer to a single user's projection)
rownames(projectionSort) <- c(1:nrow(projectionSort))

#sort each column
for(i in 1:ncol(projection))
{
projectionSort[,i] <- projection[order(projection[i]),i]
}

## Test each projection value as the threshold to find best threshold value (minimizing withinSS, W) ####
cat("Finding best threshold value for each projection (minimizing withinSS, W)")
#create empty data frame to store results of matrix multiplication
minW_RandVec <- data.frame(matrix(nrow = 2, ncol = length(projection)))
#set names
@@ -162,6 +216,8 @@ colnames(minW_RandVec) <- minW_RandVecColNames
# set row names
rownames(minW_RandVec) <- c("Min WithinSS (W)", "Group Threshold")


## loop through all the projections
for(i in 1:ncol(projectionSort))
{
#build current column name
@@ -170,6 +226,23 @@ for(i in 1:ncol(projectionSort))
#reset the minimum withinSS value to an absurdly large value
minW <- 1e10


#| print completion progress to console ####
#during first iteration, create progress status variables for main processing loop
if(i==1){
iCount <- 0 #loop counter for completion updates
pct <- 0 #percentage complete tracker
}
#print function
updateVars <- DisplayPercentComplete(projectionSort, iCount, pct, displayText = "Locating best threshold values: ")
#update status variables
iCount <- updateVars$iCount
pct <- updateVars$pct
#print update
cat(updateVars$toPrint)


## brute-force search for the global minimum
for(j in 1:nrow(projectionSort))
{
#test each of the projection values (x_j) as the threshold
Expand Down Expand Up @@ -207,25 +280,16 @@ for(i in 1:ncol(projectionSort))
minW_RandVec["Group 1 Count", curColName] <- sum(projectionSort[,curColName] < as.numeric(bestThresh))
minW_RandVec["Group 2 Count", curColName] <- sum(projectionSort[,curColName] >= as.numeric(bestThresh))



#| print completion progress to console ####
#during first iteration, create progress status variables for main processing loop
if(i==1)
{
iCount <- 0 #loop counter for completion updates
pct <- 0 #percentage complete tracker
if (i %% 250==0) {
#write to a RData file
save(minW_RandVec, file = file.path("output", paste0("40_minW_and_threshold-",i,".RData")),
precheck = TRUE, compress = TRUE)
cat("\nSaving 250 file")
proc.time() - start40

}

#print function
updateVars <- DisplayPercentComplete(projectionSort, iCount, pct, displayText = "Locating best threshold values: ")

#update status variables
iCount <- updateVars$iCount
pct <- updateVars$pct

#print update
cat(updateVars$toPrint)

}

#transpose and sort the min. withinSS and group threshold data
Expand All @@ -236,10 +300,12 @@ minW_RandVec_sort <- minW_RandVec_sort[order(minW_RandVec_sort$`Min WithinSS (W)
## |Save min. withinSS and group threshold data to file ####
#write a CSV file
cat("\nSaving CSV file.")
write.csv(file = file.path("output", "40_minW_and_threshold.csv"),
proc.time() - start40

write.csv(file = file.path("output", "40_minW_and_threshold_sorted.csv"),
x = minW_RandVec_sort)
#write to a RData file
save(minW_RandVec_sort, file = file.path("output", "40_minW_and_threshold.RData"),
save(minW_RandVec_sort, file = file.path("output", "40_minW_and_threshold_sorted.RData"),
precheck = TRUE, compress = TRUE)


Expand All @@ -251,18 +317,23 @@ clusterWThreshold <- 0.36
clusterCandidates <- minW_RandVec[1,minW_RandVec["Min WithinSS (W)",] < clusterWThreshold]
sortedCandidates <- clusterCandidates[1,order(clusterCandidates)]


#report percentage of promising projections
length(clusterCandidates)/length(minW_RandVec)
cat("percentage of promising projections:", length(clusterCandidates)/length(minW_RandVec))


#save the names of the cluster candidates
sortedCandidateNames <- names(sortedCandidates)

##|Save data to file ####
#write to a CSV file
cat("\nSaving files.")
write.csv(file = file.path("output", "40_best_RP_names.csv"),
write.csv(file = file.path("output",
paste0("40_best_RP_names (W-lt-", clusterWThreshold, ").csv")),
x = sortedCandidateNames)
#write to a RData file
save(sortedCandidateNames, file = file.path("output", "40_best_RP_names.RData"),
save(sortedCandidateNames,
file = file.path("output",
paste0("40_best_RP_names (W-lt-", clusterWThreshold, ").RData")),
precheck = TRUE, compress = TRUE)
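The blocks added to R/40_findBestThreshold.R above wrap each expensive step in a check-then-load-or-compute pattern so an interrupted run can resume from the last saved result. Below is a minimal self-contained sketch of that pattern, using base R file.exists() in place of the project's FileExistCheck_workingDir() helper and placeholder data; the file and object names are illustrative.

# Illustrative compute-or-load checkpoint; file and object names are placeholders.
dir.create("output", showWarnings = FALSE)
checkpointFile <- file.path("output", "40_projectionSort.RData")

if (file.exists(checkpointFile)) {
  load(checkpointFile)                     # restores `projectionSort` from a prior run
} else {
  # expensive step (placeholder data here): sort every projection column
  projection <- as.data.frame(matrix(rnorm(10000), nrow = 100))
  projectionSort <- as.data.frame(lapply(projection, sort))

  save(projectionSort, file = checkpointFile, compress = TRUE)
}

The periodic save inside the main loop (every 250 projections in the diff above) serves the same purpose for the longest-running step.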

Loading
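For context, the quantity that 40_findBestThreshold.R minimizes for each random projection is the within-group sum of squares (W) obtained by splitting the projected values at a candidate threshold. The script's exact formula sits in a collapsed part of the diff, so the normalization below (dividing by the total sum of squares) is an assumption; the sketch only illustrates the brute-force search over all candidate thresholds for a single projection column.

# Illustrative brute-force threshold search for one projection column.
# Normalizing W by the total sum of squares is an assumption, not taken from the script.
find_best_threshold <- function(proj) {
  proj <- sort(proj)
  bestW <- Inf
  bestThresh <- NA_real_

  for (thresh in proj) {                      # test every projected value as the threshold
    g1 <- proj[proj <  thresh]
    g2 <- proj[proj >= thresh]
    if (length(g1) == 0) next                 # skip the degenerate one-group split

    W <- sum((g1 - mean(g1))^2) + sum((g2 - mean(g2))^2)
    W <- W / sum((proj - mean(proj))^2)       # assumed normalization to [0, 1]

    if (W < bestW) {
      bestW <- W
      bestThresh <- thresh
    }
  }
  c(minW = bestW, threshold = bestThresh)
}

# A clearly bimodal projection should yield a small minimum W
find_best_threshold(c(rnorm(50, mean = -2), rnorm(50, mean = 2)))

Projections whose minimum W falls below the script's clusterWThreshold (0.36 in the diff above) are kept as the promising cluster candidates written to the 40_best_RP_names output files.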