From f128e6c3c7597c23e4b8267e4273cf9a57e3d2e9 Mon Sep 17 00:00:00 2001
From: berjakian
Date: Sun, 29 Nov 2020 23:55:11 -0500
Subject: [PATCH] commit

---
 assignment5.Rmd | 321 +++++++++++++++++++++++++++++++++++++++++++++---
 desktop.ini     | Bin 0 -> 244 bytes
 ggbiplot.r      | 220 +++++++++++++++++++++++++++++++++
 ggscreeplot.r   |  48 ++++++++
 4 files changed, 572 insertions(+), 17 deletions(-)
 create mode 100644 desktop.ini
 create mode 100644 ggbiplot.r
 create mode 100644 ggscreeplot.r

diff --git a/assignment5.Rmd b/assignment5.Rmd
index 288bcb3..b425cc2 100644
--- a/assignment5.Rmd
+++ b/assignment5.Rmd
@@ -1,5 +1,5 @@
 ---
-title: "Principle Component Aanalysis"
+title: "Principal Component Analysis"
 output: html_document
 ---
 ## Data
@@ -16,10 +16,28 @@ The data you will be using comes from the Assistments online intelligent tutorin
 ## Start by uploading the data
 ```{r}
-D1 <- 
+#---- With A Little Help From My Friends ♫
+library(tidyverse)
+
+#---- Fill'er up
+D1 <- read.csv("Assistments-confidence.csv")
+
+#---- let's take a look...
+str(D1)
+head(D1)
+summary(D1)
+
+#---- na's please rise... https://sebastiansauer.github.io/sum-isna/
+D1.is.na <- D1 %>%
+  summarise_all(~ sum(is.na(.)))
+D1.is.na
+#---- drop rows that contain NA in any column... not needed based on results of previous command
+#---- D1.1 <- na.omit(D1)
+
 ```
+
 ## Create a correlation matrix of the relationships between the variables, including correlation coefficients for each pair of variables/features.

 ```{r}
@@ -27,88 +45,357 @@ D1 <- 
 library(ggplot2)
 library(GGally)
+library(corrplot)
+
+ggpairs(D1, 2:8, progress = FALSE)
+
+#---- thank you https://www.blopig.com/blog/2019/06/a-brief-introduction-to-ggpairs/
+D1 %>% select(2:8) %>% ggpairs(.,
+  progress = FALSE,
+  lower = list(continuous = wrap("points", alpha = 0.3, size = 0.1)))
+
+D1 %>% select(2:8) %>% ggpairs(.,
+  progress = FALSE,
+  lower = list(continuous = wrap("smooth", alpha = 0.3, size = 0.1)))
+
+#ggpairs() draws a correlation plot between all the columns you identify by number (second option, you don't need the first column as it is the student ID) and progress = FALSE stops a progress bar appearing as it renders your plot
+
-ggpairs(D1, 2:8, progress = FALSE) #ggpairs() draws a correlation plot between all the columns you identify by number (second option, you don't need the first column as it is the student ID) and progress = FALSE stops a progress bar appearing as it renders your plot
+ggcorr(D1[,-1], method = c("everything", "pearson"))
+#ggcorr() doesn't have an explicit option to choose variables so we need to use matrix notation to drop the id variable. We then need to choose a "method" which determines how to treat missing values (here we choose to keep everything) and which kind of correlation calculation to use; here we are using Pearson correlation, the other options are "kendall" or "spearman"
+
+ggcorr(D1[,-1], method = c("everything", "kendall"))
+#the same plot using Kendall's rank correlation
+
+ggcorr(D1[,-1], method = c("everything", "spearman"))
+#and again using Spearman's rank correlation

-ggcorr(D1[,-1], method = c("everything", "pearson")) #ggcorr() doesn't have an explicit option to choose variables so we need to use matrix notation to drop the id variable. We then need to choose a "method" which determines how to treat missing values (here we choose to keep everything, and then which kind of correlation calculation to use, here we are using Pearson correlation, the other options are "kendall" or "spearman")
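+
+#---- (extra, a small sketch beyond the plots) the numbers behind them: a plain Pearson
+#---- correlation matrix, rounded for readability
+round(cor(D1[,-1]), 2)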
 #Study your correlogram images and save them, you will need them later. Take note of what is strongly related to the outcome variable of interest, mean_correct.
+
+#---- experimentation with scaling the data before running a pairs plot... surprise... turns out the plots look the same (scaling changes the axis units, not the relationships between variables)...
+D1.1 <- D1[,-c(1)]
+rownames(D1.1) <- D1[,c(1)]
+D1.1.scale <- scale(D1.1)
+pairs(D1.1.scale)
+pairs(D1[,-1])
+
+#---- experimentation with corrplot... it didn't work on the raw data because corrplot() expects
+#---- a correlation matrix rather than a data frame, so hand it cor() output instead
+corrplot(cor(D1[,-1]))
+
 ```
-## Create a new data frame with the mean_correct variable removed, we want to keep that variable intact. The other variables will be included in our PCA.
+#---- looking at these plots, it seems...
+#---- The strongest correlation is between mean_hint and mean_correct, and it's a negative one: learners who use more hints get fewer questions right on average. A plausible explanation -- learners who take advantage of hints from the system are more likely to be those who answer a question wrong at first.
+#---- The weakest correlation is between mean_hint and mean_confidence. A plausible explanation -- learner confidence has nothing to do with whether or not they seek hints from the system.
+
+
+#--- Clustering, for fun
 ```{r}
-D2 <- 
+#--- what class is the data
+class(D1.1)
+
+#--- convert to matrix
+D1.1.matrix <- as.matrix(D1.1)
+
+#--- how many unique rows... this is the upper limit to k values
+dim(unique(D1.1.matrix))
+
+#--- kmeans starts from random centers, so fix the seed to make the elbow plot reproducible
+set.seed(123)
+
+#--- calculate Total within-cluster sum of squares
+D1.1.tot_withinss <- map_dbl(1:10, function(k){
+  model <- kmeans(D1.1.matrix, centers = k)
+  model$tot.withinss
+})
+
+#--- Generate a data frame containing both k and tot_withinss
+D1.1.elbow_df <- data.frame(
+  k = 1:10,
+  tot_withinss = D1.1.tot_withinss
+)
+
+#--- Plot the elbow plot of K values and tot_withinss
+ggplot(D1.1.elbow_df, aes(x = k, y = tot_withinss)) +
+  geom_line() +
+  scale_x_continuous(breaks = 1:10)
+
+#--- the elbow plot looks like 3 groups
+
+#--- kmeans the data table into 3 groups
+D1.1.groups <- kmeans(D1.1.matrix, centers = 3)
+D1.1.clustered <- data.frame(D1.1, cluster = as.factor(D1.1.groups$cluster))
+
+```
+
+
+## Create a new data frame with the mean_correct variable removed; we want to keep that variable intact. The other variables will be included in our PCA.
+
+```{r}
+#new data frame with the mean_correct variable removed
+D2 <- D1[,-c(1,5)]
+rownames(D2) <- D1[,1]
+
 ```
 ## Now run the PCA on the new data frame
 ```{r}
-pca <- prcomp(D2, scale. = TRUE)
+
+D2.pca <- prcomp(D2, scale. = TRUE, center = TRUE)
+
 ```
 ## Although princomp does not generate the eigenvalues directly for us, we can print a list of the standard deviation of the variance accounted for by each component.
 ```{r}
-pca$sdev
+D2.pca$sdev
+
 #To convert this into variance accounted for we can square it, these numbers are proportional to the eigenvalue
-pca$sdev^2
+D2.pca$sdev^2
 #A summary of our pca will give us the proportion of variance accounted for by each component
-summary(pca)
+summary(D2.pca)
 #We can look at this to get an idea of which components we should keep and which we should drop
-plot(pca, type = "lines")
+plot(D2.pca, type = "lines")
+
 ```
 ## Decide which components you would drop and remove them from your data set. 
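+One way to put a number on this decision, as a minimal sketch using the `D2.pca` object from above (the 75% cutoff is only an illustrative threshold, not a rule):
+
+```{r}
+#--- cumulative proportion of variance, the same numbers summary(D2.pca) reports
+D2.cumvar <- cumsum(D2.pca$sdev^2) / sum(D2.pca$sdev^2)
+D2.cumvar
+
+#--- the smallest number of components that clears the threshold
+which(D2.cumvar >= 0.75)[1]
+```
+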
+#--- So I did this analysis over and over... and I believe I understand the outcome... and so I'm writing this already having that insight... pretending not to know that outcome... here's what I would say.
+#--- Looking at the plots as well as the data table, I would guess that dropping PC6 is almost a no-brainer: it contributes less than 9% of the overall variance of the 6 variables. Dropping PC5 as well maintains 78% of the variability in the data, which still feels respectable. Dropping PC4 brings the total variability down to 64%, which is still the majority, but feels too aggressive. So I'll opt for dropping PC5 and PC6.
+
+
 ## Part II
 
 ```{r}
 #Now, create a data frame of the transformed data from your pca.
-D3 <- 
+#--- I did not remove PC5 or PC6 from the data
+
+D3 <- D2.pca$x
 
 #Attach the variable "mean_correct" from your original data frame to D3.
+D3 <- data.frame(D3, mean_correct = D1$mean_correct, cluster = as.factor(D1.1.clustered$cluster))
+
+#Now re-run your correlation plots between the transformed data and mean_correct. If you had dropped some components would you have lost important information about mean_correct?
-#Now re-run your correlation plots between the transformed data and mean_correct. If you had dropped some components would you have lost important infomation about mean_correct?
+ggpairs(D3[,1:7], progress = FALSE)
+
+#---- thank you https://www.blopig.com/blog/2019/06/a-brief-introduction-to-ggpairs/
+D3[,1:7] %>% ggpairs(.,
+  progress = FALSE,
+  lower = list(continuous = wrap("points", alpha = 0.3, size = 0.2)))
+
+D3[,1:7] %>% ggpairs(.,
+  progress = FALSE,
+  lower = list(continuous = wrap("smooth", alpha = 0.3, size = 0.2)))
 
 ```
+#--- Well how do you like that... Looking at the plots, it seems that PC1, PC2, and PC6 have the highest correlations with mean_correct, which is arguably the most important variable... and among those three components it's actually PC6 with the greatest correlation, albeit negative... hmmm, and I considered booting PC6... tsk, tsk.
+
+
 ## Now print out the loadings for the components you generated:
 
 ```{r}
-pca$rotation
+D2.pca$rotation
 
 #Examine the eigenvectors, notice that they are a little difficult to interpret. It is much easier to make sense of them if we make them proportional within each component
 
-loadings <- abs(pca$rotation) #abs() will make all eigenvectors positive
+D2.loadings <- abs(D2.pca$rotation) #abs() will make all eigenvectors positive
+
+#--- as suggested above, make the loadings proportional within each component (each column sums to 1)
+sweep(D2.loadings, 2, colSums(D2.loadings), "/")
 
 #Now examine your components and try to come up with substantive descriptions of what some might represent?
 
 #You can generate a biplot to help you, though these can be a bit confusing. They plot the transformed data by the first two components. Therefore, the axes represent the direction of maximum variance accounted for. Then mapped onto this point cloud are the original directions of the variables, depicted as red arrows. It is supposed to provide a visualization of which variables "go together". Variables that possibly represent the same underlying construct point in the same direction.
 
-biplot(pca)
+biplot(D2.pca)
+
+print(D2.loadings)
+
+
+#--- found this here https://stackoverflow.com/questions/6578355/plotting-pca-biplot-with-ggplot2
+#--- ggbiplot() is defined in the ggbiplot.r file added by this commit, so source it before calling
+source("ggbiplot.r")
+ggbiplot(D2.pca, labels = rownames(D1))
+
+
+#--- found this here https://stackoverflow.com/questions/47482879/how-to-make-a-pretty-biplot-in-r-without-using-external-packages
+#--- I thought it would be good to cluster these observations to see if anything shakes out on the PC graphs with regard to groups... not sure what I should be looking for
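+
+#--- a minimal ggplot2 sketch of the same idea (not part of the assignment): color the PC
+#--- scores by the k-means cluster attached to D3 above, with a 68% normal ellipse per cluster
+ggplot(D3, aes(PC1, PC2, color = cluster)) +
+  geom_point(alpha = 0.5) +
+  stat_ellipse(level = 0.68)
+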
+library(car)
+plot(D3[1:2], pch = 20, col = alpha(D3$cluster, 0.5))
+dataEllipse(D3[,1], D3[,2], as.factor(D3$cluster), lwd = 1,
+            group.labels = NULL, plot.points = FALSE, add = TRUE,
+            fill = TRUE, fill.alpha = 0.02)
+
 ```
+
+
 # Part III
-Also in this repository is a data set collected from TC students (tc-program-combos.csv) that shows how many students thought that a TC program was related to andother TC program. Students were shown three program names at a time and were asked which two of the three were most similar. Use PCA to look for components that represent related programs. Explain why you think there are relationships between these programs.
+Also in this repository is a data set collected from TC students (tc-program-combos.csv) that shows how many students thought that a TC program was related to another TC program. Students were shown three program names at a time and were asked which two of the three were most similar. Use PCA to look for components that represent related programs. Explain why you think there are relationships between these programs.
 
 ```{r}
+#---- read in the data
+TC1 <- read.csv("tc-program-combos.csv")
+
+#---- let's take a look...
+str(TC1)
+head(TC1)
+#summary(TC1)
+
+#---- na's please rise... https://sebastiansauer.github.io/sum-isna/
+TC1.is.na <- TC1 %>%
+  summarise_all(~ sum(is.na(.)))
+TC1.is.na
+sum(TC1.is.na)
+#---- drop rows that contain NA in any column... not needed based on results of previous command
+#---- TC1.1 <- na.omit(TC1)
+
+```
+
+```{r}
+#--- ggpairs is unusable as the matrix has too many variables, wah-wah... if only I could 'reduce the dimensions'... hey wait... ;)
+ggpairs(TC1[,2:20], progress = FALSE)
+
+TC1[,2:20] %>% ggpairs(.,
+  progress = FALSE,
+  lower = list(continuous = wrap("points", alpha = 0.3, size = 0.1)))
+
+TC1[,2:20] %>% ggpairs(.,
+  progress = FALSE,
+  lower = list(continuous = wrap("smooth", alpha = 0.3, size = 0.1)))
+
+```
+
+```{r}
+#--- commence dimension reduction
+TC1.pca <- prcomp(TC1[2:68], scale. = TRUE, center = TRUE)
+
+TC1.pca$sdev
+
+TC1.pca$sdev^2
+
+summary(TC1.pca)
+
+plot(TC1.pca, type = "lines")
+
+#--- the plot above stops at 10 because plot() on a prcomp object calls screeplot(), whose
+#--- default npcs is 10; ask for more components explicitly
+screeplot(TC1.pca, npcs = 33, type = "lines")
+
+## Decide which components you would drop and remove them from your data set.
+#--- Looking primarily at the data table, 90% of the variability in the data sits within the first half (the first 33 PCs) of the PC vectors, so if that's a reasonable threshold, I can drop PC34 and everything after it. If I lower the variability threshold to 80%, I can drop 10 more PCs, leaving me with a more manageable group of 24. Then again, if I want to get it down to the point where I can run a pairs visual... I would have to settle for a representation of only 64% of the data variability. I'm going to choose this for now, only out of convenience. I'd like to have a conversation about how to determine an acceptable threshold.
+
+```
+
+```{r}
+#Now, create a data frame of the transformed data from your pca.
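+
+#--- sanity check, a sketch beyond the assignment: the scores in TC1.pca$x are nothing more
+#--- than the centered-and-scaled data multiplied by the rotation (eigenvector) matrix
+all.equal(unname(TC1.pca$x),
+          unname(scale(TC1[,2:68]) %*% TC1.pca$rotation))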
+
+TC2.pca <- as.data.frame(TC1.pca$x[,1:15])
+ggpairs(TC2.pca, progress = FALSE)
+
+#---- thank you https://www.blopig.com/blog/2019/06/a-brief-introduction-to-ggpairs/
+TC2.pca %>% ggpairs(.,
+  progress = FALSE,
+  lower = list(continuous = wrap("points", alpha = 0.3, size = 0.2)))
+
+TC2.pca %>% ggpairs(.,
+  progress = FALSE,
+  lower = list(continuous = wrap("smooth", alpha = 0.3, size = 0.2)))
+
+
+## Now print out the loadings for the components you generated:
+
+TC1.pca$rotation[,1:15]
+
+TC1.loadings <- abs(TC1.pca$rotation[,1:15])
+
+#--- one way to actually read these loadings: for each component, list the programs that load
+#--- most heavily on it (a sketch; the cutoff of 8 programs is arbitrary)
+head(sort(TC1.loadings[,1], decreasing = TRUE), 8)
+head(sort(TC1.loadings[,2], decreasing = TRUE), 8)
+
+#You can generate a biplot to help you, though these can be a bit confusing. They plot the transformed data by the first two components. Therefore, the axes represent the direction of maximum variance accounted for. Then mapped onto this point cloud are the original directions of the variables, depicted as red arrows. It is supposed to provide a visualization of which variables "go together". Variables that possibly represent the same underlying construct point in the same direction.
+
+biplot(TC1.pca)
+
+ggbiplot(TC1.pca)
+
+```
+
+
+#Explain why you think there are relationships between these programs.
+#--- The task was to find relationships between programs, and I would assume this can be done by locating groups of TC programs with low data variability amongst them. I wasn't able to complete this task...
+#--- I wasn't able to discern a relationship from the pairs graphs: too many charts, and I remember Dr. Lang saying this would be a problem as datasets got bigger.
+#--- I wasn't able to discern a relationship from the loadings table, although I think I should be able to.
+#--- I wasn't able to discern a relationship from the biplot, and I don't think anyone else could.
+#--- In looking at the ggbiplot, it seems some of the program arrows sit near one another, but I don't think that should lead to an interpretation, because it would be difficult to infer relationships from 60+ dimensions squeezed into two.
+#--- I ran a kmeans clustering as seen below... there too, I was not able to discern relationships.
+#--- looks like I'll need a little more help to figure this one out.
+
+
+```{r}
+#--- Clustering, for fun
+
+#--- what class is the data
+class(TC1)
+
+#--- convert to matrix
+TC1.matrix <- as.matrix(TC1[,2:68])
+rownames(TC1.matrix) <- TC1[,1]
+
+#--- how many unique rows... this is the upper limit to k values
+dim(unique(TC1.matrix))
+
+#--- fix the seed again so the elbow plot is reproducible
+set.seed(123)
+
+#--- calculate Total within-cluster sum of squares
+TC1.tot_withinss <- map_dbl(1:67, function(k){
+  model <- kmeans(TC1.matrix, centers = k)
+  model$tot.withinss
+})
+
+#--- Generate a data frame containing both k and tot_withinss
+TC1.elbow_df <- data.frame(
+  k = 1:67,
+  tot_withinss = TC1.tot_withinss
+)
+
+#--- Plot the elbow plot of K values and tot_withinss
+ggplot(TC1.elbow_df, aes(x = k, y = tot_withinss)) +
+  geom_line() +
+  scale_x_continuous(breaks = 1:67)
+
+#--- the elbow plot suggests no natural groupings... skip the rest
+
+#--- kmeans the data table into n groups ... 
+#TC1.groups <- kmeans(TC1.matrix, centers = n) + +#TC1.clustered <- data.frame(TC1.matrix[,0], cluster = as.factor(TC1.groups$cluster)) + + + +``` diff --git a/desktop.ini b/desktop.ini new file mode 100644 index 0000000000000000000000000000000000000000..7e2afc92dc9897151144be5543ed5b34819f64d3 GIT binary patch literal 244 zcmY+9I|{;35JgWdxQ8r2LQo44tdvM9MN4B5Lezj7F)A+KlUFD*%$v{sFmFf9mJ=0c zV{R6-M#hx`4SV|Z+&5vaxpN_At8z1hgpw1aH^6tc(g!=FCb4b<2s@~>+q0#;319^feUH||9 literal 0 HcmV?d00001 diff --git a/ggbiplot.r b/ggbiplot.r new file mode 100644 index 0000000..e0e06e6 --- /dev/null +++ b/ggbiplot.r @@ -0,0 +1,220 @@ +# +# ggbiplot.r +# +# Copyright 2011 Vincent Q. Vu. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +# + +#' Biplot for Principal Components using ggplot2 +#' +#' @param pcobj an object returned by prcomp() or princomp() +#' @param choices which PCs to plot +#' @param scale covariance biplot (scale = 1), form biplot (scale = 0). When scale = 1, the inner product between the variables approximates the covariance and the distance between the points approximates the Mahalanobis distance. +#' @param obs.scale scale factor to apply to observations +#' @param var.scale scale factor to apply to variables +#' @param pc.biplot for compatibility with biplot.princomp() +#' @param groups optional factor variable indicating the groups that the observations belong to. If provided the points will be colored according to groups +#' @param ellipse draw a normal data ellipse for each group? +#' @param ellipse.prob size of the ellipse in Normal probability +#' @param labels optional vector of labels for the observations +#' @param labels.size size of the text used for the labels +#' @param alpha alpha transparency value for the points (0 = transparent, 1 = opaque) +#' @param circle draw a correlation circle? (only applies when prcomp was called with scale = TRUE and when var.scale = 1) +#' @param var.axes draw arrows for the variables? +#' @param varname.size size of the text for variable names +#' @param varname.adjust adjustment factor the placement of the variable names, >= 1 means farther from the arrow +#' @param varname.abbrev whether or not to abbreviate the variable names +#' +#' @return a ggplot2 plot +#' @export +#' @examples +#' data(wine) +#' wine.pca <- prcomp(wine, scale. = TRUE) +#' print(ggbiplot(wine.pca, obs.scale = 1, var.scale = 1, groups = wine.class, ellipse = TRUE, circle = TRUE)) +#' +ggbiplot <- function(pcobj, choices = 1:2, scale = 1, pc.biplot = TRUE, + obs.scale = 1 - scale, var.scale = scale, + groups = NULL, ellipse = FALSE, ellipse.prob = 0.68, + labels = NULL, labels.size = 3, alpha = 1, + var.axes = TRUE, + circle = FALSE, circle.prob = 0.69, + varname.size = 3, varname.adjust = 1.5, + varname.abbrev = FALSE, ...) 
+{ + library(ggplot2) + library(plyr) + library(scales) + library(grid) + + stopifnot(length(choices) == 2) + + # Recover the SVD + if(inherits(pcobj, 'prcomp')){ + nobs.factor <- sqrt(nrow(pcobj$x) - 1) + d <- pcobj$sdev + u <- sweep(pcobj$x, 2, 1 / (d * nobs.factor), FUN = '*') + v <- pcobj$rotation + } else if(inherits(pcobj, 'princomp')) { + nobs.factor <- sqrt(pcobj$n.obs) + d <- pcobj$sdev + u <- sweep(pcobj$scores, 2, 1 / (d * nobs.factor), FUN = '*') + v <- pcobj$loadings + } else if(inherits(pcobj, 'PCA')) { + nobs.factor <- sqrt(nrow(pcobj$call$X)) + d <- unlist(sqrt(pcobj$eig)[1]) + u <- sweep(pcobj$ind$coord, 2, 1 / (d * nobs.factor), FUN = '*') + v <- sweep(pcobj$var$coord,2,sqrt(pcobj$eig[1:ncol(pcobj$var$coord),1]),FUN="/") + } else if(inherits(pcobj, "lda")) { + nobs.factor <- sqrt(pcobj$N) + d <- pcobj$svd + u <- predict(pcobj)$x/nobs.factor + v <- pcobj$scaling + d.total <- sum(d^2) + } else { + stop('Expected a object of class prcomp, princomp, PCA, or lda') + } + + # Scores + choices <- pmin(choices, ncol(u)) + df.u <- as.data.frame(sweep(u[,choices], 2, d[choices]^obs.scale, FUN='*')) + + # Directions + v <- sweep(v, 2, d^var.scale, FUN='*') + df.v <- as.data.frame(v[, choices]) + + names(df.u) <- c('xvar', 'yvar') + names(df.v) <- names(df.u) + + if(pc.biplot) { + df.u <- df.u * nobs.factor + } + + # Scale the radius of the correlation circle so that it corresponds to + # a data ellipse for the standardized PC scores + r <- sqrt(qchisq(circle.prob, df = 2)) * prod(colMeans(df.u^2))^(1/4) + + # Scale directions + v.scale <- rowSums(v^2) + df.v <- r * df.v / sqrt(max(v.scale)) + + # Change the labels for the axes + if(obs.scale == 0) { + u.axis.labs <- paste('standardized PC', choices, sep='') + } else { + u.axis.labs <- paste('PC', choices, sep='') + } + + # Append the proportion of explained variance to the axis labels + u.axis.labs <- paste(u.axis.labs, + sprintf('(%0.1f%% explained var.)', + 100 * pcobj$sdev[choices]^2/sum(pcobj$sdev^2))) + + # Score Labels + if(!is.null(labels)) { + df.u$labels <- labels + } + + # Grouping variable + if(!is.null(groups)) { + df.u$groups <- groups + } + + # Variable Names + if(varname.abbrev) { + df.v$varname <- abbreviate(rownames(v)) + } else { + df.v$varname <- rownames(v) + } + + # Variables for text label placement + df.v$angle <- with(df.v, (180/pi) * atan(yvar / xvar)) + df.v$hjust = with(df.v, (1 - varname.adjust * sign(xvar)) / 2) + + # Base plot + g <- ggplot(data = df.u, aes(x = xvar, y = yvar)) + + xlab(u.axis.labs[1]) + ylab(u.axis.labs[2]) + coord_equal() + + if(var.axes) { + # Draw circle + if(circle) + { + theta <- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50)) + circle <- data.frame(xvar = r * cos(theta), yvar = r * sin(theta)) + g <- g + geom_path(data = circle, color = muted('white'), + size = 1/2, alpha = 1/3) + } + + # Draw directions + g <- g + + geom_segment(data = df.v, + aes(x = 0, y = 0, xend = xvar, yend = yvar), + arrow = arrow(length = unit(1/2, 'picas')), + color = muted('red')) + } + + # Draw either labels or points + if(!is.null(df.u$labels)) { + if(!is.null(df.u$groups)) { + g <- g + geom_text(aes(label = labels, color = groups), + size = labels.size) + } else { + g <- g + geom_text(aes(label = labels), size = labels.size) + } + } else { + if(!is.null(df.u$groups)) { + g <- g + geom_point(aes(color = groups), alpha = alpha) + } else { + g <- g + geom_point(alpha = alpha) + } + } + + # Overlay a concentration ellipse if there are groups + if(!is.null(df.u$groups) && ellipse) { + theta <- 
c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50)) + circle <- cbind(cos(theta), sin(theta)) + + ell <- ddply(df.u, 'groups', function(x) { + if(nrow(x) <= 2) { + return(NULL) + } + sigma <- var(cbind(x$xvar, x$yvar)) + mu <- c(mean(x$xvar), mean(x$yvar)) + ed <- sqrt(qchisq(ellipse.prob, df = 2)) + data.frame(sweep(circle %*% chol(sigma) * ed, 2, mu, FUN = '+'), + groups = x$groups[1]) + }) + names(ell)[1:2] <- c('xvar', 'yvar') + g <- g + geom_path(data = ell, aes(color = groups, group = groups)) + } + + # Label the variable axes + if(var.axes) { + g <- g + + geom_text(data = df.v, + aes(label = varname, x = xvar, y = yvar, + angle = angle, hjust = hjust), + color = 'darkred', size = varname.size) + } + # Change the name of the legend for groups + # if(!is.null(groups)) { + # g <- g + scale_color_brewer(name = deparse(substitute(groups)), + # palette = 'Dark2') + # } + + # TODO: Add a second set of axes + + return(g) +} diff --git a/ggscreeplot.r b/ggscreeplot.r new file mode 100644 index 0000000..d6347e0 --- /dev/null +++ b/ggscreeplot.r @@ -0,0 +1,48 @@ +# +# ggscreeplot.r +# +# Copyright 2011 Vincent Q. Vu. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +# + +#' Screeplot for Principal Components +#' +#' @param pcobj an object returned by prcomp() or princomp() +#' @param type the type of scree plot. 'pev' corresponds proportion of explained variance, i.e. the eigenvalues divided by the trace. 'cev' corresponds to the cumulative proportion of explained variance, i.e. the partial sum of the first k eigenvalues divided by the trace. +#' @export +#' @examples +#' data(wine) +#' wine.pca <- prcomp(wine, scale. = TRUE) +#' print(ggscreeplot(wine.pca)) +#' +ggscreeplot <- function(pcobj, type = c('pev', 'cev')) +{ + type <- match.arg(type) + d <- pcobj$sdev^2 + yvar <- switch(type, + pev = d / sum(d), + cev = cumsum(d) / sum(d)) + + yvar.lab <- switch(type, + pev = 'proportion of explained variance', + cev = 'cumulative proportion of explained variance') + + df <- data.frame(PC = 1:length(d), yvar = yvar) + + ggplot(data = df, aes(x = PC, y = yvar)) + + xlab('principal component number') + ylab(yvar.lab) + + geom_point() + geom_path() +}