From 2f7eed7819f4b85c53d18b9179ab67edfed307af Mon Sep 17 00:00:00 2001 From: zhihao guo Date: Thu, 17 Dec 2020 11:57:30 +0800 Subject: [PATCH] Aaaugnment 5 --- assignment5.Rmd | 228 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 163 insertions(+), 65 deletions(-) diff --git a/assignment5.Rmd b/assignment5.Rmd index 288bcb3..6eb1e8b 100644 --- a/assignment5.Rmd +++ b/assignment5.Rmd @@ -1,114 +1,212 @@ --- -title: "Principle Component Aanalysis" -output: html_document ---- -## Data -The data you will be using comes from the Assistments online intelligent tutoring system (https://www.assistments.org/). It describes students working through online math problems. Each student has the following data associated with them: + title: "Principle Component Aanalysis" + output: html_document + output: + html_document: + toc: yes + toc_float: yes + --- + + ```{r setup, include=FALSE} + knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE) + ``` + + ## Data + + The data you will be using comes from the Assistments online intelligent tutoring system (https://www.assistments.org/). It describes students working through online math problems. Each student has the following data associated with them: + + - id + @@ -14,13 +23,22 @@ The data you will be using comes from the Assistments online intelligent tutorin + - mean_attempt: The average number of attempts a student took to answer a problem in the current session + - mean_confidence: The average confidence each student has in their ability to answer the problems in the current session + + ## Start by uploading the data + ```{r} + D1 <- + library(tidyverse) + library(corrplot) + library(formattable) + ``` + + + **Start by uploading the data** + + ```{r} + D1 <- read.csv("Assistments-confidence.csv") + ``` + + ## Create a correlation matrix of the relationships between the variables, including correlation coefficients for each pair of variables/features. + ## Part I + + **Create a correlation matrix of the relationships between the variables, including correlation coefficients for each pair of variables/features.** + + ```{r} + #You can install the corrplot package to plot some pretty correlation matrices (sometimes called correlograms) + @@ -35,27 +53,27 @@ ggcorr(D1[,-1], method = c("everything", "pearson")) #ggcorr() doesn't have an e + #Study your correlogram images and save them, you will need them later. Take note of what is strongly related to the outcome variable of interest, mean_correct. + ``` + + ## Create a new data frame with the mean_correct variable removed, we want to keep that variable intact. The other variables will be included in our PCA. + **Create a new data frame with the mean_correct variable removed, we want to keep that variable intact. The other variables will be included in our PCA.** + + ```{r} + D2 <- + + D2 <- D1[,c(-1,-5)] + # remove id and mean_correct. ID, as an arbitrarily assigned number, should not be used in a predictive model. + ``` + + ## Now run the PCA on the new data frame + **Now run the PCA on the new data frame** + + ```{r} + pca <- prcomp(D2, scale. = TRUE) + ``` + + ## Although princomp does not generate the eigenvalues directly for us, we can print a list of the standard deviation of the variance accounted for by each component. + **Although princomp does not generate the eigenvalues directly for us, we can print a list of the standard deviation of the variance accounted for by each component.** -- id -- prior_prob_count: How many problems a student has answered in the system prior to this session -- prior_percent_correct: The percentage of problems a student has answered correctly prior to this session -- problems_attempted: The number of problems the student has attempted in the current session -- mean_correct: The average number of correct answers a student made on their first attempt at problems in the current session -- mean_hint: The average number of hints a student asked for in the current session -- mean_attempt: The average number of attempts a student took to answer a problem in the current session -- mean_confidence: The average confidence each student has in their ability to answer the problems in the current session + ```{r} + pca$sdev + # pca$sdev -## Start by uploading the data -```{r} -D1 <- + #To convert this into variance accounted for we can square it, these numbers are proportional to the eigenvalue -``` + pca$sdev^2 + # pca$sdev^2 -## Create a correlation matrix of the relationships between the variables, including correlation coefficients for each pair of variables/features. + #A summary of our pca will give us the proportion of variance accounted for by each component -```{r} -#You can install the corrplot package to plot some pretty correlation matrices (sometimes called correlograms) + @@ -66,49 +84,122 @@ summary(pca) + plot(pca, type = "lines") + ``` -library(ggplot2) -library(GGally) + ## Decide which components you would drop and remove them from your data set. + **Decide which components you would drop and remove them from your data set.** -ggpairs(D1, 2:8, progress = FALSE) #ggpairs() draws a correlation plot between all the columns you identify by number (second option, you don't need the first column as it is the student ID) and progress = FALSE stops a progress bar appearing as it renders your plot + #### I would drop PC6 from the dataset, given that it accounts for less than 10% of the variance in the data. -ggcorr(D1[,-1], method = c("everything", "pearson")) #ggcorr() doesn't have an explicit option to choose variables so we need to use matrix notation to drop the id variable. We then need to choose a "method" which determines how to treat missing values (here we choose to keep everything, and then which kind of correlation calculation to use, here we are using Pearson correlation, the other options are "kendall" or "spearman") -#Study your correlogram images and save them, you will need them later. Take note of what is strongly related to the outcome variable of interest, mean_correct. -``` + ## Part II -## Create a new data frame with the mean_correct variable removed, we want to keep that variable intact. The other variables will be included in our PCA. + ```{r} + #Now, create a data frame of the transformed data from your pca. -```{r} -D2 <- + D3 <- -``` + D3 <- pca$x + #Attach the variable "mean_correct" from your original data frame to D3. + D4 <- data.frame(D3,D1[,5]) -## Now run the PCA on the new data frame -```{r} -pca <- prcomp(D2, scale. = TRUE) -``` + #Now re-run your correlation plots between the transformed data and mean_correct. If you had dropped some components would you have lost important information about mean_correct? -## Although princomp does not generate the eigenvalues directly for us, we can print a list of the standard deviation of the variance accounted for by each component. + #Now re-run your correlation plots between the transformed data and mean_correct. If you had dropped some components would you have lost important infomation about mean_correct? -```{r} -pca$sdev + ggpairs(D4, 1:7, progress = FALSE) -#To convert this into variance accounted for we can square it, these numbers are proportional to the eigenvalue + ggcorr(D4, method = c("everything", "pearson")) -pca$sdev^2 -#A summary of our pca will give us the proportion of variance accounted for by each component + ``` + ## Now print out the loadings for the components you generated: -summary(pca) + #### Yes, PC4 and PC6 were highly significant in varying with mean_correct, so if either of those 2 principal components were dropped, then we would have lost some information about mean_correct -#We can look at this to get an idea of which components we should keep and which we should drop -plot(pca, type = "lines") -``` -## Decide which components you would drop and remove them from your data set. + **Now print out the loadings for the components you generated:** -## Part II + ```{r} + pca$rotation + # pca$rotation -```{r} -#Now, create a data frame of the transformed data from your pca. + #Examine the eigenvectors, notice that they are a little difficult to interpret. It is much easier to make sense of them if we make them proportional within each component -D3 <- + loadings <- abs(pca$rotation) #abs() will make all eigenvectors positive -#Attach the variable "mean_correct" from your original data frame to D3. + loadings %>% formattable + #Now examine your components and try to come up with substantive descriptions of what some might represent? + #You can generate a biplot to help you, though these can be a bit confusing. They plot the transformed data by the first two components. Therefore, the axes represent the direction of maximum variance accounted for. Then mapped onto this point cloud are the original directions of the variables, depicted as red arrows. It is supposed to provide a visualization of which variables "go together". Variables that possibly represent the same underlying construct point in the same direction. + biplot(pca) + # biplot(pca) -#Now re-run your correlation plots between the transformed data and mean_correct. If you had dropped some components would you have lost important infomation about mean_correct? + ``` + # Part III + Also in this repository is a data set collected from TC students (tc-program-combos.csv) that shows how many students thought that a TC program was related to andother TC program. Students were shown three program names at a time and were asked which two of the three were most similar. Use PCA to look for components that represent related programs. Explain why you think there are relationships between these programs. + #### PC1 represents a mix of hints used and problems attempted. PC1 correlates strongly with mean_correct because the use of hints is a strong negative predictor of the number of answers correct, given the correlation analysis done above. -``` -## Now print out the loadings for the components you generated: + #### PC2 represents a mix of prior_percent_correct and prior_prob_count attempted. PC2 correlates strongly with mean_correct because prior_percent_correct is a strong positive predictor of the number of answers correct. -```{r} -pca$rotation + #### PC3 represents a mix of mean_confidence and prior_prob_count attempted. PC3 does not correlate strongly with mean_correct because neither mean confidence nor prior_prob_count is a strong predictor of the number of answers correct. -#Examine the eigenvectors, notice that they are a little difficult to interpret. It is much easier to make sense of them if we make them proportional within each component + #### PC5 represents a mix of mean_attempt and problems_attempted attempted. PC5 does not correlate strongly with mean_correct because neither mean_attempt nor problems_attempted is a strong predictor of the number of answers correct, given the correlation analysis done above. -loadings <- abs(pca$rotation) #abs() will make all eigenvectors positive -#Now examine your components and try to come up with substantive descriptions of what some might represent? -#You can generate a biplot to help you, though these can be a bit confusing. They plot the transformed data by the first two components. Therefore, the axes represent the direction of maximum variance accounted for. Then mapped onto this point cloud are the original directions of the variables, depicted as red arrows. It is supposed to provide a visualization of which variables "go together". Variables that possibly represent the same underlying construct point in the same direction. + ## Part III -biplot(pca) + Also in this repository is a data set collected from TC students (tc-program-combos.csv) that shows how many students thought that a TC program was related to another TC program. Students were shown three program names at a time and were asked which two of the three were most similar. Use PCA to look for components that represent related programs. Explain why you think there are relationships between these programs. + ```{r} + programs <- read.csv("tc-program-combos.csv") %>% .[,-1] %>% mutate_all(as.numeric) -``` -# Part III -Also in this repository is a data set collected from TC students (tc-program-combos.csv) that shows how many students thought that a TC program was related to andother TC program. Students were shown three program names at a time and were asked which two of the three were most similar. Use PCA to look for components that represent related programs. Explain why you think there are relationships between these programs. + pca <- prcomp(programs, scale. = TRUE) + pca_associations <- abs(pca$rotation) %>% as.data.frame + pca_associations[,ncol(pca_associations)+1] <- rownames(abs(pca$rotation)) + rownames(pca_associations) <- c() + colnames(pca_associations)[68] <- "Program" -```{r} + PC1_loadings <- pca_associations %>% arrange(-PC1) %>% select(Program,load = PC1) %>% .[1:3,] + PC2_loadings <- pca_associations %>% arrange(-PC2) %>% select(Program,load = PC2) %>% .[1:3,] + PC3_loadings <- pca_associations %>% arrange(-PC3) %>% select(Program,load = PC3) %>% .[1:3,] + PC4_loadings <- pca_associations %>% arrange(-PC4) %>% select(Program,load = PC4) %>% .[1:3,] + PC5_loadings <- pca_associations %>% arrange(-PC5) %>% select(Program,load = PC5) %>% .[1:3,] -``` + rbind(PC1_loadings,PC2_loadings,PC3_loadings,PC4_loadings,PC5_loadings) %>% mutate(PC = rep(1:5, each=3)) %>% formattable + ``` + #### The top 3 loadings from the top 10 principal components can be found in the table above. The greatest contributors to principal component 1 were Change Leadership, Economics and Education, and Education Policy which all intuitively go together. Interestingly, Clincial Psychology, Neuroscience, and Kinseiology were grouped together for component 2. These might be paired becuase they are not necessarily thought of as "education" fields but rather health/biology fields. For component 3, Game Design/Development, Cognitive Science, and Math were paired together which also cannot be immediately explained, and this is the power of PCA in finding variables that contribute to an underlying pattern that can't be explained by simple correlation or common sense. Linguisitics, English Education, and Teaching English make intuitive sense for component 4, all relating to the english language. Lastly, History, Music and Psychological Counseling are also fields not necessarily associated with education. + ```{r} + df <- programs + corr_simple <- function(data=df,sig=0.3){ + #convert data to numeric in order to run correlations + #convert to factor first to keep the integrity of the data - each value will become a number rather than turn into NA + df_cor <- data %>% mutate_if(is.character, as.factor) + df_cor <- df_cor %>% mutate_if(is.factor, as.numeric) + #run a correlation and drop the insignificant ones + corr <- cor(df_cor) + #prepare to drop duplicates and correlations of 1 + corr[lower.tri(corr,diag=TRUE)] <- NA + #drop perfect correlations + corr[corr == 1] <- NA + #turn into a 3-column table + corr <- as.data.frame(as.table(corr)) + #remove the NA values from above + corr <- na.omit(corr) + #select significant values + corr <- subset(corr, abs(Freq) > sig) + #sort by highest correlation + corr <- corr[order(-abs(corr$Freq)),] + #print table + corr %>% head(10) %>% select(Program1 = Var1, Program2 = Var2, Correlation = Freq) %>% formattable + } + corr_simple() + ``` + #### When running a simple correlation on all combinations of programs, the top 10 most highly correlated programs emerge, and they make intuitive sense. This might be a simple and more effective way to look at correlations amongst variables as opposed to looking at loadings of components in PCA analysis. However, simple correlations will miss complicated patterns amongst many variables, which may also be present. + ```{r} + # Experiment with alternative way of transforming data + # mu = colMeans(D2) + # D3 <- pca$x %*% t(pca$rotation) %>% scale(center = -mu,scale = FALSE) + # #Attach the variable "mean_correct" from your original data frame to D3. + # + # D4 <- data.frame(D3,D1[,5]) + ``` \ No newline at end of file