From d8501fe594236ebce50b8f1b9d42fa793506f4ea Mon Sep 17 00:00:00 2001
From: Xie <394637464@qq.com>
Date: Tue, 15 Dec 2020 21:00:00 +0800
Subject: [PATCH 1/2] Xingyi Xie Assignment 5

---
 assignment5.Rmd  | 110 +++++++-
 assignment5.html | 679 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 780 insertions(+), 9 deletions(-)
 create mode 100644 assignment5.html

diff --git a/assignment5.Rmd b/assignment5.Rmd
index 288bcb3..84c8d38 100644
--- a/assignment5.Rmd
+++ b/assignment5.Rmd
@@ -1,6 +1,7 @@
 ---
 title: "Principal Component Analysis"
 output: html_document
+author: Xingyi Xie
 ---
 ## Data
 The data you will be using comes from the Assistments online intelligent tutoring system (https://www.assistments.org/). It describes students working through online math problems. Each student has the following data associated with them:
@@ -16,7 +17,7 @@ The data you will be using comes from the Assistments online intelligent tutorin
 ## Start by uploading the data
 ```{r}
-D1 <- 
+D1 <- read.csv("Assistments-confidence.csv")
 ```
@@ -34,11 +35,12 @@ ggcorr(D1[,-1], method = c("everything", "pearson")) #ggcorr() doesn't have an e
 #Study your correlogram images and save them, you will need them later. Take note of what is strongly related to the outcome variable of interest, mean_correct.
 ```
+Note: mean_correct correlates most strongly (and negatively) with mean_hint.
 ## Create a new data frame with the mean_correct variable removed; we want to keep that variable intact. The other variables will be included in our PCA.
 ```{r}
-D2 <- 
+D2 <- D1[,-5]
 ```
@@ -67,24 +69,28 @@ plot(pca, type = "lines")
 ```
 ## Decide which components you would drop and remove them from your data set.
+Note: remove PC7, which accounts for only about 3% of the variance.
 ## Part II
 ```{r}
 #Now, create a data frame of the transformed data from your pca.
-D3 <- 
+D3 <- predict(pca)
 #Attach the variable "mean_correct" from your original data frame to D3.
-
-
+D4 <- as.data.frame(cbind(D3,D1$mean_correct))
 #Now re-run your correlation plots between the transformed data and mean_correct. If you had dropped some components would you have lost important information about mean_correct?
-
+ggpairs(D4, 1:8, progress = FALSE)
+ggcorr(D4, method = c("everything", "pearson"))
 ```
-## Now print out the loadings for the components you generated:
+
+## Answer: PC6 negatively correlates with mean_correct and PC3 positively correlates with mean_correct.
+
+## Now print out the loadings for the components you generated:
 ```{r}
 pca$rotation
@@ -92,23 +98,109 @@ pca$rotation
 #Examine the eigenvectors, notice that they are a little difficult to interpret. It is much easier to make sense of them if we make them proportional within each component
 loadings <- abs(pca$rotation) #abs() will make all eigenvectors positive
+```
 #Now examine your components and try to come up with substantive descriptions of what some might represent?
-#You can generate a biplot to help you, though these can be a bit confusing. They plot the transformed data by the first two components. Therefore, the axes represent the direction of maximum variance accounted for. Then mapped onto this point cloud are the original directions of the variables, depicted as red arrows. It is supposed to provide a visualization of which variables "go together". Variables that possibly represent the same underlying construct point in the same direction.
+## Answer: mean_hint has the dominant loading on PC6 (0.78, about 61% of that component) and prior_percent_correct the dominant loading on PC3 (0.78, also about 61%).
+Therefore, we conclude that PC6 largely represents mean_hint, which makes mean_hint the most important predictor of mean_correct.
+```{r}
 biplot(pca)
+library(ggbiplot)
+library(factoextra)
+library(FactoMineR)
+
+get_pca_var(pca)$contrib # contrib = (var.cos2 * 100) / (total cos2 of the component)
+fviz_eig(pca, addlabels = TRUE) + labs(title = "Principal component analysis")
+pca1 <- fviz_pca_biplot(pca, palette = "jco",
+                addEllipses = TRUE, label = "var",
+                col.var = "black", repel = TRUE)
+pca1
 ```
+
 # Part III
-Also in this repository is a data set collected from TC students (tc-program-combos.csv) that shows how many students thought that a TC program was related to andother TC program. Students were shown three program names at a time and were asked which two of the three were most similar. Use PCA to look for components that represent related programs. Explain why you think there are relationships between these programs.
+Also in this repository is a data set collected from TC students (tc-program-combos.csv) that shows how many students thought that a TC program was related to another TC program. Students were shown three program names at a time and were asked which two of the three were most similar. Use PCA to look for components that represent related programs. Explain why you think there are relationships between these programs.
+
+```{r}
+D1 <- read.csv("tc-program-combos.csv")
+```
+
+```{r}
+pca <- prcomp(D1[,-1], rank. = 5) # rank. = 5 returns only the first five components
+```
+
+```{r}
+pca$sdev
+
+pca$sdev^2
+
+summary(pca)
+
+plot(pca, type = "lines")
+```
+
+```{r}
+D3 <- as.data.frame(predict(pca))
+
+ggcorr(D1[,-1], method = c("everything", "pearson"))
+```
+
+```{r message=FALSE, warning=FALSE}
+loadings <- as.data.frame(abs(pca$rotation)) #abs() will make all eigenvectors positive
+ggcorr(loadings, method = c("everything", "pearson")) # PC5 and PC2 loadings look related
+ggpairs(loadings, 1:5, progress = FALSE) # PC1~PC3, PC2~PC3 and PC2~PC5 look related
+
+library("corrplot")
+corrplot(get_pca_var(pca)$cos2, is.corr = FALSE) # visualize cos2 contributions; no clear pattern emerges
+
+which(loadings$PC1 > 0.25)
+rownames(loadings[which(loadings$PC1 > 0.25),])
+rownames(loadings[which(loadings$PC1 > 0.2),])
+
+rownames(loadings[which(loadings$PC2 > 0.25),])
+rownames(loadings[which(loadings$PC2 > 0.2),])
+
+rownames(loadings[which(loadings$PC3 > 0.25),])
+rownames(loadings[which(loadings$PC3 > 0.2),])
+
+rownames(loadings[which(loadings$PC5 > 0.25),])
+rownames(loadings[which(loadings$PC5 > 0.2),])
+```
+
+## Conclusion:
+
+- Firstly, we found that PC1 is correlated with PC3. From the loadings table, "Economics.and.Education" and "Change.Leadership" cluster on PC1, each with an absolute loading above 0.25, and "Arts.Administration", "History", "Politics" and "School.Principals" also load above 0.2. In my opinion, these programs share a concern with leadership, policy and administration.
+- For PC3, "Clinical.Psychology", "Neuroscience" and "Psychology" all have absolute loadings above 0.25, and "Physiology", "Behavior.Analysis", "Cognitive.Science" and "Creative.Technologies" each load above 0.2. These programs share a focus on cognitive and behavioral science.
+- For PC5, the high-loading programs ("Learning.Analytics", "Literacy", "Measurement..Evaluation.and.Statistics", "Science.Education" and "Creative.Technologies") are more math and measurement related, requiring statistics and computation.
+- However, we also found that PC2 correlates with both PC3 and PC5. Therefore, considering that the programs loading highest on PC2 ("Linguistics", "Creative.Technologies", "Design.and.Development.of.Digital.Games") do not share one obvious theme, I think PC2 is a mixture of PC3 and PC5: it represents programs of both cognitive and behavioral science and mathematics.
+
+```{r message=FALSE, warning=FALSE}
+biplot(pca)
+library(ggbiplot)
+library(factoextra)
+library(FactoMineR)
+
+fviz_eig(pca, addlabels = TRUE) + labs(title = "Principal component analysis")
+pca2 <- fviz_pca_biplot(pca)
+pca2
+```

diff --git a/assignment5.html b/assignment5.html
new file mode 100644
index 0000000..b678e9f
--- /dev/null
+++ b/assignment5.html
@@ -0,0 +1,679 @@
+The data you will be using comes from the Assistments online intelligent tutoring system (https://www.assistments.org/). It describes students working through online math problems. Each student has the following data associated with them:
+D1 <- read.csv("Assistments-confidence.csv")
+#You can install the corrplot package to plot some pretty correlation matrices (sometimes called correlograms)
+
+library(ggplot2)
+## Warning: package 'ggplot2' was built under R version 3.6.3
+library(GGally)
+## Warning: package 'GGally' was built under R version 3.6.3
+## Registered S3 method overwritten by 'GGally':
+## method from
+## +.gg ggplot2
+ggpairs(D1, 2:8, progress = FALSE) #ggpairs() draws a correlation plot between all the columns you identify by number (second option, you don't need the first column as it is the student ID) and progress = FALSE stops a progress bar appearing as it renders your plot
+
+ggcorr(D1[,-1], method = c("everything", "pearson")) #ggcorr() doesn't have an explicit option to choose variables so we need to use matrix notation to drop the id variable. We then need to choose a "method" which determines how to treat missing values (here we choose to keep everything, and then which kind of correlation calculation to use, here we are using Pearson correlation, the other options are "kendall" or "spearman")
+
+#Study your correlogram images and save them, you will need them later. Take note of what is strongly related to the outcome variable of interest, mean_correct.
+Note: mean_correct correlates most strongly (and negatively) with mean_hint.
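+A minimal sketch to quantify that note (using the D1 loaded above; the exact value depends on the data file):
+cor(D1$mean_correct, D1$mean_hint, use = "pairwise.complete.obs") # single Pearson correlation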
+D2 <- D1[,-5]
+pca <- prcomp(D2, scale. = TRUE)
+pca$sdev
+## [1] 1.4502917 1.1428589 1.0462819 0.9976299 0.8557032 0.7385185 0.4721430
+#To convert this into variance accounted for we can square it, these numbers are proportional to the eigenvalue
+
+pca$sdev^2
+## [1] 2.1033459 1.3061264 1.0947058 0.9952654 0.7322279 0.5454096 0.2229190
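+Since these squared standard deviations are the component variances, the proportion of variance explained can be computed by hand; a sketch that should reproduce the "Proportion of Variance" row of summary(pca) below:
+pve <- pca$sdev^2 / sum(pca$sdev^2) # each eigenvalue over the total variance
+round(pve, 4)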
+#A summary of our pca will give us the proportion of variance accounted for by each component
+
+summary(pca)
+## Importance of components:
+## PC1 PC2 PC3 PC4 PC5 PC6 PC7
+## Standard deviation 1.4503 1.1429 1.0463 0.9976 0.8557 0.73852 0.47214
+## Proportion of Variance 0.3005 0.1866 0.1564 0.1422 0.1046 0.07792 0.03185
+## Cumulative Proportion 0.3005 0.4871 0.6434 0.7856 0.8902 0.96815 1.00000
+#We can look at this to get an idea of which components we should keep and which we should drop
+
+plot(pca, type = "lines")
+
+Note: remove PC7, which accounts for only about 3% of the variance.
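+A sketch of one way to do that, assuming we keep the scores on PC1-PC6 only (predict(pca) returns the rotated data; D2_scores is a hypothetical name):
+D2_scores <- predict(pca)[, 1:6] # drops the PC7 column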
+#Now, create a data frame of the transformed data from your pca.
+
+D3 <- predict(pca)
+
+#Attach the variable "mean_correct" from your original data frame to D3.
+D4 <- as.data.frame(cbind(D3,D1$mean_correct))
+
+#Now re-run your correlation plots between the transformed data and mean_correct. If you had dropped some components would you have lost important information about mean_correct?
+ggpairs(D4, 1:8, progress = FALSE)
+
+ggcorr(D4, method = c("everything", "pearson"))
+
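+One readability tweak (a sketch): cbind() leaves the appended column with an automatic name (typically V8 here), so labelling it makes the two plots above easier to read:
+colnames(D4)[8] <- "mean_correct" # rename the appended column, then re-draw the plots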
+## Now print out the loadings for the components you generated:
+pca$rotation
+## PC1 PC2 PC3 PC4
+## id 0.58205514 -0.3385544 0.062469916 -0.10516164
+## prior_prob_count -0.50027988 0.5120349 0.097545822 0.10595670
+## prior_percent_correct 0.10259004 0.2656193 0.781280744 -0.28670059
+## problems_attempted -0.24807198 -0.4255122 0.527707102 -0.08843549
+## mean_hint -0.47081307 -0.3669473 -0.115313460 -0.02244680
+## mean_attempt -0.34179563 -0.4434016 -0.009533538 -0.27383776
+## mean_confidence -0.01944837 0.2008274 -0.290378811 -0.90122426
+## PC5 PC6 PC7
+## id -0.00563084 0.10995181 0.720770551
+## prior_prob_count -0.02074912 -0.12780116 0.670846181
+## prior_percent_correct 0.29982995 0.34978664 -0.118642884
+## problems_attempted -0.58767798 -0.35448987 -0.008692926
+## mean_hint -0.10797157 0.78055600 0.094647492
+## mean_attempt 0.70023474 -0.33740750 0.085558668
+## mean_confidence -0.24957526 -0.02126521 0.005007913
+#Examine the eigenvectors, notice that they are a little difficult to interpret. It is much easier to make sense of them if we make them proportional within each component
+
+loadings <- abs(pca$rotation) #abs() will make all eigenvectors positive
+#Now examine your components and try to come up with substantive descriptions of what some might represent?
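+The comment above suggests making the loadings proportional within each component; a sketch of that normalization (after it, each column sums to 1):
+prop_loadings <- sweep(loadings, 2, colSums(loadings), "/") # divide each column by its column sum
+round(prop_loadings, 2)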
+biplot(pca)
+
+library(ggbiplot)
+## Loading required package: plyr
+## Loading required package: scales
+## Loading required package: grid
+library(factoextra)
+## Warning: package 'factoextra' was built under R version 3.6.3
+## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
+library(FactoMineR)
+## Warning: package 'FactoMineR' was built under R version 3.6.3
+get_pca_var(pca)$contrib # contrib = (var.cos2 * 100) / (total cos2 of the component)
+## Dim.1 Dim.2 Dim.3 Dim.4
+## id 33.8788182 11.461906 0.390249044 1.10589716
+## prior_prob_count 25.0279953 26.217977 0.951518740 1.12268226
+## prior_percent_correct 1.0524715 7.055363 61.039960072 8.21972261
+## problems_attempted 6.1539706 18.106063 27.847478517 0.78208362
+## mean_hint 22.1664951 13.465031 1.329719415 0.05038588
+## mean_attempt 11.6824253 19.660495 0.009088835 7.49871171
+## mean_confidence 0.0378239 4.033164 8.431985377 81.22051675
+## Dim.5 Dim.6 Dim.7
+## id 0.003170636 1.20893999 51.951018720
+## prior_prob_count 0.043052608 1.63331366 45.003459921
+## prior_percent_correct 8.989799603 12.23506959 1.407613399
+## problems_attempted 34.536540949 12.56630660 0.007556697
+## mean_hint 1.165786099 60.92676732 0.895814778
+## mean_attempt 49.032868819 11.38438192 0.732028565
+## mean_confidence 6.228781286 0.04522092 0.002507919
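+This table can also be reproduced from the rotation matrix: each column of pca$rotation has unit length, so its squared entries times 100 give each variable's percent contribution to that component (a sketch; e.g. 0.7806^2 is 0.6093, matching the 60.93 shown for mean_hint on Dim.6):
+round(100 * pca$rotation^2, 2) # should match get_pca_var(pca)$contrib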
+fviz_eig(pca, addlabels = TRUE) + labs(title = "Principal component analysis")
+
+pca1 <- fviz_pca_biplot(pca, palette = "jco",
+ addEllipses = TRUE, label = "var",
+ col.var = "black", repel = TRUE)
+pca1
+
+Also in this repository is a data set collected from TC students (tc-program-combos.csv) that shows how many students thought that a TC program was related to another TC program. Students were shown three program names at a time and were asked which two of the three were most similar. Use PCA to look for components that represent related programs. Explain why you think there are relationships between these programs.
+D1 <- read.csv("tc-program-combos.csv")
+pca <- prcomp(D1[,-1], rank. = 5) # rank. = 5 returns only the first five components
+pca$sdev
+## [1] 3.07682469 2.87855642 2.61437939 2.30710089 1.99111866 1.92834628
+## [7] 1.86056358 1.81852417 1.74054872 1.61068941 1.57425512 1.51914481
+## [13] 1.49727287 1.47237670 1.44408492 1.39563098 1.36261767 1.34904801
+## [19] 1.28172320 1.24871362 1.23780857 1.18657276 1.15976694 1.15031480
+## [25] 1.12759021 1.11011092 1.04749207 1.02647640 0.99059927 0.97081963
+## [31] 0.93071762 0.87576350 0.86744722 0.83476049 0.81275726 0.76442159
+## [37] 0.74202490 0.71360773 0.70113722 0.66562919 0.63857894 0.63566392
+## [43] 0.57790580 0.55741263 0.53113958 0.50300149 0.47244057 0.46404532
+## [49] 0.42711145 0.41070217 0.39738676 0.36671103 0.35708619 0.33735423
+## [55] 0.31969614 0.30284356 0.27060294 0.23962435 0.19774909 0.18384271
+## [61] 0.16678985 0.16169267 0.12442062 0.08086149 0.05059250 0.04000635
+## [67] 0.01361255
+pca$sdev^2
+## [1] 9.4668501556 8.2860870753 6.8349796126 5.3227144984 3.9645535103
+## [6] 3.7185193944 3.4616968371 3.3070301611 3.0295098464 2.5943203762
+## [11] 2.4782791899 2.3078009433 2.2418260551 2.1678931325 2.0853812501
+## [16] 1.9477858328 1.8567269267 1.8199305316 1.6428143650 1.5592857118
+## [21] 1.5321700622 1.4079549059 1.3450593491 1.3232241294 1.2714596895
+## [26] 1.2323462597 1.0972396269 1.0536537927 0.9812869220 0.9424907458
+## [31] 0.8662352832 0.7669617104 0.7524646833 0.6968250723 0.6605743679
+## [36] 0.5843403673 0.5506009503 0.5092359897 0.4915933950 0.4430622249
+## [41] 0.4077830689 0.4040686212 0.3339751170 0.3107088394 0.2821092537
+## [46] 0.2530104995 0.2232000958 0.2153380576 0.1824241892 0.1686762737
+## [51] 0.1579162400 0.1344769819 0.1275105470 0.1138078760 0.1022056244
+## [56] 0.0917142215 0.0732259524 0.0574198308 0.0391047010 0.0337981414
+## [61] 0.0278188527 0.0261445210 0.0154804917 0.0065385809 0.0025596007
+## [66] 0.0016005079 0.0001853015
+summary(pca)
+## Importance of first k=5 (out of 67) components:
+## PC1 PC2 PC3 PC4 PC5
+## Standard deviation 3.0768 2.87856 2.61438 2.30710 1.99112
+## Proportion of Variance 0.1025 0.08968 0.07398 0.05761 0.04291
+## Cumulative Proportion 0.1025 0.19214 0.26612 0.32372 0.36663
+plot(pca, type = "lines")
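+With this many components, one common rule of thumb (the Kaiser criterion) is to keep components whose eigenvalue exceeds 1; a sketch using the variances printed above:
+sum(pca$sdev^2 > 1) # number of components with eigenvalue > 1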
+
+D3 <- as.data.frame(predict(pca))
+
+ggcorr(D1[,-1], method = c("everything", "pearson"))
+
+loadings <- as.data.frame(abs(pca$rotation)) #abs() will make all eigenvectors positive
+ggcorr(loadings, method = c("everything", "pearson")) # PC5 and PC2 loadings look related
+
+ggpairs(loadings, 1:5, progress = FALSE) # PC1~PC3, PC2~PC3 and PC2~PC5 look related
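+A numeric check of what these plots suggest (a sketch):
+round(cor(loadings), 2) # correlations between the five absolute-loading columns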
+
+library("corrplot")
+corrplot(get_pca_var(pca)$cos2, is.corr = FALSE) # visualize cos2 contributions; no clear pattern emerges
+
+which(loadings$PC1>0.25)
+## [1] 25 31
+rownames(loadings[which(loadings$PC1>0.25),])
+## [1] "Economics.and.Education" "Change.Leadership"
+rownames(loadings[which(loadings$PC1>0.2),])
+## [1] "Arts.Administration" "Economics.and.Education"
+## [3] "Change.Leadership" "History"
+## [5] "Politics" "School.Principals"
+rownames(loadings[which(loadings$PC2>0.25),])
+## [1] "Linguistics"
+## [2] "Creative.Technologies"
+## [3] "Design.and.Development.of.Digital.Games"
+rownames(loadings[which(loadings$PC2>0.2),])
+## [1] "Linguistics"
+## [2] "Creative.Technologies"
+## [3] "Design.and.Development.of.Digital.Games"
+rownames(loadings[which(loadings$PC3>0.25),])
+## [1] "Clinical.Psychology" "Neuroscience" "Psychology"
+rownames(loadings[which(loadings$PC3>0.2),])
+## [1] "Physiology" "Behavior.Analysis" "Clinical.Psychology"
+## [4] "Cognitive.Science" "Creative.Technologies" "Nursing"
+## [7] "Neuroscience" "Psychology"
+rownames(loadings[which(loadings$PC5>0.25),])
+## [1] "Creative.Technologies"
+## [2] "Learning.Analytics"
+## [3] "Literacy"
+## [4] "Measurement..Evaluation.and.Statistics"
+## [5] "Science.Education"
+rownames(loadings[which(loadings$PC5>0.2),])
+## [1] "Creative.Technologies"
+## [2] "Learning.Analytics"
+## [3] "Literacy"
+## [4] "Mathematics"
+## [5] "Measurement..Evaluation.and.Statistics"
+## [6] "Science.Education"
+biplot(pca)
+
+library(ggbiplot)
+library(factoextra)
+library(FactoMineR)
+
+
+fviz_eig(pca, addlabels = TRUE) + labs(title = "Principal component analysis")
+
+pca2 <- fviz_pca_biplot(pca)
+pca2
+
+