-
Notifications
You must be signed in to change notification settings - Fork 0
/
Yelp_Scraping_Final.Rmd
147 lines (88 loc) · 2.94 KB
/
Yelp_Scraping_Final.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
```{r}
library(rvest)
library(tm)
library(wordcloud)
library(ggplot2)
```
Define a function that builds the list of page URLs to scrape reviews from, given a base URL.
```{r}
# Build the paginated review URLs for one listing.
#
# Args:
#   max_pages: Number of pages to generate links for (a number, or a string
#     that as.numeric() can parse). Must be a single non-negative value.
#   url: Base URL to which the review offset is appended, e.g. one ending
#     in "?start=".
#
# Returns:
#   A character vector of exactly `max_pages` URLs whose offsets are
#   0, 20, 40, ...; character(0) when `max_pages` is 0.
get_yelp_links <- function(max_pages, url) {
  n_pages <- as.numeric(max_pages)
  stopifnot(length(n_pages) == 1, !is.na(n_pages), n_pages >= 0)

  # seq_len(n) yields exactly n offsets starting at 0; a closed range such as
  # seq(0, 20 * n, 20) includes both endpoints and would yield n + 1 links.
  # Integer offsets keep large values from being rendered as "1e+05".
  offsets <- as.integer(20 * (seq_len(n_pages) - 1))

  # paste0() recycles a zero-length argument to "", so guard the empty case.
  if (length(offsets) == 0) {
    return(character(0))
  }
  paste0(url, offsets)
}
```
Call function for NYU and Columbia links.
```{r}
# Review-page URLs for each school's Yelp listing.
nyu.links <- get_yelp_links(
  max_pages = 4,
  url = "https://www.yelp.com/biz/new-york-university-new-york-18?start="
)
columbia.links <- get_yelp_links(
  max_pages = 4,
  url = "https://www.yelp.com/biz/columbia-university-new-york-43?start="
)
```
Define a function to scrape the review text from a list of page links.
```{r}
# Download every page in `links_list` and collect the review text.
#
# Args:
#   links_list: Character vector of page URLs to fetch.
#
# Returns:
#   A character vector holding the text of every node that matches the CSS
#   selector ".review-content p", in page order. Returns character(0) when
#   `links_list` is empty.
scrape_yelp_reviews <- function(links_list) {
  # An index loop built with seq(1, 0, 1) raises an error, so handle empty
  # input up front instead of iterating.
  if (length(links_list) == 0) {
    return(character(0))
  }

  reviews_by_page <- lapply(links_list, function(link) {
    page <- read_html(link)
    paragraphs <- html_nodes(page, ".review-content p")
    html_text(paragraphs)
  })

  # Flatten once at the end rather than re-copying an accumulator per page.
  unlist(reviews_by_page, use.names = FALSE)
}
```
```{r}
# Scrape the review text behind each school's page links.
nyu.reviews <- scrape_yelp_reviews(links_list = nyu.links)
columbia.reviews <- scrape_yelp_reviews(links_list = columbia.links)
```
Define a function to convert the reviews into a corpus and clean it up.
```{r}
# Turn a vector of review texts into a cleaned tm corpus.
#
# Args:
#   list: Character vector of documents (one review per element).
#   extra_stopwords: Additional lower-case words to drop after the standard
#     English stopwords. Defaults to the two school names.
#
# Returns:
#   A VCorpus that has been lower-cased, stripped of English stopwords,
#   punctuation and `extra_stopwords`, with runs of whitespace collapsed.
to_corpus <- function(list, extra_stopwords = c("nyu", "columbia")) {
  corp <- VCorpus(VectorSource(list))

  # Lower-case first: the stopword list is lower-case.
  corp.clean <- tm_map(corp, content_transformer(tolower))

  # Remove stopwords BEFORE punctuation. The English list contains
  # contractions such as "don't"; once apostrophes are stripped the text
  # reads "dont" and would no longer match.
  corp.clean <- tm_map(corp.clean, removeWords, stopwords("english"))
  corp.clean <- tm_map(corp.clean, removePunctuation)

  # Skip the call for an empty word list rather than passing it to
  # removeWords().
  if (length(extra_stopwords) > 0) {
    corp.clean <- tm_map(corp.clean, removeWords, extra_stopwords)
  }

  # Removing words leaves gaps; collapse them last.
  tm_map(corp.clean, stripWhitespace)
}
```
```{r}
# Cleaned corpora, one per school, built from the scraped reviews.
nyu.corpus <- to_corpus(list = nyu.reviews)
columbia.corpus <- to_corpus(list = columbia.reviews)
```
Wordclouds.
NYU.
```{r}
# Word cloud of at most 50 words from the cleaned NYU corpus, drawn in
# decreasing frequency order rather than at random.
# "Purples" is a sequential ColorBrewer palette with a maximum of 9 colours;
# asking for more only raises a warning and still yields 9.
wordcloud(
  nyu.corpus,
  scale = c(3.5, .35),
  max.words = 50,
  random.order = FALSE,
  colors = brewer.pal(9, "Purples")
)
```
Columbia.
```{r}
# Word cloud of at most 50 words from the cleaned Columbia corpus, drawn in
# decreasing frequency order rather than at random.
# FALSE is spelled out because the shorthand F is an ordinary variable that
# can be reassigned.
wordcloud(
  columbia.corpus,
  scale = c(3.5, .35),
  max.words = 50,
  random.order = FALSE,
  colors = brewer.pal(6, "Blues")
)
```
Sentiment Analysis.
```{r}
library(syuzhet)
```
```{r}
# Score the raw (uncleaned) review text of each school with the NRC lexicon.
nyu.sentiment <- get_nrc_sentiment(char_v = nyu.reviews)
columbia.sentiment <- get_nrc_sentiment(char_v = columbia.reviews)
```
```{r}
# Total each sentiment column per school: one row per sentiment, with its
# name and the column sum across all reviews.
nyu.review.sentiment.sums <- data.frame(
  sentiment = colnames(nyu.sentiment),
  freq = colSums(nyu.sentiment)
)
columbia.review.sentiment.sums <- data.frame(
  sentiment = colnames(columbia.sentiment),
  freq = colSums(columbia.sentiment)
)

# Combine them in a dataframe for comparison
review.comparison.df <- rbind(
  nyu.review.sentiment.sums,
  columbia.review.sentiment.sums
)

# Size the school labels from the actual row counts instead of assuming a
# fixed number of sentiment columns, so labels always line up with the rows
# stacked by rbind() above.
nyu <- rep("nyu", nrow(nyu.review.sentiment.sums))
columbia <- rep("columbia", nrow(columbia.review.sentiment.sums))
schools <- c(nyu, columbia)
review.comparison.df$school <- schools
```
```{r}
# Side-by-side bars comparing sentiment totals for the two schools.
# Fill colours are named so each school keeps its colour regardless of the
# order in which the `school` values are sorted.
plot <- ggplot(
  review.comparison.df,
  aes(x = sentiment, y = freq, fill = school)
) +
  geom_bar(stat = "identity", position = position_dodge()) +
  scale_fill_manual(values = c(columbia = "#4F628E", nyu = "#9775AA")) +
  ggtitle("Review Sentiment Totals") +
  labs(x = "Sentiment", y = "Count")

# Assigning a ggplot does not draw it; print it so the chart is rendered.
print(plot)
```