-
Notifications
You must be signed in to change notification settings - Fork 0
/
chris_stuff.Rmd
76 lines (53 loc) · 1.59 KB
/
chris_stuff.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
---
title: "Chris Stuff"
author: "Altamash Rafiq"
date: "4/2/2020"
output: html_document
---
```{r setup, include=FALSE}
# Document-wide chunk defaults: echo the code, suppress messages and
# warnings, and print output without a comment prefix.
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  comment = NA
)
library(tidyverse)
library(text2vec)
library(tidytext)
library(widyr)
library(irlba)
```
```{r}
# Read the CSV and keep only the talk URL and its transcript text.
data <- read_csv("data/ted_expanded.csv")

# Work on at most the first 100 rows. head() returns fewer rows when the
# file is shorter, whereas `[1:100, ]` would index past the end and
# produce all-NA rows that then flow into the tokenisation steps.
transcripts <- data %>%
  select(url, transcript) %>%
  head(100)
```
```{r}
# Split every transcript into 8-word n-grams, label each n-gram with a
# unique ID (url joined with a running row number), then break the n-grams
# back into one row per word so words can be paired within an n-gram.
tidy_skipgrams <- transcripts %>%
  unnest_tokens(
    output = ngram,
    input = transcript,
    token = "ngrams",
    n = 8
  ) %>%
  mutate(ngramID = row_number()) %>%
  tidyr::unite(skipgramID, url, ngramID) %>%
  unnest_tokens(output = word, input = ngram)
# Per-word frequency over all transcripts: `n` is the number of occurrences
# of the word and `p` is that count divided by the total number of tokens.
unigram_probs <- transcripts %>%
  unnest_tokens(output = word, input = transcript) %>%
  count(word, sort = TRUE) %>%
  mutate(p = n / sum(n))
# Count how often each pair of words appears in the same skipgram
# (diag = TRUE also keeps a word paired with itself) and turn the counts
# into proportions of the total pair count.
skipgram_probs <- tidy_skipgrams %>%
  pairwise_count(
    item = word,
    feature = skipgramID,
    diag = TRUE,
    sort = TRUE
  ) %>%
  mutate(p = n / sum(n))
# Keep word pairs seen more than 20 times, look up the unigram probability
# of each member of the pair (p1, p2), and divide the pair probability by
# both of them.
normalized_prob <- skipgram_probs %>%
  filter(n > 20) %>%
  rename(word1 = item1, word2 = item2) %>%
  left_join(
    select(unigram_probs, word1 = word, p1 = p),
    by = "word1"
  ) %>%
  left_join(
    select(unigram_probs, word2 = word, p2 = p),
    by = "word2"
  ) %>%
  mutate(p_together = p / p1 / p2)
# Take log10 of the normalised pair probability (the PMI score) and cast
# the pairs into a sparse matrix with word1 as rows and word2 as columns.
pmi_matrix <- normalized_prob %>%
  mutate(pmi = log10(p_together)) %>%
  cast_sparse(row = word1, column = word2, value = pmi)

# Remove missing data: overwrite any NA stored in the matrix with 0.
pmi_matrix@x[is.na(pmi_matrix@x)] <- 0
# Run a truncated SVD on the PMI matrix. irlba() requires the number of
# requested singular vectors to be strictly smaller than both matrix
# dimensions, so the target of 256 is capped for small vocabularies
# instead of stopping with an error.
# NOTE(review): irlba appears to start from a random vector unless `v` is
# supplied; call set.seed() beforehand if reproducible vectors are needed
# -- confirm against the irlba documentation.
pmi_svd <- irlba(
  pmi_matrix,
  nv = min(256, min(dim(pmi_matrix)) - 1),
  maxit = 500
)

# Output the word vectors: the left singular vectors, one row per row of
# the PMI matrix, labelled with that matrix's row names (the words).
word_vectors <- pmi_svd$u
rownames(word_vectors) <- rownames(pmi_matrix)
```