---
title: "Zero-Inflated Poisson - Multilevel model with authors"
author: "Anders Sundelin"
date: "2022-10-25"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(rethinking)
library(dplyr)
library(tidyr)
library(ggplot2)
```
## Refresher from chapter 13.3 (more than one type of cluster)
This chunk is not active by default; evaluate it manually or change the `eval` value.
```{r, echo=FALSE, eval=FALSE}
data("chimpanzees")
d <- chimpanzees
d$treatment <- 1 + d$prosoc_left + 2 * d$condition
dat_list <- list(
  pulled_left = d$pulled_left,
  actor = d$actor,
  block_id = d$block,
  treatment = as.integer(d$treatment)
)
set.seed(13)
m13.4 <- ulam(
  alist(
    pulled_left ~ dbinom(1, p),
    logit(p) <- a[actor] + g[block_id] + b[treatment],
    b[treatment] ~ dnorm(0, 0.5),
    # adaptive priors
    a[actor] ~ dnorm(a_bar, sigma_a),
    g[block_id] ~ dnorm(0, sigma_g),
    # hyper-priors
    a_bar ~ dnorm(0, 1.5),
    sigma_a ~ dexp(1),
    sigma_g ~ dexp(1)
  ), data = dat_list, chains = 4, cores = 4, log_lik = TRUE
)
precis(m13.4, depth=2)
plot(precis(m13.4, depth=2))
```
## Multilevel Bayesian Model With Authors (one author per file)
In this model, we assume that the author impact is constant over time. The same goes for all other parameters in the generalized linear model.
```{r init_sim}
N <- 50 # weeks/commits of sampling
set.seed(700716)
a_bar <- 3.14
sigma_a <- 0.15
b_bar <- .04
sigma_b <- 0.01
nfiles <- 5
nauthors <- 7
bauthor_bar <- 0.033
sigma_bauthor <- 0.01
# probability of an event not happening (i.e., the file being untouched)
# A is likely to be touched, B is not; C, D and E also vary a bit...
prob_untouched <- c(0.2, 0.6, 0.1, 0.4, 0.23)
# simulate with more files, just pick uniformly random numbers as probabilities
#nfiles <- 40
#prob_untouched <- runif(nfiles, min=0.1, max=0.5)
a_file <- rnorm(nfiles, mean=a_bar, sd=sigma_a) # value for each file, drawn from the general mean/sd
b_file <- rnorm(nfiles, mean=b_bar, sd=sigma_b) # ditto for each slope, unique values for each slope
b_author <- rnorm(nauthors, mean=bauthor_bar, sd=sigma_bauthor)
dsim <- data.frame(file=1:nfiles, p_untouched=prob_untouched, true_a=a_file, true_b=b_file)
# each file is measured each observation
observations <- N*nfiles
# on average, we add 10 rows per observation, though there is some variation
change_size <- rnorm(observations, 10, 5) # assume all files change with similar frequency
SZ <- scale(change_size)
# assume one author authors each file, at random
# So the AUTHORS array becomes just a random sample of `observations` values between 1 and nauthors
# assume each author has a constant impact, regardless of file - this is just to make the simulation easier - the base_author_impact vector stays the same for all observations.
base_author_impact <- rnorm(nauthors, 0.5, 0.2)
# Randomize which author impacted which observation
AUTHORS <- sample(1:nauthors, observations, replace=TRUE)
# and for each observation, the impact of the author is simulated (in real life, measured) like this
author_impact <- sapply(1:observations, function(i) base_author_impact[AUTHORS[i]])
# standard scaler, (x-mean(x))/sd(x), is OK to use, because 0 has no real meaning for the authors - each file has one and only one author
AI <- scale(author_impact)
# The author part of the lograte coefficient for each observation
author_coefficient <- sapply(1:observations, function(i) b_author[AUTHORS[i]] * AI[i])
# in this simple example, assume we observe one file in each observation. Randomize which one by simple sampling.
file_observation <- sample(1:nfiles, observations, replace = TRUE)
# The log of the rate is a linear model - rates are always positive
# Thus, the rate is the exponentiation of the linear model
# Calculate the log of the rate by iterating over each observation - make sure to keep track of observation metrics and file metrics
log_rate_introd <- sapply(1:length(file_observation), function(i) dsim$true_a[file_observation[i]] + dsim$true_b[file_observation[i]] * SZ[i] + author_coefficient[i])
# now we have the observations in one long vector
# roll a die to figure out which observations are untouched, based on the probability that the corresponding file is untouched (the `file_observation` array records which file was affected)
untouched <- sapply(file_observation, function(i) rbinom(1, 1, prob_untouched[i]))
# untouched is also `observations` items long
# simulate issues introduced - here is the actual zero-inflated Poisson process
introd <- sapply(1:length(file_observation), function (i) (1-untouched[i]) * rpois(1, exp(log_rate_introd[i])))
# this gives another vector of `observations` items
# assemble the data frame
df <- data.frame(introd=introd, file=file_observation, size=SZ, author=AUTHORS, author_impact=AI)
```
Check that the Poisson process is visible in the data after the zeros have been removed (a sanity check on the betas and alpha).
In particular, if a Poisson model is to be useful, the mean and variance ($s^2$) of the outcome should be roughly equal (at least the same order of magnitude). Otherwise we have to use a zero-inflated negative binomial instead (see the brms example by R. Torkar below).
```{r}
df %>% filter(introd > 0) %>% summarise(mean = mean(introd), var = var(introd))
```
```{r}
simplehist(df %>% filter(file == 1) %>% select(introd), xlab="issues introduced in file 1", lwd=4)
```
```{r}
df %>% filter(file==4)
```
```{r}
simplehist(df %>% filter(file == 2) %>% select(introd), xlab="issues introduced in file 2", lwd=4)
```
## Prior selection
Based on reasoning and the plots above, we know we will mostly find 20-30 issues per file, with some extreme values also possible (hundreds, but not thousands).
The log link maps half of the real numbers (the negative numbers) to values between 0 and 1 on the outcome scale.
This means that a conventional flat prior will yield extreme results (either 0 or very many issues).
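As a quick illustration (a minimal sketch, not part of the original analysis), exponentiating draws from a very wide, flat-ish prior on the log scale shows how extreme the implied rates become:
```{r}
# Sketch: a wide normal(0, 10) prior on the log-scale intercept.
# After exponentiation, most draws imply either ~0 issues or astronomically many.
flat_a <- rnorm(1e4, 0, 10)
summary(exp(flat_a))
```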
```{r}
curve(dlnorm(x, 2.5, 0.7), from=0, to=100, n=200)
```
For the alpha, a norm(2.5, 0.7) prior seems to make sense.
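To see what this prior implies on the outcome scale (a small sketch, not in the original), we can exponentiate a few quantiles; the bulk of the implied rates lands between a handful and a few dozen issues:
```{r}
# Sketch: 5%, 50% and 95% quantiles of the rate implied by the norm(2.5, 0.7) prior
exp(qnorm(c(0.05, 0.5, 0.95), mean = 2.5, sd = 0.7))
```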
The size parameter is standardized, so we know it has mean 0 and standard deviation 1. This means we should plot the priors over a reasonable range, say [-4, 4]. We also include the previously chosen alpha prior when plotting the relationships.
```{r}
N <- 100
a <- rnorm(N, 2.5, 0.7)
b <- rnorm(N, 0, 0.2)
plot(NULL, xlim=c(-4,4), ylim=c(0,100))
for (i in 1:N)
curve(exp(a[i] + b[i] * x), add=TRUE, col=grau())
```
The author impact is also centered and scaled by dividing by the standard deviation, so the same range applies.
We plot the same prior as for b_size:
```{r}
N <- 100
a <- rnorm(N, 2.5, 0.7)
b <- rnorm(N, 0, 0.2)
b_auth <- rnorm(N, 0, 0.2)
plot(NULL, xlim=c(-4,4), ylim=c(0,100))
for (i in 1:N)
curve(exp(a[i] + b[i] * x + b_auth[i] * x), add=TRUE, col=grau())
```
While some lines show steep relationships, most of the priors are fairly flat. We could even consider narrowing the stddev of the betas to 0.1, to force even flatter priors.
```{r}
N <- 100
a <- rnorm(N, 2.5, 0.7)
b <- rnorm(N, 0, 0.1)
b_auth <- rnorm(N, 0, 0.1)
plot(NULL, xlim=c(-4,4), ylim=c(0,100))
for (i in 1:N)
curve(exp(a[i] + b[i] * x + b_auth[i] * x), add=TRUE, col=grau())
```
This would make our model even more skeptical of any relationship in the betas. We go with the norm(0, 0.2) priors for now.
### Recovering the model parameters
```{r}
d <- list(y=df$introd, file=df$file, size=df$size, author=df$author, author_impact=df$author_impact)
issue_model <- ulam(
  alist(
    y ~ dzipois(p, lambda),
    logit(p) <- ap[file], # assume simplest possible model
    log(lambda) <- al[file] + bsize[file] * size + bauth[author] * author_impact,
    ap[file] ~ dnorm(0, 1.5), # could of course also pull this from the model, if we generate it from a common probability above
    al[file] ~ dnorm(a_bar, sigma_a),
    bsize[file] ~ dnorm(b_bar, sigma_b),
    bauth[author] ~ dnorm(bauth_bar, sigma_bauth),
    a_bar ~ dnorm(2.5, 0.7),
    sigma_a ~ dexp(1),
    b_bar ~ dnorm(0, 0.2),
    sigma_b ~ dexp(1),
    bauth_bar ~ dnorm(0, 0.2),
    sigma_bauth ~ dexp(1)
  ), data = d, chains = 4, iter = 5e3, cores = 4)
```
```{r}
precis(issue_model, depth=2)
```
```{r}
plot(issue_model, depth=2)
```
It is extremely important to know how you slice the posterior matrices. Forgetting a comma here or there will completely mess up your summaries...
```{r}
post <- extract.samples(issue_model)
max(sapply(1:nfiles, function(i) abs(mean(inv_logit(post$ap[,i])) - dsim$p_untouched[i])))
```
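As a reminder of why the comma matters (a small sketch on top of the samples we just extracted): `post$ap` is a samples-by-files matrix, so `post$ap[, i]` selects every posterior draw for file `i`, while `post$ap[i, ]` selects draw `i` for every file.
```{r}
# Sketch: shape of the posterior matrix and the two slicing directions
dim(post$ap)       # (number of samples) x (number of files)
str(post$ap[, 1])  # all posterior samples for file 1
str(post$ap[1, ])  # a single posterior sample for every file
```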
Derived probabilities of a file being untouched (to be compared with the real values)
```{r}
prob_untouched
sapply(1:nfiles, function(i) mean(inv_logit(post$ap[,i])))
```
Reasonably well-behaved values for the binomial zero-inflation process.
Derived intercepts (to be compared with the real values)
```{r}
a_file
sapply(1:nfiles, function(i) mean(post$al[,i]))
```
The intercepts are slightly overestimated, by up to about 0.10 units on the log scale.
```{r}
max(sapply(1:nfiles, function(i) abs(mean(post$al[,i]) - dsim$true_a[i])))
```
With a prior of norm(2.5, 0.7), our model is somewhat successful in retrieving the parameters.
But it seems to consistently overestimate the intercepts, relative to their true values.
Derived file slopes (to be compared with the real values)
```{r}
b_file
sapply(1:nfiles, function(i) mean(post$bsize[,i]))
```
Also quite good predictions relative to the true values.
```{r}
max(sapply(1:nfiles, function(i) abs(mean(post$bsize[,i]) - dsim$true_b[i])))
```
Derived author slopes (to be compared with the real values)
```{r}
b_author
sapply(1:nauthors, function(i) mean(post$bauth[,i]))
```
The author slopes are harder to estimate - they deviate more from their true values.
```{r}
max(sapply(1:nauthors, function(i) abs(mean(post$bauth[,i]) - b_author[i])))
```
Summary findings:
a_bar (the intercept) and sigma_a are well estimated; both are close to their true values and well within the 89% CI.
However, a_bar is slightly overestimated.
```{r}
c(a_bar, mean(post$a_bar), PI(post$a_bar))
c(sigma_a, mean(post$sigma_a), PI(post$sigma_a))
```
b_bar (the file slope) is underestimated, with the true value close to the upper bound of the 89% CI.
The standard deviation of the file slope is overestimated and falls outside the 89% CI - though in this simulation we only had 5 different files to sample from.
```{r}
c(b_bar, mean(post$b_bar), PI(post$b_bar))
c(sigma_b, mean(post$sigma_b), PI(post$sigma_b))
```
bauthor_bar is heavily underestimated, and its true value falls above the upper bound of the 89% CI. The model is not certain that there is an author effect at all (0 is within the 89% CI).
sigma_bauthor is fitted very closely, right in the middle of the estimated interval.
```{r}
c(bauthor_bar, mean(post$bauth_bar), PI(post$bauth_bar))
c(sigma_bauthor, mean(post$sigma_bauth), PI(post$sigma_bauth))
```
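As an extra check beyond the original summaries (a minimal sketch), we can also look at how much of the posterior mass for the average author effect lies above zero:
```{r}
# Sketch: posterior probability that the average author effect is positive
mean(post$bauth_bar > 0)
```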
# Using brms and neg-binomial
Contribution from Richard Torkar - using brms and ggplot instead, with the zero-inflated negative binomial distribution.
An amazing _conversion_ of Statistical Rethinking into the `brms` world is available here: https://bookdown.org/content/4857/ (with source code at https://github.com/ASKurz/Statistical_Rethinking_with_brms_ggplot2_and_the_tidyverse_2_ed).
## Comparing using general intercept models
A quick-and-dirty comparison of two intercept-only models.
```{r}
library(brms)
library(bayesplot)
library(ggplot2)
m_pois <- brm(bf(y ~ 1, zi ~ 1),
              family = zero_inflated_poisson(), save_pars = save_pars(all = TRUE),
              data = d, chains = 4, cores = 4)
m_nb <- brm(bf(y ~ 1, zi ~ 1),
            family = zero_inflated_negbinomial(),
            data = d, chains = 4, cores = 4)
```
```{r}
loo_pois <- loo(m_pois, moment_match = TRUE) # warning about pareto_k > 0.7 that we need to check
```
```{r}
loo_nb <- loo(m_nb) # no warning
ll <- loo_compare(loo_pois, loo_nb)
ll
# In short, the NB model is very much preferred
# assuming a z-score of 1.96, i.e. 95%
ll[2, 1] + c(-1, 1) * 1.96 * ll[2, 2]
```
```{r}
d$file_f <- as.factor(d$file)
d$author_f <- as.factor(d$author)
p <- get_prior(bf(y ~ 1 + author_impact + size + (1 | file_f) + (1 | author_f),
                  shape ~ 1 + author_impact + size + (1 | file_f) + (1 | author_f),
                  zi ~ 1 + author_impact + size + (1 | file_f) + (1 | author_f)),
               family = zero_inflated_negbinomial(), data = d)
p[c(1, 10, 19), 1] <- "normal(0,1)"
p[c(4, 13, 22), 1] <- "normal(0,2.5)"
p[c(5, 14, 23), 1] <- "weibull(2,1)"
# Note: the zi part is also given the full predictor structure here (not just an overall intercept)
m <- brm(bf(y ~ 1 + author_impact + size + (1 | file_f) + (1 | author_f),
            # it seems shape needs to have predictors!
            shape ~ 1 + author_impact + size + (1 | file_f) + (1 | author_f),
            zi ~ 1 + author_impact + size + (1 | file_f) + (1 | author_f)),
         prior = p,
         # sample_prior = "only", # if you want to check that priors are sane
         family = zero_inflated_negbinomial(),
         data = d, control = list(adapt_delta = 0.95))
# draw from posterior
yrep_zinb <- posterior_predict(m)
# proportions of zeros
ppc_stat(y = d$y, yrep_zinb, stat = function(y) mean(y == 0))
# max count PPC for zero-inflated negative-binomial model
# slightly off, but OKish
ppc_stat(y = d$y, yrep_zinb, stat = "max")
pp_check(m, type = "bars") + ylim(0, 20) + xlim(1, 20)
plot(conditional_effects(m), ask = FALSE)
# beta params?
mcmc_areas(m, regex_pars = c("^b_auth", "^b_si"))
# sd
mcmc_areas(m, pars = c("sd_author_f__Intercept",
                       "sd_file_f__Intercept"))
# remember, y is log() and zi is logit()!
summary(m)
# Estimations per file/author
ranef(m)
```
Questions:
* Is the fact that the model is overestimating the intercept and underestimating the slopes due to some artifact in the data, or to something I forgot in the simulation?
* In this simulation, I used the same scaling method (standard scaler) for all parameters. Is this important? For the reviewers, I am thinking of just dividing by the stddev (keeping the 0, as it is important in the matrix multiplication) - but this seems to make the model behave worse.
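For reference, the two scaling alternatives mentioned in the last question could be compared like this (a minimal sketch; the object names are illustrative and not part of the original analysis):
```{r}
# Sketch: standard scaling vs. dividing by the standard deviation only
AI_standard <- scale(author_impact)                                             # (x - mean(x)) / sd(x), mean 0
AI_sd_only  <- scale(author_impact, center = FALSE, scale = sd(author_impact))  # x / sd(x), keeps the raw zero point
c(mean(AI_standard), sd(AI_standard), mean(AI_sd_only), sd(AI_sd_only))
```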