-
Notifications
You must be signed in to change notification settings - Fork 1
/
conditional_effects.R
225 lines (209 loc) · 13.5 KB
/
conditional_effects.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
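# Note that `fitted` returns draws of the _expected_ value ($\mu$) only; it does not incorporate any information about the variance.
# The variance is included by the `predict` function, so that is what we use now. We use
# `probs=c(.055, .945)` throughout, so every prediction carries the 5.5% and 94.5% quantiles (columns `Q5.5` and `Q94.5`).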
# Predictions over a fine grid of complexity values, one curve per reference
# level of added lines (P25, P50, P75, P99 and the sample maximum).
#
# model      fitted model handed to `predict()` (presumably a brms fit -- confirm).
# someData   not referenced in the body.
# aTeam      team value written into every row of the prediction grid.
# aRepo      repo value written into every row of the prediction grid.
# duplicates unscaled duplicates value held constant (default: q50(data$DUP)).
# removed    unscaled removed-lines value held constant (default: q50(data$DEL)).
# robust     forwarded unchanged to `predict()`.
#
# Returns the `predict()` summary (requested with probs .055 and .945) as a
# data frame, column-bound to the grid it was computed on.
#
# NOTE(review): `data` is not a parameter. It is resolved from the enclosing
# (typically global) environment and is assumed to hold the unscaled
# ADD / COMPLEX / DUP / DEL columns -- confirm against the calling script.
condeffect_logCOMPLEX_by_logADD <- function(model, someData, aTeam, aRepo, duplicates=q50(data$DUP), removed=q50(data$DEL), robust=FALSE) {
  items <- 2000
  # Covariates that stay fixed for every row of the grid.
  scalars <- data.frame(R=scale_removed(removed),
                        D=scale_duplicates(duplicates),
                        team=aTeam,
                        repo=aRepo)
  # Reference levels of added lines, on the unscaled axis; the names become the curve labels.
  added_points <- c(P25=q25(data$ADD), P50=q50(data$ADD), P75=q75(data$ADD), P99=q99(data$ADD), SMAX=max(data$ADD))
  # Evenly spaced complexity values on the scaled axis, spanning the observed range.
  complex_points <- seq(from=scale_complexity(min(data$COMPLEX)), to=scale_complexity(max(data$COMPLEX)), length.out=items)
  added <- data.frame(added=added_points, A=scale_added(added_points), added_category=names(added_points))
  complexity <- data.frame(C=complex_points, complex=unscale_complexity(complex_points))
  grid <- expand_grid(scalars, added, complexity)
  # TRUE/FALSE instead of T/F: the short forms are ordinary variables and can be reassigned.
  f <- predict(model, newdata=grid, probs=c(.055, .945), allow_new_levels=TRUE, robust=robust) |>
    data.frame() |>
    bind_cols(grid)
  return(f)
}
# Plot the predictions from condeffect_logCOMPLEX_by_logADD() together with
# the observations of the same team and repo.
#
# model    fitted model; `model$criteria$loo$diagnostics` is column-bound to
#          the observations and supplies `pareto_k` for the point size.
# someData observations on the scaled axes; must contain A, C, y, team and repo.
# ftot     prediction frame; must contain R, D, team, repo, complex, Estimate,
#          Q5.5, Q94.5 and added_category.
#
# Returns a ggplot object: one curve with interval band per added_category,
# overlaid with the observed points.
#
# NOTE(review): `data` is not a parameter; it is resolved from the enclosing
# (typically global) environment -- assumed to be the unscaled data set.
plot_logCOMPLEX_by_logADD <- function(model, someData, ftot) {
  scalars <- ftot |> select(R, D, team, repo) |> distinct()
  # Exactly one combination of held-constant covariates is expected. The row
  # count is what has to be checked: length() of a one-column data frame is
  # always 1, so the former check could never fail.
  stopifnot(nrow(scalars) == 1)
  removed <- unscale_removed(scalars$R)
  duplicates <- unscale_duplicates(scalars$D)
  aTeam <- scalars$team
  aRepo <- scalars$repo
  added_points <- c(P25=q25(data$ADD), P50=q50(data$ADD), P75=q75(data$ADD), P99=q99(data$ADD), SMAX=max(data$ADD))
  # Prepend -Inf so the lowest bin is open-ended: (-Inf, P25], (P25, P50], ...
  cutpoints <- append(scale_added(added_points), -Inf, after=0)
  # Bin on the A column of `someData` itself; the previous `d$A` read a
  # variable `d` from outside the function because the local `d` did not exist yet.
  d <- someData |> mutate(added_category=cut(A, cutpoints, labels=names(added_points)))
  observed <- bind_cols(d, model$criteria$loo$diagnostics) |> filter(team == aTeam, repo == aRepo)
  return(ftot |> ggplot(aes(x=complex)) +
    geom_smooth(aes(y=Estimate, ymin=Q5.5, ymax=Q94.5, group=added_category, color=added_category), stat="identity", alpha=.25, linewidth=.5) +
    geom_point(data=observed, aes(y=y, x=unscale_complexity(C), size = pareto_k, color=added_category), alpha=0.2) +
    ggtitle(paste0("Conditional effects of team ", aTeam, " in repo ", aRepo),
            paste0("removed lines: ", removed, ", existing duplicates: ", duplicates))
  )
}
# Predictions over a fine grid of added-lines values, one curve per reference
# level of complexity (P25, P50, P75, P99 and the sample maximum).
#
# model      fitted model handed to `predict()` (presumably a brms fit -- confirm).
# someData   not referenced in the body.
# aTeam      team value written into every row of the prediction grid.
# aRepo      repo value written into every row of the prediction grid.
# duplicates unscaled duplicates value held constant (default: q50(data$DUP)).
# removed    unscaled removed-lines value held constant (default: q50(data$DEL)).
# robust     forwarded unchanged to `predict()`.
#
# Returns the `predict()` summary (requested with probs .055 and .945) as a
# data frame, column-bound to the grid it was computed on.
#
# NOTE(review): `data` is not a parameter. It is resolved from the enclosing
# (typically global) environment and is assumed to hold the unscaled
# ADD / COMPLEX / DUP / DEL columns -- confirm against the calling script.
condeffect_logADD_by_logCOMPLEX <- function(model, someData, aTeam, aRepo, duplicates=q50(data$DUP), removed=q50(data$DEL), robust=FALSE) {
  items <- 2000
  # Covariates that stay fixed for every row of the grid.
  scalars <- data.frame(R=scale_removed(removed),
                        D=scale_duplicates(duplicates),
                        team=aTeam,
                        repo=aRepo)
  # Evenly spaced added-lines values on the scaled axis; the upper end is the
  # observed maximum after roundUpNice().
  added_points <- seq(from=scale_added(min(data$ADD)), to=scale_added(roundUpNice(max(data$ADD))), length.out=items)
  # Reference levels of complexity, on the unscaled axis; the names become the curve labels.
  complex_points <- c(P25=q25(data$COMPLEX), P50=q50(data$COMPLEX), P75=q75(data$COMPLEX), P99=q99(data$COMPLEX), SMAX=max(data$COMPLEX))
  added <- data.frame(A=added_points, added=unscale_added(added_points))
  complexity <- data.frame(C=scale_complexity(complex_points), complex=complex_points, complex_category=names(complex_points))
  grid <- expand_grid(scalars, added, complexity)
  # TRUE/FALSE instead of T/F: the short forms are ordinary variables and can be reassigned.
  f <- predict(model, newdata=grid, probs=c(.055, .945), allow_new_levels=TRUE, robust=robust) |>
    data.frame() |>
    bind_cols(grid)
  return(f)
}
# Plot the predictions from condeffect_logADD_by_logCOMPLEX() together with
# the observations of the same team and repo.
#
# model    fitted model; `model$criteria$loo$diagnostics` is column-bound to
#          the observations and supplies `pareto_k` for the point size.
# someData observations on the scaled axes; must contain A, C, y, team and repo.
# ftot     prediction frame; must contain R, D, team, repo, added, Estimate,
#          Q5.5, Q94.5 and complex_category.
#
# Returns a ggplot object: one curve with interval band per complex_category,
# overlaid with the observed points.
#
# NOTE(review): `data` is not a parameter; it is resolved from the enclosing
# (typically global) environment -- assumed to be the unscaled data set.
plot_logADD_by_logCOMPLEX <- function(model, someData, ftot) {
  scalars <- ftot |> select(R, D, team, repo) |> distinct()
  # Exactly one combination of held-constant covariates is expected. The row
  # count is what has to be checked: length() of a one-column data frame is
  # always 1, so the former check could never fail.
  stopifnot(nrow(scalars) == 1)
  removed <- unscale_removed(scalars$R)
  duplicates <- unscale_duplicates(scalars$D)
  aTeam <- scalars$team
  aRepo <- scalars$repo
  complex_points <- c(P25=q25(data$COMPLEX), P50=q50(data$COMPLEX), P75=q75(data$COMPLEX), P99=q99(data$COMPLEX), SMAX=max(data$COMPLEX))
  # Prepend -Inf so the lowest bin is open-ended: (-Inf, P25], (P25, P50], ...
  cutpoints <- append(scale_complexity(complex_points), -Inf, after=0)
  # Bin on the C column of `someData` itself; the previous `d$C` read a
  # variable `d` from outside the function because the local `d` did not exist yet.
  d <- someData |> mutate(complex_category=cut(C, cutpoints, labels=names(complex_points)))
  observed <- bind_cols(d, model$criteria$loo$diagnostics) |> filter(team == aTeam, repo == aRepo)
  return(ftot |> ggplot(aes(x=added)) +
    geom_smooth(aes(y=Estimate, ymin=Q5.5, ymax=Q94.5, group=complex_category, color=complex_category), stat="identity", alpha=.25, linewidth=.5) +
    geom_point(data=observed, aes(y=y, x=unscale_added(A), size = pareto_k, color=complex_category), alpha=0.2) +
    ggtitle(paste0("Conditional effects of team ", aTeam, " in repo ", aRepo),
            paste0("removed lines: ", removed, ", existing duplicates: ", duplicates)))
}
# Predictions over a fine grid of added-lines values, one curve per reference
# level of duplicates (P50, P75, P99 and the sample maximum; no P25 level is
# used here).
#
# model      fitted model handed to `predict()` (presumably a brms fit -- confirm).
# someData   not referenced in the body.
# aTeam      team value written into every row of the prediction grid.
# aRepo      repo value written into every row of the prediction grid.
# complexity unscaled complexity value held constant (default: q50(data$COMPLEX)).
# removed    unscaled removed-lines value held constant (default: q50(data$DEL)).
# robust     forwarded unchanged to `predict()`.
#
# Returns the `predict()` summary (requested with probs .055 and .945) as a
# data frame, column-bound to the grid it was computed on.
#
# NOTE(review): `data` is not a parameter. It is resolved from the enclosing
# (typically global) environment and is assumed to hold the unscaled
# ADD / COMPLEX / DUP / DEL columns -- confirm against the calling script.
condeffect_logADD_by_logDUP <- function(model, someData, aTeam, aRepo, complexity=q50(data$COMPLEX), removed=q50(data$DEL), robust=FALSE) {
  items <- 2000
  # Covariates that stay fixed for every row of the grid.
  scalars <- data.frame(R=scale_removed(removed),
                        C=scale_complexity(complexity),
                        team=aTeam,
                        repo=aRepo)
  # Evenly spaced added-lines values on the scaled axis; the upper end is the
  # observed maximum after roundUpNice().
  added_points <- seq(from=scale_added(min(data$ADD)), to=scale_added(roundUpNice(max(data$ADD))), length.out=items)
  # Reference levels of duplicates, on the unscaled axis; the names become the curve labels.
  dup_points <- c(P50=q50(data$DUP), P75=q75(data$DUP), P99=q99(data$DUP), SMAX=max(data$DUP))
  added <- data.frame(A=added_points, added=unscale_added(added_points))
  duplicates <- data.frame(D=scale_duplicates(dup_points), duplicates=dup_points, duplicates_category=names(dup_points))
  grid <- expand_grid(scalars, added, duplicates)
  # TRUE/FALSE instead of T/F: the short forms are ordinary variables and can be reassigned.
  f <- predict(model, newdata=grid, probs=c(.055, .945), allow_new_levels=TRUE, robust=robust) |>
    data.frame() |>
    bind_cols(grid)
  return(f)
}
# Plot the predictions from condeffect_logADD_by_logDUP() together with the
# observations of the same team and repo.
#
# model    fitted model; `model$criteria$loo$diagnostics` is column-bound to
#          the observations and supplies `pareto_k` for the point size.
# someData observations on the scaled axes; must contain A, D, y, team and repo.
# ftot     prediction frame; must contain R, C, team, repo, added, Estimate,
#          Q5.5, Q94.5 and duplicates_category.
#
# Returns a ggplot object: one curve with interval band per
# duplicates_category, overlaid with the observed points.
#
# NOTE(review): `data` is not a parameter; it is resolved from the enclosing
# (typically global) environment -- assumed to be the unscaled data set.
plot_logADD_by_logDUP <- function(model, someData, ftot) {
  scalars <- ftot |> select(R, C, team, repo) |> distinct()
  # Exactly one combination of held-constant covariates is expected. The row
  # count is what has to be checked: length() of a one-column data frame is
  # always 1, so the former check could never fail.
  stopifnot(nrow(scalars) == 1)
  removed <- unscale_removed(scalars$R)
  complex <- unscale_complexity(scalars$C)
  aTeam <- scalars$team
  aRepo <- scalars$repo
  dup_points <- c(P50=q50(data$DUP), P75=q75(data$DUP), P99=q99(data$DUP), SMAX=max(data$DUP))
  # Prepend -Inf so the lowest bin is open-ended: (-Inf, P50], (P50, P75], ...
  cutpoints <- append(scale_duplicates(dup_points), -Inf, after=0)
  # Bin on the D column of `someData` itself; the previous `d$D` read a
  # variable `d` from outside the function because the local `d` did not exist yet.
  d <- someData |> mutate(duplicates_category=cut(D, cutpoints, labels=names(dup_points)))
  observed <- bind_cols(d, model$criteria$loo$diagnostics) |> filter(team == aTeam, repo == aRepo)
  return(ftot |> ggplot(aes(x=added)) +
    geom_smooth(aes(y=Estimate, ymin=Q5.5, ymax=Q94.5, group=duplicates_category, color=duplicates_category), stat="identity", alpha=.25, linewidth=.5) +
    geom_point(data=observed, aes(y=y, x=unscale_added(A), size = pareto_k, color=duplicates_category), alpha=0.2) +
    ggtitle(paste0("Conditional effects of team ", aTeam, " in repo ", aRepo),
            paste0("removed lines: ", removed, ", complexity: ", complex)))
}
# Predictions over a fine grid of added-lines values, one curve per reference
# level of removed lines (P25, P50, P75, P99 and the sample maximum).
#
# model      fitted model handed to `predict()` (presumably a brms fit -- confirm).
# someData   not referenced in the body.
# aTeam      team value written into every row of the prediction grid.
# aRepo      repo value written into every row of the prediction grid.
# duplicates unscaled duplicates value held constant (default: q50(data$DUP)).
# complexity unscaled complexity value held constant (default: q50(data$COMPLEX)).
# robust     forwarded unchanged to `predict()`.
#
# Returns the `predict()` summary (requested with probs .055 and .945) as a
# data frame, column-bound to the grid it was computed on.
#
# NOTE(review): `data` is not a parameter. It is resolved from the enclosing
# (typically global) environment and is assumed to hold the unscaled
# ADD / COMPLEX / DUP / DEL columns -- confirm against the calling script.
condeffect_logADD_by_logREMOVED <- function(model, someData, aTeam, aRepo, duplicates=q50(data$DUP), complexity=q50(data$COMPLEX), robust=FALSE) {
  items <- 2000
  # Covariates that stay fixed for every row of the grid.
  scalars <- data.frame(D=scale_duplicates(duplicates),
                        C=scale_complexity(complexity),
                        team=aTeam,
                        repo=aRepo)
  # Evenly spaced added-lines values on the scaled axis; the upper end is the
  # observed maximum after roundUpNice().
  added_points <- seq(from=scale_added(min(data$ADD)), to=scale_added(roundUpNice(max(data$ADD))), length.out=items)
  # Reference levels of removed lines, on the unscaled axis; the names become the curve labels.
  removed_points <- c(P25=q25(data$DEL), P50=q50(data$DEL), P75=q75(data$DEL), P99=q99(data$DEL), SMAX=max(data$DEL))
  added <- data.frame(A=added_points, added=unscale_added(added_points))
  removed <- data.frame(R=scale_removed(removed_points), removed=removed_points, removed_category=names(removed_points))
  grid <- expand_grid(scalars, added, removed)
  # TRUE/FALSE instead of T/F: the short forms are ordinary variables and can be reassigned.
  f <- predict(model, newdata=grid, probs=c(.055, .945), allow_new_levels=TRUE, robust=robust) |>
    data.frame() |>
    bind_cols(grid)
  # Return explicitly: ending on the assignment handed the value back
  # invisibly, unlike the other condeffect_* functions.
  return(f)
}
# Plot the predictions from condeffect_logADD_by_logREMOVED() together with
# the observations of the same team and repo.
#
# model    fitted model; `model$criteria$loo$diagnostics` is column-bound to
#          the observations and supplies `pareto_k` for the point size.
# someData observations on the scaled axes; must contain A, R, y, team and repo.
# ftot     prediction frame; must contain D, C, team, repo, added, Estimate,
#          Q5.5, Q94.5 and removed_category.
# aTeam    accepted for signature compatibility, but overwritten by the team
#          stored in `ftot` before it is ever read.
# aRepo    accepted for signature compatibility, but overwritten by the repo
#          stored in `ftot` before it is ever read.
#
# Returns a ggplot object: one curve with interval band per removed_category,
# overlaid with the observed points.
#
# NOTE(review): `data` is not a parameter; it is resolved from the enclosing
# (typically global) environment -- assumed to be the unscaled data set.
plot_logADD_by_logREMOVED <- function(model, someData, ftot, aTeam, aRepo) {
  scalars <- ftot |> select(D, C, team, repo) |> distinct()
  # Exactly one combination of held-constant covariates is expected. The row
  # count is what has to be checked: length() of a one-column data frame is
  # always 1, so the former check could never fail.
  stopifnot(nrow(scalars) == 1)
  duplicates <- unscale_duplicates(scalars$D)
  complex <- unscale_complexity(scalars$C)
  aTeam <- scalars$team
  aRepo <- scalars$repo
  removed_points <- c(P25=q25(data$DEL), P50=q50(data$DEL), P75=q75(data$DEL), P99=q99(data$DEL), SMAX=max(data$DEL))
  # Prepend -Inf so the lowest bin is open-ended: (-Inf, P25], (P25, P50], ...
  cutpoints <- append(scale_removed(removed_points), -Inf, after=0)
  # Bin on the R column of `someData` itself; the previous `d$R` read a
  # variable `d` from outside the function because the local `d` did not exist yet.
  d <- someData |> mutate(removed_category=cut(R, cutpoints, labels=names(removed_points)))
  observed <- bind_cols(d, model$criteria$loo$diagnostics) |> filter(team == aTeam, repo == aRepo)
  return(ftot |> ggplot(aes(x=added)) +
    geom_smooth(aes(y=Estimate, ymin=Q5.5, ymax=Q94.5, group=removed_category, color=removed_category), stat="identity", alpha=.25, linewidth=.5) +
    geom_point(data=observed, aes(y=y, x=unscale_added(A), size = pareto_k, color=removed_category), alpha=0.2) +
    ggtitle(paste0("Conditional effects of team ", aTeam, " in repo ", aRepo),
            paste0("existing duplicates: ", duplicates, ", complexity: ", complex)))
}
## Not converted below...
# Predictions along C (10000 points from -2 to 4) at R = 0, 1, 2 and 3, with
# A and D fixed at 0. All values are on the scaled axes.
#
# model    fitted model handed to `predict()`.
# someData not referenced in the body.
# aTeam    team value written into every prediction row.
# aRepo    repo value written into every prediction row.
#
# Returns the four per-level prediction frames stacked in order of R, with R
# replaced by its character label ("0" .. "3").
condeffect_logCOMPLEX_by_logREMOVED <- function(model, someData, aTeam, aRepo) {
  n_points <- 10000
  c_axis <- seq(from=-2, to=4, length.out=n_points)

  # Predict along the C axis for a single fixed R level.
  predict_for_level <- function(r_level) {
    newdata <- data.frame(R=r_level, A=0, D=0, team=aTeam, repo=aRepo, C=c_axis)
    predicted <- predict(model, newdata=newdata, probs=c(.055, .945)) |>
      data.frame() |>
      bind_cols(newdata)
    # Store the level as text so it plots as a discrete group.
    predicted$R <- as.character(r_level)
    predicted
  }

  per_level <- lapply(c(0, 1, 2, 3), predict_for_level)
  return(do.call(rbind, per_level))
}
# Plot the predictions from condeffect_logCOMPLEX_by_logREMOVED() against C,
# one curve per R level, with the observations of the given team and repo.
#
# model    fitted model; `model$criteria$loo$diagnostics` is column-bound to
#          the observations and supplies `pareto_k` for the point size.
# someData observations; must contain C, R, y, team and repo.
# ftot     prediction frame with C, R, Estimate, Q5.5 and Q94.5.
# aTeam    team whose observations are drawn (also used in the title).
# aRepo    repo whose observations are drawn (also used in the title).
#
# Returns a ggplot object.
plot_logCOMPLEX_by_logREMOVED <- function(model, someData, ftot, aTeam, aRepo) {
  loo_diagnostics <- model$criteria$loo$diagnostics
  # Observations of this team/repo, with R rounded to the nearest integer so
  # the points share the colour key of the curves.
  observed_rows <- someData |>
    bind_cols(loo_diagnostics) |>
    filter(team == aTeam, repo == aRepo) |>
    mutate(truncR=as.factor(round(R)))

  curve_layer <- geom_smooth(aes(y=Estimate, ymin=Q5.5, ymax=Q94.5, group=R, color=R), stat="identity", alpha=.25, linewidth=.5)
  point_layer <- geom_point(data=observed_rows, aes(y=y, size = pareto_k, color=truncR), alpha=0.2)
  heading <- ggtitle(paste0("Conditional effects of team ", aTeam, " in repo ", aRepo))

  ggplot(ftot, aes(x=C)) + curve_layer + point_layer + heading
}
# Predictions along R (10000 points from -2 to 4) at C = -1, 0, 1, 2 and 3,
# with A and D fixed at 0. All values are on the scaled axes.
#
# model    fitted model handed to `predict()`.
# someData not referenced in the body.
# aTeam    team value written into every prediction row.
# aRepo    repo value written into every prediction row.
#
# Returns the five per-level prediction frames stacked in order of C, with C
# replaced by its character label ("-1" .. "3").
condeffect_logREMOVED_by_logCOMPLEX <- function(model, someData, aTeam, aRepo) {
  n_points <- 10000
  r_axis <- seq(from=-2, to=4, length.out=n_points)

  # Predict along the R axis for a single fixed C level.
  predict_for_level <- function(c_level) {
    newdata <- data.frame(C=c_level, A=0, D=0, team=aTeam, repo=aRepo, R=r_axis)
    predicted <- predict(model, newdata=newdata, probs=c(.055, .945)) |>
      data.frame() |>
      bind_cols(newdata)
    # Store the level as text so it plots as a discrete group.
    predicted$C <- as.character(c_level)
    predicted
  }

  per_level <- lapply(c(-1, 0, 1, 2, 3), predict_for_level)
  return(do.call(rbind, per_level))
}
# Plot the predictions from condeffect_logREMOVED_by_logCOMPLEX() against R,
# one curve per C level, with the observations of the given team and repo.
#
# model    fitted model; `model$criteria$loo$diagnostics` is column-bound to
#          the observations and supplies `pareto_k` for the point size.
# someData observations; must contain R, C, y, team and repo.
# ftot     prediction frame with R, C, Estimate, Q5.5 and Q94.5.
# aTeam    team whose observations are drawn (also used in the title).
# aRepo    repo whose observations are drawn (also used in the title).
#
# Returns a ggplot object.
plot_logREMOVED_by_logCOMPLEX <- function(model, someData, ftot, aTeam, aRepo) {
  # The curves are keyed by C, so the points must be coloured by rounded C as
  # well. They were previously coloured by rounded D, which put an unrelated
  # covariate on the shared colour scale.
  observed <- bind_cols(someData, model$criteria$loo$diagnostics) |>
    filter(team == aTeam, repo == aRepo) |>
    mutate(truncC=as.factor(round(C)))
  return(ftot |> ggplot(aes(x=R)) +
    geom_smooth(aes(y=Estimate, ymin=Q5.5, ymax=Q94.5, group=C, color=C), stat="identity", alpha=.25, linewidth=.5) +
    geom_point(data=observed, aes(y=y, size = pareto_k, color=truncC), alpha=0.2) +
    ggtitle(paste0("Conditional effects of team ", aTeam, " in repo ", aRepo))
  )
}
# Predictions along R (10000 points from -2 to 4) at D = -1, 0, 1, 2 and 3,
# with A and C fixed at 0. All values are on the scaled axes.
#
# model    fitted model handed to `predict()`.
# someData not referenced in the body.
# aTeam    team value written into every prediction row.
# aRepo    repo value written into every prediction row.
#
# Returns the five per-level prediction frames stacked in order of D, with D
# replaced by its character label ("-1" .. "3").
condeffect_logREMOVED_by_logDUP <- function(model, someData, aTeam, aRepo) {
  n_points <- 10000
  r_axis <- seq(from=-2, to=4, length.out=n_points)

  # Predict along the R axis for a single fixed D level.
  predict_for_level <- function(d_level) {
    newdata <- data.frame(D=d_level, A=0, C=0, team=aTeam, repo=aRepo, R=r_axis)
    predicted <- predict(model, newdata=newdata, probs=c(.055, .945)) |>
      data.frame() |>
      bind_cols(newdata)
    # Store the level as text so it plots as a discrete group.
    predicted$D <- as.character(d_level)
    predicted
  }

  per_level <- lapply(c(-1, 0, 1, 2, 3), predict_for_level)
  return(do.call(rbind, per_level))
}
# Plot the predictions from condeffect_logREMOVED_by_logDUP() against R, one
# curve per D level, with the observations of the given team and repo.
#
# model    fitted model; `model$criteria$loo$diagnostics` is column-bound to
#          the observations and supplies `pareto_k` for the point size.
# someData observations; must contain R, D, y, team and repo.
# ftot     prediction frame with R, D, Estimate, Q5.5 and Q94.5.
# aTeam    team whose observations are drawn (also used in the title).
# aRepo    repo whose observations are drawn (also used in the title).
#
# Returns a ggplot object.
plot_logREMOVED_by_logDUP <- function(model, someData, ftot, aTeam, aRepo) {
  loo_diagnostics <- model$criteria$loo$diagnostics
  # Observations of this team/repo, with D rounded to the nearest integer so
  # the points share the colour key of the curves.
  observed_rows <- someData |>
    bind_cols(loo_diagnostics) |>
    filter(team == aTeam, repo == aRepo) |>
    mutate(truncD=as.factor(round(D)))

  curve_layer <- geom_smooth(aes(y=Estimate, ymin=Q5.5, ymax=Q94.5, group=D, color=D), stat="identity", alpha=.25, linewidth=.5)
  point_layer <- geom_point(data=observed_rows, aes(y=y, size = pareto_k, color=truncD), alpha=0.2)
  heading <- ggtitle(paste0("Conditional effects of team ", aTeam, " in repo ", aRepo))

  ggplot(ftot, aes(x=R)) + curve_layer + point_layer + heading
}