-
Notifications
You must be signed in to change notification settings - Fork 0
/
analysis.qmd
322 lines (268 loc) · 11.2 KB
/
analysis.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
---
title: "Apple Silicon power and frequency analysis"
format:
  html:
    toc: true
    embed-resources: true
    df-print: paged
---
## Data overview
```{r}
#| echo: false
#| include: false
library(tidyverse)
library(colorspace)
# Load every CSV in `results/`; each file is one test run. The file vector is
# unnamed, so imap() passes each file's position as the second argument, which
# becomes the run ID stored in `.source`.
result_files <- list.files("results", "\\.csv$", full.names = TRUE)
# fail early with a clear message; with no files the pipeline below would
# otherwise stop on a missing `thread_id` column
if (length(result_files) == 0) {
  stop("no CSV files found in 'results'", call. = FALSE)
}
data <- imap(result_files, function(path, run_id) {
  tibble(
    .source = run_id,
    read_csv(path, show_col_types = FALSE)
  )
}) %>%
  bind_rows() %>%
  # single- vs. multi-core results: a run with exactly one distinct thread is
  # "single" (the test is scalar per run, so plain if/else is enough)
  mutate(
    type = if (n_distinct(thread_id) == 1) "single" else "multi",
    .by = .source,
    .after = .source
  ) %>%
  # P-core counters
  mutate(
    # P-core frequency (GHz) and power; reported as 0 when the thread spent
    # (almost) no time on a P-core, to avoid a near-zero denominator
    p_freq = ifelse(p_time > 0.01, p_cycles / p_time / 1e9, 0),
    p_power = ifelse(p_time > 0.01, p_energy / p_time, 0),
    # relative thread time spent on P-cores
    p_usage = p_time / (p_time + e_time),
    .before = p_cycles
  ) %>%
  # E-core counters
  mutate(
    # E-core frequency (GHz) and power, with the same zero-denominator guard
    e_freq = ifelse(e_time > 0.01, e_cycles / e_time / 1e9, 0),
    e_power = ifelse(e_time > 0.01, e_energy / e_time, 0),
    # relative thread time spent on E-cores
    e_usage = e_time / (p_time + e_time),
    .before = e_cycles
  ) %>%
  # additional aggregated stats per thread
  mutate(
    # total power over both core types
    power = (p_energy + e_energy) / (p_time + e_time),
    # throughput (items/sec, P- and E-cores together)
    items_tp = items / (p_time + e_time),
    # which core types the thread ran on (more than 5% of its time each);
    # NA if neither condition holds
    cores = case_when(
      p_usage > 0.05 & e_usage > 0.05 ~ "P,E",
      p_usage > 0.05 ~ "P",
      e_usage > 0.05 ~ "E"
    ),
    .after = powermode
  ) %>%
  # device detection: map hardware model identifiers to chip names
  mutate(
    device = case_when(
      device %in% c("iPhone12,1", "iPhone12,8") ~ "A13",
      device == "iPhone13,3" ~ "A14",
      # iPad14,1 is the iPad Mini 6
      device %in% c("iPad14,1", "iPhone14,2") ~ "A15",
      device %in% c("iPhone15,2", "iPhone15,3") ~ "A16",
      device == "iPhone16,1" ~ "A17 Pro",
      # M1 family: Max, Pro, Ultra
      device %in% c("MacBookPro18,2", "MacBookPro18,3", "Mac13,2") ~ "M1",
      # M2 family: Max (first three), Ultra
      device %in% c("Mac14,5", "Mac14,13", "Mac14,6", "Mac14,14") ~ "M2",
      TRUE ~ "unknown"
    )
  )
# guard against unknown devices
stopifnot(!any(data$device == "unknown"))
```
The test data is shown in the table below. It has the following columns:
- **.source** the test run ID
- **type** single- or multi-core run
- **sample** test iteration
- **thread_id** thread (unique across samples)
- **device** the device type
- **powermode** high vs. low power
- **power** total power usage of the thread per sample
- **items_tp** work throughput per thread per sample (items/sec)
- **cores** types of cores the thread was running on
- **p_freq** average frequency of the P-core while running the thread (GHz)
- **p_power** average power draw of the P-core while running the thread (watt)
- **p_usage** relative amount of time the thread spent running on a P-core
- **p_cycles** number of cycles spent on a P-core
- **p_time** amount of time spent on a P-core
- **p_energy** energy used by P-core running the thread (J)
- **e_...** same as above, but for E-cores
- **items** number of work items processed by the thread for this sample
```{r}
# print the assembled per-thread table (the document sets `df-print: paged`)
data
```
## Power and frequency by device
Power draw distribution across devices in high power mode
```{r}
# Violin plot of the total power draw per sample, by device, in high power mode.
data %>%
  filter(
    powermode == "high",
    # remove multi-core runs for Macs, since they make the graph hard to read
    startsWith(device, "A") | type == "single"
  ) %>%
  # sum the power of all threads per sample (to get proper multi-core power usage)
  summarize(power = sum(power), .by = c(.source, sample, type, device)) %>%
  ggplot(aes(x = device, y = power, fill = type)) +
  geom_violin(alpha = 0.5, scale = "width") +
  ggtitle("Power consumption distribution by device (high power mode)") +
  # power is plotted on the y axis, so the unit label belongs there
  xlab("Device") +
  ylab("Average power draw, watts")
```
Power draw distribution across devices in low power mode
```{r}
# Violin plot of the total power draw per sample, by device, in low power mode.
data %>%
  filter(
    powermode == "low",
    # remove multi-core runs for Macs, since they make the graph hard to read
    startsWith(device, "A") | type == "single"
  ) %>%
  # sum the power of all threads per sample (to get proper multi-core power usage)
  summarize(power = sum(power), .by = c(.source, sample, type, device)) %>%
  ggplot(aes(x = device, y = power, fill = type)) +
  geom_violin(alpha = 0.5, scale = "width") +
  ggtitle("Power consumption distribution by device (low power mode)") +
  # power is plotted on the y axis, so the unit label belongs there
  xlab("Device") +
  ylab("Average power draw, watts")
```
P-thread power/frequency relationship by device (power curve). We only consider thread samples where at least 10% of the time was spent on a P-core, to avoid obviously unreliable numbers. The data for the A-series shows a considerable amount of variation because we combine single- and multi-core runs as well as high- and low-power modes (and the phones throttle their frequency over time, giving us a larger frequency range within these scenarios). The M-series points appear more tightly grouped because these chips are less prone to frequency adjustments over time, thanks to their higher thermal dissipation capability.
```{r}
# Scatter plot of P-core power against P-core frequency, coloured by device.
# NOTE(review): the accompanying text describes a 10% P-core time threshold,
# but the filter below keeps samples with p_power > 0.1; confirm which is meant.
data %>%
  filter(p_power > 0.1) %>%
  ggplot(aes(x = p_freq, y = p_power, color = device)) +
  geom_point(size = 0.75, alpha = 0.5) +
  scale_x_continuous(n.breaks = 10) +
  scale_y_continuous(n.breaks = 10) +
  labs(
    title = "P-core power curve by device",
    x = "CPU frequency, GHz",
    y = "Thread power draw, watts"
  )
```
Same, but for E-cores
```{r}
# Scatter plot of E-core power against E-core frequency, coloured by device.
# Only samples with e_power > 0.1 are shown.
data %>%
  filter(e_power > 0.1) %>%
  ggplot(aes(x = e_freq, y = e_power, color = device)) +
  geom_point(size = 0.75, alpha = 0.5) +
  scale_x_continuous(n.breaks = 10) +
  scale_y_continuous(n.breaks = 10) +
  labs(
    title = "E-core power curve by device",
    x = "CPU frequency, GHz",
    y = "Thread power draw, watts"
  )
```
## Work throughput and energy efficiency
Please take all of this with a grain of salt, since the test is very simplistic and does not account for IPC differences between the micro-architectures. This is just about rough behavior; work efficiency should be measured on concrete workloads.
Work throughput (items/sec) for each device (high power, single-core results). Note that there is no good way to know how much work was done
on P-cores versus E-cores, so we simply combine the two. The different proportions of core involvement across threads produce the "cloud" effect.
```{r}
# Scatter plot of work throughput against power for single-core runs in high
# power mode, coloured by device.
# `items_tp` is the precomputed items / (p_time + e_time).
# NOTE(review): y adds the per-core-type average powers (p_power + e_power),
# which is not the time-weighted thread power stored in `power`; confirm that
# this is the intended measure.
data %>%
  filter(powermode == "high", type == "single") %>%
  ggplot(aes(x = items_tp, y = p_power + e_power, color = device)) +
  geom_point(size = 0.75, alpha = 0.75) +
  scale_color_discrete_qualitative("Set2") +
  scale_x_continuous(n.breaks = 10) +
  scale_y_continuous(n.breaks = 10) +
  ggtitle("Work throughput by power usage (high power, single-core)") +
  ylab("Thread power (P+E combined), watts") +
  xlab("Work throughput, items/s")
Energy (in J) used to perform the work
```{r}
# Scatter plot of the energy used (P- and E-core energy added together) against
# the number of work items, for single-core runs in high power mode.
data %>%
  filter(powermode == "high", type == "single") %>%
  ggplot(aes(x = items, y = p_energy + e_energy, color = device)) +
  geom_point(size = 0.75, alpha = 0.75) +
  scale_color_discrete_qualitative("Set2") +
  scale_x_continuous(n.breaks = 10) +
  scale_y_continuous(n.breaks = 10) +
  labs(
    title = "Energy usage and work per thread (high power, single-core)",
    x = "Total work performed by thread (items) ",
    y = "Energy used (J)"
  )
```
## Power curve forecasting
This is an attempt to estimate how the A17 power curve will continue beyond the observed range. We use the
constrained regression algorithms from the package `colf` to force the coefficients to be non-negative (as
increasing frequency cannot lower power consumption). Please take this with a big grain of salt, as this is likely overfitting the data.
```{r}
# P-core data for A17 Pro (samples with p_usage > 0.2), with the powers of the
# frequency added as separate columns to serve as polynomial terms
a17_data <- data %>%
  filter(device == "A17 Pro", p_usage > 0.2) %>%
  mutate(
    p_freq2 = p_freq^2,
    p_freq3 = p_freq^3,
    p_freq4 = p_freq^4,
    p_freq5 = p_freq^5,
    p_freq6 = p_freq^6
  )
# fit polynomials of degree 1 to 6 using constrained linear regression;
# `lower` leaves the first parameter (presumably the intercept) unbounded and
# bounds the remaining coefficients below by 0
m1 <- colf::colf_nlxb(p_power ~ 1 + p_freq, a17_data, lower = c(-Inf, 0))
m2 <- colf::colf_nlxb(
  p_power ~ 1 + p_freq + p_freq2,
  a17_data,
  lower = c(-Inf, 0, 0)
)
m3 <- colf::colf_nlxb(
  p_power ~ 1 + p_freq + p_freq2 + p_freq3,
  a17_data,
  lower = c(-Inf, 0, 0, 0)
)
m4 <- colf::colf_nlxb(
  p_power ~ 1 + p_freq + p_freq2 + p_freq3 + p_freq4,
  a17_data,
  lower = c(-Inf, 0, 0, 0, 0)
)
m5 <- colf::colf_nlxb(
  p_power ~ 1 + p_freq + p_freq2 + p_freq3 + p_freq4 + p_freq5,
  a17_data,
  lower = c(-Inf, 0, 0, 0, 0, 0)
)
m6 <- colf::colf_nlxb(
  p_power ~ 1 + p_freq + p_freq2 + p_freq3 + p_freq4 + p_freq5 + p_freq6,
  a17_data,
  lower = c(-Inf, 0, 0, 0, 0, 0, 0)
)
# there is not much difference beyond the fourth-degree polynomial
a17_curve <- m4
# set up the prediction grid (1 to 5 GHz in 0.05 GHz steps)
a17_prediction <- tibble(
  p_freq = seq(1, 5, by = 0.05),
  p_freq2 = p_freq^2,
  p_freq3 = p_freq^3,
  p_freq4 = p_freq^4,
  p_freq5 = p_freq^5,
  p_freq6 = p_freq^6
)
# predict from the selected model, so changing `a17_curve` changes the plot
a17_prediction$p_power <- predict(a17_curve, a17_prediction)
ggplot(a17_prediction) +
  geom_path(
    aes(x = p_freq, y = p_power),
    color = "cyan",
    linetype = "dashed"
  ) +
  # add the actual data
  geom_point(
    aes(x = p_freq, y = p_power),
    size = 0.75,
    alpha = 0.75,
    data = a17_data
  ) +
  ggtitle("Predicted frequency curve for A17 Pro") +
  ylab("Thread power draw, watts") +
  xlab("CPU frequency, GHz")
```
Compare this to A15/M2
```{r}
# P-core data for A15 (samples with p_usage > 0.2), with the powers of the
# frequency added as separate columns to serve as polynomial terms
a15_data <- data %>%
  filter(device == "A15", p_usage > 0.2) %>%
  mutate(
    p_freq2 = p_freq^2,
    p_freq3 = p_freq^3,
    p_freq4 = p_freq^4,
    p_freq5 = p_freq^5,
    p_freq6 = p_freq^6
  )
# fit polynomials of degree 1 to 6 using constrained linear regression;
# `lower` leaves the first parameter (presumably the intercept) unbounded and
# bounds the remaining coefficients below by 0
m1 <- colf::colf_nlxb(p_power ~ 1 + p_freq, a15_data, lower = c(-Inf, 0))
m2 <- colf::colf_nlxb(
  p_power ~ 1 + p_freq + p_freq2,
  a15_data,
  lower = c(-Inf, 0, 0)
)
m3 <- colf::colf_nlxb(
  p_power ~ 1 + p_freq + p_freq2 + p_freq3,
  a15_data,
  lower = c(-Inf, 0, 0, 0)
)
m4 <- colf::colf_nlxb(
  p_power ~ 1 + p_freq + p_freq2 + p_freq3 + p_freq4,
  a15_data,
  lower = c(-Inf, 0, 0, 0, 0)
)
m5 <- colf::colf_nlxb(
  p_power ~ 1 + p_freq + p_freq2 + p_freq3 + p_freq4 + p_freq5,
  a15_data,
  lower = c(-Inf, 0, 0, 0, 0, 0)
)
m6 <- colf::colf_nlxb(
  p_power ~ 1 + p_freq + p_freq2 + p_freq3 + p_freq4 + p_freq5 + p_freq6,
  a15_data,
  lower = c(-Inf, 0, 0, 0, 0, 0, 0)
)
# the sixth-degree polynomial offers the best fit and predicts M2 data well!
a15_curve <- m6
# set up the prediction grid (1 to 5 GHz in 0.05 GHz steps)
a15_prediction <- tibble(
  p_freq = seq(1, 5, by = 0.05),
  p_freq2 = p_freq^2,
  p_freq3 = p_freq^3,
  p_freq4 = p_freq^4,
  p_freq5 = p_freq^5,
  p_freq6 = p_freq^6
)
a15_prediction$p_power <- predict(a15_curve, a15_prediction)
ggplot(a15_prediction) +
  geom_path(
    aes(x = p_freq, y = p_power),
    color = "cyan",
    linetype = "dashed"
  ) +
  # add the actual data (A15 and M2 samples)
  geom_point(
    aes(x = p_freq, y = p_power),
    size = 0.75,
    alpha = 0.75,
    data = filter(data, device %in% c("A15", "M2"), p_usage > 0.2)
  ) +
  # add the A17 curve for comparison (uses `a17_prediction`, which must
  # already exist when this chunk runs)
  geom_path(
    aes(x = p_freq, y = p_power),
    color = "grey",
    linetype = "dashed",
    data = a17_prediction
  ) +
  # label the A17 curve at 4.5 GHz; near() avoids an exact floating-point
  # comparison against the seq() grid
  geom_label(
    aes(x = p_freq, y = p_power),
    label = "a17",
    data = filter(a17_prediction, near(p_freq, 4.5))
  ) +
  # this plot shows the A15 fit
  ggtitle("Predicted frequency curve for A15") +
  ylab("Thread power draw, watts") +
  xlab("CPU frequency, GHz")
```