-
Notifications
You must be signed in to change notification settings - Fork 127
/
cgroup.c
338 lines (285 loc) · 9.14 KB
/
cgroup.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
#define _GNU_SOURCE
#include "cgroup.h"
#include "globals.h"
#include "utils.h"
#include "cli.h"
#include "config.h"
#include <fcntl.h>
#include <glib.h>
#include <stdio.h>
#ifdef __linux__
#include <linux/limits.h>
#include <sys/eventfd.h>
#include <sys/inotify.h>
#include <sys/socket.h>
#include <sys/statfs.h>
#endif
/* Fallback for headers that do not provide the cgroup2 filesystem magic;
 * compared against statfs() f_type in setup_oom_handling(). */
#ifndef CGROUP2_SUPER_MAGIC
#define CGROUP2_SUPER_MAGIC 0x63677270
#endif
/* Prefix prepended to the paths parsed out of /proc/<pid>/cgroup. */
#define CGROUP_ROOT "/sys/fs/cgroup"
/* cgroup v1: eventfd created in setup_oom_handling_cgroup_v1(); -1 when unset. */
int oom_event_fd = -1;
/* cgroup v1: descriptor of memory.oom_control opened in
 * setup_oom_handling_cgroup_v1(); -1 when unset. */
int oom_cgroup_fd = -1;
#ifdef __linux__
/* Forward declarations of the file-local helpers defined further down. */
static char *process_cgroup_subsystem_path(int pid, bool cgroup2, const char *subsystem);
static void setup_oom_handling_cgroup_v2(int pid);
static void setup_oom_handling_cgroup_v1(int pid);
static gboolean oom_cb_cgroup_v2(int fd, GIOCondition condition, G_GNUC_UNUSED gpointer user_data);
static gboolean oom_cb_cgroup_v1(int fd, GIOCondition condition, G_GNUC_UNUSED gpointer user_data);
static int write_oom_files();
/*
 * Entry point for OOM monitoring of the process `pid`.
 *
 * Probes the filesystem mounted at /sys/fs/cgroup. When it reports the
 * cgroup2 magic number, is_cgroup_v2 is set and the v2 setup runs; in every
 * other case (including a failing statfs) the v1 setup runs instead.
 */
void setup_oom_handling(int pid)
{
	struct statfs cgroup_fs;
	int rc = statfs("/sys/fs/cgroup", &cgroup_fs);

	if (rc != 0 || cgroup_fs.f_type != CGROUP2_SUPER_MAGIC) {
		setup_oom_handling_cgroup_v1(pid);
		return;
	}

	is_cgroup_v2 = TRUE;
	setup_oom_handling_cgroup_v2(pid);
}
/*
 * Looks up a cgroup directory of a process by parsing /proc/<pid>/cgroup,
 * whose lines have the form "<id>:<controllers>:<path>".
 *
 * pid:       process whose cgroup file is read.
 * cgroup2:   when true, the path of the first line is returned, prefixed
 *            with CGROUP_ROOT; `subsystem` is not consulted.
 * subsystem: (cgroup2 == false) controller name searched for in the
 *            comma-separated controller list of every line.
 *
 * Returns a newly allocated string owned by the caller, or NULL when the
 * file cannot be opened, a line lacks one of the two ':' separators, or no
 * line lists `subsystem`.
 */
static char *process_cgroup_subsystem_path(int pid, bool cgroup2, const char *subsystem)
{
	_cleanup_free_ char *cgroups_file_path = g_strdup_printf("/proc/%d/cgroup", pid);
	_cleanup_fclose_ FILE *fp = fopen(cgroups_file_path, "re");
	if (fp == NULL) {
		nwarnf("Failed to open cgroups file: %s", cgroups_file_path);
		return NULL;
	}

	_cleanup_free_ char *line = NULL;
	size_t capacity = 0;
	char *ptr, *path;

	while (getline(&line, &capacity, fp) != -1) {
		_cleanup_strv_ char **subsystems = NULL;

		/* Skip the hierarchy id: ptr ends up on the controller list. */
		ptr = strchr(line, ':');
		if (ptr == NULL) {
			nwarnf("Error parsing cgroup, ':' not found: %s", line);
			return NULL;
		}
		ptr++;

		path = strchr(ptr, ':');
		if (path == NULL) {
			nwarnf("Error parsing cgroup, second ':' not found: %s", line);
			return NULL;
		}
		/* Terminate the controller list and step onto the path field. */
		*path = 0;
		path++;

		/* getline() keeps the line terminator. Drop it only when it is
		 * really there, so that a final line without '\n' does not lose
		 * the last character of its path. */
		size_t path_len = strlen(path);
		if (path_len > 0 && path[path_len - 1] == '\n')
			path[path_len - 1] = '\0';

		if (cgroup2)
			return g_strdup_printf("%s%s", CGROUP_ROOT, path);

		subsystems = g_strsplit(ptr, ",", -1);
		for (int i = 0; subsystems[i] != NULL; i++) {
			if (strcmp(subsystems[i], subsystem) != 0)
				continue;

			/* Without '=' in the matched entry the whole controller
			 * list (e.g. "cpu,cpuacct") names the directory under
			 * CGROUP_ROOT. With '=' the entry is cut at that
			 * character, which leaves subpath on an empty string. */
			char *subpath = strchr(subsystems[i], '=');
			if (subpath == NULL) {
				subpath = ptr;
			} else {
				*subpath = 0;
			}

			return g_strdup_printf("%s/%s%s", CGROUP_ROOT, subpath, path);
		}
	}

	return NULL;
}
/*
 * cgroup v2 OOM monitoring for the process `pid`.
 *
 * Stores the process' cgroup directory in the global cgroup2_path, then
 * watches <cgroup2_path>/memory.events for modifications through inotify.
 * On success the inotify descriptor is kept in the global inotify_fd and
 * oom_cb_cgroup_v2 is attached to it as a GLib fd source. Every failure is
 * reported with a warning and leaves monitoring disabled.
 */
static void setup_oom_handling_cgroup_v2(int pid)
{
	cgroup2_path = process_cgroup_subsystem_path(pid, true, "");
	if (!cgroup2_path) {
		nwarn("Failed to get cgroup path. Container may have exited");
		return;
	}

	_cleanup_free_ char *memory_events_file_path = g_build_filename(cgroup2_path, "memory.events", NULL);

	/* Close-on-exec, like every other descriptor opened in this file, so
	 * the watch is not inherited across exec. */
	_cleanup_close_ int ifd = inotify_init1(IN_CLOEXEC);
	if (ifd < 0) {
		nwarnf("Failed to create inotify fd");
		return;
	}

	if (inotify_add_watch(ifd, memory_events_file_path, IN_MODIFY) < 0) {
		nwarnf("Failed to add inotify watch for %s", memory_events_file_path);
		return;
	}

	/* Move ownership to inotify_fd so the scope-exit cleanup leaves it open. */
	inotify_fd = ifd;
	ifd = -1;

	g_unix_fd_add(inotify_fd, G_IO_IN, oom_cb_cgroup_v2, NULL);
}
static void setup_oom_handling_cgroup_v1(int pid)
{
/* Setup OOM notification for container process */
_cleanup_free_ char *memory_cgroup_path = process_cgroup_subsystem_path(pid, false, "memory");
if (!memory_cgroup_path) {
nwarn("Failed to get memory cgroup path. Container may have exited");
return;
}
/* this will be cleaned up in oom_cb_cgroup_v1 */
char *memory_cgroup_file_path = g_build_filename(memory_cgroup_path, "cgroup.event_control", NULL);
_cleanup_close_ int cfd = open(memory_cgroup_file_path, O_WRONLY | O_CLOEXEC);
if (cfd == -1) {
nwarnf("Failed to open %s", memory_cgroup_file_path);
g_free(memory_cgroup_file_path);
return;
}
_cleanup_free_ char *memory_cgroup_file_oom_path = g_build_filename(memory_cgroup_path, "memory.oom_control", NULL);
oom_cgroup_fd = open(memory_cgroup_file_oom_path, O_RDONLY | O_CLOEXEC); /* Not closed */
if (oom_cgroup_fd == -1)
pexitf("Failed to open %s", memory_cgroup_file_oom_path);
if ((oom_event_fd = eventfd(0, EFD_CLOEXEC)) == -1)
pexit("Failed to create eventfd");
_cleanup_free_ char *data = g_strdup_printf("%d %d", oom_event_fd, oom_cgroup_fd);
if (write_all(cfd, data, strlen(data)) < 0) {
/* This used to be fatal, but we make it advisory and stumble on because
* https://github.com/torvalds/linux/commit/2343e88d238f5de973d609d861c505890f94f22e
* disables this interface in PREEMPT_RT kernel configs.
*/
nwarnf("Failed to write to cgroup.event_control");
g_free(memory_cgroup_file_path);
return;
}
g_unix_fd_add(oom_event_fd, G_IO_IN, oom_cb_cgroup_v1, memory_cgroup_file_path);
}
/*
 * GLib fd-source callback for the inotify descriptor watching memory.events.
 *
 * Drains one read's worth of queued inotify data and, when the source
 * signalled G_IO_IN, lets check_cgroup2_oom() decide whether to keep the
 * source. When the source is to be removed, `fd` is closed and the global
 * inotify_fd is reset to -1.
 *
 * Returns G_SOURCE_CONTINUE when the read fails or check_cgroup2_oom() asks
 * to continue, G_SOURCE_REMOVE otherwise.
 */
static gboolean oom_cb_cgroup_v2(int fd, GIOCondition condition, G_GNUC_UNUSED gpointer user_data)
{
	/* Room for one event carrying a file name of maximal length; the
	 * content is discarded. */
	char discard[sizeof(struct inotify_event) + NAME_MAX + 1];

	if (read(fd, &discard, sizeof(discard)) < 0) {
		nwarn("Failed to read oom event from eventfd in v2");
		return G_SOURCE_CONTINUE;
	}

	gboolean verdict = (condition & G_IO_IN) != 0 ? check_cgroup2_oom() : G_SOURCE_REMOVE;
	if (verdict != G_SOURCE_REMOVE)
		return verdict;

	/* End of input */
	close(fd);
	inotify_fd = -1;
	return G_SOURCE_REMOVE;
}
/*
 * GLib fd-source callback for the cgroup v1 OOM eventfd.
 *
 * user_data is the heap-allocated path of the container's
 * cgroup.event_control file; its existence is used to tell whether the
 * cgroup has been removed. The path is freed when the source is removed.
 *
 * Returns G_SOURCE_REMOVE (after closing `fd` and resetting oom_event_fd)
 * when G_IO_IN is not set or the read hits end of input; otherwise
 * G_SOURCE_CONTINUE.
 */
static gboolean oom_cb_cgroup_v1(int fd, GIOCondition condition, gpointer user_data)
{
	char *event_control_path = user_data;
	gboolean cgroup_gone;
	uint64_t pending;
	ssize_t got;

	if ((condition & G_IO_IN) == 0)
		goto stop_watching;

	/* Probe the control file before draining the counter: if it no
	 * longer exists, one of the pending events is the cgroup removal. */
	cgroup_gone = access(event_control_path, F_OK) < 0;
	if (cgroup_gone)
		ndebugf("Memory cgroup removal event received");

	got = read(fd, &pending, sizeof(pending));
	if (got < 0) {
		nwarn("Failed to read oom event from eventfd");
		return G_SOURCE_CONTINUE;
	}
	if (got == 0)
		goto stop_watching;
	if (got != sizeof(pending)) {
		nwarn("Failed to read full oom event from eventfd");
		return G_SOURCE_CONTINUE;
	}

	ndebugf("Memory cgroup event count: %ld", (long)pending);
	if (pending == 0) {
		nwarn("Unexpected event count (zero) when reading for oom event");
		return G_SOURCE_CONTINUE;
	}

	/* Three situations are possible:
	 *   - OOM kill only            (1 event)
	 *   - cgroup removal only      (1 event)
	 *   - OOM kill and removal     (2 events)
	 * Only a single event combined with a missing cgroup is not an OOM.
	 */
	if (pending > 1 || !cgroup_gone) {
		ninfo("OOM event received");
		write_oom_files();
	}
	return G_SOURCE_CONTINUE;

stop_watching:
	/* End of input */
	close(fd);
	oom_event_fd = -1;
	g_free(event_control_path);
	return G_SOURCE_REMOVE;
}
/*
 * Scans <cgroup2_path>/memory.events for the "oom" and "oom_kill" counters
 * and records an OOM through write_oom_files() when the first non-zero
 * counter found differs from the value recorded by the previous successful
 * call (kept in a function-local static).
 *
 * Returns G_SOURCE_CONTINUE when the file cannot be opened or a non-zero
 * counter was found; G_SOURCE_REMOVE when cgroup v2 is not in use or no
 * non-zero counter was found.
 *
 * NOTE(review): returning G_SOURCE_REMOVE when the file holds no non-zero
 * counter also ends the inotify watch in oom_cb_cgroup_v2 - confirm this
 * is intended.
 */
gboolean check_cgroup2_oom()
{
	static long int last_counter = 0;

	if (!is_cgroup_v2)
		return G_SOURCE_REMOVE;

	_cleanup_free_ char *memory_events_file_path = g_build_filename(cgroup2_path, "memory.events", NULL);

	_cleanup_fclose_ FILE *fp = fopen(memory_events_file_path, "re");
	if (fp == NULL) {
		nwarnf("Failed to open cgroups file: %s", memory_events_file_path);
		return G_SOURCE_CONTINUE;
	}

	_cleanup_free_ char *line = NULL;
	/* Buffer capacity managed by getline(); must not be reused for
	 * anything else between calls. */
	size_t capacity = 0;
	ssize_t nread;

	while ((nread = getline(&line, &capacity, fp)) != -1) {
		long int counter;
		const int oom_len = 4, oom_kill_len = 9;
		/* Offset of the numeric value behind the matched keyword. */
		size_t value_offset;

		/* The "+ 2" requires at least one value character plus the
		 * line terminator after the keyword. */
		if (nread >= oom_kill_len + 2 && memcmp(line, "oom_kill ", oom_kill_len) == 0)
			value_offset = oom_kill_len;
		else if (nread >= oom_len + 2 && memcmp(line, "oom ", oom_len) == 0)
			value_offset = oom_len;
		else
			continue;

		counter = strtol(&line[value_offset], NULL, 10);

		/* LONG_MAX is what strtol() returns on overflow. */
		if (counter == LONG_MAX) {
			nwarnf("Failed to parse: %s", &line[value_offset]);
			continue;
		}

		if (counter == 0)
			continue;

		/* Remember the counter only when the marker files were written,
		 * so that a failed write is retried on the next call. */
		if (counter != last_counter) {
			if (write_oom_files() == 0)
				last_counter = counter;
		}
		return G_SOURCE_CONTINUE;
	}
	return G_SOURCE_REMOVE;
}
/* write the appropriate files to tell the caller there was an oom event
* this can be used for v1 and v2 OOMS
* returns 0 on success, negative value on failure
*/
static int write_oom_files()
{
ninfo("OOM received");
if (opt_persist_path) {
_cleanup_free_ char *ctr_oom_file_path = g_build_filename(opt_persist_path, "oom", NULL);
_cleanup_close_ int ctr_oom_fd = open(ctr_oom_file_path, O_CREAT | O_CLOEXEC, 0666);
if (ctr_oom_fd < 0) {
nwarn("Failed to write oom file");
}
}
_cleanup_close_ int oom_fd = open("oom", O_CREAT | O_CLOEXEC, 0666);
if (oom_fd < 0) {
nwarn("Failed to write oom file");
}
return oom_fd >= 0 ? 0 : -1;
}
#endif