Skip to content

Commit 59331b8

Browse files
committed
oom: implement avoid/omit xattr support
There may be situations where a cgroup should be protected from killing or deprioritized as a candidate. In FB oomd xattrs are used to bias oomd away from supervisor cgroups and towards worker cgroups in container tasks. On desktops this can be used to protect important units with unpredictable resource consumption. The patch allows systemd-oomd to understand 2 xattrs: "user.oomd_avoid" and "user.oomd_omit". If systemd-oomd sees these xattrs set to 1 on a candidate cgroup (i.e. while attempting to kill something) AND the cgroup is owned by root, it will either deprioritize the cgroup as a candidate (avoid) or remove it completely as a candidate (omit). Usage is restricted to root owned cgroups to prevent situations where an unprivileged user can set their own cgroups lower in the kill priority than another user's (and prevent them from omitting their units from systemd-oomd killing).
1 parent 242d75b commit 59331b8

File tree

8 files changed

+169
-13
lines changed

8 files changed

+169
-13
lines changed

src/basic/cgroup-util.c

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -635,6 +635,20 @@ int cg_get_xattr_malloc(const char *controller, const char *path, const char *na
635635
return r;
636636
}
637637

638+
int cg_get_xattr_bool(const char *controller, const char *path, const char *name) {
639+
_cleanup_free_ char *val = NULL;
640+
int r;
641+
642+
assert(path);
643+
assert(name);
644+
645+
r = cg_get_xattr_malloc(controller, path, name, &val);
646+
if (r < 0)
647+
return r;
648+
649+
return parse_boolean(val);
650+
}
651+
638652
int cg_remove_xattr(const char *controller, const char *path, const char *name) {
639653
_cleanup_free_ char *fs = NULL;
640654
int r;
@@ -1703,6 +1717,25 @@ int cg_get_attribute_as_bool(const char *controller, const char *path, const cha
17031717
return 0;
17041718
}
17051719

1720+
int cg_get_owner(const char *controller, const char *path, uid_t *ret_uid) {
1721+
_cleanup_free_ char *f = NULL;
1722+
struct stat stats;
1723+
int r;
1724+
1725+
assert(ret_uid);
1726+
1727+
r = cg_get_path(controller, path, NULL, &f);
1728+
if (r < 0)
1729+
return r;
1730+
1731+
r = stat(f, &stats);
1732+
if (r < 0)
1733+
return -errno;
1734+
1735+
*ret_uid = stats.st_uid;
1736+
return 0;
1737+
}
1738+
17061739
int cg_get_keyed_attribute_full(
17071740
const char *controller,
17081741
const char *path,

src/basic/cgroup-util.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,10 +212,13 @@ int cg_get_attribute_as_uint64(const char *controller, const char *path, const c
212212
int cg_get_attribute_as_bool(const char *controller, const char *path, const char *attribute, bool *ret);
213213

214214
int cg_set_access(const char *controller, const char *path, uid_t uid, gid_t gid);
215+
int cg_get_owner(const char *controller, const char *path, uid_t *ret_uid);
215216

216217
int cg_set_xattr(const char *controller, const char *path, const char *name, const void *value, size_t size, int flags);
217218
int cg_get_xattr(const char *controller, const char *path, const char *name, void *value, size_t size);
218219
int cg_get_xattr_malloc(const char *controller, const char *path, const char *name, char **ret);
220+
/* Returns negative on error, and 0 or 1 on success for the bool value */
221+
int cg_get_xattr_bool(const char *controller, const char *path, const char *name);
219222
int cg_remove_xattr(const char *controller, const char *path, const char *name);
220223

221224
int cg_install_release_agent(const char *controller, const char *agent);

src/oom/oomd-util.c

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
#include <sys/xattr.h>
44
#include <unistd.h>
55

6-
#include "cgroup-util.h"
76
#include "fd-util.h"
87
#include "format-util.h"
98
#include "oomd-util.h"
@@ -159,7 +158,8 @@ int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const cha
159158
return -ENOMEM;
160159

161160
HASHMAP_FOREACH(item, h) {
162-
if (item->path && prefix && !path_startswith(item->path, prefix))
161+
/* Skip over cgroups that are not valid candidates or are explicitly marked for omission */
162+
if ((item->path && prefix && !path_startswith(item->path, prefix)) || item->preference == MANAGED_OOM_PREFERENCE_OMIT)
163163
continue;
164164

165165
sorted[k++] = item;
@@ -219,9 +219,10 @@ int oomd_kill_by_pgscan(Hashmap *h, const char *prefix, bool dry_run) {
219219
return r;
220220

221221
for (int i = 0; i < r; i++) {
222-
/* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure */
222+
/* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure. */
223+
/* Don't break since there might be "avoid" cgroups at the end. */
223224
if (sorted[i]->pgscan == 0 && sorted[i]->current_memory_usage == 0)
224-
break;
225+
continue;
225226

226227
r = oomd_cgroup_kill(sorted[i]->path, true, dry_run);
227228
if (r > 0 || r == -ENOMEM)
@@ -244,8 +245,10 @@ int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run) {
244245
/* Try to kill cgroups with non-zero swap usage until we either succeed in
245246
* killing or we get to a cgroup with no swap usage. */
246247
for (int i = 0; i < r; i++) {
248+
/* Skip over cgroups with no resource usage. Don't break since there might be "avoid"
249+
* cgroups at the end. */
247250
if (sorted[i]->swap_usage == 0)
248-
break;
251+
continue;
249252

250253
r = oomd_cgroup_kill(sorted[i]->path, true, dry_run);
251254
if (r > 0 || r == -ENOMEM)
@@ -259,6 +262,7 @@ int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) {
259262
_cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL;
260263
_cleanup_free_ char *p = NULL, *val = NULL;
261264
bool is_root;
265+
uid_t uid;
262266
int r;
263267

264268
assert(path);
@@ -269,6 +273,7 @@ int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) {
269273
return -ENOMEM;
270274

271275
is_root = empty_or_root(path);
276+
ctx->preference = MANAGED_OOM_PREFERENCE_NONE;
272277

273278
r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, "memory.pressure", &p);
274279
if (r < 0)
@@ -278,6 +283,23 @@ int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) {
278283
if (r < 0)
279284
return log_debug_errno(r, "Error parsing memory pressure from %s: %m", p);
280285

286+
r = cg_get_owner(SYSTEMD_CGROUP_CONTROLLER, path, &uid);
287+
if (r < 0)
288+
log_debug_errno(r, "Failed to get owner/group from %s: %m", path);
289+
else if (uid == 0) {
290+
/* Ignore most errors when reading the xattr since it is usually unset and cgroup xattrs are only used
291+
* as an optional feature of systemd-oomd (and the system might not even support them). */
292+
r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, path, "user.oomd_avoid");
293+
if (r == -ENOMEM)
294+
return r;
295+
ctx->preference = r == 1 ? MANAGED_OOM_PREFERENCE_AVOID : ctx->preference;
296+
297+
r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, path, "user.oomd_omit");
298+
if (r == -ENOMEM)
299+
return r;
300+
ctx->preference = r == 1 ? MANAGED_OOM_PREFERENCE_OMIT : ctx->preference;
301+
}
302+
281303
if (is_root) {
282304
r = procfs_memory_get_used(&ctx->current_memory_usage);
283305
if (r < 0)

src/oom/oomd-util.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
#include <stdbool.h>
55

6+
#include "cgroup-util.h"
67
#include "hashmap.h"
78
#include "psi-util.h"
89

@@ -29,6 +30,8 @@ struct OomdCGroupContext {
2930
uint64_t last_pgscan;
3031
uint64_t pgscan;
3132

33+
ManagedOOMPreference preference;
34+
3235
/* These are only used by oomd_pressure_above for acting on high memory pressure. */
3336
loadavg_t mem_pressure_limit;
3437
usec_t mem_pressure_duration_usec;
@@ -61,12 +64,18 @@ bool oomd_memory_reclaim(Hashmap *h);
6164
/* Returns true if the amount of swap free is below the percentage of swap specified by `threshold_percent`. */
6265
bool oomd_swap_free_below(const OomdSystemContext *ctx, uint64_t threshold_percent);
6366

67+
/* The compare functions will sort from largest to smallest, putting all the contexts with "avoid" at the end
68+
* (after the smallest values). */
6469
static inline int compare_pgscan_and_memory_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) {
6570
int r;
6671

6772
assert(c1);
6873
assert(c2);
6974

75+
r = CMP((*c1)->preference, (*c2)->preference);
76+
if (r != 0)
77+
return r;
78+
7079
r = CMP((*c2)->pgscan, (*c1)->pgscan);
7180
if (r != 0)
7281
return r;
@@ -75,9 +84,15 @@ static inline int compare_pgscan_and_memory_usage(OomdCGroupContext * const *c1,
7584
}
7685

7786
static inline int compare_swap_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) {
87+
int r;
88+
7889
assert(c1);
7990
assert(c2);
8091

92+
r = CMP((*c1)->preference, (*c2)->preference);
93+
if (r != 0)
94+
return r;
95+
8196
return CMP((*c2)->swap_usage, (*c1)->swap_usage);
8297
}
8398

src/oom/test-oomd-util.c

Lines changed: 56 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,8 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
8989
_cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL;
9090
_cleanup_free_ char *cgroup = NULL;
9191
OomdCGroupContext *c1, *c2;
92+
bool test_xattrs;
93+
int r;
9294

9395
if (geteuid() != 0)
9496
return (void) log_tests_skipped("not root");
@@ -101,6 +103,16 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
101103

102104
assert_se(cg_pid_get_path(NULL, 0, &cgroup) >= 0);
103105

106+
/* If we don't have permissions to set xattrs we're likely in a userns or missing capabilities
107+
* so skip the xattr portions of the test. */
108+
r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_test", "1", 1, 0);
109+
test_xattrs = !ERRNO_IS_PRIVILEGE(r) && !ERRNO_IS_NOT_SUPPORTED(r);
110+
111+
if (test_xattrs) {
112+
assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_omit", "1", 1, 0) >= 0);
113+
assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_avoid", "1", 1, 0) >= 0);
114+
}
115+
104116
assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0);
105117

106118
assert_se(streq(ctx->path, cgroup));
@@ -110,12 +122,28 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
110122
assert_se(ctx->swap_usage == 0);
111123
assert_se(ctx->last_pgscan == 0);
112124
assert_se(ctx->pgscan == 0);
125+
/* omit takes precedence over avoid when both are set to true */
126+
if (test_xattrs)
127+
assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_OMIT);
128+
else
129+
assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_NONE);
130+
ctx = oomd_cgroup_context_free(ctx);
131+
132+
/* also check when only avoid is set to true */
133+
if (test_xattrs) {
134+
assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_omit", "0", 1, 0) >= 0);
135+
assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_avoid", "1", 1, 0) >= 0);
136+
}
137+
assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0);
138+
if (test_xattrs)
139+
assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_AVOID);
113140
ctx = oomd_cgroup_context_free(ctx);
114141

115142
/* Test the root cgroup */
116143
assert_se(oomd_cgroup_context_acquire("", &ctx) == 0);
117144
assert_se(streq(ctx->path, "/"));
118145
assert_se(ctx->current_memory_usage > 0);
146+
assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_NONE);
119147

120148
/* Test hashmap inserts */
121149
assert_se(h1 = hashmap_new(&oomd_cgroup_ctx_hash_ops));
@@ -137,6 +165,14 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
137165
assert_se(c2->last_pgscan == 5555);
138166
assert_se(c2->mem_pressure_limit == 6789);
139167
assert_se(c2->last_hit_mem_pressure_limit == 42);
168+
169+
/* Assert that avoid/omit are not set if the cgroup is not owned by root */
170+
if (test_xattrs) {
171+
ctx = oomd_cgroup_context_free(ctx);
172+
assert_se(cg_set_access(SYSTEMD_CGROUP_CONTROLLER, cgroup, 65534, 0) >= 0);
173+
assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0);
174+
assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_NONE);
175+
}
140176
}
141177

142178
static void test_oomd_system_context_acquire(void) {
@@ -287,9 +323,11 @@ static void test_oomd_sort_cgroups(void) {
287323
char **paths = STRV_MAKE("/herp.slice",
288324
"/herp.slice/derp.scope",
289325
"/herp.slice/derp.scope/sheep.service",
290-
"/zupa.slice");
326+
"/zupa.slice",
327+
"/omitted.slice",
328+
"/avoid.slice");
291329

292-
OomdCGroupContext ctx[4] = {
330+
OomdCGroupContext ctx[6] = {
293331
{ .path = paths[0],
294332
.swap_usage = 20,
295333
.pgscan = 60,
@@ -306,6 +344,14 @@ static void test_oomd_sort_cgroups(void) {
306344
.swap_usage = 10,
307345
.pgscan = 80,
308346
.current_memory_usage = 10 },
347+
{ .path = paths[4],
348+
.swap_usage = 90,
349+
.pgscan = 100,
350+
.preference = MANAGED_OOM_PREFERENCE_OMIT },
351+
{ .path = paths[5],
352+
.swap_usage = 99,
353+
.pgscan = 200,
354+
.preference = MANAGED_OOM_PREFERENCE_AVOID },
309355
};
310356

311357
assert_se(h = hashmap_new(&string_hash_ops));
@@ -314,26 +360,32 @@ static void test_oomd_sort_cgroups(void) {
314360
assert_se(hashmap_put(h, "/herp.slice/derp.scope", &ctx[1]) >= 0);
315361
assert_se(hashmap_put(h, "/herp.slice/derp.scope/sheep.service", &ctx[2]) >= 0);
316362
assert_se(hashmap_put(h, "/zupa.slice", &ctx[3]) >= 0);
363+
assert_se(hashmap_put(h, "/omitted.slice", &ctx[4]) >= 0);
364+
assert_se(hashmap_put(h, "/avoid.slice", &ctx[5]) >= 0);
317365

318-
assert_se(oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted_cgroups) == 4);
366+
assert_se(oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted_cgroups) == 5);
319367
assert_se(sorted_cgroups[0] == &ctx[1]);
320368
assert_se(sorted_cgroups[1] == &ctx[2]);
321369
assert_se(sorted_cgroups[2] == &ctx[0]);
322370
assert_se(sorted_cgroups[3] == &ctx[3]);
371+
assert_se(sorted_cgroups[4] == &ctx[5]);
323372
sorted_cgroups = mfree(sorted_cgroups);
324373

325-
assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, NULL, &sorted_cgroups) == 4);
374+
assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, NULL, &sorted_cgroups) == 5);
326375
assert_se(sorted_cgroups[0] == &ctx[3]);
327376
assert_se(sorted_cgroups[1] == &ctx[0]);
328377
assert_se(sorted_cgroups[2] == &ctx[2]);
329378
assert_se(sorted_cgroups[3] == &ctx[1]);
379+
assert_se(sorted_cgroups[4] == &ctx[5]);
330380
sorted_cgroups = mfree(sorted_cgroups);
331381

332382
assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, "/herp.slice/derp.scope", &sorted_cgroups) == 2);
333383
assert_se(sorted_cgroups[0] == &ctx[2]);
334384
assert_se(sorted_cgroups[1] == &ctx[1]);
335385
assert_se(sorted_cgroups[2] == 0);
336386
assert_se(sorted_cgroups[3] == 0);
387+
assert_se(sorted_cgroups[4] == 0);
388+
assert_se(sorted_cgroups[5] == 0);
337389
sorted_cgroups = mfree(sorted_cgroups);
338390
}
339391

test/test-functions

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ BASICTOOLS=(
124124
rmdir
125125
sed
126126
seq
127+
setfattr
127128
setfont
128129
setsid
129130
sfdisk
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
[Unit]
2+
Description=Create some memory pressure
3+
4+
[Service]
5+
MemoryHigh=2M
6+
Slice=testsuite-56-workload.slice
7+
ExecStart=/usr/lib/systemd/tests/testdata/units/testsuite-56-slowgrowth.sh

test/units/testsuite-56.sh

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,20 +23,43 @@ oomctl | grep "/testsuite-56-workload.slice"
2323
oomctl | grep "1.00%"
2424
oomctl | grep "Default Memory Pressure Duration: 5s"
2525

26-
# systemd-oomd watches for elevated pressure for 30 seconds before acting.
27-
# It can take time to build up pressure so either wait 5 minutes or for the service to fail.
28-
timeout=$(date -ud "5 minutes" +%s)
26+
# systemd-oomd watches for elevated pressure for 5 seconds before acting.
27+
# It can take time to build up pressure so either wait 2 minutes or for the service to fail.
28+
timeout=$(date -ud "2 minutes" +%s)
2929
while [[ $(date -u +%s) -le $timeout ]]; do
3030
if ! systemctl status testsuite-56-testbloat.service; then
3131
break
3232
fi
33-
sleep 15
33+
sleep 5
3434
done
3535

3636
# testbloat should be killed and testchill should be fine
3737
if systemctl status testsuite-56-testbloat.service; then exit 42; fi
3838
if ! systemctl status testsuite-56-testchill.service; then exit 24; fi
3939

40+
# only run this portion of the test if we can set xattrs
41+
if setfattr -n user.xattr_test -v 1 /sys/fs/cgroup/; then
42+
sleep 120 # wait for systemd-oomd kill cool down and elevated memory pressure to come down
43+
44+
systemctl start testsuite-56-testchill.service
45+
systemctl start testsuite-56-testmunch.service
46+
systemctl start testsuite-56-testbloat.service
47+
setfattr -n user.oomd_avoid -v 1 /sys/fs/cgroup/testsuite.slice/testsuite-56.slice/testsuite-56-workload.slice/testsuite-56-testbloat.service
48+
49+
timeout=$(date -ud "2 minutes" +%s)
50+
while [[ $(date -u +%s) -le $timeout ]]; do
51+
if ! systemctl status testsuite-56-testmunch.service; then
52+
break
53+
fi
54+
sleep 5
55+
done
56+
57+
# testmunch should be killed since testbloat had the avoid xattr on it
58+
if ! systemctl status testsuite-56-testbloat.service; then exit 25; fi
59+
if systemctl status testsuite-56-testmunch.service; then exit 43; fi
60+
if ! systemctl status testsuite-56-testchill.service; then exit 24; fi
61+
fi
62+
4063
systemd-analyze log-level info
4164

4265
echo OK > /testok

0 commit comments

Comments
 (0)