return r;
}
+int cg_get_xattr_bool(const char *controller, const char *path, const char *name) {
+ _cleanup_free_ char *val = NULL;
+ int r;
+
+ assert(path);
+ assert(name);
+
+ r = cg_get_xattr_malloc(controller, path, name, &val);
+ if (r < 0)
+ return r;
+
+ return parse_boolean(val);
+}
+
int cg_remove_xattr(const char *controller, const char *path, const char *name) {
_cleanup_free_ char *fs = NULL;
int r;
return 0;
}
+int cg_get_owner(const char *controller, const char *path, uid_t *ret_uid) {
+ _cleanup_free_ char *f = NULL;
+ struct stat stats;
+ int r;
+
+ assert(ret_uid);
+
+ r = cg_get_path(controller, path, NULL, &f);
+ if (r < 0)
+ return r;
+
+ r = stat(f, &stats);
+ if (r < 0)
+ return -errno;
+
+ *ret_uid = stats.st_uid;
+ return 0;
+}
+
int cg_get_keyed_attribute_full(
const char *controller,
const char *path,
int cg_get_attribute_as_bool(const char *controller, const char *path, const char *attribute, bool *ret);
int cg_set_access(const char *controller, const char *path, uid_t uid, gid_t gid);
+int cg_get_owner(const char *controller, const char *path, uid_t *ret_uid);
int cg_set_xattr(const char *controller, const char *path, const char *name, const void *value, size_t size, int flags);
int cg_get_xattr(const char *controller, const char *path, const char *name, void *value, size_t size);
int cg_get_xattr_malloc(const char *controller, const char *path, const char *name, char **ret);
+/* Returns negative on error, and 0 or 1 on success for the bool value */
+int cg_get_xattr_bool(const char *controller, const char *path, const char *name);
int cg_remove_xattr(const char *controller, const char *path, const char *name);
int cg_install_release_agent(const char *controller, const char *agent);
#include <sys/xattr.h>
#include <unistd.h>
-#include "cgroup-util.h"
#include "fd-util.h"
#include "format-util.h"
#include "oomd-util.h"
return -ENOMEM;
HASHMAP_FOREACH(item, h) {
- if (item->path && prefix && !path_startswith(item->path, prefix))
+ /* Skip over cgroups that are not valid candidates or are explicitly marked for omission */
+ if ((item->path && prefix && !path_startswith(item->path, prefix)) || item->preference == MANAGED_OOM_PREFERENCE_OMIT)
continue;
sorted[k++] = item;
return r;
for (int i = 0; i < r; i++) {
- /* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure */
+ /* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure. */
+ /* Don't break since there might be "avoid" cgroups at the end. */
if (sorted[i]->pgscan == 0 && sorted[i]->current_memory_usage == 0)
- break;
+ continue;
r = oomd_cgroup_kill(sorted[i]->path, true, dry_run);
if (r > 0 || r == -ENOMEM)
/* Try to kill cgroups with non-zero swap usage until we either succeed in
* killing or we get to a cgroup with no swap usage. */
for (int i = 0; i < r; i++) {
+ /* Skip over cgroups with no resource usage. Don't break since there might be "avoid"
+ * cgroups at the end. */
if (sorted[i]->swap_usage == 0)
- break;
+ continue;
r = oomd_cgroup_kill(sorted[i]->path, true, dry_run);
if (r > 0 || r == -ENOMEM)
_cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL;
_cleanup_free_ char *p = NULL, *val = NULL;
bool is_root;
+ uid_t uid;
int r;
assert(path);
return -ENOMEM;
is_root = empty_or_root(path);
+ ctx->preference = MANAGED_OOM_PREFERENCE_NONE;
r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, "memory.pressure", &p);
if (r < 0)
if (r < 0)
return log_debug_errno(r, "Error parsing memory pressure from %s: %m", p);
+ r = cg_get_owner(SYSTEMD_CGROUP_CONTROLLER, path, &uid);
+ if (r < 0)
+ log_debug_errno(r, "Failed to get owner/group from %s: %m", path);
+ else if (uid == 0) {
+ /* Ignore most errors when reading the xattr since it is usually unset and cgroup xattrs are only used
+ * as an optional feature of systemd-oomd (and the system might not even support them). */
+ r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, path, "user.oomd_avoid");
+ if (r == -ENOMEM)
+ return r;
+ ctx->preference = r == 1 ? MANAGED_OOM_PREFERENCE_AVOID : ctx->preference;
+
+ r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, path, "user.oomd_omit");
+ if (r == -ENOMEM)
+ return r;
+ ctx->preference = r == 1 ? MANAGED_OOM_PREFERENCE_OMIT : ctx->preference;
+ }
+
if (is_root) {
r = procfs_memory_get_used(&ctx->current_memory_usage);
if (r < 0)
#include <stdbool.h>
+#include "cgroup-util.h"
#include "hashmap.h"
#include "psi-util.h"
uint64_t last_pgscan;
uint64_t pgscan;
+ ManagedOOMPreference preference;
+
/* These are only used by oomd_pressure_above for acting on high memory pressure. */
loadavg_t mem_pressure_limit;
usec_t mem_pressure_duration_usec;
/* Returns true if the amount of swap free is below the percentage of swap specified by `threshold_percent`. */
bool oomd_swap_free_below(const OomdSystemContext *ctx, uint64_t threshold_percent);
+/* The compare functions will sort from largest to smallest, putting all the contexts with "avoid" at the end
+ * (after the smallest values). */
static inline int compare_pgscan_and_memory_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) {
int r;
assert(c1);
assert(c2);
+ r = CMP((*c1)->preference, (*c2)->preference);
+ if (r != 0)
+ return r;
+
r = CMP((*c2)->pgscan, (*c1)->pgscan);
if (r != 0)
return r;
}
static inline int compare_swap_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) {
+ int r;
+
assert(c1);
assert(c2);
+ r = CMP((*c1)->preference, (*c2)->preference);
+ if (r != 0)
+ return r;
+
return CMP((*c2)->swap_usage, (*c1)->swap_usage);
}
_cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL;
_cleanup_free_ char *cgroup = NULL;
OomdCGroupContext *c1, *c2;
+ bool test_xattrs;
+ int r;
if (geteuid() != 0)
return (void) log_tests_skipped("not root");
assert_se(cg_pid_get_path(NULL, 0, &cgroup) >= 0);
+ /* If we don't have permissions to set xattrs we're likely in a userns or missing capabilities
+ * so skip the xattr portions of the test. */
+ r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_test", "1", 1, 0);
+ test_xattrs = !ERRNO_IS_PRIVILEGE(r) && !ERRNO_IS_NOT_SUPPORTED(r);
+
+ if (test_xattrs) {
+ assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_omit", "1", 1, 0) >= 0);
+ assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_avoid", "1", 1, 0) >= 0);
+ }
+
assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0);
assert_se(streq(ctx->path, cgroup));
assert_se(ctx->swap_usage == 0);
assert_se(ctx->last_pgscan == 0);
assert_se(ctx->pgscan == 0);
+ /* omit takes precedence over avoid when both are set to true */
+ if (test_xattrs)
+ assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_OMIT);
+ else
+ assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_NONE);
+ ctx = oomd_cgroup_context_free(ctx);
+
+ /* also check when only avoid is set to true */
+ if (test_xattrs) {
+ assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_omit", "0", 1, 0) >= 0);
+ assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_avoid", "1", 1, 0) >= 0);
+ }
+ assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0);
+ if (test_xattrs)
+ assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_AVOID);
ctx = oomd_cgroup_context_free(ctx);
/* Test the root cgroup */
assert_se(oomd_cgroup_context_acquire("", &ctx) == 0);
assert_se(streq(ctx->path, "/"));
assert_se(ctx->current_memory_usage > 0);
+ assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_NONE);
/* Test hashmap inserts */
assert_se(h1 = hashmap_new(&oomd_cgroup_ctx_hash_ops));
assert_se(c2->last_pgscan == 5555);
assert_se(c2->mem_pressure_limit == 6789);
assert_se(c2->last_hit_mem_pressure_limit == 42);
+
+ /* Assert that avoid/omit are not set if the cgroup is not owned by root */
+ if (test_xattrs) {
+ ctx = oomd_cgroup_context_free(ctx);
+ assert_se(cg_set_access(SYSTEMD_CGROUP_CONTROLLER, cgroup, 65534, 0) >= 0);
+ assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0);
+ assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_NONE);
+ }
}
static void test_oomd_system_context_acquire(void) {
char **paths = STRV_MAKE("/herp.slice",
"/herp.slice/derp.scope",
"/herp.slice/derp.scope/sheep.service",
- "/zupa.slice");
+ "/zupa.slice",
+ "/omitted.slice",
+ "/avoid.slice");
- OomdCGroupContext ctx[4] = {
+ OomdCGroupContext ctx[6] = {
{ .path = paths[0],
.swap_usage = 20,
.pgscan = 60,
.swap_usage = 10,
.pgscan = 80,
.current_memory_usage = 10 },
+ { .path = paths[4],
+ .swap_usage = 90,
+ .pgscan = 100,
+ .preference = MANAGED_OOM_PREFERENCE_OMIT },
+ { .path = paths[5],
+ .swap_usage = 99,
+ .pgscan = 200,
+ .preference = MANAGED_OOM_PREFERENCE_AVOID },
};
assert_se(h = hashmap_new(&string_hash_ops));
assert_se(hashmap_put(h, "/herp.slice/derp.scope", &ctx[1]) >= 0);
assert_se(hashmap_put(h, "/herp.slice/derp.scope/sheep.service", &ctx[2]) >= 0);
assert_se(hashmap_put(h, "/zupa.slice", &ctx[3]) >= 0);
+ assert_se(hashmap_put(h, "/omitted.slice", &ctx[4]) >= 0);
+ assert_se(hashmap_put(h, "/avoid.slice", &ctx[5]) >= 0);
- assert_se(oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted_cgroups) == 4);
+ assert_se(oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted_cgroups) == 5);
assert_se(sorted_cgroups[0] == &ctx[1]);
assert_se(sorted_cgroups[1] == &ctx[2]);
assert_se(sorted_cgroups[2] == &ctx[0]);
assert_se(sorted_cgroups[3] == &ctx[3]);
+ assert_se(sorted_cgroups[4] == &ctx[5]);
sorted_cgroups = mfree(sorted_cgroups);
- assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, NULL, &sorted_cgroups) == 4);
+ assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, NULL, &sorted_cgroups) == 5);
assert_se(sorted_cgroups[0] == &ctx[3]);
assert_se(sorted_cgroups[1] == &ctx[0]);
assert_se(sorted_cgroups[2] == &ctx[2]);
assert_se(sorted_cgroups[3] == &ctx[1]);
+ assert_se(sorted_cgroups[4] == &ctx[5]);
sorted_cgroups = mfree(sorted_cgroups);
assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, "/herp.slice/derp.scope", &sorted_cgroups) == 2);
assert_se(sorted_cgroups[1] == &ctx[1]);
assert_se(sorted_cgroups[2] == 0);
assert_se(sorted_cgroups[3] == 0);
+ assert_se(sorted_cgroups[4] == 0);
+ assert_se(sorted_cgroups[5] == 0);
sorted_cgroups = mfree(sorted_cgroups);
}
rmdir
sed
seq
+ setfattr
setfont
setsid
sfdisk
--- /dev/null
+[Unit]
+Description=Create some memory pressure
+
+[Service]
+MemoryHigh=2M
+Slice=testsuite-56-workload.slice
+ExecStart=/usr/lib/systemd/tests/testdata/units/testsuite-56-slowgrowth.sh
oomctl | grep "1.00%"
oomctl | grep "Default Memory Pressure Duration: 5s"
-# systemd-oomd watches for elevated pressure for 30 seconds before acting.
-# It can take time to build up pressure so either wait 5 minutes or for the service to fail.
-timeout=$(date -ud "5 minutes" +%s)
+# systemd-oomd watches for elevated pressure for 5 seconds before acting.
+# It can take time to build up pressure so either wait 2 minutes or for the service to fail.
+timeout=$(date -ud "2 minutes" +%s)
while [[ $(date -u +%s) -le $timeout ]]; do
if ! systemctl status testsuite-56-testbloat.service; then
break
fi
- sleep 15
+ sleep 5
done
# testbloat should be killed and testchill should be fine
if systemctl status testsuite-56-testbloat.service; then exit 42; fi
if ! systemctl status testsuite-56-testchill.service; then exit 24; fi
+# only run this portion of the test if we can set xattrs
+if setfattr -n user.xattr_test -v 1 /sys/fs/cgroup/; then
+ sleep 120 # wait for systemd-oomd kill cool down and elevated memory pressure to come down
+
+ systemctl start testsuite-56-testchill.service
+ systemctl start testsuite-56-testmunch.service
+ systemctl start testsuite-56-testbloat.service
+ setfattr -n user.oomd_avoid -v 1 /sys/fs/cgroup/testsuite.slice/testsuite-56.slice/testsuite-56-workload.slice/testsuite-56-testbloat.service
+
+ timeout=$(date -ud "2 minutes" +%s)
+ while [[ $(date -u +%s) -le $timeout ]]; do
+ if ! systemctl status testsuite-56-testmunch.service; then
+ break
+ fi
+ sleep 5
+ done
+
+ # testmunch should be killed since testbloat had the avoid xattr on it
+ if ! systemctl status testsuite-56-testbloat.service; then exit 25; fi
+ if systemctl status testsuite-56-testmunch.service; then exit 43; fi
+ if ! systemctl status testsuite-56-testchill.service; then exit 24; fi
+fi
+
systemd-analyze log-level info
echo OK > /testok