src/oom/oomd-manager.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include "bus-log-control-api.h"
   4 #include "bus-util.h"
   5 #include "bus-polkit.h"
   6 #include "cgroup-util.h"
   7 #include "fd-util.h"
   8 #include "fileio.h"
   9 #include "memory-util.h"
  10 #include "oomd-manager-bus.h"
  11 #include "oomd-manager.h"
  12 #include "path-util.h"
  13 #include "percent-util.h"
  14
  15 typedef struct ManagedOOMReply {
  16         ManagedOOMMode mode;
  17         char *path;
  18         char *property;
  19         uint32_t limit;
  20 } ManagedOOMReply;
  21
  22 static void managed_oom_reply_destroy(ManagedOOMReply *reply) {
  23         assert(reply);
  24         free(reply->path);
  25         free(reply->property);
  26 }
  27
  28 static int managed_oom_mode(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
  29         ManagedOOMMode *mode = userdata, m;
  30         const char *s;
  31
  32         assert(mode);
  33         assert_se(s = json_variant_string(v));
  34
  35         m = managed_oom_mode_from_string(s);
  36         if (m < 0)
  37                 return json_log(v, flags, m, "%s is not a valid ManagedOOMMode", s);
  38
  39         *mode = m;
  40         return 0;
  41 }
  42
  43 static int process_managed_oom_reply(
  44                 Varlink *link,
  45                 JsonVariant *parameters,
  46                 const char *error_id,
  47                 VarlinkReplyFlags flags,
  48                 void *userdata) {
  49         JsonVariant *c, *cgroups;
  50         Manager *m = userdata;
  51         int r = 0;
  52
  53         assert(m);
  54
  55         static const JsonDispatch dispatch_table[] = {
  56                 { "mode",     JSON_VARIANT_STRING,   managed_oom_mode,     offsetof(ManagedOOMReply, mode),     JSON_MANDATORY },
  57                 { "path",     JSON_VARIANT_STRING,   json_dispatch_string, offsetof(ManagedOOMReply, path),     JSON_MANDATORY },
  58                 { "property", JSON_VARIANT_STRING,   json_dispatch_string, offsetof(ManagedOOMReply, property), JSON_MANDATORY },
  59                 { "limit",    JSON_VARIANT_UNSIGNED, json_dispatch_uint32, offsetof(ManagedOOMReply, limit),    0 },
  60                 {},
  61         };
  62
  63         if (error_id) {
  64                 r = -EIO;
  65                 log_debug("Error getting ManagedOOM cgroups: %s", error_id);
  66                 goto finish;
  67         }
  68
  69         cgroups = json_variant_by_key(parameters, "cgroups");
  70         if (!cgroups) {
  71                 r = -EINVAL;
  72                 goto finish;
  73         }
  74
  75         /* Skip malformed elements and keep processing in case the others are good */
  76         JSON_VARIANT_ARRAY_FOREACH(c, cgroups) {
  77                 _cleanup_(managed_oom_reply_destroy) ManagedOOMReply reply = {};
  78                 OomdCGroupContext *ctx;
  79                 Hashmap *monitor_hm;
  80                 loadavg_t limit;
  81                 int ret;
  82
  83                 if (!json_variant_is_object(c))
  84                         continue;
  85
  86                 ret = json_dispatch(c, dispatch_table, NULL, 0, &reply);
  87                 if (ret == -ENOMEM) {
  88                         r = ret;
  89                         goto finish;
  90                 }
  91                 if (ret < 0)
  92                         continue;
  93
  94                 monitor_hm = streq(reply.property, "ManagedOOMSwap") ?
  95                                 m->monitored_swap_cgroup_contexts : m->monitored_mem_pressure_cgroup_contexts;
  96
  97                 if (reply.mode == MANAGED_OOM_AUTO) {
  98                         (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, empty_to_root(reply.path)));
  99                         continue;
 100                 }
 101
 102                 limit = m->default_mem_pressure_limit;
 103
 104                 if (streq(reply.property, "ManagedOOMMemoryPressure") && reply.limit > 0) {
 105                         int permyriad = UINT32_SCALE_TO_PERMYRIAD(reply.limit);
 106
 107                         ret = store_loadavg_fixed_point(
 108                                         (unsigned long) permyriad / 100,
 109                                         (unsigned long) permyriad % 100,
 110                                         &limit);
 111                         if (ret < 0)
 112                                 continue;
 113                 }
 114
 115                 ret = oomd_insert_cgroup_context(NULL, monitor_hm, reply.path);
 116                 if (ret == -ENOMEM) {
 117                         r = ret;
 118                         goto finish;
 119                 }
 120
 121                 /* Always update the limit in case it was changed. For non-memory pressure detection the value is
 122                  * ignored so always updating it here is not a problem. */
 123                 ctx = hashmap_get(monitor_hm, empty_to_root(reply.path));
 124                 if (ctx)
 125                         ctx->mem_pressure_limit = limit;
 126         }
 127
 128 finish:
 129         if (!FLAGS_SET(flags, VARLINK_REPLY_CONTINUES))
 130                 m->varlink = varlink_close_unref(link);
 131
 132         return r;
 133 }
 134
 135 /* Fill `new_h` with `path`'s descendent OomdCGroupContexts. Only include descendent cgroups that are possible
 136  * candidates for action. That is, only leaf cgroups or cgroups with memory.oom.group set to "1".
 137  *
 138  * This function ignores most errors in order to handle cgroups that may have been cleaned up while populating
 139  * the hashmap.
 140  *
 141  * `new_h` is of the form { key: cgroup paths -> value: OomdCGroupContext } */
 142 static int recursively_get_cgroup_context(Hashmap *new_h, const char *path) {
 143         _cleanup_free_ char *subpath = NULL;
 144         _cleanup_closedir_ DIR *d = NULL;
 145         int r;
 146
 147         assert(new_h);
 148         assert(path);
 149
 150         r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
 151         if (r < 0)
 152                 return r;
 153
 154         r = cg_read_subgroup(d, &subpath);
 155         if (r < 0)
 156                 return r;
 157         else if (r == 0) { /* No subgroups? We're a leaf node */
 158                 r = oomd_insert_cgroup_context(NULL, new_h, path);
 159                 return (r == -ENOMEM) ? r : 0;
 160         }
 161
 162         do {
 163                 _cleanup_free_ char *cg_path = NULL;
 164                 bool oom_group;
 165
 166                 cg_path = path_join(empty_to_root(path), subpath);
 167                 if (!cg_path)
 168                         return -ENOMEM;
 169
 170                 subpath = mfree(subpath);
 171
 172                 r = cg_get_attribute_as_bool("memory", cg_path, "memory.oom.group", &oom_group);
 173                 /* The cgroup might be gone. Skip it as a candidate since we can't get information on it. */
 174                 if (r < 0)
 175                         return (r == -ENOMEM) ? r : 0;
 176
 177                 if (oom_group)
 178                         r = oomd_insert_cgroup_context(NULL, new_h, cg_path);
 179                 else
 180                         r = recursively_get_cgroup_context(new_h, cg_path);
 181                 if (r == -ENOMEM)
 182                         return r;
 183         } while ((r = cg_read_subgroup(d, &subpath)) > 0);
 184
 185         return 0;
 186 }
 187
 188 static int update_monitored_cgroup_contexts(Hashmap **monitored_cgroups) {
 189         _cleanup_hashmap_free_ Hashmap *new_base = NULL;
 190         OomdCGroupContext *ctx;
 191         int r;
 192
 193         assert(monitored_cgroups);
 194
 195         new_base = hashmap_new(&oomd_cgroup_ctx_hash_ops);
 196         if (!new_base)
 197                 return -ENOMEM;
 198
 199         HASHMAP_FOREACH(ctx, *monitored_cgroups) {
 200                 /* Skip most errors since the cgroup we're trying to update might not exist anymore. */
 201                 r = oomd_insert_cgroup_context(*monitored_cgroups, new_base, ctx->path);
 202                 if (r == -ENOMEM)
 203                         return r;
 204         }
 205
 206         hashmap_free(*monitored_cgroups);
 207         *monitored_cgroups = TAKE_PTR(new_base);
 208
 209         return 0;
 210 }
 211
 212 static int get_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **ret_candidates) {
 213         _cleanup_hashmap_free_ Hashmap *candidates = NULL;
 214         OomdCGroupContext *ctx;
 215         int r;
 216
 217         assert(monitored_cgroups);
 218         assert(ret_candidates);
 219
 220         candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops);
 221         if (!candidates)
 222                 return -ENOMEM;
 223
 224         HASHMAP_FOREACH(ctx, monitored_cgroups) {
 225                 r = recursively_get_cgroup_context(candidates, ctx->path);
 226                 if (r == -ENOMEM)
 227                         return r;
 228         }
 229
 230         *ret_candidates = TAKE_PTR(candidates);
 231
 232         return 0;
 233 }
 234
 235 static int acquire_managed_oom_connect(Manager *m) {
 236         _cleanup_(varlink_close_unrefp) Varlink *link = NULL;
 237         int r;
 238
 239         assert(m);
 240         assert(m->event);
 241
 242         r = varlink_connect_address(&link, VARLINK_ADDR_PATH_MANAGED_OOM);
 243         if (r < 0)
 244                 return log_error_errno(r, "Failed to connect to %s: %m", VARLINK_ADDR_PATH_MANAGED_OOM);
 245
 246         (void) varlink_set_userdata(link, m);
 247         (void) varlink_set_description(link, "oomd");
 248         (void) varlink_set_relative_timeout(link, USEC_INFINITY);
 249
 250         r = varlink_attach_event(link, m->event, SD_EVENT_PRIORITY_NORMAL);
 251         if (r < 0)
 252                 return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");
 253
 254         r = varlink_bind_reply(link, process_managed_oom_reply);
 255         if (r < 0)
 256                 return log_error_errno(r, "Failed to bind reply callback: %m");
 257
 258         r = varlink_observe(link, "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", NULL);
 259         if (r < 0)
 260                 return log_error_errno(r, "Failed to observe varlink call: %m");
 261
 262         m->varlink = TAKE_PTR(link);
 263         return 0;
 264 }
 265
 266 static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
 267         _cleanup_set_free_ Set *targets = NULL;
 268         Manager *m = userdata;
 269         usec_t usec_now;
 270         int r;
 271
 272         assert(s);
 273         assert(userdata);
 274
 275         /* Reset timer */
 276         r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now);
 277         if (r < 0)
 278                 return log_error_errno(r, "Failed to reset event timer");
 279
 280         r = sd_event_source_set_time_relative(s, INTERVAL_USEC);
 281         if (r < 0)
 282                 return log_error_errno(r, "Failed to set relative time for timer");
 283
 284         /* Reconnect if our connection dropped */
 285         if (!m->varlink) {
 286                 r = acquire_managed_oom_connect(m);
 287                 if (r < 0)
 288                         return log_error_errno(r, "Failed to acquire varlink connection");
 289         }
 290
 291         /* Update the cgroups used for detection/action */
 292         r = update_monitored_cgroup_contexts(&m->monitored_swap_cgroup_contexts);
 293         if (r == -ENOMEM)
 294                 return log_error_errno(r, "Failed to update monitored swap cgroup contexts");
 295
 296         r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts);
 297         if (r == -ENOMEM)
 298                 return log_error_errno(r, "Failed to update monitored memory pressure cgroup contexts");
 299
 300         r = oomd_system_context_acquire("/proc/swaps", &m->system_context);
 301         /* If there aren't units depending on swap actions, the only error we exit on is ENOMEM.
 302          * Allow ENOENT in the event that swap is disabled on the system. */
 303         if (r == -ENOMEM || (r < 0 && r != -ENOENT && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
 304                 return log_error_errno(r, "Failed to acquire system context");
 305         else if (r == -ENOENT)
 306                 zero(m->system_context);
 307
 308         if (oomd_memory_reclaim(m->monitored_mem_pressure_cgroup_contexts))
 309                 m->last_reclaim_at = usec_now;
 310
 311         /* If we're still recovering from a kill, don't try to kill again yet */
 312         if (m->post_action_delay_start > 0) {
 313                 if (m->post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now)
 314                         return 0;
 315                 else
 316                         m->post_action_delay_start = 0;
 317         }
 318
 319         r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets);
 320         if (r == -ENOMEM)
 321                 return log_error_errno(r, "Failed to check if memory pressure exceeded limits");
 322         else if (r == 1) {
 323                 /* Check if there was reclaim activity in the given interval. The concern is the following case:
 324                  * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
 325                  * cgroup. Even after this, well-behaved processes will fault in recently resident pages and
 326                  * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
 327                  * to kill something (it won't help anyways). */
 328                 if ((usec_now - m->last_reclaim_at) <= RECLAIM_DURATION_USEC) {
 329                         _cleanup_hashmap_free_ Hashmap *candidates = NULL;
 330                         OomdCGroupContext *t;
 331
 332                         r = get_monitored_cgroup_contexts_candidates(m->monitored_mem_pressure_cgroup_contexts, &candidates);
 333                         if (r == -ENOMEM)
 334                                 return log_error_errno(r, "Failed to get monitored memory pressure cgroup candidates");
 335
 336                         SET_FOREACH(t, targets) {
 337                                 log_notice("Memory pressure for %s is greater than %lu for more than %"PRIu64" seconds and there was reclaim activity",
 338                                         t->path, LOAD_INT(t->mem_pressure_limit), m->default_mem_pressure_duration_usec / USEC_PER_SEC);
 339
 340                                 r = oomd_kill_by_pgscan(candidates, t->path, m->dry_run);
 341                                 if (r == -ENOMEM)
 342                                         return log_error_errno(r, "Failed to kill cgroup processes by pgscan");
 343                                 if (r < 0)
 344                                         log_info("Failed to kill any cgroup(s) under %s based on pressure", t->path);
 345                                 else {
 346                                         /* Don't act on all the high pressure cgroups at once; return as soon as we kill one */
 347                                         m->post_action_delay_start = usec_now;
 348                                         return 0;
 349                                 }
 350                         }
 351                 }
 352         }
 353
 354         if (oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) {
 355                 _cleanup_hashmap_free_ Hashmap *candidates = NULL;
 356
 357                 log_notice("Swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR,
 358                            m->system_context.swap_used, m->system_context.swap_total, PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
 359
 360                 r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates);
 361                 if (r == -ENOMEM)
 362                         return log_error_errno(r, "Failed to get monitored swap cgroup candidates");
 363
 364                 r = oomd_kill_by_swap_usage(candidates, m->dry_run);
 365                 if (r == -ENOMEM)
 366                         return log_error_errno(r, "Failed to kill cgroup processes by swap usage");
 367                 if (r < 0)
 368                         log_info("Failed to kill any cgroup(s) based on swap");
 369                 else {
 370                         m->post_action_delay_start = usec_now;
 371                         return 0;
 372                 }
 373         }
 374
 375         return 0;
 376 }
 377
 378 static int monitor_cgroup_contexts(Manager *m) {
 379         _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
 380         int r;
 381
 382         assert(m);
 383         assert(m->event);
 384
 385         r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_cgroup_contexts_handler, m);
 386         if (r < 0)
 387                 return r;
 388
 389         r = sd_event_source_set_exit_on_failure(s, true);
 390         if (r < 0)
 391                 return r;
 392
 393         r = sd_event_source_set_enabled(s, SD_EVENT_ON);
 394         if (r < 0)
 395                 return r;
 396
 397         (void) sd_event_source_set_description(s, "oomd-timer");
 398
 399         m->cgroup_context_event_source = TAKE_PTR(s);
 400         return 0;
 401 }
 402
 403 Manager* manager_free(Manager *m) {
 404         assert(m);
 405
 406         varlink_close_unref(m->varlink);
 407         sd_event_source_unref(m->cgroup_context_event_source);
 408         sd_event_unref(m->event);
 409
 410         bus_verify_polkit_async_registry_free(m->polkit_registry);
 411         sd_bus_flush_close_unref(m->bus);
 412
 413         hashmap_free(m->monitored_swap_cgroup_contexts);
 414         hashmap_free(m->monitored_mem_pressure_cgroup_contexts);
 415
 416         return mfree(m);
 417 }
 418
 419 int manager_new(Manager **ret) {
 420         _cleanup_(manager_freep) Manager *m = NULL;
 421         int r;
 422
 423         assert(ret);
 424
 425         m = new0(Manager, 1);
 426         if (!m)
 427                 return -ENOMEM;
 428
 429         r = sd_event_default(&m->event);
 430         if (r < 0)
 431                 return r;
 432
 433         (void) sd_event_set_watchdog(m->event, true);
 434
 435         r = sd_event_add_signal(m->event, NULL, SIGINT, NULL, NULL);
 436         if (r < 0)
 437                 return r;
 438
 439         r = sd_event_add_signal(m->event, NULL, SIGTERM, NULL, NULL);
 440         if (r < 0)
 441                 return r;
 442
 443         m->monitored_swap_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops);
 444         if (!m->monitored_swap_cgroup_contexts)
 445                 return -ENOMEM;
 446
 447         m->monitored_mem_pressure_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops);
 448         if (!m->monitored_mem_pressure_cgroup_contexts)
 449                 return -ENOMEM;
 450
 451         *ret = TAKE_PTR(m);
 452         return 0;
 453 }
 454
 455 static int manager_connect_bus(Manager *m) {
 456         int r;
 457
 458         assert(m);
 459         assert(!m->bus);
 460
 461         r = bus_open_system_watch_bind_with_description(&m->bus, "bus-api-oom");
 462         if (r < 0)
 463                 return log_error_errno(r, "Failed to connect to bus: %m");
 464
 465         r = bus_add_implementation(m->bus, &manager_object, m);
 466         if (r < 0)
 467                 return r;
 468
 469         r = bus_log_control_api_register(m->bus);
 470         if (r < 0)
 471                 return r;
 472
 473         r = sd_bus_request_name_async(m->bus, NULL, "org.freedesktop.oom1", 0, NULL, NULL);
 474         if (r < 0)
 475                 return log_error_errno(r, "Failed to request name: %m");
 476
 477         r = sd_bus_attach_event(m->bus, m->event, 0);
 478         if (r < 0)
 479                 return log_error_errno(r, "Failed to attach bus to event loop: %m");
 480
 481         return 0;
 482 }
 483
 484 int manager_start(
 485                 Manager *m,
 486                 bool dry_run,
 487                 int swap_used_limit_permyriad,
 488                 int mem_pressure_limit_permyriad,
 489                 usec_t mem_pressure_usec) {
 490
 491         unsigned long l, f;
 492         int r;
 493
 494         assert(m);
 495
 496         m->dry_run = dry_run;
 497
 498         m->swap_used_limit_permyriad = swap_used_limit_permyriad >= 0 ? swap_used_limit_permyriad : DEFAULT_SWAP_USED_LIMIT_PERCENT * 100;
 499         assert(m->swap_used_limit_permyriad <= 10000);
 500
 501         if (mem_pressure_limit_permyriad >= 0) {
 502                 assert(mem_pressure_limit_permyriad <= 10000);
 503
 504                 l = mem_pressure_limit_permyriad / 100;
 505                 f = mem_pressure_limit_permyriad % 100;
 506         } else {
 507                 l = DEFAULT_MEM_PRESSURE_LIMIT_PERCENT;
 508                 f = 0;
 509         }
 510         r = store_loadavg_fixed_point(l, f, &m->default_mem_pressure_limit);
 511         if (r < 0)
 512                 return r;
 513
 514         m->default_mem_pressure_duration_usec = mem_pressure_usec ?: DEFAULT_MEM_PRESSURE_DURATION_USEC;
 515
 516         r = manager_connect_bus(m);
 517         if (r < 0)
 518                 return r;
 519
 520         r = acquire_managed_oom_connect(m);
 521         if (r < 0)
 522                 return r;
 523
 524         r = monitor_cgroup_contexts(m);
 525         if (r < 0)
 526                 return r;
 527
 528         return 0;
 529 }
 530
 531 int manager_get_dump_string(Manager *m, char **ret) {
 532         _cleanup_free_ char *dump = NULL;
 533         _cleanup_fclose_ FILE *f = NULL;
 534         char buf[FORMAT_TIMESPAN_MAX];
 535         OomdCGroupContext *c;
 536         size_t size;
 537         char *key;
 538         int r;
 539
 540         assert(m);
 541         assert(ret);
 542
 543         f = open_memstream_unlocked(&dump, &size);
 544         if (!f)
 545                 return -errno;
 546
 547         fprintf(f,
 548                 "Dry Run: %s\n"
 549                 "Swap Used Limit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n"
 550                 "Default Memory Pressure Limit: %lu.%02lu%%\n"
 551                 "Default Memory Pressure Duration: %s\n"
 552                 "System Context:\n",
 553                 yes_no(m->dry_run),
 554                 PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad),
 555                 LOAD_INT(m->default_mem_pressure_limit), LOAD_FRAC(m->default_mem_pressure_limit),
 556                 format_timespan(buf, sizeof(buf), m->default_mem_pressure_duration_usec, USEC_PER_SEC));
 557         oomd_dump_system_context(&m->system_context, f, "\t");
 558
 559         fprintf(f, "Swap Monitored CGroups:\n");
 560         HASHMAP_FOREACH_KEY(c, key, m->monitored_swap_cgroup_contexts)
 561                 oomd_dump_swap_cgroup_context(c, f, "\t");
 562
 563         fprintf(f, "Memory Pressure Monitored CGroups:\n");
 564         HASHMAP_FOREACH_KEY(c, key, m->monitored_mem_pressure_cgroup_contexts)
 565                 oomd_dump_memory_pressure_cgroup_context(c, f, "\t");
 566
 567         r = fflush_and_check(f);
 568         if (r < 0)
 569                 return r;
 570
 571         f = safe_fclose(f);
 572
 573         *ret = TAKE_PTR(dump);
 574         return 0;
 575 }