]> git.ipfire.org Git - thirdparty/libcgroup.git/commitdiff
tools/cgclassify: add -r option for systemd scope
authorKamalesh Babulal <kamalesh.babulal@oracle.com>
Tue, 20 Jun 2023 11:44:25 +0000 (17:14 +0530)
committerTom Hromatka <tom.hromatka@oracle.com>
Wed, 21 Jun 2023 16:11:19 +0000 (10:11 -0600)
Provide an option for users to replace the default idle_thread created
for systemd scope.  With '-r' option specified, the first pid passed as
the argument to cgclassify will replace the default idle_thread, if it
exists.  Also, if it's the only task, killing it will remove the .scope
cgroup it's running in.

$ sudo cgclassify -r -gcpu:libcgroup.slice/db.scope 1234 1235

For non scope cgroups, it launches the task program. The algorithm in
brief is as follows:
Once the first pid of argument list migrated to the expected cgroup(s):
1. The controller, cgroups details of the pid is captured before the
   migration, so incase an error occurs, it easy to roll back to the
   original cgroups the pid belonged to.
2. Once the migrated is done, parse /proc/<pid>/cgroup of the first
   pid and if the cgroup name ends with .scope and has a task with
   /proc/<pid>/cmdline as "libcgroup_systemd_idle_thread", kill
   the task. This gives the illusion of replacement.
3. If an error occurs, migrate the pid back to the original cgroups.

Signed-off-by: Kamalesh Babulal <kamalesh.babulal@oracle.com>
Signed-off-by: Tom Hromatka <tom.hromatka@oracle.com>
src/tools/cgclassify.c

index f9f8aa888a65fa7360d38ac9c00eea067ebe7f67..1c505dc7139a2debd8e677d447c9b2c9c88e8c5a 100644 (file)
@@ -3,6 +3,10 @@
  * Copyright RedHat Inc. 2008
  *
  * Authors:    Vivek Goyal <vgoyal@redhat.com>
+ *
+ * Replace systemd idle_thread enhancements by Kamalesh Babulal
+ * Copyright (c) 2023 Oracle and/or its affiliates.
+ * Author: Kamalesh Babulal <kamalesh.babulal@oracle.com>
  */
 
 #include "tools-common.h"
@@ -15,6 +19,7 @@
 #include <limits.h>
 #include <unistd.h>
 #include <getopt.h>
+#include <signal.h>
 #include <errno.h>
 #include <stdio.h>
 #include <pwd.h>
 
 #define TEMP_BUF       81
 
+#define SYSTEMD_IDLE_THREAD    "libcgroup_systemd_idle_thread"
+
+static pid_t find_scope_pid(pid_t pid, int capture);
+static int write_systemd_unified(const char * const cgrp_name, pid_t pid);
+static int is_scope_parsed(const char * const path);
+static int rollback_pid_cgroups(pid_t pid);
+
+struct cgroup_info {
+       char ctrl_name[CONTROL_NAMELEN_MAX];
+       char cgrp_path[FILENAME_MAX];
+}info[MAX_MNT_ELEMENTS];
+
 static void usage(int status, const char *program_name)
 {
        if (status != 0) {
@@ -43,6 +60,8 @@ static void usage(int status, const char *program_name)
        info("pidlist and children tasks\n");
 #ifdef WITH_SYSTEMD
        info("  -b                              Ignore default systemd delegate hierarchy\n");
+       info("  -r                              Replace the default idle_thread spawned ");
+       info("for the systemd scope\n");
 #endif
 }
 
@@ -118,6 +137,9 @@ int main(int argc, char *argv[])
        struct cgroup_group_spec *cgroup_list[CG_HIER_MAX];
        int ignore_default_systemd_delegate_slice = 0;
        int ret = 0, i, exit_code = 0;
+       int skip_replace_idle = 0;
+       pid_t scope_pid = -1;
+       int replace_idle = 0;
        int cg_specified = 0;
        int flag = 0;
        char *endptr;
@@ -131,11 +153,14 @@ int main(int argc, char *argv[])
 
        memset(cgroup_list, 0, sizeof(cgroup_list));
 #ifdef WITH_SYSTEMD
-       while ((c = getopt_long(argc, argv, "+g:shb", longopts, NULL)) > 0) {
+       while ((c = getopt_long(argc, argv, "+g:shbr", longopts, NULL)) > 0) {
                switch (c) {
                case 'b':
                        ignore_default_systemd_delegate_slice = 1;
                        break;
+               case 'r':
+                       replace_idle = 1;
+                       break;
 #else
        while ((c = getopt_long(argc, argv, "+g:sh", longopts, NULL)) > 0) {
                switch (c) {
@@ -190,6 +215,14 @@ int main(int argc, char *argv[])
                if (ret)
                        exit_code = 1;
 
+               if (replace_idle && !skip_replace_idle) {
+                       ret = find_scope_pid(pid, 1);
+                       if (ret) {
+                               err("Failed to read /proc/%u/cgroups\n", pid);
+                               return 1;
+                       }
+               }
+
                if (cg_specified)
                        ret = change_group_path(pid, cgroup_list);
                else
@@ -198,7 +231,470 @@ int main(int argc, char *argv[])
                /* if any group change fails */
                if (ret)
                        exit_code = 1;
+
+               /* skip replacing of idle_thread in systemd slice */
+               if (!replace_idle)
+                       continue;
+
+               /* systemd idle_thread is already replaced */
+               if (skip_replace_idle)
+                       continue;
+
+               scope_pid = find_scope_pid(pid, 0);
+               if (scope_pid == -1)
+                       goto err;
+
+               skip_replace_idle = 1;
+
+               ret = kill(scope_pid, SIGTERM);
+               if (ret) {
+                       err("Failed to kill pid %u:%s\n", scope_pid, strerror(errno));
+                       goto err;
+               }
        }
 
        return exit_code;
+
+err:
+       exit_code = rollback_pid_cgroups(pid);
+       return exit_code;
+}
+
+static pid_t search_systemd_idle_thread_task(pid_t pids[], size_t size)
+{
+       char task_cmd[FILENAME_MAX];
+       char buffer[FILENAME_MAX];
+       FILE *pid_cmd_fp = NULL;
+       int scope_pid = -1;
+       int i;
+
+       for (i = 0; i < size; i++) {
+               snprintf(buffer, FILENAME_MAX, "/proc/%u/cmdline", pids[i]);
+               pid_cmd_fp = fopen(buffer, "re");
+               /* task might have exited */
+               if (!pid_cmd_fp)
+                       continue;
+
+               /* task might have exited, so consider only successful reads. */
+               if (fgets(task_cmd, FILENAME_MAX, pid_cmd_fp)) {
+                       if (!strcmp(task_cmd, SYSTEMD_IDLE_THREAD)) {
+                               scope_pid = pids[i];
+                               fclose(pid_cmd_fp);
+                               break;
+                       }
+               }
+               fclose(pid_cmd_fp);
+       }
+
+       return scope_pid;
+}
+
+static pid_t find_scope_pid(pid_t pid, int capture)
+{
+       pid_t _scope_pid = -1, scope_pid = -1;
+       char ctrl_name[CONTROL_NAMELEN_MAX];
+       char cgroup_name[FILENAME_MAX];
+       char scope_name[FILENAME_MAX];
+       int found_systemd_cgrp = 0;
+       int found_unified_cgrp = 0;
+       char buffer[FILENAME_MAX];
+       FILE *pid_proc_fp = NULL;
+       char *_ctrl_name = NULL;
+       int idx, ret, size = 0;
+       pid_t *pids;
+       int i=0;
+
+       /*
+        * Let's parse the cgroup of the pid, to check if its in one or
+        * more .scopes.
+        */
+       snprintf(buffer, FILENAME_MAX, "/proc/%u/cgroup", pid);
+       pid_proc_fp = fopen(buffer, "re");
+       if (!pid_proc_fp) {
+               err("Failed to open: %s\n", buffer);
+               return -1;
+       }
+
+       while (fgets(buffer, FILENAME_MAX, pid_proc_fp)) {
+               memset(ctrl_name, '\0', sizeof(ctrl_name));
+
+               /* check for overflow of controllers */
+               if (i >= MAX_MNT_ELEMENTS) {
+                       err("Found more than MAX_MNT_ELEMENTS controllers\n");
+                       scope_pid = -1;
+                       goto out;
+               }
+
+               /* read according to the cgroup mode */
+               if (strstr(buffer, "::")) {
+                       snprintf(ctrl_name, CONTROL_NAMELEN_MAX, "unified");
+                       ret = sscanf(buffer, "%d::%4096s\n", &idx, cgroup_name);
+               } else{
+                       ret = sscanf(buffer, "%d:%[^:]:%4096s\n", &idx, ctrl_name, cgroup_name);
+               }
+
+               if (ret != 2 && ret != 3) {
+                       err("Unrecognized cgroup file format: %s\n", buffer);
+                       scope_pid = -1;
+                       goto out;
+               }
+
+               /* cgroup v1 might have shared mount points cpu,cpuacct */
+               _ctrl_name = strchr(ctrl_name, ',');
+               if (_ctrl_name) {
+                       size = strlen(ctrl_name) - strlen(_ctrl_name);
+                       ctrl_name[size] = '\0';
+               }
+
+               /*
+                * capture is true, while the pid's controller and cgroups
+                * are populated for rollback case.
+                */
+               if (capture) {
+                       snprintf(info[i].ctrl_name, CONTROL_NAMELEN_MAX, "%s", ctrl_name);
+                       snprintf(info[i].cgrp_path, FILENAME_MAX, "%s", cgroup_name);
+               }
+
+               if (!is_cgroup_mode_unified()) {
+                       if (ret == 3 && !strncmp(ctrl_name, "name=", 5)) {
+                               if (!strcmp(ctrl_name, "name=systemd")) {
+                                       i++;
+                                       found_systemd_cgrp = 1;
+                               }
+                               continue;
+                       } else if (ret == 2) {
+                               i++;
+                               found_unified_cgrp = 1;
+                               continue;
+                       }
+               }
+               i++;
+
+               /* we are not interested in other functionality */
+               if (capture)
+                       continue;
+
+               /* skip if the cgroup path doesn't have systemd scope format */
+               if (strstr(cgroup_name, ".scope") == NULL ||
+                   strstr(cgroup_name, ".slice") == NULL)
+                       continue;
+
+               /* skip if we have already searched cgroup for idle_thread */
+               if (is_scope_parsed(cgroup_name))
+                       continue;
+
+
+               if (ret == 2)
+                       ret = cgroup_get_procs(cgroup_name, NULL, &pids, &size);
+               else
+                       ret = cgroup_get_procs(cgroup_name, ctrl_name, &pids, &size);
+               if (ret) {
+                       err("Failed to read cgroup.procs of cgroup: %s\n", cgroup_name + 1);
+                       scope_pid = -1;
+                       goto out;
+               }
+
+               /*
+                * .scope created by the non-libcgroup process, will not
+                * have libcgroup_systemd_idle_thread
+                */
+               _scope_pid = search_systemd_idle_thread_task(pids, size);
+               free(pids);
+
+               if (_scope_pid == -1)
+                       continue;
+
+               if (scope_pid == -1) {
+                       /*
+                        * cgexec pid needs to written into:
+                        * ../systemd/<slice>/<scope>/cgroup.procs (legacy/hybrid)
+                        * ../unified/<slice>/<scope>/cgroup.procs (hybrid)
+                        */
+                       snprintf(scope_name, FILENAME_MAX, "%s", cgroup_name);
+                       scope_pid = _scope_pid;
+                       continue;
+               }
+
+               if (_scope_pid != scope_pid) {
+                       err("Failed to replace scope idle_thread, found two idle_thread\n");
+                       err(" %u %u\n", scope_pid, _scope_pid);
+                       scope_pid = -1;
+                       goto out;
+               }
+       }
+
+       if (capture) {
+               scope_pid = 0;
+               goto out;
+       }
+
+       if (scope_pid == -1) {
+               err("Failed to find idle_thread task\n");
+               goto out;
+       }
+
+       if (is_cgroup_mode_legacy() && (found_systemd_cgrp == 0 || found_unified_cgrp == 1)) {
+               err("cgroup legacy setup incorrect\n");
+               scope_pid = -1;
+               goto out;
+       }
+
+       if (is_cgroup_mode_hybrid() && (found_systemd_cgrp == 0 || found_unified_cgrp == 0)) {
+               err("cgroup hybrid setup incorrect\n");
+               scope_pid = -1;
+               goto out;
+       }
+
+       /* This is true for cgroup v1 (legacy/hybrid) */
+       if (found_systemd_cgrp) {
+               ret = write_systemd_unified(scope_name, pid);
+               if (ret)
+                       scope_pid = -1;
+       }
+
+       info[i].ctrl_name[0] = '\0';
+out:
+       if (pid_proc_fp)
+               fclose(pid_proc_fp);
+
+       return scope_pid;
+}
+
+/*
+ * Parse the /proc/mounts file and look for the controller string
+ * in each line. If found copies the mount point into mnt_point,
+ * else return NULL mnt_point.
+ */
+static void find_mnt_point(const char * const controller, char **mnt_point)
+{
+       char proc_mount[] = "/proc/mounts";
+       char cgroup_path[FILENAME_MAX];
+       char buffer[FILENAME_MAX * 2];
+       FILE *proc_mount_f = NULL;
+       int ret;
+
+       *mnt_point = NULL;
+
+       proc_mount_f = fopen(proc_mount, "re");
+       if (proc_mount_f == NULL) {
+               err("Failed to read %s:%s\n", proc_mount, strerror(errno));
+               goto out;
+       }
+
+       while (fgets(buffer, (FILENAME_MAX * 2), proc_mount_f) != NULL) {
+               /* skip line that doesn't have controller */
+               if (!strstr(buffer, controller))
+                       continue;
+
+               if (strcmp(controller, "name=systemd") == 0) {
+                       if (!strstr(buffer, "name=systemd ") &&
+                           !strstr(buffer, "name=systemd,"))
+                               continue;
+               }
+
+               ret = sscanf(buffer, "%*s %4096s\n", cgroup_path);
+               if (ret != 1) {
+                       err("Failed during read of %s:%s\n", proc_mount, strerror(errno));
+                       goto out;
+               }
+
+               *mnt_point = strdup(cgroup_path);
+               if (!*mnt_point)
+                       err("strdup of %s failed\n", cgroup_path);
+               break;
+       }
+
+out:
+       if (proc_mount_f)
+               fclose(proc_mount_f);
+}
+
+static int write_systemd_unified(const char * const scope_name, pid_t pid)
+{
+       char cgroup_procs_path[FILENAME_MAX + 14];
+       FILE *cgroup_systemd_path_f = NULL;
+       FILE *cgroup_unified_path_f = NULL;
+       char *cgroup_name = NULL;
+
+       /* construct the systemd cgroup path, by parsing /proc/mounts */
+       find_mnt_point("name=systemd", &cgroup_name);
+       if (!cgroup_name) {
+               err("Unable find name=systemd cgroup path\n");
+               return -1;
+       }
+
+       snprintf(cgroup_procs_path, sizeof(cgroup_procs_path), "%s/%s/cgroup.procs",
+                cgroup_name, scope_name);
+       free(cgroup_name);
+
+       cgroup_systemd_path_f = fopen(cgroup_procs_path, "we");
+       if (!cgroup_systemd_path_f) {
+               err("Failed to open %s\n", cgroup_procs_path);
+               return -1;
+       }
+
+       if (is_cgroup_mode_hybrid()) {
+               /*
+                * construct the unified cgroup path, by parsing
+                * /proc/mounts
+                */
+               find_mnt_point("unified", &cgroup_name);
+               if (!cgroup_name) {
+                       err("Unable find unified cgroup path\n");
+                       fclose(cgroup_systemd_path_f);
+                       return -1;
+               }
+
+               snprintf(cgroup_procs_path, sizeof(cgroup_procs_path), "%s/%s/cgroup.procs",
+                        cgroup_name, scope_name);
+               free(cgroup_name);
+
+               cgroup_unified_path_f = fopen(cgroup_procs_path, "we");
+               if (!cgroup_unified_path_f) {
+                       err("Failed to open %s\n", cgroup_procs_path);
+                       fclose(cgroup_systemd_path_f);
+                       return -1;
+               }
+       }
+
+       fprintf(cgroup_systemd_path_f, "%d", pid);
+       fflush(cgroup_systemd_path_f);
+       fclose(cgroup_systemd_path_f);
+
+       if (!is_cgroup_mode_hybrid())
+               return 0;
+
+       fprintf(cgroup_unified_path_f, "%d", pid);
+       fflush(cgroup_unified_path_f);
+       fclose(cgroup_unified_path_f);
+
+       return 0;
+}
+
+static int is_scope_parsed(const char * const path)
+{
+       /*
+        * As per, <kernel sources>/kernel/cgroup/cgroup.c::cgroup_init()
+        * At the max there can be only 16 controllers and we are
+        * not accounting for named hierarchies, which can be more
+        * than 16 themselves.
+        */
+       static char parsed_scope_path[MAX_MNT_ELEMENTS][FILENAME_MAX];
+       int i;
+
+       for (i = 0; i < MAX_MNT_ELEMENTS; i++) {
+               if (!strcmp(parsed_scope_path[i], path))
+                       return 1;
+               if (parsed_scope_path[i][0] == '\0') {
+                       snprintf(parsed_scope_path[i], FILENAME_MAX, "%s", path);
+                       break;
+               }
+       }
+
+       return 0;
+}
+
+/* Borrowed from src/api.c::__attach_task_pid */
+static int attach_task_pid(char *path, pid_t tid)
+{
+       FILE *tasks = NULL;
+       int ret = 0;
+
+       tasks = fopen(path, "we");
+       if (!tasks) {
+               switch (errno) {
+               case EPERM:
+                       ret = ECGROUPNOTOWNER;
+                       break;
+               case ENOENT:
+                       ret = ECGROUPNOTEXIST;
+                       break;
+               default:
+                       ret = ECGROUPNOTALLOWED;
+               }
+               goto err;
+       }
+       ret = fprintf(tasks, "%d", tid);
+       if (ret < 0) {
+               ret = ECGOTHER;
+               goto err;
+       }
+       ret = fflush(tasks);
+       if (ret) {
+               ret = ECGOTHER;
+               goto err;
+       }
+       fclose(tasks);
+       return 0;
+err:
+       err("cannot write tid %d to %s:%s\n", tid, path, strerror(errno));
+       if (tasks)
+               fclose(tasks);
+       return ret;
+}
+
+static int rollback_pid_cgroups(pid_t pid)
+{
+       char cgroup_proc_path[FILENAME_MAX + 14];
+       char cgroup_path[FILENAME_MAX];
+       int err = 0, idx = 0, ret = 0;
+       char *cgrp_proc_path = NULL;
+
+       /*
+        * unified cgroup rollback is simple, we need to write into
+        * single cgroup hierarchy.
+        */
+       if (is_cgroup_mode_unified()) {
+               pthread_rwlock_rdlock(&cg_mount_table_lock);
+               cg_build_path_locked(info[idx].cgrp_path, cgroup_path, NULL);
+               pthread_rwlock_unlock(&cg_mount_table_lock);
+
+               snprintf(cgroup_proc_path, FILENAME_MAX + 14, "%s/cgroup.procs", cgroup_path);
+               ret = attach_task_pid(cgroup_proc_path, pid);
+               return ret;
+       }
+
+       for (idx = 0; info[idx].ctrl_name[0] != '\0'; idx++) {
+               /* find the systemd cgroup path */
+               if (!strcmp(info[idx].ctrl_name, "name=systemd")) {
+                       find_mnt_point("name=systemd", &cgrp_proc_path);
+                       if (!cgrp_proc_path) {
+                               err("Unable find name=systemd cgroup path\n");
+                               return -1;
+                       }
+
+                       snprintf(cgroup_proc_path, FILENAME_MAX + 14, "%s/%s/cgroup.procs",
+                                cgrp_proc_path, info[idx].cgrp_path);
+                       free(cgrp_proc_path);
+
+               /* find the unified cgroup path */
+               } else if (is_cgroup_mode_hybrid() &&
+                          !strcmp(info[idx].ctrl_name, "unified")) {
+                       find_mnt_point("unified cgroup2", &cgrp_proc_path);
+                       if (!cgrp_proc_path) {
+                               err("Unable find unified cgroup path\n");
+                               return -1;
+                       }
+
+                       snprintf(cgroup_proc_path, FILENAME_MAX + 14, "%s/%s/cgroup.procs",
+                               cgrp_proc_path, info[idx].cgrp_path);
+                       free(cgrp_proc_path);
+
+               /* find other controller hierarchy path */
+               } else {
+                       pthread_rwlock_rdlock(&cg_mount_table_lock);
+                       cg_build_path_locked(info[idx].cgrp_path, cgroup_path, info[idx].ctrl_name);
+                       pthread_rwlock_unlock(&cg_mount_table_lock);
+
+                       snprintf(cgroup_proc_path, FILENAME_MAX + 14, "%s/cgroup.procs",
+                               cgroup_path);
+               }
+
+               /* record the error and continue */
+               ret = attach_task_pid(cgroup_proc_path, pid);
+               if (ret)
+                       err = -1;
+       }
+
+       return err;
 }