]> git.ipfire.org Git - thirdparty/libcgroup.git/commitdiff
tools/cgexec: add -r option for systemd scope
authorKamalesh Babulal <kamalesh.babulal@oracle.com>
Thu, 15 Jun 2023 10:36:54 +0000 (16:06 +0530)
committerTom Hromatka <tom.hromatka@oracle.com>
Thu, 15 Jun 2023 19:27:52 +0000 (13:27 -0600)
Provide an option for users to replace the default idle_thread created
for systemd scope.  With '-r' option specified, the program passed as
the argument to cgexec will replace the default idle_thread, if it
exists.  Also, if it's the only task, killing it will remove the .scope
cgroup it's running in.

$ sudo cgexec -r -gcpu:libcgroup.slice/db.scope ./new_default

For non-scope cgroups, it simply launches the task program. In brief,
the algorithm is as follows.
Once the current task (the cgexec instance) has been migrated to the
expected cgroup(s):
1. fork() and, in the child path, exec the program passed on the
   command line.  This means that both the cgexec instance and the
   program are executing in the desired cgroup(s).
2. In the parent, parse /proc/<pid>/cgroup of the current process
   (cgexec) and if the cgroup name ends with .scope and has a task
   with /proc/<pid>/cmdline as "libcgroup_systemd_idle_thread", kill
   the task. This gives the illusion of replacement.

Signed-off-by: Kamalesh Babulal <kamalesh.babulal@oracle.com>
Signed-off-by: Tom Hromatka <tom.hromatka@oracle.com>
src/tools/cgexec.c

index 00627cca706c93d3803566fe430dca748e06256d..11246273fccff5239240963f322808db6e38e9af 100644 (file)
@@ -3,6 +3,10 @@
  * Copyright RedHat Inc. 2008
  *
  * Authors:    Vivek Goyal <vgoyal@redhat.com>
+ *
+ * Replace systemd idle_thread enhancements by Kamalesh Babulal
+ * Copyright (c) 2023 Oracle and/or its affiliates.
+ * Author: Kamalesh Babulal <kamalesh.babulal@oracle.com>
  */
 
 #ifndef _GNU_SOURCE
 #include <string.h>
 #include <unistd.h>
 #include <getopt.h>
+#include <signal.h>
 #include <stdio.h>
 #include <errno.h>
 #include <grp.h>
 #include <pwd.h>
 
 #include <sys/mount.h>
-#include <sys/stat.h>
 #include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+
+#define SYSTEMD_IDLE_THREAD    "libcgroup_systemd_idle_thread"
 
+static pid_t find_scope_pid(pid_t pid);
+static int write_systemd_unified(const char * const scope_name);
+static int is_scope_parsed(const char * const path);
 
 static struct option longopts[] = {
        {"sticky",      no_argument, NULL, 's'},
@@ -51,6 +62,8 @@ static void usage(int status, const char *program_name)
        info("change pidlist and children tasks\n");
 #ifdef WITH_SYSTEMD
        info("  -b                              Ignore default systemd delegate hierarchy\n");
+       info("  -r                              Replace the default idle_thread spawned ");
+       info("for the systemd scope\n");
 #endif
 }
 
@@ -58,6 +71,9 @@ int main(int argc, char *argv[])
 {
        struct cgroup_group_spec *cgroup_list[CG_HIER_MAX];
        int ignore_default_systemd_delegate_slice = 0;
+       pid_t scope_pid = -1;
+       int child_status = 0;
+       int replace_idle = 0;
        int cg_specified = 0;
        int flag_child = 0;
        int i, ret = 0;
@@ -68,11 +84,14 @@ int main(int argc, char *argv[])
 
        memset(cgroup_list, 0, sizeof(cgroup_list));
 #ifdef WITH_SYSTEMD
-       while ((c = getopt_long(argc, argv, "+g:shb", longopts, NULL)) > 0) {
+       while ((c = getopt_long(argc, argv, "+g:shbr", longopts, NULL)) > 0) {
                switch (c) {
                case 'b':
                        ignore_default_systemd_delegate_slice = 1;
                        break;
+               case 'r':
+                       replace_idle = 1;
+                       break;
 #else
        while ((c = getopt_long(argc, argv, "+g:sh", longopts, NULL)) > 0) {
                switch (c) {
@@ -171,12 +190,331 @@ int main(int argc, char *argv[])
                }
        }
 
+       if (!replace_idle) {
+               /* Now exec the new process */
+               execvp(argv[optind], &argv[optind]);
+               err("exec failed:%s", strerror(errno));
+               return -1;
+       }
+
+       scope_pid = find_scope_pid(pid);
+       if (scope_pid == -1)
+               return -1;
+
+       pid = fork();
+       if (pid == -1) {
+               err("Fork failed for pid %u:%s\n", pid, strerror(errno));
+               return -1;
+       }
+
+       /* child process kills the spawned idle_thread */
+       if (pid == 0) {
+               ret = kill(scope_pid, SIGTERM);
+               if (ret) {
+                       err("Failed to kill pid %u:%s\n", scope_pid, strerror(errno));
+                       exit(1);
+               }
+
+               exit(0);
+       }
+
+       wait(&child_status);
+       if (WEXITSTATUS(child_status))
+               return -1;
+
        /* Now exec the new process */
-       ret = execvp(argv[optind], &argv[optind]);
-       if (ret == -1) {
-               err("%s", strerror(errno));
+       execvp(argv[optind], &argv[optind]);
+       err("exec failed:%s", strerror(errno));
+
+       return -1;
+}
+
+/*
+ * Look through pids for the libcgroup idle_thread task.
+ *
+ * For each pid, the first string read from /proc/<pid>/cmdline is
+ * compared against SYSTEMD_IDLE_THREAD.  Pids whose cmdline cannot be
+ * opened or read (e.g. the task already exited) are skipped.
+ *
+ * Returns the first matching pid, or -1 if none of the pids match.
+ */
+static pid_t search_systemd_idle_thread_task(pid_t pids[], size_t size)
+{
+       char task_cmd[FILENAME_MAX];
+       char buffer[FILENAME_MAX];
+       FILE *pid_cmd_fp = NULL;
+       pid_t scope_pid = -1;
+       int is_idle_thread;
+       size_t i;
+
+       for (i = 0; i < size; i++) {
+               snprintf(buffer, FILENAME_MAX, "/proc/%u/cmdline", pids[i]);
+               pid_cmd_fp = fopen(buffer, "re");
+               /* task might have exited */
+               if (!pid_cmd_fp)
+                       continue;
+
+               /* task might have exited, so consider only successful reads. */
+               is_idle_thread = fgets(task_cmd, FILENAME_MAX, pid_cmd_fp) &&
+                                !strcmp(task_cmd, SYSTEMD_IDLE_THREAD);
+
+               /* single close point, whatever the outcome of the read */
+               fclose(pid_cmd_fp);
+
+               if (is_idle_thread) {
+                       scope_pid = pids[i];
+                       break;
+               }
+       }
+
+       return scope_pid;
+}
+
+/*
+ * Find the libcgroup idle_thread task of the systemd scope pid runs in.
+ *
+ * Parses /proc/<pid>/cgroup and, for every cgroup path that contains
+ * both ".scope" and ".slice", reads the cgroup's process list and
+ * searches it for the idle_thread.  All scopes must agree on a single
+ * idle_thread pid.  When a "name=systemd" line was seen (cgroup v1,
+ * legacy/hybrid), write_systemd_unified() is called with the scope
+ * path as the last step.
+ *
+ * Returns the idle_thread pid, or -1 on any failure.
+ */
+static pid_t find_scope_pid(pid_t pid)
+{
+       pid_t _scope_pid = -1, scope_pid = -1;
+       char cgroup_name[FILENAME_MAX];
+       char scope_name[FILENAME_MAX];
+       int found_systemd_cgrp = 0;
+       int found_unified_cgrp = 0;
+       /*
+        * Sized like the line buffer: the unbounded %[^:] conversion
+        * below can never store more than one line read by fgets().
+        */
+       char ctrl_name[FILENAME_MAX];
+       char buffer[FILENAME_MAX];
+       FILE *pid_proc_fp = NULL;
+       char *_ctrl_name = NULL;
+       int idx, ret, size = 0;
+       int unified_line;
+       pid_t *pids = NULL;
+
+       /* Let's parse the cgroup of the pid, to check if its in one or
+        * more .scopes.
+        */
+       snprintf(buffer, FILENAME_MAX, "/proc/%u/cgroup", pid);
+       pid_proc_fp = fopen(buffer, "re");
+       if (!pid_proc_fp) {
+               err("Failed to open: %s\n", buffer);
+               return -1;
+       }
+
+       while (fgets(buffer, FILENAME_MAX, pid_proc_fp)) {
+               /* "::" lines carry no controller name, keep it a valid string */
+               ctrl_name[0] = '\0';
+
+               /* read according to the cgroup mode */
+               unified_line = (strstr(buffer, "::") != NULL);
+               if (unified_line)
+                       ret = sscanf(buffer, "%d::%4096s\n", &idx, cgroup_name);
+               else
+                       ret = sscanf(buffer, "%d:%[^:]:%4096s\n", &idx, ctrl_name, cgroup_name);
+
+               /*
+                * Every field of the selected format must have been
+                * converted, else cgroup_name holds stale data.
+                */
+               if (ret != (unified_line ? 2 : 3)) {
+                       err("Unrecognized cgroup file format: %s\n", buffer);
+                       scope_pid = -1;
+                       goto out;
+               }
+
+               if (!is_cgroup_mode_unified()) {
+                       if (!unified_line && !strncmp(ctrl_name, "name=systemd", 12)) {
+                               found_systemd_cgrp = 1;
+                               continue;
+                       } else if (unified_line) {
+                               found_unified_cgrp = 1;
+                               continue;
+                       }
+               }
+
+               /* skip if the cgroup path doesn't have systemd scope format */
+               if (strstr(cgroup_name, ".scope") == NULL ||
+                   strstr(cgroup_name, ".slice") == NULL)
+                       continue;
+
+               /* skip if we have already searched cgroup for idle_thread */
+               if (is_scope_parsed(cgroup_name))
+                       continue;
+
+               /* cgroup v1 might have shared mount points cpu,cpuacct */
+               _ctrl_name = strchr(ctrl_name, ',');
+               if (_ctrl_name)
+                       *_ctrl_name = '\0';
+
+               pids = NULL;
+               if (unified_line)
+                       ret = cgroup_get_procs(cgroup_name, NULL, &pids, &size);
+               else
+                       ret = cgroup_get_procs(cgroup_name, ctrl_name, &pids, &size);
+               if (ret) {
+                       err("Failed to read cgroup.procs of cgroup: %s\n", cgroup_name + 1);
+                       scope_pid = -1;
+                       goto out;
+               }
+
+               _scope_pid = search_systemd_idle_thread_task(pids, size);
+
+               /*
+                * NOTE(review): assumes cgroup_get_procs() hands back a
+                * malloc()ed list that the caller has to free - confirm
+                * against the libcgroup API documentation.
+                */
+               free(pids);
+               pids = NULL;
+
+               /*
+                * .scope created by the non-libcgroup process, will not
+                * have libcgroup_systemd_idle_thread
+                */
+               if (_scope_pid == -1)
+                       continue;
+
+               if (scope_pid == -1) {
+                       /*
+                        * cgexec pid needs to be written into:
+                        * ../systemd/<slice>/<scope>/cgroup.procs (legacy/hybrid)
+                        * ../unified/<slice>/<scope>/cgroup.procs (hybrid)
+                        */
+                       snprintf(scope_name, FILENAME_MAX, "%s", cgroup_name);
+                       scope_pid = _scope_pid;
+                       continue;
+               }
+
+               if (_scope_pid != scope_pid) {
+                       err("Failed to replace scope idle_thread, found two idle_thread\n");
+                       err(" %u %u\n", scope_pid, _scope_pid);
+                       /* ambiguous target: report failure, not the first match */
+                       scope_pid = -1;
+                       goto out;
+               }
+       }
+
+       if (scope_pid == -1) {
+               err("Failed to find idle_thread task\n");
+               goto out;
+       }
+
+       if (is_cgroup_mode_legacy() && (found_systemd_cgrp == 0 || found_unified_cgrp == 1)) {
+               err("cgroup legacy setup incorrect\n");
+               scope_pid = -1;
+               goto out;
+       }
+
+       if (is_cgroup_mode_hybrid() && (found_systemd_cgrp == 0 || found_unified_cgrp == 0)) {
+               err("cgroup hybrid setup incorrect\n");
+               scope_pid = -1;
+               goto out;
+       }
+
+       /* This is true for cgroup v1 (legacy/hybrid) */
+       if (found_systemd_cgrp) {
+               ret = write_systemd_unified(scope_name);
+               if (ret)
+                       scope_pid = -1;
+       }
+
+out:
+       if (pid_proc_fp)
+               fclose(pid_proc_fp);
+
+       return scope_pid;
+}
+
+/*
+ * Parse the /proc/mounts file and look for the controller string
+ * in each line.  On the first line containing it, the second
+ * whitespace separated field (the mount point) is strdup()ed into
+ * *mnt_point; the caller is expected to free() it.
+ *
+ * *mnt_point is left NULL if /proc/mounts cannot be opened, no line
+ * matches, the matching line cannot be parsed or strdup() fails.
+ */
+static void find_mnt_point(const char * const controller, char **mnt_point)
+{
+       char proc_mount[] = "/proc/mounts";
+       char buffer[FILENAME_MAX * 2];
+       /*
+        * Sized like the line buffer, so that the token stored by the
+        * %4096s conversion below plus its terminating NUL always fits.
+        */
+       char cgroup_path[FILENAME_MAX * 2];
+       FILE *proc_mount_f = NULL;
+       int ret;
+
+       *mnt_point = NULL;
+
+       proc_mount_f = fopen(proc_mount, "re");
+       if (proc_mount_f == NULL) {
+               err("Failed to read %s:%s\n", proc_mount, strerror(errno));
+               goto out;
+       }
+
+       while (fgets(buffer, sizeof(buffer), proc_mount_f) != NULL) {
+               /* skip line that doesn't have controller */
+               if (!strstr(buffer, controller))
+                       continue;
+
+               /* first field is the device, second one the mount point */
+               ret = sscanf(buffer, "%*s %4096s\n", cgroup_path);
+               if (ret != 1) {
+                       err("Failed during read of %s:%s\n", proc_mount, strerror(errno));
+                       goto out;
+               }
+
+               *mnt_point = strdup(cgroup_path);
+               if (!*mnt_point)
+                       err("strdup of %s failed\n", cgroup_path);
+               break;
+       }
+
+out:
+       if (proc_mount_f)
+               fclose(proc_mount_f);
+}
+
+/*
+ * Write pid into the already opened cgroup.procs stream procs_f and
+ * close the stream.  hier names the hierarchy in the error message.
+ *
+ * Returns 0 on success, -1 if the write, the flush or the close failed.
+ */
+static int write_pid_and_close(FILE *procs_f, pid_t pid, const char * const hier)
+{
+       int saved_errno = 0;
+       int failed = 0;
+
+       /* stdio buffers the pid, so the flush result has to be checked too */
+       if (fprintf(procs_f, "%d", pid) < 0 || fflush(procs_f) == EOF) {
+               saved_errno = errno;
+               failed = 1;
+       }
+
+       /* always close; keep the errno of the first failure */
+       if (fclose(procs_f) == EOF && !failed) {
+               saved_errno = errno;
+               failed = 1;
+       }
+
+       if (failed) {
+               err("Failed to write pid %d to %s cgroup.procs:%s\n", pid, hier,
+                   strerror(saved_errno));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * Write the pid of the calling process into the cgroup.procs of
+ * scope_name below the "name=systemd" mount point and, in hybrid
+ * mode, also below the unified (cgroup2) mount point.  Both mount
+ * points are looked up with find_mnt_point().
+ *
+ * Returns 0 on success, -1 if a mount point is not found, a
+ * cgroup.procs file cannot be opened or writing the pid fails.
+ */
+static int write_systemd_unified(const char * const scope_name)
+{
+       char cgroup_procs_path[FILENAME_MAX * 2 + 25];
+       FILE *cgroup_systemd_path_f = NULL;
+       FILE *cgroup_unified_path_f = NULL;
+       char *cgroup_name = NULL;
+       pid_t pid;
+
+       /* construct the systemd cgroup path, by parsing /proc/mounts */
+       find_mnt_point("name=systemd ", &cgroup_name);
+       if (!cgroup_name) {
+               err("Unable find name=systemd cgroup path\n");
                return -1;
        }
 
+       snprintf(cgroup_procs_path, sizeof(cgroup_procs_path), "%s/%s/cgroup.procs",
+                cgroup_name, scope_name);
+       free(cgroup_name);
+
+       cgroup_systemd_path_f = fopen(cgroup_procs_path, "we");
+       if (!cgroup_systemd_path_f) {
+               err("Failed to open %s\n", cgroup_procs_path);
+               return -1;
+       }
+
+       if (is_cgroup_mode_hybrid()) {
+               /*
+                * construct the unified cgroup path, by parsing
+                * /proc/mounts
+                */
+               find_mnt_point("unified cgroup2", &cgroup_name);
+               if (!cgroup_name) {
+                       err("Unable find unified cgroup path\n");
+                       fclose(cgroup_systemd_path_f);
+                       return -1;
+               }
+
+               snprintf(cgroup_procs_path, sizeof(cgroup_procs_path), "%s/%s/cgroup.procs",
+                               cgroup_name, scope_name);
+               free(cgroup_name);
+
+               cgroup_unified_path_f = fopen(cgroup_procs_path, "we");
+               if (!cgroup_unified_path_f) {
+                       err("Failed to open %s\n", cgroup_procs_path);
+                       fclose(cgroup_systemd_path_f);
+                       return -1;
+               }
+       }
+
+       pid = getpid();
+
+       if (write_pid_and_close(cgroup_systemd_path_f, pid, "systemd")) {
+               if (cgroup_unified_path_f)
+                       fclose(cgroup_unified_path_f);
+               return -1;
+       }
+
+       /* the unified stream is only opened in hybrid mode */
+       if (!cgroup_unified_path_f)
+               return 0;
+
+       return write_pid_and_close(cgroup_unified_path_f, pid, "unified");
+}
+
+/*
+ * Remember which scope paths have been looked at already.
+ *
+ * Returns 1 if path was recorded by an earlier call.  Otherwise the
+ * path is stored in the first free slot, if there is one, and 0 is
+ * returned.
+ */
+static int is_scope_parsed(const char * const path)
+{
+       /*
+        * As per <kernel sources>/kernel/cgroup/cgroup.c::cgroup_init()
+        * there are at most 16 controllers.  Named hierarchies, which
+        * may add more than 16 entries on their own, are not
+        * accounted for.
+        */
+       static char seen_scopes[MAX_MNT_ELEMENTS][FILENAME_MAX];
+       char *slot;
+       int idx = 0;
+
+       while (idx < MAX_MNT_ELEMENTS) {
+               slot = seen_scopes[idx++];
+
+               if (strcmp(slot, path) == 0)
+                       return 1;
+
+               /* occupied by some other scope, keep looking */
+               if (*slot != '\0')
+                       continue;
+
+               /* first free slot: record the path and stop */
+               snprintf(slot, FILENAME_MAX, "%s", path);
+               break;
+       }
+
        return 0;
 }