From: Kamalesh Babulal Date: Tue, 20 Jun 2023 11:44:25 +0000 (+0530) Subject: tools/cgclassify: add -r option for systemd scope X-Git-Tag: v3.1.0~48 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=db4e3521647e55224c862dc5dd3a67b9824988c9;p=thirdparty%2Flibcgroup.git tools/cgclassify: add -r option for systemd scope Provide an option for users to replace the default idle_thread created for a systemd scope. With the '-r' option specified, the first pid passed as an argument to cgclassify will replace the default idle_thread, if it exists. Also, if it's the only task, killing it will remove the .scope cgroup it's running in. $ sudo cgclassify -r -gcpu:libcgroup.slice/db.scope 1234 1235 For non-scope cgroups, it launches the task program. The algorithm in brief is as follows: Once the first pid of the argument list is migrated to the expected cgroup(s): 1. The controller and cgroup details of the pid are captured before the migration, so in case an error occurs, it is easy to roll back to the original cgroups the pid belonged to. 2. Once the migration is done, parse /proc/<pid>/cgroup of the first pid and if the cgroup name ends with .scope and has a task with /proc/<pid>/cmdline as "libcgroup_systemd_idle_thread", kill the task. This gives the illusion of replacement. 3. If an error occurs, migrate the pid back to the original cgroups. Signed-off-by: Kamalesh Babulal Signed-off-by: Tom Hromatka --- diff --git a/src/tools/cgclassify.c b/src/tools/cgclassify.c index f9f8aa88..1c505dc7 100644 --- a/src/tools/cgclassify.c +++ b/src/tools/cgclassify.c @@ -3,6 +3,10 @@ * Copyright RedHat Inc. 2008 * * Authors: Vivek Goyal + * + * Replace systemd idle_thread enhancements by Kamalesh Babulal + * Copyright (c) 2023 Oracle and/or its affiliates. 
+ * Author: Kamalesh Babulal */ #include "tools-common.h" @@ -15,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +31,18 @@ #define TEMP_BUF 81 +#define SYSTEMD_IDLE_THREAD "libcgroup_systemd_idle_thread" + +static pid_t find_scope_pid(pid_t pid, int capture); +static int write_systemd_unified(const char * const cgrp_name, pid_t pid); +static int is_scope_parsed(const char * const path); +static int rollback_pid_cgroups(pid_t pid); + +struct cgroup_info { + char ctrl_name[CONTROL_NAMELEN_MAX]; + char cgrp_path[FILENAME_MAX]; +}info[MAX_MNT_ELEMENTS]; + static void usage(int status, const char *program_name) { if (status != 0) { @@ -43,6 +60,8 @@ static void usage(int status, const char *program_name) info("pidlist and children tasks\n"); #ifdef WITH_SYSTEMD info(" -b Ignore default systemd delegate hierarchy\n"); + info(" -r Replace the default idle_thread spawned "); + info("for the systemd scope\n"); #endif } @@ -118,6 +137,9 @@ int main(int argc, char *argv[]) struct cgroup_group_spec *cgroup_list[CG_HIER_MAX]; int ignore_default_systemd_delegate_slice = 0; int ret = 0, i, exit_code = 0; + int skip_replace_idle = 0; + pid_t scope_pid = -1; + int replace_idle = 0; int cg_specified = 0; int flag = 0; char *endptr; @@ -131,11 +153,14 @@ int main(int argc, char *argv[]) memset(cgroup_list, 0, sizeof(cgroup_list)); #ifdef WITH_SYSTEMD - while ((c = getopt_long(argc, argv, "+g:shb", longopts, NULL)) > 0) { + while ((c = getopt_long(argc, argv, "+g:shbr", longopts, NULL)) > 0) { switch (c) { case 'b': ignore_default_systemd_delegate_slice = 1; break; + case 'r': + replace_idle = 1; + break; #else while ((c = getopt_long(argc, argv, "+g:sh", longopts, NULL)) > 0) { switch (c) { @@ -190,6 +215,14 @@ int main(int argc, char *argv[]) if (ret) exit_code = 1; + if (replace_idle && !skip_replace_idle) { + ret = find_scope_pid(pid, 1); + if (ret) { + err("Failed to read /proc/%u/cgroups\n", pid); + return 1; + } + } + if (cg_specified) ret = 
change_group_path(pid, cgroup_list); else @@ -198,7 +231,470 @@ int main(int argc, char *argv[]) /* if any group change fails */ if (ret) exit_code = 1; + + /* skip replacing of idle_thread in systemd slice */ + if (!replace_idle) + continue; + + /* systemd idle_thread is already replaced */ + if (skip_replace_idle) + continue; + + scope_pid = find_scope_pid(pid, 0); + if (scope_pid == -1) + goto err; + + skip_replace_idle = 1; + + ret = kill(scope_pid, SIGTERM); + if (ret) { + err("Failed to kill pid %u:%s\n", scope_pid, strerror(errno)); + goto err; + } } return exit_code; + +err: + exit_code = rollback_pid_cgroups(pid); + return exit_code; +} + +static pid_t search_systemd_idle_thread_task(pid_t pids[], size_t size) +{ + char task_cmd[FILENAME_MAX]; + char buffer[FILENAME_MAX]; + FILE *pid_cmd_fp = NULL; + int scope_pid = -1; + int i; + + for (i = 0; i < size; i++) { + snprintf(buffer, FILENAME_MAX, "/proc/%u/cmdline", pids[i]); + pid_cmd_fp = fopen(buffer, "re"); + /* task might have exited */ + if (!pid_cmd_fp) + continue; + + /* task might have exited, so consider only successful reads. */ + if (fgets(task_cmd, FILENAME_MAX, pid_cmd_fp)) { + if (!strcmp(task_cmd, SYSTEMD_IDLE_THREAD)) { + scope_pid = pids[i]; + fclose(pid_cmd_fp); + break; + } + } + fclose(pid_cmd_fp); + } + + return scope_pid; +} + +static pid_t find_scope_pid(pid_t pid, int capture) +{ + pid_t _scope_pid = -1, scope_pid = -1; + char ctrl_name[CONTROL_NAMELEN_MAX]; + char cgroup_name[FILENAME_MAX]; + char scope_name[FILENAME_MAX]; + int found_systemd_cgrp = 0; + int found_unified_cgrp = 0; + char buffer[FILENAME_MAX]; + FILE *pid_proc_fp = NULL; + char *_ctrl_name = NULL; + int idx, ret, size = 0; + pid_t *pids; + int i=0; + + /* + * Let's parse the cgroup of the pid, to check if its in one or + * more .scopes. 
+ */ + snprintf(buffer, FILENAME_MAX, "/proc/%u/cgroup", pid); + pid_proc_fp = fopen(buffer, "re"); + if (!pid_proc_fp) { + err("Failed to open: %s\n", buffer); + return -1; + } + + while (fgets(buffer, FILENAME_MAX, pid_proc_fp)) { + memset(ctrl_name, '\0', sizeof(ctrl_name)); + + /* check for overflow of controllers */ + if (i >= MAX_MNT_ELEMENTS) { + err("Found more than MAX_MNT_ELEMENTS controllers\n"); + scope_pid = -1; + goto out; + } + + /* read according to the cgroup mode */ + if (strstr(buffer, "::")) { + snprintf(ctrl_name, CONTROL_NAMELEN_MAX, "unified"); + ret = sscanf(buffer, "%d::%4096s\n", &idx, cgroup_name); + } else{ + ret = sscanf(buffer, "%d:%[^:]:%4096s\n", &idx, ctrl_name, cgroup_name); + } + + if (ret != 2 && ret != 3) { + err("Unrecognized cgroup file format: %s\n", buffer); + scope_pid = -1; + goto out; + } + + /* cgroup v1 might have shared mount points cpu,cpuacct */ + _ctrl_name = strchr(ctrl_name, ','); + if (_ctrl_name) { + size = strlen(ctrl_name) - strlen(_ctrl_name); + ctrl_name[size] = '\0'; + } + + /* + * capture is true, while the pid's controller and cgroups + * are populated for rollback case. 
+ */ + if (capture) { + snprintf(info[i].ctrl_name, CONTROL_NAMELEN_MAX, "%s", ctrl_name); + snprintf(info[i].cgrp_path, FILENAME_MAX, "%s", cgroup_name); + } + + if (!is_cgroup_mode_unified()) { + if (ret == 3 && !strncmp(ctrl_name, "name=", 5)) { + if (!strcmp(ctrl_name, "name=systemd")) { + i++; + found_systemd_cgrp = 1; + } + continue; + } else if (ret == 2) { + i++; + found_unified_cgrp = 1; + continue; + } + } + i++; + + /* we are not interested in other functionality */ + if (capture) + continue; + + /* skip if the cgroup path doesn't have systemd scope format */ + if (strstr(cgroup_name, ".scope") == NULL || + strstr(cgroup_name, ".slice") == NULL) + continue; + + /* skip if we have already searched cgroup for idle_thread */ + if (is_scope_parsed(cgroup_name)) + continue; + + + if (ret == 2) + ret = cgroup_get_procs(cgroup_name, NULL, &pids, &size); + else + ret = cgroup_get_procs(cgroup_name, ctrl_name, &pids, &size); + if (ret) { + err("Failed to read cgroup.procs of cgroup: %s\n", cgroup_name + 1); + scope_pid = -1; + goto out; + } + + /* + * .scope created by the non-libcgroup process, will not + * have libcgroup_systemd_idle_thread + */ + _scope_pid = search_systemd_idle_thread_task(pids, size); + free(pids); + + if (_scope_pid == -1) + continue; + + if (scope_pid == -1) { + /* + * cgexec pid needs to written into: + * ../systemd///cgroup.procs (legacy/hybrid) + * ../unified///cgroup.procs (hybrid) + */ + snprintf(scope_name, FILENAME_MAX, "%s", cgroup_name); + scope_pid = _scope_pid; + continue; + } + + if (_scope_pid != scope_pid) { + err("Failed to replace scope idle_thread, found two idle_thread\n"); + err(" %u %u\n", scope_pid, _scope_pid); + scope_pid = -1; + goto out; + } + } + + if (capture) { + scope_pid = 0; + goto out; + } + + if (scope_pid == -1) { + err("Failed to find idle_thread task\n"); + goto out; + } + + if (is_cgroup_mode_legacy() && (found_systemd_cgrp == 0 || found_unified_cgrp == 1)) { + err("cgroup legacy setup incorrect\n"); + 
scope_pid = -1; + goto out; + } + + if (is_cgroup_mode_hybrid() && (found_systemd_cgrp == 0 || found_unified_cgrp == 0)) { + err("cgroup hybrid setup incorrect\n"); + scope_pid = -1; + goto out; + } + + /* This is true for cgroup v1 (legacy/hybrid) */ + if (found_systemd_cgrp) { + ret = write_systemd_unified(scope_name, pid); + if (ret) + scope_pid = -1; + } + + info[i].ctrl_name[0] = '\0'; +out: + if (pid_proc_fp) + fclose(pid_proc_fp); + + return scope_pid; +} + +/* + * Parse the /proc/mounts file and look for the controller string + * in each line. If found copies the mount point into mnt_point, + * else return NULL mnt_point. + */ +static void find_mnt_point(const char * const controller, char **mnt_point) +{ + char proc_mount[] = "/proc/mounts"; + char cgroup_path[FILENAME_MAX]; + char buffer[FILENAME_MAX * 2]; + FILE *proc_mount_f = NULL; + int ret; + + *mnt_point = NULL; + + proc_mount_f = fopen(proc_mount, "re"); + if (proc_mount_f == NULL) { + err("Failed to read %s:%s\n", proc_mount, strerror(errno)); + goto out; + } + + while (fgets(buffer, (FILENAME_MAX * 2), proc_mount_f) != NULL) { + /* skip line that doesn't have controller */ + if (!strstr(buffer, controller)) + continue; + + if (strcmp(controller, "name=systemd") == 0) { + if (!strstr(buffer, "name=systemd ") && + !strstr(buffer, "name=systemd,")) + continue; + } + + ret = sscanf(buffer, "%*s %4096s\n", cgroup_path); + if (ret != 1) { + err("Failed during read of %s:%s\n", proc_mount, strerror(errno)); + goto out; + } + + *mnt_point = strdup(cgroup_path); + if (!*mnt_point) + err("strdup of %s failed\n", cgroup_path); + break; + } + +out: + if (proc_mount_f) + fclose(proc_mount_f); +} + +static int write_systemd_unified(const char * const scope_name, pid_t pid) +{ + char cgroup_procs_path[FILENAME_MAX + 14]; + FILE *cgroup_systemd_path_f = NULL; + FILE *cgroup_unified_path_f = NULL; + char *cgroup_name = NULL; + + /* construct the systemd cgroup path, by parsing /proc/mounts */ + 
find_mnt_point("name=systemd", &cgroup_name); + if (!cgroup_name) { + err("Unable find name=systemd cgroup path\n"); + return -1; + } + + snprintf(cgroup_procs_path, sizeof(cgroup_procs_path), "%s/%s/cgroup.procs", + cgroup_name, scope_name); + free(cgroup_name); + + cgroup_systemd_path_f = fopen(cgroup_procs_path, "we"); + if (!cgroup_systemd_path_f) { + err("Failed to open %s\n", cgroup_procs_path); + return -1; + } + + if (is_cgroup_mode_hybrid()) { + /* + * construct the unified cgroup path, by parsing + * /proc/mounts + */ + find_mnt_point("unified", &cgroup_name); + if (!cgroup_name) { + err("Unable find unified cgroup path\n"); + fclose(cgroup_systemd_path_f); + return -1; + } + + snprintf(cgroup_procs_path, sizeof(cgroup_procs_path), "%s/%s/cgroup.procs", + cgroup_name, scope_name); + free(cgroup_name); + + cgroup_unified_path_f = fopen(cgroup_procs_path, "we"); + if (!cgroup_unified_path_f) { + err("Failed to open %s\n", cgroup_procs_path); + fclose(cgroup_systemd_path_f); + return -1; + } + } + + fprintf(cgroup_systemd_path_f, "%d", pid); + fflush(cgroup_systemd_path_f); + fclose(cgroup_systemd_path_f); + + if (!is_cgroup_mode_hybrid()) + return 0; + + fprintf(cgroup_unified_path_f, "%d", pid); + fflush(cgroup_unified_path_f); + fclose(cgroup_unified_path_f); + + return 0; +} + +static int is_scope_parsed(const char * const path) +{ + /* + * As per, /kernel/cgroup/cgroup.c::cgroup_init() + * At the max there can be only 16 controllers and we are + * not accounting for named hierarchies, which can be more + * than 16 themselves. 
+ */ + static char parsed_scope_path[MAX_MNT_ELEMENTS][FILENAME_MAX]; + int i; + + for (i = 0; i < MAX_MNT_ELEMENTS; i++) { + if (!strcmp(parsed_scope_path[i], path)) + return 1; + if (parsed_scope_path[i][0] == '\0') { + snprintf(parsed_scope_path[i], FILENAME_MAX, "%s", path); + break; + } + } + + return 0; +} + +/* Borrowed from src/api.c::__attach_task_pid */ +static int attach_task_pid(char *path, pid_t tid) +{ + FILE *tasks = NULL; + int ret = 0; + + tasks = fopen(path, "we"); + if (!tasks) { + switch (errno) { + case EPERM: + ret = ECGROUPNOTOWNER; + break; + case ENOENT: + ret = ECGROUPNOTEXIST; + break; + default: + ret = ECGROUPNOTALLOWED; + } + goto err; + } + ret = fprintf(tasks, "%d", tid); + if (ret < 0) { + ret = ECGOTHER; + goto err; + } + ret = fflush(tasks); + if (ret) { + ret = ECGOTHER; + goto err; + } + fclose(tasks); + return 0; +err: + err("cannot write tid %d to %s:%s\n", tid, path, strerror(errno)); + if (tasks) + fclose(tasks); + return ret; +} + +static int rollback_pid_cgroups(pid_t pid) +{ + char cgroup_proc_path[FILENAME_MAX + 14]; + char cgroup_path[FILENAME_MAX]; + int err = 0, idx = 0, ret = 0; + char *cgrp_proc_path = NULL; + + /* + * unified cgroup rollback is simple, we need to write into + * single cgroup hierarchy. 
+ */ + if (is_cgroup_mode_unified()) { + pthread_rwlock_rdlock(&cg_mount_table_lock); + cg_build_path_locked(info[idx].cgrp_path, cgroup_path, NULL); + pthread_rwlock_unlock(&cg_mount_table_lock); + + snprintf(cgroup_proc_path, FILENAME_MAX + 14, "%s/cgroup.procs", cgroup_path); + ret = attach_task_pid(cgroup_proc_path, pid); + return ret; + } + + for (idx = 0; info[idx].ctrl_name[0] != '\0'; idx++) { + /* find the systemd cgroup path */ + if (!strcmp(info[idx].ctrl_name, "name=systemd")) { + find_mnt_point("name=systemd", &cgrp_proc_path); + if (!cgrp_proc_path) { + err("Unable find name=systemd cgroup path\n"); + return -1; + } + + snprintf(cgroup_proc_path, FILENAME_MAX + 14, "%s/%s/cgroup.procs", + cgrp_proc_path, info[idx].cgrp_path); + free(cgrp_proc_path); + + /* find the unified cgroup path */ + } else if (is_cgroup_mode_hybrid() && + !strcmp(info[idx].ctrl_name, "unified")) { + find_mnt_point("unified cgroup2", &cgrp_proc_path); + if (!cgrp_proc_path) { + err("Unable find unified cgroup path\n"); + return -1; + } + + snprintf(cgroup_proc_path, FILENAME_MAX + 14, "%s/%s/cgroup.procs", + cgrp_proc_path, info[idx].cgrp_path); + free(cgrp_proc_path); + + /* find other controller hierarchy path */ + } else { + pthread_rwlock_rdlock(&cg_mount_table_lock); + cg_build_path_locked(info[idx].cgrp_path, cgroup_path, info[idx].ctrl_name); + pthread_rwlock_unlock(&cg_mount_table_lock); + + snprintf(cgroup_proc_path, FILENAME_MAX + 14, "%s/cgroup.procs", + cgroup_path); + } + + /* record the error and continue */ + ret = attach_task_pid(cgroup_proc_path, pid); + if (ret) + err = -1; + } + + return err; }