From 74680bf4a65bb9badcbe784bedd7067ed2fa0dbe Mon Sep 17 00:00:00 2001 From: Tom Hromatka Date: Wed, 26 Oct 2022 10:14:27 -0600 Subject: [PATCH] systemd: Add function to create a systemd scope Add a function, cgroup_create_scope(), to create a systemd scope. This scope can be delegated, in other words the cgroup management of this scope can be delegated away from systemd. This is the official way to create a cgroup that systemd will not interfere with. Signed-off-by: Tom Hromatka Reviewed-by: Kamalesh Babulal --- .github/actions/setup-libcgroup/action.yml | 2 +- configure.ac | 12 + include/Makefile.am | 4 + include/libcgroup.h | 1 + include/libcgroup/systemd.h | 61 +++++ src/.gitignore | 1 + src/Makefile.am | 21 ++ src/libcgroup.map | 2 + src/libcgroup_systemd_idle_thread.c | 11 + src/systemd.c | 283 +++++++++++++++++++++ 10 files changed, 397 insertions(+), 1 deletion(-) create mode 100644 include/libcgroup/systemd.h create mode 100644 src/libcgroup_systemd_idle_thread.c create mode 100644 src/systemd.c diff --git a/.github/actions/setup-libcgroup/action.yml b/.github/actions/setup-libcgroup/action.yml index 6f6c7ab5..29d3f85e 100644 --- a/.github/actions/setup-libcgroup/action.yml +++ b/.github/actions/setup-libcgroup/action.yml @@ -13,7 +13,7 @@ runs: steps: - run: sudo apt-get update shell: bash - - run: sudo apt-get install libpam-dev lcov python3-pip python3-dev cmake bison flex byacc g++ autoconf automake libtool -y + - run: sudo apt-get install libpam-dev lcov python3-pip python3-dev cmake bison flex byacc g++ autoconf automake libtool libsystemd-dev -y shell: bash - run: sudo pip install cython shell: bash diff --git a/configure.ac b/configure.ac index c31e7c85..2856a174 100644 --- a/configure.ac +++ b/configure.ac @@ -84,6 +84,18 @@ AC_DEFINE_UNQUOTED([ENABLE_PYTHON], [$(test "$enable_python" = yes && echo 1 || echo 0)], [Python bindings build flag.]) +AC_ARG_ENABLE([systemd], + [AS_HELP_STRING([--enable-systemd],[enable systemd support [default=yes]])], 
+ [ + if test "x$enableval" = xno; then + with_systemd=false + else + with_systemd=true + fi + ], + [with_systemd=true]) +AM_CONDITIONAL([WITH_SYSTEMD], [test x$with_systemd = xtrue]) + AC_ARG_ENABLE([initscript-install], [AS_HELP_STRING([--enable-initscript-install],[install init scripts [default=no]])], [ diff --git a/include/Makefile.am b/include/Makefile.am index 24e5bac9..23cebaac 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -3,3 +3,7 @@ nobase_include_HEADERS = libcgroup.h libcgroup/error.h libcgroup/init.h \ libcgroup/groups.h libcgroup/tasks.h \ libcgroup/iterators.h libcgroup/config.h \ libcgroup/log.h libcgroup/tools.h + +if WITH_SYSTEMD +nobase_include_HEADERS += libcgroup/systemd.h +endif diff --git a/include/libcgroup.h b/include/libcgroup.h index 3f7c759c..eddb356f 100644 --- a/include/libcgroup.h +++ b/include/libcgroup.h @@ -18,6 +18,7 @@ #include #include #include +#include #undef _LIBCGROUP_H_INSIDE diff --git a/include/libcgroup/systemd.h b/include/libcgroup/systemd.h new file mode 100644 index 00000000..f12772ab --- /dev/null +++ b/include/libcgroup/systemd.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: LGPL-2.1-only */ +#ifndef _LIBCGROUP_SYSTEMD_H +#define _LIBCGROUP_SYSTEMD_H + +#ifndef _LIBCGROUP_H_INSIDE +#error "Only should be included directly." +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +enum cgroup_systemd_mode_t { + CGROUP_SYSTEMD_MODE_FAIL = 0, + CGROUP_SYSTEMD_MODE_REPLACE, + CGROUP_SYSTEMD_MODE_ISOLATE, + CGROUP_SYSTEMD_MODE_IGNORE_DEPS, + CGROUP_SYSTEMD_MODE_IGNORE_REQS, + + CGROUP_SYSTEMD_MODE_CNT, + CGROUP_SYSTEMD_MODE_DFLT = CGROUP_SYSTEMD_MODE_REPLACE +}; + +/** + * Options associated with creating a systemd scope + */ +struct cgroup_systemd_scope_opts { + /** should systemd delegate this cgroup or not. 1 == yes, 0 == no */ + int delegated; + /** systemd behavior when the scope already exists */ + enum cgroup_systemd_mode_t mode; + /** pid to be placed in the cgroup. 
if negative (the default), libcgroup will create a dummy process */ + pid_t pid; +}; + +/** + * Populate the scope options structure with default values + * + * @param opts Scope creation options structure instance. Must already be allocated + * + * @return 0 on success and > 0 on error + */ +int cgroup_set_default_scope_opts(struct cgroup_systemd_scope_opts * const opts); + +/** + * Create a systemd scope under the specified slice + * + * @param scope_name Name of the scope, must end in .scope + * @param slice_name Name of the slice, must end in .slice + * @param opts Scope creation options structure instance + * + * @return 0 on success and > 0 on error + */ +int cgroup_create_scope(const char * const scope_name, const char * const slice_name, + const struct cgroup_systemd_scope_opts * const opts); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* _LIBCGROUP_SYSTEMD_H */ diff --git a/src/.gitignore b/src/.gitignore index 14925462..ed9b4ecf 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -1,3 +1,4 @@ +libcgroup_systemd_idle_thread lex.c parse.c parse.h diff --git a/src/Makefile.am b/src/Makefile.am index 6a838f1e..da02b013 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -18,22 +18,43 @@ AM_CPPFLAGS = -I$(top_srcdir)/include VERSION_NUMBER = $(LIBRARY_VERSION_MAJOR):$(LIBRARY_VERSION_MINOR):$(LIBRARY_VERSION_RELEASE) TESTING_MAP_FILE = $(top_srcdir)/tests/gunit/libcgroup_unittest.map +if WITH_SYSTEMD +libcgroup_systemd_idle_thread_SOURCES = libcgroup_systemd_idle_thread.c +bin_PROGRAMS = libcgroup_systemd_idle_thread +endif + lib_LTLIBRARIES = libcgroup.la libcgroup_la_SOURCES = parse.h parse.y lex.l api.c config.c libcgroup-internal.h libcgroup.map \ wrapper.c log.c abstraction-common.c abstraction-common.h \ abstraction-map.c abstraction-map.h abstraction-cpu.c abstraction-cpuset.c \ tools/cgxget.c tools/cgxset.c +if WITH_SYSTEMD +libcgroup_la_SOURCES += systemd.c +endif + libcgroup_la_LIBADD = -lpthread $(CODE_COVERAGE_LIBS) libcgroup_la_CFLAGS = 
$(CODE_COVERAGE_CFLAGS) -DSTATIC=static -DLIBCG_LIB -fPIC + libcgroup_la_LDFLAGS = -Wl,--version-script,$(srcdir)/libcgroup.map \ -version-number $(VERSION_NUMBER) +if WITH_SYSTEMD +libcgroup_la_LDFLAGS += -lsystemd +endif noinst_LTLIBRARIES = libcgroupfortesting.la libcgroupfortesting_la_SOURCES = parse.h parse.y lex.l api.c config.c libcgroup-internal.h \ libcgroup.map wrapper.c log.c abstraction-common.c \ abstraction-common.h abstraction-map.c abstraction-map.h \ abstraction-cpu.c abstraction-cpuset.c +if WITH_SYSTEMD +libcgroupfortesting_la_SOURCES += systemd.c +endif + libcgroupfortesting_la_LIBADD = -lpthread $(CODE_COVERAGE_LIBS) libcgroupfortesting_la_CFLAGS = $(CODE_COVERAGE_CFLAGS) -DSTATIC= -DUNIT_TEST + libcgroupfortesting_la_LDFLAGS = -Wl,--version-script,$(TESTING_MAP_FILE) \ -version-number $(VERSION_NUMBER) +if WITH_SYSTEMD +libcgroupfortesting_la_LDFLAGS += -lsystemd +endif diff --git a/src/libcgroup.map b/src/libcgroup.map index 0cecba2f..0d582982 100644 --- a/src/libcgroup.map +++ b/src/libcgroup.map @@ -149,4 +149,6 @@ CGROUP_3.0 { /* libcgroup 3.0.1 */ cgroup_setup_mode; + cgroup_create_scope; + cgroup_set_default_scope_opts; } CGROUP_2.0; diff --git a/src/libcgroup_systemd_idle_thread.c b/src/libcgroup_systemd_idle_thread.c new file mode 100644 index 00000000..1b3b06e4 --- /dev/null +++ b/src/libcgroup_systemd_idle_thread.c @@ -0,0 +1,11 @@ +#include + +#define SECS_PER_DAY (60 * 60 *24) + +int main(void) +{ + while(1) + sleep(1 * SECS_PER_DAY); + + return 0; +} diff --git a/src/systemd.c b/src/systemd.c new file mode 100644 index 00000000..cbd36c25 --- /dev/null +++ b/src/systemd.c @@ -0,0 +1,283 @@ +/* SPDX-License-Identifier: LGPL-2.1-only */ +/** + * Copyright (c) 2022 Oracle and/or its affiliates. 
+ * Author: Tom Hromatka
+ * Author: Silvia Chapa
+ */
+
+/*
+ * NOTE(review): the header names on the original #include lines were lost when this
+ * patch was extracted. The list below is reconstructed from the symbols used in this
+ * file - confirm it against the tree before merging.
+ */
+#include <libcgroup-internal.h>
+#include <systemd/sd-bus.h>
+#include <libcgroup.h>
+#include <stdbool.h>
+#include <assert.h>
+#include <signal.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <time.h>
+
+#define USEC_PER_SEC 1000000
+#define NSEC_PER_USEC 1000
+
+/* Longest time cgroup_create_scope() waits for systemd to finish the start job */
+#define SCOPE_CREATE_TIMEOUT_USEC (1 * USEC_PER_SEC)
+
+/* systemd job mode strings, indexed by enum cgroup_systemd_mode_t */
+static const char * const modes[] = {
+	"fail",			/* CGROUP_SYSTEMD_MODE_FAIL */
+	"replace",		/* CGROUP_SYSTEMD_MODE_REPLACE */
+	"isolate",		/* CGROUP_SYSTEMD_MODE_ISOLATE */
+	"ignore-dependencies",	/* CGROUP_SYSTEMD_MODE_IGNORE_DEPS */
+	"ignore-requirements",	/* CGROUP_SYSTEMD_MODE_IGNORE_REQS */
+};
+static_assert((sizeof(modes) / sizeof(modes[0])) == CGROUP_SYSTEMD_MODE_CNT,
+	      "modes[] array must be same length as CGROUP_SYSTEMD_MODE_CNT");
+
+static const char * const sender = "org.freedesktop.systemd1";
+static const char * const path = "/org/freedesktop/systemd1";
+static const char * const interface = "org.freedesktop.systemd1.Manager";
+
+/* State shared between cgroup_create_scope() and job_removed_callback() */
+struct scope_job {
+	/* job object path returned by StartTransientUnit; NULL until the reply is read */
+	const char *path;
+	/* set once the JobRemoved signal for ->path has been received */
+	bool removed;
+	/* true when that signal carried the result "done" */
+	bool succeeded;
+};
+
+/**
+ * Populate the scope options structure with default values
+ *
+ * The defaults are: delegated scope, "fail" job mode, and a negative pid so that
+ * cgroup_create_scope() spawns its own idle process.
+ *
+ * NOTE(review): the header defines CGROUP_SYSTEMD_MODE_DFLT as
+ * CGROUP_SYSTEMD_MODE_REPLACE, yet the default applied here is
+ * CGROUP_SYSTEMD_MODE_FAIL. Left unchanged to preserve behavior - confirm which one
+ * is intended.
+ *
+ * @param opts Scope creation options structure instance. Must already be allocated
+ *
+ * @return 0 on success and ECGINVAL if opts is NULL
+ */
+int cgroup_set_default_scope_opts(struct cgroup_systemd_scope_opts * const opts)
+{
+	if (!opts)
+		return ECGINVAL;
+
+	opts->delegated = 1;
+	opts->mode = CGROUP_SYSTEMD_MODE_FAIL;
+	opts->pid = -1;
+
+	return 0;
+}
+
+/*
+ * Returns true if str ends with suffix. Safe for strings shorter than the suffix.
+ */
+static bool has_suffix(const char * const str, const char * const suffix)
+{
+	const size_t str_len = strlen(str);
+	const size_t suffix_len = strlen(suffix);
+
+	if (str_len < suffix_len)
+		return false;
+
+	return strcmp(str + (str_len - suffix_len), suffix) == 0;
+}
+
+/*
+ * Returns time elapsed in usec
+ *
+ * The arithmetic is done in 64 bits so that a 32-bit time_t cannot overflow when the
+ * seconds are scaled to microseconds.
+ *
+ * Inspired-by: https://github.com/cockpit-project/cockpit/blob/main/src/tls/socket-io.c#L39
+ */
+static int64_t elapsed_time(const struct timespec * const start, const struct timespec * const end)
+{
+	int64_t elapsed = ((int64_t)end->tv_sec - (int64_t)start->tv_sec) * USEC_PER_SEC +
+			  ((int64_t)end->tv_nsec - (int64_t)start->tv_nsec) / NSEC_PER_USEC;
+
+	assert(elapsed >= 0);
+
+	return elapsed;
+}
+
+/*
+ * sd-bus handler for the systemd Manager "JobRemoved" signal.
+ *
+ * user_data points at the struct scope_job owned by cgroup_create_scope(). Signals
+ * for other jobs (or signals seen before the job path is known) are ignored. For our
+ * job, the result is recorded and ->removed is set so the waiting loop can stop.
+ *
+ * Always returns 0 so that sd-bus keeps dispatching.
+ */
+static int job_removed_callback(sd_bus_message *message, void *user_data, sd_bus_error *error)
+{
+	const char *result, *msg_path, *scope_name;
+	struct scope_job * const job = user_data;
+	int ret;
+
+	/* JobRemoved carries (u id, o job, s unit, s result); the id is not needed */
+	ret = sd_bus_message_read(message, "uoss", NULL, &msg_path, &scope_name, &result);
+	if (ret < 0) {
+		/* sd-bus reports failures through its negative return value, not errno */
+		cgroup_err("callback message read failed: %d\n", -ret);
+		return 0;
+	}
+
+	if (job->path == NULL || strcmp(msg_path, job->path) != 0) {
+		cgroup_dbg("Received a systemd signal, but it was not our message\n");
+		return 0;
+	}
+
+	cgroup_dbg("Received JobRemoved signal for scope %s. Result: %s\n", scope_name, result);
+
+	job->succeeded = (strcmp(result, "done") == 0);
+	job->removed = true;
+	return 0;
+}
+
+/**
+ * Create a systemd scope under the specified slice
+ *
+ * Sends StartTransientUnit to systemd over the system bus and waits (up to
+ * SCOPE_CREATE_TIMEOUT_USEC) for the matching JobRemoved signal.
+ *
+ * If opts->pid is not a valid pid (<= 0), a child running
+ * libcgroup_systemd_idle_thread is forked and placed in the scope; that child is sent
+ * SIGTERM if scope creation fails.
+ *
+ * @param scope_name Name of the scope, should end in .scope (a warning is logged if not)
+ * @param slice_name Name of the slice, should end in .slice (a warning is logged if not)
+ * @param opts Scope creation options structure instance
+ *
+ * @return 0 on success and > 0 on error: ECGINVAL for bad arguments or an unsupported
+ *	   mode, ECGOTHER if fork() failed, ECGFAIL for bus errors, a failed job or a
+ *	   timeout
+ */
+int cgroup_create_scope(const char * const scope_name, const char * const slice_name,
+			const struct cgroup_systemd_scope_opts * const opts)
+{
+	struct scope_job job = { .path = NULL, .removed = false, .succeeded = false };
+	sd_bus_message *msg = NULL, *reply = NULL;
+	sd_bus_error error = SD_BUS_ERROR_NULL;
+	struct timespec start, now;
+	bool spawned_child = false;
+	sd_bus_slot *slot = NULL;
+	sd_bus *bus = NULL;
+	int64_t elapsed;
+	pid_t child_pid;
+	int ret = 0;
+
+	if (!scope_name || !slice_name || !opts)
+		return ECGINVAL;
+
+	if (!has_suffix(scope_name, ".scope"))
+		cgroup_warn("scope doesn't have expected suffix\n");
+	if (!has_suffix(slice_name, ".slice"))
+		cgroup_warn("slice doesn't have expected suffix\n");
+
+	/* The unsigned compare also rejects negative values before indexing modes[] */
+	if ((unsigned int)opts->mode >= CGROUP_SYSTEMD_MODE_CNT) {
+		cgroup_err("invalid systemd mode: %d\n", opts->mode);
+		return ECGINVAL;
+	}
+
+	if (opts->mode == CGROUP_SYSTEMD_MODE_ISOLATE ||
+	    opts->mode == CGROUP_SYSTEMD_MODE_IGNORE_DEPS ||
+	    opts->mode == CGROUP_SYSTEMD_MODE_IGNORE_REQS) {
+		cgroup_err("unsupported systemd mode: %d\n", opts->mode);
+		return ECGINVAL;
+	}
+
+	/*
+	 * The default options use -1 and the public header documents 0 as "create a
+	 * dummy process"; neither is a pid systemd could place in a scope, so treat
+	 * both the same way.
+	 */
+	if (opts->pid <= 0) {
+		child_pid = fork();
+		if (child_pid < 0) {
+			cgroup_err("fork failed: %d\n", errno);
+			return ECGOTHER;
+		}
+
+		if (child_pid == 0) {
+			char *args[] = {"libcgroup_systemd_idle_thread", NULL};
+
+			/*
+			 * Have the child sleep forever.  Systemd will delete the scope if
+			 * there isn't a running process in it.
+			 */
+			execvp("libcgroup_systemd_idle_thread", args);
+
+			/*
+			 * Only reached when exec failed. Terminate here: returning would
+			 * let this child keep executing the caller's program as a second
+			 * copy of the parent.
+			 */
+			cgroup_err("failed to create system idle thread.\n");
+			_exit(1);
+		}
+
+		spawned_child = true;
+		cgroup_dbg("created libcgroup_system_idle thread pid %d\n", child_pid);
+	} else {
+		child_pid = opts->pid;
+	}
+	cgroup_dbg("pid %d will be placed in scope %s\n", child_pid, scope_name);
+
+	/* From here on sd-bus calls report errors as negative return values */
+	ret = sd_bus_default_system(&bus);
+	if (ret < 0) {
+		cgroup_err("failed to open the system bus: %d\n", -ret);
+		goto out;
+	}
+
+	/*
+	 * Keep the slot so the match can be removed before returning: its userdata
+	 * points at a local variable and must not outlive this function even if the
+	 * bus connection does.
+	 */
+	ret = sd_bus_match_signal(bus, &slot, sender, path, interface,
+				  "JobRemoved", job_removed_callback, &job);
+	if (ret < 0) {
+		cgroup_err("failed to install match callback: %d\n", -ret);
+		goto out;
+	}
+
+	ret = sd_bus_message_new_method_call(bus, &msg, sender, path, interface,
+					     "StartTransientUnit");
+	if (ret < 0) {
+		cgroup_err("failed to create the systemd msg: %d\n", -ret);
+		goto out;
+	}
+
+	ret = sd_bus_message_append(msg, "ss", scope_name, modes[opts->mode]);
+	if (ret < 0) {
+		cgroup_err("failed to append the scope name: %d\n", -ret);
+		goto out;
+	}
+
+	/* Unit properties: array of (name, variant) pairs */
+	ret = sd_bus_message_open_container(msg, 'a', "(sv)");
+	if (ret < 0) {
+		cgroup_err("failed to open container: %d\n", -ret);
+		goto out;
+	}
+
+	ret = sd_bus_message_append(msg, "(sv)", "Description", "s",
+				    "scope created by libcgroup");
+	if (ret < 0) {
+		cgroup_err("failed to append the description: %d\n", -ret);
+		goto out;
+	}
+
+	ret = sd_bus_message_append(msg, "(sv)", "PIDs", "au", 1, (uint32_t)child_pid);
+	if (ret < 0) {
+		cgroup_err("failed to append the PID: %d\n", -ret);
+		goto out;
+	}
+
+	ret = sd_bus_message_append(msg, "(sv)", "Slice", "s", slice_name);
+	if (ret < 0) {
+		cgroup_err("failed to append the slice: %d\n", -ret);
+		goto out;
+	}
+
+	if (opts->delegated == 1) {
+		ret = sd_bus_message_append(msg, "(sv)", "Delegate", "b", 1);
+		if (ret < 0) {
+			cgroup_err("failed to append delegate: %d\n", -ret);
+			goto out;
+		}
+	}
+
+	ret = sd_bus_message_close_container(msg);
+	if (ret < 0) {
+		cgroup_err("failed to close the container: %d\n", -ret);
+		goto out;
+	}
+
+	/* Empty list of auxiliary units */
+	ret = sd_bus_message_append(msg, "a(sa(sv))", 0);
+	if (ret < 0) {
+		cgroup_err("failed to append aux structure: %d\n", -ret);
+		goto out;
+	}
+
+	ret = sd_bus_call(bus, msg, 0, &error, &reply);
+	if (ret < 0) {
+		cgroup_err("sd_bus_call() failed: %d\n", -ret);
+		cgroup_err("error message: %s\n", error.message ? error.message : "(none)");
+		goto out;
+	}
+
+	/* Receive the job path from systemd; it stays valid while reply is referenced */
+	ret = sd_bus_message_read(reply, "o", &job.path);
+	if (ret < 0) {
+		cgroup_err("failed to read reply: %d\n", -ret);
+		goto out;
+	}
+
+	cgroup_dbg("job_path = %s\n", job.path);
+
+	ret = clock_gettime(CLOCK_MONOTONIC, &start);
+	if (ret < 0) {
+		cgroup_err("Failed to get time: %d\n", errno);
+		ret = ECGFAIL;
+		goto out;
+	}
+
+	/* The callback sets job.removed once systemd reports our job as finished */
+	while (!job.removed) {
+		ret = clock_gettime(CLOCK_MONOTONIC, &now);
+		if (ret < 0) {
+			cgroup_err("Failed to get time: %d\n", errno);
+			ret = ECGFAIL;
+			goto out;
+		}
+
+		elapsed = elapsed_time(&start, &now);
+		if (elapsed > SCOPE_CREATE_TIMEOUT_USEC) {
+			cgroup_err("The create scope command timed out\n");
+			ret = ECGFAIL;
+			goto out;
+		}
+
+		ret = sd_bus_process(bus, NULL);
+		if (ret < 0) {
+			cgroup_err("failed to process the sd bus: %d\n", -ret);
+			goto out;
+		}
+
+		/* Progress was made; there may be more queued work to process */
+		if (ret > 0)
+			continue;
+
+		/*
+		 * Per the sd_bus_wait() man page, call this function after sd_bus_process
+		 * returns zero.  Sleep until bus activity or until the deadline, rather
+		 * than polling.
+		 */
+		ret = sd_bus_wait(bus, (uint64_t)(SCOPE_CREATE_TIMEOUT_USEC - elapsed));
+		if (ret < 0) {
+			cgroup_err("failed to wait for sd bus: %d\n", -ret);
+			goto out;
+		}
+	}
+
+	if (!job.succeeded) {
+		cgroup_err("systemd failed to start scope %s\n", scope_name);
+		ret = ECGFAIL;
+		goto out;
+	}
+
+	ret = 0;
+
+out:
+	/* Do not leave our idle process behind when no scope was created for it */
+	if (ret && spawned_child)
+		kill(child_pid, SIGTERM);
+
+	sd_bus_slot_unref(slot);
+	sd_bus_error_free(&error);
+	sd_bus_message_unref(msg);
+	sd_bus_message_unref(reply);
+	sd_bus_unref(bus);
+
+	/* The public contract is "> 0 on error"; do not leak negative sd-bus codes */
+	if (ret < 0)
+		ret = ECGFAIL;
+
+	return ret;
+}
-- 
2.47.2