]> git.ipfire.org Git - thirdparty/lxc.git/commitdiff
cgroups: add cgroup2 device controller support 3194/head
author: Christian Brauner <christian.brauner@ubuntu.com>
Thu, 28 Nov 2019 15:22:36 +0000 (16:22 +0100)
committer: Christian Brauner <christian.brauner@ubuntu.com>
Fri, 29 Nov 2019 16:10:13 +0000 (17:10 +0100)
Add a bpf-based device controller implementation.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
configure.ac
src/lxc/Makefile.am
src/lxc/cgroups/cgfsng.c
src/lxc/cgroups/cgroup.h
src/lxc/cgroups/cgroup2_devices.c [new file with mode: 0644]
src/lxc/cgroups/cgroup2_devices.h [new file with mode: 0644]
src/lxc/conf.c
src/lxc/conf.h
src/lxc/log.h
src/lxc/macro.h
src/lxc/start.c

index 4c1a10b3d7a2473d788765d3713f0a164b5a361e..19cb1e72e56bc552c177cdbe55e66056a5b8659b 100644 (file)
@@ -368,6 +368,10 @@ AC_CHECK_TYPES([struct seccomp_notif_sizes], [], [], [[#include <seccomp.h>]])
 AC_CHECK_DECLS([seccomp_syscall_resolve_name_arch], [], [], [[#include <seccomp.h>]])
 CFLAGS="$OLD_CFLAGS"
 
+AC_CHECK_HEADERS([linux/bpf.h], [
+       AC_CHECK_TYPES([struct bpf_cgroup_dev_ctx], [], [], [[#include <linux/bpf.h>]])
+], [], [])
+
 # Configuration examples
 AC_ARG_ENABLE([examples],
        [AS_HELP_STRING([--enable-examples], [install examples [default=yes]])],
index 4b18ac5d82831b6dfe7925c85e214d9113e1180a..56c64f596a8771f729e261d83abdc1c05c9231c6 100644 (file)
@@ -7,6 +7,7 @@ noinst_HEADERS = api_extensions.h \
                 caps.h \
                 cgroups/cgroup.h \
                 cgroups/cgroup_utils.h \
+                cgroups/cgroup2_devices.h \
                 compiler.h \
                 conf.h \
                 confile.h \
@@ -95,6 +96,7 @@ liblxc_la_SOURCES = af_unix.c af_unix.h \
                    caps.c caps.h \
                    cgroups/cgfsng.c \
                    cgroups/cgroup.c cgroups/cgroup.h \
+                   cgroups/cgroup2_devices.c cgroups/cgroup2_devices.h \
                    cgroups/cgroup_utils.c cgroups/cgroup_utils.h \
                    compiler.h \
                    commands.c commands.h \
index 1e6a45cff2071ed073c4e44d4afc460f55b39e25..20e28d66d7dd361d3443c9f2110d321cfa6e549b 100644 (file)
@@ -54,6 +54,7 @@
 
 #include "caps.h"
 #include "cgroup.h"
+#include "cgroup2_devices.h"
 #include "cgroup_utils.h"
 #include "commands.h"
 #include "conf.h"
@@ -1105,6 +1106,12 @@ __cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
        wrap.hierarchies = ops->hierarchies;
        wrap.conf = handler->conf;
 
+#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
+       ret = bpf_program_cgroup_detach(handler->conf->cgroup2_devices);
+       if (ret < 0)
+               WARN("Failed to detach bpf program from cgroup");
+#endif
+
        if (handler->conf && !lxc_list_empty(&handler->conf->id_map))
                ret = userns_exec_1(handler->conf, cgroup_rmdir_wrapper, &wrap,
                                    "cgroup_rmdir_wrapper");
@@ -2474,8 +2481,146 @@ out:
        return ret;
 }
 
+/*
+ * Some of the parsing logic comes from the original cgroup device v1
+ * implementation in the kernel.
+ */
+#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
+/*
+ * Parse the major or minor field of a device rule starting at *cursor.
+ *
+ * A '*' is stored as ~0 (any device number); a run of decimal digits is
+ * converted with lxc_safe_uint(). On success *cursor is advanced past the
+ * field and 0 is returned. Malformed input returns -1 and leaves *cursor
+ * untouched.
+ */
+static int bpf_device_parse_number(const char **cursor, int *number)
+{
+       const char *cur = *cursor;
+       char digits[50] = {0};
+       unsigned int parsed;
+       size_t len;
+
+       if (*cur == '*') {
+               *number = ~0;
+               *cursor = cur + 1;
+               return 0;
+       }
+
+       /* <ctype.h> functions need a value representable as unsigned char. */
+       if (!isdigit((unsigned char)*cur))
+               return -1;
+
+       for (len = 0; len < sizeof(digits) - 1 && isdigit((unsigned char)*cur); len++)
+               digits[len] = *cur++;
+
+       /*
+        * Convert into an unsigned local rather than straight into the signed
+        * destination (assumes lxc_safe_uint() stores an unsigned int, as its
+        * name suggests - confirm against its prototype).
+        */
+       if (lxc_safe_uint(digits, &parsed))
+               return -1;
+
+       /*
+        * bpf_program_append_device() treats negative numbers as "any", so a
+        * value that does not fit into a non-negative int must be rejected
+        * instead of silently turning into a wildcard.
+        */
+       if (parsed > (~0U >> 1))
+               return -1;
+
+       *number = (int)parsed;
+       *cursor = cur;
+       return 0;
+}
+#endif
+
+/*
+ * Translate one "devices.allow" / "devices.deny" style entry into the bpf
+ * device program stored in conf->cgroup2_devices, creating and initializing
+ * that program on first use.
+ *
+ * @key: cgroup key; "devices.allow" yields an allow rule, anything else a
+ *       deny rule.
+ * @val: either "a" (only selects the program's default verdict) or
+ *       "<a|b|c> <major|*>:<minor|*> <rwm subset>".
+ *
+ * Returns 0 on success and -1 on a malformed rule or when the program could
+ * not be created or extended. Without bpf device cgroup support this is a
+ * no-op that returns 0.
+ */
+static int bpf_device_cgroup_prepare(struct lxc_conf *conf, const char *key,
+                                    const char *val)
+{
+#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
+       struct device_item {
+               char type;
+               int major;
+               int minor;
+               char access[100];
+               int allow;
+       } device_item = {0};
+       int count, ret;
+       struct bpf_program *device;
+
+       device = conf->cgroup2_devices;
+       if (!device) {
+               device = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE);
+               if (!device) {
+                       ERROR("Failed to create new ebpf device program");
+                       return -1;
+               }
+
+               if (bpf_program_init(device)) {
+                       ERROR("Failed to initialize bpf program");
+                       /* Nobody else references the program yet; don't leak it. */
+                       bpf_program_free(device);
+                       return -1;
+               }
+
+               conf->cgroup2_devices = device;
+       }
+
+       if (strcmp("devices.allow", key) == 0)
+               device_item.allow = 1;
+
+       /* A bare "a" adds no rule, it only selects the default verdict. */
+       if (strcmp(val, "a") == 0) {
+               device->blacklist = (device_item.allow == 1);
+               return 0;
+       }
+
+       switch (*val) {
+       case 'a':
+       case 'b':
+       case 'c':
+               device_item.type = *val;
+               break;
+       default:
+               return -1;
+       }
+
+       val++;
+       if (!isspace((unsigned char)*val))
+               return -1;
+       val++;
+
+       /* read major */
+       if (bpf_device_parse_number(&val, &device_item.major))
+               return -1;
+       if (*val != ':')
+               return -1;
+       val++;
+
+       /* read minor */
+       if (bpf_device_parse_number(&val, &device_item.minor))
+               return -1;
+       if (!isspace((unsigned char)*val))
+               return -1;
+
+       /* read up to three access characters */
+       for (val++, count = 0; count < 3; count++, val++) {
+               switch (*val) {
+               case 'r':
+               case 'w':
+               case 'm':
+                       device_item.access[count] = *val;
+                       break;
+               case '\n':
+               case '\0':
+                       /* end of the rule: leave the loop */
+                       count = 3;
+                       break;
+               default:
+                       return -1;
+               }
+       }
+
+       ret = bpf_program_append_device(device, device_item.type, device_item.major,
+                                       device_item.minor, device_item.access,
+                                       device_item.allow);
+       if (ret) {
+               ERROR("Failed to add new rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d",
+                     device_item.type, device_item.major, device_item.minor,
+                     device_item.access, device_item.allow);
+               return -1;
+       } else {
+               TRACE("Added new rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d",
+                     device_item.type, device_item.major, device_item.minor,
+                     device_item.access, device_item.allow);
+       }
+#endif
+       return 0;
+}
+
 static bool __cg_unified_setup_limits(struct cgroup_ops *ops,
-                                     struct lxc_list *cgroup_settings)
+                                     struct lxc_list *cgroup_settings,
+                                     struct lxc_conf *conf)
 {
        struct lxc_list *iterator;
        struct hierarchy *h = ops->unified;
@@ -2486,17 +2631,24 @@ static bool __cg_unified_setup_limits(struct cgroup_ops *ops,
        if (!h)
                return false;
 
-       lxc_list_for_each(iterator, cgroup_settings) {
+       lxc_list_for_each (iterator, cgroup_settings) {
                __do_free char *fullpath = NULL;
                int ret;
                struct lxc_cgroup *cg = iterator->elem;
 
-               fullpath = must_make_path(h->container_full_path, cg->subsystem, NULL);
-               ret = lxc_write_to_file(fullpath, cg->value, strlen(cg->value), false, 0666);
-               if (ret < 0) {
-                       SYSERROR("Failed to set \"%s\" to \"%s\"",
-                                cg->subsystem, cg->value);
-                       return false;
+               if (strncmp("devices", cg->subsystem, 7) == 0) {
+                       ret = bpf_device_cgroup_prepare(conf, cg->subsystem,
+                                                       cg->value);
+               } else {
+                       fullpath = must_make_path(h->container_full_path,
+                                                 cg->subsystem, NULL);
+                       ret = lxc_write_to_file(fullpath, cg->value,
+                                               strlen(cg->value), false, 0666);
+                       if (ret < 0) {
+                               SYSERROR("Failed to set \"%s\" to \"%s\"",
+                                        cg->subsystem, cg->value);
+                               return false;
+                       }
                }
                TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
        }
@@ -2505,6 +2657,32 @@ static bool __cg_unified_setup_limits(struct cgroup_ops *ops,
        return true;
 }
 
+/*
+ * Finalize the bpf device program stored in the container's config and
+ * attach it to the container's cgroup in the unified hierarchy.
+ *
+ * Returns false when there is no unified hierarchy or when finalizing or
+ * attaching fails, and true otherwise - including when no device program
+ * exists or bpf device cgroup support was not compiled in.
+ */
+__cgfsng_ops bool cgfsng_devices_activate(struct cgroup_ops *ops,
+                                         struct lxc_handler *handler)
+{
+#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
+       struct bpf_program *prog = handler->conf->cgroup2_devices;
+       struct hierarchy *unified = ops->unified;
+       int err;
+
+       /* Without a unified hierarchy there is nothing to attach to. */
+       if (!unified)
+               return false;
+
+       /* No device program was prepared for this container. */
+       if (!prog)
+               return true;
+
+       if (bpf_program_finalize(prog))
+               return false;
+
+       err = bpf_program_cgroup_attach(prog, BPF_CGROUP_DEVICE,
+                                       unified->container_full_path,
+                                       BPF_F_ALLOW_MULTI);
+       return err == 0;
+#else
+       return true;
+#endif
+}
+
 __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
                                             struct lxc_conf *conf,
                                             bool do_devices)
@@ -2512,7 +2690,11 @@ __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
        if (!__cg_legacy_setup_limits(ops, &conf->cgroup, do_devices))
                return false;
 
-       return __cg_unified_setup_limits(ops, &conf->cgroup2);
+       /* for v2 we will have already set up devices */
+       if (do_devices)
+               return true;
+
+       return __cg_unified_setup_limits(ops, &conf->cgroup2, conf);
 }
 
 static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
@@ -2893,6 +3075,7 @@ struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
        cgfsng_ops->chown = cgfsng_chown;
        cgfsng_ops->mount = cgfsng_mount;
        cgfsng_ops->nrtasks = cgfsng_nrtasks;
+       cgfsng_ops->devices_activate = cgfsng_devices_activate;
 
        return move_ptr(cgfsng_ops);
 }
index 6ab5187c255916522a77b5a4760e7d0414379200..bb6c91cce8387d44c7c31117285b774b9444906f 100644 (file)
@@ -164,6 +164,8 @@ struct cgroup_ops {
        bool (*mount)(struct cgroup_ops *ops, struct lxc_handler *handler,
                      const char *root, int type);
        int (*nrtasks)(struct cgroup_ops *ops);
+       bool (*devices_activate)(struct cgroup_ops *ops,
+                                struct lxc_handler *handler);
 };
 
 extern struct cgroup_ops *cgroup_init(struct lxc_conf *conf);
diff --git a/src/lxc/cgroups/cgroup2_devices.c b/src/lxc/cgroups/cgroup2_devices.c
new file mode 100644 (file)
index 0000000..6616a18
--- /dev/null
@@ -0,0 +1,414 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+/* Parts of this taken from systemd's implementation. */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#endif
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/filter.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "cgroup2_devices.h"
+#include "config.h"
+#include "log.h"
+#include "macro.h"
+#include "memory_utils.h"
+
+#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
+#include <linux/bpf.h>
+
+lxc_log_define(cgroup2_devices, cgroup);
+
+/*
+ * Append @count instructions to @prog.
+ *
+ * Fails with EBUSY once the program has been loaded into the kernel
+ * (kernel_fd >= 0) and with ENOMEM when the instruction array cannot grow;
+ * in the latter case the existing instructions are left untouched.
+ * Returns 0 on success and -1 (errno set) on error.
+ */
+static int bpf_program_add_instructions(struct bpf_program *prog,
+                                       const struct bpf_insn *instructions,
+                                       size_t count)
+{
+       size_t total = count + prog->n_instructions;
+       struct bpf_insn *grown;
+
+       if (prog->kernel_fd >= 0)
+               return error_log_errno(EBUSY, "Refusing to update bpf cgroup program that's already loaded");
+
+       grown = realloc(prog->instructions, sizeof(*grown) * total);
+       if (!grown)
+               return error_log_errno(ENOMEM, "Failed to reallocate bpf cgroup program");
+
+       memcpy(&grown[prog->n_instructions], instructions, sizeof(*grown) * count);
+       prog->instructions = grown;
+       prog->n_instructions = total;
+
+       return 0;
+}
+
+/*
+ * Detach @prog from its cgroup (best effort), close its kernel fd and
+ * release all memory owned by it, including @prog itself.
+ *
+ * Like free(), passing NULL is a no-op.
+ */
+void bpf_program_free(struct bpf_program *prog)
+{
+       if (!prog)
+               return;
+
+       /* A failed detach must not prevent the memory from being released. */
+       (void)bpf_program_cgroup_detach(prog);
+
+       if (prog->kernel_fd >= 0)
+               close(prog->kernel_fd);
+       free(prog->instructions);
+       free(prog->attached_path);
+       free(prog);
+}
+
+/* Memory load, dst_reg = *(uint *) (src_reg + off16) */
+#define BPF_LDX_MEM(SIZE, DST, SRC, OFF)                               \
+       ((struct bpf_insn){.code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \
+                          .dst_reg = DST,                             \
+                          .src_reg = SRC,                             \
+                          .off = OFF,                                 \
+                          .imm = 0})
+
+/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */
+#define BPF_ALU32_IMM(OP, DST, IMM)                              \
+       ((struct bpf_insn){.code = BPF_ALU | BPF_OP(OP) | BPF_K, \
+                          .dst_reg = DST,                       \
+                          .src_reg = 0,                         \
+                          .off = 0,                             \
+                          .imm = IMM})
+
+/* Short form of mov (64 bit), dst_reg = imm32 */
+#define BPF_MOV64_IMM(DST, IMM)                                 \
+       ((struct bpf_insn){.code = BPF_ALU64 | BPF_MOV | BPF_K, \
+                          .dst_reg = DST,                      \
+                          .src_reg = 0,                        \
+                          .off = 0,                            \
+                          .imm = IMM})
+
+/* Short form of mov (32 bit), dst_reg = src_reg */
+#define BPF_MOV32_REG(DST, SRC)                               \
+       ((struct bpf_insn){.code = BPF_ALU | BPF_MOV | BPF_X, \
+                          .dst_reg = DST,                    \
+                          .src_reg = SRC,                    \
+                          .off = 0,                          \
+                          .imm = 0})
+
+/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */
+#define BPF_JMP_REG(OP, DST, SRC, OFF)                           \
+       ((struct bpf_insn){.code = BPF_JMP | BPF_OP(OP) | BPF_X, \
+                          .dst_reg = DST,                       \
+                          .src_reg = SRC,                       \
+                          .off = OFF,                           \
+                          .imm = 0})
+
+/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */
+#define BPF_JMP_IMM(OP, DST, IMM, OFF)                           \
+       ((struct bpf_insn){.code = BPF_JMP | BPF_OP(OP) | BPF_K, \
+                          .dst_reg = DST,                       \
+                          .src_reg = 0,                         \
+                          .off = OFF,                           \
+                          .imm = IMM})
+
+/* Program exit */
+#define BPF_EXIT_INSN()                                \
+       ((struct bpf_insn){.code = BPF_JMP | BPF_EXIT, \
+                          .dst_reg = 0,               \
+                          .src_reg = 0,               \
+                          .off = 0,                   \
+                          .imm = 0})
+
+/*
+ * Convert an access string made of 'r', 'w' and 'm' into the matching
+ * BPF_DEVCG_ACC_* bitmask.
+ *
+ * A NULL or empty string yields 0; any other character yields -EINVAL.
+ */
+static int bpf_access_mask(const char *acc)
+{
+       const char *cur;
+       int bits = 0;
+
+       if (!acc)
+               return 0;
+
+       for (cur = acc; *cur != '\0'; cur++) {
+               if (*cur == 'r')
+                       bits |= BPF_DEVCG_ACC_READ;
+               else if (*cur == 'w')
+                       bits |= BPF_DEVCG_ACC_WRITE;
+               else if (*cur == 'm')
+                       bits |= BPF_DEVCG_ACC_MKNOD;
+               else
+                       return -EINVAL;
+       }
+
+       return bits;
+}
+
+/*
+ * Map a rule's type character to a BPF_DEVCG_DEV_* value: 'b' is a block
+ * device, 'c' a character device and 'a' yields 0. Any other character
+ * yields -1.
+ */
+static int bpf_device_type(char type)
+{
+       if (type == 'a')
+               return 0;
+
+       if (type == 'b')
+               return BPF_DEVCG_DEV_BLOCK;
+
+       if (type == 'c')
+               return BPF_DEVCG_DEV_CHAR;
+
+       return -1;
+}
+
+/* Return true when @access_mask is exactly read + write + mknod. */
+static inline bool bpf_device_all_access(int access_mask)
+{
+       const int everything = BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE |
+                              BPF_DEVCG_ACC_MKNOD;
+
+       return access_mask == everything;
+}
+
+/*
+ * Allocate a zeroed bpf program of type @prog_type that is not loaded into
+ * the kernel yet (kernel_fd is -EBADF).
+ *
+ * Returns NULL when the allocation fails.
+ */
+struct bpf_program *bpf_program_new(uint32_t prog_type)
+{
+       struct bpf_program *fresh;
+
+       fresh = calloc(1, sizeof(*fresh));
+       if (!fresh)
+               return NULL;
+
+       fresh->kernel_fd = -EBADF;
+       fresh->prog_type = prog_type;
+
+       return fresh;
+}
+
+/*
+ * Emit the program prologue, which unpacks struct bpf_cgroup_dev_ctx
+ * (pointed to by r1) into the registers the device rules compare against:
+ * r2 = device type, r3 = access type, r4 = major, r5 = minor.
+ *
+ * Returns 0 on success and -1 when the instructions could not be added.
+ */
+int bpf_program_init(struct bpf_program *prog)
+{
+       const struct bpf_insn prologue[] = {
+           /* r2 = access_type & 0xFFFF: the device type */
+           BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct bpf_cgroup_dev_ctx, access_type)),
+           BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+
+           /* r3 = access_type >> 16: the access type */
+           BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct bpf_cgroup_dev_ctx, access_type)),
+           BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+
+           /* r4 = major number */
+           BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_cgroup_dev_ctx, major)),
+
+           /* r5 = minor number */
+           BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, offsetof(struct bpf_cgroup_dev_ctx, minor)),
+       };
+       int ret;
+
+       ret = bpf_program_add_instructions(prog, prologue, ARRAY_SIZE(prologue));
+       return ret;
+}
+
+/*
+ * Append one device rule to @prog.
+ *
+ * The rule is a chain of conditional jumps followed by the two-instruction
+ * verdict "r0 = @allow; exit". Each check that fails jumps to the first
+ * instruction after the verdict, i.e. on to the next rule.
+ *
+ * @type:   'a', 'b' or 'c'; 'a' emits no device type check.
+ * @major:  major number, negative for "any".
+ * @minor:  minor number, negative for "any".
+ * @access: subset of "rwm"; all three together emit no access check.
+ * @allow:  value returned by the program when the rule matches.
+ *
+ * Returns 0 on success and -1 (errno set) on invalid input or when the
+ * program cannot be extended.
+ */
+int bpf_program_append_device(struct bpf_program *prog, char type, int major,
+                             int minor, const char *access, int allow)
+{
+       int ret;
+       /*
+        * Jump distance from the check emitted next to the end of the rule.
+        * It starts at 1 and is increased below for every instruction that
+        * will be emitted in front of the verdict.
+        */
+       int jump_nr = 1;
+       struct bpf_insn bpf_access_decision[] = {
+           BPF_MOV64_IMM(BPF_REG_0, allow),
+           BPF_EXIT_INSN(),
+       };
+       int access_mask;
+       int device_type;
+       bool restrict_access;
+
+       device_type = bpf_device_type(type);
+       if (device_type < 0)
+               return error_log_errno(EINVAL, "Invalid bpf cgroup device type %c", type);
+
+       if (device_type > 0)
+               jump_nr++;
+
+       /* A negative mask is an error code, not a set of access bits. */
+       access_mask = bpf_access_mask(access);
+       if (access_mask < 0)
+               return error_log_errno(EINVAL, "Invalid bpf cgroup device access %s", access);
+
+       restrict_access = !bpf_device_all_access(access_mask);
+       if (restrict_access)
+               jump_nr += 3;
+
+       if (major >= 0)
+               jump_nr++;
+
+       if (minor >= 0)
+               jump_nr++;
+
+       if (device_type > 0) {
+               struct bpf_insn ins[] = {
+                   BPF_JMP_IMM(BPF_JNE, BPF_REG_2, device_type, jump_nr--),
+               };
+
+               ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins));
+               if (ret)
+                       return error_log_errno(errno, "Failed to add instructions to bpf cgroup program");
+       }
+
+       if (restrict_access) {
+               /*
+                * jump_nr is the distance as seen from the first of these
+                * three instructions, but the jump is the third one and is
+                * therefore two instructions closer to the end of the rule.
+                */
+               struct bpf_insn ins[] = {
+                   BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
+                   BPF_ALU32_IMM(BPF_AND, BPF_REG_1, access_mask),
+                   BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, jump_nr - 2),
+               };
+
+               jump_nr -= 3;
+               ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins));
+               if (ret)
+                       return error_log_errno(errno, "Failed to add instructions to bpf cgroup program");
+       }
+
+       if (major >= 0) {
+               struct bpf_insn ins[] = {
+                   BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, jump_nr--),
+               };
+
+               ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins));
+               if (ret)
+                       return error_log_errno(errno, "Failed to add instructions to bpf cgroup program");
+       }
+
+       if (minor >= 0) {
+               struct bpf_insn ins[] = {
+                   BPF_JMP_IMM(BPF_JNE, BPF_REG_5, minor, jump_nr--),
+               };
+
+               ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins));
+               if (ret)
+                       return error_log_errno(errno, "Failed to add instructions to bpf cgroup program");
+       }
+
+       ret = bpf_program_add_instructions(prog, bpf_access_decision,
+                                           ARRAY_SIZE(bpf_access_decision));
+       if (ret)
+               return error_log_errno(errno, "Failed to add instructions to bpf cgroup program");
+
+       return 0;
+}
+
+/*
+ * Terminate @prog with the verdict that applies when no device rule
+ * matched: 1 for a blacklist program, 0 for a whitelist program.
+ *
+ * Returns 0 on success and -1 when the instructions could not be added.
+ */
+int bpf_program_finalize(struct bpf_program *prog)
+{
+       const int fallback = prog->blacklist ? 1 : 0;
+       struct bpf_insn epilogue[] = {
+           BPF_MOV64_IMM(BPF_REG_0, fallback),
+           BPF_EXIT_INSN(),
+       };
+
+       TRACE("Implementing %s bpf device cgroup program",
+             prog->blacklist ? "blacklist" : "whitelist");
+
+       return bpf_program_add_instructions(prog, epilogue, ARRAY_SIZE(epilogue));
+}
+
+/*
+ * Load @prog into the kernel via BPF_PROG_LOAD unless that already happened.
+ *
+ * @log_buf / @log_size: optional buffer handed to the kernel as log buffer;
+ * pass NULL / 0 to load without one. If the program is already loaded the
+ * buffer, if any, is zeroed.
+ *
+ * Returns 0 on success and -1 (errno set) when loading fails.
+ */
+static int bpf_program_load_kernel(struct bpf_program *prog, char *log_buf,
+                                  size_t log_size)
+{
+       union bpf_attr attr;
+
+       if (prog->kernel_fd >= 0) {
+               /* memset() must not be handed a NULL pointer, even for 0 bytes. */
+               if (log_buf)
+                       memset(log_buf, 0, log_size);
+               return 0;
+       }
+
+       attr = (union bpf_attr){
+           .prog_type  = prog->prog_type,
+           .insns      = PTR_TO_UINT64(prog->instructions),
+           .insn_cnt   = prog->n_instructions,
+           .license    = PTR_TO_UINT64("GPL"),
+           .log_buf    = PTR_TO_UINT64(log_buf),
+           .log_level  = !!log_buf,
+           .log_size   = log_size,
+       };
+
+       prog->kernel_fd = bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
+       if (prog->kernel_fd < 0)
+               return error_log_errno(errno, "Failed to load bpf program");
+
+       return 0;
+}
+
+/*
+ * Load @prog into the kernel if necessary and attach it to the cgroup
+ * directory @path.
+ *
+ * @type:  bpf attach type.
+ * @flags: 0, BPF_F_ALLOW_OVERRIDE and/or BPF_F_ALLOW_MULTI.
+ *
+ * A program that is already attached can only be attached again with the
+ * same type and flags; unless the flags are exactly BPF_F_ALLOW_OVERRIDE
+ * that is a successful no-op.
+ *
+ * On success the attach path, type and flags are recorded in @prog so that
+ * bpf_program_cgroup_detach() can undo the attachment.
+ *
+ * Returns 0 on success and -1 (errno set) on error.
+ */
+int bpf_program_cgroup_attach(struct bpf_program *prog, int type,
+                             const char *path, uint32_t flags)
+{
+       __do_free char *copy = NULL;
+       __do_close_prot_errno int fd = -EBADF;
+       union bpf_attr attr;
+       int ret;
+
+       /* Both flags are valid; only reject bits outside of their union. */
+       if (flags & ~(BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI))
+               return error_log_errno(EINVAL, "Invalid flags for bpf program");
+
+       if (prog->attached_path) {
+               if (prog->attached_type != type)
+                       return error_log_errno(EBUSY, "Wrong type for bpf program");
+
+               if (prog->attached_flags != flags)
+                       return error_log_errno(EBUSY, "Wrong flags for bpf program");
+
+               /* Already attached: report success, i.e. 0, to the caller. */
+               if (flags != BPF_F_ALLOW_OVERRIDE)
+                       return 0;
+       }
+
+       /* bpf_program_load_kernel() logs and sets errno on failure. */
+       ret = bpf_program_load_kernel(prog, NULL, 0);
+       if (ret < 0)
+               return -1;
+
+       copy = strdup(path);
+       if (!copy)
+               return error_log_errno(ENOMEM, "Failed to duplicate cgroup path %s", path);
+
+       fd = open(path, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
+       if (fd < 0)
+               return error_log_errno(errno, "Failed to open cgroup path %s", path);
+
+       attr = (union bpf_attr){
+           .attach_type        = type,
+           .target_fd          = fd,
+           .attach_bpf_fd      = prog->kernel_fd,
+           .attach_flags       = flags,
+       };
+
+       ret = bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
+       if (ret < 0)
+               return error_log_errno(errno, "Failed to attach bpf program");
+
+       /* Ownership of the duplicated path moves into @prog. */
+       free_and_replace(prog->attached_path, copy);
+       prog->attached_type = type;
+       prog->attached_flags = flags;
+
+       TRACE("Loaded and attached bpf program to cgroup %s", prog->attached_path);
+       return 0;
+}
+
+/*
+ * Detach @prog from the cgroup recorded in prog->attached_path and forget
+ * that path.
+ *
+ * A NULL program or one that is not attached is a successful no-op. A
+ * cgroup directory that no longer exists (ENOENT) is treated as detached.
+ *
+ * Returns 0 on success and -1 (errno set) when the cgroup cannot be opened
+ * or the kernel refuses to detach; the recorded path is kept in that case.
+ */
+int bpf_program_cgroup_detach(struct bpf_program *prog)
+{
+       __do_close_prot_errno int fd = -EBADF;
+       union bpf_attr attr;
+       int ret;
+
+       if (!prog || !prog->attached_path)
+               return 0;
+
+       fd = open(prog->attached_path, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
+       if (fd >= 0) {
+               attr = (union bpf_attr){
+                   .attach_type        = prog->attached_type,
+                   .target_fd          = fd,
+                   .attach_bpf_fd      = prog->kernel_fd,
+               };
+
+               ret = bpf(BPF_PROG_DETACH, &attr, sizeof(attr));
+               if (ret < 0)
+                       return error_log_errno(errno, "Failed to detach bpf program from cgroup %s",
+                                              prog->attached_path);
+       } else if (errno != ENOENT) {
+               return error_log_errno(errno, "Failed to open attach cgroup %s",
+                                      prog->attached_path);
+       }
+
+       free(prog->attached_path);
+       prog->attached_path = NULL;
+
+       return 0;
+}
+
+/*
+ * Release the bpf device program attached to @conf, if there is one, and
+ * reset conf->cgroup2_devices so the config can be reused or cleared again.
+ */
+void lxc_clear_cgroup2_devices(struct lxc_conf *conf)
+{
+       if (!conf->cgroup2_devices)
+               return;
+
+       /* bpf_program_free() detaches the program before releasing it. */
+       bpf_program_free(conf->cgroup2_devices);
+
+       /* Don't leave a dangling pointer behind for later users of @conf. */
+       conf->cgroup2_devices = NULL;
+}
+#endif
diff --git a/src/lxc/cgroups/cgroup2_devices.h b/src/lxc/cgroups/cgroup2_devices.h
new file mode 100644 (file)
index 0000000..afcc6b9
--- /dev/null
@@ -0,0 +1,144 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+/* Parts of this taken from systemd's implementation. */
+
+#ifndef __LXC_CGROUP2_DEVICES_H
+#define __LXC_CGROUP2_DEVICES_H
+
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "conf.h"
+#include "config.h"
+
+#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#endif
+
+#if !HAVE_BPF
+#if !(defined __NR_bpf && __NR_bpf > 0)
+#if defined __NR_bpf
+#undef __NR_bpf
+#endif
+#if defined __i386__
+#define __NR_bpf 357
+#elif defined __x86_64__
+#define __NR_bpf 321
+#elif defined __aarch64__
+#define __NR_bpf 280
+#elif defined __arm__
+#define __NR_bpf 386
+#elif defined __sparc__
+#define __NR_bpf 349
+#elif defined __s390__
+#define __NR_bpf 351
+#elif defined __tilegx__
+#define __NR_bpf 280
+#else
+#warning "__NR_bpf not defined for your architecture"
+#endif
+#endif
+
+union bpf_attr;
+
+/*
+ * Fallback for the bpf() system call: issue it directly through syscall()
+ * when the syscall number is known, otherwise fail with ENOSYS.
+ */
+static inline int missing_bpf(int cmd, union bpf_attr *attr, size_t size)
+{
+#ifndef __NR_bpf
+       errno = ENOSYS;
+       return -1;
+#else
+       long rc = syscall(__NR_bpf, cmd, attr, size);
+
+       return (int)rc;
+#endif
+}
+
+#define bpf missing_bpf
+#endif
+
+/* State of one bpf program built and managed by the bpf_program_*() helpers. */
+struct bpf_program {
+       /* Selects the verdict bpf_program_finalize() emits for unmatched devices. */
+       bool blacklist;
+       /* fd returned by BPF_PROG_LOAD; bpf_program_new() starts it at -EBADF. */
+       int kernel_fd;
+       /* Program type handed to BPF_PROG_LOAD. */
+       uint32_t prog_type;
+
+       /* Number of entries in the instructions array. */
+       size_t n_instructions;
+#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
+       struct bpf_insn *instructions;
+#endif
+
+       /* Heap copy of the cgroup path the program is attached to, or NULL. */
+       char *attached_path;
+       /* Attach type and flags recorded by bpf_program_cgroup_attach(). */
+       int attached_type;
+       uint32_t attached_flags;
+};
+
+#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
+struct bpf_program *bpf_program_new(uint32_t prog_type);
+int bpf_program_init(struct bpf_program *prog);
+int bpf_program_append_device(struct bpf_program *prog, char type, int major,
+                             int minor, const char *access, int allow);
+int bpf_program_finalize(struct bpf_program *prog);
+int bpf_program_cgroup_attach(struct bpf_program *prog, int type,
+                             const char *path, uint32_t flags);
+int bpf_program_cgroup_detach(struct bpf_program *prog);
+void bpf_program_free(struct bpf_program *prog);
+void lxc_clear_cgroup2_devices(struct lxc_conf *conf);
+/*
+ * Free the program *prog points to, if any, and reset *prog to NULL.
+ */
+static inline void __do_bpf_program_free(struct bpf_program **prog)
+{
+       struct bpf_program *owned = *prog;
+
+       if (!owned)
+               return;
+
+       bpf_program_free(owned);
+       *prog = NULL;
+}
+#else
+/*
+ * Fallbacks used when struct bpf_cgroup_dev_ctx is not available: no program
+ * can be created, the int-returning operations fail with -ENOSYS and the
+ * cleanup helpers do nothing.
+ */
+static inline struct bpf_program *bpf_program_new(uint32_t prog_type)
+{
+       return NULL;
+}
+
+static inline int bpf_program_init(struct bpf_program *prog)
+{
+       return -ENOSYS;
+}
+
+static inline int bpf_program_append_device(struct bpf_program *prog, char type,
+                                           int major, int minor,
+                                           const char *access, int allow)
+{
+       return -ENOSYS;
+}
+
+static inline int bpf_program_finalize(struct bpf_program *prog)
+{
+       return -ENOSYS;
+}
+
+static inline int bpf_program_cgroup_attach(struct bpf_program *prog, int type,
+                                           const char *path, uint32_t flags)
+{
+       return -ENOSYS;
+}
+
+static inline int bpf_program_cgroup_detach(struct bpf_program *prog)
+{
+       return -ENOSYS;
+}
+
+static inline void bpf_program_free(struct bpf_program *prog)
+{
+}
+
+static inline void lxc_clear_cgroup2_devices(struct lxc_conf *conf)
+{
+}
+
+static inline void __do_bpf_program_free(struct bpf_program **prog)
+{
+}
+#endif
+
+#endif /* __LXC_CGROUP2_DEVICES_H */
index 06e4adcc387ec0117e2cfc56d00702470d31d7b0..c03b663835f0a3154ba4f2bdb93a5cfbddb549dd 100644 (file)
@@ -57,6 +57,7 @@
 #include "af_unix.h"
 #include "caps.h"
 #include "cgroup.h"
+#include "cgroup2_devices.h"
 #include "conf.h"
 #include "config.h"
 #include "confile.h"
@@ -4118,6 +4119,7 @@ void lxc_conf_free(struct lxc_conf *conf)
        lxc_clear_config_keepcaps(conf);
        lxc_clear_cgroups(conf, "lxc.cgroup", CGROUP_SUPER_MAGIC);
        lxc_clear_cgroups(conf, "lxc.cgroup2", CGROUP2_SUPER_MAGIC);
+       lxc_clear_cgroup2_devices(conf);
        lxc_clear_hooks(conf, "lxc.hook");
        lxc_clear_mount_entries(conf);
        lxc_clear_idmaps(conf);
index 9f4a93d0b2156127bebffd266b22b7ef41762411..741ac4f09617552181a818cc0ab60002d5ad5a63 100644 (file)
@@ -241,6 +241,7 @@ struct lxc_conf {
        struct {
                struct lxc_list cgroup;
                struct lxc_list cgroup2;
+               struct bpf_program *cgroup2_devices;
        };
 
        struct {
index 8b093de90b1d0cc656c5e3be77f6725252a7fa25..3c5be95c3797fdbd2bee7a885e1e6ef4341da0cb 100644 (file)
@@ -505,11 +505,11 @@ ATTR_UNUSED static inline void LXC_##LEVEL(struct lxc_log_locinfo* locinfo,       \
        } while (0)
 #endif
 
-#define error_log_errno(__errno__, format, ...)        \
-       ({                                              \
-               errno = __errno__;                      \
-               SYSERROR(format, ##__VA_ARGS__);        \
-               -1;                                     \
+#define error_log_errno(__errno__, format, ...)  \
+       ({                                       \
+               errno = __errno__;               \
+               SYSERROR(format, ##__VA_ARGS__); \
+               -1;                              \
        })
 
 extern int lxc_log_fd;
index f96a90019eb0164f065dfc79c65b1170623a42cb..6f3379b3c4cfda7fc3f639b7f960c7880d63e319 100644 (file)
@@ -429,6 +429,8 @@ enum {
 #define PTR_TO_INTMAX(p) ((intmax_t)((intptr_t)(p)))
 #define INTMAX_TO_PTR(u) ((void *)((intptr_t)(u)))
 
+/*
+ * Convert a pointer to a 64-bit unsigned integer. The intermediate cast uses
+ * uintptr_t so that the value is zero-extended: going through the signed
+ * intptr_t would sign-extend addresses with the top bit set on 32-bit
+ * targets and yield 0xffffffffXXXXXXXX.
+ */
+#define PTR_TO_UINT64(p) ((uint64_t)((uintptr_t)(p)))
+
 #define LXC_INVALID_UID ((uid_t)-1)
 #define LXC_INVALID_GID ((gid_t)-1)
 
@@ -465,4 +467,80 @@ enum {
 #define LXC_TIMESTAMP_FNAME   "ts"
 #define LXC_COMMENT_FNAME     "comment"
 
+/*
+ * Taken from systemd.
+ *
+ * free() what @a points to, move the pointer held in @b into @a and reset @b
+ * to NULL. The whole expression evaluates to 0. Both arguments are expanded
+ * more than once, so pass plain lvalues without side effects.
+ */
+#define free_and_replace(a, b) \
+       ({                     \
+               free(a);       \
+               (a) = (b);     \
+               (b) = NULL;    \
+               0;             \
+       })
+
+/*
+ * Two-level token pasting: CONCATENATE() lets its arguments be macro-expanded
+ * before XCONCATENATE() glues them together with ##.
+ */
+#define XCONCATENATE(x, y) x##y
+#define CONCATENATE(x, y) XCONCATENATE(x, y)
+/* Build the identifier __unique_prefix_<x><uniq>. */
+#define UNIQ_T(x, uniq) CONCATENATE(__unique_prefix_, CONCATENATE(x, uniq))
+/* __COUNTER__ is a compiler extension yielding a new integer per expansion. */
+#define UNIQ __COUNTER__
+/* Drop any MIN() defined earlier and replace it with the version below. */
+#undef MIN
+#define MIN(a, b) __MIN(UNIQ, (a), UNIQ, (b))
+/*
+ * Copy @a and @b into const locals whose names are derived from @aq and @bq
+ * and yield the smaller copy, so each argument is evaluated only once.
+ */
+#define __MIN(aq, a, bq, b)                                                    \
+       ({                                                                     \
+               const typeof(a) UNIQ_T(A, aq) = (a);                           \
+               const typeof(b) UNIQ_T(B, bq) = (b);                           \
+               UNIQ_T(A, aq) < UNIQ_T(B, bq) ? UNIQ_T(A, aq) : UNIQ_T(B, bq); \
+       })
+
+/* Taken from the kernel. */
+
+/*
+ * min()/max()/clamp() macros must accomplish three things:
+ *
+ * - avoid multiple evaluations of the arguments (so side-effects like
+ *   "x++" happen only once) when non-constant.
+ * - perform strict type-checking (to generate warnings instead of
+ *   nasty runtime surprises). See the "unnecessary" pointer comparison
+ *   in __typecheck().
+ * - retain result as a constant expression when called with only
+ *   constant expressions (to avoid tripping VLA warnings in stack
+ *   allocation usage).
+ */
+#define __typecheck(x, y) (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1)))
+
+/*
+ * This returns a constant expression while determining if an argument is
+ * a constant expression, most importantly without evaluating the argument.
+ * Glory to Martin Uecker <Martin.Uecker@med.uni-goettingen.de>
+ */
+#define __is_constexpr(x) \
+       (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x)*0l)) : (int *)8)))
+
+/* True only if both arguments are constant expressions per __is_constexpr(). */
+#define __no_side_effects(x, y) (__is_constexpr(x) && __is_constexpr(y))
+
+/* ANDs __typecheck() with __no_side_effects() for the same argument pair. */
+#define __safe_cmp(x, y) (__typecheck(x, y) && __no_side_effects(x, y))
+
+/* Bare comparison: the operand that gets selected is expanded a second time. */
+#define __cmp(x, y, op) ((x)op(y) ? (x) : (y))
+
+/*
+ * Copy @x and @y into locals named by the caller-supplied identifiers
+ * @unique_x and @unique_y, then compare the copies with __cmp() so that the
+ * original arguments are evaluated only once.
+ */
+#define __cmp_once(x, y, unique_x, unique_y, op) \
+       ({                                       \
+               typeof(x) unique_x = (x);        \
+               typeof(y) unique_y = (y);        \
+               __cmp(unique_x, unique_y, op);   \
+       })
+
+/*
+ * Use the plain __cmp() when __safe_cmp() holds (both arguments are constant
+ * expressions) and the single-evaluation __cmp_once() otherwise. The
+ * temporaries are named with UNIQ_T()/UNIQ from this header: the kernel's
+ * __UNIQUE_ID() helper is kernel-internal and not available to userspace, so
+ * relying on it would break every expansion of min().
+ */
+#define __careful_cmp(x, y, op)                                    \
+       __builtin_choose_expr(__safe_cmp(x, y), __cmp(x, y, op),   \
+                             __cmp_once(x, y, UNIQ_T(__x, UNIQ),  \
+                                        UNIQ_T(__y, UNIQ), op))
+
+/**
+ * min - return minimum of two values of the same or compatible types
+ * @x: first value
+ * @y: second value
+ */
+#define min(x, y) __careful_cmp(x, y, <)
+
+/*
+ * Number of elements in the array @x. When the type of @x is compatible with
+ * the type of &*(x) - i.e. @x is a pointer rather than an array - the
+ * expression becomes (void)0 instead, so it cannot be used as a value.
+ */
+#define ARRAY_SIZE(x)                                                        \
+       (__builtin_choose_expr(!__builtin_types_compatible_p(typeof(x),      \
+                                                            typeof(&*(x))), \
+                              sizeof(x) / sizeof((x)[0]), ((void)0)))
+
 #endif /* __LXC_MACRO_H */
index 3cfc8b2f57a2a25e85142fcea90a9f9688afaf7a..ec1557fdecd87d732976f554adc2a3bc0096b1aa 100644 (file)
@@ -1912,6 +1912,12 @@ static int lxc_spawn(struct lxc_handler *handler)
        }
        TRACE("Set up legacy device cgroup controller limits");
 
+       if (!cgroup_ops->devices_activate(cgroup_ops, handler)) {
+               ERROR("Failed to setup cgroup2 device controller limits");
+               goto out_delete_net;
+       }
+       TRACE("Set up cgroup2 device controller limits");
+
        if (handler->ns_clone_flags & CLONE_NEWCGROUP) {
                /* Now we're ready to preserve the cgroup namespace */
                ret = lxc_try_preserve_ns(handler->pid, "cgroup");