From: Christian Brauner Date: Thu, 28 Nov 2019 15:22:36 +0000 (+0100) Subject: cgroups: add cgroup2 device controller support X-Git-Tag: lxc-4.0.0~91^2 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=bf6519892e130cb65bae4ad43f981a3dda2a12ec;p=thirdparty%2Flxc.git cgroups: add cgroup2 device controller support Add a bpf-based device controller implementation. Signed-off-by: Christian Brauner --- diff --git a/configure.ac b/configure.ac index 4c1a10b3d..19cb1e72e 100644 --- a/configure.ac +++ b/configure.ac @@ -368,6 +368,10 @@ AC_CHECK_TYPES([struct seccomp_notif_sizes], [], [], [[#include ]]) AC_CHECK_DECLS([seccomp_syscall_resolve_name_arch], [], [], [[#include ]]) CFLAGS="$OLD_CFLAGS" +AC_CHECK_HEADERS([linux/bpf.h], [ + AC_CHECK_TYPES([struct bpf_cgroup_dev_ctx], [], [], [[#include ]]) +], [], []) + # Configuration examples AC_ARG_ENABLE([examples], [AS_HELP_STRING([--enable-examples], [install examples [default=yes]])], diff --git a/src/lxc/Makefile.am b/src/lxc/Makefile.am index 4b18ac5d8..56c64f596 100644 --- a/src/lxc/Makefile.am +++ b/src/lxc/Makefile.am @@ -7,6 +7,7 @@ noinst_HEADERS = api_extensions.h \ caps.h \ cgroups/cgroup.h \ cgroups/cgroup_utils.h \ + cgroups/cgroup2_devices.h \ compiler.h \ conf.h \ confile.h \ @@ -95,6 +96,7 @@ liblxc_la_SOURCES = af_unix.c af_unix.h \ caps.c caps.h \ cgroups/cgfsng.c \ cgroups/cgroup.c cgroups/cgroup.h \ + cgroups/cgroup2_devices.c cgroups/cgroup2_devices.h \ cgroups/cgroup_utils.c cgroups/cgroup_utils.h \ compiler.h \ commands.c commands.h \ diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c index 1e6a45cff..20e28d66d 100644 --- a/src/lxc/cgroups/cgfsng.c +++ b/src/lxc/cgroups/cgfsng.c @@ -54,6 +54,7 @@ #include "caps.h" #include "cgroup.h" +#include "cgroup2_devices.h" #include "cgroup_utils.h" #include "commands.h" #include "conf.h" @@ -1105,6 +1106,12 @@ __cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops, wrap.hierarchies = ops->hierarchies; wrap.conf 
= handler->conf; +#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX + ret = bpf_program_cgroup_detach(handler->conf->cgroup2_devices); + if (ret < 0) + WARN("Failed to detach bpf program from cgroup"); +#endif + if (handler->conf && !lxc_list_empty(&handler->conf->id_map)) ret = userns_exec_1(handler->conf, cgroup_rmdir_wrapper, &wrap, "cgroup_rmdir_wrapper"); @@ -2474,8 +2481,146 @@ out: return ret; } +/* + * Some of the parsing logic comes from the original cgroup device v1 + * implementation in the kernel. + */ +static int bpf_device_cgroup_prepare(struct lxc_conf *conf, const char *key, + const char *val) +{ +#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX + struct device_item { + char type; + int major; + int minor; + char access[100]; + int allow; + } device_item = {0}; + int count, ret; + char temp[50]; + struct bpf_program *device; + + if (conf->cgroup2_devices) { + device = conf->cgroup2_devices; + } else { + device = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE); + if (device && bpf_program_init(device)) { + ERROR("Failed to initialize bpf program"); + return -1; + } + } + if (!device) { + ERROR("Failed to create new ebpf device program"); + return -1; + } + + conf->cgroup2_devices = device; + + if (strcmp("devices.allow", key) == 0) + device_item.allow = 1; + + if (strcmp(val, "a") == 0) { + device->blacklist = (device_item.allow == 1); + return 0; + } + + switch (*val) { + case 'a': + __fallthrough; + case 'b': + __fallthrough; + case 'c': + device_item.type = *val; + break; + default: + return -1; + } + + val++; + if (!isspace(*val)) + return -1; + val++; + if (*val == '*') { + device_item.major = ~0; + val++; + } else if (isdigit(*val)) { + memset(temp, 0, sizeof(temp)); + for (count = 0; count < sizeof(temp) - 1; count++) { + temp[count] = *val; + val++; + if (!isdigit(*val)) + break; + } + ret = lxc_safe_uint(temp, &device_item.major); + if (ret) + return -1; + } else { + return -1; + } + if (*val != ':') + return -1; + val++; + + /* read minor */ + if (*val == '*') { + 
device_item.minor = ~0; + val++; + } else if (isdigit(*val)) { + memset(temp, 0, sizeof(temp)); + for (count = 0; count < sizeof(temp) - 1; count++) { + temp[count] = *val; + val++; + if (!isdigit(*val)) + break; + } + ret = lxc_safe_uint(temp, &device_item.minor); + if (ret) + return -1; + } else { + return -1; + } + if (!isspace(*val)) + return -1; + for (val++, count = 0; count < 3; count++, val++) { + switch (*val) { + case 'r': + device_item.access[count] = *val; + break; + case 'w': + device_item.access[count] = *val; + break; + case 'm': + device_item.access[count] = *val; + break; + case '\n': + case '\0': + count = 3; + break; + default: + return -1; + } + } + + ret = bpf_program_append_device(device, device_item.type, device_item.major, + device_item.minor, device_item.access, + device_item.allow); + if (ret) { + ERROR("Failed to add new rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d", + device_item.type, device_item.major, device_item.minor, + device_item.access, device_item.allow); + return -1; + } else { + TRACE("Added new rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d", + device_item.type, device_item.major, device_item.minor, + device_item.access, device_item.allow); + } +#endif + return 0; +} + static bool __cg_unified_setup_limits(struct cgroup_ops *ops, - struct lxc_list *cgroup_settings) + struct lxc_list *cgroup_settings, + struct lxc_conf *conf) { struct lxc_list *iterator; struct hierarchy *h = ops->unified; @@ -2486,17 +2631,24 @@ static bool __cg_unified_setup_limits(struct cgroup_ops *ops, if (!h) return false; - lxc_list_for_each(iterator, cgroup_settings) { + lxc_list_for_each (iterator, cgroup_settings) { __do_free char *fullpath = NULL; int ret; struct lxc_cgroup *cg = iterator->elem; - fullpath = must_make_path(h->container_full_path, cg->subsystem, NULL); - ret = lxc_write_to_file(fullpath, cg->value, strlen(cg->value), false, 0666); - if (ret < 0) { - SYSERROR("Failed to set 
\"%s\" to \"%s\"", - cg->subsystem, cg->value); - return false; + if (strncmp("devices", cg->subsystem, 7) == 0) { + ret = bpf_device_cgroup_prepare(conf, cg->subsystem, + cg->value); + } else { + fullpath = must_make_path(h->container_full_path, + cg->subsystem, NULL); + ret = lxc_write_to_file(fullpath, cg->value, + strlen(cg->value), false, 0666); + } + if (ret < 0) { /* a rejected device rule fails setup the same way a failed write does */ + SYSERROR("Failed to set \"%s\" to \"%s\"", + cg->subsystem, cg->value); + return false; } TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value); } @@ -2505,6 +2657,32 @@ static bool __cg_unified_setup_limits(struct cgroup_ops *ops, return true; } +__cgfsng_ops bool cgfsng_devices_activate(struct cgroup_ops *ops, + struct lxc_handler *handler) +{ +#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX + int ret; + struct hierarchy *h = ops->unified; + struct bpf_program *device = handler->conf->cgroup2_devices; + + if (!device) /* no device program was prepared, so there is nothing to attach */ + return true; + + if (!h) + return false; + + ret = bpf_program_finalize(device); + if (ret) + return false; + + return bpf_program_cgroup_attach(device, BPF_CGROUP_DEVICE, + h->container_full_path, + BPF_F_ALLOW_MULTI) == 0; +#else + return true; +#endif +} + __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops, struct lxc_conf *conf, bool do_devices) @@ -2512,7 +2690,11 @@ __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops, if (!__cg_legacy_setup_limits(ops, &conf->cgroup, do_devices)) return false; - return __cg_unified_setup_limits(ops, &conf->cgroup2); + /* for v2 we will have already set up devices */ + if (do_devices) + return true; + + return __cg_unified_setup_limits(ops, &conf->cgroup2, conf); } static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops, @@ -2893,6 +3075,7 @@ struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf) cgfsng_ops->chown = cgfsng_chown; cgfsng_ops->mount = cgfsng_mount; cgfsng_ops->nrtasks = cgfsng_nrtasks; + cgfsng_ops->devices_activate = cgfsng_devices_activate; return move_ptr(cgfsng_ops); } diff --git 
a/src/lxc/cgroups/cgroup.h b/src/lxc/cgroups/cgroup.h index 6ab5187c2..bb6c91cce 100644 --- a/src/lxc/cgroups/cgroup.h +++ b/src/lxc/cgroups/cgroup.h @@ -164,6 +164,8 @@ struct cgroup_ops { bool (*mount)(struct cgroup_ops *ops, struct lxc_handler *handler, const char *root, int type); int (*nrtasks)(struct cgroup_ops *ops); + bool (*devices_activate)(struct cgroup_ops *ops, + struct lxc_handler *handler); }; extern struct cgroup_ops *cgroup_init(struct lxc_conf *conf); diff --git a/src/lxc/cgroups/cgroup2_devices.c b/src/lxc/cgroups/cgroup2_devices.c new file mode 100644 index 000000000..6616a180e --- /dev/null +++ b/src/lxc/cgroups/cgroup2_devices.c @@ -0,0 +1,414 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +/* Parts of this taken from systemd's implementation. */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE 1 +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cgroup2_devices.h" +#include "config.h" +#include "log.h" +#include "macro.h" +#include "memory_utils.h" + +#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX +#include + +lxc_log_define(cgroup2_devices, cgroup); + +static int bpf_program_add_instructions(struct bpf_program *prog, + const struct bpf_insn *instructions, + size_t count) +{ + + struct bpf_insn *new_insn; + + if (prog->kernel_fd >= 0) + return error_log_errno(EBUSY, "Refusing to update bpf cgroup program that's already loaded"); + + new_insn = realloc(prog->instructions, sizeof(struct bpf_insn) * (count + prog->n_instructions)); + if (!new_insn) + return error_log_errno(ENOMEM, "Failed to reallocate bpf cgroup program"); + + prog->instructions = new_insn; + memcpy(prog->instructions + prog->n_instructions, instructions, + sizeof(struct bpf_insn) * count); + prog->n_instructions += count; + + return 0; +} + +void bpf_program_free(struct bpf_program *prog) +{ + (void)bpf_program_cgroup_detach(prog); + + if (prog->kernel_fd >= 0) + close(prog->kernel_fd); + free(prog->instructions); + 
free(prog->attached_path); + free(prog); +} + +/* Memory load, dst_reg = *(uint *) (src_reg + off16) */ +#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn){.code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0}) + +/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */ +#define BPF_ALU32_IMM(OP, DST, IMM) \ + ((struct bpf_insn){.code = BPF_ALU | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM}) + +/* Short form of mov, dst_reg = imm32 */ +#define BPF_MOV64_IMM(DST, IMM) \ + ((struct bpf_insn){.code = BPF_ALU64 | BPF_MOV | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM}) + +#define BPF_MOV32_REG(DST, SRC) \ + ((struct bpf_insn){.code = BPF_ALU | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0}) + +/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */ +#define BPF_JMP_REG(OP, DST, SRC, OFF) \ + ((struct bpf_insn){.code = BPF_JMP | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0}) + +/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ +#define BPF_JMP_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn){.code = BPF_JMP | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM}) + +/* Program exit */ +#define BPF_EXIT_INSN() \ + ((struct bpf_insn){.code = BPF_JMP | BPF_EXIT, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0}) + +static int bpf_access_mask(const char *acc) +{ + int mask = 0; + + if (!acc) + return mask; + + for (; *acc; acc++) + switch (*acc) { + case 'r': + mask |= BPF_DEVCG_ACC_READ; + break; + case 'w': + mask |= BPF_DEVCG_ACC_WRITE; + break; + case 'm': + mask |= BPF_DEVCG_ACC_MKNOD; + break; + default: + return -EINVAL; + } + + return mask; +} + +static int bpf_device_type(char type) +{ + switch (type) { + case 'a': + return 0; 
+ case 'b': + return BPF_DEVCG_DEV_BLOCK; + case 'c': + return BPF_DEVCG_DEV_CHAR; + } + + return -1; +} + +static inline bool bpf_device_all_access(int access_mask) +{ + return (access_mask == (BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE | + BPF_DEVCG_ACC_MKNOD)); +} + +struct bpf_program *bpf_program_new(uint32_t prog_type) +{ + __do_free struct bpf_program *prog = NULL; + + prog = calloc(1, sizeof(struct bpf_program)); + if (!prog) + return NULL; + + prog->prog_type = prog_type; + prog->kernel_fd = -EBADF; + + return move_ptr(prog); +} + +int bpf_program_init(struct bpf_program *prog) +{ + const struct bpf_insn pre_insn[] = { + /* load device type to r2 */ + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct bpf_cgroup_dev_ctx, access_type)), + BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF), + + /* load access type to r3 */ + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct bpf_cgroup_dev_ctx, access_type)), + BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16), + + /* load major number to r4 */ + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_cgroup_dev_ctx, major)), + + /* load minor number to r5 */ + BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, offsetof(struct bpf_cgroup_dev_ctx, minor)), + }; + + return bpf_program_add_instructions(prog, pre_insn, ARRAY_SIZE(pre_insn)); +} + +int bpf_program_append_device(struct bpf_program *prog, char type, int major, + int minor, const char *access, int allow) +{ + int ret; + int jump_nr = 1; + struct bpf_insn bpf_access_decision[] = { + BPF_MOV64_IMM(BPF_REG_0, allow), + BPF_EXIT_INSN(), + }; + int access_mask; + int device_type; + + device_type = bpf_device_type(type); + if (device_type < 0) + return error_log_errno(EINVAL, "Invalid bpf cgroup device type %c", type); + + if (device_type > 0) + jump_nr++; + + access_mask = bpf_access_mask(access); + if (!bpf_device_all_access(access_mask)) + jump_nr += 3; + + if (major >= 0) + jump_nr++; + + if (minor >= 0) + jump_nr++; + + if (device_type > 0) { + struct bpf_insn ins[] 
= { + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, device_type, jump_nr--), + }; + + ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins)); + if (ret) + return error_log_errno(errno, "Failed to add instructions to bpf cgroup program"); + } + + if (!bpf_device_all_access(access_mask)) { + struct bpf_insn ins[] = { + BPF_MOV32_REG(BPF_REG_1, BPF_REG_3), + BPF_ALU32_IMM(BPF_AND, BPF_REG_1, access_mask), + BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, jump_nr - 2), /* jump_nr still counts the two access instructions emitted before this jump */ + }; + + jump_nr -= 3; + ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins)); + if (ret) + return error_log_errno(errno, "Failed to add instructions to bpf cgroup program"); + } + + if (major >= 0) { + struct bpf_insn ins[] = { + BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, jump_nr--), + }; + + ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins)); + if (ret) + return error_log_errno(errno, "Failed to add instructions to bpf cgroup program"); + } + + if (minor >= 0) { + struct bpf_insn ins[] = { + BPF_JMP_IMM(BPF_JNE, BPF_REG_5, minor, jump_nr--), + }; + + ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins)); + if (ret) + return error_log_errno(errno, "Failed to add instructions to bpf cgroup program"); + } + + ret = bpf_program_add_instructions(prog, bpf_access_decision, + ARRAY_SIZE(bpf_access_decision)); + if (ret) + return error_log_errno(errno, "Failed to add instructions to bpf cgroup program"); + + return 0; +} + +int bpf_program_finalize(struct bpf_program *prog) +{ + struct bpf_insn ins[] = { + BPF_MOV64_IMM(BPF_REG_0, prog->blacklist ? 1 : 0), + BPF_EXIT_INSN(), + }; + + TRACE("Implementing %s bpf device cgroup program", + prog->blacklist ? 
"blacklist" : "whitelist"); + return bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins)); +} + +static int bpf_program_load_kernel(struct bpf_program *prog, char *log_buf, + size_t log_size) +{ + union bpf_attr attr; + + if (prog->kernel_fd >= 0) { + memset(log_buf, 0, log_size); + return 0; + } + + attr = (union bpf_attr){ + .prog_type = prog->prog_type, + .insns = PTR_TO_UINT64(prog->instructions), + .insn_cnt = prog->n_instructions, + .license = PTR_TO_UINT64("GPL"), + .log_buf = PTR_TO_UINT64(log_buf), + .log_level = !!log_buf, + .log_size = log_size, + }; + + prog->kernel_fd = bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); + if (prog->kernel_fd < 0) + return error_log_errno(errno, "Failed to load bpf program"); + + return 0; +} + +int bpf_program_cgroup_attach(struct bpf_program *prog, int type, + const char *path, uint32_t flags) +{ + __do_free char *copy = NULL; + __do_close_prot_errno int fd = -EBADF; + union bpf_attr attr; + int ret; + + if (flags & ~(BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)) /* reject any flag bit other than these two */ + return error_log_errno(EINVAL, "Invalid flags for bpf program"); + + if (prog->attached_path) { + if (prog->attached_type != type) + return error_log_errno(EBUSY, "Wrong type for bpf program"); + + if (prog->attached_flags != flags) + return error_log_errno(EBUSY, "Wrong flags for bpf program"); + + if (flags != BPF_F_ALLOW_OVERRIDE) + return 0; /* already attached with the same type and flags; success is 0 in this int API */ + } + + ret = bpf_program_load_kernel(prog, NULL, 0); + if (ret < 0) + return error_log_errno(errno, "Failed to load bpf program"); /* pass errno, not the -1 return value */ + + copy = strdup(path); + if (!copy) + return error_log_errno(ENOMEM, "Failed to duplicate cgroup path %s", path); + + fd = open(path, O_DIRECTORY | O_RDONLY | O_CLOEXEC); + if (fd < 0) + return error_log_errno(errno, "Failed to open cgroup path %s", path); + + attr = (union bpf_attr){ + .attach_type = type, + .target_fd = fd, + .attach_bpf_fd = prog->kernel_fd, + .attach_flags = flags, + }; + + ret = bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)); + if (ret < 0) + return error_log_errno(errno, 
"Failed to attach bpf program"); + + free_and_replace(prog->attached_path, copy); + prog->attached_type = type; + prog->attached_flags = flags; + + TRACE("Loaded and attached bpf program to cgroup %s", prog->attached_path); + return 0; +} + +int bpf_program_cgroup_detach(struct bpf_program *prog) +{ + int ret; + __do_close_prot_errno int fd = -EBADF; + + if (!prog) + return 0; + + if (!prog->attached_path) + return 0; + + fd = open(prog->attached_path, O_DIRECTORY | O_RDONLY | O_CLOEXEC); + if (fd < 0) { + if (errno != ENOENT) + return error_log_errno(errno, "Failed to open attach cgroup %s", + prog->attached_path); + } else { + union bpf_attr attr; + + attr = (union bpf_attr){ + .attach_type = prog->attached_type, + .target_fd = fd, + .attach_bpf_fd = prog->kernel_fd, + }; + + ret = bpf(BPF_PROG_DETACH, &attr, sizeof(attr)); + if (ret < 0) + return error_log_errno(errno, "Failed to detach bpf program from cgroup %s", + prog->attached_path); + } + + free(prog->attached_path); + prog->attached_path = NULL; + + return 0; +} + +void lxc_clear_cgroup2_devices(struct lxc_conf *conf) +{ + if (conf->cgroup2_devices) { + (void)bpf_program_cgroup_detach(conf->cgroup2_devices); + (void)bpf_program_free(conf->cgroup2_devices); + } +} +#endif diff --git a/src/lxc/cgroups/cgroup2_devices.h b/src/lxc/cgroups/cgroup2_devices.h new file mode 100644 index 000000000..afcc6b937 --- /dev/null +++ b/src/lxc/cgroups/cgroup2_devices.h @@ -0,0 +1,144 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +/* Parts of this taken from systemd's implementation. 
*/ + +#ifndef __LXC_CGROUP2_DEVICES_H +#define __LXC_CGROUP2_DEVICES_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "conf.h" +#include "config.h" + +#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX +#include +#include +#endif + +#if !HAVE_BPF +#if !(defined __NR_bpf && __NR_bpf > 0) +#if defined __NR_bpf +#undef __NR_bpf +#endif +#if defined __i386__ +#define __NR_bpf 357 +#elif defined __x86_64__ +#define __NR_bpf 321 +#elif defined __aarch64__ +#define __NR_bpf 280 +#elif defined __arm__ +#define __NR_bpf 386 +#elif defined __sparc__ +#define __NR_bpf 349 +#elif defined __s390__ +#define __NR_bpf 351 +#elif defined __tilegx__ +#define __NR_bpf 280 +#else +#warning "__NR_bpf not defined for your architecture" +#endif +#endif + +union bpf_attr; + +static inline int missing_bpf(int cmd, union bpf_attr *attr, size_t size) +{ +#ifdef __NR_bpf + return (int)syscall(__NR_bpf, cmd, attr, size); +#else + errno = ENOSYS; + return -1; +#endif +} + +#define bpf missing_bpf +#endif + +struct bpf_program { + bool blacklist; + int kernel_fd; + uint32_t prog_type; + + size_t n_instructions; +#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX + struct bpf_insn *instructions; +#endif + + char *attached_path; + int attached_type; + uint32_t attached_flags; +}; + +#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX +struct bpf_program *bpf_program_new(uint32_t prog_type); +int bpf_program_init(struct bpf_program *prog); +int bpf_program_append_device(struct bpf_program *prog, char type, int major, + int minor, const char *access, int allow); +int bpf_program_finalize(struct bpf_program *prog); +int bpf_program_cgroup_attach(struct bpf_program *prog, int type, + const char *path, uint32_t flags); +int bpf_program_cgroup_detach(struct bpf_program *prog); +void bpf_program_free(struct bpf_program *prog); +void lxc_clear_cgroup2_devices(struct lxc_conf *conf); +static inline void __do_bpf_program_free(struct bpf_program **prog) +{ + if (*prog) { + bpf_program_free(*prog); + 
*prog = NULL; + } +} +#else +static inline struct bpf_program *bpf_program_new(uint32_t prog_type) +{ + return NULL; +} + +static inline int bpf_program_init(struct bpf_program *prog) +{ + return -ENOSYS; +} + +static inline int bpf_program_append_device(struct bpf_program *prog, char type, + int major, int minor, + const char *access, int allow) +{ + return -ENOSYS; +} + +static inline int bpf_program_finalize(struct bpf_program *prog) +{ + return -ENOSYS; +} + +static inline int bpf_program_cgroup_attach(struct bpf_program *prog, int type, + const char *path, uint32_t flags) +{ + return -ENOSYS; +} + +static inline int bpf_program_cgroup_detach(struct bpf_program *prog) +{ + return -ENOSYS; +} + +static inline void bpf_program_free(struct bpf_program *prog) +{ +} + +static inline void lxc_clear_cgroup2_devices(struct lxc_conf *conf) +{ +} + +static inline void __do_bpf_program_free(struct bpf_program **prog) +{ +} +#endif + +#endif /* __LXC_CGROUP2_DEVICES_H */ diff --git a/src/lxc/conf.c b/src/lxc/conf.c index 06e4adcc3..c03b66383 100644 --- a/src/lxc/conf.c +++ b/src/lxc/conf.c @@ -57,6 +57,7 @@ #include "af_unix.h" #include "caps.h" #include "cgroup.h" +#include "cgroup2_devices.h" #include "conf.h" #include "config.h" #include "confile.h" @@ -4118,6 +4119,7 @@ void lxc_conf_free(struct lxc_conf *conf) lxc_clear_config_keepcaps(conf); lxc_clear_cgroups(conf, "lxc.cgroup", CGROUP_SUPER_MAGIC); lxc_clear_cgroups(conf, "lxc.cgroup2", CGROUP2_SUPER_MAGIC); + lxc_clear_cgroup2_devices(conf); lxc_clear_hooks(conf, "lxc.hook"); lxc_clear_mount_entries(conf); lxc_clear_idmaps(conf); diff --git a/src/lxc/conf.h b/src/lxc/conf.h index 9f4a93d0b..741ac4f09 100644 --- a/src/lxc/conf.h +++ b/src/lxc/conf.h @@ -241,6 +241,7 @@ struct lxc_conf { struct { struct lxc_list cgroup; struct lxc_list cgroup2; + struct bpf_program *cgroup2_devices; }; struct { diff --git a/src/lxc/log.h b/src/lxc/log.h index 8b093de90..3c5be95c3 100644 --- a/src/lxc/log.h +++ b/src/lxc/log.h @@ 
-505,11 +505,11 @@ ATTR_UNUSED static inline void LXC_##LEVEL(struct lxc_log_locinfo* locinfo, \ } while (0) #endif -#define error_log_errno(__errno__, format, ...) \ - ({ \ - errno = __errno__; \ - SYSERROR(format, ##__VA_ARGS__); \ - -1; \ +#define error_log_errno(__errno__, format, ...) \ + ({ \ + errno = __errno__; \ + SYSERROR(format, ##__VA_ARGS__); \ + -1; \ }) extern int lxc_log_fd; diff --git a/src/lxc/macro.h b/src/lxc/macro.h index f96a90019..6f3379b3c 100644 --- a/src/lxc/macro.h +++ b/src/lxc/macro.h @@ -429,6 +429,8 @@ enum { #define PTR_TO_INTMAX(p) ((intmax_t)((intptr_t)(p))) #define INTMAX_TO_PTR(u) ((void *)((intptr_t)(u))) +#define PTR_TO_UINT64(p) ((uint64_t)((intptr_t)(p))) + #define LXC_INVALID_UID ((uid_t)-1) #define LXC_INVALID_GID ((gid_t)-1) @@ -465,4 +467,80 @@ enum { #define LXC_TIMESTAMP_FNAME "ts" #define LXC_COMMENT_FNAME "comment" +/* Taken from systemd. */ +#define free_and_replace(a, b) \ + ({ \ + free(a); \ + (a) = (b); \ + (b) = NULL; \ + 0; \ + }) + +#define XCONCATENATE(x, y) x##y +#define CONCATENATE(x, y) XCONCATENATE(x, y) +#define UNIQ_T(x, uniq) CONCATENATE(__unique_prefix_, CONCATENATE(x, uniq)) +#define UNIQ __COUNTER__ +#undef MIN +#define MIN(a, b) __MIN(UNIQ, (a), UNIQ, (b)) +#define __MIN(aq, a, bq, b) \ + ({ \ + const typeof(a) UNIQ_T(A, aq) = (a); \ + const typeof(b) UNIQ_T(B, bq) = (b); \ + UNIQ_T(A, aq) < UNIQ_T(B, bq) ? UNIQ_T(A, aq) : UNIQ_T(B, bq); \ + }) + +/* Taken from the kernel. */ + +/* + * min()/max()/clamp() macros must accomplish three things: + * + * - avoid multiple evaluations of the arguments (so side-effects like + * "x++" happen only once) when non-constant. + * - perform strict type-checking (to generate warnings instead of + * nasty runtime surprises). See the "unnecessary" pointer comparison + * in __typecheck(). + * - retain result as a constant expressions when called with only + * constant expressions (to avoid tripping VLA warnings in stack + * allocation usage). 
+ */ +#define __typecheck(x, y) (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) + +/* + * This returns a constant expression while determining if an argument is + * a constant expression, most importantly without evaluating the argument. + * Glory to Martin Uecker + */ +#define __is_constexpr(x) \ + (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x)*0l)) : (int *)8))) + +#define __no_side_effects(x, y) (__is_constexpr(x) && __is_constexpr(y)) + +#define __safe_cmp(x, y) (__typecheck(x, y) && __no_side_effects(x, y)) + +#define __cmp(x, y, op) ((x)op(y) ? (x) : (y)) + +#define __cmp_once(x, y, unique_x, unique_y, op) \ + ({ \ + typeof(x) unique_x = (x); \ + typeof(y) unique_y = (y); \ + __cmp(unique_x, unique_y, op); \ + }) + +#define __careful_cmp(x, y, op) \ + __builtin_choose_expr(__safe_cmp(x, y), __cmp(x, y, op), \ + __cmp_once(x, y, __UNIQUE_ID(__x), \ + __UNIQUE_ID(__y), op)) + +/** + * min - return minimum of two values of the same or compatible types + * @x: first value + * @y: second value + */ +#define min(x, y) __careful_cmp(x, y, <) + +#define ARRAY_SIZE(x) \ + (__builtin_choose_expr(!__builtin_types_compatible_p(typeof(x), \ + typeof(&*(x))), \ + sizeof(x) / sizeof((x)[0]), ((void)0))) + #endif /* __LXC_MACRO_H */ diff --git a/src/lxc/start.c b/src/lxc/start.c index 3cfc8b2f5..ec1557fde 100644 --- a/src/lxc/start.c +++ b/src/lxc/start.c @@ -1912,6 +1912,12 @@ static int lxc_spawn(struct lxc_handler *handler) } TRACE("Set up legacy device cgroup controller limits"); + if (!cgroup_ops->devices_activate(cgroup_ops, handler)) { + ERROR("Failed to setup cgroup2 device controller limits"); + goto out_delete_net; + } + TRACE("Set up cgroup2 device controller limits"); + if (handler->ns_clone_flags & CLONE_NEWCGROUP) { /* Now we're ready to preserve the cgroup namespace */ ret = lxc_try_preserve_ns(handler->pid, "cgroup");