From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Sat, 20 Feb 2021 01:01:18 +0000 (+0100)
Subject: cgroups: rework cgroup initialization
X-Git-Tag: lxc-5.0.0~274^2~20
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=35ec1a385a2b0521b522d2c7cc82bfc7ef633e88;p=thirdparty%2Flxc.git

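cgroups: rework cgroup initialization

Replace the separate cg_unified_init() and cg_hybrid_init() code paths
with a single __initialize_cgroups(). Instead of parsing
/proc/self/mountinfo, walk the entries of /proc/1/cgroup or
/proc/self/cgroup and open each hierarchy relative to the host cgroupfs
mount fd (ops->dfd_mnt_cgroupfs_host). add_hierarchy() now receives the
already opened fds and the controller list from its caller, and cgroup
paths are built via make_cgroup_path() instead of being concatenated
from h->mountpoint directly.

list_add() (formerly append_null_to_list()) now reports allocation
failure instead of relying on must_realloc(), and cgfsng_ops_init() is
renamed to cgroup_ops_init().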

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---

diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c
index 8431007f4..a1e2ff95e 100644
--- a/src/lxc/cgroups/cgfsng.c
+++ b/src/lxc/cgroups/cgfsng.c
@@ -60,22 +60,29 @@
 
 lxc_log_define(cgfsng, cgroup);
 
-/* Given a pointer to a null-terminated array of pointers, realloc to add one
+/*
+ * Given a pointer to a null-terminated array of pointers, realloc to add one
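- * entry, and point the new entry to NULL. Do not fail. Return the index to the
- * second-to-last entry - that is, the one which is now available for use
- * (keeping the list null-terminated).
+ * entry, and point the new entry to NULL. If realloc fails, leave the list as is
+ * and return ret_errno(ENOMEM). Otherwise return the index to the second-to-last
+ * entry, i.e. the one now available for use (keeping the list null-terminated).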
  */
-static int append_null_to_list(void ***list)
+static int list_add(void ***list)
 {
-	int newentry = 0;
+	int idx = 0;
+	void **p;
 
 	if (*list)
-		for (; (*list)[newentry]; newentry++)
+		for (; (*list)[idx]; idx++)
 			;
 
-	*list = must_realloc(*list, (newentry + 2) * sizeof(void **));
-	(*list)[newentry + 1] = NULL;
-	return newentry;
+	p = realloc(*list, (idx + 2) * sizeof(void **));
+	if (!p)
+		return ret_errno(ENOMEM);
+
+	p[idx + 1] = NULL;
+	*list = p;
+
+	return idx;
 }
 
 /* Given a null-terminated array of strings, check whether @entry is one of the
@@ -93,59 +100,6 @@ static bool string_in_list(char **list, const char *entry)
 	return false;
 }
 
-/* Return a copy of @entry prepending "name=", i.e.  turn "systemd" into
- * "name=systemd". Do not fail.
- */
-static char *cg_legacy_must_prefix_named(char *entry)
-{
-	size_t len;
-	char *prefixed;
-
-	len = strlen(entry);
-	prefixed = must_realloc(NULL, len + 6);
-
-	memcpy(prefixed, "name=", STRLITERALLEN("name="));
-	memcpy(prefixed + STRLITERALLEN("name="), entry, len);
-	prefixed[len + 5] = '\0';
-
-	return prefixed;
-}
-
-/* Append an entry to the clist. Do not fail. @clist must be NULL the first time
- * we are called.
- *
- * We also handle named subsystems here. Any controller which is not a kernel
- * subsystem, we prefix "name=". Any which is both a kernel and named subsystem,
- * we refuse to use because we're not sure which we have here.
- * (TODO: We could work around this in some cases by just remounting to be
- * unambiguous, or by comparing mountpoint contents with current cgroup.)
- *
- * The last entry will always be NULL.
- */
-static void must_append_controller(char **klist, char **nlist, char ***clist,
-				   char *entry)
-{
-	int newentry;
-	char *copy;
-
-	if (string_in_list(klist, entry) && string_in_list(nlist, entry)) {
-		ERROR("Refusing to use ambiguous controller \"%s\"", entry);
-		ERROR("It is both a named and kernel subsystem");
-		return;
-	}
-
-	newentry = append_null_to_list((void ***)clist);
-
-	if (strnequal(entry, "name=", 5))
-		copy = must_copy_string(entry);
-	else if (string_in_list(klist, entry))
-		copy = must_copy_string(entry);
-	else
-		copy = cg_legacy_must_prefix_named(entry);
-
-	(*clist)[newentry] = copy;
-}
-
 /* Given a handler's cgroup data, return the struct hierarchy for the controller
  * @c, or NULL if there is none.
  */
@@ -318,37 +272,6 @@ static inline bool is_unified_hierarchy(const struct hierarchy *h)
 	return h->version == CGROUP2_SUPER_MAGIC;
 }
 
-/* Given two null-terminated lists of strings, return true if any string is in
- * both.
- */
-static bool controller_lists_intersect(char **l1, char **l2)
-{
-	if (!l1 || !l2)
-		return false;
-
-	for (int i = 0; l1[i]; i++)
-		if (string_in_list(l2, l1[i]))
-			return true;
-
-	return false;
-}
-
-/* For a null-terminated list of controllers @clist, return true if any of those
- * controllers is already listed the null-terminated list of hierarchies @hlist.
- * Realistically, if one is present, all must be present.
- */
-static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
-{
-	if (!hlist)
-		return false;
-
-	for (int i = 0; hlist[i]; i++)
-		if (controller_lists_intersect(hlist[i]->controllers, clist))
-			return true;
-
-	return false;
-}
-
 /* Return true if the controller @entry is found in the null-terminated list of
  * hierarchies @hlist.
  */
@@ -367,7 +290,7 @@ static bool controller_found(struct hierarchy **hlist, char *entry)
 /* Return true if all of the controllers which we require have been found.  The
  * required list is  freezer and anything in lxc.cgroup.use.
  */
-static bool all_controllers_found(struct cgroup_ops *ops)
+__lxc_unused static bool all_controllers_found(struct cgroup_ops *ops)
 {
 	struct hierarchy **hlist;
 
@@ -382,98 +305,51 @@ static bool all_controllers_found(struct cgroup_ops *ops)
 	return true;
 }
 
-/* Get the controllers from a mountinfo line There are other ways we could get
- * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
- * could parse the mount options. But we simply assume that the mountpoint must
- * be /sys/fs/cgroup/controller-list
- */
-static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
-					int type)
+static char **__controller_list_empty(void)
 {
-	/* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
-	 * for legacy hierarchies.
-	 */
 	__do_free_string_list char **aret = NULL;
-	int i;
-	char *p2, *tok;
-	char *p = line, *sep = ",";
-
-	for (i = 0; i < 4; i++) {
-		p = strchr(p, ' ');
-		if (!p)
-			return NULL;
-		p++;
-	}
+	int newentry;
 
-	/* Note, if we change how mountinfo works, then our caller will need to
-	 * verify /sys/fs/cgroup/ in this field.
-	 */
-	if (!strnequal(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15))
-		return log_warn(NULL, "Found hierarchy not under " DEFAULT_CGROUP_MOUNTPOINT ": \"%s\"", p);
+	newentry = list_add((void ***)&aret);
+	aret[newentry] = NULL;
+	return move_ptr(aret);
+}
 
-	p += 15;
-	p2 = strchr(p, ' ');
-	if (!p2)
-		return log_error(NULL, "Corrupt mountinfo");
-	*p2 = '\0';
+static char **__controller_list(char *controllers)
+{
+	__do_free_string_list char **controller_list = NULL;
+	char *it;
 
-	if (type == CGROUP_SUPER_MAGIC) {
-		__do_free char *dup = NULL;
+	lxc_iterate_parts(it, controllers, " \t\n") {
+		int idx;
 
-		/* strdup() here for v1 hierarchies. Otherwise
-		 * lxc_iterate_parts() will destroy mountpoints such as
-		 * "/sys/fs/cgroup/cpu,cpuacct".
-		 */
-		dup = must_copy_string(p);
-		if (!dup)
+		idx = list_add((void ***)&controller_list);
+		controller_list[idx] = strdup(it);
+		if (!controller_list[idx])
 			return NULL;
-
-		lxc_iterate_parts(tok, dup, sep)
-			must_append_controller(klist, nlist, &aret, tok);
 	}
-	*p2 = ' ';
 
-	return move_ptr(aret);
-}
-
-static char **cg_unified_make_empty_controller(void)
-{
-	__do_free_string_list char **aret = NULL;
-	int newentry;
+	if (!controller_list)
+		return NULL;
 
-	newentry = append_null_to_list((void ***)&aret);
-	aret[newentry] = NULL;
-	return move_ptr(aret);
+	return move_ptr(controller_list);
 }
 
-static char **cg_unified_get_controllers(int dfd, const char *file)
+static char **unified_controllers(int dfd, const char *file)
 {
 	__do_free char *buf = NULL;
-	__do_free_string_list char **aret = NULL;
-	char *sep = " \t\n";
-	char *tok;
 
 	buf = read_file_at(dfd, file, PROTECT_OPEN, 0);
 	if (!buf)
 		return NULL;
 
-	lxc_iterate_parts(tok, buf, sep) {
-		int newentry;
-		char *copy;
-
-		newentry = append_null_to_list((void ***)&aret);
-		copy = must_copy_string(tok);
-		aret[newentry] = copy;
-	}
-
-	return move_ptr(aret);
+	return __controller_list(buf);
 }
 
-static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
-				       char **controllers)
+static bool skip_hierarchy(const struct cgroup_ops *ops, char **controllers)
 {
 	if (!ops->cgroup_use)
-		return true;
+		return false;
 
 	for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
 		bool found = false;
@@ -489,299 +365,60 @@ static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
 		if (found)
 			continue;
 
-		return false;
+		return true;
 	}
 
-	return true;
+	return false;
 }
 
-static int add_hierarchy(struct cgroup_ops *ops, char **clist, char *mountpoint,
-			 char *container_base_path, int type)
+static int add_hierarchy(struct cgroup_ops *ops, int dfd_mnt, char *mnt,
+			 int dfd_base, char *base_cgroup, char **controllers,
+			 int type)
 {
-	__do_close int dfd_base = -EBADF, dfd_mnt = -EBADF;
 	__do_free struct hierarchy *new = NULL;
-	__do_free_string_list char **controllers = clist;
 	int idx;
 
-	if (abspath(container_base_path))
+	if (abspath(base_cgroup))
 		return syserrno_set(-EINVAL, "Container base path must be relative to controller mount");
 
-	if (!controllers && type != CGROUP2_SUPER_MAGIC)
-		return syserrno_set(-EINVAL, "Empty controller list for non-unified cgroup hierarchy passed");
-
-	dfd_mnt = open_at(-EBADF, mountpoint, PROTECT_OPATH_DIRECTORY,
-			  PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
-	if (dfd_mnt < 0)
-		return syserrno(-errno, "Failed to open %s", mountpoint);
-
-	if (!is_empty_string(container_base_path)) {
-		dfd_base = open_at(dfd_mnt, container_base_path,
-				   PROTECT_OPATH_DIRECTORY,
-				   PROTECT_LOOKUP_BENEATH_XDEV, 0);
-		if (dfd_base < 0)
-			return syserrno(-errno, "Failed to open %d(%s)", dfd_base, container_base_path);
-	}
-
-	if (!controllers) {
-		/*
-		* We assume that the cgroup we're currently in has been delegated to
-		* us and we are free to further delege all of the controllers listed
-		* in cgroup.controllers further down the hierarchy.
-		 */
-		if (dfd_base < 0)
-			controllers = cg_unified_get_controllers(dfd_mnt, "cgroup.controllers");
-		else
-			controllers = cg_unified_get_controllers(dfd_base, "cgroup.controllers");
-		if (!controllers)
-			controllers = cg_unified_make_empty_controller();
-		if (!controllers[0])
-			TRACE("No controllers are enabled for delegation");
-	}
-
-	/* Exclude all controllers that cgroup use does not want. */
-	if (!cgroup_use_wants_controllers(ops, controllers))
-		return log_trace(0, "Skipping cgroup hiearchy with non-requested controllers");
-
 	new = zalloc(sizeof(*new));
 	if (!new)
 		return ret_errno(ENOMEM);
 
-	new->version			= type;
-	new->controllers		= move_ptr(controllers);
-	new->mountpoint			= mountpoint;
-	new->container_base_path	= container_base_path;
 	new->cgfd_con			= -EBADF;
 	new->cgfd_limit			= -EBADF;
 	new->cgfd_mon			= -EBADF;
 
-	TRACE("Adding cgroup hierarchy with mountpoint %s and base cgroup %s",
-	      mountpoint, container_base_path);
+	new->version			= type;
+	new->controllers		= controllers;
+	new->mountpoint			= mnt;
+	new->container_base_path	= base_cgroup;
+
+	new->dfd_mnt			= dfd_mnt;
+	new->dfd_base			= dfd_base;
+
+	TRACE("Adding cgroup hierarchy mounted at %s and base cgroup %s",
+	      mnt, maybe_empty(base_cgroup));
 	for (char *const *it = new->controllers; it && *it; it++)
-		TRACE("The detected hierarchy contains the %s controller", *it);
+		TRACE("The hierarchy contains the %s controller", *it);
 
-	idx = append_null_to_list((void ***)&ops->hierarchies);
-	if (dfd_base < 0)
-		new->dfd_base = dfd_mnt;
-	else
-		new->dfd_base = move_fd(dfd_base);
-	new->dfd_mnt = move_fd(dfd_mnt);
+	idx = list_add((void ***)&ops->hierarchies);
 	if (type == CGROUP2_SUPER_MAGIC)
 		ops->unified = new;
 	(ops->hierarchies)[idx] = move_ptr(new);
 	return 0;
 }
 
-/* Get a copy of the mountpoint from @line, which is a line from
- * /proc/self/mountinfo.
- */
-static char *cg_hybrid_get_mountpoint(char *line)
-{
-	char *p = line, *sret = NULL;
-	size_t len;
-	char *p2;
-
-	for (int i = 0; i < 4; i++) {
-		p = strchr(p, ' ');
-		if (!p)
-			return NULL;
-		p++;
-	}
-
-	if (!strnequal(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15))
-		return NULL;
-
-	p2 = strchr(p + 15, ' ');
-	if (!p2)
-		return NULL;
-	*p2 = '\0';
-
-	len = strlen(p);
-	sret = must_realloc(NULL, len + 1);
-	memcpy(sret, p, len);
-	sret[len] = '\0';
-
-	return sret;
-}
-
-/* Given a multi-line string, return a null-terminated copy of the current line. */
-static char *copy_to_eol(char *p)
-{
-	char *p2, *sret;
-	size_t len;
-
-	p2 = strchr(p, '\n');
-	if (!p2)
-		return NULL;
-
-	len = p2 - p;
-	sret = must_realloc(NULL, len + 1);
-	memcpy(sret, p, len);
-	sret[len] = '\0';
-
-	return sret;
-}
-
-/* cgline: pointer to character after the first ':' in a line in a \n-terminated
- * /proc/self/cgroup file. Check whether controller c is present.
- */
-static bool controller_in_clist(char *cgline, char *c)
-{
-	__do_free char *tmp = NULL;
-	char *tok, *eol;
-	size_t len;
-
-	eol = strchr(cgline, ':');
-	if (!eol)
-		return false;
-
-	len = eol - cgline;
-	tmp = must_realloc(NULL, len + 1);
-	memcpy(tmp, cgline, len);
-	tmp[len] = '\0';
-
-	lxc_iterate_parts(tok, tmp, ",")
-		if (strequal(tok, c))
-			return true;
-
-	return false;
-}
-
-static inline char *trim(char *s)
-{
-	size_t len;
-
-	len = strlen(s);
-	while ((len > 1) && (s[len - 1] == '\n'))
-		s[--len] = '\0';
-
-	return s;
-}
-
-/* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
- * @controller.
- */
-static char *cg_hybrid_get_current_cgroup(bool relative, char *basecginfo,
-					  char *controller, int type)
-{
-	char *base_cgroup = basecginfo;
-
-	for (;;) {
-		bool is_cgv2_base_cgroup = false;
-
-		/* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
-		if ((type == CGROUP2_SUPER_MAGIC) && (*base_cgroup == '0'))
-			is_cgv2_base_cgroup = true;
-
-		base_cgroup = strchr(base_cgroup, ':');
-		if (!base_cgroup)
-			return NULL;
-		base_cgroup++;
-
-		if (is_cgv2_base_cgroup || (controller && controller_in_clist(base_cgroup, controller))) {
-			__do_free char *copy = NULL;
-
-			base_cgroup = strchr(base_cgroup, ':');
-			if (!base_cgroup)
-				return NULL;
-			base_cgroup++;
-
-			copy = copy_to_eol(base_cgroup);
-			if (!copy)
-				return NULL;
-			trim(copy);
-
-			if (!relative) {
-				base_cgroup = prune_init_scope(copy);
-				if (!base_cgroup)
-					return NULL;
-			} else {
-				base_cgroup = copy;
-			}
-
-			if (abspath(base_cgroup))
-				base_cgroup = deabs(base_cgroup);
-
-			/* We're allowing base_cgroup to be "". */
-			return strdup(base_cgroup);
-		}
-
-		base_cgroup = strchr(base_cgroup, '\n');
-		if (!base_cgroup)
-			return NULL;
-		base_cgroup++;
-	}
-}
-
 static void must_append_string(char ***list, char *entry)
 {
 	int newentry;
 	char *copy;
 
-	newentry = append_null_to_list((void ***)list);
+	newentry = list_add((void ***)list);
 	copy = must_copy_string(entry);
 	(*list)[newentry] = copy;
 }
 
-static int get_existing_subsystems(char ***klist, char ***nlist)
-{
-	__do_free char *line = NULL;
-	__do_fclose FILE *f = NULL;
-	size_t len = 0;
-
-	f = fopen("/proc/self/cgroup", "re");
-	if (!f)
-		return -1;
-
-	while (getline(&line, &len, f) != -1) {
-		char *p, *p2, *tok;
-		p = strchr(line, ':');
-		if (!p)
-			continue;
-		p++;
-		p2 = strchr(p, ':');
-		if (!p2)
-			continue;
-		*p2 = '\0';
-
-		/* If the kernel has cgroup v2 support, then /proc/self/cgroup
-		 * contains an entry of the form:
-		 *
-		 *	0::/some/path
-		 *
-		 * In this case we use "cgroup2" as controller name.
-		 */
-		if ((p2 - p) == 0) {
-			must_append_string(klist, "cgroup2");
-			continue;
-		}
-
-		lxc_iterate_parts(tok, p, ",") {
-			if (strnequal(tok, "name=", 5))
-				must_append_string(nlist, tok);
-			else
-				must_append_string(klist, tok);
-		}
-	}
-
-	return 0;
-}
-
-static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
-					      char **nlist)
-{
-	int k;
-	char **it;
-
-	TRACE("basecginfo is:");
-	TRACE("%s", basecginfo);
-
-	for (k = 0, it = klist; it && *it; it++, k++)
-		TRACE("kernel subsystem %d: %s", k, *it);
-
-	for (k = 0, it = nlist; it && *it; it++, k++)
-		TRACE("named subsystem %d: %s", k, *it);
-}
-
 static int cgroup_tree_remove(struct hierarchy **hierarchies, const char *path_prune)
 {
 	if (!path_prune || !hierarchies)
@@ -1130,7 +767,7 @@ static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
 		    !ops->setup_limits_legacy(ops, conf, true))
 			return log_error(false, "Failed to setup legacy device limits");
 
-		limit_path = must_make_path(h->mountpoint, h->container_base_path, cgroup_limit_dir, NULL);
+		limit_path = make_cgroup_path(h, h->container_base_path, cgroup_limit_dir, NULL);
 		path = must_make_path(limit_path, cgroup_leaf, NULL);
 
 		/*
@@ -1146,7 +783,7 @@ static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
 				TRACE("Removed cgroup tree %d(%s)", h->dfd_base, cgroup_limit_dir);
 		}
 	} else {
-		path = must_make_path(h->mountpoint, h->container_base_path, cgroup_limit_dir, NULL);
+		path = make_cgroup_path(h, h->container_base_path, cgroup_limit_dir, NULL);
 
 		fd_final = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
 	}
@@ -1805,8 +1442,7 @@ static int cg_legacy_mount_controllers(int cgroup_automount_type, struct hierarc
 		INFO("Remounted %s read-only", controllerpath);
 	}
 
-	sourcepath = must_make_path(h->mountpoint, h->container_base_path,
-				    container_cgroup, NULL);
+	sourcepath = make_cgroup_path(h, h->container_base_path, container_cgroup, NULL);
 	if (cgroup_automount_type == LXC_AUTO_CGROUP_RO)
 		flags |= MS_RDONLY;
 
@@ -2128,7 +1764,7 @@ __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
 	for (int i = 0; ops->hierarchies[i]; i++) {
 		__do_free char *controllerpath = NULL, *path2 = NULL;
 		struct hierarchy *h = ops->hierarchies[i];
-		char *controller = strrchr(h->mountpoint, '/');
+		char *controller = h->mountpoint;
 
 		if (!controller)
 			continue;
@@ -2196,10 +1832,9 @@ __cgfsng_ops static bool cgfsng_criu_escape(const struct cgroup_ops *ops,
 		__do_free char *fullpath = NULL;
 		int ret;
 
-		fullpath =
-		    must_make_path(ops->hierarchies[i]->mountpoint,
-				   ops->hierarchies[i]->container_base_path,
-				   "cgroup.procs", NULL);
+		fullpath = make_cgroup_path(ops->hierarchies[i],
+					    ops->hierarchies[i]->container_base_path,
+					    "cgroup.procs", NULL);
 		ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
 		if (ret != 0)
 			return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath);
@@ -2385,20 +2020,28 @@ static const char *cgfsng_get_cgroup_do(struct cgroup_ops *ops,
 					const char *controller, bool limiting)
 {
 	struct hierarchy *h;
+	size_t len;
+	const char *path;
 
 	h = get_hierarchy(ops, controller);
 	if (!h)
-		return log_warn_errno(NULL, ENOENT, "Failed to find hierarchy for controller \"%s\"",
-				      controller ? controller : "(null)");
+		return log_warn_errno(NULL, ENOENT,
+				      "Failed to find hierarchy for controller \"%s\"", maybe_empty(controller));
 
 	if (limiting)
-		return h->container_limit_path
-			   ? h->container_limit_path + strlen(h->mountpoint)
-			   : NULL;
+		path = h->container_limit_path;
+	else
+		path = h->container_full_path;
+	if (!path)
+		return NULL;
 
-	return h->container_full_path
-		   ? h->container_full_path + strlen(h->mountpoint)
-		   : NULL;
+	len = strlen(h->mountpoint);
+	if (!strnequal(h->mountpoint, DEFAULT_CGROUP_MOUNTPOINT,
+		       STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT))) {
+		path += STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT);
+		path += strspn(path, "/");
+	}
+	return path += len;
 }
 
 __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
@@ -2420,7 +2063,7 @@ static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
 						       const char *inpath,
 						       const char *filename)
 {
-	return must_make_path(h->mountpoint, inpath, filename, NULL);
+	return make_cgroup_path(h, inpath, filename, NULL);
 }
 
 static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid)
@@ -2619,7 +2262,7 @@ static int __cg_unified_attach(const struct hierarchy *h,
 	if (!cgroup)
 		return 0;
 
-	path = must_make_path(h->mountpoint, cgroup, NULL);
+	path = make_cgroup_path(h, cgroup, NULL);
 
 	unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC);
 	if (unified_fd < 0)
@@ -3278,21 +2921,27 @@ __cgfsng_ops static bool cgfsng_payload_delegate_controllers(struct cgroup_ops *
 	return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
 }
 
-static void cg_unified_delegate(char ***delegate)
+static int cg_unified_delegate(char ***delegate)
 {
 	__do_free char *buf = NULL;
-	char *standard[] = {"cgroup.subtree_control", "cgroup.threads", NULL};
+	char *standard[] = {
+		"cgroup.procs",
+		"cgroup.threads",
+		"cgroup.subtree_control",
+		"memory.oom.group",
+		NULL,
+	};
 	char *token;
 	int idx;
 
 	buf = read_file_at(-EBADF, "/sys/kernel/cgroup/delegate", PROTECT_OPEN, 0);
 	if (!buf) {
 		for (char **p = standard; p && *p; p++) {
-			idx = append_null_to_list((void ***)delegate);
+			idx = list_add((void ***)delegate);
 			(*delegate)[idx] = must_copy_string(*p);
 		}
-		SYSWARN("Failed to read /sys/kernel/cgroup/delegate");
-		return;
+
+		return syswarn(0, "Failed to read /sys/kernel/cgroup/delegate");
 	}
 
 	lxc_iterate_parts(token, buf, " \t\n") {
@@ -3303,127 +2952,19 @@ static void cg_unified_delegate(char ***delegate)
 		if (strequal(token, "cgroup.procs"))
 			continue;
 
-		idx = append_null_to_list((void ***)delegate);
+		idx = list_add((void ***)delegate);
 		(*delegate)[idx] = must_copy_string(token);
 	}
-}
-
-/* At startup, parse_hierarchies finds all the info we need about cgroup
- * mountpoints and current cgroups, and stores it in @d.
- */
-static int cg_hybrid_init(struct cgroup_ops *ops, bool relative, bool unprivileged)
-{
-	__do_free char *cgroup_info = NULL, *line = NULL;
-	__do_free_string_list char **klist = NULL, **nlist = NULL;
-	__do_fclose FILE *f = NULL;
-	int ret;
-	size_t len = 0;
-
-	/* Root spawned containers escape the current cgroup, so use init's
-	 * cgroups as our base in that case.
-	 */
-	if (!relative && (geteuid() == 0))
-		cgroup_info = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
-	else
-		cgroup_info = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
-	if (!cgroup_info)
-		return ret_errno(ENOMEM);
-
-	ret = get_existing_subsystems(&klist, &nlist);
-	if (ret < 0)
-		return syserrno(-errno, "Failed to retrieve available legacy cgroup controllers");
-
-	f = fopen("/proc/self/mountinfo", "re");
-	if (!f)
-		return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\"");
-
-	lxc_cgfsng_print_basecg_debuginfo(cgroup_info, klist, nlist);
-
-	while (getline(&line, &len, f) != -1) {
-		__do_free char *base_cgroup = NULL, *mountpoint = NULL;
-		__do_free_string_list char **controller_list = NULL;
-		int type;
-		bool writeable;
-
-		type = get_cgroup_version(line);
-		if (type == 0)
-			continue;
-
-		if (type == CGROUP2_SUPER_MAGIC && ops->unified)
-			continue;
-
-		if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
-			if (type == CGROUP2_SUPER_MAGIC)
-				ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
-			else if (type == CGROUP_SUPER_MAGIC)
-				ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
-		} else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
-			if (type == CGROUP_SUPER_MAGIC)
-				ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
-		} else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
-			if (type == CGROUP2_SUPER_MAGIC)
-				ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
-		}
-
-		controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
-		if (!controller_list && type == CGROUP_SUPER_MAGIC)
-			continue;
-
-		if (type == CGROUP_SUPER_MAGIC)
-			if (controller_list_is_dup(ops->hierarchies, controller_list)) {
-				TRACE("Skipping duplicating controller");
-				continue;
-			}
-
-		mountpoint = cg_hybrid_get_mountpoint(line);
-		if (!mountpoint) {
-			WARN("Failed parsing mountpoint from \"%s\"", line);
-			continue;
-		}
-
-		if (type == CGROUP_SUPER_MAGIC)
-			base_cgroup = cg_hybrid_get_current_cgroup(relative, cgroup_info, controller_list[0], CGROUP_SUPER_MAGIC);
-		else
-			base_cgroup = cg_hybrid_get_current_cgroup(relative, cgroup_info, NULL, CGROUP2_SUPER_MAGIC);
-		if (!base_cgroup) {
-			WARN("Failed to find current cgroup");
-			continue;
-		}
-
-		if (type == CGROUP2_SUPER_MAGIC)
-			writeable = test_writeable_v2(mountpoint, base_cgroup);
-		else
-			writeable = test_writeable_v1(mountpoint, base_cgroup);
-		if (!writeable) {
-			TRACE("The %s group is not writeable", base_cgroup);
-			continue;
-		}
-
-		if (type == CGROUP2_SUPER_MAGIC)
-			ret = add_hierarchy(ops, NULL, move_ptr(mountpoint), move_ptr(base_cgroup), type);
-		else
-			ret = add_hierarchy(ops, move_ptr(controller_list), move_ptr(mountpoint), move_ptr(base_cgroup), type);
-		if (ret)
-			return syserrno(ret, "Failed to add cgroup hierarchy");
-		if (ops->unified && unprivileged)
-			cg_unified_delegate(&(ops->unified)->cgroup2_chown);
-	}
-
-	/* verify that all controllers in cgroup.use and all crucial
-	 * controllers are accounted for
-	 */
-	if (!all_controllers_found(ops))
-		return log_error_errno(-1, ENOENT, "Failed to find all required controllers");
 
 	return 0;
 }
 
-static inline bool __current_cgroup_unified_line(const char *line)
+static inline bool unified_cgroup(const char *line)
 {
 	return *line == '0';
 }
 
-static inline char *__current_cgroup_unified(bool relative, char *line)
+static inline char *current_unified_cgroup(bool relative, char *line)
 {
 	char *current_cgroup;
 
@@ -3446,70 +2987,209 @@ static inline char *__current_cgroup_unified(bool relative, char *line)
 	return current_cgroup;
 }
 
-/* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */
-static char *current_cgroup_unified(bool relative)
+static inline const char *unprefix(const char *controllers)
+{
+	if (strnequal(controllers, "name=", STRLITERALLEN("name=")))
+		return controllers + STRLITERALLEN("name=");
+	return controllers;
+}
+
+static int __initialize_cgroups(struct cgroup_ops *ops, bool relative,
+				bool unprivileged)
 {
 	__do_free char *cgroup_info = NULL;
 	char *it;
 
+	/*
+	 * Root spawned containers escape the current cgroup, so use init's
+	 * cgroups as our base in that case.
+	 */
 	if (!relative && (geteuid() == 0))
 		cgroup_info = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
 	else
 		cgroup_info = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
 	if (!cgroup_info)
-		return NULL;
+		return ret_errno(ENOMEM);
 
 	lxc_iterate_parts(it, cgroup_info, "\n") {
-		char *current_cgroup;
+		__do_close int dfd_base = -EBADF, dfd_mnt = -EBADF;
+		__do_free char *controllers = NULL, *current_cgroup = NULL;
+		__do_free_string_list char **controller_list = NULL,
+					   **delegate = NULL;
+		char *line;
+		int dfd, ret, type;
+
+		/* Handle the unified cgroup hierarchy. */
+		line = it;
+		if (unified_cgroup(line)) {
+			char *unified_mnt;
+
+			current_cgroup = current_unified_cgroup(relative, line);
+			if (IS_ERR(current_cgroup))
+				return PTR_ERR(current_cgroup);
+
+			if (unified_cgroup_fd(ops->dfd_mnt_cgroupfs_host)) {
+				dfd_mnt = dup_cloexec(ops->dfd_mnt_cgroupfs_host);
+				unified_mnt = "";
+			} else {
+				dfd_mnt = open_at(ops->dfd_mnt_cgroupfs_host,
+						  "unified",
+						  PROTECT_OPATH_DIRECTORY,
+						  PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
+				unified_mnt = "unified";
+			}
+			if (dfd_mnt < 0) {
+				if (errno != ENOENT)
+					return syserrno(-errno, "Failed to open %d/unified", ops->dfd_mnt_cgroupfs_host);
 
-		if (!__current_cgroup_unified_line(it))
-			continue;
+				SYSTRACE("Unified cgroup not mounted");
+				continue;
+			}
+			dfd = dfd_mnt;
+
+			if (!is_empty_string(current_cgroup)) {
+				dfd_base = open_at(dfd_mnt, current_cgroup,
+						   PROTECT_OPATH_DIRECTORY,
+						   PROTECT_LOOKUP_BENEATH_XDEV, 0);
+				if (dfd_base < 0)
+					return syserrno(-errno, "Failed to open %d/%s", dfd_mnt, current_cgroup);
+				dfd = dfd_base;
+			}
 
-		current_cgroup = __current_cgroup_unified(relative, it);
-		if (IS_ERR(current_cgroup))
-			return NULL;
+			controller_list = unified_controllers(dfd, "cgroup.controllers");
+			if (!controller_list) {
+				TRACE("No controllers are enabled for delegation in the unified hierarchy");
+				controller_list = __controller_list_empty();
+			}
 
-		return current_cgroup;
-	}
+			if (unprivileged) {
+				ret = cg_unified_delegate(&delegate);
+				if (ret < 0)
+					return syserrno(ret, "Failed to determine delegation requirements");
 
-	return log_error(NULL, "Failed to retrieve current cgroup for %s process",
-			 relative ? "current" : "init");
-}
+				for (char *const *d = delegate; d && *d; d++) {
+					if (faccessat(dfd, *d, W_OK, 0)) {
+						if (errno == ENOENT)
+							continue;
 
-static int cg_unified_init(struct cgroup_ops *ops, bool relative,
-			   bool unprivileged)
-{
-	__do_free char *base_cgroup = NULL;
-	int ret;
+						SYSINFO("Lacking write access to %s, skipping unified cgroup", *d);
+						break;
+					}
+				}
+			}
 
-	base_cgroup = current_cgroup_unified(relative);
-	if (!base_cgroup)
-		return ret_errno(EINVAL);
+			type = CGROUP2_SUPER_MAGIC;
+			controllers = strdup(unified_mnt);
+			if (!controllers)
+				return ret_errno(ENOMEM);
+		} else {
+			char *__controllers, *__current_cgroup;
 
-	/* TODO: If the user requested specific controllers via lxc.cgroup.use
-	 * we should verify here. The reason I'm not doing it right is that I'm
-	 * not convinced that lxc.cgroup.use will be the future since it is a
-	 * global property. I much rather have an option that lets you request
-	 * controllers per container.
-	 */
+			__controllers = strchr(line, ':');
+			if (!__controllers)
+				return ret_errno(EINVAL);
+			__controllers++;
+
+			__current_cgroup = strchr(__controllers, ':');
+			if (!__current_cgroup)
+				return ret_errno(EINVAL);
+			*__current_cgroup = '\0';
+			__current_cgroup++;
+
+			controllers = strdup(unprefix(__controllers));
+			if (!controllers)
+				return ret_errno(ENOMEM);
+
+			dfd_mnt = open_at(ops->dfd_mnt_cgroupfs_host,
+					  controllers, PROTECT_OPATH_DIRECTORY,
+					  PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
+			if (dfd_mnt < 0) {
+				if (errno != ENOENT)
+					return syserrno(-errno, "Failed to open %d/%s",
+							ops->dfd_mnt_cgroupfs_host, controllers);
 
-	ret = add_hierarchy(ops, NULL,
-			    must_copy_string(DEFAULT_CGROUP_MOUNTPOINT),
-			    move_ptr(base_cgroup), CGROUP2_SUPER_MAGIC);
-	if (ret)
-		return syserrno(ret, "Failed to add unified cgroup hierarchy");
+				SYSTRACE("%s not mounted", controllers);
+				continue;
+			}
+			dfd = dfd_mnt;
+
+			if (!abspath(__current_cgroup))
+				return ret_errno(EINVAL);
+
+			/* remove init.scope */
+			if (!relative)
+				__current_cgroup = prune_init_scope(__current_cgroup);
+
+			/* create a relative path */
+			__current_cgroup = deabs(__current_cgroup);
 
-	if (unprivileged)
-		cg_unified_delegate(&(ops->unified)->cgroup2_chown);
+			current_cgroup = strdup(__current_cgroup);
+			if (!current_cgroup)
+				return ret_errno(ENOMEM);
 
-	if (bpf_devices_cgroup_supported())
-		ops->unified->bpf_device_controller = 1;
+			if (!is_empty_string(current_cgroup)) {
+				dfd_base = open_at(dfd_mnt, current_cgroup,
+						   PROTECT_OPATH_DIRECTORY,
+						   PROTECT_LOOKUP_BENEATH_XDEV, 0);
+				if (dfd_base < 0)
+					return syserrno(-errno, "Failed to open %d/%s",
+							dfd_mnt, current_cgroup);
+				dfd = dfd_base;
+			}
 
-	ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
-	return CGROUP2_SUPER_MAGIC;
+			if (faccessat(dfd, "cgroup.procs", W_OK, 0)) {
+				if (errno == ENOENT)
+					continue;
+
+				SYSINFO("Lacking write access to %s", controllers);
+				break;
+			}
+
+			/*
+			 * We intentionally pass __controllers here and not
+			 * controllers because we would otherwise chop the
+			 * mountpoint.
+			 */
+			controller_list = __controller_list(__controllers);
+			if (IS_ERR(controller_list))
+				return PTR_ERR(controller_list);
+
+			if (skip_hierarchy(ops, controller_list))
+				continue;
+
+			type = CGROUP_SUPER_MAGIC;
+			ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
+		}
+
+		ret = add_hierarchy(ops, dfd_mnt, controllers, dfd, current_cgroup, controller_list, type);
+		if (ret < 0)
+			return syserrno(ret, "Failed to add %s hierarchy", controllers);
+
+		/* Transfer ownership. */
+		move_fd(dfd_mnt);
+		move_fd(dfd_base);
+		move_ptr(current_cgroup);
+		move_ptr(controllers);
+		move_ptr(controller_list);
+		if (type == CGROUP2_SUPER_MAGIC)
+			ops->unified->cgroup2_chown = move_ptr(delegate);
+	}
+
+	/* determine cgroup layout */
+	if (ops->unified) {
+		if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
+			ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
+		} else {
+			if (bpf_devices_cgroup_supported())
+				ops->unified->bpf_device_controller = 1;
+			ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
+		}
+	}
+
+	return 0;
 }
 
-static int __cgroup_init(struct cgroup_ops *ops, struct lxc_conf *conf)
+static int initialize_cgroups(struct cgroup_ops *ops, struct lxc_conf *conf)
 {
 	__do_close int dfd = -EBADF;
 	int ret;
@@ -3548,12 +3228,7 @@ static int __cgroup_init(struct cgroup_ops *ops, struct lxc_conf *conf)
 	 */
 	ops->dfd_mnt_cgroupfs_host = dfd;
 
-	if (unified_cgroup_fd(dfd))
-		ret = cg_unified_init(ops, conf->cgroup_meta.relative,
-				      !lxc_list_empty(&conf->id_map));
-	else
-		ret = cg_hybrid_init(ops, conf->cgroup_meta.relative,
-				     !lxc_list_empty(&conf->id_map));
+	ret = __initialize_cgroups(ops, conf->cgroup_meta.relative, !lxc_list_empty(&conf->id_map));
 	if (ret < 0)
 		return syserrno(ret, "Failed to initialize cgroups");
 
@@ -3577,7 +3252,7 @@ __cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
 	return 0;
 }
 
-struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
+struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf)
 {
 	__do_free struct cgroup_ops *cgfsng_ops = NULL;
 
@@ -3588,7 +3263,7 @@ struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
 	cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
 	cgfsng_ops->dfd_mnt_cgroupfs_host = -EBADF;
 
-	if (__cgroup_init(cgfsng_ops, conf))
+	if (initialize_cgroups(cgfsng_ops, conf))
 		return NULL;
 
 	cgfsng_ops->data_init				= cgfsng_data_init;
diff --git a/src/lxc/cgroups/cgroup.c b/src/lxc/cgroups/cgroup.c
index 68e3e3e25..469660655 100644
--- a/src/lxc/cgroups/cgroup.c
+++ b/src/lxc/cgroups/cgroup.c
@@ -21,7 +21,7 @@
 
 lxc_log_define(cgroup, lxc);
 
-__hidden extern struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf);
+__hidden extern struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf);
 
 struct cgroup_ops *cgroup_init(struct lxc_conf *conf)
 {
@@ -30,7 +30,7 @@ struct cgroup_ops *cgroup_init(struct lxc_conf *conf)
 	if (!conf)
 		return log_error_errno(NULL, EINVAL, "No valid conf given");
 
-	cgroup_ops = cgfsng_ops_init(conf);
+	cgroup_ops = cgroup_ops_init(conf);
 	if (!cgroup_ops)
 		return log_error_errno(NULL, errno, "Failed to initialize cgroup driver");
 
@@ -47,13 +47,13 @@ struct cgroup_ops *cgroup_init(struct lxc_conf *conf)
 	TRACE("Initialized cgroup driver %s", cgroup_ops->driver);
 
 	if (cgroup_ops->cgroup_layout == CGROUP_LAYOUT_LEGACY)
-		TRACE("Running with legacy cgroup layout");
+		TRACE("Legacy cgroup layout");
 	else if (cgroup_ops->cgroup_layout == CGROUP_LAYOUT_HYBRID)
-		TRACE("Running with hybrid cgroup layout");
+		TRACE("Hybrid cgroup layout");
 	else if (cgroup_ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED)
-		TRACE("Running with unified cgroup layout");
+		TRACE("Unified cgroup layout");
 	else
-		WARN("Running with unknown cgroup layout");
+		WARN("Unsupported cgroup layout");
 
 	return cgroup_ops;
 }
diff --git a/src/lxc/cgroups/cgroup.h b/src/lxc/cgroups/cgroup.h
index 3a8e3f3ef..1d08bdb55 100644
--- a/src/lxc/cgroups/cgroup.h
+++ b/src/lxc/cgroups/cgroup.h
@@ -230,4 +230,11 @@ static inline int cgroup_unified_fd(const struct cgroup_ops *ops)
 	return ops->unified->cgfd_con;
 }
 
+#define make_cgroup_path(__hierarchy, __first, ...)                        \
+	({ /* statement expression: yields must_make_path()'s result */    \
+		const struct hierarchy *__h = __hierarchy;                 \
+		must_make_path(DEFAULT_CGROUP_MOUNTPOINT, __h->mountpoint, \
+			       __first, __VA_ARGS__);                      \
+	})
+
 #endif /* __LXC_CGROUP_H */
diff --git a/src/lxc/cgroups/cgroup_utils.c b/src/lxc/cgroups/cgroup_utils.c
index 448fc9fa5..6beb57067 100644
--- a/src/lxc/cgroups/cgroup_utils.c
+++ b/src/lxc/cgroups/cgroup_utils.c
@@ -86,7 +86,7 @@ bool test_writeable_v2(char *mountpoint, char *path)
 	return (access(cgroup_threads_file, W_OK) == 0);
 }
 
-int unified_cgroup_fd(int fd)
+bool unified_cgroup_fd(int fd)
 {
 
 	int ret;
diff --git a/src/lxc/cgroups/cgroup_utils.h b/src/lxc/cgroups/cgroup_utils.h
index 77bf40c15..41a8a2199 100644
--- a/src/lxc/cgroups/cgroup_utils.h
+++ b/src/lxc/cgroups/cgroup_utils.h
@@ -29,7 +29,7 @@ __hidden extern bool test_writeable_v1(char *mountpoint, char *path);
  */
 __hidden extern bool test_writeable_v2(char *mountpoint, char *path);
 
-__hidden extern int unified_cgroup_fd(int fd);
+__hidden extern bool unified_cgroup_fd(int fd);
 
 static inline bool cgns_supported(void)
 {