From 68020412c2000f568a17a43575a3568086ba0e98 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 24 Nov 2022 12:44:15 +0100
Subject: [PATCH] cgroups: only allocate user namespace if we have to

If the monitor runs as root we can assume it's able to remove the cgroups it
created when the container started.

Fixes: https://github.com/lxc/lxd/issues/11108
Signed-off-by: Christian Brauner (Microsoft) <christian.brauner@ubuntu.com>
---
 src/lxc/cgroups/cgfsng.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c
index 979deab19..b868be93d 100644
--- a/src/lxc/cgroups/cgfsng.c
+++ b/src/lxc/cgroups/cgfsng.c
@@ -560,7 +560,12 @@ __cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
 	if (ret < 0)
 		WARN("Failed to detach bpf program from cgroup");
 
-	if (!list_empty(&handler->conf->id_map)) {
+	/*
+	 * Only do the user namespace dance if we have to. If the container's
+	 * monitor is root we can assume that it is privileged enough to remove
+	 * the cgroups it created when the container started.
+	 */
+	if (!list_empty(&handler->conf->id_map) && !handler->am_root) {
 		struct generic_userns_exec_data wrap = {
 			.conf			= handler->conf,
 			.path_prune		= ops->container_limit_cgroup,
-- 
2.47.2