From 36bed3e00a28308c1910f514b6464ae5ab5da77a Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 4 May 2021 13:49:35 -0400 Subject: [PATCH] Fixes for 5.10 Signed-off-by: Sasha Levin --- ...ies-require-cap_setfcap-to-map-uid-0.patch | 227 ++++++++++++++++++ ...ror-return-code-in-perf_data__create.patch | 53 ++++ ...access-to-pid-in-array-when-setting-.patch | 64 +++++ queue-5.10/series | 4 + ...binfo.py-updated-to-work-on-current-.patch | 71 ++++++ 5 files changed, 419 insertions(+) create mode 100644 queue-5.10/capabilities-require-cap_setfcap-to-map-uid-0.patch create mode 100644 queue-5.10/perf-data-fix-error-return-code-in-perf_data__create.patch create mode 100644 queue-5.10/perf-ftrace-fix-access-to-pid-in-array-when-setting-.patch create mode 100644 queue-5.10/tools-cgroup-slabinfo.py-updated-to-work-on-current-.patch diff --git a/queue-5.10/capabilities-require-cap_setfcap-to-map-uid-0.patch b/queue-5.10/capabilities-require-cap_setfcap-to-map-uid-0.patch new file mode 100644 index 00000000000..49b3644c964 --- /dev/null +++ b/queue-5.10/capabilities-require-cap_setfcap-to-map-uid-0.patch @@ -0,0 +1,227 @@ +From 934d6b992595da2e3d013f845831b5b35cb7c024 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 20 Apr 2021 08:43:34 -0500 +Subject: capabilities: require CAP_SETFCAP to map uid 0 + +From: Serge E. Hallyn + +[ Upstream commit db2e718a47984b9d71ed890eb2ea36ecf150de18 ] + +cap_setfcap is required to create file capabilities. + +Since commit 8db6c34f1dbc ("Introduce v3 namespaced file capabilities"), +a process running as uid 0 but without cap_setfcap is able to work +around this as follows: unshare a new user namespace which maps parent +uid 0 into the child namespace. + +While this task will not have new capabilities against the parent +namespace, there is a loophole due to the way namespaced file +capabilities are represented as xattrs. 
File capabilities valid in +userns 1 are distinguished from file capabilities valid in userns 2 by +the kuid which underlies uid 0. Therefore the restricted root process +can unshare a new self-mapping namespace, add a namespaced file +capability onto a file, then use that file capability in the parent +namespace. + +To prevent that, do not allow mapping parent uid 0 if the process which +opened the uid_map file does not have CAP_SETFCAP, which is the +capability for setting file capabilities. + +As a further wrinkle: a task can unshare its user namespace, then open +its uid_map file itself, and map (only) its own uid. In this case we do +not have the credential from before unshare, which was potentially more +restricted. So, when creating a user namespace, we record whether the +creator had CAP_SETFCAP. Then we can use that during map_write(). + +With this patch: + +1. Unprivileged user can still unshare -Ur + + ubuntu@caps:~$ unshare -Ur + root@caps:~# logout + +2. Root user can still unshare -Ur + + ubuntu@caps:~$ sudo bash + root@caps:/home/ubuntu# unshare -Ur + root@caps:/home/ubuntu# logout + +3. Root user without CAP_SETFCAP cannot unshare -Ur: + + root@caps:/home/ubuntu# /sbin/capsh --drop=cap_setfcap -- + root@caps:/home/ubuntu# /sbin/setcap cap_setfcap=p /sbin/setcap + unable to set CAP_SETFCAP effective capability: Operation not permitted + root@caps:/home/ubuntu# unshare -Ur + unshare: write failed /proc/self/uid_map: Operation not permitted + +Note: an alternative solution would be to allow uid 0 mappings by +processes without CAP_SETFCAP, but to prevent such a namespace from +writing any file capabilities. This approach can be seen at [1]. + +Background history: commit 95ebabde382 ("capabilities: Don't allow +writing ambiguous v3 file capabilities") tried to fix the issue by +preventing v3 fscaps to be written to disk when the root uid would map +to the same uid in nested user namespaces. This led to regressions for +various workloads. 
For example, see [2]. Ultimately this is a valid +use-case we have to support meaning we had to revert this change in +3b0c2d3eaa83 ("Revert 95ebabde382c ("capabilities: Don't allow writing +ambiguous v3 file capabilities")"). + +Link: https://git.kernel.org/pub/scm/linux/kernel/git/sergeh/linux.git/log/?h=2021-04-15/setfcap-nsfscaps-v4 [1] +Link: https://github.com/containers/buildah/issues/3071 [2] +Signed-off-by: Serge Hallyn +Reviewed-by: Andrew G. Morgan +Tested-by: Christian Brauner +Reviewed-by: Christian Brauner +Tested-by: Giuseppe Scrivano +Cc: Eric Biederman +Signed-off-by: Linus Torvalds +Signed-off-by: Sasha Levin +--- + include/linux/user_namespace.h | 3 ++ + include/uapi/linux/capability.h | 3 +- + kernel/user_namespace.c | 65 +++++++++++++++++++++++++++++++-- + 3 files changed, 67 insertions(+), 4 deletions(-) + +diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h +index 6ef1c7109fc4..7616c7bf4b24 100644 +--- a/include/linux/user_namespace.h ++++ b/include/linux/user_namespace.h +@@ -64,6 +64,9 @@ struct user_namespace { + kgid_t group; + struct ns_common ns; + unsigned long flags; ++ /* parent_could_setfcap: true if the creator if this ns had CAP_SETFCAP ++ * in its effective capability set at the child ns creation time. */ ++ bool parent_could_setfcap; + + #ifdef CONFIG_KEYS + /* List of joinable keyrings in this namespace. Modification access of +diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h +index c6ca33034147..2ddb4226cd23 100644 +--- a/include/uapi/linux/capability.h ++++ b/include/uapi/linux/capability.h +@@ -335,7 +335,8 @@ struct vfs_ns_cap_data { + + #define CAP_AUDIT_CONTROL 30 + +-/* Set or remove capabilities on files */ ++/* Set or remove capabilities on files. ++ Map uid=0 into a child user namespace. 
*/ + + #define CAP_SETFCAP 31 + +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index e703d5d9cbe8..ce396ea4de60 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -106,6 +106,7 @@ int create_user_ns(struct cred *new) + if (!ns) + goto fail_dec; + ++ ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); + ret = ns_alloc_inum(&ns->ns); + if (ret) + goto fail_free; +@@ -841,6 +842,60 @@ static int sort_idmaps(struct uid_gid_map *map) + return 0; + } + ++/** ++ * verify_root_map() - check the uid 0 mapping ++ * @file: idmapping file ++ * @map_ns: user namespace of the target process ++ * @new_map: requested idmap ++ * ++ * If a process requests mapping parent uid 0 into the new ns, verify that the ++ * process writing the map had the CAP_SETFCAP capability as the target process ++ * will be able to write fscaps that are valid in ancestor user namespaces. ++ * ++ * Return: true if the mapping is allowed, false if not. ++ */ ++static bool verify_root_map(const struct file *file, ++ struct user_namespace *map_ns, ++ struct uid_gid_map *new_map) ++{ ++ int idx; ++ const struct user_namespace *file_ns = file->f_cred->user_ns; ++ struct uid_gid_extent *extent0 = NULL; ++ ++ for (idx = 0; idx < new_map->nr_extents; idx++) { ++ if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) ++ extent0 = &new_map->extent[idx]; ++ else ++ extent0 = &new_map->forward[idx]; ++ if (extent0->lower_first == 0) ++ break; ++ ++ extent0 = NULL; ++ } ++ ++ if (!extent0) ++ return true; ++ ++ if (map_ns == file_ns) { ++ /* The process unshared its ns and is writing to its own ++ * /proc/self/uid_map. User already has full capabilites in ++ * the new namespace. Verify that the parent had CAP_SETFCAP ++ * when it unshared. ++ * */ ++ if (!file_ns->parent_could_setfcap) ++ return false; ++ } else { ++ /* Process p1 is writing to uid_map of p2, who is in a child ++ * user namespace to p1's. 
Verify that the opener of the map ++ * file has CAP_SETFCAP against the parent of the new map ++ * namespace */ ++ if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP)) ++ return false; ++ } ++ ++ return true; ++} ++ + static ssize_t map_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, + int cap_setid, +@@ -848,7 +903,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, + struct uid_gid_map *parent_map) + { + struct seq_file *seq = file->private_data; +- struct user_namespace *ns = seq->private; ++ struct user_namespace *map_ns = seq->private; + struct uid_gid_map new_map; + unsigned idx; + struct uid_gid_extent extent; +@@ -895,7 +950,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, + /* + * Adjusting namespace settings requires capabilities on the target. + */ +- if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN)) ++ if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN)) + goto out; + + /* Parse the user data */ +@@ -965,7 +1020,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, + + ret = -EPERM; + /* Validate the user is allowed to use user id's mapped to. */ +- if (!new_idmap_permitted(file, ns, cap_setid, &new_map)) ++ if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map)) + goto out; + + ret = -EPERM; +@@ -1086,6 +1141,10 @@ static bool new_idmap_permitted(const struct file *file, + struct uid_gid_map *new_map) + { + const struct cred *cred = file->f_cred; ++ ++ if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map)) ++ return false; ++ + /* Don't allow mappings that would allow anything that wouldn't + * be allowed without the establishment of unprivileged mappings. 
+ */ +-- +2.30.2 + diff --git a/queue-5.10/perf-data-fix-error-return-code-in-perf_data__create.patch b/queue-5.10/perf-data-fix-error-return-code-in-perf_data__create.patch new file mode 100644 index 00000000000..962699b4aad --- /dev/null +++ b/queue-5.10/perf-data-fix-error-return-code-in-perf_data__create.patch @@ -0,0 +1,53 @@ +From 93c380135ad54909d2c9aa18d471c75d554049b3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 15 Apr 2021 16:34:16 +0800 +Subject: perf data: Fix error return code in perf_data__create_dir() + +From: Zhen Lei + +[ Upstream commit f2211881e737cade55e0ee07cf6a26d91a35a6fe ] + +Although 'ret' has been initialized to -1, it will be reassigned by +the "ret = open(...)" statement in the for loop, so the value of +'ret' is unknown when asprintf() fails. + +Reported-by: Hulk Robot +Signed-off-by: Zhen Lei +Cc: Alexander Shishkin +Cc: Jiri Olsa +Cc: Mark Rutland +Cc: Namhyung Kim +Cc: Peter Zijlstra +Link: http://lore.kernel.org/lkml/20210415083417.3740-1-thunder.leizhen@huawei.com +Signed-off-by: Arnaldo Carvalho de Melo +Signed-off-by: Sasha Levin +--- + tools/perf/util/data.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c +index c47aa34fdc0a..5d97b3e45fbb 100644 +--- a/tools/perf/util/data.c ++++ b/tools/perf/util/data.c +@@ -35,7 +35,7 @@ void perf_data__close_dir(struct perf_data *data) + int perf_data__create_dir(struct perf_data *data, int nr) + { + struct perf_data_file *files = NULL; +- int i, ret = -1; ++ int i, ret; + + if (WARN_ON(!data->is_dir)) + return -EINVAL; +@@ -51,7 +51,8 @@ int perf_data__create_dir(struct perf_data *data, int nr) + for (i = 0; i < nr; i++) { + struct perf_data_file *file = &files[i]; + +- if (asprintf(&file->path, "%s/data.%d", data->path, i) < 0) ++ ret = asprintf(&file->path, "%s/data.%d", data->path, i); ++ if (ret < 0) + goto out_err; + + ret = open(file->path, O_RDWR|O_CREAT|O_TRUNC, S_IRUSR|S_IWUSR); +-- 
+2.30.2 + diff --git a/queue-5.10/perf-ftrace-fix-access-to-pid-in-array-when-setting-.patch b/queue-5.10/perf-ftrace-fix-access-to-pid-in-array-when-setting-.patch new file mode 100644 index 00000000000..fce08f4d244 --- /dev/null +++ b/queue-5.10/perf-ftrace-fix-access-to-pid-in-array-when-setting-.patch @@ -0,0 +1,64 @@ +From aa0ca6d6e9aa76f748c9c44a758e6e9181b934c1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 21 Apr 2021 14:04:00 +0200 +Subject: perf ftrace: Fix access to pid in array when setting a pid filter + +From: Thomas Richter + +[ Upstream commit 671b60cb6a897a5b3832fe57657152f2c3995e25 ] + +Command 'perf ftrace -v -- ls' fails in s390 (at least 5.12.0rc6). + +The root cause is a missing pointer dereference which causes an +array element address to be used as PID. + +Fix this by extracting the PID. + +Output before: + # ./perf ftrace -v -- ls + function_graph tracer is used + write '-263732416' to tracing/set_ftrace_pid failed: Invalid argument + failed to set ftrace pid + # + +Output after: + ./perf ftrace -v -- ls + function_graph tracer is used + # tracer: function_graph + # + # CPU DURATION FUNCTION CALLS + # | | | | | | | + 4) | rcu_read_lock_sched_held() { + 4) 0.552 us | rcu_lockdep_current_cpu_online(); + 4) 6.124 us | } + +Reported-by: Alexander Schmidt +Signed-off-by: Thomas Richter +Acked-by: Namhyung Kim +Cc: Heiko Carstens +Cc: Sumanth Korikkar +Cc: Sven Schnelle +Cc: Vasily Gorbik +Link: http://lore.kernel.org/lkml/20210421120400.2126433-1-tmricht@linux.ibm.com +Signed-off-by: Arnaldo Carvalho de Melo +Signed-off-by: Sasha Levin +--- + tools/perf/builtin-ftrace.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c +index 9366fad591dc..eecc70fc3b19 100644 +--- a/tools/perf/builtin-ftrace.c ++++ b/tools/perf/builtin-ftrace.c +@@ -289,7 +289,7 @@ static int set_tracing_pid(struct perf_ftrace *ftrace) + + for (i = 0; i < 
perf_thread_map__nr(ftrace->evlist->core.threads); i++) { + scnprintf(buf, sizeof(buf), "%d", +- ftrace->evlist->core.threads->map[i]); ++ perf_thread_map__pid(ftrace->evlist->core.threads, i)); + if (append_tracing_file("set_ftrace_pid", buf) < 0) + return -1; + } +-- +2.30.2 + diff --git a/queue-5.10/series b/queue-5.10/series index 87ba48e73db..7b5263e6990 100644 --- a/queue-5.10/series +++ b/queue-5.10/series @@ -5,3 +5,7 @@ igb-enable-rss-for-intel-i211-ethernet-controller.patch bpf-fix-masking-negation-logic-upon-negative-dst-register.patch bpf-fix-leakage-of-uninitialized-bpf-stack-under-speculation.patch net-qrtr-avoid-potential-use-after-free-in-mhi-send.patch +perf-data-fix-error-return-code-in-perf_data__create.patch +capabilities-require-cap_setfcap-to-map-uid-0.patch +perf-ftrace-fix-access-to-pid-in-array-when-setting-.patch +tools-cgroup-slabinfo.py-updated-to-work-on-current-.patch diff --git a/queue-5.10/tools-cgroup-slabinfo.py-updated-to-work-on-current-.patch b/queue-5.10/tools-cgroup-slabinfo.py-updated-to-work-on-current-.patch new file mode 100644 index 00000000000..bddd02fac12 --- /dev/null +++ b/queue-5.10/tools-cgroup-slabinfo.py-updated-to-work-on-current-.patch @@ -0,0 +1,71 @@ +From 24bc8eaa11cdec7783e1bda83ac37cb8a21b2d53 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 23 Apr 2021 14:29:03 -0700 +Subject: tools/cgroup/slabinfo.py: updated to work on current kernel + +From: Vasily Averin + +[ Upstream commit 1974c45dd7745e999b9387be3d8fdcb27a5b1721 ] + +slabinfo.py script does not work with the current kernel version. + +First, it was unable to recognise SLUB subsystem, and when I specified +it manually it failed again with + + AttributeError: 'struct page' has no member 'obj_cgroups' + +.. 
and then again with + + File "tools/cgroup/memcg_slabinfo.py", line 221, in main + memcg.kmem_caches.address_of_(), + AttributeError: 'struct mem_cgroup' has no member 'kmem_caches' + +Link: https://lkml.kernel.org/r/cec1a75e-43b4-3d64-2084-d9f98fda037f@virtuozzo.com +Signed-off-by: Vasily Averin +Tested-by: Roman Gushchin +Acked-by: Roman Gushchin +Cc: Michal Hocko +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Sasha Levin +--- + tools/cgroup/memcg_slabinfo.py | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/tools/cgroup/memcg_slabinfo.py b/tools/cgroup/memcg_slabinfo.py +index c4225ed63565..1600b17dbb8a 100644 +--- a/tools/cgroup/memcg_slabinfo.py ++++ b/tools/cgroup/memcg_slabinfo.py +@@ -128,9 +128,9 @@ def detect_kernel_config(): + + cfg['nr_nodes'] = prog['nr_online_nodes'].value_() + +- if prog.type('struct kmem_cache').members[1][1] == 'flags': ++ if prog.type('struct kmem_cache').members[1].name == 'flags': + cfg['allocator'] = 'SLUB' +- elif prog.type('struct kmem_cache').members[1][1] == 'batchcount': ++ elif prog.type('struct kmem_cache').members[1].name == 'batchcount': + cfg['allocator'] = 'SLAB' + else: + err('Can\'t determine the slab allocator') +@@ -193,7 +193,7 @@ def main(): + # look over all slab pages, belonging to non-root memcgs + # and look for objects belonging to the given memory cgroup + for page in for_each_slab_page(prog): +- objcg_vec_raw = page.obj_cgroups.value_() ++ objcg_vec_raw = page.memcg_data.value_() + if objcg_vec_raw == 0: + continue + cache = page.slab_cache +@@ -202,7 +202,7 @@ def main(): + addr = cache.value_() + caches[addr] = cache + # clear the lowest bit to get the true obj_cgroups +- objcg_vec = Object(prog, page.obj_cgroups.type_, ++ objcg_vec = Object(prog, 'struct obj_cgroup **', + value=objcg_vec_raw & ~1) + + if addr not in stats: +-- +2.30.2 + -- 2.47.3