Fixes for 5.11

author Sasha Levin <sashal@kernel.org>

Tue, 4 May 2021 17:49:35 +0000 (13:49 -0400)

committer Sasha Levin <sashal@kernel.org>

Tue, 4 May 2021 17:50:00 +0000 (13:50 -0400)
author Sasha Levin <sashal@kernel.org>
Tue, 4 May 2021 17:49:35 +0000 (13:49 -0400)
committer Sasha Levin <sashal@kernel.org>
Tue, 4 May 2021 17:50:00 +0000 (13:50 -0400)
diff --git a/queue-5.11/capabilities-require-cap_setfcap-to-map-uid-0.patch b/queue-5.11/capabilities-require-cap_setfcap-to-map-uid-0.patch

new file mode 100644 (file)

index 0000000..95191ac
--- /dev/null
+++ b/queue-5.11/capabilities-require-cap_setfcap-to-map-uid-0.patch
@@ -0,0 +1,227 @@
+From eec4cb40e13217080a325dae738a3eaa37c4a2e7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 20 Apr 2021 08:43:34 -0500
+Subject: capabilities: require CAP_SETFCAP to map uid 0
+
+From: Serge E. Hallyn <serge@hallyn.com>
+
+[ Upstream commit db2e718a47984b9d71ed890eb2ea36ecf150de18 ]
+
+cap_setfcap is required to create file capabilities.
+
+Since commit 8db6c34f1dbc ("Introduce v3 namespaced file capabilities"),
+a process running as uid 0 but without cap_setfcap is able to work
+around this as follows: unshare a new user namespace which maps parent
+uid 0 into the child namespace.
+
+While this task will not have new capabilities against the parent
+namespace, there is a loophole due to the way namespaced file
+capabilities are represented as xattrs.  File capabilities valid in
+userns 1 are distinguished from file capabilities valid in userns 2 by
+the kuid which underlies uid 0.  Therefore the restricted root process
+can unshare a new self-mapping namespace, add a namespaced file
+capability onto a file, then use that file capability in the parent
+namespace.
+
+To prevent that, do not allow mapping parent uid 0 if the process which
+opened the uid_map file does not have CAP_SETFCAP, which is the
+capability for setting file capabilities.
+
+As a further wrinkle: a task can unshare its user namespace, then open
+its uid_map file itself, and map (only) its own uid.  In this case we do
+not have the credential from before unshare, which was potentially more
+restricted.  So, when creating a user namespace, we record whether the
+creator had CAP_SETFCAP.  Then we can use that during map_write().
+
+With this patch:
+
+1. Unprivileged user can still unshare -Ur
+
+   ubuntu@caps:~$ unshare -Ur
+   root@caps:~# logout
+
+2. Root user can still unshare -Ur
+
+   ubuntu@caps:~$ sudo bash
+   root@caps:/home/ubuntu# unshare -Ur
+   root@caps:/home/ubuntu# logout
+
+3. Root user without CAP_SETFCAP cannot unshare -Ur:
+
+   root@caps:/home/ubuntu# /sbin/capsh --drop=cap_setfcap --
+   root@caps:/home/ubuntu# /sbin/setcap cap_setfcap=p /sbin/setcap
+   unable to set CAP_SETFCAP effective capability: Operation not permitted
+   root@caps:/home/ubuntu# unshare -Ur
+   unshare: write failed /proc/self/uid_map: Operation not permitted
+
+Note: an alternative solution would be to allow uid 0 mappings by
+processes without CAP_SETFCAP, but to prevent such a namespace from
+writing any file capabilities.  This approach can be seen at [1].
+
+Background history: commit 95ebabde382 ("capabilities: Don't allow
+writing ambiguous v3 file capabilities") tried to fix the issue by
+preventing v3 fscaps from being written to disk when the root uid would map
+to the same uid in nested user namespaces.  This led to regressions for
+various workloads.  For example, see [2].  Ultimately this is a valid
+use-case we have to support meaning we had to revert this change in
+3b0c2d3eaa83 ("Revert 95ebabde382c ("capabilities: Don't allow writing
+ambiguous v3 file capabilities")").
+
+Link: https://git.kernel.org/pub/scm/linux/kernel/git/sergeh/linux.git/log/?h=2021-04-15/setfcap-nsfscaps-v4 [1]
+Link: https://github.com/containers/buildah/issues/3071 [2]
+Signed-off-by: Serge Hallyn <serge@hallyn.com>
+Reviewed-by: Andrew G. Morgan <morgan@kernel.org>
+Tested-by: Christian Brauner <christian.brauner@ubuntu.com>
+Reviewed-by: Christian Brauner <christian.brauner@ubuntu.com>
+Tested-by: Giuseppe Scrivano <gscrivan@redhat.com>
+Cc: Eric Biederman <ebiederm@xmission.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/user_namespace.h  |  3 ++
+ include/uapi/linux/capability.h |  3 +-
+ kernel/user_namespace.c         | 65 +++++++++++++++++++++++++++++++--
+ 3 files changed, 67 insertions(+), 4 deletions(-)
+
+diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
+index 64cf8ebdc4ec..f6c5f784be5a 100644
+--- a/include/linux/user_namespace.h
++++ b/include/linux/user_namespace.h
+@@ -63,6 +63,9 @@ struct user_namespace {
+       kgid_t                  group;
+       struct ns_common        ns;
+       unsigned long           flags;
++      /* parent_could_setfcap: true if the creator of this ns had CAP_SETFCAP
++       * in its effective capability set at the child ns creation time. */
++      bool                    parent_could_setfcap;
+ 
+ #ifdef CONFIG_KEYS
+       /* List of joinable keyrings in this namespace.  Modification access of
+diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
+index c6ca33034147..2ddb4226cd23 100644
+--- a/include/uapi/linux/capability.h
++++ b/include/uapi/linux/capability.h
+@@ -335,7 +335,8 @@ struct vfs_ns_cap_data {
+ 
+ #define CAP_AUDIT_CONTROL    30
+ 
+-/* Set or remove capabilities on files */
++/* Set or remove capabilities on files.
++   Map uid=0 into a child user namespace. */
+ 
+ #define CAP_SETFCAP        31
+ 
+diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
+index af612945a4d0..9a4b980d695b 100644
+--- a/kernel/user_namespace.c
++++ b/kernel/user_namespace.c
+@@ -106,6 +106,7 @@ int create_user_ns(struct cred *new)
+       if (!ns)
+               goto fail_dec;
+ 
++      ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP);
+       ret = ns_alloc_inum(&ns->ns);
+       if (ret)
+               goto fail_free;
+@@ -841,6 +842,60 @@ static int sort_idmaps(struct uid_gid_map *map)
+       return 0;
+ }
+ 
++/**
++ * verify_root_map() - check the uid 0 mapping
++ * @file: idmapping file
++ * @map_ns: user namespace of the target process
++ * @new_map: requested idmap
++ *
++ * If a process requests mapping parent uid 0 into the new ns, verify that the
++ * process writing the map had the CAP_SETFCAP capability as the target process
++ * will be able to write fscaps that are valid in ancestor user namespaces.
++ *
++ * Return: true if the mapping is allowed, false if not.
++ */
++static bool verify_root_map(const struct file *file,
++                          struct user_namespace *map_ns,
++                          struct uid_gid_map *new_map)
++{
++      int idx;
++      const struct user_namespace *file_ns = file->f_cred->user_ns;
++      struct uid_gid_extent *extent0 = NULL;
++
++      for (idx = 0; idx < new_map->nr_extents; idx++) {
++              if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
++                      extent0 = &new_map->extent[idx];
++              else
++                      extent0 = &new_map->forward[idx];
++              if (extent0->lower_first == 0)
++                      break;
++
++              extent0 = NULL;
++      }
++
++      if (!extent0)
++              return true;
++
++      if (map_ns == file_ns) {
++              /* The process unshared its ns and is writing to its own
++               * /proc/self/uid_map.  User already has full capabilities in
++               * the new namespace.  Verify that the parent had CAP_SETFCAP
++               * when it unshared.
++               * */
++              if (!file_ns->parent_could_setfcap)
++                      return false;
++      } else {
++              /* Process p1 is writing to uid_map of p2, who is in a child
++               * user namespace to p1's.  Verify that the opener of the map
++               * file has CAP_SETFCAP against the parent of the new map
++               * namespace */
++              if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP))
++                      return false;
++      }
++
++      return true;
++}
++
+ static ssize_t map_write(struct file *file, const char __user *buf,
+                        size_t count, loff_t *ppos,
+                        int cap_setid,
+@@ -848,7 +903,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
+                        struct uid_gid_map *parent_map)
+ {
+       struct seq_file *seq = file->private_data;
+-      struct user_namespace *ns = seq->private;
++      struct user_namespace *map_ns = seq->private;
+       struct uid_gid_map new_map;
+       unsigned idx;
+       struct uid_gid_extent extent;
+@@ -895,7 +950,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
+       /*
+        * Adjusting namespace settings requires capabilities on the target.
+        */
+-      if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
++      if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN))
+               goto out;
+ 
+       /* Parse the user data */
+@@ -965,7 +1020,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
+ 
+       ret = -EPERM;
+       /* Validate the user is allowed to use user id's mapped to. */
+-      if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
++      if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map))
+               goto out;
+ 
+       ret = -EPERM;
+@@ -1086,6 +1141,10 @@ static bool new_idmap_permitted(const struct file *file,
+                               struct uid_gid_map *new_map)
+ {
+       const struct cred *cred = file->f_cred;
++
++      if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map))
++              return false;
++
+       /* Don't allow mappings that would allow anything that wouldn't
+        * be allowed without the establishment of unprivileged mappings.
+        */
+-- 
+2.30.2
+
diff --git a/queue-5.11/perf-data-fix-error-return-code-in-perf_data__create.patch b/queue-5.11/perf-data-fix-error-return-code-in-perf_data__create.patch

new file mode 100644 (file)

index 0000000..9ca4d10
--- /dev/null
+++ b/queue-5.11/perf-data-fix-error-return-code-in-perf_data__create.patch
@@ -0,0 +1,53 @@
+From bdb06c0750411a9034bca27436b6c781c912d8ab Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 15 Apr 2021 16:34:16 +0800
+Subject: perf data: Fix error return code in perf_data__create_dir()
+
+From: Zhen Lei <thunder.leizhen@huawei.com>
+
+[ Upstream commit f2211881e737cade55e0ee07cf6a26d91a35a6fe ]
+
+Although 'ret' has been initialized to -1, it will be reassigned by
+the "ret = open(...)" statement in the for loop. As a result, the value
+of 'ret' is unknown when asprintf() fails.
+
+Reported-by: Hulk Robot <hulkci@huawei.com>
+Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
+Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
+Cc: Jiri Olsa <jolsa@redhat.com>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Namhyung Kim <namhyung@kernel.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Link: http://lore.kernel.org/lkml/20210415083417.3740-1-thunder.leizhen@huawei.com
+Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/perf/util/data.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c
+index f29af4fc3d09..8fca4779ae6a 100644
+--- a/tools/perf/util/data.c
++++ b/tools/perf/util/data.c
+@@ -35,7 +35,7 @@ void perf_data__close_dir(struct perf_data *data)
+ int perf_data__create_dir(struct perf_data *data, int nr)
+ {
+       struct perf_data_file *files = NULL;
+-      int i, ret = -1;
++      int i, ret;
+ 
+       if (WARN_ON(!data->is_dir))
+               return -EINVAL;
+@@ -51,7 +51,8 @@ int perf_data__create_dir(struct perf_data *data, int nr)
+       for (i = 0; i < nr; i++) {
+               struct perf_data_file *file = &files[i];
+ 
+-              if (asprintf(&file->path, "%s/data.%d", data->path, i) < 0)
++              ret = asprintf(&file->path, "%s/data.%d", data->path, i);
++              if (ret < 0)
+                       goto out_err;
+ 
+               ret = open(file->path, O_RDWR|O_CREAT|O_TRUNC, S_IRUSR|S_IWUSR);
+-- 
+2.30.2
+
diff --git a/queue-5.11/perf-ftrace-fix-access-to-pid-in-array-when-setting-.patch b/queue-5.11/perf-ftrace-fix-access-to-pid-in-array-when-setting-.patch

new file mode 100644 (file)

index 0000000..79dca7f
--- /dev/null
+++ b/queue-5.11/perf-ftrace-fix-access-to-pid-in-array-when-setting-.patch
@@ -0,0 +1,64 @@
+From fc9b3a5610018a28be46d28d5934618a6e1bffdf Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 21 Apr 2021 14:04:00 +0200
+Subject: perf ftrace: Fix access to pid in array when setting a pid filter
+
+From: Thomas Richter <tmricht@linux.ibm.com>
+
+[ Upstream commit 671b60cb6a897a5b3832fe57657152f2c3995e25 ]
+
+Command 'perf ftrace -v -- ls' fails in s390 (at least 5.12.0rc6).
+
+The root cause is a missing pointer dereference which causes an
+array element address to be used as PID.
+
+Fix this by extracting the PID.
+
+Output before:
+  # ./perf ftrace -v -- ls
+  function_graph tracer is used
+  write '-263732416' to tracing/set_ftrace_pid failed: Invalid argument
+  failed to set ftrace pid
+  #
+
+Output after:
+   ./perf ftrace -v -- ls
+   function_graph tracer is used
+   # tracer: function_graph
+   #
+   # CPU  DURATION                  FUNCTION CALLS
+   # |     |   |                     |   |   |   |
+   4)               |  rcu_read_lock_sched_held() {
+   4)   0.552 us    |    rcu_lockdep_current_cpu_online();
+   4)   6.124 us    |  }
+
+Reported-by: Alexander Schmidt <alexschm@de.ibm.com>
+Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
+Acked-by: Namhyung Kim <namhyung@kernel.org>
+Cc: Heiko Carstens <hca@linux.ibm.com>
+Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
+Cc: Sven Schnelle <svens@linux.ibm.com>
+Cc: Vasily Gorbik <gor@linux.ibm.com>
+Link: http://lore.kernel.org/lkml/20210421120400.2126433-1-tmricht@linux.ibm.com
+Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/perf/builtin-ftrace.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c
+index d49448a1060c..87cb11a7a3ee 100644
+--- a/tools/perf/builtin-ftrace.c
++++ b/tools/perf/builtin-ftrace.c
+@@ -289,7 +289,7 @@ static int set_tracing_pid(struct perf_ftrace *ftrace)
+ 
+       for (i = 0; i < perf_thread_map__nr(ftrace->evlist->core.threads); i++) {
+               scnprintf(buf, sizeof(buf), "%d",
+-                        ftrace->evlist->core.threads->map[i]);
++                        perf_thread_map__pid(ftrace->evlist->core.threads, i));
+               if (append_tracing_file("set_ftrace_pid", buf) < 0)
+                       return -1;
+       }
+-- 
+2.30.2
+
diff --git a/queue-5.11/series b/queue-5.11/series

index 4fb4b4b89baa9ac17717b8ad1d0cf1e729addaa7..3262abdcd91087d2cb37ed4f8f8a31cd3257beff 100644 (file)
--- a/queue-5.11/series
+++ b/queue-5.11/series
@@ -6,3 +6,7 @@ igb-enable-rss-for-intel-i211-ethernet-controller.patch
  bpf-fix-masking-negation-logic-upon-negative-dst-register.patch
  bpf-fix-leakage-of-uninitialized-bpf-stack-under-speculation.patch
  net-qrtr-avoid-potential-use-after-free-in-mhi-send.patch
+perf-data-fix-error-return-code-in-perf_data__create.patch
+capabilities-require-cap_setfcap-to-map-uid-0.patch
+perf-ftrace-fix-access-to-pid-in-array-when-setting-.patch
+tools-cgroup-slabinfo.py-updated-to-work-on-current-.patch
diff --git a/queue-5.11/tools-cgroup-slabinfo.py-updated-to-work-on-current-.patch b/queue-5.11/tools-cgroup-slabinfo.py-updated-to-work-on-current-.patch

new file mode 100644 (file)

index 0000000..a260132
--- /dev/null
+++ b/queue-5.11/tools-cgroup-slabinfo.py-updated-to-work-on-current-.patch
@@ -0,0 +1,71 @@
+From ebf73fd21bdac2a7578ab92b04b615c6ed7e2867 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 23 Apr 2021 14:29:03 -0700
+Subject: tools/cgroup/slabinfo.py: updated to work on current kernel
+
+From: Vasily Averin <vvs@virtuozzo.com>
+
+[ Upstream commit 1974c45dd7745e999b9387be3d8fdcb27a5b1721 ]
+
+The slabinfo.py script does not work with the current kernel version.
+
+First, it was unable to recognise the SLUB subsystem, and when I specified
+it manually it failed again with
+
+  AttributeError: 'struct page' has no member 'obj_cgroups'
+
+.. and then again with
+
+  File "tools/cgroup/memcg_slabinfo.py", line 221, in main
+    memcg.kmem_caches.address_of_(),
+  AttributeError: 'struct mem_cgroup' has no member 'kmem_caches'
+
+Link: https://lkml.kernel.org/r/cec1a75e-43b4-3d64-2084-d9f98fda037f@virtuozzo.com
+Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
+Tested-by: Roman Gushchin <guro@fb.com>
+Acked-by: Roman Gushchin <guro@fb.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/cgroup/memcg_slabinfo.py | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/tools/cgroup/memcg_slabinfo.py b/tools/cgroup/memcg_slabinfo.py
+index c4225ed63565..1600b17dbb8a 100644
+--- a/tools/cgroup/memcg_slabinfo.py
++++ b/tools/cgroup/memcg_slabinfo.py
+@@ -128,9 +128,9 @@ def detect_kernel_config():
+ 
+     cfg['nr_nodes'] = prog['nr_online_nodes'].value_()
+ 
+-    if prog.type('struct kmem_cache').members[1][1] == 'flags':
++    if prog.type('struct kmem_cache').members[1].name == 'flags':
+         cfg['allocator'] = 'SLUB'
+-    elif prog.type('struct kmem_cache').members[1][1] == 'batchcount':
++    elif prog.type('struct kmem_cache').members[1].name == 'batchcount':
+         cfg['allocator'] = 'SLAB'
+     else:
+         err('Can\'t determine the slab allocator')
+@@ -193,7 +193,7 @@ def main():
+         # look over all slab pages, belonging to non-root memcgs
+         # and look for objects belonging to the given memory cgroup
+         for page in for_each_slab_page(prog):
+-            objcg_vec_raw = page.obj_cgroups.value_()
++            objcg_vec_raw = page.memcg_data.value_()
+             if objcg_vec_raw == 0:
+                 continue
+             cache = page.slab_cache
+@@ -202,7 +202,7 @@ def main():
+             addr = cache.value_()
+             caches[addr] = cache
+             # clear the lowest bit to get the true obj_cgroups
+-            objcg_vec = Object(prog, page.obj_cgroups.type_,
++            objcg_vec = Object(prog, 'struct obj_cgroup **',
+                                value=objcg_vec_raw & ~1)
+ 
+             if addr not in stats:
+-- 
+2.30.2
+
author	Sasha Levin <sashal@kernel.org>
	Tue, 4 May 2021 17:49:35 +0000 (13:49 -0400)
committer	Sasha Levin <sashal@kernel.org>
	Tue, 4 May 2021 17:50:00 +0000 (13:50 -0400)
queue-5.11/capabilities-require-cap_setfcap-to-map-uid-0.patch	[new file with mode: 0644]	patch \| blob
queue-5.11/perf-data-fix-error-return-code-in-perf_data__create.patch	[new file with mode: 0644]	patch \| blob
queue-5.11/perf-ftrace-fix-access-to-pid-in-array-when-setting-.patch	[new file with mode: 0644]	patch \| blob
queue-5.11/series		patch \| blob \| blame \| history
queue-5.11/tools-cgroup-slabinfo.py-updated-to-work-on-current-.patch	[new file with mode: 0644]	patch \| blob