--- /dev/null
+From eec4cb40e13217080a325dae738a3eaa37c4a2e7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 20 Apr 2021 08:43:34 -0500
+Subject: capabilities: require CAP_SETFCAP to map uid 0
+
+From: Serge E. Hallyn <serge@hallyn.com>
+
+[ Upstream commit db2e718a47984b9d71ed890eb2ea36ecf150de18 ]
+
+cap_setfcap is required to create file capabilities.
+
+Since commit 8db6c34f1dbc ("Introduce v3 namespaced file capabilities"),
+a process running as uid 0 but without cap_setfcap is able to work
+around this as follows: unshare a new user namespace which maps parent
+uid 0 into the child namespace.
+
+While this task will not have new capabilities against the parent
+namespace, there is a loophole due to the way namespaced file
+capabilities are represented as xattrs. File capabilities valid in
+userns 1 are distinguished from file capabilities valid in userns 2 by
+the kuid which underlies uid 0. Therefore the restricted root process
+can unshare a new self-mapping namespace, add a namespaced file
+capability onto a file, then use that file capability in the parent
+namespace.
+
+To prevent that, do not allow mapping parent uid 0 if the process which
+opened the uid_map file does not have CAP_SETFCAP, which is the
+capability for setting file capabilities.
+
+As a further wrinkle: a task can unshare its user namespace, then open
+its uid_map file itself, and map (only) its own uid. In this case we do
+not have the credential from before unshare, which was potentially more
+restricted. So, when creating a user namespace, we record whether the
+creator had CAP_SETFCAP. Then we can use that during map_write().
+
+With this patch:
+
+1. Unprivileged user can still unshare -Ur
+
+ ubuntu@caps:~$ unshare -Ur
+ root@caps:~# logout
+
+2. Root user can still unshare -Ur
+
+ ubuntu@caps:~$ sudo bash
+ root@caps:/home/ubuntu# unshare -Ur
+ root@caps:/home/ubuntu# logout
+
+3. Root user without CAP_SETFCAP cannot unshare -Ur:
+
+ root@caps:/home/ubuntu# /sbin/capsh --drop=cap_setfcap --
+ root@caps:/home/ubuntu# /sbin/setcap cap_setfcap=p /sbin/setcap
+ unable to set CAP_SETFCAP effective capability: Operation not permitted
+ root@caps:/home/ubuntu# unshare -Ur
+ unshare: write failed /proc/self/uid_map: Operation not permitted
+
+Note: an alternative solution would be to allow uid 0 mappings by
+processes without CAP_SETFCAP, but to prevent such a namespace from
+writing any file capabilities. This approach can be seen at [1].
+
+Background history: commit 95ebabde382 ("capabilities: Don't allow
+writing ambiguous v3 file capabilities") tried to fix the issue by
+preventing v3 fscaps to be written to disk when the root uid would map
+to the same uid in nested user namespaces. This led to regressions for
+various workloads. For example, see [2]. Ultimately this is a valid
+use-case we have to support, meaning we had to revert this change in
+3b0c2d3eaa83 ("Revert 95ebabde382c ("capabilities: Don't allow writing
+ambiguous v3 file capabilities")").
+
+Link: https://git.kernel.org/pub/scm/linux/kernel/git/sergeh/linux.git/log/?h=2021-04-15/setfcap-nsfscaps-v4 [1]
+Link: https://github.com/containers/buildah/issues/3071 [2]
+Signed-off-by: Serge Hallyn <serge@hallyn.com>
+Reviewed-by: Andrew G. Morgan <morgan@kernel.org>
+Tested-by: Christian Brauner <christian.brauner@ubuntu.com>
+Reviewed-by: Christian Brauner <christian.brauner@ubuntu.com>
+Tested-by: Giuseppe Scrivano <gscrivan@redhat.com>
+Cc: Eric Biederman <ebiederm@xmission.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/user_namespace.h | 3 ++
+ include/uapi/linux/capability.h | 3 +-
+ kernel/user_namespace.c | 65 +++++++++++++++++++++++++++++++--
+ 3 files changed, 67 insertions(+), 4 deletions(-)
+
+diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
+index 64cf8ebdc4ec..f6c5f784be5a 100644
+--- a/include/linux/user_namespace.h
++++ b/include/linux/user_namespace.h
+@@ -63,6 +63,9 @@ struct user_namespace {
+ kgid_t group;
+ struct ns_common ns;
+ unsigned long flags;
++ /* parent_could_setfcap: true if the creator of this ns had CAP_SETFCAP
++ * in its effective capability set at the child ns creation time. */
++ bool parent_could_setfcap;
+
+ #ifdef CONFIG_KEYS
+ /* List of joinable keyrings in this namespace. Modification access of
+diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
+index c6ca33034147..2ddb4226cd23 100644
+--- a/include/uapi/linux/capability.h
++++ b/include/uapi/linux/capability.h
+@@ -335,7 +335,8 @@ struct vfs_ns_cap_data {
+
+ #define CAP_AUDIT_CONTROL 30
+
+-/* Set or remove capabilities on files */
++/* Set or remove capabilities on files.
++ Map uid=0 into a child user namespace. */
+
+ #define CAP_SETFCAP 31
+
+diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
+index af612945a4d0..9a4b980d695b 100644
+--- a/kernel/user_namespace.c
++++ b/kernel/user_namespace.c
+@@ -106,6 +106,7 @@ int create_user_ns(struct cred *new)
+ if (!ns)
+ goto fail_dec;
+
++ ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP);
+ ret = ns_alloc_inum(&ns->ns);
+ if (ret)
+ goto fail_free;
+@@ -841,6 +842,60 @@ static int sort_idmaps(struct uid_gid_map *map)
+ return 0;
+ }
+
++/**
++ * verify_root_map() - check the uid 0 mapping
++ * @file: idmapping file
++ * @map_ns: user namespace of the target process
++ * @new_map: requested idmap
++ *
++ * If a process requests mapping parent uid 0 into the new ns, verify that the
++ * process writing the map had the CAP_SETFCAP capability as the target process
++ * will be able to write fscaps that are valid in ancestor user namespaces.
++ *
++ * Return: true if the mapping is allowed, false if not.
++ */
++static bool verify_root_map(const struct file *file,
++ struct user_namespace *map_ns,
++ struct uid_gid_map *new_map)
++{
++ int idx;
++ const struct user_namespace *file_ns = file->f_cred->user_ns;
++ struct uid_gid_extent *extent0 = NULL;
+
++ for (idx = 0; idx < new_map->nr_extents; idx++) {
++ if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
++ extent0 = &new_map->extent[idx];
++ else
++ extent0 = &new_map->forward[idx];
++ if (extent0->lower_first == 0)
++ break;
+
++ extent0 = NULL; /* lower_first != 0: keep scanning */
++ }
+
++ if (!extent0)
++ return true; /* no extent has lower_first == 0 */
+
++ if (map_ns == file_ns) {
++ /* The process unshared its ns and is writing to its own
++ * /proc/self/uid_map. User already has full capabilities in
++ * the new namespace. Verify that the parent had CAP_SETFCAP
++ * when it unshared.
++ */
++ if (!file_ns->parent_could_setfcap)
++ return false;
++ } else {
++ /* Process p1 is writing to uid_map of p2, who is in a child
++ * user namespace to p1's. Verify that the opener of the map
++ * file has CAP_SETFCAP against the parent of the new map
++ * namespace */
++ if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP))
++ return false;
++ }
+
++ return true;
++}
++
+ static ssize_t map_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos,
+ int cap_setid,
+@@ -848,7 +903,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
+ struct uid_gid_map *parent_map)
+ {
+ struct seq_file *seq = file->private_data;
+- struct user_namespace *ns = seq->private;
++ struct user_namespace *map_ns = seq->private;
+ struct uid_gid_map new_map;
+ unsigned idx;
+ struct uid_gid_extent extent;
+@@ -895,7 +950,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
+ /*
+ * Adjusting namespace settings requires capabilities on the target.
+ */
+- if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
++ if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN))
+ goto out;
+
+ /* Parse the user data */
+@@ -965,7 +1020,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
+
+ ret = -EPERM;
+ /* Validate the user is allowed to use user id's mapped to. */
+- if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
++ if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map))
+ goto out;
+
+ ret = -EPERM;
+@@ -1086,6 +1141,10 @@ static bool new_idmap_permitted(const struct file *file,
+ struct uid_gid_map *new_map)
+ {
+ const struct cred *cred = file->f_cred;
++
++ if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map))
++ return false;
++
+ /* Don't allow mappings that would allow anything that wouldn't
+ * be allowed without the establishment of unprivileged mappings.
+ */
+--
+2.30.2
+
--- /dev/null
+From bdb06c0750411a9034bca27436b6c781c912d8ab Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 15 Apr 2021 16:34:16 +0800
+Subject: perf data: Fix error return code in perf_data__create_dir()
+
+From: Zhen Lei <thunder.leizhen@huawei.com>
+
+[ Upstream commit f2211881e737cade55e0ee07cf6a26d91a35a6fe ]
+
+Although 'ret' has been initialized to -1, it will be reassigned by
+the "ret = open(...)" statement in the for loop, so the value of
+'ret' is unknown when asprintf() fails.
+
+Reported-by: Hulk Robot <hulkci@huawei.com>
+Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
+Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
+Cc: Jiri Olsa <jolsa@redhat.com>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Namhyung Kim <namhyung@kernel.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Link: http://lore.kernel.org/lkml/20210415083417.3740-1-thunder.leizhen@huawei.com
+Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/perf/util/data.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c
+index f29af4fc3d09..8fca4779ae6a 100644
+--- a/tools/perf/util/data.c
++++ b/tools/perf/util/data.c
+@@ -35,7 +35,7 @@ void perf_data__close_dir(struct perf_data *data)
+ int perf_data__create_dir(struct perf_data *data, int nr)
+ {
+ struct perf_data_file *files = NULL;
+- int i, ret = -1;
++ int i, ret;
+
+ if (WARN_ON(!data->is_dir))
+ return -EINVAL;
+@@ -51,7 +51,8 @@ int perf_data__create_dir(struct perf_data *data, int nr)
+ for (i = 0; i < nr; i++) {
+ struct perf_data_file *file = &files[i];
+
+- if (asprintf(&file->path, "%s/data.%d", data->path, i) < 0)
++ ret = asprintf(&file->path, "%s/data.%d", data->path, i);
++ if (ret < 0)
+ goto out_err;
+
+ ret = open(file->path, O_RDWR|O_CREAT|O_TRUNC, S_IRUSR|S_IWUSR);
+--
+2.30.2
+
--- /dev/null
+From fc9b3a5610018a28be46d28d5934618a6e1bffdf Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 21 Apr 2021 14:04:00 +0200
+Subject: perf ftrace: Fix access to pid in array when setting a pid filter
+
+From: Thomas Richter <tmricht@linux.ibm.com>
+
+[ Upstream commit 671b60cb6a897a5b3832fe57657152f2c3995e25 ]
+
+Command 'perf ftrace -v -- ls' fails in s390 (at least 5.12.0rc6).
+
+The root cause is a missing pointer dereference which causes an
+array element address to be used as PID.
+
+Fix this by extracting the PID.
+
+Output before:
+ # ./perf ftrace -v -- ls
+ function_graph tracer is used
+ write '-263732416' to tracing/set_ftrace_pid failed: Invalid argument
+ failed to set ftrace pid
+ #
+
+Output after:
+ ./perf ftrace -v -- ls
+ function_graph tracer is used
+ # tracer: function_graph
+ #
+ # CPU DURATION FUNCTION CALLS
+ # | | | | | | |
+ 4) | rcu_read_lock_sched_held() {
+ 4) 0.552 us | rcu_lockdep_current_cpu_online();
+ 4) 6.124 us | }
+
+Reported-by: Alexander Schmidt <alexschm@de.ibm.com>
+Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
+Acked-by: Namhyung Kim <namhyung@kernel.org>
+Cc: Heiko Carstens <hca@linux.ibm.com>
+Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
+Cc: Sven Schnelle <svens@linux.ibm.com>
+Cc: Vasily Gorbik <gor@linux.ibm.com>
+Link: http://lore.kernel.org/lkml/20210421120400.2126433-1-tmricht@linux.ibm.com
+Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/perf/builtin-ftrace.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c
+index d49448a1060c..87cb11a7a3ee 100644
+--- a/tools/perf/builtin-ftrace.c
++++ b/tools/perf/builtin-ftrace.c
+@@ -289,7 +289,7 @@ static int set_tracing_pid(struct perf_ftrace *ftrace)
+
+ for (i = 0; i < perf_thread_map__nr(ftrace->evlist->core.threads); i++) {
+ scnprintf(buf, sizeof(buf), "%d",
+- ftrace->evlist->core.threads->map[i]);
++ perf_thread_map__pid(ftrace->evlist->core.threads, i));
+ if (append_tracing_file("set_ftrace_pid", buf) < 0)
+ return -1;
+ }
+--
+2.30.2
+
--- /dev/null
+From ebf73fd21bdac2a7578ab92b04b615c6ed7e2867 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 23 Apr 2021 14:29:03 -0700
+Subject: tools/cgroup/slabinfo.py: updated to work on current kernel
+
+From: Vasily Averin <vvs@virtuozzo.com>
+
+[ Upstream commit 1974c45dd7745e999b9387be3d8fdcb27a5b1721 ]
+
+The slabinfo.py script does not work with the current kernel version.
+
+First, it was unable to recognise the SLUB subsystem, and when I specified
+it manually it failed again with
+
+ AttributeError: 'struct page' has no member 'obj_cgroups'
+
+.. and then again with
+
+ File "tools/cgroup/memcg_slabinfo.py", line 221, in main
+ memcg.kmem_caches.address_of_(),
+ AttributeError: 'struct mem_cgroup' has no member 'kmem_caches'
+
+Link: https://lkml.kernel.org/r/cec1a75e-43b4-3d64-2084-d9f98fda037f@virtuozzo.com
+Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
+Tested-by: Roman Gushchin <guro@fb.com>
+Acked-by: Roman Gushchin <guro@fb.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/cgroup/memcg_slabinfo.py | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/tools/cgroup/memcg_slabinfo.py b/tools/cgroup/memcg_slabinfo.py
+index c4225ed63565..1600b17dbb8a 100644
+--- a/tools/cgroup/memcg_slabinfo.py
++++ b/tools/cgroup/memcg_slabinfo.py
+@@ -128,9 +128,9 @@ def detect_kernel_config():
+
+ cfg['nr_nodes'] = prog['nr_online_nodes'].value_()
+
+- if prog.type('struct kmem_cache').members[1][1] == 'flags':
++ if prog.type('struct kmem_cache').members[1].name == 'flags':
+ cfg['allocator'] = 'SLUB'
+- elif prog.type('struct kmem_cache').members[1][1] == 'batchcount':
++ elif prog.type('struct kmem_cache').members[1].name == 'batchcount':
+ cfg['allocator'] = 'SLAB'
+ else:
+ err('Can\'t determine the slab allocator')
+@@ -193,7 +193,7 @@ def main():
+ # look over all slab pages, belonging to non-root memcgs
+ # and look for objects belonging to the given memory cgroup
+ for page in for_each_slab_page(prog):
+- objcg_vec_raw = page.obj_cgroups.value_()
++ objcg_vec_raw = page.memcg_data.value_()
+ if objcg_vec_raw == 0:
+ continue
+ cache = page.slab_cache
+@@ -202,7 +202,7 @@ def main():
+ addr = cache.value_()
+ caches[addr] = cache
+ # clear the lowest bit to get the true obj_cgroups
+- objcg_vec = Object(prog, page.obj_cgroups.type_,
++ objcg_vec = Object(prog, 'struct obj_cgroup **',
+ value=objcg_vec_raw & ~1)
+
+ if addr not in stats:
+--
+2.30.2
+