git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.19-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 29 Jul 2019 18:03:03 +0000 (20:03 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 29 Jul 2019 18:03:03 +0000 (20:03 +0200)
added patches:
access-avoid-the-rcu-grace-period-for-the-temporary-subjective-credentials.patch
libnvdimm-bus-stop-holding-nvdimm_bus_list_mutex-over-__nd_ioctl.patch

queue-4.19/access-avoid-the-rcu-grace-period-for-the-temporary-subjective-credentials.patch [new file with mode: 0644]
queue-4.19/libnvdimm-bus-stop-holding-nvdimm_bus_list_mutex-over-__nd_ioctl.patch [new file with mode: 0644]
queue-4.19/series

diff --git a/queue-4.19/access-avoid-the-rcu-grace-period-for-the-temporary-subjective-credentials.patch b/queue-4.19/access-avoid-the-rcu-grace-period-for-the-temporary-subjective-credentials.patch
new file mode 100644 (file)
index 0000000..9942a55
--- /dev/null
@@ -0,0 +1,174 @@
+From d7852fbd0f0423937fa287a598bfde188bb68c22 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Thu, 11 Jul 2019 09:54:40 -0700
+Subject: access: avoid the RCU grace period for the temporary subjective credentials
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit d7852fbd0f0423937fa287a598bfde188bb68c22 upstream.
+
+It turns out that 'access()' (and 'faccessat()') can cause a lot of RCU
+work because it installs a temporary credential that gets allocated and
+freed for each system call.
+
+The allocation and freeing overhead is mostly benign, but because
+credentials can be accessed under the RCU read lock, the freeing
+involves a RCU grace period.
+
+Which is not a huge deal normally, but if you have a lot of access()
+calls, this causes a fair amount of secondary damage: instead of having a
+nice alloc/free pattern that hits in hot per-CPU slab caches, you have
+all those delayed free's, and on big machines with hundreds of cores,
+the RCU overhead can end up being enormous.
+
+But it turns out that all of this is entirely unnecessary.  Exactly
+because access() only installs the credential as the thread-local
+subjective credential, the temporary cred pointer doesn't actually need
+to be RCU free'd at all.  Once we're done using it, we can just free it
+synchronously and avoid all the RCU overhead.
+
+So add a 'non_rcu' flag to 'struct cred', which can be set by users that
+know they only use it in non-RCU context (there are other potential
+users for this).  We can make it a union with the rcu freeing list head
+that we need for the RCU case, so this doesn't need any extra storage.
+
+Note that this also makes 'get_current_cred()' clear the new non_rcu
+flag, in case we have filesystems that take a long-term reference to the
+cred and then expect the RCU delayed freeing afterwards.  It's not
+entirely clear that this is required, but it makes for clear semantics:
+the subjective cred remains non-RCU as long as you only access it
+synchronously using the thread-local accessors, but you _can_ use it as
+a generic cred if you want to.
+
+It is possible that we should just remove the whole RCU markings for
+->cred entirely.  Only ->real_cred is really supposed to be accessed
+through RCU, and the long-term cred copies that nfs uses might want to
+explicitly re-enable RCU freeing if required, rather than have
+get_current_cred() do it implicitly.
+
+But this is a "minimal semantic changes" change for the immediate
+problem.
+
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Acked-by: Eric Dumazet <edumazet@google.com>
+Acked-by: Paul E. McKenney <paulmck@linux.ibm.com>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Jan Glauber <jglauber@marvell.com>
+Cc: Jiri Kosina <jikos@kernel.org>
+Cc: Jayachandran Chandrasekharan Nair <jnair@marvell.com>
+Cc: Greg KH <greg@kroah.com>
+Cc: Kees Cook <keescook@chromium.org>
+Cc: David Howells <dhowells@redhat.com>
+Cc: Miklos Szeredi <miklos@szeredi.hu>
+Cc: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/open.c            |   19 +++++++++++++++++++
+ include/linux/cred.h |    7 ++++++-
+ kernel/cred.c        |   21 +++++++++++++++++++--
+ 3 files changed, 44 insertions(+), 3 deletions(-)
+
+--- a/fs/open.c
++++ b/fs/open.c
+@@ -373,6 +373,25 @@ long do_faccessat(int dfd, const char __
+                               override_cred->cap_permitted;
+       }
++      /*
++       * The new set of credentials can *only* be used in
++       * task-synchronous circumstances, and does not need
++       * RCU freeing, unless somebody then takes a separate
++       * reference to it.
++       *
++       * NOTE! This is _only_ true because this credential
++       * is used purely for override_creds() that installs
++       * it as the subjective cred. Other threads will be
++       * accessing ->real_cred, not the subjective cred.
++       *
++       * If somebody _does_ make a copy of this (using the
++       * 'get_current_cred()' function), that will clear the
++       * non_rcu field, because now that other user may be
++       * expecting RCU freeing. But normal thread-synchronous
++       * cred accesses will keep things non-RCU.
++       */
++      override_cred->non_rcu = 1;
++
+       old_cred = override_creds(override_cred);
+ retry:
+       res = user_path_at(dfd, filename, lookup_flags, &path);
+--- a/include/linux/cred.h
++++ b/include/linux/cred.h
+@@ -150,7 +150,11 @@ struct cred {
+       struct user_struct *user;       /* real user ID subscription */
+       struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
+       struct group_info *group_info;  /* supplementary groups for euid/fsgid */
+-      struct rcu_head rcu;            /* RCU deletion hook */
++      /* RCU deletion */
++      union {
++              int non_rcu;                    /* Can we skip RCU deletion? */
++              struct rcu_head rcu;            /* RCU deletion hook */
++      };
+ } __randomize_layout;
+ extern void __put_cred(struct cred *);
+@@ -248,6 +252,7 @@ static inline const struct cred *get_cre
+ {
+       struct cred *nonconst_cred = (struct cred *) cred;
+       validate_creds(cred);
++      nonconst_cred->non_rcu = 0;
+       return get_new_cred(nonconst_cred);
+ }
+--- a/kernel/cred.c
++++ b/kernel/cred.c
+@@ -147,7 +147,10 @@ void __put_cred(struct cred *cred)
+       BUG_ON(cred == current->cred);
+       BUG_ON(cred == current->real_cred);
+-      call_rcu(&cred->rcu, put_cred_rcu);
++      if (cred->non_rcu)
++              put_cred_rcu(&cred->rcu);
++      else
++              call_rcu(&cred->rcu, put_cred_rcu);
+ }
+ EXPORT_SYMBOL(__put_cred);
+@@ -258,6 +261,7 @@ struct cred *prepare_creds(void)
+       old = task->cred;
+       memcpy(new, old, sizeof(struct cred));
++      new->non_rcu = 0;
+       atomic_set(&new->usage, 1);
+       set_cred_subscribers(new, 0);
+       get_group_info(new->group_info);
+@@ -537,7 +541,19 @@ const struct cred *override_creds(const
+       validate_creds(old);
+       validate_creds(new);
+-      get_cred(new);
++
++      /*
++       * NOTE! This uses 'get_new_cred()' rather than 'get_cred()'.
++       *
++       * That means that we do not clear the 'non_rcu' flag, since
++       * we are only installing the cred into the thread-synchronous
++       * '->cred' pointer, not the '->real_cred' pointer that is
++       * visible to other threads under RCU.
++       *
++       * Also note that we did validate_creds() manually, not depending
++       * on the validation in 'get_cred()'.
++       */
++      get_new_cred((struct cred *)new);
+       alter_cred_subscribers(new, 1);
+       rcu_assign_pointer(current->cred, new);
+       alter_cred_subscribers(old, -1);
+@@ -620,6 +636,7 @@ struct cred *prepare_kernel_cred(struct
+       validate_creds(old);
+       *new = *old;
++      new->non_rcu = 0;
+       atomic_set(&new->usage, 1);
+       set_cred_subscribers(new, 0);
+       get_uid(new->user);
diff --git a/queue-4.19/libnvdimm-bus-stop-holding-nvdimm_bus_list_mutex-over-__nd_ioctl.patch b/queue-4.19/libnvdimm-bus-stop-holding-nvdimm_bus_list_mutex-over-__nd_ioctl.patch
new file mode 100644 (file)
index 0000000..5e789e1
--- /dev/null
@@ -0,0 +1,208 @@
+From b70d31d054ee3a6fc1034b9d7fc0ae1e481aa018 Mon Sep 17 00:00:00 2001
+From: Dan Williams <dan.j.williams@intel.com>
+Date: Wed, 17 Jul 2019 18:08:15 -0700
+Subject: libnvdimm/bus: Stop holding nvdimm_bus_list_mutex over __nd_ioctl()
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+commit b70d31d054ee3a6fc1034b9d7fc0ae1e481aa018 upstream.
+
+In preparation for fixing a deadlock between wait_nvdimm_bus_probe_idle()
+and the nvdimm_bus_list_mutex, arrange for __nd_ioctl() to be called
+without nvdimm_bus_list_mutex held. This also unifies the 'dimm' and 'bus'
+level ioctls into a common nd_ioctl() preamble implementation.
+
+Marked for -stable as it is a pre-requisite for a follow-on fix.
+
+Cc: <stable@vger.kernel.org>
+Fixes: bf9bccc14c05 ("libnvdimm: pmem label sets and namespace instantiation")
+Cc: Vishal Verma <vishal.l.verma@intel.com>
+Tested-by: Jane Chu <jane.chu@oracle.com>
+Link: https://lore.kernel.org/r/156341209518.292348.7183897251740665198.stgit@dwillia2-desk3.amr.corp.intel.com
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/nvdimm/bus.c     |   94 ++++++++++++++++++++++++++++-------------------
+ drivers/nvdimm/nd-core.h |    3 +
+ 2 files changed, 59 insertions(+), 38 deletions(-)
+
+--- a/drivers/nvdimm/bus.c
++++ b/drivers/nvdimm/bus.c
+@@ -86,7 +86,7 @@ static void nvdimm_bus_probe_end(struct
+ {
+       nvdimm_bus_lock(&nvdimm_bus->dev);
+       if (--nvdimm_bus->probe_active == 0)
+-              wake_up(&nvdimm_bus->probe_wait);
++              wake_up(&nvdimm_bus->wait);
+       nvdimm_bus_unlock(&nvdimm_bus->dev);
+ }
+@@ -348,7 +348,7 @@ struct nvdimm_bus *nvdimm_bus_register(s
+               return NULL;
+       INIT_LIST_HEAD(&nvdimm_bus->list);
+       INIT_LIST_HEAD(&nvdimm_bus->mapping_list);
+-      init_waitqueue_head(&nvdimm_bus->probe_wait);
++      init_waitqueue_head(&nvdimm_bus->wait);
+       nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
+       mutex_init(&nvdimm_bus->reconfig_mutex);
+       badrange_init(&nvdimm_bus->badrange);
+@@ -418,6 +418,9 @@ static int nd_bus_remove(struct device *
+       list_del_init(&nvdimm_bus->list);
+       mutex_unlock(&nvdimm_bus_list_mutex);
++      wait_event(nvdimm_bus->wait,
++                      atomic_read(&nvdimm_bus->ioctl_active) == 0);
++
+       nd_synchronize();
+       device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
+@@ -838,7 +841,7 @@ void wait_nvdimm_bus_probe_idle(struct d
+               if (nvdimm_bus->probe_active == 0)
+                       break;
+               nvdimm_bus_unlock(&nvdimm_bus->dev);
+-              wait_event(nvdimm_bus->probe_wait,
++              wait_event(nvdimm_bus->wait,
+                               nvdimm_bus->probe_active == 0);
+               nvdimm_bus_lock(&nvdimm_bus->dev);
+       } while (true);
+@@ -1068,24 +1071,10 @@ static int __nd_ioctl(struct nvdimm_bus
+       return rc;
+ }
+-static long nd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+-{
+-      long id = (long) file->private_data;
+-      int rc = -ENXIO, ro;
+-      struct nvdimm_bus *nvdimm_bus;
+-
+-      ro = ((file->f_flags & O_ACCMODE) == O_RDONLY);
+-      mutex_lock(&nvdimm_bus_list_mutex);
+-      list_for_each_entry(nvdimm_bus, &nvdimm_bus_list, list) {
+-              if (nvdimm_bus->id == id) {
+-                      rc = __nd_ioctl(nvdimm_bus, NULL, ro, cmd, arg);
+-                      break;
+-              }
+-      }
+-      mutex_unlock(&nvdimm_bus_list_mutex);
+-
+-      return rc;
+-}
++enum nd_ioctl_mode {
++      BUS_IOCTL,
++      DIMM_IOCTL,
++};
+ static int match_dimm(struct device *dev, void *data)
+ {
+@@ -1100,31 +1089,62 @@ static int match_dimm(struct device *dev
+       return 0;
+ }
+-static long nvdimm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
++static long nd_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
++              enum nd_ioctl_mode mode)
++
+ {
+-      int rc = -ENXIO, ro;
+-      struct nvdimm_bus *nvdimm_bus;
++      struct nvdimm_bus *nvdimm_bus, *found = NULL;
++      long id = (long) file->private_data;
++      struct nvdimm *nvdimm = NULL;
++      int rc, ro;
+       ro = ((file->f_flags & O_ACCMODE) == O_RDONLY);
+       mutex_lock(&nvdimm_bus_list_mutex);
+       list_for_each_entry(nvdimm_bus, &nvdimm_bus_list, list) {
+-              struct device *dev = device_find_child(&nvdimm_bus->dev,
+-                              file->private_data, match_dimm);
+-              struct nvdimm *nvdimm;
++              if (mode == DIMM_IOCTL) {
++                      struct device *dev;
+-              if (!dev)
+-                      continue;
++                      dev = device_find_child(&nvdimm_bus->dev,
++                                      file->private_data, match_dimm);
++                      if (!dev)
++                              continue;
++                      nvdimm = to_nvdimm(dev);
++                      found = nvdimm_bus;
++              } else if (nvdimm_bus->id == id) {
++                      found = nvdimm_bus;
++              }
+-              nvdimm = to_nvdimm(dev);
+-              rc = __nd_ioctl(nvdimm_bus, nvdimm, ro, cmd, arg);
+-              put_device(dev);
+-              break;
++              if (found) {
++                      atomic_inc(&nvdimm_bus->ioctl_active);
++                      break;
++              }
+       }
+       mutex_unlock(&nvdimm_bus_list_mutex);
++      if (!found)
++              return -ENXIO;
++
++      nvdimm_bus = found;
++      rc = __nd_ioctl(nvdimm_bus, nvdimm, ro, cmd, arg);
++
++      if (nvdimm)
++              put_device(&nvdimm->dev);
++      if (atomic_dec_and_test(&nvdimm_bus->ioctl_active))
++              wake_up(&nvdimm_bus->wait);
++
+       return rc;
+ }
++static long bus_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
++{
++      return nd_ioctl(file, cmd, arg, BUS_IOCTL);
++}
++
++static long dimm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
++{
++      return nd_ioctl(file, cmd, arg, DIMM_IOCTL);
++}
++
+ static int nd_open(struct inode *inode, struct file *file)
+ {
+       long minor = iminor(inode);
+@@ -1136,16 +1156,16 @@ static int nd_open(struct inode *inode,
+ static const struct file_operations nvdimm_bus_fops = {
+       .owner = THIS_MODULE,
+       .open = nd_open,
+-      .unlocked_ioctl = nd_ioctl,
+-      .compat_ioctl = nd_ioctl,
++      .unlocked_ioctl = bus_ioctl,
++      .compat_ioctl = bus_ioctl,
+       .llseek = noop_llseek,
+ };
+ static const struct file_operations nvdimm_fops = {
+       .owner = THIS_MODULE,
+       .open = nd_open,
+-      .unlocked_ioctl = nvdimm_ioctl,
+-      .compat_ioctl = nvdimm_ioctl,
++      .unlocked_ioctl = dimm_ioctl,
++      .compat_ioctl = dimm_ioctl,
+       .llseek = noop_llseek,
+ };
+--- a/drivers/nvdimm/nd-core.h
++++ b/drivers/nvdimm/nd-core.h
+@@ -25,10 +25,11 @@ extern int nvdimm_major;
+ struct nvdimm_bus {
+       struct nvdimm_bus_descriptor *nd_desc;
+-      wait_queue_head_t probe_wait;
++      wait_queue_head_t wait;
+       struct list_head list;
+       struct device dev;
+       int id, probe_active;
++      atomic_t ioctl_active;
+       struct list_head mapping_list;
+       struct mutex reconfig_mutex;
+       struct badrange badrange;
index a044e4bca6f3948ee706f3caa612793483ca942a..683e3f3361fe61d5bfc01070adff1a523216b593 100644 (file)
@@ -109,3 +109,5 @@ alsa-line6-fix-wrong-altsetting-for-line6_podhd500_1.patch
 alsa-hda-add-a-conexant-codec-entry-to-let-mute-led-work.patch
 powerpc-xive-fix-loop-exit-condition-in-xive_find_target_in_mask.patch
 powerpc-tm-fix-oops-on-sigreturn-on-systems-without-tm.patch
+libnvdimm-bus-stop-holding-nvdimm_bus_list_mutex-over-__nd_ioctl.patch
+access-avoid-the-rcu-grace-period-for-the-temporary-subjective-credentials.patch