From 1e7d63c6f54c88c08bafe0fe1b03d3687689b373 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 29 Jul 2019 20:03:03 +0200 Subject: [PATCH] 4.19-stable patches added patches: access-avoid-the-rcu-grace-period-for-the-temporary-subjective-credentials.patch libnvdimm-bus-stop-holding-nvdimm_bus_list_mutex-over-__nd_ioctl.patch --- ...the-temporary-subjective-credentials.patch | 174 +++++++++++++++ ...vdimm_bus_list_mutex-over-__nd_ioctl.patch | 208 ++++++++++++++++++ queue-4.19/series | 2 + 3 files changed, 384 insertions(+) create mode 100644 queue-4.19/access-avoid-the-rcu-grace-period-for-the-temporary-subjective-credentials.patch create mode 100644 queue-4.19/libnvdimm-bus-stop-holding-nvdimm_bus_list_mutex-over-__nd_ioctl.patch diff --git a/queue-4.19/access-avoid-the-rcu-grace-period-for-the-temporary-subjective-credentials.patch b/queue-4.19/access-avoid-the-rcu-grace-period-for-the-temporary-subjective-credentials.patch new file mode 100644 index 00000000000..9942a55a978 --- /dev/null +++ b/queue-4.19/access-avoid-the-rcu-grace-period-for-the-temporary-subjective-credentials.patch @@ -0,0 +1,174 @@ +From d7852fbd0f0423937fa287a598bfde188bb68c22 Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Thu, 11 Jul 2019 09:54:40 -0700 +Subject: access: avoid the RCU grace period for the temporary subjective credentials + +From: Linus Torvalds + +commit d7852fbd0f0423937fa287a598bfde188bb68c22 upstream. + +It turns out that 'access()' (and 'faccessat()') can cause a lot of RCU +work because it installs a temporary credential that gets allocated and +freed for each system call. + +The allocation and freeing overhead is mostly benign, but because +credentials can be accessed under the RCU read lock, the freeing +involves a RCU grace period. 
+ +Which is not a huge deal normally, but if you have a lot of access() +calls, this causes a fair amount of secondary damage: instead of having a +nice alloc/free pattern that hits in hot per-CPU slab caches, you have +all those delayed free's, and on big machines with hundreds of cores, +the RCU overhead can end up being enormous. + +But it turns out that all of this is entirely unnecessary. Exactly +because access() only installs the credential as the thread-local +subjective credential, the temporary cred pointer doesn't actually need +to be RCU free'd at all. Once we're done using it, we can just free it +synchronously and avoid all the RCU overhead. + +So add a 'non_rcu' flag to 'struct cred', which can be set by users that +know they only use it in non-RCU context (there are other potential +users for this). We can make it a union with the rcu freeing list head +that we need for the RCU case, so this doesn't need any extra storage. + +Note that this also makes 'get_current_cred()' clear the new non_rcu +flag, in case we have filesystems that take a long-term reference to the +cred and then expect the RCU delayed freeing afterwards. It's not +entirely clear that this is required, but it makes for clear semantics: +the subjective cred remains non-RCU as long as you only access it +synchronously using the thread-local accessors, but you _can_ use it as +a generic cred if you want to. + +It is possible that we should just remove the whole RCU markings for +->cred entirely. Only ->real_cred is really supposed to be accessed +through RCU, and the long-term cred copies that nfs uses might want to +explicitly re-enable RCU freeing if required, rather than have +get_current_cred() do it implicitly. + +But this is a "minimal semantic changes" change for the immediate +problem. + +Acked-by: Peter Zijlstra (Intel) +Acked-by: Eric Dumazet +Acked-by: Paul E. 
McKenney +Cc: Oleg Nesterov +Cc: Jan Glauber +Cc: Jiri Kosina +Cc: Jayachandran Chandrasekharan Nair +Cc: Greg KH +Cc: Kees Cook +Cc: David Howells +Cc: Miklos Szeredi +Cc: Al Viro +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/open.c | 19 +++++++++++++++++++ + include/linux/cred.h | 7 ++++++- + kernel/cred.c | 21 +++++++++++++++++++-- + 3 files changed, 44 insertions(+), 3 deletions(-) + +--- a/fs/open.c ++++ b/fs/open.c +@@ -373,6 +373,25 @@ long do_faccessat(int dfd, const char __ + override_cred->cap_permitted; + } + ++ /* ++ * The new set of credentials can *only* be used in ++ * task-synchronous circumstances, and does not need ++ * RCU freeing, unless somebody then takes a separate ++ * reference to it. ++ * ++ * NOTE! This is _only_ true because this credential ++ * is used purely for override_creds() that installs ++ * it as the subjective cred. Other threads will be ++ * accessing ->real_cred, not the subjective cred. ++ * ++ * If somebody _does_ make a copy of this (using the ++ * 'get_current_cred()' function), that will clear the ++ * non_rcu field, because now that other user may be ++ * expecting RCU freeing. But normal thread-synchronous ++ * cred accesses will keep things non-RCY. ++ */ ++ override_cred->non_rcu = 1; ++ + old_cred = override_creds(override_cred); + retry: + res = user_path_at(dfd, filename, lookup_flags, &path); +--- a/include/linux/cred.h ++++ b/include/linux/cred.h +@@ -150,7 +150,11 @@ struct cred { + struct user_struct *user; /* real user ID subscription */ + struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */ + struct group_info *group_info; /* supplementary groups for euid/fsgid */ +- struct rcu_head rcu; /* RCU deletion hook */ ++ /* RCU deletion */ ++ union { ++ int non_rcu; /* Can we skip RCU deletion? 
*/ ++ struct rcu_head rcu; /* RCU deletion hook */ ++ }; + } __randomize_layout; + + extern void __put_cred(struct cred *); +@@ -248,6 +252,7 @@ static inline const struct cred *get_cre + { + struct cred *nonconst_cred = (struct cred *) cred; + validate_creds(cred); ++ nonconst_cred->non_rcu = 0; + return get_new_cred(nonconst_cred); + } + +--- a/kernel/cred.c ++++ b/kernel/cred.c +@@ -147,7 +147,10 @@ void __put_cred(struct cred *cred) + BUG_ON(cred == current->cred); + BUG_ON(cred == current->real_cred); + +- call_rcu(&cred->rcu, put_cred_rcu); ++ if (cred->non_rcu) ++ put_cred_rcu(&cred->rcu); ++ else ++ call_rcu(&cred->rcu, put_cred_rcu); + } + EXPORT_SYMBOL(__put_cred); + +@@ -258,6 +261,7 @@ struct cred *prepare_creds(void) + old = task->cred; + memcpy(new, old, sizeof(struct cred)); + ++ new->non_rcu = 0; + atomic_set(&new->usage, 1); + set_cred_subscribers(new, 0); + get_group_info(new->group_info); +@@ -537,7 +541,19 @@ const struct cred *override_creds(const + + validate_creds(old); + validate_creds(new); +- get_cred(new); ++ ++ /* ++ * NOTE! This uses 'get_new_cred()' rather than 'get_cred()'. ++ * ++ * That means that we do not clear the 'non_rcu' flag, since ++ * we are only installing the cred into the thread-synchronous ++ * '->cred' pointer, not the '->real_cred' pointer that is ++ * visible to other threads under RCU. ++ * ++ * Also note that we did validate_creds() manually, not depending ++ * on the validation in 'get_cred()'. 
++ */ ++ get_new_cred((struct cred *)new); + alter_cred_subscribers(new, 1); + rcu_assign_pointer(current->cred, new); + alter_cred_subscribers(old, -1); +@@ -620,6 +636,7 @@ struct cred *prepare_kernel_cred(struct + validate_creds(old); + + *new = *old; ++ new->non_rcu = 0; + atomic_set(&new->usage, 1); + set_cred_subscribers(new, 0); + get_uid(new->user); diff --git a/queue-4.19/libnvdimm-bus-stop-holding-nvdimm_bus_list_mutex-over-__nd_ioctl.patch b/queue-4.19/libnvdimm-bus-stop-holding-nvdimm_bus_list_mutex-over-__nd_ioctl.patch new file mode 100644 index 00000000000..5e789e17777 --- /dev/null +++ b/queue-4.19/libnvdimm-bus-stop-holding-nvdimm_bus_list_mutex-over-__nd_ioctl.patch @@ -0,0 +1,208 @@ +From b70d31d054ee3a6fc1034b9d7fc0ae1e481aa018 Mon Sep 17 00:00:00 2001 +From: Dan Williams +Date: Wed, 17 Jul 2019 18:08:15 -0700 +Subject: libnvdimm/bus: Stop holding nvdimm_bus_list_mutex over __nd_ioctl() + +From: Dan Williams + +commit b70d31d054ee3a6fc1034b9d7fc0ae1e481aa018 upstream. + +In preparation for fixing a deadlock between wait_for_bus_probe_idle() +and the nvdimm_bus_list_mutex arrange for __nd_ioctl() without +nvdimm_bus_list_mutex held. This also unifies the 'dimm' and 'bus' level +ioctls into a common nd_ioctl() preamble implementation. + +Marked for -stable as it is a pre-requisite for a follow-on fix. 
+ +Cc: +Fixes: bf9bccc14c05 ("libnvdimm: pmem label sets and namespace instantiation") +Cc: Vishal Verma +Tested-by: Jane Chu +Link: https://lore.kernel.org/r/156341209518.292348.7183897251740665198.stgit@dwillia2-desk3.amr.corp.intel.com +Signed-off-by: Dan Williams +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/nvdimm/bus.c | 94 ++++++++++++++++++++++++++++------------------- + drivers/nvdimm/nd-core.h | 3 + + 2 files changed, 59 insertions(+), 38 deletions(-) + +--- a/drivers/nvdimm/bus.c ++++ b/drivers/nvdimm/bus.c +@@ -86,7 +86,7 @@ static void nvdimm_bus_probe_end(struct + { + nvdimm_bus_lock(&nvdimm_bus->dev); + if (--nvdimm_bus->probe_active == 0) +- wake_up(&nvdimm_bus->probe_wait); ++ wake_up(&nvdimm_bus->wait); + nvdimm_bus_unlock(&nvdimm_bus->dev); + } + +@@ -348,7 +348,7 @@ struct nvdimm_bus *nvdimm_bus_register(s + return NULL; + INIT_LIST_HEAD(&nvdimm_bus->list); + INIT_LIST_HEAD(&nvdimm_bus->mapping_list); +- init_waitqueue_head(&nvdimm_bus->probe_wait); ++ init_waitqueue_head(&nvdimm_bus->wait); + nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL); + mutex_init(&nvdimm_bus->reconfig_mutex); + badrange_init(&nvdimm_bus->badrange); +@@ -418,6 +418,9 @@ static int nd_bus_remove(struct device * + list_del_init(&nvdimm_bus->list); + mutex_unlock(&nvdimm_bus_list_mutex); + ++ wait_event(nvdimm_bus->wait, ++ atomic_read(&nvdimm_bus->ioctl_active) == 0); ++ + nd_synchronize(); + device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister); + +@@ -838,7 +841,7 @@ void wait_nvdimm_bus_probe_idle(struct d + if (nvdimm_bus->probe_active == 0) + break; + nvdimm_bus_unlock(&nvdimm_bus->dev); +- wait_event(nvdimm_bus->probe_wait, ++ wait_event(nvdimm_bus->wait, + nvdimm_bus->probe_active == 0); + nvdimm_bus_lock(&nvdimm_bus->dev); + } while (true); +@@ -1068,24 +1071,10 @@ static int __nd_ioctl(struct nvdimm_bus + return rc; + } + +-static long nd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +-{ +- long id = (long) 
file->private_data; +- int rc = -ENXIO, ro; +- struct nvdimm_bus *nvdimm_bus; +- +- ro = ((file->f_flags & O_ACCMODE) == O_RDONLY); +- mutex_lock(&nvdimm_bus_list_mutex); +- list_for_each_entry(nvdimm_bus, &nvdimm_bus_list, list) { +- if (nvdimm_bus->id == id) { +- rc = __nd_ioctl(nvdimm_bus, NULL, ro, cmd, arg); +- break; +- } +- } +- mutex_unlock(&nvdimm_bus_list_mutex); +- +- return rc; +-} ++enum nd_ioctl_mode { ++ BUS_IOCTL, ++ DIMM_IOCTL, ++}; + + static int match_dimm(struct device *dev, void *data) + { +@@ -1100,31 +1089,62 @@ static int match_dimm(struct device *dev + return 0; + } + +-static long nvdimm_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ++static long nd_ioctl(struct file *file, unsigned int cmd, unsigned long arg, ++ enum nd_ioctl_mode mode) ++ + { +- int rc = -ENXIO, ro; +- struct nvdimm_bus *nvdimm_bus; ++ struct nvdimm_bus *nvdimm_bus, *found = NULL; ++ long id = (long) file->private_data; ++ struct nvdimm *nvdimm = NULL; ++ int rc, ro; + + ro = ((file->f_flags & O_ACCMODE) == O_RDONLY); + mutex_lock(&nvdimm_bus_list_mutex); + list_for_each_entry(nvdimm_bus, &nvdimm_bus_list, list) { +- struct device *dev = device_find_child(&nvdimm_bus->dev, +- file->private_data, match_dimm); +- struct nvdimm *nvdimm; ++ if (mode == DIMM_IOCTL) { ++ struct device *dev; + +- if (!dev) +- continue; ++ dev = device_find_child(&nvdimm_bus->dev, ++ file->private_data, match_dimm); ++ if (!dev) ++ continue; ++ nvdimm = to_nvdimm(dev); ++ found = nvdimm_bus; ++ } else if (nvdimm_bus->id == id) { ++ found = nvdimm_bus; ++ } + +- nvdimm = to_nvdimm(dev); +- rc = __nd_ioctl(nvdimm_bus, nvdimm, ro, cmd, arg); +- put_device(dev); +- break; ++ if (found) { ++ atomic_inc(&nvdimm_bus->ioctl_active); ++ break; ++ } + } + mutex_unlock(&nvdimm_bus_list_mutex); + ++ if (!found) ++ return -ENXIO; ++ ++ nvdimm_bus = found; ++ rc = __nd_ioctl(nvdimm_bus, nvdimm, ro, cmd, arg); ++ ++ if (nvdimm) ++ put_device(&nvdimm->dev); ++ if 
(atomic_dec_and_test(&nvdimm_bus->ioctl_active)) ++ wake_up(&nvdimm_bus->wait); ++ + return rc; + } + ++static long bus_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ return nd_ioctl(file, cmd, arg, BUS_IOCTL); ++} ++ ++static long dimm_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ return nd_ioctl(file, cmd, arg, DIMM_IOCTL); ++} ++ + static int nd_open(struct inode *inode, struct file *file) + { + long minor = iminor(inode); +@@ -1136,16 +1156,16 @@ static int nd_open(struct inode *inode, + static const struct file_operations nvdimm_bus_fops = { + .owner = THIS_MODULE, + .open = nd_open, +- .unlocked_ioctl = nd_ioctl, +- .compat_ioctl = nd_ioctl, ++ .unlocked_ioctl = bus_ioctl, ++ .compat_ioctl = bus_ioctl, + .llseek = noop_llseek, + }; + + static const struct file_operations nvdimm_fops = { + .owner = THIS_MODULE, + .open = nd_open, +- .unlocked_ioctl = nvdimm_ioctl, +- .compat_ioctl = nvdimm_ioctl, ++ .unlocked_ioctl = dimm_ioctl, ++ .compat_ioctl = dimm_ioctl, + .llseek = noop_llseek, + }; + +--- a/drivers/nvdimm/nd-core.h ++++ b/drivers/nvdimm/nd-core.h +@@ -25,10 +25,11 @@ extern int nvdimm_major; + + struct nvdimm_bus { + struct nvdimm_bus_descriptor *nd_desc; +- wait_queue_head_t probe_wait; ++ wait_queue_head_t wait; + struct list_head list; + struct device dev; + int id, probe_active; ++ atomic_t ioctl_active; + struct list_head mapping_list; + struct mutex reconfig_mutex; + struct badrange badrange; diff --git a/queue-4.19/series b/queue-4.19/series index a044e4bca6f..683e3f3361f 100644 --- a/queue-4.19/series +++ b/queue-4.19/series @@ -109,3 +109,5 @@ alsa-line6-fix-wrong-altsetting-for-line6_podhd500_1.patch alsa-hda-add-a-conexant-codec-entry-to-let-mute-led-work.patch powerpc-xive-fix-loop-exit-condition-in-xive_find_target_in_mask.patch powerpc-tm-fix-oops-on-sigreturn-on-systems-without-tm.patch +libnvdimm-bus-stop-holding-nvdimm_bus_list_mutex-over-__nd_ioctl.patch 
+access-avoid-the-rcu-grace-period-for-the-temporary-subjective-credentials.patch -- 2.47.3