5.2-stable patches

author Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Mon, 29 Jul 2019 18:03:20 +0000 (20:03 +0200)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Mon, 29 Jul 2019 18:03:20 +0000 (20:03 +0200)
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 29 Jul 2019 18:03:20 +0000 (20:03 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 29 Jul 2019 18:03:20 +0000 (20:03 +0200)
diff --git a/queue-5.2/access-avoid-the-rcu-grace-period-for-the-temporary-subjective-credentials.patch b/queue-5.2/access-avoid-the-rcu-grace-period-for-the-temporary-subjective-credentials.patch

new file mode 100644 (file)

index 0000000..b68dab3
--- /dev/null
+++ b/queue-5.2/access-avoid-the-rcu-grace-period-for-the-temporary-subjective-credentials.patch
@@ -0,0 +1,182 @@
+From d7852fbd0f0423937fa287a598bfde188bb68c22 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Thu, 11 Jul 2019 09:54:40 -0700
+Subject: access: avoid the RCU grace period for the temporary subjective credentials
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit d7852fbd0f0423937fa287a598bfde188bb68c22 upstream.
+
+It turns out that 'access()' (and 'faccessat()') can cause a lot of RCU
+work because it installs a temporary credential that gets allocated and
+freed for each system call.
+
+The allocation and freeing overhead is mostly benign, but because
+credentials can be accessed under the RCU read lock, the freeing
+involves a RCU grace period.
+
+Which is not a huge deal normally, but if you have a lot of access()
+calls, this causes a fair amount of secondary damage: instead of having
+nice alloc/free patterns that hit in hot per-CPU slab caches, you have
+all those delayed free's, and on big machines with hundreds of cores,
+the RCU overhead can end up being enormous.
+
+But it turns out that all of this is entirely unnecessary.  Exactly
+because access() only installs the credential as the thread-local
+subjective credential, the temporary cred pointer doesn't actually need
+to be RCU free'd at all.  Once we're done using it, we can just free it
+synchronously and avoid all the RCU overhead.
+
+So add a 'non_rcu' flag to 'struct cred', which can be set by users that
+know they only use it in non-RCU context (there are other potential
+users for this).  We can make it a union with the rcu freeing list head
+that we need for the RCU case, so this doesn't need any extra storage.
+
+Note that this also makes 'get_current_cred()' clear the new non_rcu
+flag, in case we have filesystems that take a long-term reference to the
+cred and then expect the RCU delayed freeing afterwards.  It's not
+entirely clear that this is required, but it makes for clear semantics:
+the subjective cred remains non-RCU as long as you only access it
+synchronously using the thread-local accessors, but you _can_ use it as
+a generic cred if you want to.
+
+It is possible that we should just remove the whole RCU markings for
+->cred entirely.  Only ->real_cred is really supposed to be accessed
+through RCU, and the long-term cred copies that nfs uses might want to
+explicitly re-enable RCU freeing if required, rather than have
+get_current_cred() do it implicitly.
+
+But this is a "minimal semantic changes" change for the immediate
+problem.
+
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Acked-by: Eric Dumazet <edumazet@google.com>
+Acked-by: Paul E. McKenney <paulmck@linux.ibm.com>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Jan Glauber <jglauber@marvell.com>
+Cc: Jiri Kosina <jikos@kernel.org>
+Cc: Jayachandran Chandrasekharan Nair <jnair@marvell.com>
+Cc: Greg KH <greg@kroah.com>
+Cc: Kees Cook <keescook@chromium.org>
+Cc: David Howells <dhowells@redhat.com>
+Cc: Miklos Szeredi <miklos@szeredi.hu>
+Cc: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/open.c            |   19 +++++++++++++++++++
+ include/linux/cred.h |    8 +++++++-
+ kernel/cred.c        |   21 +++++++++++++++++++--
+ 3 files changed, 45 insertions(+), 3 deletions(-)
+
+--- a/fs/open.c
++++ b/fs/open.c
+@@ -374,6 +374,25 @@ long do_faccessat(int dfd, const char __
+                               override_cred->cap_permitted;
+       }
+ 
++      /*
++       * The new set of credentials can *only* be used in
++       * task-synchronous circumstances, and does not need
++       * RCU freeing, unless somebody then takes a separate
++       * reference to it.
++       *
++       * NOTE! This is _only_ true because this credential
++       * is used purely for override_creds() that installs
++       * it as the subjective cred. Other threads will be
++       * accessing ->real_cred, not the subjective cred.
++       *
++       * If somebody _does_ make a copy of this (using the
++       * 'get_current_cred()' function), that will clear the
++       * non_rcu field, because now that other user may be
++       * expecting RCU freeing. But normal thread-synchronous
++       * cred accesses will keep things non-RCU.
++       */
++      override_cred->non_rcu = 1;
++
+       old_cred = override_creds(override_cred);
+ retry:
+       res = user_path_at(dfd, filename, lookup_flags, &path);
+--- a/include/linux/cred.h
++++ b/include/linux/cred.h
+@@ -145,7 +145,11 @@ struct cred {
+       struct user_struct *user;       /* real user ID subscription */
+       struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
+       struct group_info *group_info;  /* supplementary groups for euid/fsgid */
+-      struct rcu_head rcu;            /* RCU deletion hook */
++      /* RCU deletion */
++      union {
++              int non_rcu;                    /* Can we skip RCU deletion? */
++              struct rcu_head rcu;            /* RCU deletion hook */
++      };
+ } __randomize_layout;
+ 
+ extern void __put_cred(struct cred *);
+@@ -246,6 +250,7 @@ static inline const struct cred *get_cre
+       if (!cred)
+               return cred;
+       validate_creds(cred);
++      nonconst_cred->non_rcu = 0;
+       return get_new_cred(nonconst_cred);
+ }
+ 
+@@ -257,6 +262,7 @@ static inline const struct cred *get_cre
+       if (!atomic_inc_not_zero(&nonconst_cred->usage))
+               return NULL;
+       validate_creds(cred);
++      nonconst_cred->non_rcu = 0;
+       return cred;
+ }
+ 
+--- a/kernel/cred.c
++++ b/kernel/cred.c
+@@ -144,7 +144,10 @@ void __put_cred(struct cred *cred)
+       BUG_ON(cred == current->cred);
+       BUG_ON(cred == current->real_cred);
+ 
+-      call_rcu(&cred->rcu, put_cred_rcu);
++      if (cred->non_rcu)
++              put_cred_rcu(&cred->rcu);
++      else
++              call_rcu(&cred->rcu, put_cred_rcu);
+ }
+ EXPORT_SYMBOL(__put_cred);
+ 
+@@ -256,6 +259,7 @@ struct cred *prepare_creds(void)
+       old = task->cred;
+       memcpy(new, old, sizeof(struct cred));
+ 
++      new->non_rcu = 0;
+       atomic_set(&new->usage, 1);
+       set_cred_subscribers(new, 0);
+       get_group_info(new->group_info);
+@@ -535,7 +539,19 @@ const struct cred *override_creds(const
+ 
+       validate_creds(old);
+       validate_creds(new);
+-      get_cred(new);
++
++      /*
++       * NOTE! This uses 'get_new_cred()' rather than 'get_cred()'.
++       *
++       * That means that we do not clear the 'non_rcu' flag, since
++       * we are only installing the cred into the thread-synchronous
++       * '->cred' pointer, not the '->real_cred' pointer that is
++       * visible to other threads under RCU.
++       *
++       * Also note that we did validate_creds() manually, not depending
++       * on the validation in 'get_cred()'.
++       */
++      get_new_cred((struct cred *)new);
+       alter_cred_subscribers(new, 1);
+       rcu_assign_pointer(current->cred, new);
+       alter_cred_subscribers(old, -1);
+@@ -672,6 +688,7 @@ struct cred *prepare_kernel_cred(struct
+       validate_creds(old);
+ 
+       *new = *old;
++      new->non_rcu = 0;
+       atomic_set(&new->usage, 1);
+       set_cred_subscribers(new, 0);
+       get_uid(new->user);
diff --git a/queue-5.2/drivers-base-introduce-kill_device.patch b/queue-5.2/drivers-base-introduce-kill_device.patch

new file mode 100644 (file)

index 0000000..d38e826
--- /dev/null
+++ b/queue-5.2/drivers-base-introduce-kill_device.patch
@@ -0,0 +1,98 @@
+From 00289cd87676e14913d2d8492d1ce05c4baafdae Mon Sep 17 00:00:00 2001
+From: Dan Williams <dan.j.williams@intel.com>
+Date: Wed, 17 Jul 2019 18:07:53 -0700
+Subject: drivers/base: Introduce kill_device()
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+commit 00289cd87676e14913d2d8492d1ce05c4baafdae upstream.
+
+The libnvdimm subsystem arranges for devices to be destroyed as a result
+of a sysfs operation. Since device_unregister() cannot be called from
+an actively running sysfs attribute of the same device libnvdimm
+arranges for device_unregister() to be performed in an out-of-line async
+context.
+
+The driver core maintains a 'dead' state for coordinating its own racing
+async registration / de-registration requests. Rather than add local
+'dead' state tracking infrastructure to libnvdimm device objects, export
+the existing state tracking via a new kill_device() helper.
+
+The kill_device() helper simply marks the device as dead, i.e. that it
+is on its way to device_del(), or returns that the device was already
+dead. This can be used in advance of calling device_unregister() for
+subsystems like libnvdimm that might need to handle multiple user
+threads racing to delete a device.
+
+This refactoring does not change any behavior, but it is a pre-requisite
+for follow-on fixes and therefore marked for -stable.
+
+Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: "Rafael J. Wysocki" <rafael@kernel.org>
+Fixes: 4d88a97aa9e8 ("libnvdimm, nvdimm: dimm driver and base libnvdimm device-driver...")
+Cc: <stable@vger.kernel.org>
+Tested-by: Jane Chu <jane.chu@oracle.com>
+Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Link: https://lore.kernel.org/r/156341207332.292348.14959761496009347574.stgit@dwillia2-desk3.amr.corp.intel.com
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/base/core.c    |   27 +++++++++++++++++++--------
+ include/linux/device.h |    1 +
+ 2 files changed, 20 insertions(+), 8 deletions(-)
+
+--- a/drivers/base/core.c
++++ b/drivers/base/core.c
+@@ -2211,6 +2211,24 @@ void put_device(struct device *dev)
+ }
+ EXPORT_SYMBOL_GPL(put_device);
+ 
++bool kill_device(struct device *dev)
++{
++      /*
++       * Require the device lock and set the "dead" flag to guarantee that
++       * the update behavior is consistent with the other bitfields near
++       * it and that we cannot have an asynchronous probe routine trying
++       * to run while we are tearing out the bus/class/sysfs from
++       * underneath the device.
++       */
++      lockdep_assert_held(&dev->mutex);
++
++      if (dev->p->dead)
++              return false;
++      dev->p->dead = true;
++      return true;
++}
++EXPORT_SYMBOL_GPL(kill_device);
++
+ /**
+  * device_del - delete device from system.
+  * @dev: device.
+@@ -2230,15 +2248,8 @@ void device_del(struct device *dev)
+       struct kobject *glue_dir = NULL;
+       struct class_interface *class_intf;
+ 
+-      /*
+-       * Hold the device lock and set the "dead" flag to guarantee that
+-       * the update behavior is consistent with the other bitfields near
+-       * it and that we cannot have an asynchronous probe routine trying
+-       * to run while we are tearing out the bus/class/sysfs from
+-       * underneath the device.
+-       */
+       device_lock(dev);
+-      dev->p->dead = true;
++      kill_device(dev);
+       device_unlock(dev);
+ 
+       /* Notify clients of device removal.  This call must come
+--- a/include/linux/device.h
++++ b/include/linux/device.h
+@@ -1375,6 +1375,7 @@ extern int (*platform_notify_remove)(str
+  */
+ extern struct device *get_device(struct device *dev);
+ extern void put_device(struct device *dev);
++extern bool kill_device(struct device *dev);
+ 
+ #ifdef CONFIG_DEVTMPFS
+ extern int devtmpfs_create_node(struct device *dev);
diff --git a/queue-5.2/drm-i915-make-the-semaphore-saturation-mask-global.patch b/queue-5.2/drm-i915-make-the-semaphore-saturation-mask-global.patch

new file mode 100644 (file)

index 0000000..c0b774d
--- /dev/null
+++ b/queue-5.2/drm-i915-make-the-semaphore-saturation-mask-global.patch
@@ -0,0 +1,118 @@
+From 44d89409a12eb8333735958509d7d591b461d13d Mon Sep 17 00:00:00 2001
+From: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue, 18 Jun 2019 08:41:35 +0100
+Subject: drm/i915: Make the semaphore saturation mask global
+
+From: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 44d89409a12eb8333735958509d7d591b461d13d upstream.
+
+The idea behind keeping the saturation mask local to a context backfired
+spectacularly. The premise with the local mask was that we would be more
+proactive in attempting to use semaphores after each time the context
+idled, and that all new contexts would attempt to use semaphores
+ignoring the current state of the system. This turns out to be horribly
+optimistic. If the system state is still oversaturated and the existing
+workloads have all stopped using semaphores, the new workloads would
+attempt to use semaphores and be deprioritised behind real work. The
+new contexts would not switch off using semaphores until their initial
+batch of low priority work had completed. Given sufficient background load
+of equal user priority, this would completely starve the new work of any
+GPU time.
+
+To compensate, remove the local tracking in favour of keeping it as
+global state on the engine -- once the system is saturated and
+semaphores are disabled, everyone stops attempting to use semaphores
+until the system is idle again. One of the reasons for preferring local
+context tracking was that it worked with virtual engines, so for
+switching to global state we could either do a complete check of all the
+virtual siblings or simply disable semaphores for those requests. This
+takes the simpler approach of disabling semaphores on virtual engines.
+
+The downside is that the decision that the engine is saturated is a
+local measure -- we are only checking whether or not this context was
+scheduled in a timely fashion, it may be legitimately delayed due to user
+priorities. We still have the same dilemma though, that we do not want
+to employ the semaphore poll unless it will be used.
+
+v2: Explain why we need to assume the worst wrt virtual engines.
+
+Fixes: ca6e56f654e7 ("drm/i915: Disable semaphore busywaits on saturated systems")
+Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
+Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
+Cc: Dmitry Ermilov <dmitry.ermilov@intel.com>
+Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/20190618074153.16055-8-chris@chris-wilson.co.uk
+Signed-off-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/i915/i915_request.c        |    4 ++--
+ drivers/gpu/drm/i915/intel_context.c       |    1 -
+ drivers/gpu/drm/i915/intel_context_types.h |    2 --
+ drivers/gpu/drm/i915/intel_engine_cs.c     |    1 +
+ drivers/gpu/drm/i915/intel_engine_types.h  |    2 ++
+ 5 files changed, 5 insertions(+), 5 deletions(-)
+
+--- a/drivers/gpu/drm/i915/i915_request.c
++++ b/drivers/gpu/drm/i915/i915_request.c
+@@ -443,7 +443,7 @@ void __i915_request_submit(struct i915_r
+        */
+       if (request->sched.semaphores &&
+           i915_sw_fence_signaled(&request->semaphore))
+-              request->hw_context->saturated |= request->sched.semaphores;
++              engine->saturated |= request->sched.semaphores;
+ 
+       /* We may be recursing from the signal callback of another i915 fence */
+       spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
+@@ -829,7 +829,7 @@ already_busywaiting(struct i915_request
+        *
+        * See the are-we-too-late? check in __i915_request_submit().
+        */
+-      return rq->sched.semaphores | rq->hw_context->saturated;
++      return rq->sched.semaphores | rq->engine->saturated;
+ }
+ 
+ static int
+--- a/drivers/gpu/drm/i915/intel_context.c
++++ b/drivers/gpu/drm/i915/intel_context.c
+@@ -230,7 +230,6 @@ intel_context_init(struct intel_context
+       ce->gem_context = ctx;
+       ce->engine = engine;
+       ce->ops = engine->cops;
+-      ce->saturated = 0;
+ 
+       INIT_LIST_HEAD(&ce->signal_link);
+       INIT_LIST_HEAD(&ce->signals);
+--- a/drivers/gpu/drm/i915/intel_context_types.h
++++ b/drivers/gpu/drm/i915/intel_context_types.h
+@@ -59,8 +59,6 @@ struct intel_context {
+       atomic_t pin_count;
+       struct mutex pin_mutex; /* guards pinning and associated on-gpuing */
+ 
+-      intel_engine_mask_t saturated; /* submitting semaphores too late? */
+-
+       /**
+        * active_tracker: Active tracker for the external rq activity
+        * on this intel_context object.
+--- a/drivers/gpu/drm/i915/intel_engine_cs.c
++++ b/drivers/gpu/drm/i915/intel_engine_cs.c
+@@ -1200,6 +1200,7 @@ void intel_engines_park(struct drm_i915_
+ 
+               i915_gem_batch_pool_fini(&engine->batch_pool);
+               engine->execlists.no_priolist = false;
++              engine->saturated = 0;
+       }
+ 
+       i915->gt.active_engines = 0;
+--- a/drivers/gpu/drm/i915/intel_engine_types.h
++++ b/drivers/gpu/drm/i915/intel_engine_types.h
+@@ -285,6 +285,8 @@ struct intel_engine_cs {
+       struct intel_context *kernel_context; /* pinned */
+       struct intel_context *preempt_context; /* pinned; optional */
+ 
++      intel_engine_mask_t saturated; /* submitting semaphores too late? */
++
+       struct drm_i915_gem_object *default_state;
+       void *pinned_default_state;
+ 
diff --git a/queue-5.2/libnvdimm-bus-prevent-duplicate-device_unregister-calls.patch b/queue-5.2/libnvdimm-bus-prevent-duplicate-device_unregister-calls.patch

new file mode 100644 (file)

index 0000000..ba7f689
--- /dev/null
+++ b/queue-5.2/libnvdimm-bus-prevent-duplicate-device_unregister-calls.patch
@@ -0,0 +1,93 @@
+From 8aac0e2338916e273ccbd438a2b7a1e8c61749f5 Mon Sep 17 00:00:00 2001
+From: Dan Williams <dan.j.williams@intel.com>
+Date: Wed, 17 Jul 2019 18:07:58 -0700
+Subject: libnvdimm/bus: Prevent duplicate device_unregister() calls
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+commit 8aac0e2338916e273ccbd438a2b7a1e8c61749f5 upstream.
+
+A multithreaded namespace creation/destruction stress test currently
+fails with signatures like the following:
+
+    sysfs group 'power' not found for kobject 'dax1.1'
+    RIP: 0010:sysfs_remove_group+0x76/0x80
+    Call Trace:
+     device_del+0x73/0x370
+     device_unregister+0x16/0x50
+     nd_async_device_unregister+0x1e/0x30 [libnvdimm]
+     async_run_entry_fn+0x39/0x160
+     process_one_work+0x23c/0x5e0
+     worker_thread+0x3c/0x390
+
+    BUG: kernel NULL pointer dereference, address: 0000000000000020
+    RIP: 0010:klist_put+0x1b/0x6c
+    Call Trace:
+     klist_del+0xe/0x10
+     device_del+0x8a/0x2c9
+     ? __switch_to_asm+0x34/0x70
+     ? __switch_to_asm+0x40/0x70
+     device_unregister+0x44/0x4f
+     nd_async_device_unregister+0x22/0x2d [libnvdimm]
+     async_run_entry_fn+0x47/0x15a
+     process_one_work+0x1a2/0x2eb
+     worker_thread+0x1b8/0x26e
+
+Use the kill_device() helper to atomically resolve the race of multiple
+threads issuing kill, device_unregister(), requests.
+
+Reported-by: Jane Chu <jane.chu@oracle.com>
+Reported-by: Erwin Tsaur <erwin.tsaur@oracle.com>
+Fixes: 4d88a97aa9e8 ("libnvdimm, nvdimm: dimm driver and base libnvdimm device-driver...")
+Cc: <stable@vger.kernel.org>
+Link: https://github.com/pmem/ndctl/issues/96
+Tested-by: Jane Chu <jane.chu@oracle.com>
+Link: https://lore.kernel.org/r/156341207846.292348.10435719262819764054.stgit@dwillia2-desk3.amr.corp.intel.com
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/nvdimm/bus.c |   25 +++++++++++++++++++++++++
+ 1 file changed, 25 insertions(+)
+
+--- a/drivers/nvdimm/bus.c
++++ b/drivers/nvdimm/bus.c
+@@ -547,13 +547,38 @@ EXPORT_SYMBOL(nd_device_register);
+ 
+ void nd_device_unregister(struct device *dev, enum nd_async_mode mode)
+ {
++      bool killed;
++
+       switch (mode) {
+       case ND_ASYNC:
++              /*
++               * In the async case this is being triggered with the
++               * device lock held and the unregistration work needs to
++               * be moved out of line iff this thread has won the
++               * race to schedule the deletion.
++               */
++              if (!kill_device(dev))
++                      return;
++
+               get_device(dev);
+               async_schedule_domain(nd_async_device_unregister, dev,
+                               &nd_async_domain);
+               break;
+       case ND_SYNC:
++              /*
++               * In the sync case the device is being unregistered due
++               * to a state change of the parent. Claim the kill state
++               * to synchronize against other unregistration requests,
++               * or otherwise let the async path handle it if the
++               * unregistration was already queued.
++               */
++              device_lock(dev);
++              killed = kill_device(dev);
++              device_unlock(dev);
++
++              if (!killed)
++                      return;
++
+               nd_synchronize();
+               device_unregister(dev);
+               break;
diff --git a/queue-5.2/libnvdimm-bus-stop-holding-nvdimm_bus_list_mutex-over-__nd_ioctl.patch b/queue-5.2/libnvdimm-bus-stop-holding-nvdimm_bus_list_mutex-over-__nd_ioctl.patch

new file mode 100644 (file)

index 0000000..d3d949d
--- /dev/null
+++ b/queue-5.2/libnvdimm-bus-stop-holding-nvdimm_bus_list_mutex-over-__nd_ioctl.patch
@@ -0,0 +1,208 @@
+From b70d31d054ee3a6fc1034b9d7fc0ae1e481aa018 Mon Sep 17 00:00:00 2001
+From: Dan Williams <dan.j.williams@intel.com>
+Date: Wed, 17 Jul 2019 18:08:15 -0700
+Subject: libnvdimm/bus: Stop holding nvdimm_bus_list_mutex over __nd_ioctl()
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+commit b70d31d054ee3a6fc1034b9d7fc0ae1e481aa018 upstream.
+
+In preparation for fixing a deadlock between wait_for_bus_probe_idle()
+and the nvdimm_bus_list_mutex, arrange for __nd_ioctl() to run without
+nvdimm_bus_list_mutex held. This also unifies the 'dimm' and 'bus' level
+ioctls into a common nd_ioctl() preamble implementation.
+
+Marked for -stable as it is a pre-requisite for a follow-on fix.
+
+Cc: <stable@vger.kernel.org>
+Fixes: bf9bccc14c05 ("libnvdimm: pmem label sets and namespace instantiation")
+Cc: Vishal Verma <vishal.l.verma@intel.com>
+Tested-by: Jane Chu <jane.chu@oracle.com>
+Link: https://lore.kernel.org/r/156341209518.292348.7183897251740665198.stgit@dwillia2-desk3.amr.corp.intel.com
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/nvdimm/bus.c     |   94 ++++++++++++++++++++++++++++-------------------
+ drivers/nvdimm/nd-core.h |    3 +
+ 2 files changed, 59 insertions(+), 38 deletions(-)
+
+--- a/drivers/nvdimm/bus.c
++++ b/drivers/nvdimm/bus.c
+@@ -73,7 +73,7 @@ static void nvdimm_bus_probe_end(struct
+ {
+       nvdimm_bus_lock(&nvdimm_bus->dev);
+       if (--nvdimm_bus->probe_active == 0)
+-              wake_up(&nvdimm_bus->probe_wait);
++              wake_up(&nvdimm_bus->wait);
+       nvdimm_bus_unlock(&nvdimm_bus->dev);
+ }
+ 
+@@ -341,7 +341,7 @@ struct nvdimm_bus *nvdimm_bus_register(s
+               return NULL;
+       INIT_LIST_HEAD(&nvdimm_bus->list);
+       INIT_LIST_HEAD(&nvdimm_bus->mapping_list);
+-      init_waitqueue_head(&nvdimm_bus->probe_wait);
++      init_waitqueue_head(&nvdimm_bus->wait);
+       nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
+       if (nvdimm_bus->id < 0) {
+               kfree(nvdimm_bus);
+@@ -426,6 +426,9 @@ static int nd_bus_remove(struct device *
+       list_del_init(&nvdimm_bus->list);
+       mutex_unlock(&nvdimm_bus_list_mutex);
+ 
++      wait_event(nvdimm_bus->wait,
++                      atomic_read(&nvdimm_bus->ioctl_active) == 0);
++
+       nd_synchronize();
+       device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
+ 
+@@ -885,7 +888,7 @@ void wait_nvdimm_bus_probe_idle(struct d
+               if (nvdimm_bus->probe_active == 0)
+                       break;
+               nvdimm_bus_unlock(&nvdimm_bus->dev);
+-              wait_event(nvdimm_bus->probe_wait,
++              wait_event(nvdimm_bus->wait,
+                               nvdimm_bus->probe_active == 0);
+               nvdimm_bus_lock(&nvdimm_bus->dev);
+       } while (true);
+@@ -1115,24 +1118,10 @@ static int __nd_ioctl(struct nvdimm_bus
+       return rc;
+ }
+ 
+-static long nd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+-{
+-      long id = (long) file->private_data;
+-      int rc = -ENXIO, ro;
+-      struct nvdimm_bus *nvdimm_bus;
+-
+-      ro = ((file->f_flags & O_ACCMODE) == O_RDONLY);
+-      mutex_lock(&nvdimm_bus_list_mutex);
+-      list_for_each_entry(nvdimm_bus, &nvdimm_bus_list, list) {
+-              if (nvdimm_bus->id == id) {
+-                      rc = __nd_ioctl(nvdimm_bus, NULL, ro, cmd, arg);
+-                      break;
+-              }
+-      }
+-      mutex_unlock(&nvdimm_bus_list_mutex);
+-
+-      return rc;
+-}
++enum nd_ioctl_mode {
++      BUS_IOCTL,
++      DIMM_IOCTL,
++};
+ 
+ static int match_dimm(struct device *dev, void *data)
+ {
+@@ -1147,31 +1136,62 @@ static int match_dimm(struct device *dev
+       return 0;
+ }
+ 
+-static long nvdimm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
++static long nd_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
++              enum nd_ioctl_mode mode)
++
+ {
+-      int rc = -ENXIO, ro;
+-      struct nvdimm_bus *nvdimm_bus;
++      struct nvdimm_bus *nvdimm_bus, *found = NULL;
++      long id = (long) file->private_data;
++      struct nvdimm *nvdimm = NULL;
++      int rc, ro;
+ 
+       ro = ((file->f_flags & O_ACCMODE) == O_RDONLY);
+       mutex_lock(&nvdimm_bus_list_mutex);
+       list_for_each_entry(nvdimm_bus, &nvdimm_bus_list, list) {
+-              struct device *dev = device_find_child(&nvdimm_bus->dev,
+-                              file->private_data, match_dimm);
+-              struct nvdimm *nvdimm;
++              if (mode == DIMM_IOCTL) {
++                      struct device *dev;
+ 
+-              if (!dev)
+-                      continue;
++                      dev = device_find_child(&nvdimm_bus->dev,
++                                      file->private_data, match_dimm);
++                      if (!dev)
++                              continue;
++                      nvdimm = to_nvdimm(dev);
++                      found = nvdimm_bus;
++              } else if (nvdimm_bus->id == id) {
++                      found = nvdimm_bus;
++              }
+ 
+-              nvdimm = to_nvdimm(dev);
+-              rc = __nd_ioctl(nvdimm_bus, nvdimm, ro, cmd, arg);
+-              put_device(dev);
+-              break;
++              if (found) {
++                      atomic_inc(&nvdimm_bus->ioctl_active);
++                      break;
++              }
+       }
+       mutex_unlock(&nvdimm_bus_list_mutex);
+ 
++      if (!found)
++              return -ENXIO;
++
++      nvdimm_bus = found;
++      rc = __nd_ioctl(nvdimm_bus, nvdimm, ro, cmd, arg);
++
++      if (nvdimm)
++              put_device(&nvdimm->dev);
++      if (atomic_dec_and_test(&nvdimm_bus->ioctl_active))
++              wake_up(&nvdimm_bus->wait);
++
+       return rc;
+ }
+ 
++static long bus_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
++{
++      return nd_ioctl(file, cmd, arg, BUS_IOCTL);
++}
++
++static long dimm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
++{
++      return nd_ioctl(file, cmd, arg, DIMM_IOCTL);
++}
++
+ static int nd_open(struct inode *inode, struct file *file)
+ {
+       long minor = iminor(inode);
+@@ -1183,16 +1203,16 @@ static int nd_open(struct inode *inode,
+ static const struct file_operations nvdimm_bus_fops = {
+       .owner = THIS_MODULE,
+       .open = nd_open,
+-      .unlocked_ioctl = nd_ioctl,
+-      .compat_ioctl = nd_ioctl,
++      .unlocked_ioctl = bus_ioctl,
++      .compat_ioctl = bus_ioctl,
+       .llseek = noop_llseek,
+ };
+ 
+ static const struct file_operations nvdimm_fops = {
+       .owner = THIS_MODULE,
+       .open = nd_open,
+-      .unlocked_ioctl = nvdimm_ioctl,
+-      .compat_ioctl = nvdimm_ioctl,
++      .unlocked_ioctl = dimm_ioctl,
++      .compat_ioctl = dimm_ioctl,
+       .llseek = noop_llseek,
+ };
+ 
+--- a/drivers/nvdimm/nd-core.h
++++ b/drivers/nvdimm/nd-core.h
+@@ -17,10 +17,11 @@ extern struct workqueue_struct *nvdimm_w
+ 
+ struct nvdimm_bus {
+       struct nvdimm_bus_descriptor *nd_desc;
+-      wait_queue_head_t probe_wait;
++      wait_queue_head_t wait;
+       struct list_head list;
+       struct device dev;
+       int id, probe_active;
++      atomic_t ioctl_active;
+       struct list_head mapping_list;
+       struct mutex reconfig_mutex;
+       struct badrange badrange;
diff --git a/queue-5.2/libnvdimm-region-register-badblocks-before-namespaces.patch b/queue-5.2/libnvdimm-region-register-badblocks-before-namespaces.patch

new file mode 100644 (file)

index 0000000..9c55bd0
--- /dev/null
+++ b/queue-5.2/libnvdimm-region-register-badblocks-before-namespaces.patch
@@ -0,0 +1,91 @@
+From 700cd033a82d466ad8f9615f9985525e45f8960a Mon Sep 17 00:00:00 2001
+From: Dan Williams <dan.j.williams@intel.com>
+Date: Wed, 17 Jul 2019 18:08:03 -0700
+Subject: libnvdimm/region: Register badblocks before namespaces
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+commit 700cd033a82d466ad8f9615f9985525e45f8960a upstream.
+
+Namespace activation expects to be able to reference region badblocks.
+The following warning sometimes triggers when asynchronous namespace
+activation races in front of the completion of namespace probing. Move
+all possible namespace probing after region badblocks initialization.
+
+Otherwise, lockdep sometimes catches the uninitialized state of the
+badblocks seqlock with stack trace signatures like:
+
+    INFO: trying to register non-static key.
+    pmem2: detected capacity change from 0 to 136365211648
+    the code is fine but needs lockdep annotation.
+    turning off the locking correctness validator.
+    CPU: 9 PID: 358 Comm: kworker/u80:5 Tainted: G           OE     5.2.0-rc4+ #3382
+    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015
+    Workqueue: events_unbound async_run_entry_fn
+    Call Trace:
+     dump_stack+0x85/0xc0
+    pmem1.12: detected capacity change from 0 to 8589934592
+     register_lock_class+0x56a/0x570
+     ? check_object+0x140/0x270
+     __lock_acquire+0x80/0x1710
+     ? __mutex_lock+0x39d/0x910
+     lock_acquire+0x9e/0x180
+     ? nd_pfn_validate+0x28f/0x440 [libnvdimm]
+     badblocks_check+0x93/0x1f0
+     ? nd_pfn_validate+0x28f/0x440 [libnvdimm]
+     nd_pfn_validate+0x28f/0x440 [libnvdimm]
+     ? lockdep_hardirqs_on+0xf0/0x180
+     nd_dax_probe+0x9a/0x120 [libnvdimm]
+     nd_pmem_probe+0x6d/0x180 [nd_pmem]
+     nvdimm_bus_probe+0x90/0x2c0 [libnvdimm]
+
+Fixes: 48af2f7e52f4 ("libnvdimm, pfn: during init, clear errors...")
+Cc: <stable@vger.kernel.org>
+Cc: Vishal Verma <vishal.l.verma@intel.com>
+Reviewed-by: Vishal Verma <vishal.l.verma@intel.com>
+Link: https://lore.kernel.org/r/156341208365.292348.1547528796026249120.stgit@dwillia2-desk3.amr.corp.intel.com
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/nvdimm/region.c |   22 +++++++++++-----------
+ 1 file changed, 11 insertions(+), 11 deletions(-)
+
+--- a/drivers/nvdimm/region.c
++++ b/drivers/nvdimm/region.c
+@@ -34,17 +34,6 @@ static int nd_region_probe(struct device
+       if (rc)
+               return rc;
+ 
+-      rc = nd_region_register_namespaces(nd_region, &err);
+-      if (rc < 0)
+-              return rc;
+-
+-      ndrd = dev_get_drvdata(dev);
+-      ndrd->ns_active = rc;
+-      ndrd->ns_count = rc + err;
+-
+-      if (rc && err && rc == err)
+-              return -ENODEV;
+-
+       if (is_nd_pmem(&nd_region->dev)) {
+               struct resource ndr_res;
+ 
+@@ -60,6 +49,17 @@ static int nd_region_probe(struct device
+               nvdimm_badblocks_populate(nd_region, &nd_region->bb, &ndr_res);
+       }
+ 
++      rc = nd_region_register_namespaces(nd_region, &err);
++      if (rc < 0)
++              return rc;
++
++      ndrd = dev_get_drvdata(dev);
++      ndrd->ns_active = rc;
++      ndrd->ns_count = rc + err;
++
++      if (rc && err && rc == err)
++              return -ENODEV;
++
+       nd_region->btt_seed = nd_btt_create(nd_region);
+       nd_region->pfn_seed = nd_pfn_create(nd_region);
+       nd_region->dax_seed = nd_dax_create(nd_region);
diff --git a/queue-5.2/series b/queue-5.2/series

index 19c8e4ee5b5e8723e473b9dde80f93c9e30a070c..9d582aae1573a4259912ad76fc24211144d08d87 100644 (file)
--- a/queue-5.2/series
+++ b/queue-5.2/series
@@ -202,3 +202,10 @@ io_uring-fix-the-sequence-comparison-in-io_sequence_defer.patch
  iommu-vt-d-don-t-queue_iova-if-there-is-no-flush-queue.patch
  iommu-iova-remove-stale-cached32_node.patch
  iommu-iova-fix-compilation-error-with-config_iommu_iova.patch
+drivers-base-introduce-kill_device.patch
+libnvdimm-bus-prevent-duplicate-device_unregister-calls.patch
+libnvdimm-region-register-badblocks-before-namespaces.patch
+libnvdimm-bus-stop-holding-nvdimm_bus_list_mutex-over-__nd_ioctl.patch
+structleak-disable-structleak_byref-in-combination-with-kasan_stack.patch
+drm-i915-make-the-semaphore-saturation-mask-global.patch
+access-avoid-the-rcu-grace-period-for-the-temporary-subjective-credentials.patch
diff --git a/queue-5.2/structleak-disable-structleak_byref-in-combination-with-kasan_stack.patch b/queue-5.2/structleak-disable-structleak_byref-in-combination-with-kasan_stack.patch

new file mode 100644 (file)

index 0000000..4d8f75b
--- /dev/null
+++ b/queue-5.2/structleak-disable-structleak_byref-in-combination-with-kasan_stack.patch
@@ -0,0 +1,90 @@
+From 173e6ee21e2b3f477f07548a79c43b8d9cfbb37d Mon Sep 17 00:00:00 2001
+From: Arnd Bergmann <arnd@arndb.de>
+Date: Mon, 22 Jul 2019 13:41:20 +0200
+Subject: structleak: disable STRUCTLEAK_BYREF in combination with KASAN_STACK
+
+From: Arnd Bergmann <arnd@arndb.de>
+
+commit 173e6ee21e2b3f477f07548a79c43b8d9cfbb37d upstream.
+
+The combination of KASAN_STACK and GCC_PLUGIN_STRUCTLEAK_BYREF
+leads to much larger kernel stack usage, as seen from the warnings
+about functions that now exceed the 2048 byte limit:
+
+drivers/media/i2c/tvp5150.c:253:1: error: the frame size of 3936 bytes is larger than 2048 bytes
+drivers/media/tuners/r820t.c:1327:1: error: the frame size of 2816 bytes is larger than 2048 bytes
+drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_n.c:16552:1: error: the frame size of 3144 bytes is larger than 2048 bytes [-Werror=frame-larger-than=]
+fs/ocfs2/aops.c:1892:1: error: the frame size of 2088 bytes is larger than 2048 bytes
+fs/ocfs2/dlm/dlmrecovery.c:737:1: error: the frame size of 2088 bytes is larger than 2048 bytes
+fs/ocfs2/namei.c:1677:1: error: the frame size of 2584 bytes is larger than 2048 bytes
+fs/ocfs2/super.c:1186:1: error: the frame size of 2640 bytes is larger than 2048 bytes
+fs/ocfs2/xattr.c:3678:1: error: the frame size of 2176 bytes is larger than 2048 bytes
+net/bluetooth/l2cap_core.c:7056:1: error: the frame size of 2144 bytes is larger than 2048 bytes [-Werror=frame-larger-than=]
+net/bluetooth/l2cap_core.c: In function 'l2cap_recv_frame':
+net/bridge/br_netlink.c:1505:1: error: the frame size of 2448 bytes is larger than 2048 bytes
+net/ieee802154/nl802154.c:548:1: error: the frame size of 2232 bytes is larger than 2048 bytes
+net/wireless/nl80211.c:1726:1: error: the frame size of 2224 bytes is larger than 2048 bytes
+net/wireless/nl80211.c:2357:1: error: the frame size of 4584 bytes is larger than 2048 bytes
+net/wireless/nl80211.c:5108:1: error: the frame size of 2760 bytes is larger than 2048 bytes
+net/wireless/nl80211.c:6472:1: error: the frame size of 2112 bytes is larger than 2048 bytes
+
+The structleak plugin was previously disabled for CONFIG_COMPILE_TEST,
+but that meant we missed some bugs, so this time we should address them.
+
+The frame size warnings are distracting, and risking a kernel stack
+overflow is generally not beneficial to performance, so it may be best
+to disallow that particular combination. This can be done by turning
+off either one. I picked the dependency in GCC_PLUGIN_STRUCTLEAK_BYREF
+and GCC_PLUGIN_STRUCTLEAK_BYREF_ALL, as this option is designed to
+make uninitialized stack usage less harmful when enabled on its own,
+but it also prevents KASAN from detecting those cases in which it was
+in fact needed.
+
+KASAN_STACK is currently implied by KASAN on gcc, but could be made a
+user selectable option if we want to allow combining (non-stack) KASAN
+with GCC_PLUGIN_STRUCTLEAK_BYREF.
+
+Note that it would be possible to specifically address the files that
+print the warning, but presumably the overall stack usage is still
+significantly higher than in other configurations, so this would not
+address the full problem.
+
+I could not test this with CONFIG_INIT_STACK_ALL, which may or may not
+suffer from a similar problem.
+
+Fixes: 81a56f6dcd20 ("gcc-plugins: structleak: Generalize to all variable types")
+Signed-off-by: Arnd Bergmann <arnd@arndb.de>
+Link: https://lore.kernel.org/r/20190722114134.3123901-1-arnd@arndb.de
+Signed-off-by: Kees Cook <keescook@chromium.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ security/Kconfig.hardening |    7 +++++++
+ 1 file changed, 7 insertions(+)
+
+--- a/security/Kconfig.hardening
++++ b/security/Kconfig.hardening
+@@ -61,6 +61,7 @@ choice
+       config GCC_PLUGIN_STRUCTLEAK_BYREF
+               bool "zero-init structs passed by reference (strong)"
+               depends on GCC_PLUGINS
++              depends on !(KASAN && KASAN_STACK=1)
+               select GCC_PLUGIN_STRUCTLEAK
+               help
+                 Zero-initialize any structures on the stack that may
+@@ -70,9 +71,15 @@ choice
+                 exposures, like CVE-2017-1000410:
+                 https://git.kernel.org/linus/06e7e776ca4d3654
+ 
++                As a side-effect, this keeps a lot of variables on the
++                stack that can otherwise be optimized out, so combining
++                this with CONFIG_KASAN_STACK can lead to a stack overflow
++                and is disallowed.
++
+       config GCC_PLUGIN_STRUCTLEAK_BYREF_ALL
+               bool "zero-init anything passed by reference (very strong)"
+               depends on GCC_PLUGINS
++              depends on !(KASAN && KASAN_STACK=1)
+               select GCC_PLUGIN_STRUCTLEAK
+               help
+                 Zero-initialize any stack variables that may be passed
author	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Mon, 29 Jul 2019 18:03:20 +0000 (20:03 +0200)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Mon, 29 Jul 2019 18:03:20 +0000 (20:03 +0200)
queue-5.2/access-avoid-the-rcu-grace-period-for-the-temporary-subjective-credentials.patch	[new file with mode: 0644]	patch \| blob
queue-5.2/drivers-base-introduce-kill_device.patch	[new file with mode: 0644]	patch \| blob
queue-5.2/drm-i915-make-the-semaphore-saturation-mask-global.patch	[new file with mode: 0644]	patch \| blob
queue-5.2/libnvdimm-bus-prevent-duplicate-device_unregister-calls.patch	[new file with mode: 0644]	patch \| blob
queue-5.2/libnvdimm-bus-stop-holding-nvdimm_bus_list_mutex-over-__nd_ioctl.patch	[new file with mode: 0644]	patch \| blob
queue-5.2/libnvdimm-region-register-badblocks-before-namespaces.patch	[new file with mode: 0644]	patch \| blob
queue-5.2/series		patch \| blob \| blame \| history
queue-5.2/structleak-disable-structleak_byref-in-combination-with-kasan_stack.patch	[new file with mode: 0644]	patch \| blob