--- /dev/null
+From cd84a62e0078dce09f4ed349bec84f86c9d54b30 Mon Sep 17 00:00:00 2001
+From: Bart Van Assche <bvanassche@acm.org>
+Date: Wed, 26 Sep 2018 14:01:04 -0700
+Subject: block, scsi: Change the preempt-only flag into a counter
+
+From: Bart Van Assche <bvanassche@acm.org>
+
+commit cd84a62e0078dce09f4ed349bec84f86c9d54b30 upstream.
+
+The RQF_PREEMPT flag is used for three purposes:
+- In the SCSI core, for making sure that power management requests
+ are executed even if a device is in the "quiesced" state.
+- For domain validation by SCSI drivers that use the parallel port.
+- In the IDE driver, for IDE preempt requests.
+Rename "preempt-only" into "pm-only" because the primary purpose of
+this mode is power management. Since the power management core may
+but does not have to resume a runtime suspended device before
+performing system-wide suspend and since a later patch will set
+"pm-only" mode as long as a block device is runtime suspended, make
+it possible to set "pm-only" mode from more than one context. Since
+with this change scsi_device_quiesce() is no longer idempotent, make
+that function return early if it is called for a quiesced queue.
+
+Signed-off-by: Bart Van Assche <bvanassche@acm.org>
+Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
+Reviewed-by: Hannes Reinecke <hare@suse.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Ming Lei <ming.lei@redhat.com>
+Cc: Jianchao Wang <jianchao.w.wang@oracle.com>
+Cc: Johannes Thumshirn <jthumshirn@suse.de>
+Cc: Alan Stern <stern@rowland.harvard.edu>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ block/blk-core.c | 35 ++++++++++++++++++-----------------
+ block/blk-mq-debugfs.c | 10 +++++++++-
+ drivers/scsi/scsi_lib.c | 11 +++++++----
+ include/linux/blkdev.h | 14 +++++++++-----
+ 4 files changed, 43 insertions(+), 27 deletions(-)
+
+--- a/block/blk-core.c
++++ b/block/blk-core.c
+@@ -421,24 +421,25 @@ void blk_sync_queue(struct request_queue
+ EXPORT_SYMBOL(blk_sync_queue);
+
+ /**
+- * blk_set_preempt_only - set QUEUE_FLAG_PREEMPT_ONLY
++ * blk_set_pm_only - increment pm_only counter
+ * @q: request queue pointer
+- *
+- * Returns the previous value of the PREEMPT_ONLY flag - 0 if the flag was not
+- * set and 1 if the flag was already set.
+ */
+-int blk_set_preempt_only(struct request_queue *q)
++void blk_set_pm_only(struct request_queue *q)
+ {
+- return blk_queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q);
++ atomic_inc(&q->pm_only);
+ }
+-EXPORT_SYMBOL_GPL(blk_set_preempt_only);
++EXPORT_SYMBOL_GPL(blk_set_pm_only);
+
+-void blk_clear_preempt_only(struct request_queue *q)
++void blk_clear_pm_only(struct request_queue *q)
+ {
+- blk_queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q);
+- wake_up_all(&q->mq_freeze_wq);
++ int pm_only;
++
++ pm_only = atomic_dec_return(&q->pm_only);
++ WARN_ON_ONCE(pm_only < 0);
++ if (pm_only == 0)
++ wake_up_all(&q->mq_freeze_wq);
+ }
+-EXPORT_SYMBOL_GPL(blk_clear_preempt_only);
++EXPORT_SYMBOL_GPL(blk_clear_pm_only);
+
+ /**
+ * __blk_run_queue_uncond - run a queue whether or not it has been stopped
+@@ -916,7 +917,7 @@ EXPORT_SYMBOL(blk_alloc_queue);
+ */
+ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
+ {
+- const bool preempt = flags & BLK_MQ_REQ_PREEMPT;
++ const bool pm = flags & BLK_MQ_REQ_PREEMPT;
+
+ while (true) {
+ bool success = false;
+@@ -924,11 +925,11 @@ int blk_queue_enter(struct request_queue
+ rcu_read_lock();
+ if (percpu_ref_tryget_live(&q->q_usage_counter)) {
+ /*
+- * The code that sets the PREEMPT_ONLY flag is
+- * responsible for ensuring that that flag is globally
+- * visible before the queue is unfrozen.
++ * The code that increments the pm_only counter is
++ * responsible for ensuring that that counter is
++ * globally visible before the queue is unfrozen.
+ */
+- if (preempt || !blk_queue_preempt_only(q)) {
++ if (pm || !blk_queue_pm_only(q)) {
+ success = true;
+ } else {
+ percpu_ref_put(&q->q_usage_counter);
+@@ -953,7 +954,7 @@ int blk_queue_enter(struct request_queue
+
+ wait_event(q->mq_freeze_wq,
+ (atomic_read(&q->mq_freeze_depth) == 0 &&
+- (preempt || !blk_queue_preempt_only(q))) ||
++ (pm || !blk_queue_pm_only(q))) ||
+ blk_queue_dying(q));
+ if (blk_queue_dying(q))
+ return -ENODEV;
+--- a/block/blk-mq-debugfs.c
++++ b/block/blk-mq-debugfs.c
+@@ -102,6 +102,14 @@ static int blk_flags_show(struct seq_fil
+ return 0;
+ }
+
++static int queue_pm_only_show(void *data, struct seq_file *m)
++{
++ struct request_queue *q = data;
++
++ seq_printf(m, "%d\n", atomic_read(&q->pm_only));
++ return 0;
++}
++
+ #define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name
+ static const char *const blk_queue_flag_name[] = {
+ QUEUE_FLAG_NAME(QUEUED),
+@@ -132,7 +140,6 @@ static const char *const blk_queue_flag_
+ QUEUE_FLAG_NAME(REGISTERED),
+ QUEUE_FLAG_NAME(SCSI_PASSTHROUGH),
+ QUEUE_FLAG_NAME(QUIESCED),
+- QUEUE_FLAG_NAME(PREEMPT_ONLY),
+ };
+ #undef QUEUE_FLAG_NAME
+
+@@ -209,6 +216,7 @@ static ssize_t queue_write_hint_store(vo
+ static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
+ { "poll_stat", 0400, queue_poll_stat_show },
+ { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops },
++ { "pm_only", 0600, queue_pm_only_show, NULL },
+ { "state", 0600, queue_state_show, queue_state_write },
+ { "write_hints", 0600, queue_write_hint_show, queue_write_hint_store },
+ { "zone_wlock", 0400, queue_zone_wlock_show, NULL },
+--- a/drivers/scsi/scsi_lib.c
++++ b/drivers/scsi/scsi_lib.c
+@@ -3059,11 +3059,14 @@ scsi_device_quiesce(struct scsi_device *
+ */
+ WARN_ON_ONCE(sdev->quiesced_by && sdev->quiesced_by != current);
+
+- blk_set_preempt_only(q);
++ if (sdev->quiesced_by == current)
++ return 0;
++
++ blk_set_pm_only(q);
+
+ blk_mq_freeze_queue(q);
+ /*
+- * Ensure that the effect of blk_set_preempt_only() will be visible
++ * Ensure that the effect of blk_set_pm_only() will be visible
+ * for percpu_ref_tryget() callers that occur after the queue
+ * unfreeze even if the queue was already frozen before this function
+ * was called. See also https://lwn.net/Articles/573497/.
+@@ -3076,7 +3079,7 @@ scsi_device_quiesce(struct scsi_device *
+ if (err == 0)
+ sdev->quiesced_by = current;
+ else
+- blk_clear_preempt_only(q);
++ blk_clear_pm_only(q);
+ mutex_unlock(&sdev->state_mutex);
+
+ return err;
+@@ -3100,7 +3103,7 @@ void scsi_device_resume(struct scsi_devi
+ */
+ mutex_lock(&sdev->state_mutex);
+ sdev->quiesced_by = NULL;
+- blk_clear_preempt_only(sdev->request_queue);
++ blk_clear_pm_only(sdev->request_queue);
+ if (sdev->sdev_state == SDEV_QUIESCE)
+ scsi_device_set_state(sdev, SDEV_RUNNING);
+ mutex_unlock(&sdev->state_mutex);
+--- a/include/linux/blkdev.h
++++ b/include/linux/blkdev.h
+@@ -504,6 +504,12 @@ struct request_queue {
+ * various queue flags, see QUEUE_* below
+ */
+ unsigned long queue_flags;
++ /*
++ * Number of contexts that have called blk_set_pm_only(). If this
++ * counter is above zero then only RQF_PM and RQF_PREEMPT requests are
++ * processed.
++ */
++ atomic_t pm_only;
+
+ /*
+ * ida allocated id for this queue. Used to index queues from
+@@ -698,7 +704,6 @@ struct request_queue {
+ #define QUEUE_FLAG_REGISTERED 26 /* queue has been registered to a disk */
+ #define QUEUE_FLAG_SCSI_PASSTHROUGH 27 /* queue supports SCSI commands */
+ #define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */
+-#define QUEUE_FLAG_PREEMPT_ONLY 29 /* only process REQ_PREEMPT requests */
+
+ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
+ (1 << QUEUE_FLAG_SAME_COMP) | \
+@@ -736,12 +741,11 @@ bool blk_queue_flag_test_and_clear(unsig
+ ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
+ REQ_FAILFAST_DRIVER))
+ #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags)
+-#define blk_queue_preempt_only(q) \
+- test_bit(QUEUE_FLAG_PREEMPT_ONLY, &(q)->queue_flags)
++#define blk_queue_pm_only(q) atomic_read(&(q)->pm_only)
+ #define blk_queue_fua(q) test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags)
+
+-extern int blk_set_preempt_only(struct request_queue *q);
+-extern void blk_clear_preempt_only(struct request_queue *q);
++extern void blk_set_pm_only(struct request_queue *q);
++extern void blk_clear_pm_only(struct request_queue *q);
+
+ static inline int queue_in_flight(struct request_queue *q)
+ {
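
For readers skimming the queue, a rough userspace model of the new gating (not kernel code; all names below are made up for illustration) shows why a counter is needed where the old boolean flag was not enough: two independent contexts can hold "pm-only" mode at the same time, and normal I/O is only let back in once the last one drops it.

/* Simplified model of the pm_only counter, using C11 atomics instead of
 * the kernel's atomic_t. Build with: cc -std=c11 pm_only_demo.c */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_queue {
	atomic_int pm_only;	/* number of contexts that set pm-only mode */
};

static void demo_set_pm_only(struct demo_queue *q)
{
	atomic_fetch_add(&q->pm_only, 1);
}

static void demo_clear_pm_only(struct demo_queue *q)
{
	int pm_only = atomic_fetch_sub(&q->pm_only, 1) - 1;

	if (pm_only == 0)
		printf("last clearer: would wake up waiters here\n");
}

/* Mirrors the check in blk_queue_enter(): PM requests always pass,
 * everything else passes only while no context holds pm-only mode. */
static bool demo_queue_enter(struct demo_queue *q, bool pm)
{
	return pm || atomic_load(&q->pm_only) == 0;
}

int main(void)
{
	struct demo_queue q = { .pm_only = 0 };

	demo_set_pm_only(&q);	/* e.g. runtime PM */
	demo_set_pm_only(&q);	/* e.g. scsi_device_quiesce() */

	printf("normal I/O allowed: %d\n", demo_queue_enter(&q, false)); /* 0 */
	printf("PM request allowed: %d\n", demo_queue_enter(&q, true));  /* 1 */

	demo_clear_pm_only(&q);
	demo_clear_pm_only(&q);	/* counter hits zero, normal I/O resumes */
	printf("normal I/O allowed: %d\n", demo_queue_enter(&q, false)); /* 1 */
	return 0;
}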
--- /dev/null
+From d6e47819721ae2d9d090058ad5570a66f3c42e39 Mon Sep 17 00:00:00 2001
+From: "Yan, Zheng" <zyan@redhat.com>
+Date: Thu, 23 May 2019 11:01:37 +0800
+Subject: ceph: hold i_ceph_lock when removing caps for freeing inode
+
+From: Yan, Zheng <zyan@redhat.com>
+
+commit d6e47819721ae2d9d090058ad5570a66f3c42e39 upstream.
+
+ceph_d_revalidate(..., LOOKUP_RCU) may call __ceph_caps_issued_mask()
+on a freeing inode.
+
+Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
+Reviewed-by: Jeff Layton <jlayton@redhat.com>
+Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ceph/caps.c | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+--- a/fs/ceph/caps.c
++++ b/fs/ceph/caps.c
+@@ -1237,20 +1237,23 @@ static int send_cap_msg(struct cap_msg_a
+ }
+
+ /*
+- * Queue cap releases when an inode is dropped from our cache. Since
+- * inode is about to be destroyed, there is no need for i_ceph_lock.
++ * Queue cap releases when an inode is dropped from our cache.
+ */
+ void ceph_queue_caps_release(struct inode *inode)
+ {
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct rb_node *p;
+
++ /* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU)
++ * may call __ceph_caps_issued_mask() on a freeing inode. */
++ spin_lock(&ci->i_ceph_lock);
+ p = rb_first(&ci->i_caps);
+ while (p) {
+ struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
+ p = rb_next(p);
+ __ceph_remove_cap(cap, true);
+ }
++ spin_unlock(&ci->i_ceph_lock);
+ }
+
+ /*
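
The fix above follows a standard teardown rule: even a path that believes it is the last user of an object must take the same lock its concurrent readers take. A minimal userspace analogy, with a pthread mutex and a linked list standing in for i_ceph_lock and the caps rbtree (all names invented):

/* Userspace analogy of the ceph fix. Build with: cc -pthread caps_demo.c */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct cap {
	int issued;
	struct cap *next;
};

struct demo_inode {
	pthread_mutex_t lock;	/* stands in for ci->i_ceph_lock */
	struct cap *caps;	/* stands in for ci->i_caps */
};

/* Reader path (the role of __ceph_caps_issued_mask()): walk the caps
 * under the lock. */
static int demo_caps_issued(struct demo_inode *ci)
{
	int mask = 0;

	pthread_mutex_lock(&ci->lock);
	for (struct cap *c = ci->caps; c; c = c->next)
		mask |= c->issued;
	pthread_mutex_unlock(&ci->lock);
	return mask;
}

/* Teardown path (the role of ceph_queue_caps_release()): before the fix
 * this ran unlocked; taking the lock closes the race with the reader. */
static void demo_release_caps(struct demo_inode *ci)
{
	pthread_mutex_lock(&ci->lock);
	while (ci->caps) {
		struct cap *c = ci->caps;

		ci->caps = c->next;
		free(c);
	}
	pthread_mutex_unlock(&ci->lock);
}

int main(void)
{
	struct demo_inode ci = { .caps = NULL };
	struct cap *c = calloc(1, sizeof(*c));

	pthread_mutex_init(&ci.lock, NULL);
	c->issued = 0x5;
	ci.caps = c;

	printf("issued mask: %#x\n", demo_caps_issued(&ci));
	demo_release_caps(&ci);
	printf("after release: %#x\n", demo_caps_issued(&ci));
	return 0;
}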
--- /dev/null
+From 5515e9a6273b8c02034466bcbd717ac9f53dab99 Mon Sep 17 00:00:00 2001
+From: Miroslav Lichvar <mlichvar@redhat.com>
+Date: Tue, 16 Jul 2019 16:30:09 -0700
+Subject: drivers/pps/pps.c: clear offset flags in PPS_SETPARAMS ioctl
+
+From: Miroslav Lichvar <mlichvar@redhat.com>
+
+commit 5515e9a6273b8c02034466bcbd717ac9f53dab99 upstream.
+
+The PPS assert/clear offset corrections are set by the PPS_SETPARAMS
+ioctl in the pps_ktime structs, which also contain flags. The flags are
+not initialized by applications (using the timepps.h header) and they
+are not used by the kernel for anything except returning them back in
+the PPS_GETPARAMS ioctl.
+
+Set the flags to zero to make it clear they are unused and to avoid leaking
+uninitialized data from the PPS_SETPARAMS caller to other applications
+that have read access to the PPS device.
+
+Link: http://lkml.kernel.org/r/20190702092251.24303-1-mlichvar@redhat.com
+Signed-off-by: Miroslav Lichvar <mlichvar@redhat.com>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-by: Rodolfo Giometti <giometti@enneenne.com>
+Cc: Greg KH <greg@kroah.com>
+Cc: Dan Carpenter <dan.carpenter@oracle.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/pps/pps.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/drivers/pps/pps.c
++++ b/drivers/pps/pps.c
+@@ -166,6 +166,14 @@ static long pps_cdev_ioctl(struct file *
+ pps->params.mode |= PPS_CANWAIT;
+ pps->params.api_version = PPS_API_VERS;
+
++ /*
++ * Clear unused fields of pps_kparams to avoid leaking
++ * uninitialized data of the PPS_SETPARAMS caller via
++ * PPS_GETPARAMS
++ */
++ pps->params.assert_off_tu.flags = 0;
++ pps->params.clear_off_tu.flags = 0;
++
+ spin_unlock_irq(&pps->lock);
+
+ break;
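
Conceptually the change is the usual "clear what you do not consume" rule for structures that are copied in from one caller and later echoed back to other readers, as PPS_GETPARAMS does. A small standalone sketch of that idea (illustrative structures only, not the real PPS API):

/* Sketch: a service stores caller-supplied parameters and hands them
 * back to other readers; fields it never uses are cleared instead of
 * being stored verbatim, so stale caller data cannot leak. */
#include <stdio.h>
#include <string.h>

struct demo_ktime {
	long sec;
	long nsec;
	unsigned int flags;	/* set by callers, never used by the service */
};

struct demo_params {
	int mode;
	struct demo_ktime assert_off;
	struct demo_ktime clear_off;
};

static struct demo_params stored;	/* what a GETPARAMS-style call returns */

static void demo_setparams(const struct demo_params *user)
{
	stored = *user;			/* copy_from_user() equivalent */

	/* The fix in miniature: clear the unused fields instead of keeping
	 * whatever uninitialized bytes the caller happened to pass in. */
	stored.assert_off.flags = 0;
	stored.clear_off.flags = 0;
}

int main(void)
{
	struct demo_params p;

	memset(&p, 0xAA, sizeof(p));	/* simulate uninitialized stack data */
	p.mode = 1;
	p.assert_off.sec = 0;
	p.assert_off.nsec = 500;

	demo_setparams(&p);
	printf("flags seen by other readers: %u %u\n",
	       stored.assert_off.flags, stored.clear_off.flags);
	return 0;
}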
--- /dev/null
+From 1b496469d0c020e09124e03e66a81421c21272a7 Mon Sep 17 00:00:00 2001
+From: Yoshinori Sato <ysato@users.sourceforge.jp>
+Date: Sun, 21 Apr 2019 22:53:58 +0900
+Subject: Fix allyesconfig output.
+
+From: Yoshinori Sato <ysato@users.sourceforge.jp>
+
+commit 1b496469d0c020e09124e03e66a81421c21272a7 upstream.
+
+The JCore-SoC and SolutionEngine 7619 configurations conflict with each
+other.
+
+Signed-off-by: Yoshinori Sato <ysato@users.sourceforge.jp>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/sh/boards/Kconfig | 14 +++-----------
+ 1 file changed, 3 insertions(+), 11 deletions(-)
+
+--- a/arch/sh/boards/Kconfig
++++ b/arch/sh/boards/Kconfig
+@@ -8,27 +8,19 @@ config SH_ALPHA_BOARD
+ bool
+
+ config SH_DEVICE_TREE
+- bool "Board Described by Device Tree"
++ bool
+ select OF
+ select OF_EARLY_FLATTREE
+ select TIMER_OF
+ select COMMON_CLK
+ select GENERIC_CALIBRATE_DELAY
+- help
+- Select Board Described by Device Tree to build a kernel that
+- does not hard-code any board-specific knowledge but instead uses
+- a device tree blob provided by the boot-loader. You must enable
+- drivers for any hardware you want to use separately. At this
+- time, only boards based on the open-hardware J-Core processors
+- have sufficient driver coverage to use this option; do not
+- select it if you are using original SuperH hardware.
+
+ config SH_JCORE_SOC
+ bool "J-Core SoC"
+- depends on SH_DEVICE_TREE && (CPU_SH2 || CPU_J2)
++ select SH_DEVICE_TREE
+ select CLKSRC_JCORE_PIT
+ select JCORE_AIC
+- default y if CPU_J2
++ depends on CPU_J2
+ help
+ Select this option to include drivers core components of the
+ J-Core SoC, including interrupt controllers and timers.
--- /dev/null
+From d26d0cd97c88eb1a5704b42e41ab443406807810 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sat, 13 Jul 2019 14:27:14 -0700
+Subject: /proc/<pid>/cmdline: add back the setproctitle() special case
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit d26d0cd97c88eb1a5704b42e41ab443406807810 upstream.
+
+This makes the setproctitle() special case very explicit indeed, and
+handles it with a separate helper function entirely. In the process, it
+re-instates the original semantics of simply stopping at the first NUL
+character when the original last NUL character is no longer there.
+
+[ The original semantics can still be seen in mm/util.c: get_cmdline()
+ that is limited to a fixed-size buffer ]
+
+This makes the logic about when we use the string lengths etc much more
+obvious, and makes it easier to see what we do and what the two very
+different cases are.
+
+Note that even when we allow walking past the end of the argument array
+(because setproctitle() might have overwritten and overflowed the
+original argv[] strings), we only allow it to overflow into the
+environment region, and only if that region is immediately adjacent.
+
+[ Fixed for missing 'count' checks noted by Alexey Izbyshev ]
+
+Link: https://lore.kernel.org/lkml/alpine.LNX.2.21.1904052326230.3249@kich.toxcorp.com/
+Fixes: 5ab827189965 ("fs/proc: simplify and clarify get_mm_cmdline() function")
+Cc: Jakub Jankowski <shasta@toxcorp.com>
+Cc: Alexey Dobriyan <adobriyan@gmail.com>
+Cc: Alexey Izbyshev <izbyshev@ispras.ru>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/proc/base.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
+ 1 file changed, 77 insertions(+), 4 deletions(-)
+
+--- a/fs/proc/base.c
++++ b/fs/proc/base.c
+@@ -205,12 +205,53 @@ static int proc_root_link(struct dentry
+ return result;
+ }
+
++/*
++ * If the user used setproctitle(), we just get the string from
++ * user space at arg_start, and limit it to a maximum of one page.
++ */
++static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf,
++ size_t count, unsigned long pos,
++ unsigned long arg_start)
++{
++ char *page;
++ int ret, got;
++
++ if (pos >= PAGE_SIZE)
++ return 0;
++
++ page = (char *)__get_free_page(GFP_KERNEL);
++ if (!page)
++ return -ENOMEM;
++
++ ret = 0;
++ got = access_remote_vm(mm, arg_start, page, PAGE_SIZE, FOLL_ANON);
++ if (got > 0) {
++ int len = strnlen(page, got);
++
++ /* Include the NUL character if it was found */
++ if (len < got)
++ len++;
++
++ if (len > pos) {
++ len -= pos;
++ if (len > count)
++ len = count;
++ len -= copy_to_user(buf, page+pos, len);
++ if (!len)
++ len = -EFAULT;
++ ret = len;
++ }
++ }
++ free_page((unsigned long)page);
++ return ret;
++}
++
+ static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf,
+ size_t count, loff_t *ppos)
+ {
+- unsigned long arg_start, arg_end;
++ unsigned long arg_start, arg_end, env_start, env_end;
+ unsigned long pos, len;
+- char *page;
++ char *page, c;
+
+ /* Check if process spawned far enough to have cmdline. */
+ if (!mm->env_end)
+@@ -219,14 +260,46 @@ static ssize_t get_mm_cmdline(struct mm_
+ spin_lock(&mm->arg_lock);
+ arg_start = mm->arg_start;
+ arg_end = mm->arg_end;
++ env_start = mm->env_start;
++ env_end = mm->env_end;
+ spin_unlock(&mm->arg_lock);
+
+ if (arg_start >= arg_end)
+ return 0;
+
++ /*
++ * We allow setproctitle() to overwrite the argument
++ * strings, and overflow past the original end. But
++ * only when it overflows into the environment area.
++ */
++ if (env_start != arg_end || env_end < env_start)
++ env_start = env_end = arg_end;
++ len = env_end - arg_start;
++
+ /* We're not going to care if "*ppos" has high bits set */
+- /* .. but we do check the result is in the proper range */
+- pos = arg_start + *ppos;
++ pos = *ppos;
++ if (pos >= len)
++ return 0;
++ if (count > len - pos)
++ count = len - pos;
++ if (!count)
++ return 0;
++
++ /*
++ * Magical special case: if the argv[] end byte is not
++ * zero, the user has overwritten it with setproctitle(3).
++ *
++ * Possible future enhancement: do this only once when
++ * pos is 0, and set a flag in the 'struct file'.
++ */
++ if (access_remote_vm(mm, arg_end-1, &c, 1, FOLL_ANON) == 1 && c)
++ return get_mm_proctitle(mm, buf, count, pos, arg_start);
++
++ /*
++ * For the non-setproctitle() case we limit things strictly
++ * to the [arg_start, arg_end[ range.
++ */
++ pos += arg_start;
+ if (pos < arg_start || pos >= arg_end)
+ return 0;
+ if (count > arg_end - pos)
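
The heart of the new get_mm_proctitle() helper is its strnlen()-based trim: read at most a page, stop at the first NUL, and keep that NUL if it was found. A standalone sketch of just that length calculation (plain userspace C; the buffer contents are made up):

/* Given 'got' bytes read from the argument area, report how many bytes
 * form the title: stop at the first NUL and include it when present,
 * mirroring the strnlen() logic in get_mm_proctitle(). */
#include <assert.h>
#include <stdio.h>
#include <string.h>

static size_t proctitle_len(const char *page, size_t got)
{
	size_t len = strnlen(page, got);

	if (len < got)		/* a NUL was found inside the buffer ... */
		len++;		/* ... keep it, like the real helper does */
	return len;
}

int main(void)
{
	char with_nul[] = { 'f', 'o', 'o', '\0', 'x', 'x' };
	char no_nul[]   = { 'b', 'a', 'r', 'b', 'a', 'z' };

	assert(proctitle_len(with_nul, sizeof(with_nul)) == 4);
	assert(proctitle_len(no_nul, sizeof(no_nul)) == 6);
	printf("ok\n");
	return 0;
}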
--- /dev/null
+From 3d712546d8ba9f25cdf080d79f90482aa4231ed4 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sat, 13 Jul 2019 13:40:13 -0700
+Subject: /proc/<pid>/cmdline: remove all the special cases
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit 3d712546d8ba9f25cdf080d79f90482aa4231ed4 upstream.
+
+Start off with a clean slate that only reads exactly from arg_start to
+arg_end, without any oddities. This simplifies the code and in the
+process removes the case that caused us to potentially leak an
+uninitialized byte from the temporary kernel buffer.
+
+Note that in order to start from scratch with an understandable base,
+this simplifies things _too_ much, and removes all the legacy logic to
+handle setproctitle() having changed the argument strings.
+
+We'll add back those special cases very differently in the next commit.
+
+Link: https://lore.kernel.org/lkml/20190712160913.17727-1-izbyshev@ispras.ru/
+Fixes: f5b65348fd77 ("proc: fix missing final NUL in get_mm_cmdline() rewrite")
+Cc: Alexey Izbyshev <izbyshev@ispras.ru>
+Cc: Alexey Dobriyan <adobriyan@gmail.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/proc/base.c | 71 ++++++---------------------------------------------------
+ 1 file changed, 8 insertions(+), 63 deletions(-)
+
+--- a/fs/proc/base.c
++++ b/fs/proc/base.c
+@@ -208,7 +208,7 @@ static int proc_root_link(struct dentry
+ static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf,
+ size_t count, loff_t *ppos)
+ {
+- unsigned long arg_start, arg_end, env_start, env_end;
++ unsigned long arg_start, arg_end;
+ unsigned long pos, len;
+ char *page;
+
+@@ -219,36 +219,18 @@ static ssize_t get_mm_cmdline(struct mm_
+ spin_lock(&mm->arg_lock);
+ arg_start = mm->arg_start;
+ arg_end = mm->arg_end;
+- env_start = mm->env_start;
+- env_end = mm->env_end;
+ spin_unlock(&mm->arg_lock);
+
+ if (arg_start >= arg_end)
+ return 0;
+
+- /*
+- * We have traditionally allowed the user to re-write
+- * the argument strings and overflow the end result
+- * into the environment section. But only do that if
+- * the environment area is contiguous to the arguments.
+- */
+- if (env_start != arg_end || env_start >= env_end)
+- env_start = env_end = arg_end;
+-
+- /* .. and limit it to a maximum of one page of slop */
+- if (env_end >= arg_end + PAGE_SIZE)
+- env_end = arg_end + PAGE_SIZE - 1;
+-
+ /* We're not going to care if "*ppos" has high bits set */
+- pos = arg_start + *ppos;
+-
+ /* .. but we do check the result is in the proper range */
+- if (pos < arg_start || pos >= env_end)
++ pos = arg_start + *ppos;
++ if (pos < arg_start || pos >= arg_end)
+ return 0;
+-
+- /* .. and we never go past env_end */
+- if (env_end - pos < count)
+- count = env_end - pos;
++ if (count > arg_end - pos)
++ count = arg_end - pos;
+
+ page = (char *)__get_free_page(GFP_KERNEL);
+ if (!page)
+@@ -258,48 +240,11 @@ static ssize_t get_mm_cmdline(struct mm_
+ while (count) {
+ int got;
+ size_t size = min_t(size_t, PAGE_SIZE, count);
+- long offset;
+-
+- /*
+- * Are we already starting past the official end?
+- * We always include the last byte that is *supposed*
+- * to be NUL
+- */
+- offset = (pos >= arg_end) ? pos - arg_end + 1 : 0;
+
+- got = access_remote_vm(mm, pos - offset, page, size + offset, FOLL_ANON);
+- if (got <= offset)
++ got = access_remote_vm(mm, pos, page, size, FOLL_ANON);
++ if (got <= 0)
+ break;
+- got -= offset;
+-
+- /* Don't walk past a NUL character once you hit arg_end */
+- if (pos + got >= arg_end) {
+- int n = 0;
+-
+- /*
+- * If we started before 'arg_end' but ended up
+- * at or after it, we start the NUL character
+- * check at arg_end-1 (where we expect the normal
+- * EOF to be).
+- *
+- * NOTE! This is smaller than 'got', because
+- * pos + got >= arg_end
+- */
+- if (pos < arg_end)
+- n = arg_end - pos - 1;
+-
+- /* Cut off at first NUL after 'n' */
+- got = n + strnlen(page+n, offset+got-n);
+- if (got < offset)
+- break;
+- got -= offset;
+-
+- /* Include the NUL if it existed */
+- if (got < size)
+- got++;
+- }
+-
+- got -= copy_to_user(buf, page+offset, got);
++ got -= copy_to_user(buf, page, got);
+ if (unlikely(!got)) {
+ if (!len)
+ len = -EFAULT;
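
For context, the user-visible contract both cmdline patches preserve is simple: /proc/<pid>/cmdline is a sequence of NUL-terminated argument strings. A small self-contained reader that prints its own arguments (ordinary userspace C, independent of the kernel internals above):

/* Print this process's own arguments by parsing /proc/self/cmdline,
 * which is a sequence of NUL-terminated strings. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	FILE *f = fopen("/proc/self/cmdline", "r");
	char buf[4096];
	size_t n, start = 0;

	if (!f) {
		perror("fopen");
		return EXIT_FAILURE;
	}
	n = fread(buf, 1, sizeof(buf), f);
	fclose(f);

	for (size_t i = 0; i < n; i++) {
		if (buf[i] == '\0') {		/* end of one argument */
			printf("arg: %s\n", buf + start);
			start = i + 1;
		}
	}
	return 0;
}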
--- /dev/null
+From 16d51a590a8ce3befb1308e0e7ab77f3b661af33 Mon Sep 17 00:00:00 2001
+From: Jann Horn <jannh@google.com>
+Date: Tue, 16 Jul 2019 17:20:45 +0200
+Subject: sched/fair: Don't free p->numa_faults with concurrent readers
+
+From: Jann Horn <jannh@google.com>
+
+commit 16d51a590a8ce3befb1308e0e7ab77f3b661af33 upstream.
+
+When going through execve(), zero out the NUMA fault statistics instead of
+freeing them.
+
+During execve, the task is reachable through procfs and the scheduler. A
+concurrent /proc/*/sched reader can read data from a freed ->numa_faults
+allocation (confirmed by KASAN) and write it back to userspace.
+I believe that it would also be possible for a use-after-free read to occur
+through a race between a NUMA fault and execve(): task_numa_fault() can
+lead to task_numa_compare(), which invokes task_weight() on the currently
+running task of a different CPU.
+
+Another way to fix this would be to make ->numa_faults RCU-managed or add
+extra locking, but it seems easier to wipe the NUMA fault statistics on
+execve.
+
+Signed-off-by: Jann Horn <jannh@google.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Petr Mladek <pmladek@suse.com>
+Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Will Deacon <will@kernel.org>
+Fixes: 82727018b0d3 ("sched/numa: Call task_numa_free() from do_execve()")
+Link: https://lkml.kernel.org/r/20190716152047.14424-1-jannh@google.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/exec.c | 2 +-
+ include/linux/sched/numa_balancing.h | 4 ++--
+ kernel/fork.c | 2 +-
+ kernel/sched/fair.c | 24 ++++++++++++++++++++----
+ 4 files changed, 24 insertions(+), 8 deletions(-)
+
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -1826,7 +1826,7 @@ static int __do_execve_file(int fd, stru
+ membarrier_execve(current);
+ rseq_execve(current);
+ acct_update_integrals(current);
+- task_numa_free(current);
++ task_numa_free(current, false);
+ free_bprm(bprm);
+ kfree(pathbuf);
+ if (filename)
+--- a/include/linux/sched/numa_balancing.h
++++ b/include/linux/sched/numa_balancing.h
+@@ -19,7 +19,7 @@
+ extern void task_numa_fault(int last_node, int node, int pages, int flags);
+ extern pid_t task_numa_group_id(struct task_struct *p);
+ extern void set_numabalancing_state(bool enabled);
+-extern void task_numa_free(struct task_struct *p);
++extern void task_numa_free(struct task_struct *p, bool final);
+ extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
+ int src_nid, int dst_cpu);
+ #else
+@@ -34,7 +34,7 @@ static inline pid_t task_numa_group_id(s
+ static inline void set_numabalancing_state(bool enabled)
+ {
+ }
+-static inline void task_numa_free(struct task_struct *p)
++static inline void task_numa_free(struct task_struct *p, bool final)
+ {
+ }
+ static inline bool should_numa_migrate_memory(struct task_struct *p,
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -679,7 +679,7 @@ void __put_task_struct(struct task_struc
+ WARN_ON(tsk == current);
+
+ cgroup_free(tsk);
+- task_numa_free(tsk);
++ task_numa_free(tsk, true);
+ security_task_free(tsk);
+ exit_creds(tsk);
+ delayacct_tsk_free(tsk);
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -2345,13 +2345,23 @@ no_join:
+ return;
+ }
+
+-void task_numa_free(struct task_struct *p)
++/*
++ * Get rid of NUMA statistics associated with a task (either current or dead).
++ * If @final is set, the task is dead and has reached refcount zero, so we can
++ * safely free all relevant data structures. Otherwise, there might be
++ * concurrent reads from places like load balancing and procfs, and we should
++ * reset the data back to default state without freeing ->numa_faults.
++ */
++void task_numa_free(struct task_struct *p, bool final)
+ {
+ struct numa_group *grp = p->numa_group;
+- void *numa_faults = p->numa_faults;
++ unsigned long *numa_faults = p->numa_faults;
+ unsigned long flags;
+ int i;
+
++ if (!numa_faults)
++ return;
++
+ if (grp) {
+ spin_lock_irqsave(&grp->lock, flags);
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+@@ -2364,8 +2374,14 @@ void task_numa_free(struct task_struct *
+ put_numa_group(grp);
+ }
+
+- p->numa_faults = NULL;
+- kfree(numa_faults);
++ if (final) {
++ p->numa_faults = NULL;
++ kfree(numa_faults);
++ } else {
++ p->total_numa_faults = 0;
++ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
++ numa_faults[i] = 0;
++ }
+ }
+
+ /*
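
A compact userspace model of the new task_numa_free(p, final) behaviour (invented names, C library allocation instead of kfree()): on the non-final path the array is only reset, because concurrent readers may still hold the pointer; only the final path actually frees it.

/* Model of "reset on execve, free on final put" for a statistics array
 * that concurrent readers may still be looking at. */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct demo_task {
	unsigned long *numa_faults;
	size_t nr_faults;
	unsigned long total_numa_faults;
};

static void demo_numa_free(struct demo_task *p, bool final)
{
	unsigned long *numa_faults = p->numa_faults;

	if (!numa_faults)
		return;

	if (final) {
		/* task is really gone: no readers left, safe to free */
		p->numa_faults = NULL;
		free(numa_faults);
	} else {
		/* execve(): readers may still dereference the array, so
		 * keep the allocation and just reset its contents */
		p->total_numa_faults = 0;
		memset(numa_faults, 0, p->nr_faults * sizeof(*numa_faults));
	}
}

int main(void)
{
	struct demo_task t = { .nr_faults = 8 };

	t.numa_faults = calloc(t.nr_faults, sizeof(*t.numa_faults));
	t.numa_faults[0] = 42;
	t.total_numa_faults = 42;

	demo_numa_free(&t, false);	/* execve-style reset */
	printf("after reset: ptr=%p faults[0]=%lu\n",
	       (void *)t.numa_faults, t.numa_faults[0]);

	demo_numa_free(&t, true);	/* final free */
	printf("after final: ptr=%p\n", (void *)t.numa_faults);
	return 0;
}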
--- /dev/null
+From cb361d8cdef69990f6b4504dc1fd9a594d983c97 Mon Sep 17 00:00:00 2001
+From: Jann Horn <jannh@google.com>
+Date: Tue, 16 Jul 2019 17:20:47 +0200
+Subject: sched/fair: Use RCU accessors consistently for ->numa_group
+
+From: Jann Horn <jannh@google.com>
+
+commit cb361d8cdef69990f6b4504dc1fd9a594d983c97 upstream.
+
+The old code used RCU annotations and accessors inconsistently for
+->numa_group, which can lead to use-after-frees and NULL dereferences.
+
+Let all accesses to ->numa_group use proper RCU helpers to prevent such
+issues.
+
+Signed-off-by: Jann Horn <jannh@google.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Petr Mladek <pmladek@suse.com>
+Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Will Deacon <will@kernel.org>
+Fixes: 8c8a743c5087 ("sched/numa: Use {cpu, pid} to create task groups for shared faults")
+Link: https://lkml.kernel.org/r/20190716152047.14424-3-jannh@google.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/sched.h | 10 +++-
+ kernel/sched/fair.c | 120 +++++++++++++++++++++++++++++++++-----------------
+ 2 files changed, 90 insertions(+), 40 deletions(-)
+
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -1023,7 +1023,15 @@ struct task_struct {
+ u64 last_sum_exec_runtime;
+ struct callback_head numa_work;
+
+- struct numa_group *numa_group;
++ /*
++ * This pointer is only modified for current in syscall and
++ * pagefault context (and for tasks being destroyed), so it can be read
++ * from any of the following contexts:
++ * - RCU read-side critical section
++ * - current->numa_group from everywhere
++ * - task's runqueue locked, task not running
++ */
++ struct numa_group __rcu *numa_group;
+
+ /*
+ * numa_faults is an array split into four regions:
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -1053,6 +1053,21 @@ struct numa_group {
+ unsigned long faults[0];
+ };
+
++/*
++ * For functions that can be called in multiple contexts that permit reading
++ * ->numa_group (see struct task_struct for locking rules).
++ */
++static struct numa_group *deref_task_numa_group(struct task_struct *p)
++{
++ return rcu_dereference_check(p->numa_group, p == current ||
++ (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
++}
++
++static struct numa_group *deref_curr_numa_group(struct task_struct *p)
++{
++ return rcu_dereference_protected(p->numa_group, p == current);
++}
++
+ static inline unsigned long group_faults_priv(struct numa_group *ng);
+ static inline unsigned long group_faults_shared(struct numa_group *ng);
+
+@@ -1096,10 +1111,12 @@ static unsigned int task_scan_start(stru
+ {
+ unsigned long smin = task_scan_min(p);
+ unsigned long period = smin;
++ struct numa_group *ng;
+
+ /* Scale the maximum scan period with the amount of shared memory. */
+- if (p->numa_group) {
+- struct numa_group *ng = p->numa_group;
++ rcu_read_lock();
++ ng = rcu_dereference(p->numa_group);
++ if (ng) {
+ unsigned long shared = group_faults_shared(ng);
+ unsigned long private = group_faults_priv(ng);
+
+@@ -1107,6 +1124,7 @@ static unsigned int task_scan_start(stru
+ period *= shared + 1;
+ period /= private + shared + 1;
+ }
++ rcu_read_unlock();
+
+ return max(smin, period);
+ }
+@@ -1115,13 +1133,14 @@ static unsigned int task_scan_max(struct
+ {
+ unsigned long smin = task_scan_min(p);
+ unsigned long smax;
++ struct numa_group *ng;
+
+ /* Watch for min being lower than max due to floor calculations */
+ smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+
+ /* Scale the maximum scan period with the amount of shared memory. */
+- if (p->numa_group) {
+- struct numa_group *ng = p->numa_group;
++ ng = deref_curr_numa_group(p);
++ if (ng) {
+ unsigned long shared = group_faults_shared(ng);
+ unsigned long private = group_faults_priv(ng);
+ unsigned long period = smax;
+@@ -1153,7 +1172,7 @@ void init_numa_balancing(unsigned long c
+ p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+ p->numa_work.next = &p->numa_work;
+ p->numa_faults = NULL;
+- p->numa_group = NULL;
++ RCU_INIT_POINTER(p->numa_group, NULL);
+ p->last_task_numa_placement = 0;
+ p->last_sum_exec_runtime = 0;
+
+@@ -1200,7 +1219,16 @@ static void account_numa_dequeue(struct
+
+ pid_t task_numa_group_id(struct task_struct *p)
+ {
+- return p->numa_group ? p->numa_group->gid : 0;
++ struct numa_group *ng;
++ pid_t gid = 0;
++
++ rcu_read_lock();
++ ng = rcu_dereference(p->numa_group);
++ if (ng)
++ gid = ng->gid;
++ rcu_read_unlock();
++
++ return gid;
+ }
+
+ /*
+@@ -1225,11 +1253,13 @@ static inline unsigned long task_faults(
+
+ static inline unsigned long group_faults(struct task_struct *p, int nid)
+ {
+- if (!p->numa_group)
++ struct numa_group *ng = deref_task_numa_group(p);
++
++ if (!ng)
+ return 0;
+
+- return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+- p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
++ return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
++ ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
+ }
+
+ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
+@@ -1367,12 +1397,13 @@ static inline unsigned long task_weight(
+ static inline unsigned long group_weight(struct task_struct *p, int nid,
+ int dist)
+ {
++ struct numa_group *ng = deref_task_numa_group(p);
+ unsigned long faults, total_faults;
+
+- if (!p->numa_group)
++ if (!ng)
+ return 0;
+
+- total_faults = p->numa_group->total_faults;
++ total_faults = ng->total_faults;
+
+ if (!total_faults)
+ return 0;
+@@ -1386,7 +1417,7 @@ static inline unsigned long group_weight
+ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
+ int src_nid, int dst_cpu)
+ {
+- struct numa_group *ng = p->numa_group;
++ struct numa_group *ng = deref_curr_numa_group(p);
+ int dst_nid = cpu_to_node(dst_cpu);
+ int last_cpupid, this_cpupid;
+
+@@ -1592,13 +1623,14 @@ static bool load_too_imbalanced(long src
+ static void task_numa_compare(struct task_numa_env *env,
+ long taskimp, long groupimp, bool maymove)
+ {
++ struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
+ struct rq *dst_rq = cpu_rq(env->dst_cpu);
++ long imp = p_ng ? groupimp : taskimp;
+ struct task_struct *cur;
+ long src_load, dst_load;
+- long load;
+- long imp = env->p->numa_group ? groupimp : taskimp;
+- long moveimp = imp;
+ int dist = env->dist;
++ long moveimp = imp;
++ long load;
+
+ if (READ_ONCE(dst_rq->numa_migrate_on))
+ return;
+@@ -1637,21 +1669,22 @@ static void task_numa_compare(struct tas
+ * If dst and source tasks are in the same NUMA group, or not
+ * in any group then look only at task weights.
+ */
+- if (cur->numa_group == env->p->numa_group) {
++ cur_ng = rcu_dereference(cur->numa_group);
++ if (cur_ng == p_ng) {
+ imp = taskimp + task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
+ /*
+ * Add some hysteresis to prevent swapping the
+ * tasks within a group over tiny differences.
+ */
+- if (cur->numa_group)
++ if (cur_ng)
+ imp -= imp / 16;
+ } else {
+ /*
+ * Compare the group weights. If a task is all by itself
+ * (not part of a group), use the task weight instead.
+ */
+- if (cur->numa_group && env->p->numa_group)
++ if (cur_ng && p_ng)
+ imp += group_weight(cur, env->src_nid, dist) -
+ group_weight(cur, env->dst_nid, dist);
+ else
+@@ -1749,11 +1782,12 @@ static int task_numa_migrate(struct task
+ .best_imp = 0,
+ .best_cpu = -1,
+ };
++ unsigned long taskweight, groupweight;
+ struct sched_domain *sd;
++ long taskimp, groupimp;
++ struct numa_group *ng;
+ struct rq *best_rq;
+- unsigned long taskweight, groupweight;
+ int nid, ret, dist;
+- long taskimp, groupimp;
+
+ /*
+ * Pick the lowest SD_NUMA domain, as that would have the smallest
+@@ -1799,7 +1833,8 @@ static int task_numa_migrate(struct task
+ * multiple NUMA nodes; in order to better consolidate the group,
+ * we need to check other locations.
+ */
+- if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
++ ng = deref_curr_numa_group(p);
++ if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
+ for_each_online_node(nid) {
+ if (nid == env.src_nid || nid == p->numa_preferred_nid)
+ continue;
+@@ -1832,7 +1867,7 @@ static int task_numa_migrate(struct task
+ * A task that migrated to a second choice node will be better off
+ * trying for a better one later. Do not set the preferred node here.
+ */
+- if (p->numa_group) {
++ if (ng) {
+ if (env.best_cpu == -1)
+ nid = env.src_nid;
+ else
+@@ -2127,6 +2162,7 @@ static void task_numa_placement(struct t
+ unsigned long total_faults;
+ u64 runtime, period;
+ spinlock_t *group_lock = NULL;
++ struct numa_group *ng;
+
+ /*
+ * The p->mm->numa_scan_seq field gets updated without
+@@ -2144,8 +2180,9 @@ static void task_numa_placement(struct t
+ runtime = numa_get_avg_runtime(p, &period);
+
+ /* If the task is part of a group prevent parallel updates to group stats */
+- if (p->numa_group) {
+- group_lock = &p->numa_group->lock;
++ ng = deref_curr_numa_group(p);
++ if (ng) {
++ group_lock = &ng->lock;
+ spin_lock_irq(group_lock);
+ }
+
+@@ -2186,7 +2223,7 @@ static void task_numa_placement(struct t
+ p->numa_faults[cpu_idx] += f_diff;
+ faults += p->numa_faults[mem_idx];
+ p->total_numa_faults += diff;
+- if (p->numa_group) {
++ if (ng) {
+ /*
+ * safe because we can only change our own group
+ *
+@@ -2194,14 +2231,14 @@ static void task_numa_placement(struct t
+ * nid and priv in a specific region because it
+ * is at the beginning of the numa_faults array.
+ */
+- p->numa_group->faults[mem_idx] += diff;
+- p->numa_group->faults_cpu[mem_idx] += f_diff;
+- p->numa_group->total_faults += diff;
+- group_faults += p->numa_group->faults[mem_idx];
++ ng->faults[mem_idx] += diff;
++ ng->faults_cpu[mem_idx] += f_diff;
++ ng->total_faults += diff;
++ group_faults += ng->faults[mem_idx];
+ }
+ }
+
+- if (!p->numa_group) {
++ if (!ng) {
+ if (faults > max_faults) {
+ max_faults = faults;
+ max_nid = nid;
+@@ -2212,8 +2249,8 @@ static void task_numa_placement(struct t
+ }
+ }
+
+- if (p->numa_group) {
+- numa_group_count_active_nodes(p->numa_group);
++ if (ng) {
++ numa_group_count_active_nodes(ng);
+ spin_unlock_irq(group_lock);
+ max_nid = preferred_group_nid(p, max_nid);
+ }
+@@ -2247,7 +2284,7 @@ static void task_numa_group(struct task_
+ int cpu = cpupid_to_cpu(cpupid);
+ int i;
+
+- if (unlikely(!p->numa_group)) {
++ if (unlikely(!deref_curr_numa_group(p))) {
+ unsigned int size = sizeof(struct numa_group) +
+ 4*nr_node_ids*sizeof(unsigned long);
+
+@@ -2283,7 +2320,7 @@ static void task_numa_group(struct task_
+ if (!grp)
+ goto no_join;
+
+- my_grp = p->numa_group;
++ my_grp = deref_curr_numa_group(p);
+ if (grp == my_grp)
+ goto no_join;
+
+@@ -2354,7 +2391,8 @@ no_join:
+ */
+ void task_numa_free(struct task_struct *p, bool final)
+ {
+- struct numa_group *grp = p->numa_group;
++ /* safe: p either is current or is being freed by current */
++ struct numa_group *grp = rcu_dereference_raw(p->numa_group);
+ unsigned long *numa_faults = p->numa_faults;
+ unsigned long flags;
+ int i;
+@@ -2434,7 +2472,7 @@ void task_numa_fault(int last_cpupid, in
+ * actively using should be counted as local. This allows the
+ * scan rate to slow down when a workload has settled down.
+ */
+- ng = p->numa_group;
++ ng = deref_curr_numa_group(p);
+ if (!priv && !local && ng && ng->active_nodes > 1 &&
+ numa_is_active_node(cpu_node, ng) &&
+ numa_is_active_node(mem_node, ng))
+@@ -10234,18 +10272,22 @@ void show_numa_stats(struct task_struct
+ {
+ int node;
+ unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
++ struct numa_group *ng;
+
++ rcu_read_lock();
++ ng = rcu_dereference(p->numa_group);
+ for_each_online_node(node) {
+ if (p->numa_faults) {
+ tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
+ tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
+ }
+- if (p->numa_group) {
+- gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
+- gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
++ if (ng) {
++ gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
++ gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
+ }
+ print_numa_stats(m, node, tsf, tpf, gsf, gpf);
+ }
++ rcu_read_unlock();
+ }
+ #endif /* CONFIG_NUMA_BALANCING */
+ #endif /* CONFIG_SCHED_DEBUG */
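
Kernel RCU has no direct userspace equivalent here, but the accessor discipline the patch enforces can be loosely modelled with C11 acquire/release atomics: writers publish the shared pointer through one helper, readers always go through another, and nobody touches the raw field. This is an analogy only; it does not model RCU grace periods or deferred freeing.

/* Loose analogy of the ->numa_group accessor rule using C11 atomics. */
#include <stdatomic.h>
#include <stdio.h>

struct numa_group_demo {
	int gid;
};

static _Atomic(struct numa_group_demo *) shared_group;

/* analogous to rcu_dereference()/deref_curr_numa_group() */
static struct numa_group_demo *demo_deref_group(void)
{
	return atomic_load_explicit(&shared_group, memory_order_acquire);
}

/* analogous to rcu_assign_pointer() */
static void demo_publish_group(struct numa_group_demo *grp)
{
	atomic_store_explicit(&shared_group, grp, memory_order_release);
}

int main(void)
{
	static struct numa_group_demo grp = { .gid = 7 };
	struct numa_group_demo *g;

	demo_publish_group(&grp);

	g = demo_deref_group();		/* never read shared_group directly */
	printf("gid = %d\n", g ? g->gid : 0);

	demo_publish_group(NULL);	/* detach; real code would defer free */
	g = demo_deref_group();
	printf("group present: %s\n", g ? "yes" : "no");
	return 0;
}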
--- /dev/null
+From 17605afaae825b0291f80c62a7f6565879edaa8a Mon Sep 17 00:00:00 2001
+From: Bart Van Assche <bvanassche@acm.org>
+Date: Fri, 15 Mar 2019 16:27:58 -0700
+Subject: scsi: core: Avoid that a kernel warning appears during system resume
+
+From: Bart Van Assche <bvanassche@acm.org>
+
+commit 17605afaae825b0291f80c62a7f6565879edaa8a upstream.
+
+Since scsi_device_quiesce() skips SCSI devices that are in a state other than
+RUNNING, OFFLINE or TRANSPORT_OFFLINE, scsi_device_resume() should not
+complain about SCSI devices that have been skipped. Hence this patch, which
+prevents the following warning from appearing during resume:
+
+WARNING: CPU: 3 PID: 1039 at blk_clear_pm_only+0x2a/0x30
+CPU: 3 PID: 1039 Comm: kworker/u8:49 Not tainted 5.0.0+ #1
+Hardware name: LENOVO 4180F42/4180F42, BIOS 83ET75WW (1.45 ) 05/10/2013
+Workqueue: events_unbound async_run_entry_fn
+RIP: 0010:blk_clear_pm_only+0x2a/0x30
+Call Trace:
+ ? scsi_device_resume+0x28/0x50
+ ? scsi_dev_type_resume+0x2b/0x80
+ ? async_run_entry_fn+0x2c/0xd0
+ ? process_one_work+0x1f0/0x3f0
+ ? worker_thread+0x28/0x3c0
+ ? process_one_work+0x3f0/0x3f0
+ ? kthread+0x10c/0x130
+ ? __kthread_create_on_node+0x150/0x150
+ ? ret_from_fork+0x1f/0x30
+
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Hannes Reinecke <hare@suse.com>
+Cc: Ming Lei <ming.lei@redhat.com>
+Cc: Johannes Thumshirn <jthumshirn@suse.de>
+Cc: Oleksandr Natalenko <oleksandr@natalenko.name>
+Cc: Martin Steigerwald <martin@lichtvoll.de>
+Cc: <stable@vger.kernel.org>
+Reported-by: Jisheng Zhang <Jisheng.Zhang@synaptics.com>
+Tested-by: Jisheng Zhang <Jisheng.Zhang@synaptics.com>
+Fixes: 3a0a529971ec ("block, scsi: Make SCSI quiesce and resume work reliably") # v4.15
+Signed-off-by: Bart Van Assche <bvanassche@acm.org>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/scsi/scsi_lib.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/drivers/scsi/scsi_lib.c
++++ b/drivers/scsi/scsi_lib.c
+@@ -3102,8 +3102,10 @@ void scsi_device_resume(struct scsi_devi
+ * device deleted during suspend)
+ */
+ mutex_lock(&sdev->state_mutex);
+- sdev->quiesced_by = NULL;
+- blk_clear_pm_only(sdev->request_queue);
++ if (sdev->quiesced_by) {
++ sdev->quiesced_by = NULL;
++ blk_clear_pm_only(sdev->request_queue);
++ }
+ if (sdev->sdev_state == SDEV_QUIESCE)
+ scsi_device_set_state(sdev, SDEV_RUNNING);
+ mutex_unlock(&sdev->state_mutex);
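
The resume fix is a guarded-teardown pattern: only undo the quiesce bookkeeping if this device actually performed it, so resuming a skipped or already-resumed device cannot drive the pm_only count negative. In miniature (illustrative names, not the SCSI mid-layer API):

/* Miniature of the scsi_device_resume() guard: undo the quiesce
 * bookkeeping only if this device actually did the quiesce. */
#include <stdbool.h>
#include <stdio.h>

struct demo_dev {
	bool quiesced_by_us;	/* stands in for sdev->quiesced_by */
	int pm_only;		/* stands in for the queue's pm_only count */
};

static void demo_quiesce(struct demo_dev *d)
{
	d->quiesced_by_us = true;
	d->pm_only++;
}

static void demo_resume(struct demo_dev *d)
{
	/* the fix: the clear is conditional, so a second resume (or a
	 * resume of a device that was never quiesced) does nothing */
	if (d->quiesced_by_us) {
		d->quiesced_by_us = false;
		d->pm_only--;
	}
}

int main(void)
{
	struct demo_dev d = { 0 };

	demo_quiesce(&d);
	demo_resume(&d);
	demo_resume(&d);	/* would have gone negative before the fix */
	printf("pm_only = %d\n", d.pm_only);	/* stays at 0 */
	return 0;
}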
vhost_net-fix-possible-infinite-loop.patch
vhost-vsock-add-weight-support.patch
vhost-scsi-add-weight-support.patch
+sched-fair-don-t-free-p-numa_faults-with-concurrent-readers.patch
+sched-fair-use-rcu-accessors-consistently-for-numa_group.patch
+proc-pid-cmdline-remove-all-the-special-cases.patch
+proc-pid-cmdline-add-back-the-setproctitle-special-case.patch
+drivers-pps-pps.c-clear-offset-flags-in-pps_setparams-ioctl.patch
+fix-allyesconfig-output.patch
+ceph-hold-i_ceph_lock-when-removing-caps-for-freeing-inode.patch
+block-scsi-change-the-preempt-only-flag-into-a-counter.patch
+scsi-core-avoid-that-a-kernel-warning-appears-during-system-resume.patch