--- /dev/null
+From ca763d0a53b264a650342cee206512bc92ac7050 Mon Sep 17 00:00:00 2001
+From: Joe Thornber <ejt@redhat.com>
+Date: Thu, 9 Feb 2017 11:46:18 -0500
+Subject: dm cache: fix corruption seen when using cache > 2TB
+
+From: Joe Thornber <ejt@redhat.com>
+
+commit ca763d0a53b264a650342cee206512bc92ac7050 upstream.
+
+A rounding bug due to compiler generated temporary being 32bit was found
+in remap_to_cache(). A localized cast in remap_to_cache() fixes the
+corruption but this preferred fix (changing from uint32_t to sector_t)
+eliminates potential for future rounding errors elsewhere.
+
+Signed-off-by: Joe Thornber <ejt@redhat.com>
+Signed-off-by: Mike Snitzer <snitzer@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/md/dm-cache-target.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/drivers/md/dm-cache-target.c
++++ b/drivers/md/dm-cache-target.c
+@@ -248,7 +248,7 @@ struct cache {
+ /*
+ * Fields for converting from sectors to blocks.
+ */
+- uint32_t sectors_per_block;
++ sector_t sectors_per_block;
+ int sectors_per_block_shift;
+
+ spinlock_t lock;
+@@ -3546,11 +3546,11 @@ static void cache_status(struct dm_targe
+
+ residency = policy_residency(cache->policy);
+
+- DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ",
++ DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ",
+ (unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
+ (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
+ (unsigned long long)nr_blocks_metadata,
+- cache->sectors_per_block,
++ (unsigned long long)cache->sectors_per_block,
+ (unsigned long long) from_cblock(residency),
+ (unsigned long long) from_cblock(cache->cache_size),
+ (unsigned) atomic_read(&cache->stats.read_hit),
--- /dev/null
+From d36a19541fe8f392778ac137d60f9be8dfdd8f9d Mon Sep 17 00:00:00 2001
+From: Heinz Mauelshagen <heinzm@redhat.com>
+Date: Tue, 28 Feb 2017 19:17:49 +0100
+Subject: dm raid: fix data corruption on reshape request
+
+From: Heinz Mauelshagen <heinzm@redhat.com>
+
+commit d36a19541fe8f392778ac137d60f9be8dfdd8f9d upstream.
+
+The lvm2 sequence to manage dm-raid constructor flags that trigger a
+rebuild or a reshape is defined as:
+
+1) load table with flags (e.g. rebuild/delta_disks/data_offset)
+2) clear out the flags in lvm2 metadata
+3) store the lvm2 metadata, reload the table to reset the flags
+ previously established during the initial load (1) -- in order to
+ prevent repeatedly requesting a rebuild or a reshape on activation
+
+Currently, loading an inactive table with rebuild/reshape flags
+specified will cause dm-raid to rebuild/reshape on resume and thus start
+updating the raid metadata (about the progress). When the second table
+reload, to reset the flags, occurs the constructor accesses the volatile
+progress state kept in the raid superblocks. Because the active mapping
+is still processing the rebuild/reshape, that position will be stale by
+the time the device is resumed.
+
+In the reshape case, this causes data corruption by processing already
+reshaped stripes again. In the rebuild case, it does _not_ cause data
+corruption but instead involves superfluous rebuilds.
+
+Fix by keeping the raid set frozen during the first resume and then
+allow the rebuild/reshape during the second resume.
+
+Fixes: 9dbd1aa3a ("dm raid: add reshaping support to the target")
+Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
+Signed-off-by: Mike Snitzer <snitzer@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/md/dm-raid.c | 12 +++++++++++-
+ 1 file changed, 11 insertions(+), 1 deletion(-)
+
+--- a/drivers/md/dm-raid.c
++++ b/drivers/md/dm-raid.c
+@@ -3621,6 +3621,8 @@ static int raid_preresume(struct dm_targ
+ return r;
+ }
+
++#define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET)
++
+ static void raid_resume(struct dm_target *ti)
+ {
+ struct raid_set *rs = ti->private;
+@@ -3638,7 +3640,15 @@ static void raid_resume(struct dm_target
+ mddev->ro = 0;
+ mddev->in_sync = 0;
+
+- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
++ /*
++ * Keep the RAID set frozen if reshape/rebuild flags are set.
++ * The RAID set is unfrozen once the next table load/resume,
++ * which clears the reshape/rebuild flags, occurs.
++ * This ensures that the constructor for the inactive table
++ * retrieves an up-to-date reshape_position.
++ */
++ if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS))
++ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+
+ if (mddev->suspended)
+ mddev_resume(mddev);
--- /dev/null
+From 37a098e9d10db6e2efc05fe61e3a6ff2e9802c53 Mon Sep 17 00:00:00 2001
+From: Mike Snitzer <snitzer@redhat.com>
+Date: Thu, 16 Feb 2017 23:57:17 -0500
+Subject: dm round robin: revert "use percpu 'repeat_count' and 'current_path'"
+
+From: Mike Snitzer <snitzer@redhat.com>
+
+commit 37a098e9d10db6e2efc05fe61e3a6ff2e9802c53 upstream.
+
+The sloppy nature of lockless access to percpu pointers
+(s->current_path) in rr_select_path(), from multiple threads, is
+causing some paths to be used more than others -- which results in less
+IO performance being observed.
+
+Revert these upstream commits to restore truly symmetric round-robin
+IO submission in DM multipath:
+
+b0b477c dm round robin: use percpu 'repeat_count' and 'current_path'
+802934b dm round robin: do not use this_cpu_ptr() without having preemption disabled
+
+There is no benefit to all this complexity if repeat_count = 1 (which is
+the recommended default).
+
+Signed-off-by: Mike Snitzer <snitzer@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/md/dm-round-robin.c | 67 +++++++++-----------------------------------
+ 1 file changed, 14 insertions(+), 53 deletions(-)
+
+--- a/drivers/md/dm-round-robin.c
++++ b/drivers/md/dm-round-robin.c
+@@ -17,8 +17,8 @@
+ #include <linux/module.h>
+
+ #define DM_MSG_PREFIX "multipath round-robin"
+-#define RR_MIN_IO 1000
+-#define RR_VERSION "1.1.0"
++#define RR_MIN_IO 1
++#define RR_VERSION "1.2.0"
+
+ /*-----------------------------------------------------------------
+ * Path-handling code, paths are held in lists
+@@ -47,44 +47,19 @@ struct selector {
+ struct list_head valid_paths;
+ struct list_head invalid_paths;
+ spinlock_t lock;
+- struct dm_path * __percpu *current_path;
+- struct percpu_counter repeat_count;
+ };
+
+-static void set_percpu_current_path(struct selector *s, struct dm_path *path)
+-{
+- int cpu;
+-
+- for_each_possible_cpu(cpu)
+- *per_cpu_ptr(s->current_path, cpu) = path;
+-}
+-
+ static struct selector *alloc_selector(void)
+ {
+ struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+- if (!s)
+- return NULL;
+-
+- INIT_LIST_HEAD(&s->valid_paths);
+- INIT_LIST_HEAD(&s->invalid_paths);
+- spin_lock_init(&s->lock);
+-
+- s->current_path = alloc_percpu(struct dm_path *);
+- if (!s->current_path)
+- goto out_current_path;
+- set_percpu_current_path(s, NULL);
+-
+- if (percpu_counter_init(&s->repeat_count, 0, GFP_KERNEL))
+- goto out_repeat_count;
++ if (s) {
++ INIT_LIST_HEAD(&s->valid_paths);
++ INIT_LIST_HEAD(&s->invalid_paths);
++ spin_lock_init(&s->lock);
++ }
+
+ return s;
+-
+-out_repeat_count:
+- free_percpu(s->current_path);
+-out_current_path:
+- kfree(s);
+- return NULL;;
+ }
+
+ static int rr_create(struct path_selector *ps, unsigned argc, char **argv)
+@@ -105,8 +80,6 @@ static void rr_destroy(struct path_selec
+
+ free_paths(&s->valid_paths);
+ free_paths(&s->invalid_paths);
+- free_percpu(s->current_path);
+- percpu_counter_destroy(&s->repeat_count);
+ kfree(s);
+ ps->context = NULL;
+ }
+@@ -157,6 +130,11 @@ static int rr_add_path(struct path_selec
+ return -EINVAL;
+ }
+
++ if (repeat_count > 1) {
++ DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead");
++ repeat_count = 1;
++ }
++
+ /* allocate the path */
+ pi = kmalloc(sizeof(*pi), GFP_KERNEL);
+ if (!pi) {
+@@ -183,9 +161,6 @@ static void rr_fail_path(struct path_sel
+ struct path_info *pi = p->pscontext;
+
+ spin_lock_irqsave(&s->lock, flags);
+- if (p == *this_cpu_ptr(s->current_path))
+- set_percpu_current_path(s, NULL);
+-
+ list_move(&pi->list, &s->invalid_paths);
+ spin_unlock_irqrestore(&s->lock, flags);
+ }
+@@ -208,29 +183,15 @@ static struct dm_path *rr_select_path(st
+ unsigned long flags;
+ struct selector *s = ps->context;
+ struct path_info *pi = NULL;
+- struct dm_path *current_path = NULL;
+
+- local_irq_save(flags);
+- current_path = *this_cpu_ptr(s->current_path);
+- if (current_path) {
+- percpu_counter_dec(&s->repeat_count);
+- if (percpu_counter_read_positive(&s->repeat_count) > 0) {
+- local_irq_restore(flags);
+- return current_path;
+- }
+- }
+-
+- spin_lock(&s->lock);
++ spin_lock_irqsave(&s->lock, flags);
+ if (!list_empty(&s->valid_paths)) {
+ pi = list_entry(s->valid_paths.next, struct path_info, list);
+ list_move_tail(&pi->list, &s->valid_paths);
+- percpu_counter_set(&s->repeat_count, pi->repeat_count);
+- set_percpu_current_path(s, pi->path);
+- current_path = pi->path;
+ }
+ spin_unlock_irqrestore(&s->lock, flags);
+
+- return current_path;
++ return pi ? pi->path : NULL;
+ }
+
+ static struct path_selector_type rr_ps = {
--- /dev/null
+From 6085831883c25860264721df15f05bbded45e2a2 Mon Sep 17 00:00:00 2001
+From: Mikulas Patocka <mpatocka@redhat.com>
+Date: Wed, 15 Feb 2017 12:06:19 -0500
+Subject: dm stats: fix a leaked s->histogram_boundaries array
+
+From: Mikulas Patocka <mpatocka@redhat.com>
+
+commit 6085831883c25860264721df15f05bbded45e2a2 upstream.
+
+Fixes: dfcfac3e4cd9 ("dm stats: collect and report histogram of IO latencies")
+Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
+Signed-off-by: Mike Snitzer <snitzer@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/md/dm-stats.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/md/dm-stats.c
++++ b/drivers/md/dm-stats.c
+@@ -175,6 +175,7 @@ static void dm_stat_free(struct rcu_head
+ int cpu;
+ struct dm_stat *s = container_of(head, struct dm_stat, rcu_head);
+
++ kfree(s->histogram_boundaries);
+ kfree(s->program_id);
+ kfree(s->aux_data);
+ for_each_possible_cpu(cpu) {
--- /dev/null
+From 907565337ebf998a68cb5c5b2174ce5e5da065eb Mon Sep 17 00:00:00 2001
+From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Date: Thu, 3 Nov 2016 10:29:28 -0600
+Subject: Fix: Disable sys_membarrier when nohz_full is enabled
+
+From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+
+commit 907565337ebf998a68cb5c5b2174ce5e5da065eb upstream.
+
+Userspace applications should be allowed to expect the membarrier system
+call with MEMBARRIER_CMD_SHARED command to issue memory barriers on
+nohz_full CPUs, but synchronize_sched() does not take those into
+account.
+
+Given that we do not want unrelated processes to be able to affect
+real-time sensitive nohz_full CPUs, simply return ENOSYS when membarrier
+is invoked on a kernel with enabled nohz_full CPUs.
+
+Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+CC: Josh Triplett <josh@joshtriplett.org>
+CC: Steven Rostedt <rostedt@goodmis.org>
+Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Cc: Frederic Weisbecker <fweisbec@gmail.com>
+Cc: Chris Metcalf <cmetcalf@mellanox.com>
+Cc: Rik van Riel <riel@redhat.com>
+Acked-by: Lai Jiangshan <jiangshanlai@gmail.com>
+Reviewed-by: Josh Triplett <josh@joshtriplett.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/membarrier.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/kernel/membarrier.c
++++ b/kernel/membarrier.c
+@@ -16,6 +16,7 @@
+
+ #include <linux/syscalls.h>
+ #include <linux/membarrier.h>
++#include <linux/tick.h>
+
+ /*
+ * Bitmask made from a "or" of all commands within enum membarrier_cmd,
+@@ -51,6 +52,9 @@
+ */
+ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
+ {
++ /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
++ if (tick_nohz_full_enabled())
++ return -ENOSYS;
+ if (unlikely(flags))
+ return -EINVAL;
+ switch (cmd) {
--- /dev/null
+From bc15ed663e7e53ee4dc3e60f8d09c93a0528c694 Mon Sep 17 00:00:00 2001
+From: Mimi Zohar <zohar@linux.vnet.ibm.com>
+Date: Tue, 17 Jan 2017 06:45:41 -0500
+Subject: ima: fix ima_d_path() possible race with rename
+
+From: Mimi Zohar <zohar@linux.vnet.ibm.com>
+
+commit bc15ed663e7e53ee4dc3e60f8d09c93a0528c694 upstream.
+
+On failure to return a pathname from ima_d_path(), a pointer to
+dname is returned, which is subsequently used in the IMA measurement
+list, the IMA audit records, and other audit logging. Saving the
+pointer to dname for later use has the potential to race with rename.
+
+Instead of returning a pointer to dname on failure, this patch returns
+a pointer to a copy of the filename.
+
+Reported-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Mimi Zohar <zohar@linux.vnet.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ security/integrity/ima/ima.h | 2 +-
+ security/integrity/ima/ima_api.c | 20 ++++++++++++++++++--
+ security/integrity/ima/ima_main.c | 8 +++++---
+ 3 files changed, 24 insertions(+), 6 deletions(-)
+
+--- a/security/integrity/ima/ima.h
++++ b/security/integrity/ima/ima.h
+@@ -173,7 +173,7 @@ int ima_store_template(struct ima_templa
+ struct inode *inode,
+ const unsigned char *filename, int pcr);
+ void ima_free_template_entry(struct ima_template_entry *entry);
+-const char *ima_d_path(const struct path *path, char **pathbuf);
++const char *ima_d_path(const struct path *path, char **pathbuf, char *filename);
+
+ /* IMA policy related functions */
+ int ima_match_policy(struct inode *inode, enum ima_hooks func, int mask,
+--- a/security/integrity/ima/ima_api.c
++++ b/security/integrity/ima/ima_api.c
+@@ -318,7 +318,17 @@ void ima_audit_measurement(struct integr
+ iint->flags |= IMA_AUDITED;
+ }
+
+-const char *ima_d_path(const struct path *path, char **pathbuf)
++/*
++ * ima_d_path - return a pointer to the full pathname
++ *
++ * Attempt to return a pointer to the full pathname for use in the
++ * IMA measurement list, IMA audit records, and auditing logs.
++ *
++ * On failure, return a pointer to a copy of the filename, not dname.
++ * Returning a pointer to dname, could result in using the pointer
++ * after the memory has been freed.
++ */
++const char *ima_d_path(const struct path *path, char **pathbuf, char *namebuf)
+ {
+ char *pathname = NULL;
+
+@@ -331,5 +341,11 @@ const char *ima_d_path(const struct path
+ pathname = NULL;
+ }
+ }
+- return pathname ?: (const char *)path->dentry->d_name.name;
++
++ if (!pathname) {
++ strlcpy(namebuf, path->dentry->d_name.name, NAME_MAX);
++ pathname = namebuf;
++ }
++
++ return pathname;
+ }
+--- a/security/integrity/ima/ima_main.c
++++ b/security/integrity/ima/ima_main.c
+@@ -83,6 +83,7 @@ static void ima_rdwr_violation_check(str
+ const char **pathname)
+ {
+ struct inode *inode = file_inode(file);
++ char filename[NAME_MAX];
+ fmode_t mode = file->f_mode;
+ bool send_tomtou = false, send_writers = false;
+
+@@ -102,7 +103,7 @@ static void ima_rdwr_violation_check(str
+ if (!send_tomtou && !send_writers)
+ return;
+
+- *pathname = ima_d_path(&file->f_path, pathbuf);
++ *pathname = ima_d_path(&file->f_path, pathbuf, filename);
+
+ if (send_tomtou)
+ ima_add_violation(file, *pathname, iint,
+@@ -161,6 +162,7 @@ static int process_measurement(struct fi
+ struct integrity_iint_cache *iint = NULL;
+ struct ima_template_desc *template_desc;
+ char *pathbuf = NULL;
++ char filename[NAME_MAX];
+ const char *pathname = NULL;
+ int rc = -ENOMEM, action, must_appraise;
+ int pcr = CONFIG_IMA_MEASURE_PCR_IDX;
+@@ -239,8 +241,8 @@ static int process_measurement(struct fi
+ goto out_digsig;
+ }
+
+- if (!pathname) /* ima_rdwr_violation possibly pre-fetched */
+- pathname = ima_d_path(&file->f_path, &pathbuf);
++ if (!pathbuf) /* ima_rdwr_violation possibly pre-fetched */
++ pathname = ima_d_path(&file->f_path, &pathbuf, filename);
+
+ if (action & IMA_MEASURE)
+ ima_store_measurement(iint, file, pathname,
--- /dev/null
+From 95e91b831f87ac8e1f8ed50c14d709089b4e01b8 Mon Sep 17 00:00:00 2001
+From: Davidlohr Bueso <dave@stgolabs.net>
+Date: Mon, 27 Feb 2017 14:28:24 -0800
+Subject: ipc/shm: Fix shmat mmap nil-page protection
+
+From: Davidlohr Bueso <dave@stgolabs.net>
+
+commit 95e91b831f87ac8e1f8ed50c14d709089b4e01b8 upstream.
+
+The issue is described here, with a nice testcase:
+
+ https://bugzilla.kernel.org/show_bug.cgi?id=192931
+
+The problem is that shmat() calls do_mmap_pgoff() with MAP_FIXED, and
+the address rounded down to 0. For the regular mmap case, the
+protection mentioned above is that the kernel gets to generate the
+address -- arch_get_unmapped_area() will always check for MAP_FIXED and
+return that address. So by the time we do security_mmap_addr(0) things
+get funky for shmat().
+
+The testcase itself shows that while a regular user crashes, root will
+not have a problem attaching a nil-page. There are two possible fixes
+to this. The first, and which this patch does, is to simply allow root
+to crash as well -- this is also regular mmap behavior, ie when hacking
+up the testcase and adding mmap(... |MAP_FIXED). While this approach
+is the safer option, the second alternative is to ignore SHM_RND if the
+rounded address is 0, thus only having MAP_SHARED flags. This makes the
+behavior of shmat() identical to the mmap() case. The downside of this
+is obviously user visible, but does make sense in that it maintains
+semantics after the round-down wrt 0 address and mmap.
+
+Passes shm related ltp tests.
+
+Link: http://lkml.kernel.org/r/1486050195-18629-1-git-send-email-dave@stgolabs.net
+Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
+Reported-by: Gareth Evans <gareth.evans@contextis.co.uk>
+Cc: Manfred Spraul <manfred@colorfullife.com>
+Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ ipc/shm.c | 13 +++++++++----
+ 1 file changed, 9 insertions(+), 4 deletions(-)
+
+--- a/ipc/shm.c
++++ b/ipc/shm.c
+@@ -1085,8 +1085,8 @@ out_unlock1:
+ * "raddr" thing points to kernel space, and there has to be a wrapper around
+ * this.
+ */
+-long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
+- unsigned long shmlba)
++long do_shmat(int shmid, char __user *shmaddr, int shmflg,
++ ulong *raddr, unsigned long shmlba)
+ {
+ struct shmid_kernel *shp;
+ unsigned long addr;
+@@ -1107,8 +1107,13 @@ long do_shmat(int shmid, char __user *sh
+ goto out;
+ else if ((addr = (ulong)shmaddr)) {
+ if (addr & (shmlba - 1)) {
+- if (shmflg & SHM_RND)
+- addr &= ~(shmlba - 1); /* round down */
++ /*
++ * Round down to the nearest multiple of shmlba.
++ * For sane do_mmap_pgoff() parameters, avoid
++ * round downs that trigger nil-page and MAP_FIXED.
++ */
++ if ((shmflg & SHM_RND) && addr >= shmlba)
++ addr &= ~(shmlba - 1);
+ else
+ #ifndef __ARCH_FORCE_SHMLBA
+ if (addr & ~PAGE_MASK)
--- /dev/null
+From 9c57b5808c625f4fc93da330b932647eaff321f7 Mon Sep 17 00:00:00 2001
+From: Yisheng Xie <xieyisheng1@huawei.com>
+Date: Fri, 24 Feb 2017 15:00:40 -0800
+Subject: mm balloon: umount balloon_mnt when removing vb device
+
+From: Yisheng Xie <xieyisheng1@huawei.com>
+
+commit 9c57b5808c625f4fc93da330b932647eaff321f7 upstream.
+
+With CONFIG_BALLOON_COMPACTION=y the kernel will mount balloon_mnt for
+balloon page migration when we probe a virtio_balloon device. However
+we do not unmount it when removing the device. Fix this.
+
+Fixes: b1123ea6d3b3 ("mm: balloon: use general non-lru movable page feature")
+Link: http://lkml.kernel.org/r/1486531318-35189-1-git-send-email-xieyisheng1@huawei.com
+Signed-off-by: Yisheng Xie <xieyisheng1@huawei.com>
+Acked-by: Minchan Kim <minchan@kernel.org>
+Cc: Rafael Aquini <aquini@redhat.com>
+Cc: Konstantin Khlebnikov <koct9i@gmail.com>
+Cc: Gioh Kim <gi-oh.kim@profitbricks.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Michael S. Tsirkin <mst@redhat.com>
+Cc: Jason Wang <jasowang@redhat.com>
+Cc: Hanjun Guo <guohanjun@huawei.com>
+Cc: Xishi Qiu <qiuxishi@huawei.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/virtio/virtio_balloon.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/drivers/virtio/virtio_balloon.c
++++ b/drivers/virtio/virtio_balloon.c
+@@ -615,8 +615,12 @@ static void virtballoon_remove(struct vi
+ cancel_work_sync(&vb->update_balloon_stats_work);
+
+ remove_common(vb);
++#ifdef CONFIG_BALLOON_COMPACTION
+ if (vb->vb_dev_info.inode)
+ iput(vb->vb_dev_info.inode);
++
++ kern_unmount(balloon_mnt);
++#endif
+ kfree(vb);
+ }
+
--- /dev/null
+From b5d24fda9c3dce51fcb4eee459550a458eaaf1e2 Mon Sep 17 00:00:00 2001
+From: Dan Williams <dan.j.williams@intel.com>
+Date: Fri, 24 Feb 2017 14:55:45 -0800
+Subject: mm, devm_memremap_pages: hold device_hotplug lock over mem_hotplug_{begin, done}
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+commit b5d24fda9c3dce51fcb4eee459550a458eaaf1e2 upstream.
+
+The mem_hotplug_{begin,done} lock coordinates with {get,put}_online_mems()
+to hold off "readers" of the current state of memory from new hotplug
+actions. mem_hotplug_begin() expects exclusive access, via the
+device_hotplug lock, to set mem_hotplug.active_writer. Calling
+mem_hotplug_begin() without locking device_hotplug can lead to
+corrupting mem_hotplug.refcount and missed wakeups / soft lockups.
+
+[dan.j.williams@intel.com: v2]
+ Link: http://lkml.kernel.org/r/148728203365.38457.17804568297887708345.stgit@dwillia2-desk3.amr.corp.intel.com
+Link: http://lkml.kernel.org/r/148693885680.16345.17802627926777862337.stgit@dwillia2-desk3.amr.corp.intel.com
+Fixes: f931ab479dd2 ("mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}")
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Reported-by: Ben Hutchings <ben@decadent.org.uk>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Toshi Kani <toshi.kani@hpe.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Logan Gunthorpe <logang@deltatee.com>
+Cc: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/memremap.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/kernel/memremap.c
++++ b/kernel/memremap.c
+@@ -246,9 +246,13 @@ static void devm_memremap_pages_release(
+ /* pages are dead and unused, undo the arch mapping */
+ align_start = res->start & ~(SECTION_SIZE - 1);
+ align_size = ALIGN(resource_size(res), SECTION_SIZE);
++
++ lock_device_hotplug();
+ mem_hotplug_begin();
+ arch_remove_memory(align_start, align_size);
+ mem_hotplug_done();
++ unlock_device_hotplug();
++
+ untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
+ pgmap_radix_release(res);
+ dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc,
+@@ -360,9 +364,11 @@ void *devm_memremap_pages(struct device
+ if (error)
+ goto err_pfn_remap;
+
++ lock_device_hotplug();
+ mem_hotplug_begin();
+ error = arch_add_memory(nid, align_start, align_size, true);
+ mem_hotplug_done();
++ unlock_device_hotplug();
+ if (error)
+ goto err_add_memory;
+
--- /dev/null
+From dd8416c47715cf324c9a16f13273f9fda87acfed Mon Sep 17 00:00:00 2001
+From: Minchan Kim <minchan@kernel.org>
+Date: Fri, 24 Feb 2017 14:59:59 -0800
+Subject: mm: do not access page->mapping directly on page_endio
+
+From: Minchan Kim <minchan@kernel.org>
+
+commit dd8416c47715cf324c9a16f13273f9fda87acfed upstream.
+
+With rw_page, page_endio is used for completing IO on a page and it
+propagates write error to the address space if the IO fails. The
+problem is it accesses page->mapping directly which might be okay for
+file-backed pages but it shouldn't for anonymous pages. Otherwise, it
+can corrupt one of the fields of anon_vma under us and the system panics
+randomly.
+
+swap_writepage
+ bdev_writepage
+ ops->rw_page
+
+I encountered the BUG during developing new zram feature and it was
+really hard to figure it out because it made random crash, sometime
+mmap_sem lockdep, sometime other places where places never related to
+zram/zsmalloc, and not reproducible with some configuration.
+
+When I consider how that bug is subtle and people do fast-swap test with
+brd, it's worth to add stable mark, I think.
+
+Fixes: dd6bd0d9c7db ("swap: use bdev_read_page() / bdev_write_page()")
+Signed-off-by: Minchan Kim <minchan@kernel.org>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/filemap.c | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -910,9 +910,12 @@ void page_endio(struct page *page, bool
+ unlock_page(page);
+ } else {
+ if (err) {
++ struct address_space *mapping;
++
+ SetPageError(page);
+- if (page->mapping)
+- mapping_set_error(page->mapping, err);
++ mapping = page_mapping(page);
++ if (mapping)
++ mapping_set_error(mapping, err);
+ }
+ end_page_writeback(page);
+ }
--- /dev/null
+From e02dc017c3032dcdce1b993af0db135462e1b4b7 Mon Sep 17 00:00:00 2001
+From: Gavin Shan <gwshan@linux.vnet.ibm.com>
+Date: Fri, 24 Feb 2017 14:59:33 -0800
+Subject: mm/page_alloc: fix nodes for reclaim in fast path
+
+From: Gavin Shan <gwshan@linux.vnet.ibm.com>
+
+commit e02dc017c3032dcdce1b993af0db135462e1b4b7 upstream.
+
+When @node_reclaim_node isn't 0, the page allocator tries to reclaim
+pages if the amount of free memory in the zones are below the low
+watermark. On Power platform, none of NUMA nodes are scanned for page
+reclaim because no nodes match the condition in zone_allows_reclaim().
+On Power platform, RECLAIM_DISTANCE is set to 10 which is the distance
+of Node-A to Node-A. So the preferred node even won't be scanned for
+page reclaim.
+
+ __alloc_pages_nodemask()
+ get_page_from_freelist()
+ zone_allows_reclaim()
+
+Anton proposed the test code as below:
+
+ # cat alloc.c
+ :
+ int main(int argc, char *argv[])
+ {
+ void *p;
+ unsigned long size;
+ unsigned long start, end;
+
+ start = time(NULL);
+ size = strtoul(argv[1], NULL, 0);
+ printf("To allocate %ldGB memory\n", size);
+
+ size <<= 30;
+ p = malloc(size);
+ assert(p);
+ memset(p, 0, size);
+
+ end = time(NULL);
+ printf("Used time: %ld seconds\n", end - start);
+ sleep(3600);
+ return 0;
+ }
+
+The system I use for testing has two NUMA nodes. Both have 128GB
+memory. In the scenario below, the page caches on node#0 should be reclaimed
+when it encounters pressure to accommodate request of allocation.
+
+ # echo 2 > /proc/sys/vm/zone_reclaim_mode; \
+ sync; \
+ echo 3 > /proc/sys/vm/drop_caches; \
+ # taskset -c 0 cat file.32G > /dev/null; \
+ grep FilePages /sys/devices/system/node/node0/meminfo
+ Node 0 FilePages: 33619712 kB
+ # taskset -c 0 ./alloc 128
+ # grep FilePages /sys/devices/system/node/node0/meminfo
+ Node 0 FilePages: 33619840 kB
+ # grep MemFree /sys/devices/system/node/node0/meminfo
+ Node 0 MemFree: 186816 kB
+
+With the patch applied, the pagecache on node-0 is reclaimed when its
+free memory is running out. It's the expected behaviour.
+
+ # echo 2 > /proc/sys/vm/zone_reclaim_mode; \
+ sync; \
+ echo 3 > /proc/sys/vm/drop_caches
+ # taskset -c 0 cat file.32G > /dev/null; \
+ grep FilePages /sys/devices/system/node/node0/meminfo
+ Node 0 FilePages: 33605568 kB
+ # taskset -c 0 ./alloc 128
+ # grep FilePages /sys/devices/system/node/node0/meminfo
+ Node 0 FilePages: 1379520 kB
+ # grep MemFree /sys/devices/system/node/node0/meminfo
+ Node 0 MemFree: 317120 kB
+
+Fixes: 5f7a75acdb24 ("mm: page_alloc: do not cache reclaim distances")
+Link: http://lkml.kernel.org/r/1486532455-29613-1-git-send-email-gwshan@linux.vnet.ibm.com
+Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Anton Blanchard <anton@samba.org>
+Cc: Michael Ellerman <mpe@ellerman.id.au>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page_alloc.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -2858,7 +2858,7 @@ bool zone_watermark_ok_safe(struct zone
+ #ifdef CONFIG_NUMA
+ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
+ {
+- return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
++ return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
+ RECLAIM_DISTANCE;
+ }
+ #else /* CONFIG_NUMA */
--- /dev/null
+From e1587a4945408faa58d0485002c110eb2454740c Mon Sep 17 00:00:00 2001
+From: Vinayak Menon <vinmenon@codeaurora.org>
+Date: Fri, 24 Feb 2017 14:59:39 -0800
+Subject: mm: vmpressure: fix sending wrong events on underflow
+
+From: Vinayak Menon <vinmenon@codeaurora.org>
+
+commit e1587a4945408faa58d0485002c110eb2454740c upstream.
+
+At the end of a window period, if the reclaimed pages is greater than
+scanned, an unsigned underflow can result in a huge pressure value and
+thus a critical event. Reclaimed pages is found to go higher than
+scanned because of the addition of reclaimed slab pages to reclaimed in
+shrink_node without a corresponding increment to scanned pages.
+
+Minchan Kim mentioned that this can also happen in the case of a THP
+page where the scanned is 1 and reclaimed could be 512.
+
+Link: http://lkml.kernel.org/r/1486641577-11685-1-git-send-email-vinmenon@codeaurora.org
+Signed-off-by: Vinayak Menon <vinmenon@codeaurora.org>
+Acked-by: Minchan Kim <minchan@kernel.org>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Mel Gorman <mgorman@techsingularity.net>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Anton Vorontsov <anton.vorontsov@linaro.org>
+Cc: Shiraz Hashim <shashim@codeaurora.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/vmpressure.c | 10 +++++++++-
+ 1 file changed, 9 insertions(+), 1 deletion(-)
+
+--- a/mm/vmpressure.c
++++ b/mm/vmpressure.c
+@@ -112,9 +112,16 @@ static enum vmpressure_levels vmpressure
+ unsigned long reclaimed)
+ {
+ unsigned long scale = scanned + reclaimed;
+- unsigned long pressure;
++ unsigned long pressure = 0;
+
+ /*
++ * reclaimed can be greater than scanned in cases
++ * like THP, where the scanned is 1 and reclaimed
++ * could be 512
++ */
++ if (reclaimed >= scanned)
++ goto out;
++ /*
+ * We calculate the ratio (in percents) of how many pages were
+ * scanned vs. reclaimed in a given time frame (window). Note that
+ * time is in VM reclaimer's "ticks", i.e. number of pages
+@@ -124,6 +131,7 @@ static enum vmpressure_levels vmpressure
+ pressure = scale - (reclaimed * scale / scanned);
+ pressure = pressure * 100 / scale;
+
++out:
+ pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure,
+ scanned, reclaimed);
+
--- /dev/null
+From fd538803731e50367b7c59ce4ad3454426a3d671 Mon Sep 17 00:00:00 2001
+From: Michal Hocko <mhocko@suse.com>
+Date: Wed, 22 Feb 2017 15:45:58 -0800
+Subject: mm, vmscan: cleanup lru size claculations
+
+From: Michal Hocko <mhocko@suse.com>
+
+commit fd538803731e50367b7c59ce4ad3454426a3d671 upstream.
+
+lruvec_lru_size returns the full size of the LRU list while we sometimes
+need a value reduced only to eligible zones (e.g. for lowmem requests).
+inactive_list_is_low is one such user. Later patches will add more of
+them. Add a new parameter to lruvec_lru_size and allow it to filter out
+zones which are not eligible for the given context.
+
+Link: http://lkml.kernel.org/r/20170117103702.28542-2-mhocko@kernel.org
+Signed-off-by: Michal Hocko <mhocko@suse.com>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
+Acked-by: Minchan Kim <minchan@kernel.org>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+
+---
+ include/linux/mmzone.h | 2 -
+ mm/vmscan.c | 81 +++++++++++++++++++++++--------------------------
+ mm/workingset.c | 2 -
+ 3 files changed, 41 insertions(+), 44 deletions(-)
+
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -779,7 +779,7 @@ static inline struct pglist_data *lruvec
+ #endif
+ }
+
+-extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru);
++extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx);
+
+ #ifdef CONFIG_HAVE_MEMORY_PRESENT
+ void memory_present(int nid, unsigned long start, unsigned long end);
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -234,22 +234,39 @@ bool pgdat_reclaimable(struct pglist_dat
+ pgdat_reclaimable_pages(pgdat) * 6;
+ }
+
+-unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
++/**
++ * lruvec_lru_size - Returns the number of pages on the given LRU list.
++ * @lruvec: lru vector
++ * @lru: lru to use
++ * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
++ */
++unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
+ {
++ unsigned long lru_size;
++ int zid;
++
+ if (!mem_cgroup_disabled())
+- return mem_cgroup_get_lru_size(lruvec, lru);
++ lru_size = mem_cgroup_get_lru_size(lruvec, lru);
++ else
++ lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
+
+- return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
+-}
++ for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
++ struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
++ unsigned long size;
+
+-unsigned long lruvec_zone_lru_size(struct lruvec *lruvec, enum lru_list lru,
+- int zone_idx)
+-{
+- if (!mem_cgroup_disabled())
+- return mem_cgroup_get_zone_lru_size(lruvec, lru, zone_idx);
++ if (!managed_zone(zone))
++ continue;
++
++ if (!mem_cgroup_disabled())
++ size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
++ else
++ size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid],
++ NR_ZONE_LRU_BASE + lru);
++ lru_size -= min(size, lru_size);
++ }
++
++ return lru_size;
+
+- return zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zone_idx],
+- NR_ZONE_LRU_BASE + lru);
+ }
+
+ /*
+@@ -2028,11 +2045,10 @@ static bool inactive_list_is_low(struct
+ struct scan_control *sc)
+ {
+ unsigned long inactive_ratio;
+- unsigned long inactive;
+- unsigned long active;
++ unsigned long inactive, active;
++ enum lru_list inactive_lru = file * LRU_FILE;
++ enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
+ unsigned long gb;
+- struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+- int zid;
+
+ /*
+ * If we don't have swap space, anonymous page deactivation
+@@ -2041,27 +2057,8 @@ static bool inactive_list_is_low(struct
+ if (!file && !total_swap_pages)
+ return false;
+
+- inactive = lruvec_lru_size(lruvec, file * LRU_FILE);
+- active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
+-
+- /*
+- * For zone-constrained allocations, it is necessary to check if
+- * deactivations are required for lowmem to be reclaimed. This
+- * calculates the inactive/active pages available in eligible zones.
+- */
+- for (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) {
+- struct zone *zone = &pgdat->node_zones[zid];
+- unsigned long inactive_zone, active_zone;
+-
+- if (!managed_zone(zone))
+- continue;
+-
+- inactive_zone = lruvec_zone_lru_size(lruvec, file * LRU_FILE, zid);
+- active_zone = lruvec_zone_lru_size(lruvec, (file * LRU_FILE) + LRU_ACTIVE, zid);
+-
+- inactive -= min(inactive, inactive_zone);
+- active -= min(active, active_zone);
+- }
++ inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
++ active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
+
+ gb = (inactive + active) >> (30 - PAGE_SHIFT);
+ if (gb)
+@@ -2208,7 +2205,7 @@ static void get_scan_count(struct lruvec
+ * system is under heavy pressure.
+ */
+ if (!inactive_list_is_low(lruvec, true, sc) &&
+- lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
++ lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES) >> sc->priority) {
+ scan_balance = SCAN_FILE;
+ goto out;
+ }
+@@ -2234,10 +2231,10 @@ static void get_scan_count(struct lruvec
+ * anon in [0], file in [1]
+ */
+
+- anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +
+- lruvec_lru_size(lruvec, LRU_INACTIVE_ANON);
+- file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
+- lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
++ anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
++ lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
++ file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
++ lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
+
+ spin_lock_irq(&pgdat->lru_lock);
+ if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
+@@ -2275,7 +2272,7 @@ out:
+ unsigned long size;
+ unsigned long scan;
+
+- size = lruvec_lru_size(lruvec, lru);
++ size = lruvec_lru_size(lruvec, lru, MAX_NR_ZONES);
+ scan = size >> sc->priority;
+
+ if (!scan && pass && force_scan)
+--- a/mm/workingset.c
++++ b/mm/workingset.c
+@@ -266,7 +266,7 @@ bool workingset_refault(void *shadow)
+ }
+ lruvec = mem_cgroup_lruvec(pgdat, memcg);
+ refault = atomic_long_read(&lruvec->inactive_age);
+- active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
++ active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);
+ rcu_read_unlock();
+
+ /*
--- /dev/null
+From 71ab6cfe88dcf9f6e6a65eb85cf2bda20a257682 Mon Sep 17 00:00:00 2001
+From: Michal Hocko <mhocko@suse.com>
+Date: Wed, 22 Feb 2017 15:46:01 -0800
+Subject: mm, vmscan: consider eligible zones in get_scan_count
+
+From: Michal Hocko <mhocko@suse.com>
+
+commit 71ab6cfe88dcf9f6e6a65eb85cf2bda20a257682 upstream.
+
+get_scan_count() considers the whole node LRU size when
+
+ - doing SCAN_FILE due to many page cache inactive pages
+ - calculating the number of pages to scan
+
+In both cases this might lead to unexpected behavior especially on 32b
+systems where we can expect lowmem memory pressure very often.
+
+A large highmem zone can easily distort SCAN_FILE heuristic because
+there might be only few file pages from the eligible zones on the node
+lru and we would still enforce file lru scanning which can lead to
+thrashing while we could still scan anonymous pages.
+
+The later use of lruvec_lru_size can be problematic as well. Especially
+when there are not many pages from the eligible zones. We would have to
+skip over many pages to find anything to reclaim but shrink_node_memcg
+would only reduce the remaining number to scan by SWAP_CLUSTER_MAX at
+maximum. Therefore we can end up going over a large LRU many times
+without actually having chance to reclaim much if anything at all. The
+closer we are out of memory on lowmem zone the worse the problem will
+be.
+
+Fix this by filtering out all the ineligible zones when calculating the
+lru size for both paths and consider only sc->reclaim_idx zones.
+
+The patch would need to be tweaked a bit to apply to 4.10 and older but
+I will do that as soon as it hits the Linus tree in the next merge
+window.
+
+Link: http://lkml.kernel.org/r/20170117103702.28542-3-mhocko@kernel.org
+Fixes: b2e18757f2c9 ("mm, vmscan: begin reclaiming pages on a per-node basis")
+Signed-off-by: Michal Hocko <mhocko@suse.com>
+Tested-by: Trevor Cordes <trevor@tecnopolis.ca>
+Acked-by: Minchan Kim <minchan@kernel.org>
+Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+
+---
+ mm/vmscan.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2205,7 +2205,7 @@ static void get_scan_count(struct lruvec
+ * system is under heavy pressure.
+ */
+ if (!inactive_list_is_low(lruvec, true, sc) &&
+- lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES) >> sc->priority) {
++ lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
+ scan_balance = SCAN_FILE;
+ goto out;
+ }
+@@ -2272,7 +2272,7 @@ out:
+ unsigned long size;
+ unsigned long scan;
+
+- size = lruvec_lru_size(lruvec, lru, MAX_NR_ZONES);
++ size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
+ scan = size >> sc->priority;
+
+ if (!scan && pass && force_scan)
--- /dev/null
+From bcf23c79c4e46130701370af4383b61a3cba755c Mon Sep 17 00:00:00 2001
+From: Chanwoo Choi <cw00.choi@samsung.com>
+Date: Tue, 31 Jan 2017 15:38:16 +0900
+Subject: PM / devfreq: Fix available_governor sysfs
+
+From: Chanwoo Choi <cw00.choi@samsung.com>
+
+commit bcf23c79c4e46130701370af4383b61a3cba755c upstream.
+
+The devfreq using passive governor is not able to change the governor.
+So, the user can not change the governor through 'available_governor' sysfs
+entry. Also, the devfreq which don't use the passive governor is not able to
+change to 'passive' governor on the fly.
+
+Fixes: 996133119f57 ("PM / devfreq: Add new passive governor")
+Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
+Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/devfreq/devfreq.c | 31 +++++++++++++++++++++++++++----
+ drivers/devfreq/governor_passive.c | 1 +
+ include/linux/devfreq.h | 3 +++
+ 3 files changed, 31 insertions(+), 4 deletions(-)
+
+--- a/drivers/devfreq/devfreq.c
++++ b/drivers/devfreq/devfreq.c
+@@ -939,6 +939,9 @@ static ssize_t governor_store(struct dev
+ if (df->governor == governor) {
+ ret = 0;
+ goto out;
++ } else if (df->governor->immutable || governor->immutable) {
++ ret = -EINVAL;
++ goto out;
+ }
+
+ if (df->governor) {
+@@ -968,13 +971,33 @@ static ssize_t available_governors_show(
+ struct device_attribute *attr,
+ char *buf)
+ {
+- struct devfreq_governor *tmp_governor;
++ struct devfreq *df = to_devfreq(d);
+ ssize_t count = 0;
+
+ mutex_lock(&devfreq_list_lock);
+- list_for_each_entry(tmp_governor, &devfreq_governor_list, node)
+- count += scnprintf(&buf[count], (PAGE_SIZE - count - 2),
+- "%s ", tmp_governor->name);
++
++ /*
++ * The devfreq with immutable governor (e.g., passive) shows
++ * only own governor.
++ */
++ if (df->governor->immutable) {
++ count = scnprintf(&buf[count], DEVFREQ_NAME_LEN,
++ "%s ", df->governor_name);
++ /*
++ * The devfreq device shows the registered governor except for
++ * immutable governors such as passive governor .
++ */
++ } else {
++ struct devfreq_governor *governor;
++
++ list_for_each_entry(governor, &devfreq_governor_list, node) {
++ if (governor->immutable)
++ continue;
++ count += scnprintf(&buf[count], (PAGE_SIZE - count - 2),
++ "%s ", governor->name);
++ }
++ }
++
+ mutex_unlock(&devfreq_list_lock);
+
+ /* Truncate the trailing space */
+--- a/drivers/devfreq/governor_passive.c
++++ b/drivers/devfreq/governor_passive.c
+@@ -179,6 +179,7 @@ static int devfreq_passive_event_handler
+
+ static struct devfreq_governor devfreq_passive = {
+ .name = "passive",
++ .immutable = 1,
+ .get_target_freq = devfreq_passive_get_target_freq,
+ .event_handler = devfreq_passive_event_handler,
+ };
+--- a/include/linux/devfreq.h
++++ b/include/linux/devfreq.h
+@@ -104,6 +104,8 @@ struct devfreq_dev_profile {
+ * struct devfreq_governor - Devfreq policy governor
+ * @node: list node - contains registered devfreq governors
+ * @name: Governor's name
++ * @immutable: Immutable flag for governor. If the value is 1,
++ * this governor is never changeable to other governor.
+ * @get_target_freq: Returns desired operating frequency for the device.
+ * Basically, get_target_freq will run
+ * devfreq_dev_profile.get_dev_status() to get the
+@@ -121,6 +123,7 @@ struct devfreq_governor {
+ struct list_head node;
+
+ const char name[DEVFREQ_NAME_LEN];
++ const unsigned int immutable;
+ int (*get_target_freq)(struct devfreq *this, unsigned long *freq);
+ int (*event_handler)(struct devfreq *devfreq,
+ unsigned int event, void *data);
--- /dev/null
+From 30582c25a4b4e0a5e456a309fde79b845e9473b2 Mon Sep 17 00:00:00 2001
+From: Chanwoo Choi <cw00.choi@samsung.com>
+Date: Tue, 31 Jan 2017 15:38:17 +0900
+Subject: PM / devfreq: Fix wrong trans_stat of passive devfreq device
+
+From: Chanwoo Choi <cw00.choi@samsung.com>
+
+commit 30582c25a4b4e0a5e456a309fde79b845e9473b2 upstream.
+
+Until now, the trans_stat information of passive devfreq is not updated.
+This patch updates the trans_stat information after setting the target
+frequency of passive devfreq device.
+
+Fixes: 996133119f57 ("PM / devfreq: Add new passive governor")
+Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
+Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/devfreq/devfreq.c | 3 ++-
+ drivers/devfreq/governor.h | 2 ++
+ drivers/devfreq/governor_passive.c | 5 +++++
+ 3 files changed, 9 insertions(+), 1 deletion(-)
+
+--- a/drivers/devfreq/devfreq.c
++++ b/drivers/devfreq/devfreq.c
+@@ -130,7 +130,7 @@ static void devfreq_set_freq_table(struc
+ * @devfreq: the devfreq instance
+ * @freq: the update target frequency
+ */
+-static int devfreq_update_status(struct devfreq *devfreq, unsigned long freq)
++int devfreq_update_status(struct devfreq *devfreq, unsigned long freq)
+ {
+ int lev, prev_lev, ret = 0;
+ unsigned long cur_time;
+@@ -166,6 +166,7 @@ out:
+ devfreq->last_stat_updated = cur_time;
+ return ret;
+ }
++EXPORT_SYMBOL(devfreq_update_status);
+
+ /**
+ * find_devfreq_governor() - find devfreq governor from name
+--- a/drivers/devfreq/governor.h
++++ b/drivers/devfreq/governor.h
+@@ -38,4 +38,6 @@ extern void devfreq_interval_update(stru
+ extern int devfreq_add_governor(struct devfreq_governor *governor);
+ extern int devfreq_remove_governor(struct devfreq_governor *governor);
+
++extern int devfreq_update_status(struct devfreq *devfreq, unsigned long freq);
++
+ #endif /* _GOVERNOR_H */
+--- a/drivers/devfreq/governor_passive.c
++++ b/drivers/devfreq/governor_passive.c
+@@ -112,6 +112,11 @@ static int update_devfreq_passive(struct
+ if (ret < 0)
+ goto out;
+
++ if (devfreq->profile->freq_table
++ && (devfreq_update_status(devfreq, freq)))
++ dev_err(&devfreq->dev,
++ "Couldn't update frequency transition information.\n");
++
+ devfreq->previous_freq = freq;
+
+ out:
--- /dev/null
+From 0b0408745e7ff24757cbfd571d69026c0ddb803c Mon Sep 17 00:00:00 2001
+From: Alexandre Belloni <alexandre.belloni@free-electrons.com>
+Date: Tue, 25 Oct 2016 11:37:59 +0200
+Subject: power: reset: at91-poweroff: timely shutdown LPDDR memories
+
+From: Alexandre Belloni <alexandre.belloni@free-electrons.com>
+
+commit 0b0408745e7ff24757cbfd571d69026c0ddb803c upstream.
+
+LPDDR memories can only handle up to 400 uncontrolled power off. Ensure the
+proper power off sequence is used before shutting down the platform.
+
+Signed-off-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
+Signed-off-by: Sebastian Reichel <sre@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/power/reset/Kconfig | 2 -
+ drivers/power/reset/at91-poweroff.c | 54 ++++++++++++++++++++++++++++++-
+ drivers/power/reset/at91-sama5d2_shdwc.c | 49 +++++++++++++++++++++++++++-
+ 3 files changed, 102 insertions(+), 3 deletions(-)
+
+--- a/drivers/power/reset/Kconfig
++++ b/drivers/power/reset/Kconfig
+@@ -32,7 +32,7 @@ config POWER_RESET_AT91_RESET
+
+ config POWER_RESET_AT91_SAMA5D2_SHDWC
+ tristate "Atmel AT91 SAMA5D2-Compatible shutdown controller driver"
+- depends on ARCH_AT91 || COMPILE_TEST
++ depends on ARCH_AT91
+ default SOC_SAMA5
+ help
+ This driver supports the alternate shutdown controller for some Atmel
+--- a/drivers/power/reset/at91-poweroff.c
++++ b/drivers/power/reset/at91-poweroff.c
+@@ -14,9 +14,12 @@
+ #include <linux/io.h>
+ #include <linux/module.h>
+ #include <linux/of.h>
++#include <linux/of_address.h>
+ #include <linux/platform_device.h>
+ #include <linux/printk.h>
+
++#include <soc/at91/at91sam9_ddrsdr.h>
++
+ #define AT91_SHDW_CR 0x00 /* Shut Down Control Register */
+ #define AT91_SHDW_SHDW BIT(0) /* Shut Down command */
+ #define AT91_SHDW_KEY (0xa5 << 24) /* KEY Password */
+@@ -50,6 +53,7 @@ static const char *shdwc_wakeup_modes[]
+
+ static void __iomem *at91_shdwc_base;
+ static struct clk *sclk;
++static void __iomem *mpddrc_base;
+
+ static void __init at91_wakeup_status(void)
+ {
+@@ -73,6 +77,29 @@ static void at91_poweroff(void)
+ writel(AT91_SHDW_KEY | AT91_SHDW_SHDW, at91_shdwc_base + AT91_SHDW_CR);
+ }
+
++static void at91_lpddr_poweroff(void)
++{
++ asm volatile(
++ /* Align to cache lines */
++ ".balign 32\n\t"
++
++ /* Ensure AT91_SHDW_CR is in the TLB by reading it */
++ " ldr r6, [%2, #" __stringify(AT91_SHDW_CR) "]\n\t"
++
++ /* Power down SDRAM0 */
++ " str %1, [%0, #" __stringify(AT91_DDRSDRC_LPR) "]\n\t"
++ /* Shutdown CPU */
++ " str %3, [%2, #" __stringify(AT91_SHDW_CR) "]\n\t"
++
++ " b .\n\t"
++ :
++ : "r" (mpddrc_base),
++ "r" cpu_to_le32(AT91_DDRSDRC_LPDDR2_PWOFF),
++ "r" (at91_shdwc_base),
++ "r" cpu_to_le32(AT91_SHDW_KEY | AT91_SHDW_SHDW)
++ : "r6");
++}
++
+ static int at91_poweroff_get_wakeup_mode(struct device_node *np)
+ {
+ const char *pm;
+@@ -124,6 +151,8 @@ static void at91_poweroff_dt_set_wakeup_
+ static int __init at91_poweroff_probe(struct platform_device *pdev)
+ {
+ struct resource *res;
++ struct device_node *np;
++ u32 ddr_type;
+ int ret;
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+@@ -150,12 +179,30 @@ static int __init at91_poweroff_probe(st
+
+ pm_power_off = at91_poweroff;
+
++ np = of_find_compatible_node(NULL, NULL, "atmel,sama5d3-ddramc");
++ if (!np)
++ return 0;
++
++ mpddrc_base = of_iomap(np, 0);
++ of_node_put(np);
++
++ if (!mpddrc_base)
++ return 0;
++
++ ddr_type = readl(mpddrc_base + AT91_DDRSDRC_MDR) & AT91_DDRSDRC_MD;
++ if ((ddr_type == AT91_DDRSDRC_MD_LPDDR2) ||
++ (ddr_type == AT91_DDRSDRC_MD_LPDDR3))
++ pm_power_off = at91_lpddr_poweroff;
++ else
++ iounmap(mpddrc_base);
++
+ return 0;
+ }
+
+ static int __exit at91_poweroff_remove(struct platform_device *pdev)
+ {
+- if (pm_power_off == at91_poweroff)
++ if (pm_power_off == at91_poweroff ||
++ pm_power_off == at91_lpddr_poweroff)
+ pm_power_off = NULL;
+
+ clk_disable_unprepare(sclk);
+@@ -163,6 +210,11 @@ static int __exit at91_poweroff_remove(s
+ return 0;
+ }
+
++static const struct of_device_id at91_ramc_of_match[] = {
++ { .compatible = "atmel,sama5d3-ddramc", },
++ { /* sentinel */ }
++};
++
+ static const struct of_device_id at91_poweroff_of_match[] = {
+ { .compatible = "atmel,at91sam9260-shdwc", },
+ { .compatible = "atmel,at91sam9rl-shdwc", },
+--- a/drivers/power/reset/at91-sama5d2_shdwc.c
++++ b/drivers/power/reset/at91-sama5d2_shdwc.c
+@@ -22,9 +22,12 @@
+ #include <linux/io.h>
+ #include <linux/module.h>
+ #include <linux/of.h>
++#include <linux/of_address.h>
+ #include <linux/platform_device.h>
+ #include <linux/printk.h>
+
++#include <soc/at91/at91sam9_ddrsdr.h>
++
+ #define SLOW_CLOCK_FREQ 32768
+
+ #define AT91_SHDW_CR 0x00 /* Shut Down Control Register */
+@@ -75,6 +78,7 @@ struct shdwc {
+ */
+ static struct shdwc *at91_shdwc;
+ static struct clk *sclk;
++static void __iomem *mpddrc_base;
+
+ static const unsigned long long sdwc_dbc_period[] = {
+ 0, 3, 32, 512, 4096, 32768,
+@@ -108,6 +112,29 @@ static void at91_poweroff(void)
+ at91_shdwc->at91_shdwc_base + AT91_SHDW_CR);
+ }
+
++static void at91_lpddr_poweroff(void)
++{
++ asm volatile(
++ /* Align to cache lines */
++ ".balign 32\n\t"
++
++ /* Ensure AT91_SHDW_CR is in the TLB by reading it */
++ " ldr r6, [%2, #" __stringify(AT91_SHDW_CR) "]\n\t"
++
++ /* Power down SDRAM0 */
++ " str %1, [%0, #" __stringify(AT91_DDRSDRC_LPR) "]\n\t"
++ /* Shutdown CPU */
++ " str %3, [%2, #" __stringify(AT91_SHDW_CR) "]\n\t"
++
++ " b .\n\t"
++ :
++ : "r" (mpddrc_base),
++ "r" cpu_to_le32(AT91_DDRSDRC_LPDDR2_PWOFF),
++ "r" (at91_shdwc->at91_shdwc_base),
++ "r" cpu_to_le32(AT91_SHDW_KEY | AT91_SHDW_SHDW)
++ : "r6");
++}
++
+ static u32 at91_shdwc_debouncer_value(struct platform_device *pdev,
+ u32 in_period_us)
+ {
+@@ -212,6 +239,8 @@ static int __init at91_shdwc_probe(struc
+ {
+ struct resource *res;
+ const struct of_device_id *match;
++ struct device_node *np;
++ u32 ddr_type;
+ int ret;
+
+ if (!pdev->dev.of_node)
+@@ -249,6 +278,23 @@ static int __init at91_shdwc_probe(struc
+
+ pm_power_off = at91_poweroff;
+
++ np = of_find_compatible_node(NULL, NULL, "atmel,sama5d3-ddramc");
++ if (!np)
++ return 0;
++
++ mpddrc_base = of_iomap(np, 0);
++ of_node_put(np);
++
++ if (!mpddrc_base)
++ return 0;
++
++ ddr_type = readl(mpddrc_base + AT91_DDRSDRC_MDR) & AT91_DDRSDRC_MD;
++ if ((ddr_type == AT91_DDRSDRC_MD_LPDDR2) ||
++ (ddr_type == AT91_DDRSDRC_MD_LPDDR3))
++ pm_power_off = at91_lpddr_poweroff;
++ else
++ iounmap(mpddrc_base);
++
+ return 0;
+ }
+
+@@ -256,7 +302,8 @@ static int __exit at91_shdwc_remove(stru
+ {
+ struct shdwc *shdw = platform_get_drvdata(pdev);
+
+- if (pm_power_off == at91_poweroff)
++ if (pm_power_off == at91_poweroff ||
++ pm_power_off == at91_lpddr_poweroff)
+ pm_power_off = NULL;
+
+ /* Reset values to disable wake-up features */
--- /dev/null
+From c421530bf848604e97d0785a03b3fe2c62775083 Mon Sep 17 00:00:00 2001
+From: Raghava Aditya Renukunta <RaghavaAditya.Renukunta@microsemi.com>
+Date: Thu, 16 Feb 2017 12:51:21 -0800
+Subject: scsi: aacraid: Reorder Adapter status check
+
+From: Raghava Aditya Renukunta <RaghavaAditya.Renukunta@microsemi.com>
+
+commit c421530bf848604e97d0785a03b3fe2c62775083 upstream.
+
+The driver currently checks the SELF_TEST_FAILED first and then
+KERNEL_PANIC next. Under error conditions(boot code failure) both
+SELF_TEST_FAILED and KERNEL_PANIC can be set at the same time.
+
+The driver has the capability to reset the controller on an KERNEL_PANIC,
+but not on SELF_TEST_FAILED.
+
+Fixed by first checking KERNEL_PANIC and then the others.
+
+Fixes: e8b12f0fb835223752 ([SCSI] aacraid: Add new code for PMC-Sierra's SRC base controller family)
+Signed-off-by: Raghava Aditya Renukunta <RaghavaAditya.Renukunta@microsemi.com>
+Reviewed-by: David Carroll <David.Carroll@microsemi.com>
+Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/scsi/aacraid/src.c | 21 +++++++++++++++++----
+ 1 file changed, 17 insertions(+), 4 deletions(-)
+
+--- a/drivers/scsi/aacraid/src.c
++++ b/drivers/scsi/aacraid/src.c
+@@ -414,16 +414,23 @@ static int aac_src_check_health(struct a
+ u32 status = src_readl(dev, MUnit.OMR);
+
+ /*
++ * Check to see if the board panic'd.
++ */
++ if (unlikely(status & KERNEL_PANIC))
++ goto err_blink;
++
++ /*
+ * Check to see if the board failed any self tests.
+ */
+ if (unlikely(status & SELF_TEST_FAILED))
+- return -1;
++ goto err_out;
+
+ /*
+- * Check to see if the board panic'd.
++ * Check to see if the monitor panic'd.
+ */
+- if (unlikely(status & KERNEL_PANIC))
+- return (status >> 16) & 0xFF;
++ if (unlikely(status & MONITOR_PANIC))
++ goto err_out;
++
+ /*
+ * Wait for the adapter to be up and running.
+ */
+@@ -433,6 +440,12 @@ static int aac_src_check_health(struct a
+ * Everything is OK
+ */
+ return 0;
++
++err_out:
++ return -1;
++
++err_blink:
++ return (status >> 16) & 0xFF;
+ }
+
+ /**
--- /dev/null
+From bba5dc332ec2d3a685cb4dae668c793f6a3713a3 Mon Sep 17 00:00:00 2001
+From: Long Li <longli@microsoft.com>
+Date: Wed, 14 Dec 2016 18:46:02 -0800
+Subject: scsi: storvsc: properly handle SRB_ERROR when sense message is present
+
+From: Long Li <longli@microsoft.com>
+
+commit bba5dc332ec2d3a685cb4dae668c793f6a3713a3 upstream.
+
+When sense message is present on error, we should pass along to the upper
+layer to decide how to deal with the error.
+This patch fixes connectivity issues with Fiber Channel devices.
+
+Signed-off-by: Long Li <longli@microsoft.com>
+Reviewed-by: K. Y. Srinivasan <kys@microsoft.com>
+Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/scsi/storvsc_drv.c | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+--- a/drivers/scsi/storvsc_drv.c
++++ b/drivers/scsi/storvsc_drv.c
+@@ -891,6 +891,13 @@ static void storvsc_handle_error(struct
+ switch (SRB_STATUS(vm_srb->srb_status)) {
+ case SRB_STATUS_ERROR:
+ /*
++ * Let upper layer deal with error when
++ * sense message is present.
++ */
++
++ if (vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID)
++ break;
++ /*
+ * If there is an error; offline the device since all
+ * error recovery strategies would have already been
+ * deployed on the host side. However, if the command
--- /dev/null
+From 40630f462824ee24bc00d692865c86c3828094e0 Mon Sep 17 00:00:00 2001
+From: Long Li <longli@microsoft.com>
+Date: Wed, 14 Dec 2016 18:46:03 -0800
+Subject: scsi: storvsc: properly set residual data length on errors
+
+From: Long Li <longli@microsoft.com>
+
+commit 40630f462824ee24bc00d692865c86c3828094e0 upstream.
+
+On I/O errors, the Windows driver doesn't set data_transfer_length
+on error conditions other than SRB_STATUS_DATA_OVERRUN.
+In these cases we need to set data_transfer_length to 0,
+indicating there is no data transferred. On SRB_STATUS_DATA_OVERRUN,
+data_transfer_length is set by the Windows driver to the actual data transferred.
+
+Reported-by: Shiva Krishna <Shiva.Krishna@nimblestorage.com>
+Signed-off-by: Long Li <longli@microsoft.com>
+Reviewed-by: K. Y. Srinivasan <kys@microsoft.com>
+Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/scsi/storvsc_drv.c | 16 +++++++++++++---
+ 1 file changed, 13 insertions(+), 3 deletions(-)
+
+--- a/drivers/scsi/storvsc_drv.c
++++ b/drivers/scsi/storvsc_drv.c
+@@ -377,6 +377,7 @@ enum storvsc_request_type {
+ #define SRB_STATUS_SUCCESS 0x01
+ #define SRB_STATUS_ABORTED 0x02
+ #define SRB_STATUS_ERROR 0x04
++#define SRB_STATUS_DATA_OVERRUN 0x12
+
+ #define SRB_STATUS(status) \
+ (status & ~(SRB_STATUS_AUTOSENSE_VALID | SRB_STATUS_QUEUE_FROZEN))
+@@ -962,6 +963,7 @@ static void storvsc_command_completion(s
+ struct scsi_cmnd *scmnd = cmd_request->cmd;
+ struct scsi_sense_hdr sense_hdr;
+ struct vmscsi_request *vm_srb;
++ u32 data_transfer_length;
+ struct Scsi_Host *host;
+ u32 payload_sz = cmd_request->payload_sz;
+ void *payload = cmd_request->payload;
+@@ -969,6 +971,7 @@ static void storvsc_command_completion(s
+ host = stor_dev->host;
+
+ vm_srb = &cmd_request->vstor_packet.vm_srb;
++ data_transfer_length = vm_srb->data_transfer_length;
+
+ scmnd->result = vm_srb->scsi_status;
+
+@@ -982,13 +985,20 @@ static void storvsc_command_completion(s
+ &sense_hdr);
+ }
+
+- if (vm_srb->srb_status != SRB_STATUS_SUCCESS)
++ if (vm_srb->srb_status != SRB_STATUS_SUCCESS) {
+ storvsc_handle_error(vm_srb, scmnd, host, sense_hdr.asc,
+ sense_hdr.ascq);
++ /*
++ * The Windows driver set data_transfer_length on
++ * SRB_STATUS_DATA_OVERRUN. On other errors, this value
++ * is untouched. In these cases we set it to 0.
++ */
++ if (vm_srb->srb_status != SRB_STATUS_DATA_OVERRUN)
++ data_transfer_length = 0;
++ }
+
+ scsi_set_resid(scmnd,
+- cmd_request->payload->range.len -
+- vm_srb->data_transfer_length);
++ cmd_request->payload->range.len - data_transfer_length);
+
+ scmnd->scsi_done(scmnd);
+
--- /dev/null
+From 3cd6d3d9b1abab8dcdf0800224ce26daac24eea2 Mon Sep 17 00:00:00 2001
+From: Long Li <longli@microsoft.com>
+Date: Wed, 14 Dec 2016 18:46:01 -0800
+Subject: scsi: storvsc: use tagged SRB requests if supported by the device
+
+From: Long Li <longli@microsoft.com>
+
+commit 3cd6d3d9b1abab8dcdf0800224ce26daac24eea2 upstream.
+
+Properly set SRB flags when hosting device supports tagged queuing.
+This patch improves the performance on Fiber Channel disks.
+
+Signed-off-by: Long Li <longli@microsoft.com>
+Reviewed-by: K. Y. Srinivasan <kys@microsoft.com>
+Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/scsi/storvsc_drv.c | 9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+--- a/drivers/scsi/storvsc_drv.c
++++ b/drivers/scsi/storvsc_drv.c
+@@ -136,6 +136,8 @@ struct hv_fc_wwn_packet {
+ #define SRB_FLAGS_PORT_DRIVER_RESERVED 0x0F000000
+ #define SRB_FLAGS_CLASS_DRIVER_RESERVED 0xF0000000
+
++#define SP_UNTAGGED ((unsigned char) ~0)
++#define SRB_SIMPLE_TAG_REQUEST 0x20
+
+ /*
+ * Platform neutral description of a scsi request -
+@@ -1451,6 +1453,13 @@ static int storvsc_queuecommand(struct S
+ vm_srb->win8_extension.srb_flags |=
+ SRB_FLAGS_DISABLE_SYNCH_TRANSFER;
+
++ if (scmnd->device->tagged_supported) {
++ vm_srb->win8_extension.srb_flags |=
++ (SRB_FLAGS_QUEUE_ACTION_ENABLE | SRB_FLAGS_NO_QUEUE_FREEZE);
++ vm_srb->win8_extension.queue_tag = SP_UNTAGGED;
++ vm_srb->win8_extension.queue_action = SRB_SIMPLE_TAG_REQUEST;
++ }
++
+ /* Build the SRB */
+ switch (scmnd->sc_data_direction) {
+ case DMA_TO_DEVICE:
--- /dev/null
+From 857de6e00778738dc3d61f75acbac35bdc48e533 Mon Sep 17 00:00:00 2001
+From: Hannes Reinecke <hare@suse.de>
+Date: Fri, 17 Feb 2017 09:02:45 +0100
+Subject: scsi: use 'scsi_device_from_queue()' for scsi_dh
+
+From: Hannes Reinecke <hare@suse.de>
+
+commit 857de6e00778738dc3d61f75acbac35bdc48e533 upstream.
+
+The device handler needs to check if a given queue belongs to a scsi
+device; only then does it make sense to attach a device handler.
+
+[mkp: dropped flags]
+
+Signed-off-by: Hannes Reinecke <hare@suse.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/scsi/scsi_dh.c | 22 ++++------------------
+ drivers/scsi/scsi_lib.c | 23 +++++++++++++++++++++++
+ include/scsi/scsi_device.h | 1 +
+ 3 files changed, 28 insertions(+), 18 deletions(-)
+
+--- a/drivers/scsi/scsi_dh.c
++++ b/drivers/scsi/scsi_dh.c
+@@ -219,20 +219,6 @@ int scsi_unregister_device_handler(struc
+ }
+ EXPORT_SYMBOL_GPL(scsi_unregister_device_handler);
+
+-static struct scsi_device *get_sdev_from_queue(struct request_queue *q)
+-{
+- struct scsi_device *sdev;
+- unsigned long flags;
+-
+- spin_lock_irqsave(q->queue_lock, flags);
+- sdev = q->queuedata;
+- if (!sdev || !get_device(&sdev->sdev_gendev))
+- sdev = NULL;
+- spin_unlock_irqrestore(q->queue_lock, flags);
+-
+- return sdev;
+-}
+-
+ /*
+ * scsi_dh_activate - activate the path associated with the scsi_device
+ * corresponding to the given request queue.
+@@ -251,7 +237,7 @@ int scsi_dh_activate(struct request_queu
+ struct scsi_device *sdev;
+ int err = SCSI_DH_NOSYS;
+
+- sdev = get_sdev_from_queue(q);
++ sdev = scsi_device_from_queue(q);
+ if (!sdev) {
+ if (fn)
+ fn(data, err);
+@@ -298,7 +284,7 @@ int scsi_dh_set_params(struct request_qu
+ struct scsi_device *sdev;
+ int err = -SCSI_DH_NOSYS;
+
+- sdev = get_sdev_from_queue(q);
++ sdev = scsi_device_from_queue(q);
+ if (!sdev)
+ return err;
+
+@@ -321,7 +307,7 @@ int scsi_dh_attach(struct request_queue
+ struct scsi_device_handler *scsi_dh;
+ int err = 0;
+
+- sdev = get_sdev_from_queue(q);
++ sdev = scsi_device_from_queue(q);
+ if (!sdev)
+ return -ENODEV;
+
+@@ -359,7 +345,7 @@ const char *scsi_dh_attached_handler_nam
+ struct scsi_device *sdev;
+ const char *handler_name = NULL;
+
+- sdev = get_sdev_from_queue(q);
++ sdev = scsi_device_from_queue(q);
+ if (!sdev)
+ return NULL;
+
+--- a/drivers/scsi/scsi_lib.c
++++ b/drivers/scsi/scsi_lib.c
+@@ -2127,6 +2127,29 @@ void scsi_mq_destroy_tags(struct Scsi_Ho
+ blk_mq_free_tag_set(&shost->tag_set);
+ }
+
++/**
++ * scsi_device_from_queue - return sdev associated with a request_queue
++ * @q: The request queue to return the sdev from
++ *
++ * Return the sdev associated with a request queue, with a reference held
++ * on it, or NULL if the request_queue does not reference a SCSI device.
++ */
++struct scsi_device *scsi_device_from_queue(struct request_queue *q)
++{
++ struct scsi_device *sdev = NULL;
++
++ if (q->mq_ops) {
++ if (q->mq_ops == &scsi_mq_ops)
++ sdev = q->queuedata;
++ } else if (q->request_fn == scsi_request_fn)
++ sdev = q->queuedata;
++ if (!sdev || !get_device(&sdev->sdev_gendev))
++ sdev = NULL;
++
++ return sdev;
++}
++EXPORT_SYMBOL_GPL(scsi_device_from_queue);
++
+ /*
+ * Function: scsi_block_requests()
+ *
+--- a/include/scsi/scsi_device.h
++++ b/include/scsi/scsi_device.h
+@@ -315,6 +315,7 @@ extern void scsi_remove_device(struct sc
+ extern int scsi_unregister_device_handler(struct scsi_device_handler *scsi_dh);
+ void scsi_attach_vpd(struct scsi_device *sdev);
+
++extern struct scsi_device *scsi_device_from_queue(struct request_queue *q);
+ extern int scsi_device_get(struct scsi_device *);
+ extern void scsi_device_put(struct scsi_device *);
+ extern struct scsi_device *scsi_device_lookup(struct Scsi_Host *,
iommu-vt-d-fix-some-macros-that-are-incorrectly-specified-in-intel-iommu.patch
iommu-vt-d-tylersburg-isoch-identity-map-check-is-done-too-late.patch
cifs-fix-splice-read-for-non-cached-files.patch
+mm-devm_memremap_pages-hold-device_hotplug-lock-over-mem_hotplug_-begin-done.patch
+mm-page_alloc-fix-nodes-for-reclaim-in-fast-path.patch
+mm-vmpressure-fix-sending-wrong-events-on-underflow.patch
+mm-do-not-access-page-mapping-directly-on-page_endio.patch
+mm-balloon-umount-balloon_mnt-when-removing-vb-device.patch
+mm-vmscan-cleanup-lru-size-claculations.patch
+mm-vmscan-consider-eligible-zones-in-get_scan_count.patch
+sigaltstack-support-ss_autodisarm-for-config_compat.patch
+ipc-shm-fix-shmat-mmap-nil-page-protection.patch
+ima-fix-ima_d_path-possible-race-with-rename.patch
+pm-devfreq-fix-available_governor-sysfs.patch
+pm-devfreq-fix-wrong-trans_stat-of-passive-devfreq-device.patch
+dm-cache-fix-corruption-seen-when-using-cache-2tb.patch
+dm-stats-fix-a-leaked-s-histogram_boundaries-array.patch
+dm-round-robin-revert-use-percpu-repeat_count-and-current_path.patch
+dm-raid-fix-data-corruption-on-reshape-request.patch
+scsi-storvsc-use-tagged-srb-requests-if-supported-by-the-device.patch
+scsi-storvsc-properly-handle-srb_error-when-sense-message-is-present.patch
+scsi-storvsc-properly-set-residual-data-length-on-errors.patch
+scsi-aacraid-reorder-adapter-status-check.patch
+scsi-use-scsi_device_from_queue-for-scsi_dh.patch
+power-reset-at91-poweroff-timely-shutdown-lpddr-memories.patch
+fix-disable-sys_membarrier-when-nohz_full-is-enabled.patch
--- /dev/null
+From 441398d378f29a5ad6d0fcda07918e54e4961800 Mon Sep 17 00:00:00 2001
+From: Stas Sergeev <stsp@list.ru>
+Date: Mon, 27 Feb 2017 14:27:25 -0800
+Subject: sigaltstack: support SS_AUTODISARM for CONFIG_COMPAT
+
+From: Stas Sergeev <stsp@list.ru>
+
+commit 441398d378f29a5ad6d0fcda07918e54e4961800 upstream.
+
+Currently SS_AUTODISARM is not supported in compatibility mode, but does
+not return -EINVAL either. This causes dosemu built with -m32 on x86_64
+to crash. Also the kernel's sigaltstack selftest fails if compiled with
+-m32.
+
+This patch adds the needed support.
+
+Link: http://lkml.kernel.org/r/20170205101213.8163-2-stsp@list.ru
+Signed-off-by: Stas Sergeev <stsp@users.sourceforge.net>
+Cc: Milosz Tanski <milosz@adfin.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Al Viro <viro@zeniv.linux.org.uk>
+Cc: Arnd Bergmann <arnd@arndb.de>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Ingo Molnar <mingo@kernel.org>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Nicolas Pitre <nicolas.pitre@linaro.org>
+Cc: Waiman Long <Waiman.Long@hpe.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: Dmitry Safonov <dsafonov@virtuozzo.com>
+Cc: Wang Xiaoqiang <wangxq10@lzu.edu.cn>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/compat.h | 4 +++-
+ kernel/signal.c | 11 +++++++++--
+ 2 files changed, 12 insertions(+), 3 deletions(-)
+
+--- a/include/linux/compat.h
++++ b/include/linux/compat.h
+@@ -711,8 +711,10 @@ int __compat_save_altstack(compat_stack_
+ compat_stack_t __user *__uss = uss; \
+ struct task_struct *t = current; \
+ put_user_ex(ptr_to_compat((void __user *)t->sas_ss_sp), &__uss->ss_sp); \
+- put_user_ex(sas_ss_flags(sp), &__uss->ss_flags); \
++ put_user_ex(t->sas_ss_flags, &__uss->ss_flags); \
+ put_user_ex(t->sas_ss_size, &__uss->ss_size); \
++ if (t->sas_ss_flags & SS_AUTODISARM) \
++ sas_ss_reset(t); \
+ } while (0);
+
+ asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,
+--- a/kernel/signal.c
++++ b/kernel/signal.c
+@@ -3226,10 +3226,17 @@ int compat_restore_altstack(const compat
+
+ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
+ {
++ int err;
+ struct task_struct *t = current;
+- return __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) |
+- __put_user(sas_ss_flags(sp), &uss->ss_flags) |
++ err = __put_user(ptr_to_compat((void __user *)t->sas_ss_sp),
++ &uss->ss_sp) |
++ __put_user(t->sas_ss_flags, &uss->ss_flags) |
+ __put_user(t->sas_ss_size, &uss->ss_size);
++ if (err)
++ return err;
++ if (t->sas_ss_flags & SS_AUTODISARM)
++ sas_ss_reset(t);
++ return 0;
+ }
+ #endif
+