--- /dev/null
+From 1fdf41941b8010691679638f8d0c8d08cfee7726 Mon Sep 17 00:00:00 2001
+From: Omar Sandoval <osandov@fb.com>
+Date: Wed, 25 Jan 2017 17:06:39 -0800
+Subject: Btrfs: disable xattr operations on subvolume directories
+
+From: Omar Sandoval <osandov@fb.com>
+
+commit 1fdf41941b8010691679638f8d0c8d08cfee7726 upstream.
+
+When you snapshot a subvolume containing a subvolume, you get a
+placeholder directory where the subvolume would be. These directory
+inodes have ->i_op set to btrfs_dir_ro_inode_operations. Previously,
+these i_ops didn't include the xattr operation callbacks. The conversion
+to xattr_handlers missed this case, leading to bogus attempts to set
+xattrs on these inodes. This manifested itself as failures when running
+delayed inodes.
+
+To fix this, clear IOP_XATTR in ->i_opflags on these inodes.
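+
+For context, a rough sketch of the VFS-side check that IOP_XATTR gates
+(condensed from memory from fs/xattr.c as changed by 6c6ef9f26e59; the
+exact code may differ):
+
+  static const struct xattr_handler *
+  xattr_resolve_name(struct inode *inode, const char **name)
+  {
+          if (!(inode->i_opflags & IOP_XATTR))
+                  return ERR_PTR(-EOPNOTSUPP);
+          /* ... otherwise look up the handler in inode->i_sb->s_xattr ... */
+  }
+
+With IOP_XATTR cleared on the placeholder directories, xattr operations
+fail cleanly with -EOPNOTSUPP rather than reaching btrfs code paths that
+cannot handle these placeholder inodes.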
+
+Fixes: 6c6ef9f26e59 ("xattr: Stop calling {get,set,remove}xattr inode operations")
+Cc: Andreas Gruenbacher <agruenba@redhat.com>
+Reported-by: Chris Murphy <lists@colorremedies.com>
+Tested-by: Chris Murphy <lists@colorremedies.com>
+Signed-off-by: Omar Sandoval <osandov@fb.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Chris Mason <clm@fb.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/inode.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -5679,6 +5679,7 @@ static struct inode *new_simple_dir(stru
+
+ inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
+ inode->i_op = &btrfs_dir_ro_inode_operations;
++ inode->i_opflags &= ~IOP_XATTR;
+ inode->i_fop = &simple_dir_operations;
+ inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
+ inode->i_mtime = current_time(inode);
--- /dev/null
+From 57b59ed2e5b91e958843609c7884794e29e6c4cb Mon Sep 17 00:00:00 2001
+From: Omar Sandoval <osandov@fb.com>
+Date: Wed, 25 Jan 2017 17:06:40 -0800
+Subject: Btrfs: remove ->{get, set}_acl() from btrfs_dir_ro_inode_operations
+
+From: Omar Sandoval <osandov@fb.com>
+
+commit 57b59ed2e5b91e958843609c7884794e29e6c4cb upstream.
+
+Subvolume directory inodes can't have ACLs.
+
+Signed-off-by: Omar Sandoval <osandov@fb.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Chris Mason <clm@fb.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/inode.c | 2 --
+ 1 file changed, 2 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -10585,8 +10585,6 @@ static const struct inode_operations btr
+ static const struct inode_operations btrfs_dir_ro_inode_operations = {
+ .lookup = btrfs_lookup,
+ .permission = btrfs_permission,
+- .get_acl = btrfs_get_acl,
+- .set_acl = btrfs_set_acl,
+ .update_time = btrfs_update_time,
+ };
+
--- /dev/null
+From 67ade058ef2c65a3e56878af9c293ec76722a2e5 Mon Sep 17 00:00:00 2001
+From: Omar Sandoval <osandov@fb.com>
+Date: Wed, 25 Jan 2017 17:06:38 -0800
+Subject: Btrfs: remove old tree_root case in btrfs_read_locked_inode()
+
+From: Omar Sandoval <osandov@fb.com>
+
+commit 67ade058ef2c65a3e56878af9c293ec76722a2e5 upstream.
+
+As Jeff explained in c2951f32d36c ("btrfs: remove old tree_root dirent
+processing in btrfs_real_readdir()"), supporting this old format is no
+longer necessary, as the Btrfs magic number has been updated since we
+changed to the current format. There are other places where we still
+handle this old format, but since this is part of a fix that is going to
+stable, I'm only removing this one for now.
+
+Signed-off-by: Omar Sandoval <osandov@fb.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Chris Mason <clm@fb.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/inode.c | 5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -3819,10 +3819,7 @@ cache_acl:
+ break;
+ case S_IFDIR:
+ inode->i_fop = &btrfs_dir_file_operations;
+- if (root == root->fs_info->tree_root)
+- inode->i_op = &btrfs_dir_ro_inode_operations;
+- else
+- inode->i_op = &btrfs_dir_inode_operations;
++ inode->i_op = &btrfs_dir_inode_operations;
+ break;
+ case S_IFLNK:
+ inode->i_op = &btrfs_symlink_inode_operations;
--- /dev/null
+From 950eabbd6ddedc1b08350b9169a6a51b130ebaaf Mon Sep 17 00:00:00 2001
+From: Arnd Bergmann <arnd@arndb.de>
+Date: Fri, 27 Jan 2017 13:32:14 +0100
+Subject: ISDN: eicon: silence misleading array-bounds warning
+
+From: Arnd Bergmann <arnd@arndb.de>
+
+commit 950eabbd6ddedc1b08350b9169a6a51b130ebaaf upstream.
+
+With some gcc versions, we get a warning about the eicon driver,
+and that currently shows up as the only remaining warning in one
+of the build bots:
+
+In file included from ../drivers/isdn/hardware/eicon/message.c:30:0:
+eicon/message.c: In function 'mixer_notify_update':
+eicon/platform.h:333:18: warning: array subscript is above array bounds [-Warray-bounds]
+
+The code is easily changed to open-code the unusual PUT_WORD() line that
+causes this, in order to avoid the warning.
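+
+For illustration, the transformation is the usual open-coding of a 16-bit
+little-endian store as two explicit byte writes; a minimal stand-alone
+sketch (not the driver's actual PUT_WORD() definition):
+
+  static inline void put_word_open_coded(unsigned char *p, unsigned short w)
+  {
+          p[0] = w & 0xff;        /* low byte  */
+          p[1] = w >> 8;          /* high byte */
+  }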
+
+Link: http://arm-soc.lixom.net/buildlogs/stable-rc/v4.4.45/
+Signed-off-by: Arnd Bergmann <arnd@arndb.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/isdn/hardware/eicon/message.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/isdn/hardware/eicon/message.c
++++ b/drivers/isdn/hardware/eicon/message.c
+@@ -11297,7 +11297,8 @@ static void mixer_notify_update(PLCI *pl
+ ((CAPI_MSG *) msg)->header.ncci = 0;
+ ((CAPI_MSG *) msg)->info.facility_req.Selector = SELECTOR_LINE_INTERCONNECT;
+ ((CAPI_MSG *) msg)->info.facility_req.structs[0] = 3;
+- PUT_WORD(&(((CAPI_MSG *) msg)->info.facility_req.structs[1]), LI_REQ_SILENT_UPDATE);
++ ((CAPI_MSG *) msg)->info.facility_req.structs[1] = LI_REQ_SILENT_UPDATE & 0xff;
++ ((CAPI_MSG *) msg)->info.facility_req.structs[2] = LI_REQ_SILENT_UPDATE >> 8;
+ ((CAPI_MSG *) msg)->info.facility_req.structs[3] = 0;
+ w = api_put(notify_plci->appl, (CAPI_MSG *) msg);
+ if (w != _QUEUE_FULL)
--- /dev/null
+From 8310d48b125d19fcd9521d83b8293e63eb1646aa Mon Sep 17 00:00:00 2001
+From: Keno Fischer <keno@juliacomputing.com>
+Date: Tue, 24 Jan 2017 15:17:48 -0800
+Subject: mm/huge_memory.c: respect FOLL_FORCE/FOLL_COW for thp
+
+From: Keno Fischer <keno@juliacomputing.com>
+
+commit 8310d48b125d19fcd9521d83b8293e63eb1646aa upstream.
+
+In commit 19be0eaffa3a ("mm: remove gup_flags FOLL_WRITE games from
+__get_user_pages()"), the mm code was changed from unsetting FOLL_WRITE
+after a COW was resolved to setting the (newly introduced) FOLL_COW
+instead. Simultaneously, the check in gup.c was updated to still allow
+writes with FOLL_FORCE set if FOLL_COW had also been set.
+
+However, a similar check in huge_memory.c was forgotten. As a result,
+remote memory writes to ro regions of memory backed by transparent huge
+pages cause an infinite loop in the kernel (handle_mm_fault sets
+FOLL_COW and returns 0 causing a retry, but follow_trans_huge_pmd bails
+out immediately because `(flags & FOLL_WRITE) && !pmd_write(*pmd)` is
+true).
+
+While in this state the process is still SIGKILLable, but little else
+works (e.g. no ptrace attach, no other signals). This is easily
+reproduced with the following code (assuming thp are set to always):
+
+ #include <assert.h>
+ #include <fcntl.h>
+ #include <stdint.h>
+ #include <stdio.h>
+ #include <string.h>
+ #include <sys/mman.h>
+ #include <sys/stat.h>
+ #include <sys/types.h>
+ #include <sys/wait.h>
+ #include <unistd.h>
+
+ #define TEST_SIZE 5 * 1024 * 1024
+
+ int main(void) {
+ int status;
+ pid_t child;
+ int fd = open("/proc/self/mem", O_RDWR);
+ void *addr = mmap(NULL, TEST_SIZE, PROT_READ,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ assert(addr != MAP_FAILED);
+ pid_t parent_pid = getpid();
+ if ((child = fork()) == 0) {
+ void *addr2 = mmap(NULL, TEST_SIZE, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ assert(addr2 != MAP_FAILED);
+ memset(addr2, 'a', TEST_SIZE);
+ pwrite(fd, addr2, TEST_SIZE, (uintptr_t)addr);
+ return 0;
+ }
+ assert(child == waitpid(child, &status, 0));
+ assert(WIFEXITED(status) && WEXITSTATUS(status) == 0);
+ return 0;
+ }
+
+Fix this by updating follow_trans_huge_pmd in huge_memory.c analogously
+to the update in gup.c in the original commit. The same pattern exists
+in follow_devmap_pmd. However, we should not be able to reach that
+check with FOLL_COW set, so add WARN_ONCE to make sure we notice if we
+ever do.
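+
+For reference, the gup.c counterpart that follow_trans_huge_pmd() is being
+brought in line with (introduced by 19be0eaffa3a; reproduced here from
+memory, so details may differ slightly):
+
+  static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
+  {
+          return pte_write(pte) ||
+                  ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
+  }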
+
+[akpm@linux-foundation.org: coding-style fixes]
+Link: http://lkml.kernel.org/r/20170106015025.GA38411@juliacomputing.com
+Signed-off-by: Keno Fischer <keno@juliacomputing.com>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Greg Thelen <gthelen@google.com>
+Cc: Nicholas Piggin <npiggin@gmail.com>
+Cc: Willy Tarreau <w@1wt.eu>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Kees Cook <keescook@chromium.org>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Hugh Dickins <hughd@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/huge_memory.c | 18 +++++++++++++++++-
+ 1 file changed, 17 insertions(+), 1 deletion(-)
+
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -772,6 +772,12 @@ struct page *follow_devmap_pmd(struct vm
+
+ assert_spin_locked(pmd_lockptr(mm, pmd));
+
++ /*
++ * When we COW a devmap PMD entry, we split it into PTEs, so we should
++ * not be in this function with `flags & FOLL_COW` set.
++ */
++ WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
++
+ if (flags & FOLL_WRITE && !pmd_write(*pmd))
+ return NULL;
+
+@@ -1118,6 +1124,16 @@ out_unlock:
+ return ret;
+ }
+
++/*
++ * FOLL_FORCE can write to even unwritable pmd's, but only
++ * after we've gone through a COW cycle and they are dirty.
++ */
++static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
++{
++ return pmd_write(pmd) ||
++ ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
++}
++
+ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
+ unsigned long addr,
+ pmd_t *pmd,
+@@ -1128,7 +1144,7 @@ struct page *follow_trans_huge_pmd(struc
+
+ assert_spin_locked(pmd_lockptr(mm, pmd));
+
+- if (flags & FOLL_WRITE && !pmd_write(*pmd))
++ if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
+ goto out;
+
+ /* Avoid dumping huge zero page */
--- /dev/null
+From d51e9894d27492783fc6d1b489070b4ba66ce969 Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Tue, 24 Jan 2017 15:18:18 -0800
+Subject: mm/mempolicy.c: do not put mempolicy before using its nodemask
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit d51e9894d27492783fc6d1b489070b4ba66ce969 upstream.
+
+Since commit be97a41b291e ("mm/mempolicy.c: merge alloc_hugepage_vma to
+alloc_pages_vma") alloc_pages_vma() can potentially free a mempolicy by
+mpol_cond_put() before accessing the embedded nodemask by
+__alloc_pages_nodemask(). The commit log says it's so "we can use a
+single exit path within the function" but that's clearly wrong. We can
+still do that when doing mpol_cond_put() after the allocation attempt.
+
+Make sure the mempolicy is not freed prematurely, otherwise
+__alloc_pages_nodemask() can end up using a bogus nodemask, which could
+lead e.g. to premature OOM.
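+
+The nodemask returned by policy_nodemask() can point into the mempolicy
+itself, which is why dropping the reference before the allocation is
+dangerous. Roughly (condensed from memory; details may differ):
+
+  static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
+  {
+          /* further checks elided */
+          if (unlikely(policy->mode == MPOL_BIND))
+                  return &policy->v.nodes;        /* embedded in *policy */
+          return NULL;
+  }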
+
+Fixes: be97a41b291e ("mm/mempolicy.c: merge alloc_hugepage_vma to alloc_pages_vma")
+Link: http://lkml.kernel.org/r/20170118141124.8345-1-vbabka@suse.cz
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Acked-by: David Rientjes <rientjes@google.com>
+Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/mempolicy.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/mempolicy.c
++++ b/mm/mempolicy.c
+@@ -2023,8 +2023,8 @@ retry_cpuset:
+
+ nmask = policy_nodemask(gfp, pol);
+ zl = policy_zonelist(gfp, pol, node);
+- mpol_cond_put(pol);
+ page = __alloc_pages_nodemask(gfp, order, zl, nmask);
++ mpol_cond_put(pol);
+ out:
+ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ goto retry_cpuset;
--- /dev/null
+From ea57485af8f4221312a5a95d63c382b45e7840dc Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Tue, 24 Jan 2017 15:18:32 -0800
+Subject: mm, page_alloc: fix check for NULL preferred_zone
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit ea57485af8f4221312a5a95d63c382b45e7840dc upstream.
+
+Patch series "fix premature OOM regression in 4.7+ due to cpuset races".
+
+This is v2 of my attempt to fix the recent report based on LTP cpuset
+stress test [1]. The intention is to go to stable 4.9 LTSS with this,
+as triggering repeated OOMs is not nice. That's why the patches try not
+to be too intrusive.
+
+Unfortunately, while investigating, I found that modifying the testcase to
+use per-VMA policies instead of per-task policies brings the OOMs back,
+but that seems to be a much older and harder to fix problem. I have
+posted an RFC [2], but I believe that fixing the recent regressions has a
+higher priority.
+
+Longer-term we might try to think how to fix the cpuset mess in a better
+and less error prone way. I was, for example, very surprised to learn
+that cpuset updates change not only task->mems_allowed, but also the
+nodemask of mempolicies. Until now I expected the parameter to
+alloc_pages_nodemask() to be stable. I wonder why we then treat
+cpusets specially in get_page_from_freelist() and distinguish HARDWALL
+etc, when there's unconditional intersection between mempolicy and
+cpuset. I would expect the nodemask adjustment for saving overhead in
+g_p_f(), but that clearly doesn't happen in the current form. So we
+have both crazy complexity and overhead, AFAICS.
+
+[1] https://lkml.kernel.org/r/CAFpQJXUq-JuEP=QPidy4p_=FN0rkH5Z-kfB4qBvsf6jMS87Edg@mail.gmail.com
+[2] https://lkml.kernel.org/r/7c459f26-13a6-a817-e508-b65b903a8378@suse.cz
+
+This patch (of 4):
+
+Since commit c33d6c06f60f ("mm, page_alloc: avoid looking up the first
+zone in a zonelist twice") we have a wrong check for NULL preferred_zone,
+which can theoretically happen due to concurrent cpuset modification. We
+check the zoneref pointer, which is never NULL, when we should check the zone
+pointer. Also document this in the first_zones_zonelist() comment, per Michal
+Hocko.
+
+Fixes: c33d6c06f60f ("mm, page_alloc: avoid looking up the first zone in a zonelist twice")
+Link: http://lkml.kernel.org/r/20170120103843.24587-2-vbabka@suse.cz
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Acked-by: Mel Gorman <mgorman@techsingularity.net>
+Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
+Cc: Ganapatrao Kulkarni <gpkulkarni@gmail.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/mmzone.h | 6 +++++-
+ mm/page_alloc.c | 2 +-
+ 2 files changed, 6 insertions(+), 2 deletions(-)
+
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -972,12 +972,16 @@ static __always_inline struct zoneref *n
+ * @zonelist - The zonelist to search for a suitable zone
+ * @highest_zoneidx - The zone index of the highest zone to return
+ * @nodes - An optional nodemask to filter the zonelist with
+- * @zone - The first suitable zone found is returned via this parameter
++ * @return - Zoneref pointer for the first suitable zone found (see below)
+ *
+ * This function returns the first zone at or below a given zone index that is
+ * within the allowed nodemask. The zoneref returned is a cursor that can be
+ * used to iterate the zonelist with next_zones_zonelist by advancing it by
+ * one before calling.
++ *
++ * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is
++ * never NULL). This may happen either genuinely, or due to concurrent nodemask
++ * update due to cpuset modification.
+ */
+ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
+ enum zone_type highest_zoneidx,
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -3763,7 +3763,7 @@ retry_cpuset:
+ */
+ ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
+ ac.high_zoneidx, ac.nodemask);
+- if (!ac.preferred_zoneref) {
++ if (!ac.preferred_zoneref->zone) {
+ page = NULL;
+ goto no_zone;
+ }
--- /dev/null
+From 16096c25bf0ca5d87e4fa6ec6108ba53feead212 Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Tue, 24 Jan 2017 15:18:35 -0800
+Subject: mm, page_alloc: fix fast-path race with cpuset update or removal
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit 16096c25bf0ca5d87e4fa6ec6108ba53feead212 upstream.
+
+Ganapatrao Kulkarni reported that the LTP test cpuset01 in stress mode
+triggers the OOM killer in a few seconds, despite lots of free memory. The
+test attempts to repeatedly fault in memory in one process in a cpuset,
+while changing allowed nodes of the cpuset between 0 and 1 in another
+process.
+
+One possible cause is that in the fast path we find the preferred
+zoneref according to current mems_allowed, so that it points to the
+middle of the zonelist, skipping e.g. zones of node 1 completely. If
+the mems_allowed is updated to contain only node 1, we never reach it in
+the zonelist, and trigger OOM before checking the cpuset_mems_cookie.
+
+This patch fixes the particular case by redoing the preferred zoneref
+search if we switch back to the original nodemask. The condition is
+also slightly changed so that when the last non-root cpuset is removed,
+we don't miss it.
+
+Note that this is not a full fix, and more patches will follow.
+
+Link: http://lkml.kernel.org/r/20170120103843.24587-3-vbabka@suse.cz
+Fixes: 682a3385e773 ("mm, page_alloc: inline the fast path of the zonelist iterator")
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Reported-by: Ganapatrao Kulkarni <gpkulkarni@gmail.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Acked-by: Mel Gorman <mgorman@techsingularity.net>
+Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page_alloc.c | 10 +++++++++-
+ 1 file changed, 9 insertions(+), 1 deletion(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -3783,9 +3783,17 @@ retry_cpuset:
+ /*
+ * Restore the original nodemask if it was potentially replaced with
+ * &cpuset_current_mems_allowed to optimize the fast-path attempt.
++ * Also recalculate the starting point for the zonelist iterator or
++ * we could end up iterating over non-eligible zones endlessly.
+ */
+- if (cpusets_enabled())
++ if (unlikely(ac.nodemask != nodemask)) {
+ ac.nodemask = nodemask;
++ ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
++ ac.high_zoneidx, ac.nodemask);
++ if (!ac.preferred_zoneref->zone)
++ goto no_zone;
++ }
++
+ page = __alloc_pages_slowpath(alloc_mask, order, &ac);
+
+ no_zone:
--- /dev/null
+From e47483bca2cc59a4593b37a270b16ee42b1d9f08 Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Tue, 24 Jan 2017 15:18:41 -0800
+Subject: mm, page_alloc: fix premature OOM when racing with cpuset mems update
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit e47483bca2cc59a4593b37a270b16ee42b1d9f08 upstream.
+
+Ganapatrao Kulkarni reported that the LTP test cpuset01 in stress mode
+triggers the OOM killer in a few seconds, despite lots of free memory. The
+test attempts to repeatedly fault in memory in one process in a cpuset,
+while changing allowed nodes of the cpuset between 0 and 1 in another
+process.
+
+The problem comes from insufficient protection against cpuset changes,
+which can cause get_page_from_freelist() to consider all zones as
+non-eligible due to nodemask and/or current->mems_allowed. This was
+masked in the past by sufficient retries, but since commit 682a3385e773
+("mm, page_alloc: inline the fast path of the zonelist iterator") we fix
+the preferred_zoneref once, and don't iterate over the whole zonelist in
+further attempts, thus the only eligible zones might be placed in the
+zonelist before our starting point and we always miss them.
+
+A previous patch fixed this problem for current->mems_allowed. However,
+cpuset changes also update the task's mempolicy nodemask. The fix has
+two parts. We have to repeat the preferred_zoneref search when we
+detect a cpuset update by way of the seqcount, and we have to check the
+seqcount before considering OOM.
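+
+The mems_allowed seqcount pattern referred to above, roughly (illustrative
+only, using the existing cpuset helpers):
+
+  cookie = read_mems_allowed_begin();
+  /* ... allocation attempts using the current mems_allowed/mempolicy ... */
+  if (read_mems_allowed_retry(cookie))
+          goto retry_cpuset;      /* nodemask changed underneath us, redo */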
+
+[akpm@linux-foundation.org: fix typo in comment]
+Link: http://lkml.kernel.org/r/20170120103843.24587-5-vbabka@suse.cz
+Fixes: c33d6c06f60f ("mm, page_alloc: avoid looking up the first zone in a zonelist twice")
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Reported-by: Ganapatrao Kulkarni <gpkulkarni@gmail.com>
+Acked-by: Mel Gorman <mgorman@techsingularity.net>
+Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page_alloc.c | 35 ++++++++++++++++++++++++-----------
+ 1 file changed, 24 insertions(+), 11 deletions(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -3534,6 +3534,17 @@ retry_cpuset:
+ no_progress_loops = 0;
+ compact_priority = DEF_COMPACT_PRIORITY;
+ cpuset_mems_cookie = read_mems_allowed_begin();
++ /*
++ * We need to recalculate the starting point for the zonelist iterator
++ * because we might have used different nodemask in the fast path, or
++ * there was a cpuset modification and we are retrying - otherwise we
++ * could end up iterating over non-eligible zones endlessly.
++ */
++ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
++ ac->high_zoneidx, ac->nodemask);
++ if (!ac->preferred_zoneref->zone)
++ goto nopage;
++
+
+ /*
+ * The fast path uses conservative alloc_flags to succeed only until
+@@ -3694,6 +3705,13 @@ retry:
+ &compaction_retries))
+ goto retry;
+
++ /*
++ * It's possible we raced with cpuset update so the OOM would be
++ * premature (see below the nopage: label for full explanation).
++ */
++ if (read_mems_allowed_retry(cpuset_mems_cookie))
++ goto retry_cpuset;
++
+ /* Reclaim has failed us, start killing things */
+ page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
+ if (page)
+@@ -3707,10 +3725,11 @@ retry:
+
+ nopage:
+ /*
+- * When updating a task's mems_allowed, it is possible to race with
+- * parallel threads in such a way that an allocation can fail while
+- * the mask is being updated. If a page allocation is about to fail,
+- * check if the cpuset changed during allocation and if so, retry.
++ * When updating a task's mems_allowed or mempolicy nodemask, it is
++ * possible to race with parallel threads in such a way that our
++ * allocation can fail while the mask is being updated. If we are about
++ * to fail, check if the cpuset changed during allocation and if so,
++ * retry.
+ */
+ if (read_mems_allowed_retry(cpuset_mems_cookie))
+ goto retry_cpuset;
+@@ -3801,15 +3820,9 @@ no_zone:
+ /*
+ * Restore the original nodemask if it was potentially replaced with
+ * &cpuset_current_mems_allowed to optimize the fast-path attempt.
+- * Also recalculate the starting point for the zonelist iterator or
+- * we could end up iterating over non-eligible zones endlessly.
+ */
+- if (unlikely(ac.nodemask != nodemask)) {
++ if (unlikely(ac.nodemask != nodemask))
+ ac.nodemask = nodemask;
+- ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
+- ac.high_zoneidx, ac.nodemask);
+- /* If we have NULL preferred zone, slowpath wll handle that */
+- }
+
+ page = __alloc_pages_slowpath(alloc_mask, order, &ac);
+
--- /dev/null
+From 5ce9bfef1d27944c119a397a9d827bef795487ce Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Tue, 24 Jan 2017 15:18:38 -0800
+Subject: mm, page_alloc: move cpuset seqcount checking to slowpath
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit 5ce9bfef1d27944c119a397a9d827bef795487ce upstream.
+
+This is a preparation for the following patch to make review simpler.
+While the primary motivation is a bug fix, this also simplifies the fast
+path, although the moved code is only enabled when cpusets are in use.
+
+Link: http://lkml.kernel.org/r/20170120103843.24587-4-vbabka@suse.cz
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Acked-by: Mel Gorman <mgorman@techsingularity.net>
+Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
+Cc: Ganapatrao Kulkarni <gpkulkarni@gmail.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page_alloc.c | 47 ++++++++++++++++++++++++++---------------------
+ 1 file changed, 26 insertions(+), 21 deletions(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -3502,12 +3502,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, u
+ struct page *page = NULL;
+ unsigned int alloc_flags;
+ unsigned long did_some_progress;
+- enum compact_priority compact_priority = DEF_COMPACT_PRIORITY;
++ enum compact_priority compact_priority;
+ enum compact_result compact_result;
+- int compaction_retries = 0;
+- int no_progress_loops = 0;
++ int compaction_retries;
++ int no_progress_loops;
+ unsigned long alloc_start = jiffies;
+ unsigned int stall_timeout = 10 * HZ;
++ unsigned int cpuset_mems_cookie;
+
+ /*
+ * In the slowpath, we sanity check order to avoid ever trying to
+@@ -3528,6 +3529,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, u
+ (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
+ gfp_mask &= ~__GFP_ATOMIC;
+
++retry_cpuset:
++ compaction_retries = 0;
++ no_progress_loops = 0;
++ compact_priority = DEF_COMPACT_PRIORITY;
++ cpuset_mems_cookie = read_mems_allowed_begin();
++
+ /*
+ * The fast path uses conservative alloc_flags to succeed only until
+ * kswapd needs to be woken up, and to avoid the cost of setting up
+@@ -3699,6 +3706,15 @@ retry:
+ }
+
+ nopage:
++ /*
++ * When updating a task's mems_allowed, it is possible to race with
++ * parallel threads in such a way that an allocation can fail while
++ * the mask is being updated. If a page allocation is about to fail,
++ * check if the cpuset changed during allocation and if so, retry.
++ */
++ if (read_mems_allowed_retry(cpuset_mems_cookie))
++ goto retry_cpuset;
++
+ warn_alloc(gfp_mask,
+ "page allocation failure: order:%u", order);
+ got_pg:
+@@ -3713,7 +3729,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, u
+ struct zonelist *zonelist, nodemask_t *nodemask)
+ {
+ struct page *page;
+- unsigned int cpuset_mems_cookie;
+ unsigned int alloc_flags = ALLOC_WMARK_LOW;
+ gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
+ struct alloc_context ac = {
+@@ -3750,9 +3765,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, u
+ if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
+ alloc_flags |= ALLOC_CMA;
+
+-retry_cpuset:
+- cpuset_mems_cookie = read_mems_allowed_begin();
+-
+ /* Dirty zone balancing only done in the fast path */
+ ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
+
+@@ -3765,6 +3777,11 @@ retry_cpuset:
+ ac.high_zoneidx, ac.nodemask);
+ if (!ac.preferred_zoneref->zone) {
+ page = NULL;
++ /*
++ * This might be due to race with cpuset_current_mems_allowed
++ * update, so make sure we retry with original nodemask in the
++ * slow path.
++ */
+ goto no_zone;
+ }
+
+@@ -3773,6 +3790,7 @@ retry_cpuset:
+ if (likely(page))
+ goto out;
+
++no_zone:
+ /*
+ * Runtime PM, block IO and its error handling path can deadlock
+ * because I/O on the device might not complete.
+@@ -3790,24 +3808,11 @@ retry_cpuset:
+ ac.nodemask = nodemask;
+ ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
+ ac.high_zoneidx, ac.nodemask);
+- if (!ac.preferred_zoneref->zone)
+- goto no_zone;
++ /* If we have NULL preferred zone, slowpath wll handle that */
+ }
+
+ page = __alloc_pages_slowpath(alloc_mask, order, &ac);
+
+-no_zone:
+- /*
+- * When updating a task's mems_allowed, it is possible to race with
+- * parallel threads in such a way that an allocation can fail while
+- * the mask is being updated. If a page allocation is about to fail,
+- * check if the cpuset changed during allocation and if so, retry.
+- */
+- if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) {
+- alloc_mask = gfp_mask;
+- goto retry_cpuset;
+- }
+-
+ out:
+ if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
+ unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
drm-vc4-fix-a-bounds-check.patch
revert-drm-radeon-always-apply-pci-shutdown-callbacks.patch
drm-atomic-clear-out-fence-when-duplicating-state.patch
+mm-huge_memory.c-respect-foll_force-foll_cow-for-thp.patch
+mm-mempolicy.c-do-not-put-mempolicy-before-using-its-nodemask.patch
+mm-page_alloc-fix-check-for-null-preferred_zone.patch
+mm-page_alloc-fix-fast-path-race-with-cpuset-update-or-removal.patch
+mm-page_alloc-move-cpuset-seqcount-checking-to-slowpath.patch
+mm-page_alloc-fix-premature-oom-when-racing-with-cpuset-mems-update.patch
+vring-force-use-of-dma-api-for-arm-based-systems-with-legacy-devices.patch
+userns-make-ucounts-lock-irq-safe.patch
+sysctl-fix-proc_doulongvec_ms_jiffies_minmax.patch
+xfs-prevent-quotacheck-from-overloading-inode-lru.patch
+isdn-eicon-silence-misleading-array-bounds-warning.patch
+btrfs-remove-old-tree_root-case-in-btrfs_read_locked_inode.patch
+btrfs-disable-xattr-operations-on-subvolume-directories.patch
+btrfs-remove-get-set-_acl-from-btrfs_dir_ro_inode_operations.patch
--- /dev/null
+From ff9f8a7cf935468a94d9927c68b00daae701667e Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 25 Jan 2017 18:20:55 -0800
+Subject: sysctl: fix proc_doulongvec_ms_jiffies_minmax()
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit ff9f8a7cf935468a94d9927c68b00daae701667e upstream.
+
+We perform the conversion between kernel jiffies and ms only when
+exporting a kernel value to user space.
+
+We need to do the opposite operation when a value is written by the user.
+
+This only matters when HZ != 1000.
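+
+A rough worked example (assuming HZ == 250, i.e. 4 ms per jiffy, and the
+convmul == HZ, convdiv == 1000 arguments used for the _ms_jiffies variants):
+userspace writes 1000 (milliseconds); without the conversion the raw value
+is stored, so the kernel later interprets it as 1000 jiffies == 4000 ms.
+With the fix:
+
+  val = convmul * val / convdiv;  /* 250 * 1000 / 1000 = 250 jiffies == 1000 ms */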
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sysctl.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/kernel/sysctl.c
++++ b/kernel/sysctl.c
+@@ -2487,6 +2487,7 @@ static int __do_proc_doulongvec_minmax(v
+ break;
+ if (neg)
+ continue;
++ val = convmul * val / convdiv;
+ if ((min && val < *min) || (max && val > *max))
+ continue;
+ *i = val;
--- /dev/null
+From 880a38547ff08715ce4f1daf9a4bb30c87676e68 Mon Sep 17 00:00:00 2001
+From: Nikolay Borisov <n.borisov.lkml@gmail.com>
+Date: Fri, 20 Jan 2017 15:21:35 +0200
+Subject: userns: Make ucounts lock irq-safe
+
+From: Nikolay Borisov <n.borisov.lkml@gmail.com>
+
+commit 880a38547ff08715ce4f1daf9a4bb30c87676e68 upstream.
+
+The ucounts_lock is being used to protect various ucounts lifecycle
+management functionalities. However, those services can also be invoked
+when a pidns is being freed in an RCU callback (e.g. softirq context).
+This can lead to deadlocks. There were already efforts trying to
+prevent similar deadlocks in add7c65ca426 ("pid: fix lockdep deadlock
+warning due to ucount_lock"), however they just moved the context
+from hardirq to softrq. Fix this issue once and for all by explictly
+making the lock disable irqs altogether.
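+
+For illustration, the two variants the fix uses (not the literal patch
+hunks): spin_lock_irq() where the caller is known to be in process context
+with interrupts enabled, and spin_lock_irqsave() where the interrupt state
+of the caller is not known, e.g. the path reached from the RCU callback
+that frees a pidns:
+
+  spin_lock_irq(&ucounts_lock);                   /* get_ucounts() */
+  /* ... */
+  spin_unlock_irq(&ucounts_lock);
+
+  unsigned long flags;
+  spin_lock_irqsave(&ucounts_lock, flags);        /* put_ucounts() */
+  /* ... */
+  spin_unlock_irqrestore(&ucounts_lock, flags);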
+
+Dmitry Vyukov <dvyukov@google.com> reported:
+
+> I've got the following deadlock report while running syzkaller fuzzer
+> on eec0d3d065bfcdf9cd5f56dd2a36b94d12d32297 of linux-next (on odroid
+> device if it matters):
+>
+> =================================
+> [ INFO: inconsistent lock state ]
+> 4.10.0-rc3-next-20170112-xc2-dirty #6 Not tainted
+> ---------------------------------
+> inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
+> swapper/2/0 [HC0[0]:SC1[1]:HE1:SE0] takes:
+> (ucounts_lock){+.?...}, at: [< inline >] spin_lock
+> ./include/linux/spinlock.h:302
+> (ucounts_lock){+.?...}, at: [<ffff2000081678c8>]
+> put_ucounts+0x60/0x138 kernel/ucount.c:162
+> {SOFTIRQ-ON-W} state was registered at:
+> [<ffff2000081c82d8>] mark_lock+0x220/0xb60 kernel/locking/lockdep.c:3054
+> [< inline >] mark_irqflags kernel/locking/lockdep.c:2941
+> [<ffff2000081c97a8>] __lock_acquire+0x388/0x3260 kernel/locking/lockdep.c:3295
+> [<ffff2000081cce24>] lock_acquire+0xa4/0x138 kernel/locking/lockdep.c:3753
+> [< inline >] __raw_spin_lock ./include/linux/spinlock_api_smp.h:144
+> [<ffff200009798128>] _raw_spin_lock+0x90/0xd0 kernel/locking/spinlock.c:151
+> [< inline >] spin_lock ./include/linux/spinlock.h:302
+> [< inline >] get_ucounts kernel/ucount.c:131
+> [<ffff200008167c28>] inc_ucount+0x80/0x6c8 kernel/ucount.c:189
+> [< inline >] inc_mnt_namespaces fs/namespace.c:2818
+> [<ffff200008481850>] alloc_mnt_ns+0x78/0x3a8 fs/namespace.c:2849
+> [<ffff200008487298>] create_mnt_ns+0x28/0x200 fs/namespace.c:2959
+> [< inline >] init_mount_tree fs/namespace.c:3199
+> [<ffff200009bd6674>] mnt_init+0x258/0x384 fs/namespace.c:3251
+> [<ffff200009bd60bc>] vfs_caches_init+0x6c/0x80 fs/dcache.c:3626
+> [<ffff200009bb1114>] start_kernel+0x414/0x460 init/main.c:648
+> [<ffff200009bb01e8>] __primary_switched+0x6c/0x70 arch/arm64/kernel/head.S:456
+> irq event stamp: 2316924
+> hardirqs last enabled at (2316924): [< inline >] rcu_do_batch
+> kernel/rcu/tree.c:2911
+> hardirqs last enabled at (2316924): [< inline >]
+> invoke_rcu_callbacks kernel/rcu/tree.c:3182
+> hardirqs last enabled at (2316924): [< inline >]
+> __rcu_process_callbacks kernel/rcu/tree.c:3149
+> hardirqs last enabled at (2316924): [<ffff200008210414>]
+> rcu_process_callbacks+0x7a4/0xc28 kernel/rcu/tree.c:3166
+> hardirqs last disabled at (2316923): [< inline >] rcu_do_batch
+> kernel/rcu/tree.c:2900
+> hardirqs last disabled at (2316923): [< inline >]
+> invoke_rcu_callbacks kernel/rcu/tree.c:3182
+> hardirqs last disabled at (2316923): [< inline >]
+> __rcu_process_callbacks kernel/rcu/tree.c:3149
+> hardirqs last disabled at (2316923): [<ffff20000820fe80>]
+> rcu_process_callbacks+0x210/0xc28 kernel/rcu/tree.c:3166
+> softirqs last enabled at (2316912): [<ffff20000811b4c4>]
+> _local_bh_enable+0x4c/0x80 kernel/softirq.c:155
+> softirqs last disabled at (2316913): [< inline >]
+> do_softirq_own_stack ./include/linux/interrupt.h:488
+> softirqs last disabled at (2316913): [< inline >]
+> invoke_softirq kernel/softirq.c:371
+> softirqs last disabled at (2316913): [<ffff20000811c994>]
+> irq_exit+0x264/0x308 kernel/softirq.c:405
+>
+> other info that might help us debug this:
+> Possible unsafe locking scenario:
+>
+> CPU0
+> ----
+> lock(ucounts_lock);
+> <Interrupt>
+> lock(ucounts_lock);
+>
+> *** DEADLOCK ***
+>
+> 1 lock held by swapper/2/0:
+> #0: (rcu_callback){......}, at: [< inline >] __rcu_reclaim
+> kernel/rcu/rcu.h:108
+> #0: (rcu_callback){......}, at: [< inline >] rcu_do_batch
+> kernel/rcu/tree.c:2919
+> #0: (rcu_callback){......}, at: [< inline >]
+> invoke_rcu_callbacks kernel/rcu/tree.c:3182
+> #0: (rcu_callback){......}, at: [< inline >]
+> __rcu_process_callbacks kernel/rcu/tree.c:3149
+> #0: (rcu_callback){......}, at: [<ffff200008210390>]
+> rcu_process_callbacks+0x720/0xc28 kernel/rcu/tree.c:3166
+>
+> stack backtrace:
+> CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.10.0-rc3-next-20170112-xc2-dirty #6
+> Hardware name: Hardkernel ODROID-C2 (DT)
+> Call trace:
+> [<ffff20000808fa60>] dump_backtrace+0x0/0x440 arch/arm64/kernel/traps.c:500
+> [<ffff20000808fec0>] show_stack+0x20/0x30 arch/arm64/kernel/traps.c:225
+> [<ffff2000088a99e0>] dump_stack+0x110/0x168
+> [<ffff2000082fa2b4>] print_usage_bug.part.27+0x49c/0x4bc
+> kernel/locking/lockdep.c:2387
+> [< inline >] print_usage_bug kernel/locking/lockdep.c:2357
+> [< inline >] valid_state kernel/locking/lockdep.c:2400
+> [< inline >] mark_lock_irq kernel/locking/lockdep.c:2617
+> [<ffff2000081c89ec>] mark_lock+0x934/0xb60 kernel/locking/lockdep.c:3065
+> [< inline >] mark_irqflags kernel/locking/lockdep.c:2923
+> [<ffff2000081c9a60>] __lock_acquire+0x640/0x3260 kernel/locking/lockdep.c:3295
+> [<ffff2000081cce24>] lock_acquire+0xa4/0x138 kernel/locking/lockdep.c:3753
+> [< inline >] __raw_spin_lock ./include/linux/spinlock_api_smp.h:144
+> [<ffff200009798128>] _raw_spin_lock+0x90/0xd0 kernel/locking/spinlock.c:151
+> [< inline >] spin_lock ./include/linux/spinlock.h:302
+> [<ffff2000081678c8>] put_ucounts+0x60/0x138 kernel/ucount.c:162
+> [<ffff200008168364>] dec_ucount+0xf4/0x158 kernel/ucount.c:214
+> [< inline >] dec_pid_namespaces kernel/pid_namespace.c:89
+> [<ffff200008293dc8>] delayed_free_pidns+0x40/0xe0 kernel/pid_namespace.c:156
+> [< inline >] __rcu_reclaim kernel/rcu/rcu.h:118
+> [< inline >] rcu_do_batch kernel/rcu/tree.c:2919
+> [< inline >] invoke_rcu_callbacks kernel/rcu/tree.c:3182
+> [< inline >] __rcu_process_callbacks kernel/rcu/tree.c:3149
+> [<ffff2000082103d8>] rcu_process_callbacks+0x768/0xc28 kernel/rcu/tree.c:3166
+> [<ffff2000080821dc>] __do_softirq+0x324/0x6e0 kernel/softirq.c:284
+> [< inline >] do_softirq_own_stack ./include/linux/interrupt.h:488
+> [< inline >] invoke_softirq kernel/softirq.c:371
+> [<ffff20000811c994>] irq_exit+0x264/0x308 kernel/softirq.c:405
+> [<ffff2000081ecc28>] __handle_domain_irq+0xc0/0x150 kernel/irq/irqdesc.c:636
+> [<ffff200008081c80>] gic_handle_irq+0x68/0xd8
+> Exception stack(0xffff8000648e7dd0 to 0xffff8000648e7f00)
+> 7dc0: ffff8000648d4b3c 0000000000000007
+> 7de0: 0000000000000000 1ffff0000c91a967 1ffff0000c91a967 1ffff0000c91a967
+> 7e00: ffff20000a4b6b68 0000000000000001 0000000000000007 0000000000000001
+> 7e20: 1fffe4000149ae90 ffff200009d35000 0000000000000000 0000000000000002
+> 7e40: 0000000000000000 0000000000000000 0000000002624a1a 0000000000000000
+> 7e60: 0000000000000000 ffff200009cbcd88 000060006d2ed000 0000000000000140
+> 7e80: ffff200009cff000 ffff200009cb6000 ffff200009cc2020 ffff200009d2159d
+> 7ea0: 0000000000000000 ffff8000648d4380 0000000000000000 ffff8000648e7f00
+> 7ec0: ffff20000820a478 ffff8000648e7f00 ffff20000820a47c 0000000010000145
+> 7ee0: 0000000000000140 dfff200000000000 ffffffffffffffff ffff20000820a478
+> [<ffff2000080837f8>] el1_irq+0xb8/0x130 arch/arm64/kernel/entry.S:486
+> [< inline >] arch_local_irq_restore
+> ./arch/arm64/include/asm/irqflags.h:81
+> [<ffff20000820a47c>] rcu_idle_exit+0x64/0xa8 kernel/rcu/tree.c:1030
+> [< inline >] cpuidle_idle_call kernel/sched/idle.c:200
+> [<ffff2000081bcbfc>] do_idle+0x1dc/0x2d0 kernel/sched/idle.c:243
+> [<ffff2000081bd1cc>] cpu_startup_entry+0x24/0x28 kernel/sched/idle.c:345
+> [<ffff200008099f8c>] secondary_start_kernel+0x2cc/0x358
+> arch/arm64/kernel/smp.c:276
+> [<000000000279f1a4>] 0x279f1a4
+
+Reported-by: Dmitry Vyukov <dvyukov@google.com>
+Tested-by: Dmitry Vyukov <dvyukov@google.com>
+Fixes: add7c65ca426 ("pid: fix lockdep deadlock warning due to ucount_lock")
+Fixes: f333c700c610 ("pidns: Add a limit on the number of pid namespaces")
+Link: https://www.spinics.net/lists/kernel/msg2426637.html
+Signed-off-by: Nikolay Borisov <n.borisov.lkml@gmail.com>
+Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/ucount.c | 14 ++++++++------
+ 1 file changed, 8 insertions(+), 6 deletions(-)
+
+--- a/kernel/ucount.c
++++ b/kernel/ucount.c
+@@ -128,10 +128,10 @@ static struct ucounts *get_ucounts(struc
+ struct hlist_head *hashent = ucounts_hashentry(ns, uid);
+ struct ucounts *ucounts, *new;
+
+- spin_lock(&ucounts_lock);
++ spin_lock_irq(&ucounts_lock);
+ ucounts = find_ucounts(ns, uid, hashent);
+ if (!ucounts) {
+- spin_unlock(&ucounts_lock);
++ spin_unlock_irq(&ucounts_lock);
+
+ new = kzalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+@@ -141,7 +141,7 @@ static struct ucounts *get_ucounts(struc
+ new->uid = uid;
+ atomic_set(&new->count, 0);
+
+- spin_lock(&ucounts_lock);
++ spin_lock_irq(&ucounts_lock);
+ ucounts = find_ucounts(ns, uid, hashent);
+ if (ucounts) {
+ kfree(new);
+@@ -152,16 +152,18 @@ static struct ucounts *get_ucounts(struc
+ }
+ if (!atomic_add_unless(&ucounts->count, 1, INT_MAX))
+ ucounts = NULL;
+- spin_unlock(&ucounts_lock);
++ spin_unlock_irq(&ucounts_lock);
+ return ucounts;
+ }
+
+ static void put_ucounts(struct ucounts *ucounts)
+ {
++ unsigned long flags;
++
+ if (atomic_dec_and_test(&ucounts->count)) {
+- spin_lock(&ucounts_lock);
++ spin_lock_irqsave(&ucounts_lock, flags);
+ hlist_del_init(&ucounts->node);
+- spin_unlock(&ucounts_lock);
++ spin_unlock_irqrestore(&ucounts_lock, flags);
+
+ kfree(ucounts);
+ }
--- /dev/null
+From c7070619f3408d9a0dffbed9149e6f00479cf43b Mon Sep 17 00:00:00 2001
+From: Will Deacon <will.deacon@arm.com>
+Date: Fri, 20 Jan 2017 10:33:32 +0000
+Subject: vring: Force use of DMA API for ARM-based systems with legacy devices
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit c7070619f3408d9a0dffbed9149e6f00479cf43b upstream.
+
+Booting Linux on an ARM fastmodel containing an SMMU emulation results
+in an unexpected I/O page fault from the legacy virtio-blk PCI device:
+
+[ 1.211721] arm-smmu-v3 2b400000.smmu: event 0x10 received:
+[ 1.211800] arm-smmu-v3 2b400000.smmu: 0x00000000fffff010
+[ 1.211880] arm-smmu-v3 2b400000.smmu: 0x0000020800000000
+[ 1.211959] arm-smmu-v3 2b400000.smmu: 0x00000008fa081002
+[ 1.212075] arm-smmu-v3 2b400000.smmu: 0x0000000000000000
+[ 1.212155] arm-smmu-v3 2b400000.smmu: event 0x10 received:
+[ 1.212234] arm-smmu-v3 2b400000.smmu: 0x00000000fffff010
+[ 1.212314] arm-smmu-v3 2b400000.smmu: 0x0000020800000000
+[ 1.212394] arm-smmu-v3 2b400000.smmu: 0x00000008fa081000
+[ 1.212471] arm-smmu-v3 2b400000.smmu: 0x0000000000000000
+
+<system hangs failing to read partition table>
+
+This is because the legacy virtio-blk device is behind an SMMU, so we
+have consequently swizzled its DMA ops and configured the SMMU to
+translate accesses. This then requires the vring code to use the DMA API
+to establish translations, otherwise all transactions will result in
+fatal faults and termination.
+
+Given that ARM-based systems only see an SMMU if one is really present
+(the topology is all described by firmware tables such as device-tree or
+IORT), we can safely use the DMA API for all legacy virtio devices.
+Modern devices can advertise the presence of an IOMMU using the
+VIRTIO_F_IOMMU_PLATFORM feature flag.
+
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Michael S. Tsirkin <mst@redhat.com>
+Fixes: 876945dbf649 ("arm64: Hook up IOMMU dma_ops")
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
+Acked-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/virtio/virtio_ring.c | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+--- a/drivers/virtio/virtio_ring.c
++++ b/drivers/virtio/virtio_ring.c
+@@ -159,6 +159,13 @@ static bool vring_use_dma_api(struct vir
+ if (xen_domain())
+ return true;
+
++ /*
++ * On ARM-based machines, the DMA ops will do the right thing,
++ * so always use them with legacy devices.
++ */
++ if (IS_ENABLED(CONFIG_ARM) || IS_ENABLED(CONFIG_ARM64))
++ return !virtio_has_feature(vdev, VIRTIO_F_VERSION_1);
++
+ return false;
+ }
+
--- /dev/null
+From e0d76fa4475ef2cf4b52d18588b8ce95153d021b Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Thu, 26 Jan 2017 13:18:09 -0800
+Subject: xfs: prevent quotacheck from overloading inode lru
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit e0d76fa4475ef2cf4b52d18588b8ce95153d021b upstream.
+
+Quotacheck runs at mount time in situations where quota accounting must
+be recalculated. In doing so, it uses bulkstat to visit every inode in
+the filesystem. Historically, every inode processed during quotacheck
+was released and immediately tagged for reclaim because quotacheck runs
+before the superblock is marked active by the VFS. In other words,
+the final iput() led to an immediate ->destroy_inode() call, which
+allowed the XFS background reclaim worker to start reclaiming inodes.
+
+Commit 17c12bcd3 ("xfs: when replaying bmap operations, don't let
+unlinked inodes get reaped") marks the XFS superblock active sooner as
+part of the mount process to support caching inodes processed during log
+recovery. This occurs before quotacheck and thus means all inodes
+processed by quotacheck are inserted to the LRU on release. The
+s_umount lock is held until the mount has completed and thus prevents
+the shrinkers from operating on the sb. This means that quotacheck can
+excessively populate the inode LRU and lead to OOM conditions on systems
+without sufficient RAM.
+
+Update the quotacheck bulkstat handler to set XFS_IGET_DONTCACHE on
+inodes processed by quotacheck. This causes ->drop_inode() to return 1
+and in turn causes iput_final() to evict the inode. This preserves the
+original quotacheck behavior and prevents it from overloading the LRU
+and running out of memory.
+
+Reported-by: Martin Svec <martin.svec@zoner.cz>
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Eric Sandeen <sandeen@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_qm.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_qm.c
++++ b/fs/xfs/xfs_qm.c
+@@ -1177,7 +1177,8 @@ xfs_qm_dqusage_adjust(
+ * the case in all other instances. It's OK that we do this because
+ * quotacheck is done only at mount time.
+ */
+- error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip);
++ error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, XFS_ILOCK_EXCL,
++ &ip);
+ if (error) {
+ *res = BULKSTAT_RV_NOTHING;
+ return error;