4.9-stable patches
author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Mon, 30 Jan 2017 09:07:39 +0000 (10:07 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Mon, 30 Jan 2017 09:07:39 +0000 (10:07 +0100)
added patches:
btrfs-disable-xattr-operations-on-subvolume-directories.patch
btrfs-remove-get-set-_acl-from-btrfs_dir_ro_inode_operations.patch
btrfs-remove-old-tree_root-case-in-btrfs_read_locked_inode.patch
isdn-eicon-silence-misleading-array-bounds-warning.patch
mm-huge_memory.c-respect-foll_force-foll_cow-for-thp.patch
mm-mempolicy.c-do-not-put-mempolicy-before-using-its-nodemask.patch
mm-page_alloc-fix-check-for-null-preferred_zone.patch
mm-page_alloc-fix-fast-path-race-with-cpuset-update-or-removal.patch
mm-page_alloc-fix-premature-oom-when-racing-with-cpuset-mems-update.patch
mm-page_alloc-move-cpuset-seqcount-checking-to-slowpath.patch
sysctl-fix-proc_doulongvec_ms_jiffies_minmax.patch
userns-make-ucounts-lock-irq-safe.patch
vring-force-use-of-dma-api-for-arm-based-systems-with-legacy-devices.patch
xfs-prevent-quotacheck-from-overloading-inode-lru.patch

15 files changed:
queue-4.9/btrfs-disable-xattr-operations-on-subvolume-directories.patch [new file with mode: 0644]
queue-4.9/btrfs-remove-get-set-_acl-from-btrfs_dir_ro_inode_operations.patch [new file with mode: 0644]
queue-4.9/btrfs-remove-old-tree_root-case-in-btrfs_read_locked_inode.patch [new file with mode: 0644]
queue-4.9/isdn-eicon-silence-misleading-array-bounds-warning.patch [new file with mode: 0644]
queue-4.9/mm-huge_memory.c-respect-foll_force-foll_cow-for-thp.patch [new file with mode: 0644]
queue-4.9/mm-mempolicy.c-do-not-put-mempolicy-before-using-its-nodemask.patch [new file with mode: 0644]
queue-4.9/mm-page_alloc-fix-check-for-null-preferred_zone.patch [new file with mode: 0644]
queue-4.9/mm-page_alloc-fix-fast-path-race-with-cpuset-update-or-removal.patch [new file with mode: 0644]
queue-4.9/mm-page_alloc-fix-premature-oom-when-racing-with-cpuset-mems-update.patch [new file with mode: 0644]
queue-4.9/mm-page_alloc-move-cpuset-seqcount-checking-to-slowpath.patch [new file with mode: 0644]
queue-4.9/series
queue-4.9/sysctl-fix-proc_doulongvec_ms_jiffies_minmax.patch [new file with mode: 0644]
queue-4.9/userns-make-ucounts-lock-irq-safe.patch [new file with mode: 0644]
queue-4.9/vring-force-use-of-dma-api-for-arm-based-systems-with-legacy-devices.patch [new file with mode: 0644]
queue-4.9/xfs-prevent-quotacheck-from-overloading-inode-lru.patch [new file with mode: 0644]

diff --git a/queue-4.9/btrfs-disable-xattr-operations-on-subvolume-directories.patch b/queue-4.9/btrfs-disable-xattr-operations-on-subvolume-directories.patch
new file mode 100644 (file)
index 0000000..e8fd589
--- /dev/null
@@ -0,0 +1,42 @@
+From 1fdf41941b8010691679638f8d0c8d08cfee7726 Mon Sep 17 00:00:00 2001
+From: Omar Sandoval <osandov@fb.com>
+Date: Wed, 25 Jan 2017 17:06:39 -0800
+Subject: Btrfs: disable xattr operations on subvolume directories
+
+From: Omar Sandoval <osandov@fb.com>
+
+commit 1fdf41941b8010691679638f8d0c8d08cfee7726 upstream.
+
+When you snapshot a subvolume containing a subvolume, you get a
+placeholder directory where the subvolume would be. These directory
+inodes have ->i_ops set to btrfs_dir_ro_inode_operations. Previously,
+these i_ops didn't include the xattr operation callbacks. The conversion
+to xattr_handlers missed this case, leading to bogus attempts to set
+xattrs on these inodes. This manifested itself as failures when running
+delayed inodes.
+
+To fix this, clear IOP_XATTR in ->i_opflags on these inodes.
+
+Fixes: 6c6ef9f26e59 ("xattr: Stop calling {get,set,remove}xattr inode operations")
+Cc: Andreas Gruenbacher <agruenba@redhat.com>
+Reported-by: Chris Murphy <lists@colorremedies.com>
+Tested-by: Chris Murphy <lists@colorremedies.com>
+Signed-off-by: Omar Sandoval <osandov@fb.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Chris Mason <clm@fb.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/inode.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -5679,6 +5679,7 @@ static struct inode *new_simple_dir(stru
+       inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
+       inode->i_op = &btrfs_dir_ro_inode_operations;
++      inode->i_opflags &= ~IOP_XATTR;
+       inode->i_fop = &simple_dir_operations;
+       inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
+       inode->i_mtime = current_time(inode);
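
The user-visible effect of clearing IOP_XATTR is that xattr system calls on
these placeholder directories now fail cleanly (with EOPNOTSUPP) instead of
reaching btrfs code that cannot handle them. A minimal userspace check — a
sketch only; the mount point and snapshot layout below are hypothetical:

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/xattr.h>

    int main(void) {
      /* Hypothetical layout: /mnt/snap is a snapshot of a subvolume that
       * itself contained a subvolume at "inner"; after snapshotting,
       * "inner" is one of the placeholder directories this patch covers. */
      if (setxattr("/mnt/snap/inner", "user.test", "x", 1, 0) < 0)
        printf("setxattr: %s\n", strerror(errno)); /* expect EOPNOTSUPP */
      return 0;
    }
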
diff --git a/queue-4.9/btrfs-remove-get-set-_acl-from-btrfs_dir_ro_inode_operations.patch b/queue-4.9/btrfs-remove-get-set-_acl-from-btrfs_dir_ro_inode_operations.patch
new file mode 100644 (file)
index 0000000..a5ee3be
--- /dev/null
@@ -0,0 +1,31 @@
+From 57b59ed2e5b91e958843609c7884794e29e6c4cb Mon Sep 17 00:00:00 2001
+From: Omar Sandoval <osandov@fb.com>
+Date: Wed, 25 Jan 2017 17:06:40 -0800
+Subject: Btrfs: remove ->{get, set}_acl() from btrfs_dir_ro_inode_operations
+
+From: Omar Sandoval <osandov@fb.com>
+
+commit 57b59ed2e5b91e958843609c7884794e29e6c4cb upstream.
+
+Subvolume directory inodes can't have ACLs.
+
+Signed-off-by: Omar Sandoval <osandov@fb.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Chris Mason <clm@fb.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/inode.c |    2 --
+ 1 file changed, 2 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -10585,8 +10585,6 @@ static const struct inode_operations btr
+ static const struct inode_operations btrfs_dir_ro_inode_operations = {
+       .lookup         = btrfs_lookup,
+       .permission     = btrfs_permission,
+-      .get_acl        = btrfs_get_acl,
+-      .set_acl        = btrfs_set_acl,
+       .update_time    = btrfs_update_time,
+ };
diff --git a/queue-4.9/btrfs-remove-old-tree_root-case-in-btrfs_read_locked_inode.patch b/queue-4.9/btrfs-remove-old-tree_root-case-in-btrfs_read_locked_inode.patch
new file mode 100644 (file)
index 0000000..5ec23fa
--- /dev/null
@@ -0,0 +1,39 @@
+From 67ade058ef2c65a3e56878af9c293ec76722a2e5 Mon Sep 17 00:00:00 2001
+From: Omar Sandoval <osandov@fb.com>
+Date: Wed, 25 Jan 2017 17:06:38 -0800
+Subject: Btrfs: remove old tree_root case in btrfs_read_locked_inode()
+
+From: Omar Sandoval <osandov@fb.com>
+
+commit 67ade058ef2c65a3e56878af9c293ec76722a2e5 upstream.
+
+As Jeff explained in c2951f32d36c ("btrfs: remove old tree_root dirent
+processing in btrfs_real_readdir()"), supporting this old format is no
+longer necessary because the Btrfs magic number has been updated since we
+changed to the current format. There are other places where we still
+handle this old format, but since this is part of a fix that is going to
+stable, I'm only removing this one for now.
+
+Signed-off-by: Omar Sandoval <osandov@fb.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Chris Mason <clm@fb.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/inode.c |    5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -3819,10 +3819,7 @@ cache_acl:
+               break;
+       case S_IFDIR:
+               inode->i_fop = &btrfs_dir_file_operations;
+-              if (root == root->fs_info->tree_root)
+-                      inode->i_op = &btrfs_dir_ro_inode_operations;
+-              else
+-                      inode->i_op = &btrfs_dir_inode_operations;
++              inode->i_op = &btrfs_dir_inode_operations;
+               break;
+       case S_IFLNK:
+               inode->i_op = &btrfs_symlink_inode_operations;
diff --git a/queue-4.9/isdn-eicon-silence-misleading-array-bounds-warning.patch b/queue-4.9/isdn-eicon-silence-misleading-array-bounds-warning.patch
new file mode 100644 (file)
index 0000000..972e5d0
--- /dev/null
@@ -0,0 +1,41 @@
+From 950eabbd6ddedc1b08350b9169a6a51b130ebaaf Mon Sep 17 00:00:00 2001
+From: Arnd Bergmann <arnd@arndb.de>
+Date: Fri, 27 Jan 2017 13:32:14 +0100
+Subject: ISDN: eicon: silence misleading array-bounds warning
+
+From: Arnd Bergmann <arnd@arndb.de>
+
+commit 950eabbd6ddedc1b08350b9169a6a51b130ebaaf upstream.
+
+With some gcc versions, we get a warning about the eicon driver,
+and that currently shows up as the only remaining warning in one
+of the build bots:
+
+In file included from ../drivers/isdn/hardware/eicon/message.c:30:0:
+eicon/message.c: In function 'mixer_notify_update':
+eicon/platform.h:333:18: warning: array subscript is above array bounds [-Warray-bounds]
+
+The code is easily changed to open-code the unusual PUT_WORD() line
+that causes it, avoiding the warning.
+
+Link: http://arm-soc.lixom.net/buildlogs/stable-rc/v4.4.45/
+Signed-off-by: Arnd Bergmann <arnd@arndb.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/isdn/hardware/eicon/message.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/isdn/hardware/eicon/message.c
++++ b/drivers/isdn/hardware/eicon/message.c
+@@ -11297,7 +11297,8 @@ static void mixer_notify_update(PLCI *pl
+                               ((CAPI_MSG *) msg)->header.ncci = 0;
+                               ((CAPI_MSG *) msg)->info.facility_req.Selector = SELECTOR_LINE_INTERCONNECT;
+                               ((CAPI_MSG *) msg)->info.facility_req.structs[0] = 3;
+-                              PUT_WORD(&(((CAPI_MSG *) msg)->info.facility_req.structs[1]), LI_REQ_SILENT_UPDATE);
++                              ((CAPI_MSG *) msg)->info.facility_req.structs[1] = LI_REQ_SILENT_UPDATE & 0xff;
++                              ((CAPI_MSG *) msg)->info.facility_req.structs[2] = LI_REQ_SILENT_UPDATE >> 8;
+                               ((CAPI_MSG *) msg)->info.facility_req.structs[3] = 0;
+                               w = api_put(notify_plci->appl, (CAPI_MSG *) msg);
+                               if (w != _QUEUE_FULL)
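
The two open-coded stores are byte-for-byte equivalent to the little-endian
PUT_WORD() store they replace; the only change is that the compiler no longer
sees the construct it misdiagnosed as out of bounds. A standalone sketch of
the equivalence (put_word() below is a stand-in for the driver macro, which
is assumed to store the word least-significant byte first):

    #include <assert.h>
    #include <stdint.h>

    /* Stand-in for the driver's PUT_WORD(), assumed LSB-first. */
    static void put_word(uint8_t *p, uint16_t w) {
      p[0] = w & 0xff;
      p[1] = w >> 8;
    }

    int main(void) {
      uint8_t a[4] = {0}, b[4] = {0};
      uint16_t v = 0x0203; /* stand-in value for LI_REQ_SILENT_UPDATE */

      put_word(&a[1], v);  /* the form that tripped -Warray-bounds */
      b[1] = v & 0xff;     /* the open-coded replacement */
      b[2] = v >> 8;

      assert(a[1] == b[1] && a[2] == b[2]); /* identical bytes */
      return 0;
    }
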
diff --git a/queue-4.9/mm-huge_memory.c-respect-foll_force-foll_cow-for-thp.patch b/queue-4.9/mm-huge_memory.c-respect-foll_force-foll_cow-for-thp.patch
new file mode 100644 (file)
index 0000000..cd55109
--- /dev/null
@@ -0,0 +1,127 @@
+From 8310d48b125d19fcd9521d83b8293e63eb1646aa Mon Sep 17 00:00:00 2001
+From: Keno Fischer <keno@juliacomputing.com>
+Date: Tue, 24 Jan 2017 15:17:48 -0800
+Subject: mm/huge_memory.c: respect FOLL_FORCE/FOLL_COW for thp
+
+From: Keno Fischer <keno@juliacomputing.com>
+
+commit 8310d48b125d19fcd9521d83b8293e63eb1646aa upstream.
+
+In commit 19be0eaffa3a ("mm: remove gup_flags FOLL_WRITE games from
+__get_user_pages()"), the mm code was changed from unsetting FOLL_WRITE
+after a COW was resolved to setting the (newly introduced) FOLL_COW
+instead.  Simultaneously, the check in gup.c was updated to still allow
+writes with FOLL_FORCE set if FOLL_COW had also been set.
+
+However, a similar check in huge_memory.c was forgotten.  As a result,
+remote memory writes to ro regions of memory backed by transparent huge
+pages cause an infinite loop in the kernel (handle_mm_fault sets
+FOLL_COW and returns 0 causing a retry, but follow_trans_huge_pmd bails
+out immidiately because `(flags & FOLL_WRITE) && !pmd_write(*pmd)` is
+true.
+
+While in this state the process is still SIGKILLable, but little else
+works (e.g.  no ptrace attach, no other signals).  This is easily
+reproduced with the following code (assuming thp are set to always):
+
+    #include <assert.h>
+    #include <fcntl.h>
+    #include <stdint.h>
+    #include <stdio.h>
+    #include <string.h>
+    #include <sys/mman.h>
+    #include <sys/stat.h>
+    #include <sys/types.h>
+    #include <sys/wait.h>
+    #include <unistd.h>
+
+    #define TEST_SIZE 5 * 1024 * 1024
+
+    int main(void) {
+      int status;
+      pid_t child;
+      int fd = open("/proc/self/mem", O_RDWR);
+      void *addr = mmap(NULL, TEST_SIZE, PROT_READ,
+                        MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+      assert(addr != MAP_FAILED);
+      pid_t parent_pid = getpid();
+      if ((child = fork()) == 0) {
+        void *addr2 = mmap(NULL, TEST_SIZE, PROT_READ | PROT_WRITE,
+                           MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+        assert(addr2 != MAP_FAILED);
+        memset(addr2, 'a', TEST_SIZE);
+        pwrite(fd, addr2, TEST_SIZE, (uintptr_t)addr);
+        return 0;
+      }
+      assert(child == waitpid(child, &status, 0));
+      assert(WIFEXITED(status) && WEXITSTATUS(status) == 0);
+      return 0;
+    }
+
+Fix this by updating follow_trans_huge_pmd in huge_memory.c analogously
+to the update in gup.c in the original commit.  The same pattern exists
+in follow_devmap_pmd.  However, we should not be able to reach that
+check with FOLL_COW set, so add WARN_ONCE to make sure we notice if we
+ever do.
+
+[akpm@linux-foundation.org: coding-style fixes]
+Link: http://lkml.kernel.org/r/20170106015025.GA38411@juliacomputing.com
+Signed-off-by: Keno Fischer <keno@juliacomputing.com>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Greg Thelen <gthelen@google.com>
+Cc: Nicholas Piggin <npiggin@gmail.com>
+Cc: Willy Tarreau <w@1wt.eu>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Kees Cook <keescook@chromium.org>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Hugh Dickins <hughd@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/huge_memory.c |   18 +++++++++++++++++-
+ 1 file changed, 17 insertions(+), 1 deletion(-)
+
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -772,6 +772,12 @@ struct page *follow_devmap_pmd(struct vm
+       assert_spin_locked(pmd_lockptr(mm, pmd));
++      /*
++       * When we COW a devmap PMD entry, we split it into PTEs, so we should
++       * not be in this function with `flags & FOLL_COW` set.
++       */
++      WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
++
+       if (flags & FOLL_WRITE && !pmd_write(*pmd))
+               return NULL;
+@@ -1118,6 +1124,16 @@ out_unlock:
+       return ret;
+ }
++/*
++ * FOLL_FORCE can write to even unwritable pmd's, but only
++ * after we've gone through a COW cycle and they are dirty.
++ */
++static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
++{
++      return pmd_write(pmd) ||
++             ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
++}
++
+ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
+                                  unsigned long addr,
+                                  pmd_t *pmd,
+@@ -1128,7 +1144,7 @@ struct page *follow_trans_huge_pmd(struc
+       assert_spin_locked(pmd_lockptr(mm, pmd));
+-      if (flags & FOLL_WRITE && !pmd_write(*pmd))
++      if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
+               goto out;
+       /* Avoid dumping huge zero page */
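
A self-contained model of the new predicate (the flag values below are
illustrative, not the kernel's actual FOLL_* bit assignments):

    #include <stdbool.h>
    #include <stdio.h>

    #define FOLL_FORCE 0x1 /* illustrative values only */
    #define FOLL_COW   0x2

    /* Mirrors can_follow_write_pmd(): a GUP write is allowed if the PMD
     * is writable, or if FOLL_FORCE is set and a COW cycle has already
     * happened (FOLL_COW set and the entry dirty). */
    static bool can_follow_write(bool writable, bool dirty, unsigned int flags) {
      return writable ||
             ((flags & FOLL_FORCE) && (flags & FOLL_COW) && dirty);
    }

    int main(void) {
      /* The buggy kernel tested only "writable", so the read-only THP
       * case below failed forever even after the COW was resolved. */
      printf("ro, force+cow, dirty: %d\n",
             can_follow_write(false, true, FOLL_FORCE | FOLL_COW));
      printf("ro, force only:       %d\n",
             can_follow_write(false, false, FOLL_FORCE));
      return 0;
    }
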
diff --git a/queue-4.9/mm-mempolicy.c-do-not-put-mempolicy-before-using-its-nodemask.patch b/queue-4.9/mm-mempolicy.c-do-not-put-mempolicy-before-using-its-nodemask.patch
new file mode 100644 (file)
index 0000000..e25774d
--- /dev/null
@@ -0,0 +1,48 @@
+From d51e9894d27492783fc6d1b489070b4ba66ce969 Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Tue, 24 Jan 2017 15:18:18 -0800
+Subject: mm/mempolicy.c: do not put mempolicy before using its nodemask
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit d51e9894d27492783fc6d1b489070b4ba66ce969 upstream.
+
+Since commit be97a41b291e ("mm/mempolicy.c: merge alloc_hugepage_vma to
+alloc_pages_vma") alloc_pages_vma() can potentially free a mempolicy by
+mpol_cond_put() before accessing the embedded nodemask by
+__alloc_pages_nodemask().  The commit log says it's so "we can use a
+single exit path within the function" but that's clearly wrong.  We can
+still do that when doing mpol_cond_put() after the allocation attempt.
+
+Make sure the mempolicy is not freed prematurely, otherwise
+__alloc_pages_nodemask() can end up using a bogus nodemask, which could
+lead e.g.  to premature OOM.
+
+Fixes: be97a41b291e ("mm/mempolicy.c: merge alloc_hugepage_vma to alloc_pages_vma")
+Link: http://lkml.kernel.org/r/20170118141124.8345-1-vbabka@suse.cz
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Acked-by: David Rientjes <rientjes@google.com>
+Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/mempolicy.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/mempolicy.c
++++ b/mm/mempolicy.c
+@@ -2023,8 +2023,8 @@ retry_cpuset:
+       nmask = policy_nodemask(gfp, pol);
+       zl = policy_zonelist(gfp, pol, node);
+-      mpol_cond_put(pol);
+       page = __alloc_pages_nodemask(gfp, order, zl, nmask);
++      mpol_cond_put(pol);
+ out:
+       if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+               goto retry_cpuset;
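
The bug is a plain use-after-free ordering problem: the nodemask lives inside
the refcounted mempolicy, so the conditional reference must be dropped only
after the allocator has consumed it. A minimal standalone model (the names
mirror the kernel's, the types are deliberate simplifications):

    #include <stdio.h>
    #include <stdlib.h>

    struct mempolicy { int refcnt; unsigned long nodemask; };

    /* Drops the conditional reference; the embedded nodemask dies with it. */
    static void mpol_cond_put(struct mempolicy *pol) {
      if (--pol->refcnt == 0)
        free(pol);
    }

    static void alloc_pages_nodemask(const unsigned long *nmask) {
      printf("allocating from nodes 0x%lx\n", *nmask);
    }

    int main(void) {
      struct mempolicy *pol = malloc(sizeof(*pol));
      if (!pol)
        return 1;
      pol->refcnt = 1;
      pol->nodemask = 0x3;

      /* Correct order, as restored by the patch: consume the nodemask
       * first, drop the reference afterwards. Swapping the two calls
       * leaves nmask pointing into freed memory -- the bug fixed here. */
      const unsigned long *nmask = &pol->nodemask;
      alloc_pages_nodemask(nmask);
      mpol_cond_put(pol);
      return 0;
    }
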
diff --git a/queue-4.9/mm-page_alloc-fix-check-for-null-preferred_zone.patch b/queue-4.9/mm-page_alloc-fix-check-for-null-preferred_zone.patch
new file mode 100644 (file)
index 0000000..aa96114
--- /dev/null
@@ -0,0 +1,92 @@
+From ea57485af8f4221312a5a95d63c382b45e7840dc Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Tue, 24 Jan 2017 15:18:32 -0800
+Subject: mm, page_alloc: fix check for NULL preferred_zone
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit ea57485af8f4221312a5a95d63c382b45e7840dc upstream.
+
+Patch series "fix premature OOM regression in 4.7+ due to cpuset races".
+
+This is v2 of my attempt to fix the recent report based on LTP cpuset
+stress test [1].  The intention is to go to stable 4.9 LTSS with this,
+as triggering repeated OOMs is not nice.  That's why the patches try to
+be not too intrusive.
+
+Unfortunately, while investigating I found that modifying the testcase to
+use per-VMA policies instead of per-task policies brings the OOMs
+back, but that seems to be a much older and harder-to-fix problem.  I have
+posted a RFC [2] but I believe that fixing the recent regressions has a
+higher priority.
+
+Longer-term we might try to think how to fix the cpuset mess in a better
+and less error prone way.  I was for example very surprised to learn,
+that cpuset updates change not only task->mems_allowed, but also
+nodemask of mempolicies.  Until now I expected the parameter to
+alloc_pages_nodemask() to be stable.  I wonder why we then treat
+cpusets specially in get_page_from_freelist() and distinguish HARDWALL
+etc, when there's unconditional intersection between mempolicy and
+cpuset.  I would expect the nodemask adjustment for saving overhead in
+g_p_f(), but that clearly doesn't happen in the current form.  So we
+have both crazy complexity and overhead, AFAICS.
+
+[1] https://lkml.kernel.org/r/CAFpQJXUq-JuEP=QPidy4p_=FN0rkH5Z-kfB4qBvsf6jMS87Edg@mail.gmail.com
+[2] https://lkml.kernel.org/r/7c459f26-13a6-a817-e508-b65b903a8378@suse.cz
+
+This patch (of 4):
+
+Since commit c33d6c06f60f ("mm, page_alloc: avoid looking up the first
+zone in a zonelist twice") we have a wrong check for NULL preferred_zone,
+which can theoretically happen due to concurrent cpuset modification.  We
+check the zoneref pointer which is never NULL and we should check the zone
+pointer.  Also document this in first_zones_zonelist() comment per Michal
+Hocko.
+
+Fixes: c33d6c06f60f ("mm, page_alloc: avoid looking up the first zone in a zonelist twice")
+Link: http://lkml.kernel.org/r/20170120103843.24587-2-vbabka@suse.cz
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Acked-by: Mel Gorman <mgorman@techsingularity.net>
+Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
+Cc: Ganapatrao Kulkarni <gpkulkarni@gmail.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/mmzone.h |    6 +++++-
+ mm/page_alloc.c        |    2 +-
+ 2 files changed, 6 insertions(+), 2 deletions(-)
+
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -972,12 +972,16 @@ static __always_inline struct zoneref *n
+  * @zonelist - The zonelist to search for a suitable zone
+  * @highest_zoneidx - The zone index of the highest zone to return
+  * @nodes - An optional nodemask to filter the zonelist with
+- * @zone - The first suitable zone found is returned via this parameter
++ * @return - Zoneref pointer for the first suitable zone found (see below)
+  *
+  * This function returns the first zone at or below a given zone index that is
+  * within the allowed nodemask. The zoneref returned is a cursor that can be
+  * used to iterate the zonelist with next_zones_zonelist by advancing it by
+  * one before calling.
++ *
++ * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is
++ * never NULL). This may happen either genuinely, or due to concurrent nodemask
++ * update due to cpuset modification.
+  */
+ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
+                                       enum zone_type highest_zoneidx,
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -3763,7 +3763,7 @@ retry_cpuset:
+        */
+       ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
+                                       ac.high_zoneidx, ac.nodemask);
+-      if (!ac.preferred_zoneref) {
++      if (!ac.preferred_zoneref->zone) {
+               page = NULL;
+               goto no_zone;
+       }
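
The subtlety is that first_zones_zonelist() returns a cursor into the
zonelist, never a NULL pointer; "no eligible zone" is signalled by the
cursor's ->zone field being NULL. A standalone model of that contract
(types heavily simplified):

    #include <stdio.h>

    struct zone { const char *name; };
    struct zoneref { struct zone *zone; int nid; };

    /* Model of first_zones_zonelist(): always returns a valid cursor; if
     * no zone is on an allowed node, it points at the NULL-zone sentinel. */
    static struct zoneref *first_zones_zonelist(struct zoneref *zl,
                                                unsigned int allowed_nodes) {
      struct zoneref *z = zl;
      while (z->zone && !(allowed_nodes & (1u << z->nid)))
        z++;
      return z; /* never NULL */
    }

    int main(void) {
      struct zone node0_normal = { "node0/Normal" };
      struct zoneref zonelist[] = { { &node0_normal, 0 }, { NULL, 0 } };

      /* A concurrent cpuset update is modelled as an empty allowed mask. */
      struct zoneref *z = first_zones_zonelist(zonelist, 0);

      printf("buggy check (!z):       %d\n", z == NULL);       /* never fires */
      printf("fixed check (!z->zone): %d\n", z->zone == NULL); /* fires */
      return 0;
    }
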
diff --git a/queue-4.9/mm-page_alloc-fix-fast-path-race-with-cpuset-update-or-removal.patch b/queue-4.9/mm-page_alloc-fix-fast-path-race-with-cpuset-update-or-removal.patch
new file mode 100644 (file)
index 0000000..4a1236b
--- /dev/null
@@ -0,0 +1,64 @@
+From 16096c25bf0ca5d87e4fa6ec6108ba53feead212 Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Tue, 24 Jan 2017 15:18:35 -0800
+Subject: mm, page_alloc: fix fast-path race with cpuset update or removal
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit 16096c25bf0ca5d87e4fa6ec6108ba53feead212 upstream.
+
+Ganapatrao Kulkarni reported that the LTP test cpuset01 in stress mode
+triggers the OOM killer in a few seconds, despite lots of free memory.  The
+test attempts to repeatedly fault in memory in one process in a cpuset,
+while changing allowed nodes of the cpuset between 0 and 1 in another
+process.
+
+One possible cause is that in the fast path we find the preferred
+zoneref according to current mems_allowed, so that it points to the
+middle of the zonelist, skipping e.g.  zones of node 1 completely.  If
+the mems_allowed is updated to contain only node 1, we never reach it in
+the zonelist, and trigger OOM before checking the cpuset_mems_cookie.
+
+This patch fixes the particular case by redoing the preferred zoneref
+search if we switch back to the original nodemask.  The condition is
+also slightly changed so that when the last non-root cpuset is removed,
+we don't miss it.
+
+Note that this is not a full fix, and more patches will follow.
+
+Link: http://lkml.kernel.org/r/20170120103843.24587-3-vbabka@suse.cz
+Fixes: 682a3385e773 ("mm, page_alloc: inline the fast path of the zonelist iterator")
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Reported-by: Ganapatrao Kulkarni <gpkulkarni@gmail.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Acked-by: Mel Gorman <mgorman@techsingularity.net>
+Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page_alloc.c |   10 +++++++++-
+ 1 file changed, 9 insertions(+), 1 deletion(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -3783,9 +3783,17 @@ retry_cpuset:
+       /*
+        * Restore the original nodemask if it was potentially replaced with
+        * &cpuset_current_mems_allowed to optimize the fast-path attempt.
++       * Also recalculate the starting point for the zonelist iterator or
++       * we could end up iterating over non-eligible zones endlessly.
+        */
+-      if (cpusets_enabled())
++      if (unlikely(ac.nodemask != nodemask)) {
+               ac.nodemask = nodemask;
++              ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
++                                              ac.high_zoneidx, ac.nodemask);
++              if (!ac.preferred_zoneref->zone)
++                      goto no_zone;
++      }
++
+       page = __alloc_pages_slowpath(alloc_mask, order, &ac);
+ no_zone:
diff --git a/queue-4.9/mm-page_alloc-fix-premature-oom-when-racing-with-cpuset-mems-update.patch b/queue-4.9/mm-page_alloc-fix-premature-oom-when-racing-with-cpuset-mems-update.patch
new file mode 100644 (file)
index 0000000..d46676d
--- /dev/null
@@ -0,0 +1,113 @@
+From e47483bca2cc59a4593b37a270b16ee42b1d9f08 Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Tue, 24 Jan 2017 15:18:41 -0800
+Subject: mm, page_alloc: fix premature OOM when racing with cpuset mems update
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit e47483bca2cc59a4593b37a270b16ee42b1d9f08 upstream.
+
+Ganapatrao Kulkarni reported that the LTP test cpuset01 in stress mode
+triggers the OOM killer in a few seconds, despite lots of free memory.  The
+test attempts to repeatedly fault in memory in one process in a cpuset,
+while changing allowed nodes of the cpuset between 0 and 1 in another
+process.
+
+The problem comes from insufficient protection against cpuset changes,
+which can cause get_page_from_freelist() to consider all zones as
+non-eligible due to nodemask and/or current->mems_allowed.  This was
+masked in the past by sufficient retries, but since commit 682a3385e773
+("mm, page_alloc: inline the fast path of the zonelist iterator") we fix
+the preferred_zoneref once, and don't iterate over the whole zonelist in
+further attempts, thus the only eligible zones might be placed in the
+zonelist before our starting point and we always miss them.
+
+A previous patch fixed this problem for current->mems_allowed.  However,
+cpuset changes also update the task's mempolicy nodemask.  The fix has
+two parts.  We have to repeat the preferred_zoneref search when we
+detect a cpuset update by way of the seqcount, and we have to check the
+seqcount before considering OOM.
+
+[akpm@linux-foundation.org: fix typo in comment]
+Link: http://lkml.kernel.org/r/20170120103843.24587-5-vbabka@suse.cz
+Fixes: c33d6c06f60f ("mm, page_alloc: avoid looking up the first zone in a zonelist twice")
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Reported-by: Ganapatrao Kulkarni <gpkulkarni@gmail.com>
+Acked-by: Mel Gorman <mgorman@techsingularity.net>
+Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page_alloc.c |   35 ++++++++++++++++++++++++-----------
+ 1 file changed, 24 insertions(+), 11 deletions(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -3534,6 +3534,17 @@ retry_cpuset:
+       no_progress_loops = 0;
+       compact_priority = DEF_COMPACT_PRIORITY;
+       cpuset_mems_cookie = read_mems_allowed_begin();
++      /*
++       * We need to recalculate the starting point for the zonelist iterator
++       * because we might have used different nodemask in the fast path, or
++       * there was a cpuset modification and we are retrying - otherwise we
++       * could end up iterating over non-eligible zones endlessly.
++       */
++      ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
++                                      ac->high_zoneidx, ac->nodemask);
++      if (!ac->preferred_zoneref->zone)
++              goto nopage;
++
+       /*
+        * The fast path uses conservative alloc_flags to succeed only until
+@@ -3694,6 +3705,13 @@ retry:
+                               &compaction_retries))
+               goto retry;
++      /*
++       * It's possible we raced with cpuset update so the OOM would be
++       * premature (see below the nopage: label for full explanation).
++       */
++      if (read_mems_allowed_retry(cpuset_mems_cookie))
++              goto retry_cpuset;
++
+       /* Reclaim has failed us, start killing things */
+       page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
+       if (page)
+@@ -3707,10 +3725,11 @@ retry:
+ nopage:
+       /*
+-       * When updating a task's mems_allowed, it is possible to race with
+-       * parallel threads in such a way that an allocation can fail while
+-       * the mask is being updated. If a page allocation is about to fail,
+-       * check if the cpuset changed during allocation and if so, retry.
++       * When updating a task's mems_allowed or mempolicy nodemask, it is
++       * possible to race with parallel threads in such a way that our
++       * allocation can fail while the mask is being updated. If we are about
++       * to fail, check if the cpuset changed during allocation and if so,
++       * retry.
+        */
+       if (read_mems_allowed_retry(cpuset_mems_cookie))
+               goto retry_cpuset;
+@@ -3801,15 +3820,9 @@ no_zone:
+       /*
+        * Restore the original nodemask if it was potentially replaced with
+        * &cpuset_current_mems_allowed to optimize the fast-path attempt.
+-       * Also recalculate the starting point for the zonelist iterator or
+-       * we could end up iterating over non-eligible zones endlessly.
+        */
+-      if (unlikely(ac.nodemask != nodemask)) {
++      if (unlikely(ac.nodemask != nodemask))
+               ac.nodemask = nodemask;
+-              ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
+-                                              ac.high_zoneidx, ac.nodemask);
+-              /* If we have NULL preferred zone, slowpath wll handle that */
+-      }
+       page = __alloc_pages_slowpath(alloc_mask, order, &ac);
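
Both halves of the fix follow the usual seqcount reader pattern behind
read_mems_allowed_begin()/read_mems_allowed_retry(): sample a sequence
counter before depending on the nodemask, and recheck it before taking an
irreversible step (here, declaring OOM). A single-threaded sketch of the
pattern, with the racing cpuset writer simulated inline:

    #include <stdbool.h>
    #include <stdio.h>

    static unsigned int mems_seq; /* bumped by cpuset updates (writers) */

    static unsigned int read_mems_allowed_begin(void) { return mems_seq; }
    static bool read_mems_allowed_retry(unsigned int seq) { return mems_seq != seq; }

    static bool try_allocate(void) { return false; /* pretend reclaim failed */ }

    int main(void) {
      bool raced = false;
      unsigned int cookie;

    retry_cpuset:
      cookie = read_mems_allowed_begin();
      /* ...recompute preferred_zoneref from the current nodemask here... */

      if (!raced) { /* simulate a cpuset write racing with this pass */
        mems_seq++;
        raced = true;
      }

      if (!try_allocate()) {
        /* Before declaring OOM, check for a concurrent nodemask update. */
        if (read_mems_allowed_retry(cookie)) {
          printf("nodemask changed, retrying instead of premature OOM\n");
          goto retry_cpuset;
        }
        printf("no racing update, the OOM is genuine\n");
      }
      return 0;
    }
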
diff --git a/queue-4.9/mm-page_alloc-move-cpuset-seqcount-checking-to-slowpath.patch b/queue-4.9/mm-page_alloc-move-cpuset-seqcount-checking-to-slowpath.patch
new file mode 100644 (file)
index 0000000..9337c1e
--- /dev/null
@@ -0,0 +1,139 @@
+From 5ce9bfef1d27944c119a397a9d827bef795487ce Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Tue, 24 Jan 2017 15:18:38 -0800
+Subject: mm, page_alloc: move cpuset seqcount checking to slowpath
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit 5ce9bfef1d27944c119a397a9d827bef795487ce upstream.
+
+This is a preparation for the following patch to make review simpler.
+While the primary motivation is a bug fix, this also simplifies the fast
+path, although the moved code is only enabled when cpusets are in use.
+
+Link: http://lkml.kernel.org/r/20170120103843.24587-4-vbabka@suse.cz
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Acked-by: Mel Gorman <mgorman@techsingularity.net>
+Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
+Cc: Ganapatrao Kulkarni <gpkulkarni@gmail.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page_alloc.c |   47 ++++++++++++++++++++++++++---------------------
+ 1 file changed, 26 insertions(+), 21 deletions(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -3502,12 +3502,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, u
+       struct page *page = NULL;
+       unsigned int alloc_flags;
+       unsigned long did_some_progress;
+-      enum compact_priority compact_priority = DEF_COMPACT_PRIORITY;
++      enum compact_priority compact_priority;
+       enum compact_result compact_result;
+-      int compaction_retries = 0;
+-      int no_progress_loops = 0;
++      int compaction_retries;
++      int no_progress_loops;
+       unsigned long alloc_start = jiffies;
+       unsigned int stall_timeout = 10 * HZ;
++      unsigned int cpuset_mems_cookie;
+       /*
+        * In the slowpath, we sanity check order to avoid ever trying to
+@@ -3528,6 +3529,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, u
+                               (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
+               gfp_mask &= ~__GFP_ATOMIC;
++retry_cpuset:
++      compaction_retries = 0;
++      no_progress_loops = 0;
++      compact_priority = DEF_COMPACT_PRIORITY;
++      cpuset_mems_cookie = read_mems_allowed_begin();
++
+       /*
+        * The fast path uses conservative alloc_flags to succeed only until
+        * kswapd needs to be woken up, and to avoid the cost of setting up
+@@ -3699,6 +3706,15 @@ retry:
+       }
+ nopage:
++      /*
++       * When updating a task's mems_allowed, it is possible to race with
++       * parallel threads in such a way that an allocation can fail while
++       * the mask is being updated. If a page allocation is about to fail,
++       * check if the cpuset changed during allocation and if so, retry.
++       */
++      if (read_mems_allowed_retry(cpuset_mems_cookie))
++              goto retry_cpuset;
++
+       warn_alloc(gfp_mask,
+                       "page allocation failure: order:%u", order);
+ got_pg:
+@@ -3713,7 +3729,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, u
+                       struct zonelist *zonelist, nodemask_t *nodemask)
+ {
+       struct page *page;
+-      unsigned int cpuset_mems_cookie;
+       unsigned int alloc_flags = ALLOC_WMARK_LOW;
+       gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
+       struct alloc_context ac = {
+@@ -3750,9 +3765,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, u
+       if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
+               alloc_flags |= ALLOC_CMA;
+-retry_cpuset:
+-      cpuset_mems_cookie = read_mems_allowed_begin();
+-
+       /* Dirty zone balancing only done in the fast path */
+       ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
+@@ -3765,6 +3777,11 @@ retry_cpuset:
+                                       ac.high_zoneidx, ac.nodemask);
+       if (!ac.preferred_zoneref->zone) {
+               page = NULL;
++              /*
++               * This might be due to race with cpuset_current_mems_allowed
++               * update, so make sure we retry with original nodemask in the
++               * slow path.
++               */
+               goto no_zone;
+       }
+@@ -3773,6 +3790,7 @@ retry_cpuset:
+       if (likely(page))
+               goto out;
++no_zone:
+       /*
+        * Runtime PM, block IO and its error handling path can deadlock
+        * because I/O on the device might not complete.
+@@ -3790,24 +3808,11 @@ retry_cpuset:
+               ac.nodemask = nodemask;
+               ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
+                                               ac.high_zoneidx, ac.nodemask);
+-              if (!ac.preferred_zoneref->zone)
+-                      goto no_zone;
++              /* If we have NULL preferred zone, slowpath wll handle that */
+       }
+       page = __alloc_pages_slowpath(alloc_mask, order, &ac);
+-no_zone:
+-      /*
+-       * When updating a task's mems_allowed, it is possible to race with
+-       * parallel threads in such a way that an allocation can fail while
+-       * the mask is being updated. If a page allocation is about to fail,
+-       * check if the cpuset changed during allocation and if so, retry.
+-       */
+-      if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) {
+-              alloc_mask = gfp_mask;
+-              goto retry_cpuset;
+-      }
+-
+ out:
+       if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
+           unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
diff --git a/queue-4.9/series b/queue-4.9/series
index 0c4d9b92ab38741aeec4a97844a8d40ca4347b22..46684bbdc128c7d15f10087634ae999d2cae9b3a 100644 (file)
--- a/queue-4.9/series
@@ -9,3 +9,17 @@ drm-vc4-return-einval-on-the-overflow-checks-failing.patch
 drm-vc4-fix-a-bounds-check.patch
 revert-drm-radeon-always-apply-pci-shutdown-callbacks.patch
 drm-atomic-clear-out-fence-when-duplicating-state.patch
+mm-huge_memory.c-respect-foll_force-foll_cow-for-thp.patch
+mm-mempolicy.c-do-not-put-mempolicy-before-using-its-nodemask.patch
+mm-page_alloc-fix-check-for-null-preferred_zone.patch
+mm-page_alloc-fix-fast-path-race-with-cpuset-update-or-removal.patch
+mm-page_alloc-move-cpuset-seqcount-checking-to-slowpath.patch
+mm-page_alloc-fix-premature-oom-when-racing-with-cpuset-mems-update.patch
+vring-force-use-of-dma-api-for-arm-based-systems-with-legacy-devices.patch
+userns-make-ucounts-lock-irq-safe.patch
+sysctl-fix-proc_doulongvec_ms_jiffies_minmax.patch
+xfs-prevent-quotacheck-from-overloading-inode-lru.patch
+isdn-eicon-silence-misleading-array-bounds-warning.patch
+btrfs-remove-old-tree_root-case-in-btrfs_read_locked_inode.patch
+btrfs-disable-xattr-operations-on-subvolume-directories.patch
+btrfs-remove-get-set-_acl-from-btrfs_dir_ro_inode_operations.patch
diff --git a/queue-4.9/sysctl-fix-proc_doulongvec_ms_jiffies_minmax.patch b/queue-4.9/sysctl-fix-proc_doulongvec_ms_jiffies_minmax.patch
new file mode 100644 (file)
index 0000000..ea990b8
--- /dev/null
@@ -0,0 +1,34 @@
+From ff9f8a7cf935468a94d9927c68b00daae701667e Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 25 Jan 2017 18:20:55 -0800
+Subject: sysctl: fix proc_doulongvec_ms_jiffies_minmax()
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit ff9f8a7cf935468a94d9927c68b00daae701667e upstream.
+
+We perform the conversion between kernel jiffies and ms only when
+exporting the kernel value to user space.
+
+We need to do the opposite operation when the value is written by the user.
+
+Only matters when HZ != 1000
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sysctl.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/kernel/sysctl.c
++++ b/kernel/sysctl.c
+@@ -2487,6 +2487,7 @@ static int __do_proc_doulongvec_minmax(v
+                               break;
+                       if (neg)
+                               continue;
++                      val = convmul * val / convdiv;
+                       if ((min && val < *min) || (max && val > *max))
+                               continue;
+                       *i = val;
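
The added line is the write-path inverse of the existing read-path
conversion: reading converts jiffies to milliseconds (val * convdiv /
convmul), so writing must convert milliseconds back to jiffies (val *
convmul / convdiv), where the _ms_jiffies variant passes convmul = HZ and
convdiv = 1000. A quick round-trip check (the HZ value is an assumption;
only HZ != 1000 makes the bug visible):

    #include <stdio.h>

    #define HZ      250  /* assumption: kernel built with CONFIG_HZ=250 */
    #define CONVMUL HZ   /* arguments used for the _ms_jiffies variant */
    #define CONVDIV 1000

    int main(void) {
      unsigned long ms = 4000;

      /* Write path (the added line): user-supplied ms -> kernel jiffies. */
      unsigned long jiffies = CONVMUL * ms / CONVDIV;

      /* Read path (pre-existing): kernel jiffies -> ms shown to the user. */
      unsigned long back = CONVDIV * jiffies / CONVMUL;

      /* Without the fix, 4000 written by the user was stored as 4000
       * jiffies (16 seconds at HZ=250) and read back as 16000 ms. */
      printf("%lu ms -> %lu jiffies -> %lu ms\n", ms, jiffies, back);
      return 0;
    }
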
diff --git a/queue-4.9/userns-make-ucounts-lock-irq-safe.patch b/queue-4.9/userns-make-ucounts-lock-irq-safe.patch
new file mode 100644 (file)
index 0000000..2ca2e96
--- /dev/null
@@ -0,0 +1,216 @@
+From 880a38547ff08715ce4f1daf9a4bb30c87676e68 Mon Sep 17 00:00:00 2001
+From: Nikolay Borisov <n.borisov.lkml@gmail.com>
+Date: Fri, 20 Jan 2017 15:21:35 +0200
+Subject: userns: Make ucounts lock irq-safe
+
+From: Nikolay Borisov <n.borisov.lkml@gmail.com>
+
+commit 880a38547ff08715ce4f1daf9a4bb30c87676e68 upstream.
+
+The ucounts_lock is being used to protect various ucounts lifecycle
+management functionalities. However, those services can also be invoked
+when a pidns is being freed in an RCU callback (e.g. softirq context).
+This can lead to deadlocks. There were already efforts trying to
+prevent similar deadlocks in add7c65ca426 ("pid: fix lockdep deadlock
+warning due to ucount_lock"), however they just moved the context
+from hardirq to softirq. Fix this issue once and for all by explicitly
+making the lock disable irqs altogether.
+
+Dmitry Vyukov <dvyukov@google.com> reported:
+
+> I've got the following deadlock report while running syzkaller fuzzer
+> on eec0d3d065bfcdf9cd5f56dd2a36b94d12d32297 of linux-next (on odroid
+> device if it matters):
+>
+> =================================
+> [ INFO: inconsistent lock state ]
+> 4.10.0-rc3-next-20170112-xc2-dirty #6 Not tainted
+> ---------------------------------
+> inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
+> swapper/2/0 [HC0[0]:SC1[1]:HE1:SE0] takes:
+>  (ucounts_lock){+.?...}, at: [<     inline     >] spin_lock
+> ./include/linux/spinlock.h:302
+>  (ucounts_lock){+.?...}, at: [<ffff2000081678c8>]
+> put_ucounts+0x60/0x138 kernel/ucount.c:162
+> {SOFTIRQ-ON-W} state was registered at:
+> [<ffff2000081c82d8>] mark_lock+0x220/0xb60 kernel/locking/lockdep.c:3054
+> [<     inline     >] mark_irqflags kernel/locking/lockdep.c:2941
+> [<ffff2000081c97a8>] __lock_acquire+0x388/0x3260 kernel/locking/lockdep.c:3295
+> [<ffff2000081cce24>] lock_acquire+0xa4/0x138 kernel/locking/lockdep.c:3753
+> [<     inline     >] __raw_spin_lock ./include/linux/spinlock_api_smp.h:144
+> [<ffff200009798128>] _raw_spin_lock+0x90/0xd0 kernel/locking/spinlock.c:151
+> [<     inline     >] spin_lock ./include/linux/spinlock.h:302
+> [<     inline     >] get_ucounts kernel/ucount.c:131
+> [<ffff200008167c28>] inc_ucount+0x80/0x6c8 kernel/ucount.c:189
+> [<     inline     >] inc_mnt_namespaces fs/namespace.c:2818
+> [<ffff200008481850>] alloc_mnt_ns+0x78/0x3a8 fs/namespace.c:2849
+> [<ffff200008487298>] create_mnt_ns+0x28/0x200 fs/namespace.c:2959
+> [<     inline     >] init_mount_tree fs/namespace.c:3199
+> [<ffff200009bd6674>] mnt_init+0x258/0x384 fs/namespace.c:3251
+> [<ffff200009bd60bc>] vfs_caches_init+0x6c/0x80 fs/dcache.c:3626
+> [<ffff200009bb1114>] start_kernel+0x414/0x460 init/main.c:648
+> [<ffff200009bb01e8>] __primary_switched+0x6c/0x70 arch/arm64/kernel/head.S:456
+> irq event stamp: 2316924
+> hardirqs last  enabled at (2316924): [<     inline     >] rcu_do_batch
+> kernel/rcu/tree.c:2911
+> hardirqs last  enabled at (2316924): [<     inline     >]
+> invoke_rcu_callbacks kernel/rcu/tree.c:3182
+> hardirqs last  enabled at (2316924): [<     inline     >]
+> __rcu_process_callbacks kernel/rcu/tree.c:3149
+> hardirqs last  enabled at (2316924): [<ffff200008210414>]
+> rcu_process_callbacks+0x7a4/0xc28 kernel/rcu/tree.c:3166
+> hardirqs last disabled at (2316923): [<     inline     >] rcu_do_batch
+> kernel/rcu/tree.c:2900
+> hardirqs last disabled at (2316923): [<     inline     >]
+> invoke_rcu_callbacks kernel/rcu/tree.c:3182
+> hardirqs last disabled at (2316923): [<     inline     >]
+> __rcu_process_callbacks kernel/rcu/tree.c:3149
+> hardirqs last disabled at (2316923): [<ffff20000820fe80>]
+> rcu_process_callbacks+0x210/0xc28 kernel/rcu/tree.c:3166
+> softirqs last  enabled at (2316912): [<ffff20000811b4c4>]
+> _local_bh_enable+0x4c/0x80 kernel/softirq.c:155
+> softirqs last disabled at (2316913): [<     inline     >]
+> do_softirq_own_stack ./include/linux/interrupt.h:488
+> softirqs last disabled at (2316913): [<     inline     >]
+> invoke_softirq kernel/softirq.c:371
+> softirqs last disabled at (2316913): [<ffff20000811c994>]
+> irq_exit+0x264/0x308 kernel/softirq.c:405
+>
+> other info that might help us debug this:
+>  Possible unsafe locking scenario:
+>
+>        CPU0
+>        ----
+>   lock(ucounts_lock);
+>   <Interrupt>
+>     lock(ucounts_lock);
+>
+>  *** DEADLOCK ***
+>
+> 1 lock held by swapper/2/0:
+>  #0:  (rcu_callback){......}, at: [<     inline     >] __rcu_reclaim
+> kernel/rcu/rcu.h:108
+>  #0:  (rcu_callback){......}, at: [<     inline     >] rcu_do_batch
+> kernel/rcu/tree.c:2919
+>  #0:  (rcu_callback){......}, at: [<     inline     >]
+> invoke_rcu_callbacks kernel/rcu/tree.c:3182
+>  #0:  (rcu_callback){......}, at: [<     inline     >]
+> __rcu_process_callbacks kernel/rcu/tree.c:3149
+>  #0:  (rcu_callback){......}, at: [<ffff200008210390>]
+> rcu_process_callbacks+0x720/0xc28 kernel/rcu/tree.c:3166
+>
+> stack backtrace:
+> CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.10.0-rc3-next-20170112-xc2-dirty #6
+> Hardware name: Hardkernel ODROID-C2 (DT)
+> Call trace:
+> [<ffff20000808fa60>] dump_backtrace+0x0/0x440 arch/arm64/kernel/traps.c:500
+> [<ffff20000808fec0>] show_stack+0x20/0x30 arch/arm64/kernel/traps.c:225
+> [<ffff2000088a99e0>] dump_stack+0x110/0x168
+> [<ffff2000082fa2b4>] print_usage_bug.part.27+0x49c/0x4bc
+> kernel/locking/lockdep.c:2387
+> [<     inline     >] print_usage_bug kernel/locking/lockdep.c:2357
+> [<     inline     >] valid_state kernel/locking/lockdep.c:2400
+> [<     inline     >] mark_lock_irq kernel/locking/lockdep.c:2617
+> [<ffff2000081c89ec>] mark_lock+0x934/0xb60 kernel/locking/lockdep.c:3065
+> [<     inline     >] mark_irqflags kernel/locking/lockdep.c:2923
+> [<ffff2000081c9a60>] __lock_acquire+0x640/0x3260 kernel/locking/lockdep.c:3295
+> [<ffff2000081cce24>] lock_acquire+0xa4/0x138 kernel/locking/lockdep.c:3753
+> [<     inline     >] __raw_spin_lock ./include/linux/spinlock_api_smp.h:144
+> [<ffff200009798128>] _raw_spin_lock+0x90/0xd0 kernel/locking/spinlock.c:151
+> [<     inline     >] spin_lock ./include/linux/spinlock.h:302
+> [<ffff2000081678c8>] put_ucounts+0x60/0x138 kernel/ucount.c:162
+> [<ffff200008168364>] dec_ucount+0xf4/0x158 kernel/ucount.c:214
+> [<     inline     >] dec_pid_namespaces kernel/pid_namespace.c:89
+> [<ffff200008293dc8>] delayed_free_pidns+0x40/0xe0 kernel/pid_namespace.c:156
+> [<     inline     >] __rcu_reclaim kernel/rcu/rcu.h:118
+> [<     inline     >] rcu_do_batch kernel/rcu/tree.c:2919
+> [<     inline     >] invoke_rcu_callbacks kernel/rcu/tree.c:3182
+> [<     inline     >] __rcu_process_callbacks kernel/rcu/tree.c:3149
+> [<ffff2000082103d8>] rcu_process_callbacks+0x768/0xc28 kernel/rcu/tree.c:3166
+> [<ffff2000080821dc>] __do_softirq+0x324/0x6e0 kernel/softirq.c:284
+> [<     inline     >] do_softirq_own_stack ./include/linux/interrupt.h:488
+> [<     inline     >] invoke_softirq kernel/softirq.c:371
+> [<ffff20000811c994>] irq_exit+0x264/0x308 kernel/softirq.c:405
+> [<ffff2000081ecc28>] __handle_domain_irq+0xc0/0x150 kernel/irq/irqdesc.c:636
+> [<ffff200008081c80>] gic_handle_irq+0x68/0xd8
+> Exception stack(0xffff8000648e7dd0 to 0xffff8000648e7f00)
+> 7dc0:                                   ffff8000648d4b3c 0000000000000007
+> 7de0: 0000000000000000 1ffff0000c91a967 1ffff0000c91a967 1ffff0000c91a967
+> 7e00: ffff20000a4b6b68 0000000000000001 0000000000000007 0000000000000001
+> 7e20: 1fffe4000149ae90 ffff200009d35000 0000000000000000 0000000000000002
+> 7e40: 0000000000000000 0000000000000000 0000000002624a1a 0000000000000000
+> 7e60: 0000000000000000 ffff200009cbcd88 000060006d2ed000 0000000000000140
+> 7e80: ffff200009cff000 ffff200009cb6000 ffff200009cc2020 ffff200009d2159d
+> 7ea0: 0000000000000000 ffff8000648d4380 0000000000000000 ffff8000648e7f00
+> 7ec0: ffff20000820a478 ffff8000648e7f00 ffff20000820a47c 0000000010000145
+> 7ee0: 0000000000000140 dfff200000000000 ffffffffffffffff ffff20000820a478
+> [<ffff2000080837f8>] el1_irq+0xb8/0x130 arch/arm64/kernel/entry.S:486
+> [<     inline     >] arch_local_irq_restore
+> ./arch/arm64/include/asm/irqflags.h:81
+> [<ffff20000820a47c>] rcu_idle_exit+0x64/0xa8 kernel/rcu/tree.c:1030
+> [<     inline     >] cpuidle_idle_call kernel/sched/idle.c:200
+> [<ffff2000081bcbfc>] do_idle+0x1dc/0x2d0 kernel/sched/idle.c:243
+> [<ffff2000081bd1cc>] cpu_startup_entry+0x24/0x28 kernel/sched/idle.c:345
+> [<ffff200008099f8c>] secondary_start_kernel+0x2cc/0x358
+> arch/arm64/kernel/smp.c:276
+> [<000000000279f1a4>] 0x279f1a4
+
+Reported-by: Dmitry Vyukov <dvyukov@google.com>
+Tested-by: Dmitry Vyukov <dvyukov@google.com>
+Fixes: add7c65ca426 ("pid: fix lockdep deadlock warning due to ucount_lock")
+Fixes: f333c700c610 ("pidns: Add a limit on the number of pid namespaces")
+Link: https://www.spinics.net/lists/kernel/msg2426637.html
+Signed-off-by: Nikolay Borisov <n.borisov.lkml@gmail.com>
+Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/ucount.c |   14 ++++++++------
+ 1 file changed, 8 insertions(+), 6 deletions(-)
+
+--- a/kernel/ucount.c
++++ b/kernel/ucount.c
+@@ -128,10 +128,10 @@ static struct ucounts *get_ucounts(struc
+       struct hlist_head *hashent = ucounts_hashentry(ns, uid);
+       struct ucounts *ucounts, *new;
+-      spin_lock(&ucounts_lock);
++      spin_lock_irq(&ucounts_lock);
+       ucounts = find_ucounts(ns, uid, hashent);
+       if (!ucounts) {
+-              spin_unlock(&ucounts_lock);
++              spin_unlock_irq(&ucounts_lock);
+               new = kzalloc(sizeof(*new), GFP_KERNEL);
+               if (!new)
+@@ -141,7 +141,7 @@ static struct ucounts *get_ucounts(struc
+               new->uid = uid;
+               atomic_set(&new->count, 0);
+-              spin_lock(&ucounts_lock);
++              spin_lock_irq(&ucounts_lock);
+               ucounts = find_ucounts(ns, uid, hashent);
+               if (ucounts) {
+                       kfree(new);
+@@ -152,16 +152,18 @@ static struct ucounts *get_ucounts(struc
+       }
+       if (!atomic_add_unless(&ucounts->count, 1, INT_MAX))
+               ucounts = NULL;
+-      spin_unlock(&ucounts_lock);
++      spin_unlock_irq(&ucounts_lock);
+       return ucounts;
+ }
+ static void put_ucounts(struct ucounts *ucounts)
+ {
++      unsigned long flags;
++
+       if (atomic_dec_and_test(&ucounts->count)) {
+-              spin_lock(&ucounts_lock);
++              spin_lock_irqsave(&ucounts_lock, flags);
+               hlist_del_init(&ucounts->node);
+-              spin_unlock(&ucounts_lock);
++              spin_unlock_irqrestore(&ucounts_lock, flags);
+               kfree(ucounts);
+       }
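
The deadlock needs only one CPU: plain spin_lock() leaves interrupts
enabled, so an RCU callback running from softirq context can interrupt the
critical section and then spin forever on the lock its own CPU already
holds. A toy single-threaded model of that interleaving (no real locking,
just the state machine):

    #include <stdbool.h>
    #include <stdio.h>

    static bool ucounts_lock_held; /* models one CPU's view of the lock */

    /* Models put_ucounts() running from an RCU callback (softirq). */
    static void softirq_put_ucounts(void) {
      if (ucounts_lock_held) {
        printf("softirq spins on a lock its own CPU holds: deadlock\n");
        return; /* the real CPU would spin here forever */
      }
      /* ...hlist_del_init() and kfree() would go here... */
    }

    int main(void) {
      /* Process context takes ucounts_lock with plain spin_lock():
       * interrupts stay enabled, so a softirq can still preempt it. */
      ucounts_lock_held = true;
      softirq_put_ucounts(); /* RCU callback fires mid-critical-section */
      ucounts_lock_held = false;

      /* With spin_lock_irq()/spin_lock_irqsave(), interrupts (and hence
       * softirqs) are held off while the lock is held, so the callback
       * can only run once the lock is free again. */
      return 0;
    }
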
diff --git a/queue-4.9/vring-force-use-of-dma-api-for-arm-based-systems-with-legacy-devices.patch b/queue-4.9/vring-force-use-of-dma-api-for-arm-based-systems-with-legacy-devices.patch
new file mode 100644 (file)
index 0000000..4dc7003
--- /dev/null
@@ -0,0 +1,65 @@
+From c7070619f3408d9a0dffbed9149e6f00479cf43b Mon Sep 17 00:00:00 2001
+From: Will Deacon <will.deacon@arm.com>
+Date: Fri, 20 Jan 2017 10:33:32 +0000
+Subject: vring: Force use of DMA API for ARM-based systems with legacy devices
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit c7070619f3408d9a0dffbed9149e6f00479cf43b upstream.
+
+Booting Linux on an ARM fastmodel containing an SMMU emulation results
+in an unexpected I/O page fault from the legacy virtio-blk PCI device:
+
+[    1.211721] arm-smmu-v3 2b400000.smmu: event 0x10 received:
+[    1.211800] arm-smmu-v3 2b400000.smmu:      0x00000000fffff010
+[    1.211880] arm-smmu-v3 2b400000.smmu:      0x0000020800000000
+[    1.211959] arm-smmu-v3 2b400000.smmu:      0x00000008fa081002
+[    1.212075] arm-smmu-v3 2b400000.smmu:      0x0000000000000000
+[    1.212155] arm-smmu-v3 2b400000.smmu: event 0x10 received:
+[    1.212234] arm-smmu-v3 2b400000.smmu:      0x00000000fffff010
+[    1.212314] arm-smmu-v3 2b400000.smmu:      0x0000020800000000
+[    1.212394] arm-smmu-v3 2b400000.smmu:      0x00000008fa081000
+[    1.212471] arm-smmu-v3 2b400000.smmu:      0x0000000000000000
+
+<system hangs failing to read partition table>
+
+This is because the legacy virtio-blk device is behind an SMMU, so we
+have consequently swizzled its DMA ops and configured the SMMU to
+translate accesses. This then requires the vring code to use the DMA API
+to establish translations, otherwise all transactions will result in
+fatal faults and termination.
+
+Given that ARM-based systems only see an SMMU if one is really present
+(the topology is all described by firmware tables such as device-tree or
+IORT), then we can safely use the DMA API for all legacy virtio devices.
+Modern devices can advertise the presence of an IOMMU using the
+VIRTIO_F_IOMMU_PLATFORM feature flag.
+
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Michael S. Tsirkin <mst@redhat.com>
+Fixes: 876945dbf649 ("arm64: Hook up IOMMU dma_ops")
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
+Acked-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/virtio/virtio_ring.c |    7 +++++++
+ 1 file changed, 7 insertions(+)
+
+--- a/drivers/virtio/virtio_ring.c
++++ b/drivers/virtio/virtio_ring.c
+@@ -159,6 +159,13 @@ static bool vring_use_dma_api(struct vir
+       if (xen_domain())
+               return true;
++      /*
++       * On ARM-based machines, the DMA ops will do the right thing,
++       * so always use them with legacy devices.
++       */
++      if (IS_ENABLED(CONFIG_ARM) || IS_ENABLED(CONFIG_ARM64))
++              return !virtio_has_feature(vdev, VIRTIO_F_VERSION_1);
++
+       return false;
+ }
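
A standalone model of the patched decision (VIRTIO_F_VERSION_1 is feature
bit 32 per the virtio spec; is_arm stands in for the IS_ENABLED(CONFIG_ARM)
|| IS_ENABLED(CONFIG_ARM64) check):

    #include <stdbool.h>
    #include <stdio.h>

    #define VIRTIO_F_VERSION_1 (1ULL << 32) /* "modern" (virtio 1.0) device */

    struct virtio_device { unsigned long long features; };

    static bool is_arm = true; /* stand-in for the IS_ENABLED() checks */
    static bool xen_domain(void) { return false; }

    static bool vring_use_dma_api(const struct virtio_device *vdev) {
      if (xen_domain())
        return true;
      /* A legacy (pre-1.0) device cannot advertise VIRTIO_F_IOMMU_PLATFORM,
       * so on ARM assume it may sit behind an SMMU and use the DMA API. */
      if (is_arm)
        return !(vdev->features & VIRTIO_F_VERSION_1);
      return false;
    }

    int main(void) {
      struct virtio_device legacy = { .features = 0 };
      struct virtio_device modern = { .features = VIRTIO_F_VERSION_1 };

      printf("legacy device uses DMA API: %d\n", vring_use_dma_api(&legacy));
      printf("modern device uses DMA API: %d\n", vring_use_dma_api(&modern));
      return 0;
    }
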
diff --git a/queue-4.9/xfs-prevent-quotacheck-from-overloading-inode-lru.patch b/queue-4.9/xfs-prevent-quotacheck-from-overloading-inode-lru.patch
new file mode 100644 (file)
index 0000000..c938306
--- /dev/null
@@ -0,0 +1,56 @@
+From e0d76fa4475ef2cf4b52d18588b8ce95153d021b Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Thu, 26 Jan 2017 13:18:09 -0800
+Subject: xfs: prevent quotacheck from overloading inode lru
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit e0d76fa4475ef2cf4b52d18588b8ce95153d021b upstream.
+
+Quotacheck runs at mount time in situations where quota accounting must
+be recalculated. In doing so, it uses bulkstat to visit every inode in
+the filesystem. Historically, every inode processed during quotacheck
+was released and immediately tagged for reclaim because quotacheck runs
+before the superblock is marked active by the VFS. In other words,
+the final iput() led to an immediate ->destroy_inode() call, which
+allowed the XFS background reclaim worker to start reclaiming inodes.
+
+Commit 17c12bcd3 ("xfs: when replaying bmap operations, don't let
+unlinked inodes get reaped") marks the XFS superblock active sooner as
+part of the mount process to support caching inodes processed during log
+recovery. This occurs before quotacheck and thus means all inodes
+processed by quotacheck are inserted to the LRU on release.  The
+s_umount lock is held until the mount has completed and thus prevents
+the shrinkers from operating on the sb. This means that quotacheck can
+excessively populate the inode LRU and lead to OOM conditions on systems
+without sufficient RAM.
+
+Update the quotacheck bulkstat handler to set XFS_IGET_DONTCACHE on
+inodes processed by quotacheck. This causes ->drop_inode() to return 1
+and in turn causes iput_final() to evict the inode. This preserves the
+original quotacheck behavior and prevents it from overloading the LRU
+and running out of memory.
+
+Reported-by: Martin Svec <martin.svec@zoner.cz>
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Eric Sandeen <sandeen@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_qm.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_qm.c
++++ b/fs/xfs/xfs_qm.c
+@@ -1177,7 +1177,8 @@ xfs_qm_dqusage_adjust(
+        * the case in all other instances. It's OK that we do this because
+        * quotacheck is done only at mount time.
+        */
+-      error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip);
++      error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, XFS_ILOCK_EXCL,
++                       &ip);
+       if (error) {
+               *res = BULKSTAT_RV_NOTHING;
+               return error;
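
The mechanism described above lives in the VFS: iput_final() only parks an
inode on the LRU when ->drop_inode() returns 0 and the superblock is
active. A stubbed-down model of that decision (the types and fields are
simplifications, not the kernel's):

    #include <stdbool.h>
    #include <stdio.h>

    struct inode { bool dontcache; bool sb_active; };

    /* XFS_IGET_DONTCACHE makes the filesystem's ->drop_inode() return 1. */
    static int drop_inode(const struct inode *inode) {
      return inode->dontcache;
    }

    static void iput_final(const struct inode *inode) {
      if (!drop_inode(inode) && inode->sb_active) {
        printf("parked on the inode LRU (piles up during quotacheck)\n");
        return;
      }
      printf("evicted immediately\n");
    }

    int main(void) {
      struct inode cached = { .dontcache = false, .sb_active = true };
      struct inode dontc  = { .dontcache = true,  .sb_active = true };

      iput_final(&cached); /* pre-fix behaviour once the sb is active */
      iput_final(&dontc);  /* post-fix behaviour for quotacheck inodes */
      return 0;
    }
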