From 16fff1c0ee29a403bd78d97776b4f7ce066a9289 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Mon, 4 Dec 2017 09:48:14 +0100
Subject: [PATCH] 4.14-stable patches

added patches:
	autofs-revert-autofs-fix-at_no_automount-not-being-honored.patch
	autofs-revert-autofs-take-more-care-to-not-update-last_used-on-path-walk.patch
	device-dax-implement-split-to-catch-invalid-munmap-attempts.patch
	exec-avoid-rlimit_stack-races-with-prlimit.patch
	fs-fat-inode.c-fix-sb_rdonly-change.patch
	ib-core-disable-memory-registration-of-filesystem-dax-vmas.patch
	mm-cma-fix-alloc_contig_range-ret-code-potential-leak.patch
	mm-fail-get_vaddr_frames-for-filesystem-dax-mappings.patch
	mm-fix-device-dax-pud-write-faults-triggered-by-get_user_pages.patch
	mm-hugetlb-fix-null-pointer-dereference-on-5-level-paging-machine.patch
	mm-hugetlbfs-introduce-split-to-vm_operations_struct.patch
	mm-introduce-get_user_pages_longterm.patch
	mm-madvise.c-fix-madvise-infinite-loop-under-special-circumstances.patch
	mm-memcg-fix-mem_cgroup_swapout-for-thps.patch
	mm-migrate-fix-an-incorrect-call-of-prep_transhuge_page.patch
	mm-thp-do-not-make-page-table-dirty-unconditionally-in-touch_pd.patch
	v4l2-disable-filesystem-dax-mapping-support.patch
---
 ...ix-at_no_automount-not-being-honored.patch |  81 +++++++
 ...to-not-update-last_used-on-path-walk.patch |  79 +++++++
 ...lit-to-catch-invalid-munmap-attempts.patch |  73 +++++++
 ...void-rlimit_stack-races-with-prlimit.patch |  52 +++++
 .../fs-fat-inode.c-fix-sb_rdonly-change.patch |  42 ++++
 ...-registration-of-filesystem-dax-vmas.patch |  51 +++++
 ...contig_range-ret-code-potential-leak.patch |  63 ++++++
 ...r_frames-for-filesystem-dax-mappings.patch |  63 ++++++
 ...e-faults-triggered-by-get_user_pages.patch | 114 ++++++++++
 ...ereference-on-5-level-paging-machine.patch |  44 ++++
 ...roduce-split-to-vm_operations_struct.patch |  90 ++++++++
 ...mm-introduce-get_user_pages_longterm.patch | 201 ++++++++++++++++++
 ...ite-loop-under-special-circumstances.patch |  74 +++++++
 ...emcg-fix-mem_cgroup_swapout-for-thps.patch |  45 ++++
 ...ncorrect-call-of-prep_transhuge_page.patch |  60 ++++++
 ...le-dirty-unconditionally-in-touch_pd.patch | 108 ++++++++++
 queue-4.14/series                             |  17 ++
 ...sable-filesystem-dax-mapping-support.patch |  66 ++++++
 18 files changed, 1323 insertions(+)
 create mode 100644 queue-4.14/autofs-revert-autofs-fix-at_no_automount-not-being-honored.patch
 create mode 100644 queue-4.14/autofs-revert-autofs-take-more-care-to-not-update-last_used-on-path-walk.patch
 create mode 100644 queue-4.14/device-dax-implement-split-to-catch-invalid-munmap-attempts.patch
 create mode 100644 queue-4.14/exec-avoid-rlimit_stack-races-with-prlimit.patch
 create mode 100644 queue-4.14/fs-fat-inode.c-fix-sb_rdonly-change.patch
 create mode 100644 queue-4.14/ib-core-disable-memory-registration-of-filesystem-dax-vmas.patch
 create mode 100644 queue-4.14/mm-cma-fix-alloc_contig_range-ret-code-potential-leak.patch
 create mode 100644 queue-4.14/mm-fail-get_vaddr_frames-for-filesystem-dax-mappings.patch
 create mode 100644 queue-4.14/mm-fix-device-dax-pud-write-faults-triggered-by-get_user_pages.patch
 create mode 100644 queue-4.14/mm-hugetlb-fix-null-pointer-dereference-on-5-level-paging-machine.patch
 create mode 100644 queue-4.14/mm-hugetlbfs-introduce-split-to-vm_operations_struct.patch
 create mode 100644 queue-4.14/mm-introduce-get_user_pages_longterm.patch
 create mode 100644 queue-4.14/mm-madvise.c-fix-madvise-infinite-loop-under-special-circumstances.patch
 create mode 100644 queue-4.14/mm-memcg-fix-mem_cgroup_swapout-for-thps.patch
 create mode 100644 queue-4.14/mm-migrate-fix-an-incorrect-call-of-prep_transhuge_page.patch
 create mode 100644 queue-4.14/mm-thp-do-not-make-page-table-dirty-unconditionally-in-touch_pd.patch
 create mode 100644 queue-4.14/v4l2-disable-filesystem-dax-mapping-support.patch
diff --git a/queue-4.14/autofs-revert-autofs-fix-at_no_automount-not-being-honored.patch b/queue-4.14/autofs-revert-autofs-fix-at_no_automount-not-being-honored.patch
new file mode 100644
index 00000000000..4f43f960dfd
--- /dev/null
+++ b/queue-4.14/autofs-revert-autofs-fix-at_no_automount-not-being-honored.patch
@@ -0,0 +1,81 @@
+From 5d38f049cee1e1c4a7ac55aa79d37d01ddcc3860 Mon Sep 17 00:00:00 2001
+From: Ian Kent
+Date: Wed, 29 Nov 2017 16:11:26 -0800
+Subject: autofs: revert "autofs: fix AT_NO_AUTOMOUNT not being honored"
+
+From: Ian Kent
+
+commit 5d38f049cee1e1c4a7ac55aa79d37d01ddcc3860 upstream.
+
+Commit 42f461482178 ("autofs: fix AT_NO_AUTOMOUNT not being honored")
+allowed the fstatat(2) system call to properly honor the AT_NO_AUTOMOUNT
+flag but introduced a semantic change.
+
+In order to honor AT_NO_AUTOMOUNT a semantic change was made to the
+negative dentry case for stat family system calls in follow_automount().
+
+This changed the unconditional triggering of an automount in this case
+to no longer be done and an error returned instead.
+
+This has caused more problems than I expected so reverting the change is
+needed.
+
+In a discussion with Neil Brown it was concluded that the automount(8)
+daemon can implement this change without kernel modifications. So that
+will be done instead and the autofs module documentation updated with a
+description of the problem and what needs to be done by module users for
+this specific case.
+
+Link: http://lkml.kernel.org/r/151174730120.6162.3848002191530283984.stgit@pluto.themaw.net
+Fixes: 42f4614821 ("autofs: fix AT_NO_AUTOMOUNT not being honored")
+Signed-off-by: Ian Kent
+Cc: Neil Brown
+Cc: Al Viro
+Cc: David Howells
+Cc: Colin Walters
+Cc: Ondrej Holy
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/namei.c         |   15 +++------------
+ include/linux/fs.h |    3 ++-
+ 2 files changed, 5 insertions(+), 13 deletions(-)
+
+--- a/fs/namei.c
++++ b/fs/namei.c
+@@ -1129,18 +1129,9 @@ static int follow_automount(struct path
+ 	 * of the daemon to instantiate them before they can be used.
+ 	 */
+ 	if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
+-			   LOOKUP_OPEN | LOOKUP_CREATE |
+-			   LOOKUP_AUTOMOUNT))) {
+-		/* Positive dentry that isn't meant to trigger an
+-		 * automount, EISDIR will allow it to be used,
+-		 * otherwise there's no mount here "now" so return
+-		 * ENOENT.
+- */ +- if (path->dentry->d_inode) +- return -EISDIR; +- else +- return -ENOENT; +- } ++ LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && ++ path->dentry->d_inode) ++ return -EISDIR; + + if (path->dentry->d_sb->s_user_ns != &init_user_ns) + return -EACCES; +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -3069,7 +3069,8 @@ static inline int vfs_lstat(const char _ + static inline int vfs_fstatat(int dfd, const char __user *filename, + struct kstat *stat, int flags) + { +- return vfs_statx(dfd, filename, flags, stat, STATX_BASIC_STATS); ++ return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT, ++ stat, STATX_BASIC_STATS); + } + static inline int vfs_fstat(int fd, struct kstat *stat) + { diff --git a/queue-4.14/autofs-revert-autofs-take-more-care-to-not-update-last_used-on-path-walk.patch b/queue-4.14/autofs-revert-autofs-take-more-care-to-not-update-last_used-on-path-walk.patch new file mode 100644 index 00000000000..0e4d1835fdb --- /dev/null +++ b/queue-4.14/autofs-revert-autofs-take-more-care-to-not-update-last_used-on-path-walk.patch @@ -0,0 +1,79 @@ +From 43694d4bf843ddd34519e8e9de983deefeada699 Mon Sep 17 00:00:00 2001 +From: Ian Kent +Date: Wed, 29 Nov 2017 16:11:23 -0800 +Subject: autofs: revert "autofs: take more care to not update last_used on path walk" + +From: Ian Kent + +commit 43694d4bf843ddd34519e8e9de983deefeada699 upstream. + +While commit 092a53452bb7 ("autofs: take more care to not update +last_used on path walk") helped (partially) resolve a problem where +automounts were not expiring due to aggressive accesses from user space +it has a side effect for very large environments. + +This change helps with the expire problem by making the expire more +aggressive but, for very large environments, that means more mount +requests from clients. When there are a lot of clients that can mean +fairly significant server load increases. + +It turns out I put the last_used in this position to solve this very +problem and failed to update my own thinking of the autofs expire +policy. So the patch being reverted introduces a regression which +should be fixed. 
+ +Link: http://lkml.kernel.org/r/151174729420.6162.1832622523537052460.stgit@pluto.themaw.net +Fixes: 092a53452b ("autofs: take more care to not update last_used on path walk") +Signed-off-by: Ian Kent +Reviewed-by: NeilBrown +Cc: Al Viro +Cc: Colin Walters +Cc: David Howells +Cc: Ondrej Holy +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/autofs4/root.c | 17 ++++++----------- + 1 file changed, 6 insertions(+), 11 deletions(-) + +--- a/fs/autofs4/root.c ++++ b/fs/autofs4/root.c +@@ -281,8 +281,8 @@ static int autofs4_mount_wait(const stru + pr_debug("waiting for mount name=%pd\n", path->dentry); + status = autofs4_wait(sbi, path, NFY_MOUNT); + pr_debug("mount wait done status=%d\n", status); +- ino->last_used = jiffies; + } ++ ino->last_used = jiffies; + return status; + } + +@@ -321,21 +321,16 @@ static struct dentry *autofs4_mountpoint + */ + if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { + struct dentry *parent = dentry->d_parent; ++ struct autofs_info *ino; + struct dentry *new; + + new = d_lookup(parent, &dentry->d_name); + if (!new) + return NULL; +- if (new == dentry) +- dput(new); +- else { +- struct autofs_info *ino; +- +- ino = autofs4_dentry_ino(new); +- ino->last_used = jiffies; +- dput(path->dentry); +- path->dentry = new; +- } ++ ino = autofs4_dentry_ino(new); ++ ino->last_used = jiffies; ++ dput(path->dentry); ++ path->dentry = new; + } + return path->dentry; + } diff --git a/queue-4.14/device-dax-implement-split-to-catch-invalid-munmap-attempts.patch b/queue-4.14/device-dax-implement-split-to-catch-invalid-munmap-attempts.patch new file mode 100644 index 00000000000..9f00bd03a56 --- /dev/null +++ b/queue-4.14/device-dax-implement-split-to-catch-invalid-munmap-attempts.patch @@ -0,0 +1,73 @@ +From 9702cffdbf2129516db679e4467db81e1cd287da Mon Sep 17 00:00:00 2001 +From: Dan Williams +Date: Wed, 29 Nov 2017 16:10:32 -0800 +Subject: device-dax: implement ->split() to catch invalid munmap attempts + +From: Dan Williams + +commit 9702cffdbf2129516db679e4467db81e1cd287da upstream. + +Similar to how device-dax enforces that the 'address', 'offset', and +'len' parameters to mmap() be aligned to the device's fundamental +alignment, the same constraints apply to munmap(). Implement ->split() +to fail munmap calls that violate the alignment constraint. + +Otherwise, we later fail VM_BUG_ON checks in the unmap_page_range() path +with crash signatures of the form: + + vma ffff8800b60c8a88 start 00007f88c0000000 end 00007f88c0e00000 + next (null) prev (null) mm ffff8800b61150c0 + prot 8000000000000027 anon_vma (null) vm_ops ffffffffa0091240 + pgoff 0 file ffff8800b638ef80 private_data (null) + flags: 0x380000fb(read|write|shared|mayread|maywrite|mayexec|mayshare|softdirty|mixedmap|hugepage) + ------------[ cut here ]------------ + kernel BUG at mm/huge_memory.c:2014! + [..] + RIP: 0010:__split_huge_pud+0x12a/0x180 + [..] + Call Trace: + unmap_page_range+0x245/0xa40 + ? __vma_adjust+0x301/0x990 + unmap_vmas+0x4c/0xa0 + unmap_region+0xae/0x120 + ? 
__vma_rb_erase+0x11a/0x230 + do_munmap+0x276/0x410 + vm_munmap+0x6a/0xa0 + SyS_munmap+0x1d/0x30 + +Link: http://lkml.kernel.org/r/151130418681.4029.7118245855057952010.stgit@dwillia2-desk3.amr.corp.intel.com +Fixes: dee410792419 ("/dev/dax, core: file operations and dax-mmap") +Signed-off-by: Dan Williams +Reported-by: Jeff Moyer +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/dax/device.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +--- a/drivers/dax/device.c ++++ b/drivers/dax/device.c +@@ -427,9 +427,21 @@ static int dev_dax_fault(struct vm_fault + return dev_dax_huge_fault(vmf, PE_SIZE_PTE); + } + ++static int dev_dax_split(struct vm_area_struct *vma, unsigned long addr) ++{ ++ struct file *filp = vma->vm_file; ++ struct dev_dax *dev_dax = filp->private_data; ++ struct dax_region *dax_region = dev_dax->region; ++ ++ if (!IS_ALIGNED(addr, dax_region->align)) ++ return -EINVAL; ++ return 0; ++} ++ + static const struct vm_operations_struct dax_vm_ops = { + .fault = dev_dax_fault, + .huge_fault = dev_dax_huge_fault, ++ .split = dev_dax_split, + }; + + static int dax_mmap(struct file *filp, struct vm_area_struct *vma) diff --git a/queue-4.14/exec-avoid-rlimit_stack-races-with-prlimit.patch b/queue-4.14/exec-avoid-rlimit_stack-races-with-prlimit.patch new file mode 100644 index 00000000000..c0087fa1c95 --- /dev/null +++ b/queue-4.14/exec-avoid-rlimit_stack-races-with-prlimit.patch @@ -0,0 +1,52 @@ +From 04e35f4495dd560db30c25efca4eecae8ec8c375 Mon Sep 17 00:00:00 2001 +From: Kees Cook +Date: Wed, 29 Nov 2017 16:10:51 -0800 +Subject: exec: avoid RLIMIT_STACK races with prlimit() + +From: Kees Cook + +commit 04e35f4495dd560db30c25efca4eecae8ec8c375 upstream. + +While the defense-in-depth RLIMIT_STACK limit on setuid processes was +protected against races from other threads calling setrlimit(), I missed +protecting it against races from external processes calling prlimit(). +This adds locking around the change and makes sure that rlim_max is set +too. + +Link: http://lkml.kernel.org/r/20171127193457.GA11348@beast +Fixes: 64701dee4178e ("exec: Use sane stack rlimit under secureexec") +Signed-off-by: Kees Cook +Reported-by: Ben Hutchings +Reported-by: Brad Spengler +Acked-by: Serge Hallyn +Cc: James Morris +Cc: Andy Lutomirski +Cc: Oleg Nesterov +Cc: Jiri Slaby +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/exec.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -1340,10 +1340,15 @@ void setup_new_exec(struct linux_binprm + * avoid bad behavior from the prior rlimits. This has to + * happen before arch_pick_mmap_layout(), which examines + * RLIMIT_STACK, but after the point of no return to avoid +- * needing to clean up the change on failure. ++ * races from other threads changing the limits. This also ++ * must be protected from races with prlimit() calls. 
+ */ ++ task_lock(current->group_leader); + if (current->signal->rlim[RLIMIT_STACK].rlim_cur > _STK_LIM) + current->signal->rlim[RLIMIT_STACK].rlim_cur = _STK_LIM; ++ if (current->signal->rlim[RLIMIT_STACK].rlim_max > _STK_LIM) ++ current->signal->rlim[RLIMIT_STACK].rlim_max = _STK_LIM; ++ task_unlock(current->group_leader); + } + + arch_pick_mmap_layout(current->mm); diff --git a/queue-4.14/fs-fat-inode.c-fix-sb_rdonly-change.patch b/queue-4.14/fs-fat-inode.c-fix-sb_rdonly-change.patch new file mode 100644 index 00000000000..7e68807eca5 --- /dev/null +++ b/queue-4.14/fs-fat-inode.c-fix-sb_rdonly-change.patch @@ -0,0 +1,42 @@ +From b6e8e12c0aeb5fbf1bf46c84d58cc93aedede385 Mon Sep 17 00:00:00 2001 +From: OGAWA Hirofumi +Date: Wed, 29 Nov 2017 16:11:19 -0800 +Subject: fs/fat/inode.c: fix sb_rdonly() change + +From: OGAWA Hirofumi + +commit b6e8e12c0aeb5fbf1bf46c84d58cc93aedede385 upstream. + +Commit bc98a42c1f7d ("VFS: Convert sb->s_flags & MS_RDONLY to +sb_rdonly(sb)") converted fat_remount():new_rdonly from a bool to an +int. + +However fat_remount() depends upon the compiler's conversion of a +non-zero integer into boolean `true'. + +Fix it by switching `new_rdonly' back into a bool. + +Link: http://lkml.kernel.org/r/87mv3d5x51.fsf@mail.parknet.co.jp +Fixes: bc98a42c1f7d0f8 ("VFS: Convert sb->s_flags & MS_RDONLY to sb_rdonly(sb)") +Signed-off-by: OGAWA Hirofumi +Cc: Joe Perches +Cc: David Howells +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/fat/inode.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/fat/inode.c ++++ b/fs/fat/inode.c +@@ -779,7 +779,7 @@ static void __exit fat_destroy_inodecach + + static int fat_remount(struct super_block *sb, int *flags, char *data) + { +- int new_rdonly; ++ bool new_rdonly; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + *flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME); + diff --git a/queue-4.14/ib-core-disable-memory-registration-of-filesystem-dax-vmas.patch b/queue-4.14/ib-core-disable-memory-registration-of-filesystem-dax-vmas.patch new file mode 100644 index 00000000000..b406bdca0ba --- /dev/null +++ b/queue-4.14/ib-core-disable-memory-registration-of-filesystem-dax-vmas.patch @@ -0,0 +1,51 @@ +From 5f1d43de54164dcfb9bfa542fcc92c1e1a1b6c1d Mon Sep 17 00:00:00 2001 +From: Dan Williams +Date: Wed, 29 Nov 2017 16:10:47 -0800 +Subject: IB/core: disable memory registration of filesystem-dax vmas + +From: Dan Williams + +commit 5f1d43de54164dcfb9bfa542fcc92c1e1a1b6c1d upstream. + +Until there is a solution to the dma-to-dax vs truncate problem it is +not safe to allow RDMA to create long standing memory registrations +against filesytem-dax vmas. 
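+
+A hedged illustration of the user-visible effect (not part of the
+upstream commit text): after this change, registering a buffer that is
+mmap()ed from a DAX-mounted filesystem is refused rather than creating
+an unrevokable pin. The file path and the protection domain pd are
+assumptions of the sketch:
+
+	int fd = open("/mnt/pmem/data", O_RDWR);	/* assumed fsdax file */
+	void *buf = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
+			 MAP_SHARED, fd, 0);
+	/* pd: an already-allocated struct ibv_pd */
+	struct ibv_mr *mr = ibv_reg_mr(pd, buf, 1 << 20,
+				       IBV_ACCESS_LOCAL_WRITE);
+	if (!mr)
+		/* fails after this patch: the long-lived pin is refused
+		 * by the get_user_pages_longterm() path */
+		perror("ibv_reg_mr");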
+ +Link: http://lkml.kernel.org/r/151068941011.7446.7766030590347262502.stgit@dwillia2-desk3.amr.corp.intel.com +Fixes: 3565fce3a659 ("mm, x86: get_user_pages() for dax mappings") +Signed-off-by: Dan Williams +Reported-by: Christoph Hellwig +Reviewed-by: Christoph Hellwig +Acked-by: Jason Gunthorpe +Acked-by: Doug Ledford +Cc: Sean Hefty +Cc: Hal Rosenstock +Cc: Jeff Moyer +Cc: Ross Zwisler +Cc: Inki Dae +Cc: Jan Kara +Cc: Joonyoung Shim +Cc: Kyungmin Park +Cc: Mauro Carvalho Chehab +Cc: Mel Gorman +Cc: Seung-Woo Kim +Cc: Vlastimil Babka +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/infiniband/core/umem.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/infiniband/core/umem.c ++++ b/drivers/infiniband/core/umem.c +@@ -191,7 +191,7 @@ struct ib_umem *ib_umem_get(struct ib_uc + sg_list_start = umem->sg_head.sgl; + + while (npages) { +- ret = get_user_pages(cur_base, ++ ret = get_user_pages_longterm(cur_base, + min_t(unsigned long, npages, + PAGE_SIZE / sizeof (struct page *)), + gup_flags, page_list, vma_list); diff --git a/queue-4.14/mm-cma-fix-alloc_contig_range-ret-code-potential-leak.patch b/queue-4.14/mm-cma-fix-alloc_contig_range-ret-code-potential-leak.patch new file mode 100644 index 00000000000..636f7dd5129 --- /dev/null +++ b/queue-4.14/mm-cma-fix-alloc_contig_range-ret-code-potential-leak.patch @@ -0,0 +1,63 @@ +From 63cd448908b5eb51d84c52f02b31b9b4ccd1cb5a Mon Sep 17 00:00:00 2001 +From: Mike Kravetz +Date: Wed, 29 Nov 2017 16:10:01 -0800 +Subject: mm/cma: fix alloc_contig_range ret code/potential leak + +From: Mike Kravetz + +commit 63cd448908b5eb51d84c52f02b31b9b4ccd1cb5a upstream. + +If the call __alloc_contig_migrate_range() in alloc_contig_range returns +-EBUSY, processing continues so that test_pages_isolated() is called +where there is a tracepoint to identify the busy pages. However, it is +possible for busy pages to become available between the calls to these +two routines. In this case, the range of pages may be allocated. +Unfortunately, the original return code (ret == -EBUSY) is still set and +returned to the caller. Therefore, the caller believes the pages were +not allocated and they are leaked. + +Update the comment to indicate that allocation is still possible even if +__alloc_contig_migrate_range returns -EBUSY. Also, clear return code in +this case so that it is not accidentally used or returned to caller. + +Link: http://lkml.kernel.org/r/20171122185214.25285-1-mike.kravetz@oracle.com +Fixes: 8ef5849fa8a2 ("mm/cma: always check which page caused allocation failure") +Signed-off-by: Mike Kravetz +Acked-by: Vlastimil Babka +Acked-by: Michal Hocko +Acked-by: Johannes Weiner +Acked-by: Joonsoo Kim +Cc: Michal Nazarewicz +Cc: Laura Abbott +Cc: Michal Hocko +Cc: Mel Gorman +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/page_alloc.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -7587,11 +7587,18 @@ int alloc_contig_range(unsigned long sta + + /* + * In case of -EBUSY, we'd like to know which page causes problem. +- * So, just fall through. We will check it in test_pages_isolated(). ++ * So, just fall through. test_pages_isolated() has a tracepoint ++ * which will report the busy page. ++ * ++ * It is possible that busy pages could become available before ++ * the call to test_pages_isolated, and the range will actually be ++ * allocated. 
So, if we fall through be sure to clear ret so that ++ * -EBUSY is not accidentally used or returned to caller. + */ + ret = __alloc_contig_migrate_range(&cc, start, end); + if (ret && ret != -EBUSY) + goto done; ++ ret =0; + + /* + * Pages from [start, end) are within a MAX_ORDER_NR_PAGES diff --git a/queue-4.14/mm-fail-get_vaddr_frames-for-filesystem-dax-mappings.patch b/queue-4.14/mm-fail-get_vaddr_frames-for-filesystem-dax-mappings.patch new file mode 100644 index 00000000000..4111d78cad1 --- /dev/null +++ b/queue-4.14/mm-fail-get_vaddr_frames-for-filesystem-dax-mappings.patch @@ -0,0 +1,63 @@ +From b7f0554a56f21fb3e636a627450a9add030889be Mon Sep 17 00:00:00 2001 +From: Dan Williams +Date: Wed, 29 Nov 2017 16:10:39 -0800 +Subject: mm: fail get_vaddr_frames() for filesystem-dax mappings + +From: Dan Williams + +commit b7f0554a56f21fb3e636a627450a9add030889be upstream. + +Until there is a solution to the dma-to-dax vs truncate problem it is +not safe to allow V4L2, Exynos, and other frame vector users to create +long standing / irrevocable memory registrations against filesytem-dax +vmas. + +[dan.j.williams@intel.com: add comment for vma_is_fsdax() check in get_vaddr_frames(), per Jan] + Link: http://lkml.kernel.org/r/151197874035.26211.4061781453123083667.stgit@dwillia2-desk3.amr.corp.intel.com +Link: http://lkml.kernel.org/r/151068939985.7446.15684639617389154187.stgit@dwillia2-desk3.amr.corp.intel.com +Fixes: 3565fce3a659 ("mm, x86: get_user_pages() for dax mappings") +Signed-off-by: Dan Williams +Reviewed-by: Jan Kara +Cc: Inki Dae +Cc: Seung-Woo Kim +Cc: Joonyoung Shim +Cc: Kyungmin Park +Cc: Mauro Carvalho Chehab +Cc: Mel Gorman +Cc: Vlastimil Babka +Cc: Christoph Hellwig +Cc: Doug Ledford +Cc: Hal Rosenstock +Cc: Jason Gunthorpe +Cc: Jeff Moyer +Cc: Ross Zwisler +Cc: Sean Hefty +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/frame_vector.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +--- a/mm/frame_vector.c ++++ b/mm/frame_vector.c +@@ -53,6 +53,18 @@ int get_vaddr_frames(unsigned long start + ret = -EFAULT; + goto out; + } ++ ++ /* ++ * While get_vaddr_frames() could be used for transient (kernel ++ * controlled lifetime) pinning of memory pages all current ++ * users establish long term (userspace controlled lifetime) ++ * page pinning. Treat get_vaddr_frames() like ++ * get_user_pages_longterm() and disallow it for filesystem-dax ++ * mappings. ++ */ ++ if (vma_is_fsdax(vma)) ++ return -EOPNOTSUPP; ++ + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) { + vec->got_ref = true; + vec->is_pfns = false; diff --git a/queue-4.14/mm-fix-device-dax-pud-write-faults-triggered-by-get_user_pages.patch b/queue-4.14/mm-fix-device-dax-pud-write-faults-triggered-by-get_user_pages.patch new file mode 100644 index 00000000000..18358b04e68 --- /dev/null +++ b/queue-4.14/mm-fix-device-dax-pud-write-faults-triggered-by-get_user_pages.patch @@ -0,0 +1,114 @@ +From 1501899a898dfb5477c55534bdfd734c046da06d Mon Sep 17 00:00:00 2001 +From: Dan Williams +Date: Wed, 29 Nov 2017 16:10:06 -0800 +Subject: mm: fix device-dax pud write-faults triggered by get_user_pages() + +From: Dan Williams + +commit 1501899a898dfb5477c55534bdfd734c046da06d upstream. + +Currently only get_user_pages_fast() can safely handle the writable gup +case due to its use of pud_access_permitted() to check whether the pud +entry is writable. 
In the gup slow path pud_write() is used instead of +pud_access_permitted() and to date it has been unimplemented, just calls +BUG_ON(). + + kernel BUG at ./include/linux/hugetlb.h:244! + [..] + RIP: 0010:follow_devmap_pud+0x482/0x490 + [..] + Call Trace: + follow_page_mask+0x28c/0x6e0 + __get_user_pages+0xe4/0x6c0 + get_user_pages_unlocked+0x130/0x1b0 + get_user_pages_fast+0x89/0xb0 + iov_iter_get_pages_alloc+0x114/0x4a0 + nfs_direct_read_schedule_iovec+0xd2/0x350 + ? nfs_start_io_direct+0x63/0x70 + nfs_file_direct_read+0x1e0/0x250 + nfs_file_read+0x90/0xc0 + +For now this just implements a simple check for the _PAGE_RW bit similar +to pmd_write. However, this implies that the gup-slow-path check is +missing the extra checks that the gup-fast-path performs with +pud_access_permitted. Later patches will align all checks to use the +'access_permitted' helper if the architecture provides it. + +Note that the generic 'access_permitted' helper fallback is the simple +_PAGE_RW check on architectures that do not define the +'access_permitted' helper(s). + +[dan.j.williams@intel.com: fix powerpc compile error] + Link: http://lkml.kernel.org/r/151129126165.37405.16031785266675461397.stgit@dwillia2-desk3.amr.corp.intel.com +Link: http://lkml.kernel.org/r/151043109938.2842.14834662818213616199.stgit@dwillia2-desk3.amr.corp.intel.com +Fixes: a00cc7d9dd93 ("mm, x86: add support for PUD-sized transparent hugepages") +Signed-off-by: Dan Williams +Reported-by: Stephen Rothwell +Acked-by: Thomas Gleixner [x86] +Cc: Kirill A. Shutemov +Cc: Catalin Marinas +Cc: "David S. Miller" +Cc: Dave Hansen +Cc: Will Deacon +Cc: "H. Peter Anvin" +Cc: Ingo Molnar +Cc: Arnd Bergmann +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/include/asm/pgtable.h | 6 ++++++ + include/asm-generic/pgtable.h | 8 ++++++++ + include/linux/hugetlb.h | 8 -------- + 3 files changed, 14 insertions(+), 8 deletions(-) + +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -1093,6 +1093,12 @@ static inline void pmdp_set_wrprotect(st + clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); + } + ++#define pud_write pud_write ++static inline int pud_write(pud_t pud) ++{ ++ return pud_flags(pud) & _PAGE_RW; ++} ++ + /* + * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); + * +--- a/include/asm-generic/pgtable.h ++++ b/include/asm-generic/pgtable.h +@@ -814,6 +814,14 @@ static inline int pmd_write(pmd_t pmd) + #endif /* __HAVE_ARCH_PMD_WRITE */ + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + ++#ifndef pud_write ++static inline int pud_write(pud_t pud) ++{ ++ BUG(); ++ return 0; ++} ++#endif /* pud_write */ ++ + #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \ + (defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ + !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) +--- a/include/linux/hugetlb.h ++++ b/include/linux/hugetlb.h +@@ -239,14 +239,6 @@ static inline int pgd_write(pgd_t pgd) + } + #endif + +-#ifndef pud_write +-static inline int pud_write(pud_t pud) +-{ +- BUG(); +- return 0; +-} +-#endif +- + #define HUGETLB_ANON_FILE "anon_hugepage" + + enum { diff --git a/queue-4.14/mm-hugetlb-fix-null-pointer-dereference-on-5-level-paging-machine.patch b/queue-4.14/mm-hugetlb-fix-null-pointer-dereference-on-5-level-paging-machine.patch new file mode 100644 index 00000000000..405475bbe23 --- /dev/null +++ b/queue-4.14/mm-hugetlb-fix-null-pointer-dereference-on-5-level-paging-machine.patch @@ -0,0 +1,44 @@ +From f4f0a3d85b50a65a348e2b8635041d6b30f01deb Mon Sep 17 00:00:00 2001 
+From: "Kirill A. Shutemov" +Date: Wed, 29 Nov 2017 16:11:30 -0800 +Subject: mm/hugetlb: fix NULL-pointer dereference on 5-level paging machine + +From: Kirill A. Shutemov + +commit f4f0a3d85b50a65a348e2b8635041d6b30f01deb upstream. + +I made a mistake during converting hugetlb code to 5-level paging: in +huge_pte_alloc() we have to use p4d_alloc(), not p4d_offset(). + +Otherwise it leads to crash -- NULL-pointer dereference in pud_alloc() +if p4d table is not yet allocated. + +It only can happen in 5-level paging mode. In 4-level paging mode +p4d_offset() always returns pgd, so we are fine. + +Link: http://lkml.kernel.org/r/20171122121921.64822-1-kirill.shutemov@linux.intel.com +Fixes: c2febafc6773 ("mm: convert generic code to 5-level paging") +Signed-off-by: Kirill A. Shutemov +Acked-by: Vlastimil Babka +Acked-by: Michal Hocko +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/hugetlb.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -4625,7 +4625,9 @@ pte_t *huge_pte_alloc(struct mm_struct * + pte_t *pte = NULL; + + pgd = pgd_offset(mm, addr); +- p4d = p4d_offset(pgd, addr); ++ p4d = p4d_alloc(mm, pgd, addr); ++ if (!p4d) ++ return NULL; + pud = pud_alloc(mm, p4d, addr); + if (pud) { + if (sz == PUD_SIZE) { diff --git a/queue-4.14/mm-hugetlbfs-introduce-split-to-vm_operations_struct.patch b/queue-4.14/mm-hugetlbfs-introduce-split-to-vm_operations_struct.patch new file mode 100644 index 00000000000..98300362484 --- /dev/null +++ b/queue-4.14/mm-hugetlbfs-introduce-split-to-vm_operations_struct.patch @@ -0,0 +1,90 @@ +From 31383c6865a578834dd953d9dbc88e6b19fe3997 Mon Sep 17 00:00:00 2001 +From: Dan Williams +Date: Wed, 29 Nov 2017 16:10:28 -0800 +Subject: mm, hugetlbfs: introduce ->split() to vm_operations_struct + +From: Dan Williams + +commit 31383c6865a578834dd953d9dbc88e6b19fe3997 upstream. + +Patch series "device-dax: fix unaligned munmap handling" + +When device-dax is operating in huge-page mode we want it to behave like +hugetlbfs and fail attempts to split vmas into unaligned ranges. It +would be messy to teach the munmap path about device-dax alignment +constraints in the same (hstate) way that hugetlbfs communicates this +constraint. Instead, these patches introduce a new ->split() vm +operation. + +This patch (of 2): + +The device-dax interface has similar constraints as hugetlbfs in that it +requires the munmap path to unmap in huge page aligned units. Rather +than add more custom vma handling code in __split_vma() introduce a new +vm operation to perform this vma specific check. 
+ +Link: http://lkml.kernel.org/r/151130418135.4029.6783191281930729710.stgit@dwillia2-desk3.amr.corp.intel.com +Fixes: dee410792419 ("/dev/dax, core: file operations and dax-mmap") +Signed-off-by: Dan Williams +Cc: Jeff Moyer +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/mm.h | 1 + + mm/hugetlb.c | 8 ++++++++ + mm/mmap.c | 8 +++++--- + 3 files changed, 14 insertions(+), 3 deletions(-) + +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -367,6 +367,7 @@ enum page_entry_size { + struct vm_operations_struct { + void (*open)(struct vm_area_struct * area); + void (*close)(struct vm_area_struct * area); ++ int (*split)(struct vm_area_struct * area, unsigned long addr); + int (*mremap)(struct vm_area_struct * area); + int (*fault)(struct vm_fault *vmf); + int (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size); +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -3125,6 +3125,13 @@ static void hugetlb_vm_op_close(struct v + } + } + ++static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) ++{ ++ if (addr & ~(huge_page_mask(hstate_vma(vma)))) ++ return -EINVAL; ++ return 0; ++} ++ + /* + * We cannot handle pagefaults against hugetlb pages at all. They cause + * handle_mm_fault() to try to instantiate regular-sized pages in the +@@ -3141,6 +3148,7 @@ const struct vm_operations_struct hugetl + .fault = hugetlb_vm_op_fault, + .open = hugetlb_vm_op_open, + .close = hugetlb_vm_op_close, ++ .split = hugetlb_vm_op_split, + }; + + static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -2540,9 +2540,11 @@ int __split_vma(struct mm_struct *mm, st + struct vm_area_struct *new; + int err; + +- if (is_vm_hugetlb_page(vma) && (addr & +- ~(huge_page_mask(hstate_vma(vma))))) +- return -EINVAL; ++ if (vma->vm_ops && vma->vm_ops->split) { ++ err = vma->vm_ops->split(vma, addr); ++ if (err) ++ return err; ++ } + + new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (!new) diff --git a/queue-4.14/mm-introduce-get_user_pages_longterm.patch b/queue-4.14/mm-introduce-get_user_pages_longterm.patch new file mode 100644 index 00000000000..6fa2ff46320 --- /dev/null +++ b/queue-4.14/mm-introduce-get_user_pages_longterm.patch @@ -0,0 +1,201 @@ +From 2bb6d2837083de722bfdc369cb0d76ce188dd9b4 Mon Sep 17 00:00:00 2001 +From: Dan Williams +Date: Wed, 29 Nov 2017 16:10:35 -0800 +Subject: mm: introduce get_user_pages_longterm + +From: Dan Williams + +commit 2bb6d2837083de722bfdc369cb0d76ce188dd9b4 upstream. + +Patch series "introduce get_user_pages_longterm()", v2. + +Here is a new get_user_pages api for cases where a driver intends to +keep an elevated page count indefinitely. This is distinct from usages +like iov_iter_get_pages where the elevated page counts are transient. +The iov_iter_get_pages cases immediately turn around and submit the +pages to a device driver which will put_page when the i/o operation +completes (under kernel control). + +In the longterm case userspace is responsible for dropping the page +reference at some undefined point in the future. This is untenable for +filesystem-dax case where the filesystem is in control of the lifetime +of the block / page and needs reasonable limits on how long it can wait +for pages in a mapping to become idle. + +Fixing filesystems to actually wait for dax pages to be idle before +blocks from a truncate/hole-punch operation are repurposed is saved for +a later patch series. 
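+
+As a usage sketch of the new call (illustrative, not from the upstream
+commit; uaddr, nr and the surrounding driver code are assumptions):
+
+	struct page **pages = kcalloc(nr, sizeof(*pages), GFP_KERNEL);
+	long pinned;
+
+	/* pin with a userspace-controlled lifetime: use _longterm */
+	pinned = get_user_pages_longterm(uaddr, nr, FOLL_WRITE,
+					 pages, NULL);
+	if (pinned < 0)
+		return pinned;	/* -EOPNOTSUPP on a filesystem-dax vma */
+	/* ... program the device; put_page() each page on teardown ... */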
+ +Also, allowing longterm registration of dax mappings is a future patch +series that introduces a "map with lease" semantic where the kernel can +revoke a lease and force userspace to drop its page references. + +I have also tagged these for -stable to purposely break cases that might +assume that longterm memory registrations for filesystem-dax mappings +were supported by the kernel. The behavior regression this policy +change implies is one of the reasons we maintain the "dax enabled. +Warning: EXPERIMENTAL, use at your own risk" notification when mounting +a filesystem in dax mode. + +It is worth noting the device-dax interface does not suffer the same +constraints since it does not support file space management operations +like hole-punch. + +This patch (of 4): + +Until there is a solution to the dma-to-dax vs truncate problem it is +not safe to allow long standing memory registrations against +filesytem-dax vmas. Device-dax vmas do not have this problem and are +explicitly allowed. + +This is temporary until a "memory registration with layout-lease" +mechanism can be implemented for the affected sub-systems (RDMA and +V4L2). + +[akpm@linux-foundation.org: use kcalloc()] +Link: http://lkml.kernel.org/r/151068939435.7446.13560129395419350737.stgit@dwillia2-desk3.amr.corp.intel.com +Fixes: 3565fce3a659 ("mm, x86: get_user_pages() for dax mappings") +Signed-off-by: Dan Williams +Suggested-by: Christoph Hellwig +Cc: Doug Ledford +Cc: Hal Rosenstock +Cc: Inki Dae +Cc: Jan Kara +Cc: Jason Gunthorpe +Cc: Jeff Moyer +Cc: Joonyoung Shim +Cc: Kyungmin Park +Cc: Mauro Carvalho Chehab +Cc: Mel Gorman +Cc: Ross Zwisler +Cc: Sean Hefty +Cc: Seung-Woo Kim +Cc: Vlastimil Babka +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/fs.h | 14 +++++++++++ + include/linux/mm.h | 13 ++++++++++ + mm/gup.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 91 insertions(+) + +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -3175,6 +3175,20 @@ static inline bool vma_is_dax(struct vm_ + return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host); + } + ++static inline bool vma_is_fsdax(struct vm_area_struct *vma) ++{ ++ struct inode *inode; ++ ++ if (!vma->vm_file) ++ return false; ++ if (!vma_is_dax(vma)) ++ return false; ++ inode = file_inode(vma->vm_file); ++ if (inode->i_mode == S_IFCHR) ++ return false; /* device-dax */ ++ return true; ++} ++ + static inline int iocb_flags(struct file *file) + { + int res = 0; +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1368,6 +1368,19 @@ long get_user_pages_locked(unsigned long + unsigned int gup_flags, struct page **pages, int *locked); + long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, + struct page **pages, unsigned int gup_flags); ++#ifdef CONFIG_FS_DAX ++long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, ++ unsigned int gup_flags, struct page **pages, ++ struct vm_area_struct **vmas); ++#else ++static inline long get_user_pages_longterm(unsigned long start, ++ unsigned long nr_pages, unsigned int gup_flags, ++ struct page **pages, struct vm_area_struct **vmas) ++{ ++ return get_user_pages(start, nr_pages, gup_flags, pages, vmas); ++} ++#endif /* CONFIG_FS_DAX */ ++ + int get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages); + +--- a/mm/gup.c ++++ b/mm/gup.c +@@ -1095,6 +1095,70 @@ long get_user_pages(unsigned long start, + } + EXPORT_SYMBOL(get_user_pages); + ++#ifdef 
CONFIG_FS_DAX ++/* ++ * This is the same as get_user_pages() in that it assumes we are ++ * operating on the current task's mm, but it goes further to validate ++ * that the vmas associated with the address range are suitable for ++ * longterm elevated page reference counts. For example, filesystem-dax ++ * mappings are subject to the lifetime enforced by the filesystem and ++ * we need guarantees that longterm users like RDMA and V4L2 only ++ * establish mappings that have a kernel enforced revocation mechanism. ++ * ++ * "longterm" == userspace controlled elevated page count lifetime. ++ * Contrast this to iov_iter_get_pages() usages which are transient. ++ */ ++long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, ++ unsigned int gup_flags, struct page **pages, ++ struct vm_area_struct **vmas_arg) ++{ ++ struct vm_area_struct **vmas = vmas_arg; ++ struct vm_area_struct *vma_prev = NULL; ++ long rc, i; ++ ++ if (!pages) ++ return -EINVAL; ++ ++ if (!vmas) { ++ vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *), ++ GFP_KERNEL); ++ if (!vmas) ++ return -ENOMEM; ++ } ++ ++ rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas); ++ ++ for (i = 0; i < rc; i++) { ++ struct vm_area_struct *vma = vmas[i]; ++ ++ if (vma == vma_prev) ++ continue; ++ ++ vma_prev = vma; ++ ++ if (vma_is_fsdax(vma)) ++ break; ++ } ++ ++ /* ++ * Either get_user_pages() failed, or the vma validation ++ * succeeded, in either case we don't need to put_page() before ++ * returning. ++ */ ++ if (i >= rc) ++ goto out; ++ ++ for (i = 0; i < rc; i++) ++ put_page(pages[i]); ++ rc = -EOPNOTSUPP; ++out: ++ if (vmas != vmas_arg) ++ kfree(vmas); ++ return rc; ++} ++EXPORT_SYMBOL(get_user_pages_longterm); ++#endif /* CONFIG_FS_DAX */ ++ + /** + * populate_vma_page_range() - populate a range of pages in the vma. + * @vma: target vma diff --git a/queue-4.14/mm-madvise.c-fix-madvise-infinite-loop-under-special-circumstances.patch b/queue-4.14/mm-madvise.c-fix-madvise-infinite-loop-under-special-circumstances.patch new file mode 100644 index 00000000000..10526e236f8 --- /dev/null +++ b/queue-4.14/mm-madvise.c-fix-madvise-infinite-loop-under-special-circumstances.patch @@ -0,0 +1,74 @@ +From 6ea8d958a2c95a1d514015d4e29ba21a8c0a1a91 Mon Sep 17 00:00:00 2001 +From: chenjie +Date: Wed, 29 Nov 2017 16:10:54 -0800 +Subject: mm/madvise.c: fix madvise() infinite loop under special circumstances + +From: chenjie + +commit 6ea8d958a2c95a1d514015d4e29ba21a8c0a1a91 upstream. + +MADVISE_WILLNEED has always been a noop for DAX (formerly XIP) mappings. +Unfortunately madvise_willneed() doesn't communicate this information +properly to the generic madvise syscall implementation. The calling +convention is quite subtle there. madvise_vma() is supposed to either +return an error or update &prev otherwise the main loop will never +advance to the next vma and it will keep looping for ever without a way +to get out of the kernel. + +It seems this has been broken since introduction. Nobody has noticed +because nobody seems to be using MADVISE_WILLNEED on these DAX mappings. + +[mhocko@suse.com: rewrite changelog] +Link: http://lkml.kernel.org/r/20171127115318.911-1-guoxuenan@huawei.com +Fixes: fe77ba6f4f97 ("[PATCH] xip: madvice/fadvice: execute in place") +Signed-off-by: chenjie +Signed-off-by: guoxuenan +Acked-by: Michal Hocko +Cc: Minchan Kim +Cc: zhangyi (F) +Cc: Miao Xie +Cc: Mike Rapoport +Cc: Shaohua Li +Cc: Andrea Arcangeli +Cc: Mel Gorman +Cc: Kirill A. 
Shutemov +Cc: David Rientjes +Cc: Anshuman Khandual +Cc: Rik van Riel +Cc: Carsten Otte +Cc: Dan Williams +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/madvise.c | 4 +--- + 1 file changed, 1 insertion(+), 3 deletions(-) + +--- a/mm/madvise.c ++++ b/mm/madvise.c +@@ -276,15 +276,14 @@ static long madvise_willneed(struct vm_a + { + struct file *file = vma->vm_file; + ++ *prev = vma; + #ifdef CONFIG_SWAP + if (!file) { +- *prev = vma; + force_swapin_readahead(vma, start, end); + return 0; + } + + if (shmem_mapping(file->f_mapping)) { +- *prev = vma; + force_shm_swapin_readahead(vma, start, end, + file->f_mapping); + return 0; +@@ -299,7 +298,6 @@ static long madvise_willneed(struct vm_a + return 0; + } + +- *prev = vma; + start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + if (end > vma->vm_end) + end = vma->vm_end; diff --git a/queue-4.14/mm-memcg-fix-mem_cgroup_swapout-for-thps.patch b/queue-4.14/mm-memcg-fix-mem_cgroup_swapout-for-thps.patch new file mode 100644 index 00000000000..05ad9bca524 --- /dev/null +++ b/queue-4.14/mm-memcg-fix-mem_cgroup_swapout-for-thps.patch @@ -0,0 +1,45 @@ +From d08afa149acfd00871484ada6dabc3880524cd1c Mon Sep 17 00:00:00 2001 +From: Shakeel Butt +Date: Wed, 29 Nov 2017 16:11:15 -0800 +Subject: mm, memcg: fix mem_cgroup_swapout() for THPs + +From: Shakeel Butt + +commit d08afa149acfd00871484ada6dabc3880524cd1c upstream. + +Commit d6810d730022 ("memcg, THP, swap: make mem_cgroup_swapout() +support THP") changed mem_cgroup_swapout() to support transparent huge +page (THP). + +However the patch missed one location which should be changed for +correctly handling THPs. The resulting bug will cause the memory +cgroups whose THPs were swapped out to become zombies on deletion. + +Link: http://lkml.kernel.org/r/20171128161941.20931-1-shakeelb@google.com +Fixes: d6810d730022 ("memcg, THP, swap: make mem_cgroup_swapout() support THP") +Signed-off-by: Shakeel Butt +Acked-by: Johannes Weiner +Acked-by: Michal Hocko +Cc: Huang Ying +Cc: Vladimir Davydov +Cc: Greg Thelen +Signed-off-by: Greg Kroah-Hartman + +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds + +--- + mm/memcontrol.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -6044,7 +6044,7 @@ void mem_cgroup_swapout(struct page *pag + memcg_check_events(memcg, page); + + if (!mem_cgroup_is_root(memcg)) +- css_put(&memcg->css); ++ css_put_many(&memcg->css, nr_entries); + } + + /** diff --git a/queue-4.14/mm-migrate-fix-an-incorrect-call-of-prep_transhuge_page.patch b/queue-4.14/mm-migrate-fix-an-incorrect-call-of-prep_transhuge_page.patch new file mode 100644 index 00000000000..a15583c7ac1 --- /dev/null +++ b/queue-4.14/mm-migrate-fix-an-incorrect-call-of-prep_transhuge_page.patch @@ -0,0 +1,60 @@ +From 40a899ed16486455f964e46d1af31fd4fded21c1 Mon Sep 17 00:00:00 2001 +From: Zi Yan +Date: Wed, 29 Nov 2017 16:11:12 -0800 +Subject: mm: migrate: fix an incorrect call of prep_transhuge_page() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Zi Yan + +commit 40a899ed16486455f964e46d1af31fd4fded21c1 upstream. + +In https://lkml.org/lkml/2017/11/20/411, Andrea reported that during +memory hotplug/hot remove prep_transhuge_page() is called incorrectly on +non-THP pages for migration, when THP is on but THP migration is not +enabled. This leads to a bad state of target pages for migration. 
+ +By inspecting the code, if called on a non-THP, prep_transhuge_page() +will + + 1) change the value of the mapping of (page + 2), since it is used for + THP deferred list; + + 2) change the lru value of (page + 1), since it is used for THP's dtor. + +Both can lead to data corruption of these two pages. + +Andrea said: + "Pragmatically and from the point of view of the memory_hotplug subsys, + the effect is a kernel crash when pages are being migrated during a + memory hot remove offline and migration target pages are found in a + bad state" + +This patch fixes it by only calling prep_transhuge_page() when we are +certain that the target page is THP. + +Link: http://lkml.kernel.org/r/20171121021855.50525-1-zi.yan@sent.com +Fixes: 8135d8926c08 ("mm: memory_hotplug: memory hotremove supports thp migration") +Signed-off-by: Zi Yan +Reported-by: Andrea Reale +Cc: Naoya Horiguchi +Cc: Michal Hocko +Cc: "Jérôme Glisse" +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +diff --git a/include/linux/migrate.h b/include/linux/migrate.h +index 895ec0c4942e..a2246cf670ba 100644 +--- a/include/linux/migrate.h ++++ b/include/linux/migrate.h +@@ -54,7 +54,7 @@ static inline struct page *new_page_nodemask(struct page *page, + new_page = __alloc_pages_nodemask(gfp_mask, order, + preferred_nid, nodemask); + +- if (new_page && PageTransHuge(page)) ++ if (new_page && PageTransHuge(new_page)) + prep_transhuge_page(new_page); + + return new_page; diff --git a/queue-4.14/mm-thp-do-not-make-page-table-dirty-unconditionally-in-touch_pd.patch b/queue-4.14/mm-thp-do-not-make-page-table-dirty-unconditionally-in-touch_pd.patch new file mode 100644 index 00000000000..90618deeaa6 --- /dev/null +++ b/queue-4.14/mm-thp-do-not-make-page-table-dirty-unconditionally-in-touch_pd.patch @@ -0,0 +1,108 @@ +From a8f97366452ed491d13cf1e44241bc0b5740b1f0 Mon Sep 17 00:00:00 2001 +From: "Kirill A. Shutemov" +Date: Mon, 27 Nov 2017 06:21:25 +0300 +Subject: mm, thp: Do not make page table dirty unconditionally in touch_p[mu]d() + +From: Kirill A. Shutemov + +commit a8f97366452ed491d13cf1e44241bc0b5740b1f0 upstream. + +Currently, we unconditionally make page table dirty in touch_pmd(). +It may result in false-positive can_follow_write_pmd(). + +We may avoid the situation, if we would only make the page table entry +dirty if caller asks for write access -- FOLL_WRITE. + +The patch also changes touch_pud() in the same way. + +Signed-off-by: Kirill A. Shutemov +Cc: Michal Hocko +Cc: Hugh Dickins +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/huge_memory.c | 36 +++++++++++++----------------------- + 1 file changed, 13 insertions(+), 23 deletions(-) + +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -842,20 +842,15 @@ EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); + #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ + + static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, +- pmd_t *pmd) ++ pmd_t *pmd, int flags) + { + pmd_t _pmd; + +- /* +- * We should set the dirty bit only for FOLL_WRITE but for now +- * the dirty bit in the pmd is meaningless. And if the dirty +- * bit will become meaningful and we'll only set it with +- * FOLL_WRITE, an atomic set_bit will be required on the pmd to +- * set the young bit, instead of the current set_pmd_at. 
+- */ +- _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); ++ _pmd = pmd_mkyoung(*pmd); ++ if (flags & FOLL_WRITE) ++ _pmd = pmd_mkdirty(_pmd); + if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, +- pmd, _pmd, 1)) ++ pmd, _pmd, flags & FOLL_WRITE)) + update_mmu_cache_pmd(vma, addr, pmd); + } + +@@ -884,7 +879,7 @@ struct page *follow_devmap_pmd(struct vm + return NULL; + + if (flags & FOLL_TOUCH) +- touch_pmd(vma, addr, pmd); ++ touch_pmd(vma, addr, pmd, flags); + + /* + * device mapped pages can only be returned if the +@@ -995,20 +990,15 @@ out: + + #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD + static void touch_pud(struct vm_area_struct *vma, unsigned long addr, +- pud_t *pud) ++ pud_t *pud, int flags) + { + pud_t _pud; + +- /* +- * We should set the dirty bit only for FOLL_WRITE but for now +- * the dirty bit in the pud is meaningless. And if the dirty +- * bit will become meaningful and we'll only set it with +- * FOLL_WRITE, an atomic set_bit will be required on the pud to +- * set the young bit, instead of the current set_pud_at. +- */ +- _pud = pud_mkyoung(pud_mkdirty(*pud)); ++ _pud = pud_mkyoung(*pud); ++ if (flags & FOLL_WRITE) ++ _pud = pud_mkdirty(_pud); + if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK, +- pud, _pud, 1)) ++ pud, _pud, flags & FOLL_WRITE)) + update_mmu_cache_pud(vma, addr, pud); + } + +@@ -1031,7 +1021,7 @@ struct page *follow_devmap_pud(struct vm + return NULL; + + if (flags & FOLL_TOUCH) +- touch_pud(vma, addr, pud); ++ touch_pud(vma, addr, pud, flags); + + /* + * device mapped pages can only be returned if the +@@ -1407,7 +1397,7 @@ struct page *follow_trans_huge_pmd(struc + page = pmd_page(*pmd); + VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); + if (flags & FOLL_TOUCH) +- touch_pmd(vma, addr, pmd); ++ touch_pmd(vma, addr, pmd, flags); + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + /* + * We don't mlock() pte-mapped THPs. 
This way we can avoid
diff --git a/queue-4.14/series b/queue-4.14/series
index 51d61ddee04..14acf634102 100644
--- a/queue-4.14/series
+++ b/queue-4.14/series
@@ -1,3 +1,20 @@
 platform-x86-hp-wmi-fix-tablet-mode-detection-for-convertibles.patch
 mm-memory_hotplug-do-not-back-off-draining-pcp-free-pages-from-kworker-context.patch
 mm-oom_reaper-gather-each-vma-to-prevent-leaking-tlb-entry.patch
+mm-thp-do-not-make-page-table-dirty-unconditionally-in-touch_pd.patch
+mm-cma-fix-alloc_contig_range-ret-code-potential-leak.patch
+mm-fix-device-dax-pud-write-faults-triggered-by-get_user_pages.patch
+mm-hugetlbfs-introduce-split-to-vm_operations_struct.patch
+device-dax-implement-split-to-catch-invalid-munmap-attempts.patch
+mm-introduce-get_user_pages_longterm.patch
+mm-fail-get_vaddr_frames-for-filesystem-dax-mappings.patch
+v4l2-disable-filesystem-dax-mapping-support.patch
+ib-core-disable-memory-registration-of-filesystem-dax-vmas.patch
+exec-avoid-rlimit_stack-races-with-prlimit.patch
+mm-madvise.c-fix-madvise-infinite-loop-under-special-circumstances.patch
+mm-migrate-fix-an-incorrect-call-of-prep_transhuge_page.patch
+mm-memcg-fix-mem_cgroup_swapout-for-thps.patch
+fs-fat-inode.c-fix-sb_rdonly-change.patch
+autofs-revert-autofs-take-more-care-to-not-update-last_used-on-path-walk.patch
+autofs-revert-autofs-fix-at_no_automount-not-being-honored.patch
+mm-hugetlb-fix-null-pointer-dereference-on-5-level-paging-machine.patch
diff --git a/queue-4.14/v4l2-disable-filesystem-dax-mapping-support.patch b/queue-4.14/v4l2-disable-filesystem-dax-mapping-support.patch
new file mode 100644
index 00000000000..7b3be7320a6
--- /dev/null
+++ b/queue-4.14/v4l2-disable-filesystem-dax-mapping-support.patch
@@ -0,0 +1,66 @@
+From b70131de648c2b997d22f4653934438013f407a1 Mon Sep 17 00:00:00 2001
+From: Dan Williams
+Date: Wed, 29 Nov 2017 16:10:43 -0800
+Subject: v4l2: disable filesystem-dax mapping support
+
+From: Dan Williams
+
+commit b70131de648c2b997d22f4653934438013f407a1 upstream.
+
+V4L2 memory registrations are incompatible with filesystem-dax that
+needs the ability to revoke dma access to a mapping at will, or
+otherwise allow the kernel to wait for completion of DMA. The
+filesystem-dax implementation breaks the traditional solution of
+truncate of active file backed mappings since there is no page-cache
+page we can orphan to sustain ongoing DMA.
+
+If v4l2 wants to support long lived DMA mappings it needs to arrange to
+hold a file lease or use some other mechanism so that the kernel can
+coordinate revoking DMA access when the filesystem needs to truncate
+mappings.
+
+Link: http://lkml.kernel.org/r/151068940499.7446.12846708245365671207.stgit@dwillia2-desk3.amr.corp.intel.com
+Fixes: 3565fce3a659 ("mm, x86: get_user_pages() for dax mappings")
+Signed-off-by: Dan Williams
+Reported-by: Jan Kara
+Reviewed-by: Jan Kara
+Cc: Mauro Carvalho Chehab
+Cc: Christoph Hellwig
+Cc: Doug Ledford
+Cc: Hal Rosenstock
+Cc: Inki Dae
+Cc: Jason Gunthorpe
+Cc: Jeff Moyer
+Cc: Joonyoung Shim
+Cc: Kyungmin Park
+Cc: Mel Gorman
+Cc: Ross Zwisler
+Cc: Sean Hefty
+Cc: Seung-Woo Kim
+Cc: Vlastimil Babka
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/media/v4l2-core/videobuf-dma-sg.c |    5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/drivers/media/v4l2-core/videobuf-dma-sg.c
++++ b/drivers/media/v4l2-core/videobuf-dma-sg.c
+@@ -185,12 +185,13 @@ static int videobuf_dma_init_user_locked
+ 	dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n",
+ 		data, size, dma->nr_pages);
+ 
+-	err = get_user_pages(data & PAGE_MASK, dma->nr_pages,
++	err = get_user_pages_longterm(data & PAGE_MASK, dma->nr_pages,
+ 			     flags, dma->pages, NULL);
+ 
+ 	if (err != dma->nr_pages) {
+ 		dma->nr_pages = (err >= 0) ? err : 0;
+-		dprintk(1, "get_user_pages: err=%d [%d]\n", err, dma->nr_pages);
++		dprintk(1, "get_user_pages_longterm: err=%d [%d]\n", err,
++			dma->nr_pages);
+ 		return err < 0 ? err : -EINVAL;
+ 	}
+ 	return 0;
-- 
2.47.3
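
A closing usage illustration for the device-dax ->split() enforcement in
this series -- a hedged sketch, since the device path and the 2 MiB
region alignment are assumptions about the local configuration:

	/* Unaligned munmap() on device-dax now fails with EINVAL instead
	 * of tripping VM_BUG_ON in the huge-page unmap path. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>

	int main(void)
	{
		long sz = 2L << 20;	/* assumed 2 MiB alignment */
		int fd = open("/dev/dax0.0", O_RDWR);	/* assumed device */
		void *p;

		if (fd < 0)
			return 1;
		p = mmap(NULL, 2 * sz, PROT_READ | PROT_WRITE,
			 MAP_SHARED, fd, 0);
		if (p == MAP_FAILED)
			return 1;
		if (munmap(p, sz + 4096) != 0)
			perror("munmap");	/* expected: EINVAL */
		return 0;
	}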