git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.19-stable patches
author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Thu, 10 Jan 2019 16:11:35 +0000 (17:11 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Thu, 10 Jan 2019 16:11:35 +0000 (17:11 +0100)
added patches:
fork-record-start_time-late.patch
hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch
hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race.patch
hwpoison-memory_hotplug-allow-hwpoisoned-pages-to-be-offlined.patch
memcg-oom-notify-on-oom-killer-invocation-from-the-charge-path.patch
mm-devm_memremap_pages-add-memory_device_private-support.patch
mm-devm_memremap_pages-fix-shutdown-handling.patch
mm-devm_memremap_pages-kill-mapping-system-ram-support.patch
mm-devm_memremap_pages-mark-devm_memremap_pages-export_symbol_gpl.patch
mm-hmm-mark-hmm_devmem_-add-add_resource-export_symbol_gpl.patch
mm-hmm-use-devm-semantics-for-hmm_devmem_-add-remove.patch
mm-swap-fix-swapoff-with-ksm-pages.patch
sunrpc-fix-cache_head-leak-due-to-queued-request.patch
sunrpc-use-after-free-in-svc_process_common.patch
sunrpc-use-svc_net-in-svcauth_gss_-functions.patch
zram-fix-double-free-backing-device.patch

17 files changed:
queue-4.19/fork-record-start_time-late.patch [new file with mode: 0644]
queue-4.19/hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch [new file with mode: 0644]
queue-4.19/hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race.patch [new file with mode: 0644]
queue-4.19/hwpoison-memory_hotplug-allow-hwpoisoned-pages-to-be-offlined.patch [new file with mode: 0644]
queue-4.19/memcg-oom-notify-on-oom-killer-invocation-from-the-charge-path.patch [new file with mode: 0644]
queue-4.19/mm-devm_memremap_pages-add-memory_device_private-support.patch [new file with mode: 0644]
queue-4.19/mm-devm_memremap_pages-fix-shutdown-handling.patch [new file with mode: 0644]
queue-4.19/mm-devm_memremap_pages-kill-mapping-system-ram-support.patch [new file with mode: 0644]
queue-4.19/mm-devm_memremap_pages-mark-devm_memremap_pages-export_symbol_gpl.patch [new file with mode: 0644]
queue-4.19/mm-hmm-mark-hmm_devmem_-add-add_resource-export_symbol_gpl.patch [new file with mode: 0644]
queue-4.19/mm-hmm-use-devm-semantics-for-hmm_devmem_-add-remove.patch [new file with mode: 0644]
queue-4.19/mm-swap-fix-swapoff-with-ksm-pages.patch [new file with mode: 0644]
queue-4.19/series
queue-4.19/sunrpc-fix-cache_head-leak-due-to-queued-request.patch [new file with mode: 0644]
queue-4.19/sunrpc-use-after-free-in-svc_process_common.patch [new file with mode: 0644]
queue-4.19/sunrpc-use-svc_net-in-svcauth_gss_-functions.patch [new file with mode: 0644]
queue-4.19/zram-fix-double-free-backing-device.patch [new file with mode: 0644]

diff --git a/queue-4.19/fork-record-start_time-late.patch b/queue-4.19/fork-record-start_time-late.patch
new file mode 100644 (file)
index 0000000..f8b7e2a
--- /dev/null
@@ -0,0 +1,78 @@
+From 7b55851367136b1efd84d98fea81ba57a98304cf Mon Sep 17 00:00:00 2001
+From: David Herrmann <dh.herrmann@gmail.com>
+Date: Tue, 8 Jan 2019 13:58:52 +0100
+Subject: fork: record start_time late
+
+From: David Herrmann <dh.herrmann@gmail.com>
+
+commit 7b55851367136b1efd84d98fea81ba57a98304cf upstream.
+
+This changes the fork(2) syscall to record the process start_time after
+initializing the basic task structure but still before making the new
+process visible to user-space.
+
+Technically, we could record the start_time anytime during fork(2).  But
+this might lead to scenarios where a start_time is recorded long before
+a process becomes visible to user-space.  For instance, with
+userfaultfd(2) and TLS, user-space can delay the execution of fork(2)
+for an indefinite amount of time (and will, if this causes network
+access, or similar).
+
+Recording the start_time late means it much more closely reflects the
+point in time where the process becomes live and can be observed by
+other processes.
+
+Lastly, this makes it much harder for user-space to predict and control
+the start_time they get assigned.  Previously, user-space could fork a
+process and stall it in copy_thread_tls() before its pid is allocated,
+but after its start_time is recorded.  This can be misused to later on
+cycle through PIDs and resume the stalled fork(2), yielding a process
+that has the same pid and start_time as a process that existed before.
+This can be used to circumvent security systems that identify processes
+by their pid+start_time combination.
+
+Even though user-space was always aware that start_time recording is
+flaky (but several projects are known to still rely on start_time-based
+identification), changing the start_time to be recorded late will help
+mitigate existing attacks and make it much harder for user-space to
+control the start_time a process gets assigned.
+
+Reported-by: Jann Horn <jannh@google.com>
+Signed-off-by: Tom Gundersen <teg@jklm.no>
+Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/fork.c |   13 +++++++++++--
+ 1 file changed, 11 insertions(+), 2 deletions(-)
+
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -1784,8 +1784,6 @@ static __latent_entropy struct task_stru
+       posix_cpu_timers_init(p);
+-      p->start_time = ktime_get_ns();
+-      p->real_start_time = ktime_get_boot_ns();
+       p->io_context = NULL;
+       audit_set_context(p, NULL);
+       cgroup_fork(p);
+@@ -1950,6 +1948,17 @@ static __latent_entropy struct task_stru
+               goto bad_fork_free_pid;
+       /*
++       * From this point on we must avoid any synchronous user-space
++       * communication until we take the tasklist-lock. In particular, we do
++       * not want user-space to be able to predict the process start-time by
++       * stalling fork(2) after we recorded the start_time but before it is
++       * visible to the system.
++       */
++
++      p->start_time = ktime_get_ns();
++      p->real_start_time = ktime_get_boot_ns();
++
++      /*
+        * Make it visible to the rest of the system, but dont wake it up yet.
+        * Need tasklist lock for parent etc handling!
+        */
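
Illustration only, not part of the patch above: the pid + start_time identification scheme the changelog says user-space relies on is typically derived from /proc/<pid>/stat, whose 22nd field is the process start time in clock ticks since boot.  A minimal user-space sketch of reading that pair:

/*
 * Read the (pid, start_time) pair described in the changelog above.
 * start_time is field 22 of /proc/<pid>/stat, in clock ticks since boot.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[64], buf[4096], *p;
	unsigned long long start_time;
	long pid = (argc > 1) ? atol(argv[1]) : (long)getpid();
	int field = 2;
	FILE *f;

	snprintf(path, sizeof(path), "/proc/%ld/stat", pid);
	f = fopen(path, "r");
	if (!f || !fgets(buf, sizeof(buf), f)) {
		perror("stat");
		return 1;
	}
	fclose(f);

	/* comm (field 2) may contain spaces; skip past its closing ')'. */
	p = strrchr(buf, ')');
	while (p && field < 22) {
		p = strchr(p + 1, ' ');
		field++;
	}
	if (!p || sscanf(p + 1, "%llu", &start_time) != 1)
		return 1;

	printf("pid=%ld start_time=%llu (ticks, CLK_TCK=%ld)\n",
	       pid, start_time, sysconf(_SC_CLK_TCK));
	return 0;
}
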
diff --git a/queue-4.19/hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch b/queue-4.19/hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch
new file mode 100644 (file)
index 0000000..1a8a9de
--- /dev/null
@@ -0,0 +1,345 @@
+From b43a9990055958e70347c56f90ea2ae32c67334c Mon Sep 17 00:00:00 2001
+From: Mike Kravetz <mike.kravetz@oracle.com>
+Date: Fri, 28 Dec 2018 00:39:38 -0800
+Subject: hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization
+
+From: Mike Kravetz <mike.kravetz@oracle.com>
+
+commit b43a9990055958e70347c56f90ea2ae32c67334c upstream.
+
+While looking at BUGs associated with invalid huge page map counts, it was
+discovered and observed that a huge pte pointer could become 'invalid' and
+point to another task's page table.  Consider the following:
+
+A task takes a page fault on a shared hugetlbfs file and calls
+huge_pte_alloc to get a ptep.  Suppose the returned ptep points to a
+shared pmd.
+
+Now, another task truncates the hugetlbfs file.  As part of truncation, it
+unmaps everyone who has the file mapped.  If the range being truncated is
+covered by a shared pmd, huge_pmd_unshare will be called.  For all but the
+last user of the shared pmd, huge_pmd_unshare will clear the pud pointing
+to the pmd.  If the task in the middle of the page fault is not the last
+user, the ptep returned by huge_pte_alloc now points to another task's
+page table or worse.  This leads to bad things such as incorrect page
+map/reference counts or invalid memory references.
+
+To fix, expand the use of i_mmap_rwsem as follows:
+
+- i_mmap_rwsem is held in read mode whenever huge_pmd_share is called.
+  huge_pmd_share is only called via huge_pte_alloc, so callers of
+  huge_pte_alloc take i_mmap_rwsem before calling.  In addition, callers
+  of huge_pte_alloc continue to hold the semaphore until finished with the
+  ptep.
+
+- i_mmap_rwsem is held in write mode whenever huge_pmd_unshare is
+  called.
+
+[mike.kravetz@oracle.com: add explicit check for mapping != null]
+Link: http://lkml.kernel.org/r/20181218223557.5202-2-mike.kravetz@oracle.com
+Fixes: 39dde65c9940 ("shared page table for hugetlb page")
+Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Davidlohr Bueso <dave@stgolabs.net>
+Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
+Cc: Colin Ian King <colin.king@canonical.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/hugetlb.c        |   65 ++++++++++++++++++++++++++++++++++++++++------------
+ mm/memory-failure.c |   16 +++++++++++-
+ mm/migrate.c        |   13 +++++++++-
+ mm/rmap.c           |    4 +++
+ mm/userfaultfd.c    |   11 +++++++-
+ 5 files changed, 89 insertions(+), 20 deletions(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -3237,6 +3237,7 @@ int copy_hugetlb_page_range(struct mm_st
+       struct page *ptepage;
+       unsigned long addr;
+       int cow;
++      struct address_space *mapping = vma->vm_file->f_mapping;
+       struct hstate *h = hstate_vma(vma);
+       unsigned long sz = huge_page_size(h);
+       unsigned long mmun_start;       /* For mmu_notifiers */
+@@ -3249,12 +3250,23 @@ int copy_hugetlb_page_range(struct mm_st
+       mmun_end = vma->vm_end;
+       if (cow)
+               mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
++      else {
++              /*
++               * For shared mappings i_mmap_rwsem must be held to call
++               * huge_pte_alloc, otherwise the returned ptep could go
++               * away if part of a shared pmd and another thread calls
++               * huge_pmd_unshare.
++               */
++              i_mmap_lock_read(mapping);
++      }
+       for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
+               spinlock_t *src_ptl, *dst_ptl;
++
+               src_pte = huge_pte_offset(src, addr, sz);
+               if (!src_pte)
+                       continue;
++
+               dst_pte = huge_pte_alloc(dst, addr, sz);
+               if (!dst_pte) {
+                       ret = -ENOMEM;
+@@ -3325,6 +3337,8 @@ int copy_hugetlb_page_range(struct mm_st
+       if (cow)
+               mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
++      else
++              i_mmap_unlock_read(mapping);
+       return ret;
+ }
+@@ -3772,14 +3786,18 @@ retry:
+                       };
+                       /*
+-                       * hugetlb_fault_mutex must be dropped before
+-                       * handling userfault.  Reacquire after handling
+-                       * fault to make calling code simpler.
++                       * hugetlb_fault_mutex and i_mmap_rwsem must be
++                       * dropped before handling userfault.  Reacquire
++                       * after handling fault to make calling code simpler.
+                        */
+                       hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
+                                                       idx, haddr);
+                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
++                      i_mmap_unlock_read(mapping);
++
+                       ret = handle_userfault(&vmf, VM_UFFD_MISSING);
++
++                      i_mmap_lock_read(mapping);
+                       mutex_lock(&hugetlb_fault_mutex_table[hash]);
+                       goto out;
+               }
+@@ -3927,6 +3945,11 @@ vm_fault_t hugetlb_fault(struct mm_struc
+       ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
+       if (ptep) {
++              /*
++               * Since we hold no locks, ptep could be stale.  That is
++               * OK as we are only making decisions based on content and
++               * not actually modifying content here.
++               */
+               entry = huge_ptep_get(ptep);
+               if (unlikely(is_hugetlb_entry_migration(entry))) {
+                       migration_entry_wait_huge(vma, mm, ptep);
+@@ -3934,20 +3957,31 @@ vm_fault_t hugetlb_fault(struct mm_struc
+               } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+                       return VM_FAULT_HWPOISON_LARGE |
+                               VM_FAULT_SET_HINDEX(hstate_index(h));
+-      } else {
+-              ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
+-              if (!ptep)
+-                      return VM_FAULT_OOM;
+       }
++      /*
++       * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
++       * until finished with ptep.  This prevents huge_pmd_unshare from
++       * being called elsewhere and making the ptep no longer valid.
++       *
++       * ptep could have already be assigned via huge_pte_offset.  That
++       * is OK, as huge_pte_alloc will return the same value unless
++       * something changed.
++       */
+       mapping = vma->vm_file->f_mapping;
+-      idx = vma_hugecache_offset(h, vma, haddr);
++      i_mmap_lock_read(mapping);
++      ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
++      if (!ptep) {
++              i_mmap_unlock_read(mapping);
++              return VM_FAULT_OOM;
++      }
+       /*
+        * Serialize hugepage allocation and instantiation, so that we don't
+        * get spurious allocation failures if two CPUs race to instantiate
+        * the same page in the page cache.
+        */
++      idx = vma_hugecache_offset(h, vma, haddr);
+       hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
+       mutex_lock(&hugetlb_fault_mutex_table[hash]);
+@@ -4035,6 +4069,7 @@ out_ptl:
+       }
+ out_mutex:
+       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
++      i_mmap_unlock_read(mapping);
+       /*
+        * Generally it's safe to hold refcount during waiting page lock. But
+        * here we just wait to defer the next page fault to avoid busy loop and
+@@ -4639,10 +4674,12 @@ void adjust_range_if_pmd_sharing_possibl
+  * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
+  * and returns the corresponding pte. While this is not necessary for the
+  * !shared pmd case because we can allocate the pmd later as well, it makes the
+- * code much cleaner. pmd allocation is essential for the shared case because
+- * pud has to be populated inside the same i_mmap_rwsem section - otherwise
+- * racing tasks could either miss the sharing (see huge_pte_offset) or select a
+- * bad pmd for sharing.
++ * code much cleaner.
++ *
++ * This routine must be called with i_mmap_rwsem held in at least read mode.
++ * For hugetlbfs, this prevents removal of any page table entries associated
++ * with the address space.  This is important as we are setting up sharing
++ * based on existing page table entries (mappings).
+  */
+ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+ {
+@@ -4659,7 +4696,6 @@ pte_t *huge_pmd_share(struct mm_struct *
+       if (!vma_shareable(vma, addr))
+               return (pte_t *)pmd_alloc(mm, pud, addr);
+-      i_mmap_lock_write(mapping);
+       vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
+               if (svma == vma)
+                       continue;
+@@ -4689,7 +4725,6 @@ pte_t *huge_pmd_share(struct mm_struct *
+       spin_unlock(ptl);
+ out:
+       pte = (pte_t *)pmd_alloc(mm, pud, addr);
+-      i_mmap_unlock_write(mapping);
+       return pte;
+ }
+@@ -4700,7 +4735,7 @@ out:
+  * indicated by page_count > 1, unmap is achieved by clearing pud and
+  * decrementing the ref count. If count == 1, the pte page is not shared.
+  *
+- * called with page table lock held.
++ * Called with page table lock held and i_mmap_rwsem held in write mode.
+  *
+  * returns: 1 successfully unmapped a shared pte page
+  *        0 the underlying pte page is not shared, or it is the last user
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -966,7 +966,7 @@ static bool hwpoison_user_mappings(struc
+       enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+       struct address_space *mapping;
+       LIST_HEAD(tokill);
+-      bool unmap_success;
++      bool unmap_success = true;
+       int kill = 1, forcekill;
+       struct page *hpage = *hpagep;
+       bool mlocked = PageMlocked(hpage);
+@@ -1028,7 +1028,19 @@ static bool hwpoison_user_mappings(struc
+       if (kill)
+               collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
+-      unmap_success = try_to_unmap(hpage, ttu);
++      if (!PageHuge(hpage)) {
++              unmap_success = try_to_unmap(hpage, ttu);
++      } else if (mapping) {
++              /*
++               * For hugetlb pages, try_to_unmap could potentially call
++               * huge_pmd_unshare.  Because of this, take semaphore in
++               * write mode here and set TTU_RMAP_LOCKED to indicate we
++               * have taken the lock at this higer level.
++               */
++              i_mmap_lock_write(mapping);
++              unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
++              i_mmap_unlock_write(mapping);
++      }
+       if (!unmap_success)
+               pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
+                      pfn, page_mapcount(hpage));
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -1307,8 +1307,19 @@ static int unmap_and_move_huge_page(new_
+               goto put_anon;
+       if (page_mapped(hpage)) {
++              struct address_space *mapping = page_mapping(hpage);
++
++              /*
++               * try_to_unmap could potentially call huge_pmd_unshare.
++               * Because of this, take semaphore in write mode here and
++               * set TTU_RMAP_LOCKED to let lower levels know we have
++               * taken the lock.
++               */
++              i_mmap_lock_write(mapping);
+               try_to_unmap(hpage,
+-                      TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
++                      TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
++                      TTU_RMAP_LOCKED);
++              i_mmap_unlock_write(mapping);
+               page_was_mapped = 1;
+       }
+--- a/mm/rmap.c
++++ b/mm/rmap.c
+@@ -25,6 +25,7 @@
+  *     page->flags PG_locked (lock_page)
+  *       hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
+  *         mapping->i_mmap_rwsem
++ *           hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
+  *           anon_vma->rwsem
+  *             mm->page_table_lock or pte_lock
+  *               zone_lru_lock (in mark_page_accessed, isolate_lru_page)
+@@ -1374,6 +1375,9 @@ static bool try_to_unmap_one(struct page
+               /*
+                * If sharing is possible, start and end will be adjusted
+                * accordingly.
++               *
++               * If called for a huge page, caller must hold i_mmap_rwsem
++               * in write mode as it is possible to call huge_pmd_unshare.
+                */
+               adjust_range_if_pmd_sharing_possible(vma, &start, &end);
+       }
+--- a/mm/userfaultfd.c
++++ b/mm/userfaultfd.c
+@@ -267,10 +267,14 @@ retry:
+               VM_BUG_ON(dst_addr & ~huge_page_mask(h));
+               /*
+-               * Serialize via hugetlb_fault_mutex
++               * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
++               * i_mmap_rwsem ensures the dst_pte remains valid even
++               * in the case of shared pmds.  fault mutex prevents
++               * races with other faulting threads.
+                */
+-              idx = linear_page_index(dst_vma, dst_addr);
+               mapping = dst_vma->vm_file->f_mapping;
++              i_mmap_lock_read(mapping);
++              idx = linear_page_index(dst_vma, dst_addr);
+               hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
+                                                               idx, dst_addr);
+               mutex_lock(&hugetlb_fault_mutex_table[hash]);
+@@ -279,6 +283,7 @@ retry:
+               dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
+               if (!dst_pte) {
+                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
++                      i_mmap_unlock_read(mapping);
+                       goto out_unlock;
+               }
+@@ -286,6 +291,7 @@ retry:
+               dst_pteval = huge_ptep_get(dst_pte);
+               if (!huge_pte_none(dst_pteval)) {
+                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
++                      i_mmap_unlock_read(mapping);
+                       goto out_unlock;
+               }
+@@ -293,6 +299,7 @@ retry:
+                                               dst_addr, src_addr, &page);
+               mutex_unlock(&hugetlb_fault_mutex_table[hash]);
++              i_mmap_unlock_read(mapping);
+               vm_alloc_shared = vm_shared;
+               cond_resched();
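
The locking rule established above (callers of huge_pte_alloc hold i_mmap_rwsem in read mode for as long as they use the ptep, while huge_pmd_unshare runs only with it held in write mode) follows the classic reader/writer discipline.  A user-space analogue, purely illustrative and using pthreads rather than the kernel primitives named in the hunks:

/*
 * Lookups of a shared structure take the rwsem in read mode and keep it
 * while the pointer is in use; teardown takes it in write mode, so a
 * looked-up pointer can never be freed underneath a reader.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_rwlock_t map_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static int *shared_pmd;			/* stands in for a shared page-table page */

static void fault_path(void)
{
	pthread_rwlock_rdlock(&map_rwsem);	/* i_mmap_lock_read() analogue */
	if (shared_pmd)
		(void)*shared_pmd;		/* safe: cannot be unshared here */
	pthread_rwlock_unlock(&map_rwsem);
}

static void unshare_path(void)
{
	pthread_rwlock_wrlock(&map_rwsem);	/* i_mmap_lock_write() analogue */
	free(shared_pmd);			/* huge_pmd_unshare() analogue */
	shared_pmd = NULL;
	pthread_rwlock_unlock(&map_rwsem);
}

int main(void)
{
	shared_pmd = calloc(1, sizeof(*shared_pmd));
	fault_path();
	unshare_path();
	fault_path();		/* sees NULL rather than a dangling pointer */
	puts("ok");
	return 0;
}
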
diff --git a/queue-4.19/hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race.patch b/queue-4.19/hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race.patch
new file mode 100644 (file)
index 0000000..1f42b81
--- /dev/null
@@ -0,0 +1,228 @@
+From c86aa7bbfd5568ba8a82d3635d8f7b8a8e06fe54 Mon Sep 17 00:00:00 2001
+From: Mike Kravetz <mike.kravetz@oracle.com>
+Date: Fri, 28 Dec 2018 00:39:42 -0800
+Subject: hugetlbfs: Use i_mmap_rwsem to fix page fault/truncate race
+
+From: Mike Kravetz <mike.kravetz@oracle.com>
+
+commit c86aa7bbfd5568ba8a82d3635d8f7b8a8e06fe54 upstream.
+
+hugetlbfs page faults can race with truncate and hole punch operations.
+Current code in the page fault path attempts to handle this by 'backing
+out' operations if we encounter the race.  One obvious omission in the
+current code is removing a page newly added to the page cache.  This is
+pretty straightforward to address, but there is a more subtle and
+difficult issue of backing out hugetlb reservations.  To handle this
+correctly, the 'reservation state' before page allocation needs to be
+noted so that it can be properly backed out.  There are four distinct
+possibilities for reservation state: shared/reserved, shared/no-resv,
+private/reserved and private/no-resv.  Backing out a reservation may
+require memory allocation which could fail so that needs to be taken into
+account as well.
+
+Instead of writing the required complicated code for this rare occurrence,
+just eliminate the race.  i_mmap_rwsem is now held in read mode for the
+duration of page fault processing.  Hold i_mmap_rwsem longer in truncation
+and hole punch code to cover the call to remove_inode_hugepages.
+
+With this modification, code in remove_inode_hugepages checking for races
+becomes 'dead' as it can no longer happen.  Remove the dead code and
+expand comments to explain reasoning.  Similarly, checks for races with
+truncation in the page fault path can be simplified and removed.
+
+[mike.kravetz@oracle.com: incorporate suggestions from Kirill]
+  Link: http://lkml.kernel.org/r/20181222223013.22193-3-mike.kravetz@oracle.com
+Link: http://lkml.kernel.org/r/20181218223557.5202-3-mike.kravetz@oracle.com
+Fixes: ebed4bfc8da8 ("hugetlb: fix absurd HugePages_Rsvd")
+Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Davidlohr Bueso <dave@stgolabs.net>
+Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/hugetlbfs/inode.c |   61 +++++++++++++++++++++++----------------------------
+ mm/hugetlb.c         |   21 ++++++++---------
+ 2 files changed, 38 insertions(+), 44 deletions(-)
+
+--- a/fs/hugetlbfs/inode.c
++++ b/fs/hugetlbfs/inode.c
+@@ -383,17 +383,16 @@ hugetlb_vmdelete_list(struct rb_root_cac
+  * truncation is indicated by end of range being LLONG_MAX
+  *    In this case, we first scan the range and release found pages.
+  *    After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
+- *    maps and global counts.  Page faults can not race with truncation
+- *    in this routine.  hugetlb_no_page() prevents page faults in the
+- *    truncated range.  It checks i_size before allocation, and again after
+- *    with the page table lock for the page held.  The same lock must be
+- *    acquired to unmap a page.
++ *    maps and global counts.
+  * hole punch is indicated if end is not LLONG_MAX
+  *    In the hole punch case we scan the range and release found pages.
+  *    Only when releasing a page is the associated region/reserv map
+  *    deleted.  The region/reserv map for ranges without associated
+- *    pages are not modified.  Page faults can race with hole punch.
+- *    This is indicated if we find a mapped page.
++ *    pages are not modified.
++ *
++ * Callers of this routine must hold the i_mmap_rwsem in write mode to prevent
++ * races with page faults.
++ *
+  * Note: If the passed end of range value is beyond the end of file, but
+  * not LLONG_MAX this routine still performs a hole punch operation.
+  */
+@@ -423,32 +422,14 @@ static void remove_inode_hugepages(struc
+               for (i = 0; i < pagevec_count(&pvec); ++i) {
+                       struct page *page = pvec.pages[i];
+-                      u32 hash;
+                       index = page->index;
+-                      hash = hugetlb_fault_mutex_hash(h, current->mm,
+-                                                      &pseudo_vma,
+-                                                      mapping, index, 0);
+-                      mutex_lock(&hugetlb_fault_mutex_table[hash]);
+-
+                       /*
+-                       * If page is mapped, it was faulted in after being
+-                       * unmapped in caller.  Unmap (again) now after taking
+-                       * the fault mutex.  The mutex will prevent faults
+-                       * until we finish removing the page.
+-                       *
+-                       * This race can only happen in the hole punch case.
+-                       * Getting here in a truncate operation is a bug.
++                       * A mapped page is impossible as callers should unmap
++                       * all references before calling.  And, i_mmap_rwsem
++                       * prevents the creation of additional mappings.
+                        */
+-                      if (unlikely(page_mapped(page))) {
+-                              BUG_ON(truncate_op);
+-
+-                              i_mmap_lock_write(mapping);
+-                              hugetlb_vmdelete_list(&mapping->i_mmap,
+-                                      index * pages_per_huge_page(h),
+-                                      (index + 1) * pages_per_huge_page(h));
+-                              i_mmap_unlock_write(mapping);
+-                      }
++                      VM_BUG_ON(page_mapped(page));
+                       lock_page(page);
+                       /*
+@@ -470,7 +451,6 @@ static void remove_inode_hugepages(struc
+                       }
+                       unlock_page(page);
+-                      mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+               }
+               huge_pagevec_release(&pvec);
+               cond_resched();
+@@ -482,9 +462,20 @@ static void remove_inode_hugepages(struc
+ static void hugetlbfs_evict_inode(struct inode *inode)
+ {
++      struct address_space *mapping = inode->i_mapping;
+       struct resv_map *resv_map;
++      /*
++       * The vfs layer guarantees that there are no other users of this
++       * inode.  Therefore, it would be safe to call remove_inode_hugepages
++       * without holding i_mmap_rwsem.  We acquire and hold here to be
++       * consistent with other callers.  Since there will be no contention
++       * on the semaphore, overhead is negligible.
++       */
++      i_mmap_lock_write(mapping);
+       remove_inode_hugepages(inode, 0, LLONG_MAX);
++      i_mmap_unlock_write(mapping);
++
+       resv_map = (struct resv_map *)inode->i_mapping->private_data;
+       /* root inode doesn't have the resv_map, so we should check it */
+       if (resv_map)
+@@ -505,8 +496,8 @@ static int hugetlb_vmtruncate(struct ino
+       i_mmap_lock_write(mapping);
+       if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
+               hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
+-      i_mmap_unlock_write(mapping);
+       remove_inode_hugepages(inode, offset, LLONG_MAX);
++      i_mmap_unlock_write(mapping);
+       return 0;
+ }
+@@ -540,8 +531,8 @@ static long hugetlbfs_punch_hole(struct
+                       hugetlb_vmdelete_list(&mapping->i_mmap,
+                                               hole_start >> PAGE_SHIFT,
+                                               hole_end  >> PAGE_SHIFT);
+-              i_mmap_unlock_write(mapping);
+               remove_inode_hugepages(inode, hole_start, hole_end);
++              i_mmap_unlock_write(mapping);
+               inode_unlock(inode);
+       }
+@@ -624,7 +615,11 @@ static long hugetlbfs_fallocate(struct f
+               /* addr is the offset within the file (zero based) */
+               addr = index * hpage_size;
+-              /* mutex taken here, fault path and hole punch */
++              /*
++               * fault mutex taken here, protects against fault path
++               * and hole punch.  inode_lock previously taken protects
++               * against truncation.
++               */
+               hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
+                                               index, addr);
+               mutex_lock(&hugetlb_fault_mutex_table[hash]);
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -3757,16 +3757,16 @@ static vm_fault_t hugetlb_no_page(struct
+       }
+       /*
+-       * Use page lock to guard against racing truncation
+-       * before we get page_table_lock.
++       * We can not race with truncation due to holding i_mmap_rwsem.
++       * Check once here for faults beyond end of file.
+        */
++      size = i_size_read(mapping->host) >> huge_page_shift(h);
++      if (idx >= size)
++              goto out;
++
+ retry:
+       page = find_lock_page(mapping, idx);
+       if (!page) {
+-              size = i_size_read(mapping->host) >> huge_page_shift(h);
+-              if (idx >= size)
+-                      goto out;
+-
+               /*
+                * Check for page in userfault range
+                */
+@@ -3856,9 +3856,6 @@ retry:
+       }
+       ptl = huge_pte_lock(h, mm, ptep);
+-      size = i_size_read(mapping->host) >> huge_page_shift(h);
+-      if (idx >= size)
+-              goto backout;
+       ret = 0;
+       if (!huge_pte_none(huge_ptep_get(ptep)))
+@@ -3961,8 +3958,10 @@ vm_fault_t hugetlb_fault(struct mm_struc
+       /*
+        * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
+-       * until finished with ptep.  This prevents huge_pmd_unshare from
+-       * being called elsewhere and making the ptep no longer valid.
++       * until finished with ptep.  This serves two purposes:
++       * 1) It prevents huge_pmd_unshare from being called elsewhere
++       *    and making the ptep no longer valid.
++       * 2) It synchronizes us with file truncation.
+        *
+        * ptep could have already be assigned via huge_pte_offset.  That
+        * is OK, as huge_pte_alloc will return the same value unless
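
For context, not part of the patch: the hole-punch side of the race is driven from user-space by fallocate(FALLOC_FL_PUNCH_HOLE) on a hugetlbfs file.  A minimal sketch, assuming a hugetlbfs mount at /dev/hugepages, 2 MiB huge pages and a large enough huge page pool:

/* Fault a huge page in, then punch a hole over it. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	const size_t huge_sz = 2UL << 20;	/* assumed huge page size */
	char *map;
	int fd = open("/dev/hugepages/demo", O_CREAT | O_RDWR, 0600);

	if (fd < 0 || ftruncate(fd, 4 * huge_sz)) {
		perror("setup");
		return 1;
	}

	map = mmap(NULL, 4 * huge_sz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	map[0] = 1;		/* page fault instantiates a huge page */

	/* In-flight faults must not race with this; the patch closes that window. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, huge_sz))
		perror("fallocate");

	munmap(map, 4 * huge_sz);
	close(fd);
	return 0;
}
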
diff --git a/queue-4.19/hwpoison-memory_hotplug-allow-hwpoisoned-pages-to-be-offlined.patch b/queue-4.19/hwpoison-memory_hotplug-allow-hwpoisoned-pages-to-be-offlined.patch
new file mode 100644 (file)
index 0000000..6d00873
--- /dev/null
@@ -0,0 +1,169 @@
+From b15c87263a69272423771118c653e9a1d0672caa Mon Sep 17 00:00:00 2001
+From: Michal Hocko <mhocko@suse.com>
+Date: Fri, 28 Dec 2018 00:38:01 -0800
+Subject: hwpoison, memory_hotplug: allow hwpoisoned pages to be offlined
+
+From: Michal Hocko <mhocko@suse.com>
+
+commit b15c87263a69272423771118c653e9a1d0672caa upstream.
+
+We have received a bug report that an injected MCE about faulty memory
+prevents memory offline to succeed on 4.4 base kernel.  The underlying
+reason was that the HWPoison page has an elevated reference count and the
+migration keeps failing.  There are two problems with that.  First of all
+it is dubious to migrate the poisoned page because we know that accessing
+that memory is possible to fail.  Secondly it doesn't make any sense to
+migrate a potentially broken content and preserve the memory corruption
+over to a new location.
+
+Oscar has found out that 4.4 and the current upstream kernels behave
+slightly differently with his simple testcase
+
+===
+
+int main(void)
+{
+        int ret;
+        int i;
+        int fd;
+        char *array = malloc(4096);
+        char *array_locked = malloc(4096);
+
+        fd = open("/tmp/data", O_RDONLY);
+        read(fd, array, 4095);
+
+        for (i = 0; i < 4096; i++)
+                array_locked[i] = 'd';
+
+        ret = mlock((void *)PAGE_ALIGN((unsigned long)array_locked), sizeof(array_locked));
+        if (ret)
+                perror("mlock");
+
+        sleep (20);
+
+        ret = madvise((void *)PAGE_ALIGN((unsigned long)array_locked), 4096, MADV_HWPOISON);
+        if (ret)
+                perror("madvise");
+
+        for (i = 0; i < 4096; i++)
+                array_locked[i] = 'd';
+
+        return 0;
+}
+===
+
++ offline this memory.
+
+In 4.4 kernels he saw the hwpoisoned page to be returned back to the LRU
+list
+kernel:  [<ffffffff81019ac9>] dump_trace+0x59/0x340
+kernel:  [<ffffffff81019e9a>] show_stack_log_lvl+0xea/0x170
+kernel:  [<ffffffff8101ac71>] show_stack+0x21/0x40
+kernel:  [<ffffffff8132bb90>] dump_stack+0x5c/0x7c
+kernel:  [<ffffffff810815a1>] warn_slowpath_common+0x81/0xb0
+kernel:  [<ffffffff811a275c>] __pagevec_lru_add_fn+0x14c/0x160
+kernel:  [<ffffffff811a2eed>] pagevec_lru_move_fn+0xad/0x100
+kernel:  [<ffffffff811a334c>] __lru_cache_add+0x6c/0xb0
+kernel:  [<ffffffff81195236>] add_to_page_cache_lru+0x46/0x70
+kernel:  [<ffffffffa02b4373>] extent_readpages+0xc3/0x1a0 [btrfs]
+kernel:  [<ffffffff811a16d7>] __do_page_cache_readahead+0x177/0x200
+kernel:  [<ffffffff811a18c8>] ondemand_readahead+0x168/0x2a0
+kernel:  [<ffffffff8119673f>] generic_file_read_iter+0x41f/0x660
+kernel:  [<ffffffff8120e50d>] __vfs_read+0xcd/0x140
+kernel:  [<ffffffff8120e9ea>] vfs_read+0x7a/0x120
+kernel:  [<ffffffff8121404b>] kernel_read+0x3b/0x50
+kernel:  [<ffffffff81215c80>] do_execveat_common.isra.29+0x490/0x6f0
+kernel:  [<ffffffff81215f08>] do_execve+0x28/0x30
+kernel:  [<ffffffff81095ddb>] call_usermodehelper_exec_async+0xfb/0x130
+kernel:  [<ffffffff8161c045>] ret_from_fork+0x55/0x80
+
+And the latter confuses the hotremove path because an LRU page is
+attempted to be migrated and that fails due to an elevated reference
+count.  It is quite possible that the reuse of the HWPoisoned page is some
+kind of fixed race condition but I am not really sure about that.
+
+With the upstream kernel the failure is slightly different.  The page
+doesn't seem to have LRU bit set but isolate_movable_page simply fails and
+do_migrate_range simply puts all the isolated pages back to LRU and
+therefore no progress is made and scan_movable_pages finds same set of
+pages over and over again.
+
+Fix both cases by explicitly checking HWPoisoned pages before we even try
+to get a reference on the page, and try to unmap it if it is still mapped.  As
+explained by Naoya:
+
+: Hwpoison code never unmapped those for no big reason because
+: Ksm pages never dominate memory, so we simply didn't have strong
+: motivation to save the pages.
+
+Also put WARN_ON(PageLRU) in case there is a race and we can hit LRU
+HWPoison pages which shouldn't happen but I couldn't convince myself about
+that.  Naoya has noted the following:
+
+: Theoretically no such guarantee, because try_to_unmap() doesn't have a
+: guarantee of success and then memory_failure() returns immediately
+: when hwpoison_user_mappings fails.
+: Or the following code (comes after hwpoison_user_mappings block) also
+: implies that the target page can still have PageLRU flag.
+:
+:         /*
+:          * Torn down by someone else?
+:          */
+:         if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
+:                 action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
+:                 res = -EBUSY;
+:                 goto out;
+:         }
+:
+: So I think it's OK to keep "if (WARN_ON(PageLRU(page)))" block in
+: current version of your patch.
+
+Link: http://lkml.kernel.org/r/20181206120135.14079-1-mhocko@kernel.org
+Signed-off-by: Michal Hocko <mhocko@suse.com>
+Reviewed-by: Oscar Salvador <osalvador@suse.com>
+Debugged-by: Oscar Salvador <osalvador@suse.com>
+Tested-by: Oscar Salvador <osalvador@suse.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Acked-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memory_hotplug.c |   16 ++++++++++++++++
+ 1 file changed, 16 insertions(+)
+
+--- a/mm/memory_hotplug.c
++++ b/mm/memory_hotplug.c
+@@ -35,6 +35,7 @@
+ #include <linux/memblock.h>
+ #include <linux/bootmem.h>
+ #include <linux/compaction.h>
++#include <linux/rmap.h>
+ #include <asm/tlbflush.h>
+@@ -1393,6 +1394,21 @@ do_migrate_range(unsigned long start_pfn
+                       pfn = page_to_pfn(compound_head(page))
+                               + hpage_nr_pages(page) - 1;
++              /*
++               * HWPoison pages have elevated reference counts so the migration would
++               * fail on them. It also doesn't make any sense to migrate them in the
++               * first place. Still try to unmap such a page in case it is still mapped
++               * (e.g. current hwpoison implementation doesn't unmap KSM pages but keep
++               * the unmap as the catch all safety net).
++               */
++              if (PageHWPoison(page)) {
++                      if (WARN_ON(PageLRU(page)))
++                              isolate_lru_page(page);
++                      if (page_mapped(page))
++                              try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
++                      continue;
++              }
++
+               if (!get_page_unless_zero(page))
+                       continue;
+               /*
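
For completeness, not part of the patch: the memory-offline request that kept failing in the report is issued from user-space through sysfs.  A minimal sketch; "memory32" is a placeholder block number, and CONFIG_MEMORY_HOTREMOVE plus a removable block are assumed:

/*
 * Ask the kernel to offline one memory block; it migrates the block's
 * pages away or fails with EBUSY, which is where the hwpoisoned pages
 * got stuck before this change.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *state = "/sys/devices/system/memory/memory32/state";
	int fd = open(state, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "offline", strlen("offline")) < 0)
		perror("offline");
	close(fd);
	return 0;
}
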
diff --git a/queue-4.19/memcg-oom-notify-on-oom-killer-invocation-from-the-charge-path.patch b/queue-4.19/memcg-oom-notify-on-oom-killer-invocation-from-the-charge-path.patch
new file mode 100644 (file)
index 0000000..4c62073
--- /dev/null
@@ -0,0 +1,71 @@
+From 7056d3a37d2c6aaaab10c13e8e69adc67ec1fc65 Mon Sep 17 00:00:00 2001
+From: Michal Hocko <mhocko@suse.com>
+Date: Fri, 28 Dec 2018 00:39:57 -0800
+Subject: memcg, oom: notify on oom killer invocation from the charge path
+
+From: Michal Hocko <mhocko@suse.com>
+
+commit 7056d3a37d2c6aaaab10c13e8e69adc67ec1fc65 upstream.
+
+Burt Holzman has noticed that memcg v1 doesn't notify about OOM events via
+eventfd anymore.  The reason is that 29ef680ae7c2 ("memcg, oom: move
+out_of_memory back to the charge path") has moved the oom handling back to
+the charge path.  While doing so the notification was left behind in
+mem_cgroup_oom_synchronize.
+
+Fix the issue by replicating the oom hierarchy locking and the
+notification.
+
+Link: http://lkml.kernel.org/r/20181224091107.18354-1-mhocko@kernel.org
+Fixes: 29ef680ae7c2 ("memcg, oom: move out_of_memory back to the charge path")
+Signed-off-by: Michal Hocko <mhocko@suse.com>
+Reported-by: Burt Holzman <burt@fnal.gov>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: <stable@vger.kernel.org>   [4.19+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memcontrol.c |   20 ++++++++++++++++++--
+ 1 file changed, 18 insertions(+), 2 deletions(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -1666,6 +1666,9 @@ enum oom_status {
+ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
+ {
++      enum oom_status ret;
++      bool locked;
++
+       if (order > PAGE_ALLOC_COSTLY_ORDER)
+               return OOM_SKIPPED;
+@@ -1698,10 +1701,23 @@ static enum oom_status mem_cgroup_oom(st
+               return OOM_ASYNC;
+       }
++      mem_cgroup_mark_under_oom(memcg);
++
++      locked = mem_cgroup_oom_trylock(memcg);
++
++      if (locked)
++              mem_cgroup_oom_notify(memcg);
++
++      mem_cgroup_unmark_under_oom(memcg);
+       if (mem_cgroup_out_of_memory(memcg, mask, order))
+-              return OOM_SUCCESS;
++              ret = OOM_SUCCESS;
++      else
++              ret = OOM_FAILED;
++
++      if (locked)
++              mem_cgroup_oom_unlock(memcg);
+-      return OOM_FAILED;
++      return ret;
+ }
+ /**
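
For orientation, not part of the patch: the notification being restored is the memcg v1 eventfd mechanism, in which user-space registers an eventfd against memory.oom_control through cgroup.event_control.  A minimal sketch; the cgroup path is an assumption for the example:

/* Wait for an OOM-kill notification from one memcg (cgroup v1 hierarchy). */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	const char *cg = "/sys/fs/cgroup/memory/demo";	/* assumed cgroup path */
	char path[256], reg[64];
	uint64_t events;
	int efd, ofd, cfd;

	efd = eventfd(0, 0);
	snprintf(path, sizeof(path), "%s/memory.oom_control", cg);
	ofd = open(path, O_RDONLY);
	snprintf(path, sizeof(path), "%s/cgroup.event_control", cg);
	cfd = open(path, O_WRONLY);
	if (efd < 0 || ofd < 0 || cfd < 0) {
		perror("setup");
		return 1;
	}

	/* Registration format: "<eventfd> <fd of memory.oom_control>" */
	snprintf(reg, sizeof(reg), "%d %d", efd, ofd);
	if (write(cfd, reg, strlen(reg)) < 0) {
		perror("register");
		return 1;
	}

	/* Blocks until the OOM killer is invoked in this memcg. */
	if (read(efd, &events, sizeof(events)) == (ssize_t)sizeof(events))
		printf("memcg OOM events: %llu\n", (unsigned long long)events);
	return 0;
}
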
diff --git a/queue-4.19/mm-devm_memremap_pages-add-memory_device_private-support.patch b/queue-4.19/mm-devm_memremap_pages-add-memory_device_private-support.patch
new file mode 100644 (file)
index 0000000..f58c0bb
--- /dev/null
@@ -0,0 +1,105 @@
+From 69324b8f48339de2f90fdf2f774687fc6c47629a Mon Sep 17 00:00:00 2001
+From: Dan Williams <dan.j.williams@intel.com>
+Date: Fri, 28 Dec 2018 00:35:01 -0800
+Subject: mm, devm_memremap_pages: add MEMORY_DEVICE_PRIVATE support
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+commit 69324b8f48339de2f90fdf2f774687fc6c47629a upstream.
+
+In preparation for consolidating all ZONE_DEVICE enabling via
+devm_memremap_pages(), teach it how to handle the constraints of
+MEMORY_DEVICE_PRIVATE ranges.
+
+[jglisse@redhat.com: call move_pfn_range_to_zone for MEMORY_DEVICE_PRIVATE]
+Link: http://lkml.kernel.org/r/154275559036.76910.12434636179931292607.stgit@dwillia2-desk3.amr.corp.intel.com
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Reviewed-by: Jérôme Glisse <jglisse@redhat.com>
+Acked-by: Christoph Hellwig <hch@lst.de>
+Reported-by: Logan Gunthorpe <logang@deltatee.com>
+Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
+Cc: Balbir Singh <bsingharora@gmail.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/memremap.c |   53 +++++++++++++++++++++++++++++++++++++++++------------
+ 1 file changed, 41 insertions(+), 12 deletions(-)
+
+--- a/kernel/memremap.c
++++ b/kernel/memremap.c
+@@ -132,9 +132,15 @@ static void devm_memremap_pages_release(
+               - align_start;
+       mem_hotplug_begin();
+-      arch_remove_memory(align_start, align_size, pgmap->altmap_valid ?
+-                      &pgmap->altmap : NULL);
+-      kasan_remove_zero_shadow(__va(align_start), align_size);
++      if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
++              pfn = align_start >> PAGE_SHIFT;
++              __remove_pages(page_zone(pfn_to_page(pfn)), pfn,
++                              align_size >> PAGE_SHIFT, NULL);
++      } else {
++              arch_remove_memory(align_start, align_size,
++                              pgmap->altmap_valid ? &pgmap->altmap : NULL);
++              kasan_remove_zero_shadow(__va(align_start), align_size);
++      }
+       mem_hotplug_done();
+       untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
+@@ -232,17 +238,40 @@ void *devm_memremap_pages(struct device
+               goto err_pfn_remap;
+       mem_hotplug_begin();
+-      error = kasan_add_zero_shadow(__va(align_start), align_size);
+-      if (error) {
+-              mem_hotplug_done();
+-              goto err_kasan;
++
++      /*
++       * For device private memory we call add_pages() as we only need to
++       * allocate and initialize struct page for the device memory. More-
++       * over the device memory is un-accessible thus we do not want to
++       * create a linear mapping for the memory like arch_add_memory()
++       * would do.
++       *
++       * For all other device memory types, which are accessible by
++       * the CPU, we do want the linear mapping and thus use
++       * arch_add_memory().
++       */
++      if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
++              error = add_pages(nid, align_start >> PAGE_SHIFT,
++                              align_size >> PAGE_SHIFT, NULL, false);
++      } else {
++              error = kasan_add_zero_shadow(__va(align_start), align_size);
++              if (error) {
++                      mem_hotplug_done();
++                      goto err_kasan;
++              }
++
++              error = arch_add_memory(nid, align_start, align_size, altmap,
++                              false);
++      }
++
++      if (!error) {
++              struct zone *zone;
++
++              zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
++              move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT,
++                              align_size >> PAGE_SHIFT, altmap);
+       }
+-      error = arch_add_memory(nid, align_start, align_size, altmap, false);
+-      if (!error)
+-              move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+-                                      align_start >> PAGE_SHIFT,
+-                                      align_size >> PAGE_SHIFT, altmap);
+       mem_hotplug_done();
+       if (error)
+               goto err_add_memory;
diff --git a/queue-4.19/mm-devm_memremap_pages-fix-shutdown-handling.patch b/queue-4.19/mm-devm_memremap_pages-fix-shutdown-handling.patch
new file mode 100644 (file)
index 0000000..57e6441
--- /dev/null
@@ -0,0 +1,264 @@
+From a95c90f1e2c253b280385ecf3d4ebfe476926b28 Mon Sep 17 00:00:00 2001
+From: Dan Williams <dan.j.williams@intel.com>
+Date: Fri, 28 Dec 2018 00:34:57 -0800
+Subject: mm, devm_memremap_pages: fix shutdown handling
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+commit a95c90f1e2c253b280385ecf3d4ebfe476926b28 upstream.
+
+The last step before devm_memremap_pages() returns success is to allocate
+a release action, devm_memremap_pages_release(), to tear the entire setup
+down.  However, the result from devm_add_action() is not checked.
+
+Checking the error from devm_add_action() is not enough.  The api
+currently relies on the fact that the percpu_ref it is using is killed by
+the time the devm_memremap_pages_release() is run.  Rather than continue
+this awkward situation, offload the responsibility of killing the
+percpu_ref to devm_memremap_pages_release() directly.  This allows
+devm_memremap_pages() to do the right thing relative to init failures and
+shutdown.
+
+Without this change we could fail to register the teardown of
+devm_memremap_pages().  The likelihood of hitting this failure is tiny as
+small memory allocations almost always succeed.  However, the impact of
+the failure is large given any future reconfiguration, or disable/enable,
+of an nvdimm namespace will fail forever as subsequent calls to
+devm_memremap_pages() will fail to setup the pgmap_radix since there will
+be stale entries for the physical address range.
+
+An argument could be made to require that the ->kill() operation be set in
+the @pgmap arg rather than passed in separately.  However, it helps code
+readability, tracking the lifetime of a given instance, to be able to grep
+the kill routine directly at the devm_memremap_pages() call site.
+
+Link: http://lkml.kernel.org/r/154275558526.76910.7535251937849268605.stgit@dwillia2-desk3.amr.corp.intel.com
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Fixes: e8d513483300 ("memremap: change devm_memremap_pages interface...")
+Reviewed-by: "Jérôme Glisse" <jglisse@redhat.com>
+Reported-by: Logan Gunthorpe <logang@deltatee.com>
+Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Cc: Balbir Singh <bsingharora@gmail.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/dax/pmem.c                |   14 +++-----------
+ drivers/nvdimm/pmem.c             |   13 +++++--------
+ include/linux/memremap.h          |    2 ++
+ kernel/memremap.c                 |   31 +++++++++++++++----------------
+ tools/testing/nvdimm/test/iomap.c |   15 ++++++++++++++-
+ 5 files changed, 39 insertions(+), 36 deletions(-)
+
+--- a/drivers/dax/pmem.c
++++ b/drivers/dax/pmem.c
+@@ -48,9 +48,8 @@ static void dax_pmem_percpu_exit(void *d
+       percpu_ref_exit(ref);
+ }
+-static void dax_pmem_percpu_kill(void *data)
++static void dax_pmem_percpu_kill(struct percpu_ref *ref)
+ {
+-      struct percpu_ref *ref = data;
+       struct dax_pmem *dax_pmem = to_dax_pmem(ref);
+       dev_dbg(dax_pmem->dev, "trace\n");
+@@ -112,17 +111,10 @@ static int dax_pmem_probe(struct device
+       }
+       dax_pmem->pgmap.ref = &dax_pmem->ref;
++      dax_pmem->pgmap.kill = dax_pmem_percpu_kill;
+       addr = devm_memremap_pages(dev, &dax_pmem->pgmap);
+-      if (IS_ERR(addr)) {
+-              devm_remove_action(dev, dax_pmem_percpu_exit, &dax_pmem->ref);
+-              percpu_ref_exit(&dax_pmem->ref);
++      if (IS_ERR(addr))
+               return PTR_ERR(addr);
+-      }
+-
+-      rc = devm_add_action_or_reset(dev, dax_pmem_percpu_kill,
+-                                                      &dax_pmem->ref);
+-      if (rc)
+-              return rc;
+       /* adjust the dax_region resource to the start of data */
+       memcpy(&res, &dax_pmem->pgmap.res, sizeof(res));
+--- a/drivers/nvdimm/pmem.c
++++ b/drivers/nvdimm/pmem.c
+@@ -309,8 +309,11 @@ static void pmem_release_queue(void *q)
+       blk_cleanup_queue(q);
+ }
+-static void pmem_freeze_queue(void *q)
++static void pmem_freeze_queue(struct percpu_ref *ref)
+ {
++      struct request_queue *q;
++
++      q = container_of(ref, typeof(*q), q_usage_counter);
+       blk_freeze_queue_start(q);
+ }
+@@ -402,6 +405,7 @@ static int pmem_attach_disk(struct devic
+       pmem->pfn_flags = PFN_DEV;
+       pmem->pgmap.ref = &q->q_usage_counter;
++      pmem->pgmap.kill = pmem_freeze_queue;
+       if (is_nd_pfn(dev)) {
+               if (setup_pagemap_fsdax(dev, &pmem->pgmap))
+                       return -ENOMEM;
+@@ -427,13 +431,6 @@ static int pmem_attach_disk(struct devic
+               memcpy(&bb_res, &nsio->res, sizeof(bb_res));
+       }
+-      /*
+-       * At release time the queue must be frozen before
+-       * devm_memremap_pages is unwound
+-       */
+-      if (devm_add_action_or_reset(dev, pmem_freeze_queue, q))
+-              return -ENOMEM;
+-
+       if (IS_ERR(addr))
+               return PTR_ERR(addr);
+       pmem->virt_addr = addr;
+--- a/include/linux/memremap.h
++++ b/include/linux/memremap.h
+@@ -106,6 +106,7 @@ typedef void (*dev_page_free_t)(struct p
+  * @altmap: pre-allocated/reserved memory for vmemmap allocations
+  * @res: physical address range covered by @ref
+  * @ref: reference count that pins the devm_memremap_pages() mapping
++ * @kill: callback to transition @ref to the dead state
+  * @dev: host device of the mapping for debug
+  * @data: private data pointer for page_free()
+  * @type: memory type: see MEMORY_* in memory_hotplug.h
+@@ -117,6 +118,7 @@ struct dev_pagemap {
+       bool altmap_valid;
+       struct resource res;
+       struct percpu_ref *ref;
++      void (*kill)(struct percpu_ref *ref);
+       struct device *dev;
+       void *data;
+       enum memory_type type;
+--- a/kernel/memremap.c
++++ b/kernel/memremap.c
+@@ -122,14 +122,10 @@ static void devm_memremap_pages_release(
+       resource_size_t align_start, align_size;
+       unsigned long pfn;
++      pgmap->kill(pgmap->ref);
+       for_each_device_pfn(pfn, pgmap)
+               put_page(pfn_to_page(pfn));
+-      if (percpu_ref_tryget_live(pgmap->ref)) {
+-              dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
+-              percpu_ref_put(pgmap->ref);
+-      }
+-
+       /* pages are dead and unused, undo the arch mapping */
+       align_start = res->start & ~(SECTION_SIZE - 1);
+       align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
+@@ -150,7 +146,7 @@ static void devm_memremap_pages_release(
+ /**
+  * devm_memremap_pages - remap and provide memmap backing for the given resource
+  * @dev: hosting device for @res
+- * @pgmap: pointer to a struct dev_pgmap
++ * @pgmap: pointer to a struct dev_pagemap
+  *
+  * Notes:
+  * 1/ At a minimum the res, ref and type members of @pgmap must be initialized
+@@ -159,11 +155,8 @@ static void devm_memremap_pages_release(
+  * 2/ The altmap field may optionally be initialized, in which case altmap_valid
+  *    must be set to true
+  *
+- * 3/ pgmap.ref must be 'live' on entry and 'dead' before devm_memunmap_pages()
+- *    time (or devm release event). The expected order of events is that ref has
+- *    been through percpu_ref_kill() before devm_memremap_pages_release(). The
+- *    wait for the completion of all references being dropped and
+- *    percpu_ref_exit() must occur after devm_memremap_pages_release().
++ * 3/ pgmap->ref must be 'live' on entry and will be killed at
++ *    devm_memremap_pages_release() time, or if this routine fails.
+  *
+  * 4/ res is expected to be a host memory range that could feasibly be
+  *    treated as a "System RAM" range, i.e. not a device mmio range, but
+@@ -180,6 +173,9 @@ void *devm_memremap_pages(struct device
+       int error, nid, is_ram;
+       struct dev_pagemap *conflict_pgmap;
++      if (!pgmap->ref || !pgmap->kill)
++              return ERR_PTR(-EINVAL);
++
+       align_start = res->start & ~(SECTION_SIZE - 1);
+       align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
+               - align_start;
+@@ -205,12 +201,10 @@ void *devm_memremap_pages(struct device
+       if (is_ram != REGION_DISJOINT) {
+               WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__,
+                               is_ram == REGION_MIXED ? "mixed" : "ram", res);
+-              return ERR_PTR(-ENXIO);
++              error = -ENXIO;
++              goto err_array;
+       }
+-      if (!pgmap->ref)
+-              return ERR_PTR(-EINVAL);
+-
+       pgmap->dev = dev;
+       mutex_lock(&pgmap_lock);
+@@ -267,7 +261,10 @@ void *devm_memremap_pages(struct device
+               percpu_ref_get(pgmap->ref);
+       }
+-      devm_add_action(dev, devm_memremap_pages_release, pgmap);
++      error = devm_add_action_or_reset(dev, devm_memremap_pages_release,
++                      pgmap);
++      if (error)
++              return ERR_PTR(error);
+       return __va(res->start);
+@@ -278,6 +275,8 @@ void *devm_memremap_pages(struct device
+  err_pfn_remap:
+  err_radix:
+       pgmap_radix_release(res, pgoff);
++ err_array:
++      pgmap->kill(pgmap->ref);
+       return ERR_PTR(error);
+ }
+ EXPORT_SYMBOL_GPL(devm_memremap_pages);
+--- a/tools/testing/nvdimm/test/iomap.c
++++ b/tools/testing/nvdimm/test/iomap.c
+@@ -104,13 +104,26 @@ void *__wrap_devm_memremap(struct device
+ }
+ EXPORT_SYMBOL(__wrap_devm_memremap);
++static void nfit_test_kill(void *_pgmap)
++{
++      struct dev_pagemap *pgmap = _pgmap;
++
++      pgmap->kill(pgmap->ref);
++}
++
+ void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
+ {
+       resource_size_t offset = pgmap->res.start;
+       struct nfit_test_resource *nfit_res = get_nfit_res(offset);
+-      if (nfit_res)
++      if (nfit_res) {
++              int rc;
++
++              rc = devm_add_action_or_reset(dev, nfit_test_kill, pgmap);
++              if (rc)
++                      return ERR_PTR(rc);
+               return nfit_res->buf + offset - nfit_res->res.start;
++      }
+       return devm_memremap_pages(dev, pgmap);
+ }
+ EXPORT_SYMBOL_GPL(__wrap_devm_memremap_pages);
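From a driver's point of view, the calling convention the shutdown-handling patch above enforces (a live pgmap->ref plus a pgmap->kill callback that devm_memremap_pages() invokes on failure and at release time) looks roughly like the following minimal sketch. The my_* helpers and the MEMORY_DEVICE_FS_DAX type choice are hypothetical illustration, not taken from the patch; only devm_memremap_pages(), percpu_ref_init()/percpu_ref_kill() and the dev_pagemap fields come from the code being changed.

#include <linux/device.h>
#include <linux/err.h>
#include <linux/memremap.h>
#include <linux/percpu-refcount.h>
#include <linux/slab.h>

static void my_ref_release(struct percpu_ref *ref)
{
	/* last page reference dropped; a real driver would complete() here */
}

static void my_pgmap_kill(struct percpu_ref *ref)
{
	percpu_ref_kill(ref);
}

static int my_probe(struct device *dev, struct resource *res)
{
	struct dev_pagemap *pgmap;
	struct percpu_ref *ref;
	void *addr;
	int rc;

	pgmap = devm_kzalloc(dev, sizeof(*pgmap), GFP_KERNEL);
	ref = devm_kzalloc(dev, sizeof(*ref), GFP_KERNEL);
	if (!pgmap || !ref)
		return -ENOMEM;

	rc = percpu_ref_init(ref, my_ref_release, 0, GFP_KERNEL);
	if (rc)
		return rc;

	pgmap->res = *res;			/* host memory range to remap */
	pgmap->type = MEMORY_DEVICE_FS_DAX;	/* res, ref and type must be set */
	pgmap->ref = ref;			/* must be live on entry */
	pgmap->kill = my_pgmap_kill;		/* required after this series */

	addr = devm_memremap_pages(dev, pgmap);
	if (IS_ERR(addr))
		return PTR_ERR(addr);		/* ->kill(ref) has already run */

	return 0;
}

With this convention the error and teardown paths are symmetric: on failure devm_memremap_pages() itself calls pgmap->kill(pgmap->ref), and on device release the devm action does the same before undoing the arch mapping.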
diff --git a/queue-4.19/mm-devm_memremap_pages-kill-mapping-system-ram-support.patch b/queue-4.19/mm-devm_memremap_pages-kill-mapping-system-ram-support.patch
new file mode 100644 (file)
index 0000000..9e1f26e
--- /dev/null
@@ -0,0 +1,60 @@
+From 06489cfbd915ff36c8e36df27f1c2dc60f97ca56 Mon Sep 17 00:00:00 2001
+From: Dan Williams <dan.j.williams@intel.com>
+Date: Fri, 28 Dec 2018 00:34:54 -0800
+Subject: mm, devm_memremap_pages: kill mapping "System RAM" support
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+commit 06489cfbd915ff36c8e36df27f1c2dc60f97ca56 upstream.
+
+Given that devm_memremap_pages() requires a percpu_ref that is torn down
+by devm_memremap_pages_release(), the current support for mapping RAM is
+broken.
+
+Support for remapping "System RAM" has been broken since the beginning and
+there is no existing user of this code path, so just kill the support
+and make it an explicit error.
+
+This cleanup also simplifies a follow-on patch to fix the error path when
+setting a devm release action for devm_memremap_pages_release() fails.
+
+Link: http://lkml.kernel.org/r/154275557997.76910.14689813630968180480.stgit@dwillia2-desk3.amr.corp.intel.com
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Reviewed-by: "Jérôme Glisse" <jglisse@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
+Cc: Balbir Singh <bsingharora@gmail.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/memremap.c |    9 +++------
+ 1 file changed, 3 insertions(+), 6 deletions(-)
+
+--- a/kernel/memremap.c
++++ b/kernel/memremap.c
+@@ -202,15 +202,12 @@ void *devm_memremap_pages(struct device
+       is_ram = region_intersects(align_start, align_size,
+               IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
+-      if (is_ram == REGION_MIXED) {
+-              WARN_ONCE(1, "%s attempted on mixed region %pr\n",
+-                              __func__, res);
++      if (is_ram != REGION_DISJOINT) {
++              WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__,
++                              is_ram == REGION_MIXED ? "mixed" : "ram", res);
+               return ERR_PTR(-ENXIO);
+       }
+-      if (is_ram == REGION_INTERSECTS)
+-              return __va(res->start);
+-
+       if (!pgmap->ref)
+               return ERR_PTR(-EINVAL);
diff --git a/queue-4.19/mm-devm_memremap_pages-mark-devm_memremap_pages-export_symbol_gpl.patch b/queue-4.19/mm-devm_memremap_pages-mark-devm_memremap_pages-export_symbol_gpl.patch
new file mode 100644 (file)
index 0000000..052bd48
--- /dev/null
@@ -0,0 +1,70 @@
+From 808153e1187fa77ac7d7dad261ff476888dcf398 Mon Sep 17 00:00:00 2001
+From: Dan Williams <dan.j.williams@intel.com>
+Date: Fri, 28 Dec 2018 00:34:50 -0800
+Subject: mm, devm_memremap_pages: mark devm_memremap_pages() EXPORT_SYMBOL_GPL
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+commit 808153e1187fa77ac7d7dad261ff476888dcf398 upstream.
+
+devm_memremap_pages() is a facility that can create struct page entries
+for any arbitrary range and give drivers the ability to subvert core
+aspects of page management.
+
+Specifically the facility is tightly integrated with the kernel's memory
+hotplug functionality.  It injects an altmap argument deep into the
+architecture specific vmemmap implementation to allow allocating from
+specific reserved pages, and it has Linux specific assumptions about page
+structure reference counting relative to get_user_pages() and
+get_user_pages_fast().  It was an oversight and a mistake that this was
+not marked EXPORT_SYMBOL_GPL from the outset.
+
+Again, devm_memremap_pages() exposes and relies upon core kernel internal
+assumptions and will continue to evolve along with 'struct page', memory
+hotplug, and support for new memory types / topologies.  Only an in-kernel
+GPL-only driver is expected to keep up with this ongoing evolution.  This
+interface, and functionality derived from this interface, is not suitable
+for kernel-external drivers.
+
+Link: http://lkml.kernel.org/r/154275557457.76910.16923571232582744134.stgit@dwillia2-desk3.amr.corp.intel.com
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: "Jérôme Glisse" <jglisse@redhat.com>
+Cc: Balbir Singh <bsingharora@gmail.com>
+Cc: Logan Gunthorpe <logang@deltatee.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/memremap.c                 |    2 +-
+ tools/testing/nvdimm/test/iomap.c |    2 +-
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+
+--- a/kernel/memremap.c
++++ b/kernel/memremap.c
+@@ -283,7 +283,7 @@ void *devm_memremap_pages(struct device
+       pgmap_radix_release(res, pgoff);
+       return ERR_PTR(error);
+ }
+-EXPORT_SYMBOL(devm_memremap_pages);
++EXPORT_SYMBOL_GPL(devm_memremap_pages);
+ unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
+ {
+--- a/tools/testing/nvdimm/test/iomap.c
++++ b/tools/testing/nvdimm/test/iomap.c
+@@ -113,7 +113,7 @@ void *__wrap_devm_memremap_pages(struct
+               return nfit_res->buf + offset - nfit_res->res.start;
+       return devm_memremap_pages(dev, pgmap);
+ }
+-EXPORT_SYMBOL(__wrap_devm_memremap_pages);
++EXPORT_SYMBOL_GPL(__wrap_devm_memremap_pages);
+ pfn_t __wrap_phys_to_pfn_t(phys_addr_t addr, unsigned long flags)
+ {
diff --git a/queue-4.19/mm-hmm-mark-hmm_devmem_-add-add_resource-export_symbol_gpl.patch b/queue-4.19/mm-hmm-mark-hmm_devmem_-add-add_resource-export_symbol_gpl.patch
new file mode 100644 (file)
index 0000000..0403bc0
--- /dev/null
@@ -0,0 +1,133 @@
+From 02917e9f8676207a4c577d4d94eae12bf348e9d7 Mon Sep 17 00:00:00 2001
+From: Dan Williams <dan.j.williams@intel.com>
+Date: Fri, 28 Dec 2018 00:35:15 -0800
+Subject: mm, hmm: mark hmm_devmem_{add, add_resource} EXPORT_SYMBOL_GPL
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+commit 02917e9f8676207a4c577d4d94eae12bf348e9d7 upstream.
+
+At Maintainer Summit, Greg brought up a topic I proposed around
+EXPORT_SYMBOL_GPL usage.  The motivation was considerations for when
+EXPORT_SYMBOL_GPL is warranted and the criteria for taking the exceptional
+step of reclassifying an existing export.  Specifically, I wanted to make
+the case that although the line is fuzzy and hard to specify in abstract
+terms, it is nonetheless clear that devm_memremap_pages() and HMM
+(Heterogeneous Memory Management) have crossed it.  The
+devm_memremap_pages() facility should have been EXPORT_SYMBOL_GPL from the
+beginning, and HMM as a derivative of that functionality should have
+naturally picked up that designation as well.
+
+Contrary to typical rules, the HMM infrastructure was merged upstream with
+zero in-tree consumers.  There was a promise at the time that those users
+would be merged "soon", but it has been over a year with no drivers
+arriving.  While the Nouveau driver is about to belatedly make good on
+that promise it is clear that HMM was targeted first and foremost at an
+out-of-tree consumer.
+
+HMM is derived from devm_memremap_pages(), a facility Christoph and I
+spearheaded to support persistent memory.  It combines a device lifetime
+model with a dynamically created 'struct page' / memmap array for any
+physical address range.  It enables coordination and control of the many
+code paths in the kernel built to interact with memory via 'struct page'
+objects.  With HMM the integration goes even deeper by allowing device
+drivers to hook and manipulate page fault and page free events.
+
+One interpretation of when EXPORT_SYMBOL is suitable is when it is
+exporting stable and generic leaf functionality.  The
+devm_memremap_pages() facility continues to see expanding use cases,
+peer-to-peer DMA being the most recent, with no clear end date when it
+will stop attracting reworks and semantic changes.  It is not suitable to
+export devm_memremap_pages() as a stable 3rd party driver API due to the
+fact that it is still changing and manipulates core behavior.  Moreover,
+it is not in the best interest of the long term development of the core
+memory management subsystem to permit any external driver to effectively
+define its own system-wide memory management policies with no
+encouragement to engage with upstream.
+
+I am also concerned that HMM was designed in a way to minimize further
+engagement with the core-MM.  That is, with these hooks in place,
+device drivers are free to implement their own policies without much
+consideration for whether and how the core-MM could grow to meet that
+need.  Going forward not only should HMM be EXPORT_SYMBOL_GPL, but the
+core-MM should be allowed the opportunity and stimulus to change and
+address these new use cases as first class functionality.
+
+Original changelog:
+
+hmm_devmem_add() and hmm_devmem_add_resource() duplicated
+devm_memremap_pages() and are now simple wrappers around the core
+facility to inject a dev_pagemap instance into the global pgmap_radix and
+hook page-idle events.  The devm_memremap_pages() interface is base
+infrastructure for HMM.  HMM has more and deeper ties into the kernel
+memory management implementation than base ZONE_DEVICE, which is itself an
+EXPORT_SYMBOL_GPL facility.
+
+Originally, the HMM page structure creation routines copied the
+devm_memremap_pages() code and reused ZONE_DEVICE.  A cleanup to unify the
+implementations was discussed during the initial review:
+http://lkml.iu.edu/hypermail/linux/kernel/1701.2/00812.html Recent work to
+extend devm_memremap_pages() for the peer-to-peer-DMA facility enabled
+this cleanup to move forward.
+
+In addition to the integration with devm_memremap_pages() HMM depends on
+other GPL-only symbols:
+
+    mmu_notifier_unregister_no_release
+    percpu_ref
+    region_intersects
+    __class_create
+
+It goes further to consume / indirectly expose functionality that is not
+exported to any other driver:
+
+    alloc_pages_vma
+    walk_page_range
+
+HMM is derived from devm_memremap_pages(), and extends deep core-kernel
+fundamentals. Similar to devm_memremap_pages(), mark its entry points
+EXPORT_SYMBOL_GPL().
+
+[logang@deltatee.com: PCI/P2PDMA: match interface changes to devm_memremap_pages()]
+  Link: http://lkml.kernel.org/r/20181130225911.2900-1-logang@deltatee.com
+Link: http://lkml.kernel.org/r/154275560565.76910.15919297436557795278.stgit@dwillia2-desk3.amr.corp.intel.com
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Cc: Logan Gunthorpe <logang@deltatee.com>
+Cc: "Jérôme Glisse" <jglisse@redhat.com>
+Cc: Balbir Singh <bsingharora@gmail.com>,
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/hmm.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/mm/hmm.c
++++ b/mm/hmm.c
+@@ -1210,7 +1210,7 @@ struct hmm_devmem *hmm_devmem_add(const
+       return devmem;
+ }
+-EXPORT_SYMBOL(hmm_devmem_add);
++EXPORT_SYMBOL_GPL(hmm_devmem_add);
+ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
+                                          struct device *device,
+@@ -1264,7 +1264,7 @@ struct hmm_devmem *hmm_devmem_add_resour
+       return devmem;
+ }
+-EXPORT_SYMBOL(hmm_devmem_add_resource);
++EXPORT_SYMBOL_GPL(hmm_devmem_add_resource);
+ /*
+  * A device driver that wants to handle multiple devices memory through a
diff --git a/queue-4.19/mm-hmm-use-devm-semantics-for-hmm_devmem_-add-remove.patch b/queue-4.19/mm-hmm-use-devm-semantics-for-hmm_devmem_-add-remove.patch
new file mode 100644 (file)
index 0000000..128cda5
--- /dev/null
@@ -0,0 +1,290 @@
+From 58ef15b765af0d2cbe6799ec564f1dc485010ab8 Mon Sep 17 00:00:00 2001
+From: Dan Williams <dan.j.williams@intel.com>
+Date: Fri, 28 Dec 2018 00:35:07 -0800
+Subject: mm, hmm: use devm semantics for hmm_devmem_{add, remove}
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+commit 58ef15b765af0d2cbe6799ec564f1dc485010ab8 upstream.
+
+devm semantics arrange for resources to be torn down when
+device-driver-probe fails or when device-driver-release completes.
+Similar to devm_memremap_pages() there is no need to support an explicit
+remove operation when the users properly adhere to devm semantics.
+
+Note that devm_kzalloc() automatically handles allocating node-local
+memory.
+
+Link: http://lkml.kernel.org/r/154275559545.76910.9186690723515469051.stgit@dwillia2-desk3.amr.corp.intel.com
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Jérôme Glisse <jglisse@redhat.com>
+Cc: "Jérôme Glisse" <jglisse@redhat.com>
+Cc: Logan Gunthorpe <logang@deltatee.com>
+Cc: Balbir Singh <bsingharora@gmail.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/hmm.h |    4 -
+ mm/hmm.c            |  127 +++++++++-------------------------------------------
+ 2 files changed, 25 insertions(+), 106 deletions(-)
+
+--- a/include/linux/hmm.h
++++ b/include/linux/hmm.h
+@@ -499,8 +499,7 @@ struct hmm_devmem {
+  * enough and allocate struct page for it.
+  *
+  * The device driver can wrap the hmm_devmem struct inside a private device
+- * driver struct. The device driver must call hmm_devmem_remove() before the
+- * device goes away and before freeing the hmm_devmem struct memory.
++ * driver struct.
+  */
+ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
+                                 struct device *device,
+@@ -508,7 +507,6 @@ struct hmm_devmem *hmm_devmem_add(const
+ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
+                                          struct device *device,
+                                          struct resource *res);
+-void hmm_devmem_remove(struct hmm_devmem *devmem);
+ /*
+  * hmm_devmem_page_set_drvdata - set per-page driver data field
+--- a/mm/hmm.c
++++ b/mm/hmm.c
+@@ -945,7 +945,6 @@ static void hmm_devmem_ref_exit(void *da
+       devmem = container_of(ref, struct hmm_devmem, ref);
+       percpu_ref_exit(ref);
+-      devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data);
+ }
+ static void hmm_devmem_ref_kill(void *data)
+@@ -956,7 +955,6 @@ static void hmm_devmem_ref_kill(void *da
+       devmem = container_of(ref, struct hmm_devmem, ref);
+       percpu_ref_kill(ref);
+       wait_for_completion(&devmem->completion);
+-      devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data);
+ }
+ static int hmm_devmem_fault(struct vm_area_struct *vma,
+@@ -994,7 +992,7 @@ static void hmm_devmem_radix_release(str
+       mutex_unlock(&hmm_devmem_lock);
+ }
+-static void hmm_devmem_release(struct device *dev, void *data)
++static void hmm_devmem_release(void *data)
+ {
+       struct hmm_devmem *devmem = data;
+       struct resource *resource = devmem->resource;
+@@ -1002,11 +1000,6 @@ static void hmm_devmem_release(struct de
+       struct zone *zone;
+       struct page *page;
+-      if (percpu_ref_tryget_live(&devmem->ref)) {
+-              dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
+-              percpu_ref_put(&devmem->ref);
+-      }
+-
+       /* pages are dead and unused, undo the arch mapping */
+       start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT;
+       npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT;
+@@ -1130,19 +1123,6 @@ error:
+       return ret;
+ }
+-static int hmm_devmem_match(struct device *dev, void *data, void *match_data)
+-{
+-      struct hmm_devmem *devmem = data;
+-
+-      return devmem->resource == match_data;
+-}
+-
+-static void hmm_devmem_pages_remove(struct hmm_devmem *devmem)
+-{
+-      devres_release(devmem->device, &hmm_devmem_release,
+-                     &hmm_devmem_match, devmem->resource);
+-}
+-
+ /*
+  * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
+  *
+@@ -1170,8 +1150,7 @@ struct hmm_devmem *hmm_devmem_add(const
+       dev_pagemap_get_ops();
+-      devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
+-                                 GFP_KERNEL, dev_to_node(device));
++      devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
+       if (!devmem)
+               return ERR_PTR(-ENOMEM);
+@@ -1185,11 +1164,11 @@ struct hmm_devmem *hmm_devmem_add(const
+       ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
+                             0, GFP_KERNEL);
+       if (ret)
+-              goto error_percpu_ref;
++              return ERR_PTR(ret);
+-      ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
++      ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit, &devmem->ref);
+       if (ret)
+-              goto error_devm_add_action;
++              return ERR_PTR(ret);
+       size = ALIGN(size, PA_SECTION_SIZE);
+       addr = min((unsigned long)iomem_resource.end,
+@@ -1209,16 +1188,12 @@ struct hmm_devmem *hmm_devmem_add(const
+               devmem->resource = devm_request_mem_region(device, addr, size,
+                                                          dev_name(device));
+-              if (!devmem->resource) {
+-                      ret = -ENOMEM;
+-                      goto error_no_resource;
+-              }
++              if (!devmem->resource)
++                      return ERR_PTR(-ENOMEM);
+               break;
+       }
+-      if (!devmem->resource) {
+-              ret = -ERANGE;
+-              goto error_no_resource;
+-      }
++      if (!devmem->resource)
++              return ERR_PTR(-ERANGE);
+       devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
+       devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
+@@ -1227,28 +1202,13 @@ struct hmm_devmem *hmm_devmem_add(const
+       ret = hmm_devmem_pages_create(devmem);
+       if (ret)
+-              goto error_pages;
+-
+-      devres_add(device, devmem);
++              return ERR_PTR(ret);
+-      ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
+-      if (ret) {
+-              hmm_devmem_remove(devmem);
++      ret = devm_add_action_or_reset(device, hmm_devmem_release, devmem);
++      if (ret)
+               return ERR_PTR(ret);
+-      }
+       return devmem;
+-
+-error_pages:
+-      devm_release_mem_region(device, devmem->resource->start,
+-                              resource_size(devmem->resource));
+-error_no_resource:
+-error_devm_add_action:
+-      hmm_devmem_ref_kill(&devmem->ref);
+-      hmm_devmem_ref_exit(&devmem->ref);
+-error_percpu_ref:
+-      devres_free(devmem);
+-      return ERR_PTR(ret);
+ }
+ EXPORT_SYMBOL(hmm_devmem_add);
+@@ -1264,8 +1224,7 @@ struct hmm_devmem *hmm_devmem_add_resour
+       dev_pagemap_get_ops();
+-      devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
+-                                 GFP_KERNEL, dev_to_node(device));
++      devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
+       if (!devmem)
+               return ERR_PTR(-ENOMEM);
+@@ -1279,12 +1238,12 @@ struct hmm_devmem *hmm_devmem_add_resour
+       ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
+                             0, GFP_KERNEL);
+       if (ret)
+-              goto error_percpu_ref;
++              return ERR_PTR(ret);
+-      ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
++      ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit,
++                      &devmem->ref);
+       if (ret)
+-              goto error_devm_add_action;
+-
++              return ERR_PTR(ret);
+       devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
+       devmem->pfn_last = devmem->pfn_first +
+@@ -1292,60 +1251,22 @@ struct hmm_devmem *hmm_devmem_add_resour
+       ret = hmm_devmem_pages_create(devmem);
+       if (ret)
+-              goto error_devm_add_action;
++              return ERR_PTR(ret);
+-      devres_add(device, devmem);
++      ret = devm_add_action_or_reset(device, hmm_devmem_release, devmem);
++      if (ret)
++              return ERR_PTR(ret);
+-      ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
+-      if (ret) {
+-              hmm_devmem_remove(devmem);
++      ret = devm_add_action_or_reset(device, hmm_devmem_ref_kill,
++                      &devmem->ref);
++      if (ret)
+               return ERR_PTR(ret);
+-      }
+       return devmem;
+-
+-error_devm_add_action:
+-      hmm_devmem_ref_kill(&devmem->ref);
+-      hmm_devmem_ref_exit(&devmem->ref);
+-error_percpu_ref:
+-      devres_free(devmem);
+-      return ERR_PTR(ret);
+ }
+ EXPORT_SYMBOL(hmm_devmem_add_resource);
+ /*
+- * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE)
+- *
+- * @devmem: hmm_devmem struct use to track and manage the ZONE_DEVICE memory
+- *
+- * This will hot-unplug memory that was hotplugged by hmm_devmem_add on behalf
+- * of the device driver. It will free struct page and remove the resource that
+- * reserved the physical address range for this device memory.
+- */
+-void hmm_devmem_remove(struct hmm_devmem *devmem)
+-{
+-      resource_size_t start, size;
+-      struct device *device;
+-      bool cdm = false;
+-
+-      if (!devmem)
+-              return;
+-
+-      device = devmem->device;
+-      start = devmem->resource->start;
+-      size = resource_size(devmem->resource);
+-
+-      cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY;
+-      hmm_devmem_ref_kill(&devmem->ref);
+-      hmm_devmem_ref_exit(&devmem->ref);
+-      hmm_devmem_pages_remove(devmem);
+-
+-      if (!cdm)
+-              devm_release_mem_region(device, start, size);
+-}
+-EXPORT_SYMBOL(hmm_devmem_remove);
+-
+-/*
+  * A device driver that wants to handle multiple devices memory through a
+  * single fake device can use hmm_device to do so. This is purely a helper
+  * and it is not needed to make use of any HMM functionality.
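Concretely, under the devm semantics adopted above a driver that hotplugs device memory only has a probe-side call; there is no longer an hmm_devmem_remove() to pair with it. A minimal sketch, assuming a hypothetical driver whose ops structure and probe routine are named my_devmem_ops and my_probe() (only hmm_devmem_add() itself is from the patched API):

#include <linux/device.h>
#include <linux/err.h>
#include <linux/hmm.h>
#include <linux/sizes.h>

/* assumed to be defined elsewhere in the driver with .fault and .free callbacks */
extern const struct hmm_devmem_ops my_devmem_ops;

static int my_probe(struct device *dev)
{
	struct hmm_devmem *devmem;

	devmem = hmm_devmem_add(&my_devmem_ops, dev, SZ_64M);
	if (IS_ERR(devmem))
		return PTR_ERR(devmem);

	/*
	 * No explicit unwind and no remove-time call: the devm actions
	 * registered by hmm_devmem_add() kill the percpu_ref and release
	 * the hotplugged pages when probe fails later on or when the
	 * device is released.
	 */
	return 0;
}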
diff --git a/queue-4.19/mm-swap-fix-swapoff-with-ksm-pages.patch b/queue-4.19/mm-swap-fix-swapoff-with-ksm-pages.patch
new file mode 100644 (file)
index 0000000..3d7f0d8
--- /dev/null
@@ -0,0 +1,70 @@
+From 7af7a8e19f0c5425ff639b0f0d2d244c2a647724 Mon Sep 17 00:00:00 2001
+From: Huang Ying <ying.huang@intel.com>
+Date: Fri, 28 Dec 2018 00:39:53 -0800
+Subject: mm, swap: fix swapoff with KSM pages
+
+From: Huang Ying <ying.huang@intel.com>
+
+commit 7af7a8e19f0c5425ff639b0f0d2d244c2a647724 upstream.
+
+KSM pages may be mapped to multiple VMAs that cannot all be reached from
+one anon_vma.  So during swapin, a new copy of the page needs to be
+generated if a different anon_vma is needed; please refer to the comments
+of ksm_might_need_to_copy() for details.
+
+During swapoff, unuse_vma() uses the anon_vma (if available) to locate the
+VMA and virtual address mapped to the page, so not all mappings of a
+swapped-out KSM page can be found.  So in try_to_unuse(), even if the swap
+count of a swap entry isn't zero, the page needs to be deleted from the
+swap cache, so that in the next round a new page can be allocated and
+swapped in for the other mappings of the swapped-out KSM page.
+
+But this conflicts with THP swap support, where a THP can be deleted from
+the swap cache only after the swap count of every swap entry in the huge
+swap cluster backing the THP has reached 0.  So try_to_unuse() was changed
+in commit e07098294adf ("mm, THP, swap: support to reclaim swap
+space for THP swapped out") to check that before deleting a page from the
+swap cache, but this broke KSM swapoff too.
+
+Fortunately, KSM is for normal pages only, so the original behavior for
+KSM pages can be restored easily by checking PageTransCompound().  That is
+how this patch works.
+
+The bug was introduced by e07098294adf ("mm, THP, swap: support to reclaim
+swap space for THP swapped out"), which was merged in v4.14-rc1.  So I
+think we should backport the fix to 4.14 and later.  But Hugh thinks it
+may be rare for KSM pages to be in the swap device at swapoff time, which
+would explain why nobody has reported the bug so far.
+
+Link: http://lkml.kernel.org/r/20181226051522.28442-1-ying.huang@intel.com
+Fixes: e07098294adf ("mm, THP, swap: support to reclaim swap space for THP swapped out")
+Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
+Reported-by: Hugh Dickins <hughd@google.com>
+Tested-by: Hugh Dickins <hughd@google.com>
+Acked-by: Hugh Dickins <hughd@google.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Shaohua Li <shli@kernel.org>
+Cc: Daniel Jordan <daniel.m.jordan@oracle.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/swapfile.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/mm/swapfile.c
++++ b/mm/swapfile.c
+@@ -2208,7 +2208,8 @@ int try_to_unuse(unsigned int type, bool
+                */
+               if (PageSwapCache(page) &&
+                   likely(page_private(page) == entry.val) &&
+-                  !page_swapped(page))
++                  (!PageTransCompound(page) ||
++                   !swap_page_trans_huge_swapped(si, entry)))
+                       delete_from_swap_cache(compound_head(page));
+               /*
index 5f8231422607d4f8f136608b5bcff9870e73149f..ad3b98af3f35f77c9f395a3cb01dbe12a1b80753 100644 (file)
@@ -76,3 +76,19 @@ serial-sunsu-fix-refcount-leak.patch
 auxdisplay-charlcd-fix-x-y-command-parsing.patch
 scsi-zfcp-fix-posting-too-many-status-read-buffers-leading-to-adapter-shutdown.patch
 scsi-lpfc-do-not-set-queue-page_count-to-0-if-pc_sli4_params.wqpcnt-is-invalid.patch
+fork-record-start_time-late.patch
+zram-fix-double-free-backing-device.patch
+hwpoison-memory_hotplug-allow-hwpoisoned-pages-to-be-offlined.patch
+mm-devm_memremap_pages-mark-devm_memremap_pages-export_symbol_gpl.patch
+mm-devm_memremap_pages-kill-mapping-system-ram-support.patch
+mm-devm_memremap_pages-fix-shutdown-handling.patch
+mm-devm_memremap_pages-add-memory_device_private-support.patch
+mm-hmm-use-devm-semantics-for-hmm_devmem_-add-remove.patch
+mm-hmm-mark-hmm_devmem_-add-add_resource-export_symbol_gpl.patch
+mm-swap-fix-swapoff-with-ksm-pages.patch
+hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch
+hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race.patch
+memcg-oom-notify-on-oom-killer-invocation-from-the-charge-path.patch
+sunrpc-fix-cache_head-leak-due-to-queued-request.patch
+sunrpc-use-svc_net-in-svcauth_gss_-functions.patch
+sunrpc-use-after-free-in-svc_process_common.patch
diff --git a/queue-4.19/sunrpc-fix-cache_head-leak-due-to-queued-request.patch b/queue-4.19/sunrpc-fix-cache_head-leak-due-to-queued-request.patch
new file mode 100644 (file)
index 0000000..203e0c1
--- /dev/null
@@ -0,0 +1,69 @@
+From 4ecd55ea074217473f94cfee21bb72864d39f8d7 Mon Sep 17 00:00:00 2001
+From: Vasily Averin <vvs@virtuozzo.com>
+Date: Wed, 28 Nov 2018 11:45:57 +0300
+Subject: sunrpc: fix cache_head leak due to queued request
+
+From: Vasily Averin <vvs@virtuozzo.com>
+
+commit 4ecd55ea074217473f94cfee21bb72864d39f8d7 upstream.
+
+After commit d202cce8963d, an expired cache_head can be removed from the
+cache_detail's hash.
+
+However, the expired cache_head may be waiting for a reply to a
+previously submitted request. Such a cache_head has an elevated
+refcount and therefore won't be freed after cache_put(freeme).
+
+Because the cache_head was removed from the hash, it cannot be found
+during cache_clean() and is leaked forever, together with the stalled
+cache_request and other held resources.
+
+In our case we noticed it because an entry in the export cache was
+holding a reference on a filesystem.
+
+Fixes: d202cce8963d ("sunrpc: never return expired entries in sunrpc_cache_lookup")
+Cc: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
+Cc: stable@kernel.org # 2.6.35
+Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
+Reviewed-by: NeilBrown <neilb@suse.com>
+Signed-off-by: J. Bruce Fields <bfields@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/sunrpc/cache.c |   10 +++++++++-
+ 1 file changed, 9 insertions(+), 1 deletion(-)
+
+--- a/net/sunrpc/cache.c
++++ b/net/sunrpc/cache.c
+@@ -54,6 +54,11 @@ static void cache_init(struct cache_head
+       h->last_refresh = now;
+ }
++static void cache_fresh_locked(struct cache_head *head, time_t expiry,
++                              struct cache_detail *detail);
++static void cache_fresh_unlocked(struct cache_head *head,
++                              struct cache_detail *detail);
++
+ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
+                                      struct cache_head *key, int hash)
+ {
+@@ -95,6 +100,7 @@ struct cache_head *sunrpc_cache_lookup(s
+                       if (cache_is_expired(detail, tmp)) {
+                               hlist_del_init(&tmp->cache_list);
+                               detail->entries --;
++                              cache_fresh_locked(tmp, 0, detail);
+                               freeme = tmp;
+                               break;
+                       }
+@@ -110,8 +116,10 @@ struct cache_head *sunrpc_cache_lookup(s
+       cache_get(new);
+       write_unlock(&detail->hash_lock);
+-      if (freeme)
++      if (freeme) {
++              cache_fresh_unlocked(freeme, detail);
+               cache_put(freeme, detail);
++      }
+       return new;
+ }
+ EXPORT_SYMBOL_GPL(sunrpc_cache_lookup);
diff --git a/queue-4.19/sunrpc-use-after-free-in-svc_process_common.patch b/queue-4.19/sunrpc-use-after-free-in-svc_process_common.patch
new file mode 100644 (file)
index 0000000..6041048
--- /dev/null
@@ -0,0 +1,166 @@
+From d4b09acf924b84bae77cad090a9d108e70b43643 Mon Sep 17 00:00:00 2001
+From: Vasily Averin <vvs@virtuozzo.com>
+Date: Mon, 24 Dec 2018 14:44:52 +0300
+Subject: sunrpc: use-after-free in svc_process_common()
+
+From: Vasily Averin <vvs@virtuozzo.com>
+
+commit d4b09acf924b84bae77cad090a9d108e70b43643 upstream.
+
+If a node has NFSv4.1+ mounts inside several net namespaces,
+this can lead to a use-after-free in svc_process_common():
+
+svc_process_common()
+        /* Setup reply header */
+        rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp); <<< HERE
+
+svc_process_common() can use an incorrect rqstp->rq_xprt:
+its caller, bc_svc_process(), takes it from serv->sv_bc_xprt.
+The problem is that serv is a global structure but sv_bc_xprt
+is assigned per net namespace.
+
+According to Trond, the whole "let's set up rqstp->rq_xprt
+for the back channel" is nothing but a giant hack in order
+to work around the fact that svc_process_common() uses it
+to find the xpt_ops, and perform a couple of (meaningless
+for the back channel) tests of xpt_flags.
+
+All we really need in svc_process_common() is to be able to run
+rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr()
+
+Bruce J Fields points out that this xpo_prep_reply_hdr() call
+is an awfully roundabout way just to do "svc_putnl(resv, 0);"
+in the TCP case.
+
+This patch does not initialize rqstp->rq_xprt in bc_svc_process();
+it now calls svc_process_common() with rqstp->rq_xprt = NULL.
+
+To adjust the reply header, svc_process_common() now just checks
+rqstp->rq_prot and calls svc_tcp_prep_reply_hdr() in the TCP case.
+
+To handle the rqstp->rq_xprt = NULL case in functions called from
+svc_process_common(), the patch introduces a net namespace pointer,
+svc_rqst->rq_bc_net, and adjusts the SVC_NET() definition.
+Some other functions were also adapted to properly handle this case.
+
+Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
+Cc: stable@vger.kernel.org
+Fixes: 23c20ecd4475 ("NFS: callback up - users counting cleanup")
+Signed-off-by: J. Bruce Fields <bfields@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/sunrpc/svc.h    |    5 ++++-
+ include/trace/events/sunrpc.h |    6 ++++--
+ net/sunrpc/svc.c              |    9 +++++----
+ net/sunrpc/svc_xprt.c         |    5 +++--
+ net/sunrpc/svcsock.c          |    2 +-
+ 5 files changed, 17 insertions(+), 10 deletions(-)
+
+--- a/include/linux/sunrpc/svc.h
++++ b/include/linux/sunrpc/svc.h
+@@ -295,9 +295,12 @@ struct svc_rqst {
+       struct svc_cacherep *   rq_cacherep;    /* cache info */
+       struct task_struct      *rq_task;       /* service thread */
+       spinlock_t              rq_lock;        /* per-request lock */
++      struct net              *rq_bc_net;     /* pointer to backchannel's
++                                               * net namespace
++                                               */
+ };
+-#define SVC_NET(svc_rqst)     (svc_rqst->rq_xprt->xpt_net)
++#define SVC_NET(rqst) (rqst->rq_xprt ? rqst->rq_xprt->xpt_net : rqst->rq_bc_net)
+ /*
+  * Rigorous type checking on sockaddr type conversions
+--- a/include/trace/events/sunrpc.h
++++ b/include/trace/events/sunrpc.h
+@@ -582,7 +582,8 @@ TRACE_EVENT(svc_process,
+               __field(u32, vers)
+               __field(u32, proc)
+               __string(service, name)
+-              __string(addr, rqst->rq_xprt->xpt_remotebuf)
++              __string(addr, rqst->rq_xprt ?
++                       rqst->rq_xprt->xpt_remotebuf : "(null)")
+       ),
+       TP_fast_assign(
+@@ -590,7 +591,8 @@ TRACE_EVENT(svc_process,
+               __entry->vers = rqst->rq_vers;
+               __entry->proc = rqst->rq_proc;
+               __assign_str(service, name);
+-              __assign_str(addr, rqst->rq_xprt->xpt_remotebuf);
++              __assign_str(addr, rqst->rq_xprt ?
++                           rqst->rq_xprt->xpt_remotebuf : "(null)");
+       ),
+       TP_printk("addr=%s xid=0x%08x service=%s vers=%u proc=%u",
+--- a/net/sunrpc/svc.c
++++ b/net/sunrpc/svc.c
+@@ -1172,7 +1172,8 @@ svc_process_common(struct svc_rqst *rqst
+       clear_bit(RQ_DROPME, &rqstp->rq_flags);
+       /* Setup reply header */
+-      rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp);
++      if (rqstp->rq_prot == IPPROTO_TCP)
++              svc_tcp_prep_reply_hdr(rqstp);
+       svc_putu32(resv, rqstp->rq_xid);
+@@ -1244,7 +1245,7 @@ svc_process_common(struct svc_rqst *rqst
+        * for lower versions. RPC_PROG_MISMATCH seems to be the closest
+        * fit.
+        */
+-      if (versp->vs_need_cong_ctrl &&
++      if (versp->vs_need_cong_ctrl && rqstp->rq_xprt &&
+           !test_bit(XPT_CONG_CTRL, &rqstp->rq_xprt->xpt_flags))
+               goto err_bad_vers;
+@@ -1336,7 +1337,7 @@ svc_process_common(struct svc_rqst *rqst
+       return 0;
+  close:
+-      if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
++      if (rqstp->rq_xprt && test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
+               svc_close_xprt(rqstp->rq_xprt);
+       dprintk("svc: svc_process close\n");
+       return 0;
+@@ -1459,10 +1460,10 @@ bc_svc_process(struct svc_serv *serv, st
+       dprintk("svc: %s(%p)\n", __func__, req);
+       /* Build the svc_rqst used by the common processing routine */
+-      rqstp->rq_xprt = serv->sv_bc_xprt;
+       rqstp->rq_xid = req->rq_xid;
+       rqstp->rq_prot = req->rq_xprt->prot;
+       rqstp->rq_server = serv;
++      rqstp->rq_bc_net = req->rq_xprt->xprt_net;
+       rqstp->rq_addrlen = sizeof(req->rq_xprt->addr);
+       memcpy(&rqstp->rq_addr, &req->rq_xprt->addr, rqstp->rq_addrlen);
+--- a/net/sunrpc/svc_xprt.c
++++ b/net/sunrpc/svc_xprt.c
+@@ -469,10 +469,11 @@ out:
+  */
+ void svc_reserve(struct svc_rqst *rqstp, int space)
+ {
++      struct svc_xprt *xprt = rqstp->rq_xprt;
++
+       space += rqstp->rq_res.head[0].iov_len;
+-      if (space < rqstp->rq_reserved) {
+-              struct svc_xprt *xprt = rqstp->rq_xprt;
++      if (xprt && space < rqstp->rq_reserved) {
+               atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved);
+               rqstp->rq_reserved = space;
+--- a/net/sunrpc/svcsock.c
++++ b/net/sunrpc/svcsock.c
+@@ -1198,7 +1198,7 @@ static int svc_tcp_sendto(struct svc_rqs
+ /*
+  * Setup response header. TCP has a 4B record length field.
+  */
+-static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp)
++void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp)
+ {
+       struct kvec *resv = &rqstp->rq_res.head[0];
diff --git a/queue-4.19/sunrpc-use-svc_net-in-svcauth_gss_-functions.patch b/queue-4.19/sunrpc-use-svc_net-in-svcauth_gss_-functions.patch
new file mode 100644 (file)
index 0000000..cf02d59
--- /dev/null
@@ -0,0 +1,56 @@
+From b8be5674fa9a6f3677865ea93f7803c4212f3e10 Mon Sep 17 00:00:00 2001
+From: Vasily Averin <vvs@virtuozzo.com>
+Date: Mon, 24 Dec 2018 14:44:42 +0300
+Subject: sunrpc: use SVC_NET() in svcauth_gss_* functions
+
+From: Vasily Averin <vvs@virtuozzo.com>
+
+commit b8be5674fa9a6f3677865ea93f7803c4212f3e10 upstream.
+
+Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: J. Bruce Fields <bfields@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/sunrpc/auth_gss/svcauth_gss.c |    8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+--- a/net/sunrpc/auth_gss/svcauth_gss.c
++++ b/net/sunrpc/auth_gss/svcauth_gss.c
+@@ -1122,7 +1122,7 @@ static int svcauth_gss_legacy_init(struc
+       struct kvec *resv = &rqstp->rq_res.head[0];
+       struct rsi *rsip, rsikey;
+       int ret;
+-      struct sunrpc_net *sn = net_generic(rqstp->rq_xprt->xpt_net, sunrpc_net_id);
++      struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
+       memset(&rsikey, 0, sizeof(rsikey));
+       ret = gss_read_verf(gc, argv, authp,
+@@ -1233,7 +1233,7 @@ static int svcauth_gss_proxy_init(struct
+       uint64_t handle;
+       int status;
+       int ret;
+-      struct net *net = rqstp->rq_xprt->xpt_net;
++      struct net *net = SVC_NET(rqstp);
+       struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+       memset(&ud, 0, sizeof(ud));
+@@ -1424,7 +1424,7 @@ svcauth_gss_accept(struct svc_rqst *rqst
+       __be32          *rpcstart;
+       __be32          *reject_stat = resv->iov_base + resv->iov_len;
+       int             ret;
+-      struct sunrpc_net *sn = net_generic(rqstp->rq_xprt->xpt_net, sunrpc_net_id);
++      struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
+       dprintk("RPC:       svcauth_gss: argv->iov_len = %zd\n",
+                       argv->iov_len);
+@@ -1714,7 +1714,7 @@ svcauth_gss_release(struct svc_rqst *rqs
+       struct rpc_gss_wire_cred *gc = &gsd->clcred;
+       struct xdr_buf *resbuf = &rqstp->rq_res;
+       int stat = -EINVAL;
+-      struct sunrpc_net *sn = net_generic(rqstp->rq_xprt->xpt_net, sunrpc_net_id);
++      struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
+       if (gc->gc_proc != RPC_GSS_PROC_DATA)
+               goto out;
diff --git a/queue-4.19/zram-fix-double-free-backing-device.patch b/queue-4.19/zram-fix-double-free-backing-device.patch
new file mode 100644 (file)
index 0000000..67acecb
--- /dev/null
@@ -0,0 +1,54 @@
+From 5547932dc67a48713eece4fa4703bfdf0cfcb818 Mon Sep 17 00:00:00 2001
+From: Minchan Kim <minchan@kernel.org>
+Date: Fri, 28 Dec 2018 00:36:37 -0800
+Subject: zram: fix double free backing device
+
+From: Minchan Kim <minchan@kernel.org>
+
+commit 5547932dc67a48713eece4fa4703bfdf0cfcb818 upstream.
+
+If blkdev_get() fails, we shouldn't do blkdev_put().  Otherwise, the
+kernel emits the log below.  This patch fixes it.
+
+  WARNING: CPU: 0 PID: 1893 at fs/block_dev.c:1828 blkdev_put+0x105/0x120
+  Modules linked in:
+  CPU: 0 PID: 1893 Comm: swapoff Not tainted 4.19.0+ #453
+  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
+  RIP: 0010:blkdev_put+0x105/0x120
+  Call Trace:
+    __x64_sys_swapoff+0x46d/0x490
+    do_syscall_64+0x5a/0x190
+    entry_SYSCALL_64_after_hwframe+0x49/0xbe
+  irq event stamp: 4466
+  hardirqs last  enabled at (4465):  __free_pages_ok+0x1e3/0x490
+  hardirqs last disabled at (4466):  trace_hardirqs_off_thunk+0x1a/0x1c
+  softirqs last  enabled at (3420):  __do_softirq+0x333/0x446
+  softirqs last disabled at (3407):  irq_exit+0xd1/0xe0
+
+Link: http://lkml.kernel.org/r/20181127055429.251614-3-minchan@kernel.org
+Signed-off-by: Minchan Kim <minchan@kernel.org>
+Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Reviewed-by: Joey Pabalinas <joeypabalinas@gmail.com>
+Cc: <stable@vger.kernel.org>   [4.14+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/block/zram/zram_drv.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/drivers/block/zram/zram_drv.c
++++ b/drivers/block/zram/zram_drv.c
+@@ -382,8 +382,10 @@ static ssize_t backing_dev_store(struct
+       bdev = bdgrab(I_BDEV(inode));
+       err = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, zram);
+-      if (err < 0)
++      if (err < 0) {
++              bdev = NULL;
+               goto out;
++      }
+       nr_pages = i_size_read(inode) >> PAGE_SHIFT;
+       bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long);