]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
3.0-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 27 Aug 2012 18:43:54 +0000 (11:43 -0700)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 27 Aug 2012 18:43:54 +0000 (11:43 -0700)
added patches:
mm-hugetlbfs-correctly-populate-shared-pmd.patch

queue-3.0/mm-hugetlbfs-correctly-populate-shared-pmd.patch [new file with mode: 0644]
queue-3.0/series

diff --git a/queue-3.0/mm-hugetlbfs-correctly-populate-shared-pmd.patch b/queue-3.0/mm-hugetlbfs-correctly-populate-shared-pmd.patch
new file mode 100644 (file)
index 0000000..db46394
--- /dev/null
@@ -0,0 +1,170 @@
+From eb48c071464757414538c68a6033c8f8c15196f8 Mon Sep 17 00:00:00 2001
+From: Michal Hocko <mhocko@suse.cz>
+Date: Tue, 21 Aug 2012 16:15:52 -0700
+Subject: mm: hugetlbfs: correctly populate shared pmd
+
+From: Michal Hocko <mhocko@suse.cz>
+
+commit eb48c071464757414538c68a6033c8f8c15196f8 upstream.
+
+Each page mapped in a process's address space must be correctly
+accounted for in _mapcount.  Normally the rules for this are
+straightforward but hugetlbfs page table sharing is different.  The page
+table pages at the PMD level are reference counted while the mapcount
+remains the same.
+
+If this accounting is wrong, it causes bugs like this one reported by
+Larry Woodman:
+
+  kernel BUG at mm/filemap.c:135!
+  invalid opcode: 0000 [#1] SMP
+  CPU 22
+  Modules linked in: bridge stp llc sunrpc binfmt_misc dcdbas microcode pcspkr acpi_pad acpi]
+  Pid: 18001, comm: mpitest Tainted: G        W    3.3.0+ #4 Dell Inc. PowerEdge R620/07NDJ2
+  RIP: 0010:[<ffffffff8112cfed>]  [<ffffffff8112cfed>] __delete_from_page_cache+0x15d/0x170
+  Process mpitest (pid: 18001, threadinfo ffff880428972000, task ffff880428b5cc20)
+  Call Trace:
+    delete_from_page_cache+0x40/0x80
+    truncate_hugepages+0x115/0x1f0
+    hugetlbfs_evict_inode+0x18/0x30
+    evict+0x9f/0x1b0
+    iput_final+0xe3/0x1e0
+    iput+0x3e/0x50
+    d_kill+0xf8/0x110
+    dput+0xe2/0x1b0
+    __fput+0x162/0x240
+
+During fork(), copy_hugetlb_page_range() detects if huge_pte_alloc()
+shared page tables with the check dst_pte == src_pte.  The logic is if
+the PMD page is the same, they must be shared.  This assumes that the
+sharing is between the parent and child.  However, if the sharing is
+with a different process entirely then this check fails as in this
+diagram:
+
+  parent
+    |
+    ------------>pmd
+                 src_pte----------> data page
+                                        ^
+  other--------->pmd--------------------|
+                  ^
+  child-----------|
+                 dst_pte
+
+For this situation to occur, it must be possible for Parent and Other to
+have faulted and failed to share page tables with each other.  This is
+possible due to the following style of race.
+
+  PROC A                                          PROC B
+  copy_hugetlb_page_range                         copy_hugetlb_page_range
+    src_pte == huge_pte_offset                      src_pte == huge_pte_offset
+    !src_pte so no sharing                          !src_pte so no sharing
+
+  (time passes)
+
+  hugetlb_fault                                   hugetlb_fault
+    huge_pte_alloc                                  huge_pte_alloc
+      huge_pmd_share                                 huge_pmd_share
+        LOCK(i_mmap_mutex)
+        find nothing, no sharing
+        UNLOCK(i_mmap_mutex)
+                                                      LOCK(i_mmap_mutex)
+                                                      find nothing, no sharing
+                                                      UNLOCK(i_mmap_mutex)
+      pmd_alloc                                       pmd_alloc
+      LOCK(instantiation_mutex)
+      fault
+      UNLOCK(instantiation_mutex)
+                                                  LOCK(instantiation_mutex)
+                                                  fault
+                                                  UNLOCK(instantiation_mutex)
+
+These two processes are now pointing to the same data page but are not
+sharing page tables because the opportunity was missed.  When either
+process later forks, the src_pte == dst_pte is potentially insufficient.
+As the check falls through, the wrong PTE information is copied in
+(harmless but wrong) and the mapcount is bumped for a page mapped by a
+shared page table leading to the BUG_ON.
+
+This patch addresses the issue by moving pmd_alloc into huge_pmd_share
+which guarantees that the shared pud is populated in the same critical
+section as pmd.  This also means that huge_pte_offset test in
+huge_pmd_share is serialized correctly now which in turn means that the
+success of the sharing will be higher as the racing tasks see the pud
+and pmd populated together.
+
+Race identified and changelog written mostly by Mel Gorman.
+
+[akpm@linux-foundation.org: attempt to make the huge_pmd_share() comment comprehensible, clean up coding style]
+Reported-by: Larry Woodman <lwoodman@redhat.com>
+Tested-by: Larry Woodman <lwoodman@redhat.com>
+Reviewed-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Michal Hocko <mhocko@suse.cz>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: David Gibson <david@gibson.dropbear.id.au>
+Cc: Ken Chen <kenchen@google.com>
+Cc: Cong Wang <xiyou.wangcong@gmail.com>
+Cc: Hillf Danton <dhillf@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/mm/hugetlbpage.c |   21 ++++++++++++++++-----
+ 1 file changed, 16 insertions(+), 5 deletions(-)
+
+--- a/arch/x86/mm/hugetlbpage.c
++++ b/arch/x86/mm/hugetlbpage.c
+@@ -56,9 +56,16 @@ static int vma_shareable(struct vm_area_
+ }
+ /*
+- * search for a shareable pmd page for hugetlb.
++ * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
++ * and returns the corresponding pte. While this is not necessary for the
++ * !shared pmd case because we can allocate the pmd later as well, it makes the
++ * code much cleaner. pmd allocation is essential for the shared case because
++ * pud has to be populated inside the same i_mmap_mutex section - otherwise
++ * racing tasks could either miss the sharing (see huge_pte_offset) or select a
++ * bad pmd for sharing.
+  */
+-static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
++static pte_t *
++huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+ {
+       struct vm_area_struct *vma = find_vma(mm, addr);
+       struct address_space *mapping = vma->vm_file->f_mapping;
+@@ -68,9 +75,10 @@ static void huge_pmd_share(struct mm_str
+       struct vm_area_struct *svma;
+       unsigned long saddr;
+       pte_t *spte = NULL;
++      pte_t *pte;
+       if (!vma_shareable(vma, addr))
+-              return;
++              return (pte_t *)pmd_alloc(mm, pud, addr);
+       mutex_lock(&mapping->i_mmap_mutex);
+       vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
+@@ -97,7 +105,9 @@ static void huge_pmd_share(struct mm_str
+               put_page(virt_to_page(spte));
+       spin_unlock(&mm->page_table_lock);
+ out:
++      pte = (pte_t *)pmd_alloc(mm, pud, addr);
+       mutex_unlock(&mapping->i_mmap_mutex);
++      return pte;
+ }
+ /*
+@@ -142,8 +152,9 @@ pte_t *huge_pte_alloc(struct mm_struct *
+               } else {
+                       BUG_ON(sz != PMD_SIZE);
+                       if (pud_none(*pud))
+-                              huge_pmd_share(mm, addr, pud);
+-                      pte = (pte_t *) pmd_alloc(mm, pud, addr);
++                              pte = huge_pmd_share(mm, addr, pud);
++                      else
++                              pte = (pte_t *)pmd_alloc(mm, pud, addr);
+               }
+       }
+       BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
index 0395acbab0f398b869bf53f962b6c3ac97d42371..8fadd85eee3acd0700c30fbfa9d2a1c91e023a87 100644 (file)
@@ -11,3 +11,4 @@ vfs-missed-source-of-f_pos-races.patch
 vfs-canonicalize-create-mode-in-build_open_flags.patch
 alpha-don-t-export-sock_nonblock-to-user-space.patch
 usb-winbond-remove-__devinit-from-the-struct-usb_device_id-table.patch
+mm-hugetlbfs-correctly-populate-shared-pmd.patch