From: Greg Kroah-Hartman Date: Sun, 26 May 2013 00:24:30 +0000 (+0900) Subject: 3.4-stable patches X-Git-Tag: v3.0.81~32 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=36023c9a9380087b5ba79eecf9d511e6a5621cc1;p=thirdparty%2Fkernel%2Fstable-queue.git 3.4-stable patches added patches: drivers-block-brd.c-fix-brd_lookup_page-race.patch drivers-leds-leds-ot200.c-fix-error-caused-by-shifted-mask.patch kirkwood-enable-pcie-port-1-on-qnap-ts-11x-ts-21x.patch klist-del-waiter-from-klist_remove_waiters-before-wakeup-waitting-process.patch mm-compaction-fix-of-improper-cache-flush-in-migration-code.patch mm-mmu_notifier-re-fix-freed-page-still-mapped-in-secondary-mmu.patch mm-pagewalk.c-walk_page_range-should-avoid-vm_pfnmap-areas.patch mm-thp-use-pmd_populate-to-update-the-pmd-with-pgtable_t-pointer.patch nilfs2-fix-issue-of-nilfs_set_page_dirty-for-page-at-eof-boundary.patch ocfs2-goto-out_unlock-if-ocfs2_get_clusters_nocache-failed-in-ocfs2_fiemap.patch perf-net_dropmonitor-fix-symbol-relative-addresses.patch perf-net_dropmonitor-fix-trace-parameter-order.patch tg3-fix-data-corruption-on-5725-with-tso.patch wait-fix-false-timeouts-when-using-wait_event_timeout.patch --- diff --git a/queue-3.4/drivers-block-brd.c-fix-brd_lookup_page-race.patch b/queue-3.4/drivers-block-brd.c-fix-brd_lookup_page-race.patch new file mode 100644 index 00000000000..1d5c97f0091 --- /dev/null +++ b/queue-3.4/drivers-block-brd.c-fix-brd_lookup_page-race.patch @@ -0,0 +1,43 @@ +From dfd20b2b174d3a9b258ea3b7a35ead33576587b1 Mon Sep 17 00:00:00 2001 +From: Brian Behlendorf +Date: Fri, 24 May 2013 15:55:28 -0700 +Subject: drivers/block/brd.c: fix brd_lookup_page() race + +From: Brian Behlendorf + +commit dfd20b2b174d3a9b258ea3b7a35ead33576587b1 upstream. + +The index on the page must be set before it is inserted in the radix +tree. Otherwise there is a small race which can occur during lookup +where the page can be found with the incorrect index. 
This will trigger +the BUG_ON() in brd_lookup_page(). + +Signed-off-by: Brian Behlendorf +Reported-by: Chris Wedgwood +Cc: Jens Axboe +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/block/brd.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/block/brd.c ++++ b/drivers/block/brd.c +@@ -117,13 +117,13 @@ static struct page *brd_insert_page(stru + + spin_lock(&brd->brd_lock); + idx = sector >> PAGE_SECTORS_SHIFT; ++ page->index = idx; + if (radix_tree_insert(&brd->brd_pages, idx, page)) { + __free_page(page); + page = radix_tree_lookup(&brd->brd_pages, idx); + BUG_ON(!page); + BUG_ON(page->index != idx); +- } else +- page->index = idx; ++ } + spin_unlock(&brd->brd_lock); + + radix_tree_preload_end(); diff --git a/queue-3.4/drivers-leds-leds-ot200.c-fix-error-caused-by-shifted-mask.patch b/queue-3.4/drivers-leds-leds-ot200.c-fix-error-caused-by-shifted-mask.patch new file mode 100644 index 00000000000..2a43085da35 --- /dev/null +++ b/queue-3.4/drivers-leds-leds-ot200.c-fix-error-caused-by-shifted-mask.patch @@ -0,0 +1,74 @@ +From 4b949b8af12e24b8a48fa5bb775a13b558d9f4da Mon Sep 17 00:00:00 2001 +From: Christian Gmeiner +Date: Fri, 24 May 2013 15:55:22 -0700 +Subject: drivers/leds/leds-ot200.c: fix error caused by shifted mask + +From: Christian Gmeiner + +commit 4b949b8af12e24b8a48fa5bb775a13b558d9f4da upstream. + +During the development of this driver an in-house register documentation +was used. The last week some integration tests were done and this +problem was found. It turned out that the released register +documentation is wrong. + +The fix is very simple: shift all masks by one. 
+ +Signed-off-by: Christian Gmeiner +Cc: Bryan Wu +Cc: Sebastian Andrzej Siewior +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/leds/leds-ot200.c | 14 +++++++------- + 1 file changed, 7 insertions(+), 7 deletions(-) + +--- a/drivers/leds/leds-ot200.c ++++ b/drivers/leds/leds-ot200.c +@@ -47,37 +47,37 @@ static struct ot200_led leds[] = { + { + .name = "led_1", + .port = 0x49, +- .mask = BIT(7), ++ .mask = BIT(6), + }, + { + .name = "led_2", + .port = 0x49, +- .mask = BIT(6), ++ .mask = BIT(5), + }, + { + .name = "led_3", + .port = 0x49, +- .mask = BIT(5), ++ .mask = BIT(4), + }, + { + .name = "led_4", + .port = 0x49, +- .mask = BIT(4), ++ .mask = BIT(3), + }, + { + .name = "led_5", + .port = 0x49, +- .mask = BIT(3), ++ .mask = BIT(2), + }, + { + .name = "led_6", + .port = 0x49, +- .mask = BIT(2), ++ .mask = BIT(1), + }, + { + .name = "led_7", + .port = 0x49, +- .mask = BIT(1), ++ .mask = BIT(0), + } + }; + diff --git a/queue-3.4/kirkwood-enable-pcie-port-1-on-qnap-ts-11x-ts-21x.patch b/queue-3.4/kirkwood-enable-pcie-port-1-on-qnap-ts-11x-ts-21x.patch new file mode 100644 index 00000000000..4913bb52e78 --- /dev/null +++ b/queue-3.4/kirkwood-enable-pcie-port-1-on-qnap-ts-11x-ts-21x.patch @@ -0,0 +1,34 @@ +From 99e11334dcb846f9b76fb808196c7f47aa83abb3 Mon Sep 17 00:00:00 2001 +From: Martin Michlmayr +Date: Sun, 21 Apr 2013 17:14:00 +0100 +Subject: Kirkwood: Enable PCIe port 1 on QNAP TS-11x/TS-21x + +From: Martin Michlmayr + +commit 99e11334dcb846f9b76fb808196c7f47aa83abb3 upstream. + +Enable KW_PCIE1 on QNAP TS-11x/TS-21x devices as newer revisions +(rev 1.3) have a USB 3.0 chip from Etron on PCIe port 1. Thanks +to Marek Vasut for identifying this issue! 
+ +Signed-off-by: Martin Michlmayr +Tested-by: Marek Vasut +Acked-by: Andrew Lunn +Signed-off-by: Jason Cooper +Signed-off-by: Greg Kroah-Hartman + +--- + arch/arm/mach-kirkwood/ts219-setup.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/arm/mach-kirkwood/ts219-setup.c ++++ b/arch/arm/mach-kirkwood/ts219-setup.c +@@ -124,7 +124,7 @@ static void __init qnap_ts219_init(void) + static int __init ts219_pci_init(void) + { + if (machine_is_ts219()) +- kirkwood_pcie_init(KW_PCIE0); ++ kirkwood_pcie_init(KW_PCIE1 | KW_PCIE0); + + return 0; + } diff --git a/queue-3.4/klist-del-waiter-from-klist_remove_waiters-before-wakeup-waitting-process.patch b/queue-3.4/klist-del-waiter-from-klist_remove_waiters-before-wakeup-waitting-process.patch new file mode 100644 index 00000000000..9545f0a9a95 --- /dev/null +++ b/queue-3.4/klist-del-waiter-from-klist_remove_waiters-before-wakeup-waitting-process.patch @@ -0,0 +1,40 @@ +From ac5a2962b02f57dea76d314ef2521a2170b28ab6 Mon Sep 17 00:00:00 2001 +From: "wang, biao" +Date: Thu, 16 May 2013 09:50:13 +0800 +Subject: klist: del waiter from klist_remove_waiters before wakeup waitting process + +From: "wang, biao" + +commit ac5a2962b02f57dea76d314ef2521a2170b28ab6 upstream. + +There is a race between klist_remove and klist_release. klist_remove +uses a local var waiter saved on stack. When klist_release calls +wake_up_process(waiter->process) to wake up the waiter, waiter might run +immediately and reuse the stack. Then, klist_release calls +list_del(&waiter->list) to change previous +wait data and cause prior waiter thread corrupt. + +The patch fixes it against kernel 3.9. 
+ +Signed-off-by: wang, biao +Acked-by: Peter Zijlstra +Signed-off-by: Greg Kroah-Hartman + +--- + lib/klist.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/lib/klist.c ++++ b/lib/klist.c +@@ -193,10 +193,10 @@ static void klist_release(struct kref *k + if (waiter->node != n) + continue; + ++ list_del(&waiter->list); + waiter->woken = 1; + mb(); + wake_up_process(waiter->process); +- list_del(&waiter->list); + } + spin_unlock(&klist_remove_lock); + knode_set_klist(n, NULL); diff --git a/queue-3.4/mm-compaction-fix-of-improper-cache-flush-in-migration-code.patch b/queue-3.4/mm-compaction-fix-of-improper-cache-flush-in-migration-code.patch new file mode 100644 index 00000000000..2fb2282c3ec --- /dev/null +++ b/queue-3.4/mm-compaction-fix-of-improper-cache-flush-in-migration-code.patch @@ -0,0 +1,55 @@ +From c2cc499c5bcf9040a738f49e8051b42078205748 Mon Sep 17 00:00:00 2001 +From: Leonid Yegoshin +Date: Fri, 24 May 2013 15:55:18 -0700 +Subject: mm compaction: fix of improper cache flush in migration code + +From: Leonid Yegoshin + +commit c2cc499c5bcf9040a738f49e8051b42078205748 upstream. + +Page 'new' during MIGRATION can't be flushed with flush_cache_page(). +Using flush_cache_page(vma, addr, pfn) is justified only if the page is +already placed in process page table, and that is done right after +flush_cache_page(). But without it the arch function has no knowledge +of process PTE and does nothing. + +Besides that, flush_cache_page() flushes an application cache page, but +the kernel has a different page virtual address and dirtied it. + +Replace it with flush_dcache_page(new) which is the proper usage. + +The old page is flushed in try_to_unmap_one() before migration. + +This bug takes place in Sead3 board with M14Kc MIPS CPU without cache +aliasing (but Harvard arch - separate I and D cache) in tight memory +environment (128MB) each 1-3days on SOAK test. 
It fails in cc1 during +kernel build (SIGILL, SIGBUS, SIGSEG) if CONFIG_COMPACTION is switched +ON. + +Signed-off-by: Leonid Yegoshin +Cc: Leonid Yegoshin +Acked-by: Rik van Riel +Cc: Michal Hocko +Acked-by: Mel Gorman +Cc: Ralf Baechle +Cc: Russell King +Cc: David Miller +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/migrate.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/mm/migrate.c ++++ b/mm/migrate.c +@@ -145,7 +145,7 @@ static int remove_migration_pte(struct p + if (PageHuge(new)) + pte = pte_mkhuge(pte); + #endif +- flush_cache_page(vma, addr, pte_pfn(pte)); ++ flush_dcache_page(new); + set_pte_at(mm, addr, ptep, pte); + + if (PageHuge(new)) { diff --git a/queue-3.4/mm-mmu_notifier-re-fix-freed-page-still-mapped-in-secondary-mmu.patch b/queue-3.4/mm-mmu_notifier-re-fix-freed-page-still-mapped-in-secondary-mmu.patch new file mode 100644 index 00000000000..0b7d220a591 --- /dev/null +++ b/queue-3.4/mm-mmu_notifier-re-fix-freed-page-still-mapped-in-secondary-mmu.patch @@ -0,0 +1,181 @@ +From d34883d4e35c0a994e91dd847a82b4c9e0c31d83 Mon Sep 17 00:00:00 2001 +From: Xiao Guangrong +Date: Fri, 24 May 2013 15:55:11 -0700 +Subject: mm: mmu_notifier: re-fix freed page still mapped in secondary MMU + +From: Xiao Guangrong + +commit d34883d4e35c0a994e91dd847a82b4c9e0c31d83 upstream. + +Commit 751efd8610d3 ("mmu_notifier_unregister NULL Pointer deref and +multiple ->release()") breaks the fix 3ad3d901bbcf ("mm: mmu_notifier: +fix freed page still mapped in secondary MMU"). + +Since hlist_for_each_entry_rcu() is changed now, we can not revert that +patch directly, so this patch reverts the commit and simply fix the bug +spotted by that patch + +This bug spotted by commit 751efd8610d3 is: + + There is a race condition between mmu_notifier_unregister() and + __mmu_notifier_release(). 
+ + Assume two tasks, one calling mmu_notifier_unregister() as a result + of a filp_close() ->flush() callout (task A), and the other calling + mmu_notifier_release() from an mmput() (task B). + + A B + t1 srcu_read_lock() + t2 if (!hlist_unhashed()) + t3 srcu_read_unlock() + t4 srcu_read_lock() + t5 hlist_del_init_rcu() + t6 synchronize_srcu() + t7 srcu_read_unlock() + t8 hlist_del_rcu() <--- NULL pointer deref. + +This can be fixed by using hlist_del_init_rcu instead of hlist_del_rcu. + +The another issue spotted in the commit is "multiple ->release() +callouts", we needn't care it too much because it is really rare (e.g, +can not happen on kvm since mmu-notify is unregistered after +exit_mmap()) and the later call of multiple ->release should be fast +since all the pages have already been released by the first call. +Anyway, this issue should be fixed in a separate patch. + +-stable suggestions: Any version that has commit 751efd8610d3 need to be +backported. I find the oldest version has this commit is 3.0-stable. + +[akpm@linux-foundation.org: tweak comments] +Signed-off-by: Xiao Guangrong +Tested-by: Robin Holt +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/mmu_notifier.c | 79 ++++++++++++++++++++++++++---------------------------- + 1 file changed, 39 insertions(+), 40 deletions(-) + +--- a/mm/mmu_notifier.c ++++ b/mm/mmu_notifier.c +@@ -40,48 +40,44 @@ void __mmu_notifier_release(struct mm_st + int id; + + /* +- * srcu_read_lock() here will block synchronize_srcu() in +- * mmu_notifier_unregister() until all registered +- * ->release() callouts this function makes have +- * returned. ++ * SRCU here will block mmu_notifier_unregister until ++ * ->release returns. 
+ */ + id = srcu_read_lock(&srcu); ++ hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) ++ /* ++ * If ->release runs before mmu_notifier_unregister it must be ++ * handled, as it's the only way for the driver to flush all ++ * existing sptes and stop the driver from establishing any more ++ * sptes before all the pages in the mm are freed. ++ */ ++ if (mn->ops->release) ++ mn->ops->release(mn, mm); ++ srcu_read_unlock(&srcu, id); ++ + spin_lock(&mm->mmu_notifier_mm->lock); + while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { + mn = hlist_entry(mm->mmu_notifier_mm->list.first, + struct mmu_notifier, + hlist); +- + /* +- * Unlink. This will prevent mmu_notifier_unregister() +- * from also making the ->release() callout. ++ * We arrived before mmu_notifier_unregister so ++ * mmu_notifier_unregister will do nothing other than to wait ++ * for ->release to finish and for mmu_notifier_unregister to ++ * return. + */ + hlist_del_init_rcu(&mn->hlist); +- spin_unlock(&mm->mmu_notifier_mm->lock); +- +- /* +- * Clear sptes. (see 'release' description in mmu_notifier.h) +- */ +- if (mn->ops->release) +- mn->ops->release(mn, mm); +- +- spin_lock(&mm->mmu_notifier_mm->lock); + } + spin_unlock(&mm->mmu_notifier_mm->lock); + + /* +- * All callouts to ->release() which we have done are complete. +- * Allow synchronize_srcu() in mmu_notifier_unregister() to complete +- */ +- srcu_read_unlock(&srcu, id); +- +- /* +- * mmu_notifier_unregister() may have unlinked a notifier and may +- * still be calling out to it. Additionally, other notifiers +- * may have been active via vmtruncate() et. al. Block here +- * to ensure that all notifier callouts for this mm have been +- * completed and the sptes are really cleaned up before returning +- * to exit_mmap(). 
++ * synchronize_srcu here prevents mmu_notifier_release from returning to ++ * exit_mmap (which would proceed with freeing all pages in the mm) ++ * until the ->release method returns, if it was invoked by ++ * mmu_notifier_unregister. ++ * ++ * The mmu_notifier_mm can't go away from under us because one mm_count ++ * is held by exit_mmap. + */ + synchronize_srcu(&srcu); + } +@@ -302,31 +298,34 @@ void mmu_notifier_unregister(struct mmu_ + { + BUG_ON(atomic_read(&mm->mm_count) <= 0); + +- spin_lock(&mm->mmu_notifier_mm->lock); + if (!hlist_unhashed(&mn->hlist)) { ++ /* ++ * SRCU here will force exit_mmap to wait for ->release to ++ * finish before freeing the pages. ++ */ + int id; + ++ id = srcu_read_lock(&srcu); + /* +- * Ensure we synchronize up with __mmu_notifier_release(). ++ * exit_mmap will block in mmu_notifier_release to guarantee ++ * that ->release is called before freeing the pages. + */ +- id = srcu_read_lock(&srcu); +- +- hlist_del_rcu(&mn->hlist); +- spin_unlock(&mm->mmu_notifier_mm->lock); +- + if (mn->ops->release) + mn->ops->release(mn, mm); ++ srcu_read_unlock(&srcu, id); + ++ spin_lock(&mm->mmu_notifier_mm->lock); + /* +- * Allow __mmu_notifier_release() to complete. ++ * Can not use list_del_rcu() since __mmu_notifier_release ++ * can delete it before we hold the lock. + */ +- srcu_read_unlock(&srcu, id); +- } else ++ hlist_del_init_rcu(&mn->hlist); + spin_unlock(&mm->mmu_notifier_mm->lock); ++ } + + /* +- * Wait for any running method to finish, including ->release() if it +- * was run by __mmu_notifier_release() instead of us. ++ * Wait for any running method to finish, of course including ++ * ->release if it was run by mmu_notifier_relase instead of us. 
+ */ + synchronize_srcu(&srcu); + diff --git a/queue-3.4/mm-pagewalk.c-walk_page_range-should-avoid-vm_pfnmap-areas.patch b/queue-3.4/mm-pagewalk.c-walk_page_range-should-avoid-vm_pfnmap-areas.patch new file mode 100644 index 00000000000..4be7e2d65fb --- /dev/null +++ b/queue-3.4/mm-pagewalk.c-walk_page_range-should-avoid-vm_pfnmap-areas.patch @@ -0,0 +1,151 @@ +From a9ff785e4437c83d2179161e012f5bdfbd6381f0 Mon Sep 17 00:00:00 2001 +From: Cliff Wickman +Date: Fri, 24 May 2013 15:55:36 -0700 +Subject: mm/pagewalk.c: walk_page_range should avoid VM_PFNMAP areas + +From: Cliff Wickman + +commit a9ff785e4437c83d2179161e012f5bdfbd6381f0 upstream. + +A panic can be caused by simply cat'ing /proc/<pid>/smaps while an +application has a VM_PFNMAP range. It happened in-house when a +benchmarker was trying to decipher the memory layout of his program. + +/proc/<pid>/smaps and similar walks through a user page table should not +be looking at VM_PFNMAP areas. + +Certain tests in walk_page_range() (specifically split_huge_page_pmd()) +assume that all the mapped PFN's are backed with page structures. And +this is not usually true for VM_PFNMAP areas. This can result in panics +on kernel page faults when attempting to address those page structures. + +There are a half dozen callers of walk_page_range() that walk through a +task's entire page table (as N. Horiguchi pointed out). So rather than +change all of them, this patch changes just walk_page_range() to ignore +VM_PFNMAP areas. + +The logic of hugetlb_vma() is moved back into walk_page_range(), as we +want to test any vma in the range. 
+ +VM_PFNMAP areas are used by: +- graphics memory manager gpu/drm/drm_gem.c +- global reference unit sgi-gru/grufile.c +- sgi special memory char/mspec.c +- and probably several out-of-tree modules + +[akpm@linux-foundation.org: remove now-unused hugetlb_vma() stub] +Signed-off-by: Cliff Wickman +Reviewed-by: Naoya Horiguchi +Cc: Mel Gorman +Cc: Andrea Arcangeli +Cc: Dave Hansen +Cc: David Sterba +Cc: Johannes Weiner +Cc: KOSAKI Motohiro +Cc: "Kirill A. Shutemov" +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/pagewalk.c | 70 +++++++++++++++++++++++++++++----------------------------- + 1 file changed, 36 insertions(+), 34 deletions(-) + +--- a/mm/pagewalk.c ++++ b/mm/pagewalk.c +@@ -127,28 +127,7 @@ static int walk_hugetlb_range(struct vm_ + return 0; + } + +-static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) +-{ +- struct vm_area_struct *vma; +- +- /* We don't need vma lookup at all. */ +- if (!walk->hugetlb_entry) +- return NULL; +- +- VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); +- vma = find_vma(walk->mm, addr); +- if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma)) +- return vma; +- +- return NULL; +-} +- + #else /* CONFIG_HUGETLB_PAGE */ +-static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) +-{ +- return NULL; +-} +- + static int walk_hugetlb_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +@@ -199,30 +178,53 @@ int walk_page_range(unsigned long addr, + if (!walk->mm) + return -EINVAL; + ++ VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); ++ + pgd = pgd_offset(walk->mm, addr); + do { +- struct vm_area_struct *vma; ++ struct vm_area_struct *vma = NULL; + + next = pgd_addr_end(addr, end); + + /* +- * handle hugetlb vma individually because pagetable walk for +- * the hugetlb page is dependent on the architecture and +- * we can't handled it in the same manner as non-huge 
pages. ++ * This function was not intended to be vma based. ++ * But there are vma special cases to be handled: ++ * - hugetlb vma's ++ * - VM_PFNMAP vma's + */ +- vma = hugetlb_vma(addr, walk); ++ vma = find_vma(walk->mm, addr); + if (vma) { +- if (vma->vm_end < next) ++ /* ++ * There are no page structures backing a VM_PFNMAP ++ * range, so do not allow split_huge_page_pmd(). ++ */ ++ if ((vma->vm_start <= addr) && ++ (vma->vm_flags & VM_PFNMAP)) { + next = vma->vm_end; ++ pgd = pgd_offset(walk->mm, next); ++ continue; ++ } + /* +- * Hugepage is very tightly coupled with vma, so +- * walk through hugetlb entries within a given vma. ++ * Handle hugetlb vma individually because pagetable ++ * walk for the hugetlb page is dependent on the ++ * architecture and we can't handled it in the same ++ * manner as non-huge pages. + */ +- err = walk_hugetlb_range(vma, addr, next, walk); +- if (err) +- break; +- pgd = pgd_offset(walk->mm, next); +- continue; ++ if (walk->hugetlb_entry && (vma->vm_start <= addr) && ++ is_vm_hugetlb_page(vma)) { ++ if (vma->vm_end < next) ++ next = vma->vm_end; ++ /* ++ * Hugepage is very tightly coupled with vma, ++ * so walk through hugetlb entries within a ++ * given vma. 
++ */ ++ err = walk_hugetlb_range(vma, addr, next, walk); ++ if (err) ++ break; ++ pgd = pgd_offset(walk->mm, next); ++ continue; ++ } + } + + if (pgd_none_or_clear_bad(pgd)) { diff --git a/queue-3.4/mm-thp-use-pmd_populate-to-update-the-pmd-with-pgtable_t-pointer.patch b/queue-3.4/mm-thp-use-pmd_populate-to-update-the-pmd-with-pgtable_t-pointer.patch new file mode 100644 index 00000000000..9267e615470 --- /dev/null +++ b/queue-3.4/mm-thp-use-pmd_populate-to-update-the-pmd-with-pgtable_t-pointer.patch @@ -0,0 +1,45 @@ +From 7c3425123ddfdc5f48e7913ff59d908789712b18 Mon Sep 17 00:00:00 2001 +From: "Aneesh Kumar K.V" +Date: Fri, 24 May 2013 15:55:21 -0700 +Subject: mm/THP: use pmd_populate() to update the pmd with pgtable_t pointer + +From: "Aneesh Kumar K.V" + +commit 7c3425123ddfdc5f48e7913ff59d908789712b18 upstream. + +We should not use set_pmd_at to update pmd_t with pgtable_t pointer. +set_pmd_at is used to set pmd with huge pte entries and architectures +like ppc64, clear few flags from the pte when saving a new entry. +Without this change we observe bad pte errors like below on ppc64 with +THP enabled. + + BUG: Bad page map in process ld mm=0xc000001ee39f4780 pte:7fc3f37848000001 pmd:c000001ec0000000 + +Signed-off-by: Aneesh Kumar K.V +Cc: Hugh Dickins +Cc: Benjamin Herrenschmidt +Reviewed-by: Andrea Arcangeli +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/huge_memory.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -1949,7 +1949,12 @@ static void collapse_huge_page(struct mm + pte_unmap(pte); + spin_lock(&mm->page_table_lock); + BUG_ON(!pmd_none(*pmd)); +- set_pmd_at(mm, address, pmd, _pmd); ++ /* ++ * We can only use set_pmd_at when establishing ++ * hugepmds and never for establishing regular pmds that ++ * points to regular pagetables. 
Use pmd_populate for that ++ */ ++ pmd_populate(mm, pmd, pmd_pgtable(_pmd)); + spin_unlock(&mm->page_table_lock); + anon_vma_unlock(vma->anon_vma); + goto out; diff --git a/queue-3.4/nilfs2-fix-issue-of-nilfs_set_page_dirty-for-page-at-eof-boundary.patch b/queue-3.4/nilfs2-fix-issue-of-nilfs_set_page_dirty-for-page-at-eof-boundary.patch new file mode 100644 index 00000000000..172dadf0d97 --- /dev/null +++ b/queue-3.4/nilfs2-fix-issue-of-nilfs_set_page_dirty-for-page-at-eof-boundary.patch @@ -0,0 +1,153 @@ +From 136e8770cd5d1fe38b3c613100dd6dc4db6d4fa6 Mon Sep 17 00:00:00 2001 +From: Ryusuke Konishi +Date: Fri, 24 May 2013 15:55:29 -0700 +Subject: nilfs2: fix issue of nilfs_set_page_dirty() for page at EOF boundary + +From: Ryusuke Konishi + +commit 136e8770cd5d1fe38b3c613100dd6dc4db6d4fa6 upstream. + +nilfs2: fix issue of nilfs_set_page_dirty for page at EOF boundary + +DESCRIPTION: + There are use-cases when NILFS2 file system (formatted with block size +lesser than 4 KB) can be remounted in RO mode because of encountering of +"broken bmap" issue. + +The issue was reported by Anthony Doggett : + "The machine I've been trialling nilfs on is running Debian Testing, + Linux version 3.2.0-4-686-pae (debian-kernel@lists.debian.org) (gcc + version 4.6.3 (Debian 4.6.3-14) ) #1 SMP Debian 3.2.35-2), but I've + also reproduced it (identically) with Debian Unstable amd64 and Debian + Experimental (using the 3.8-trunk kernel). The problematic partitions + were formatted with "mkfs.nilfs2 -b 1024 -B 8192"." + +SYMPTOMS: +(1) System log contains error messages likewise: + + [63102.496756] nilfs_direct_assign: invalid pointer: 0 + [63102.496786] NILFS error (device dm-17): nilfs_bmap_assign: broken bmap (inode number=28) + [63102.496798] + [63102.524403] Remounting filesystem read-only + +(2) The NILFS2 file system is remounted in RO mode. + +REPRODUSING PATH: +(1) Create volume group with name "unencrypted" by means of vgcreate utility. 
+(2) Run script (prepared by Anthony Doggett ): + +----------------[BEGIN SCRIPT]-------------------- + +VG=unencrypted +lvcreate --size 2G --name ntest $VG +mkfs.nilfs2 -b 1024 -B 8192 /dev/mapper/$VG-ntest +mkdir /var/tmp/n +mkdir /var/tmp/n/ntest +mount /dev/mapper/$VG-ntest /var/tmp/n/ntest +mkdir /var/tmp/n/ntest/thedir +cd /var/tmp/n/ntest/thedir +sleep 2 +date +darcs init +sleep 2 +dmesg|tail -n 5 +date +darcs whatsnew || true +date +sleep 2 +dmesg|tail -n 5 +----------------[END SCRIPT]-------------------- + +REPRODUCIBILITY: 100% + +INVESTIGATION: +As it was discovered, the issue takes place during segment +construction after executing such sequence of user-space operations: + + open("_darcs/index", O_RDWR|O_CREAT|O_NOCTTY, 0666) = 7 + fstat(7, {st_mode=S_IFREG|0644, st_size=0, ...}) = 0 + ftruncate(7, 60) + +The error message "NILFS error (device dm-17): nilfs_bmap_assign: broken +bmap (inode number=28)" takes place because of trying to get block +number for third block of the file with logical offset #3072 bytes. As +it is possible to see from above output, the file has 60 bytes of the +whole size. So, it is enough one block (1 KB in size) allocation for +the whole file. Trying to operate with several blocks instead of one +takes place because of discovering several dirty buffers for this file +in nilfs_segctor_scan_file() method. + +The root cause of this issue is in nilfs_set_page_dirty function which +is called just before writing to an mmapped page. + +When nilfs_page_mkwrite function handles a page at EOF boundary, it +fills hole blocks only inside EOF through __block_page_mkwrite(). + +The __block_page_mkwrite() function calls set_page_dirty() after filling +hole blocks, thus nilfs_set_page_dirty function (= +a_ops->set_page_dirty) is called. However, the current implementation +of nilfs_set_page_dirty() wrongly marks all buffers dirty even for page +at EOF boundary. 
+ +As a result, buffers outside EOF are inconsistently marked dirty and +queued for write even though they are not mapped with nilfs_get_block +function. + +FIX: +This modifies nilfs_set_page_dirty() not to mark hole blocks dirty. + +Thanks to Vyacheslav Dubeyko for his effort on analysis and proposals +for this issue. + +Signed-off-by: Ryusuke Konishi +Reported-by: Anthony Doggett +Reported-by: Vyacheslav Dubeyko +Cc: Vyacheslav Dubeyko +Tested-by: Ryusuke Konishi +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/nilfs2/inode.c | 27 +++++++++++++++++++++++---- + 1 file changed, 23 insertions(+), 4 deletions(-) + +--- a/fs/nilfs2/inode.c ++++ b/fs/nilfs2/inode.c +@@ -195,13 +195,32 @@ static int nilfs_writepage(struct page * + + static int nilfs_set_page_dirty(struct page *page) + { +- int ret = __set_page_dirty_buffers(page); ++ int ret = __set_page_dirty_nobuffers(page); + +- if (ret) { ++ if (page_has_buffers(page)) { + struct inode *inode = page->mapping->host; +- unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); ++ unsigned nr_dirty = 0; ++ struct buffer_head *bh, *head; + +- nilfs_set_file_dirty(inode, nr_dirty); ++ /* ++ * This page is locked by callers, and no other thread ++ * concurrently marks its buffers dirty since they are ++ * only dirtied through routines in fs/buffer.c in ++ * which call sites of mark_buffer_dirty are protected ++ * by page lock. 
++ */ ++ bh = head = page_buffers(page); ++ do { ++ /* Do not mark hole blocks dirty */ ++ if (buffer_dirty(bh) || !buffer_mapped(bh)) ++ continue; ++ ++ set_buffer_dirty(bh); ++ nr_dirty++; ++ } while (bh = bh->b_this_page, bh != head); ++ ++ if (nr_dirty) ++ nilfs_set_file_dirty(inode, nr_dirty); + } + return ret; + } diff --git a/queue-3.4/ocfs2-goto-out_unlock-if-ocfs2_get_clusters_nocache-failed-in-ocfs2_fiemap.patch b/queue-3.4/ocfs2-goto-out_unlock-if-ocfs2_get_clusters_nocache-failed-in-ocfs2_fiemap.patch new file mode 100644 index 00000000000..1458c2fba1d --- /dev/null +++ b/queue-3.4/ocfs2-goto-out_unlock-if-ocfs2_get_clusters_nocache-failed-in-ocfs2_fiemap.patch @@ -0,0 +1,41 @@ +From b4ca2b4b577c3530e34dcfaafccb2cc680ce95d1 Mon Sep 17 00:00:00 2001 +From: Joseph Qi +Date: Fri, 24 May 2013 15:55:34 -0700 +Subject: ocfs2: goto out_unlock if ocfs2_get_clusters_nocache() failed in ocfs2_fiemap() + +From: Joseph Qi + +commit b4ca2b4b577c3530e34dcfaafccb2cc680ce95d1 upstream. + +Last time we found there is lock/unlock bug in ocfs2_file_aio_write, and +then we did a thorough search for all lock resources in +ocfs2_inode_info, including rw, inode and open lockres and found this +bug. My kernel version is 3.0.13, and it is also in the lastest version +3.9. In ocfs2_fiemap, once ocfs2_get_clusters_nocache failed, it should +goto out_unlock instead of out, because we need release buffer head, up +read alloc sem and unlock inode. 
+ +Signed-off-by: Joseph Qi +Reviewed-by: Jie Liu +Cc: Mark Fasheh +Cc: Joel Becker +Acked-by: Sunil Mushran +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ocfs2/extent_map.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ocfs2/extent_map.c ++++ b/fs/ocfs2/extent_map.c +@@ -791,7 +791,7 @@ int ocfs2_fiemap(struct inode *inode, st + &hole_size, &rec, &is_last); + if (ret) { + mlog_errno(ret); +- goto out; ++ goto out_unlock; + } + + if (rec.e_blkno == 0ULL) { diff --git a/queue-3.4/perf-net_dropmonitor-fix-symbol-relative-addresses.patch b/queue-3.4/perf-net_dropmonitor-fix-symbol-relative-addresses.patch new file mode 100644 index 00000000000..8310e0008da --- /dev/null +++ b/queue-3.4/perf-net_dropmonitor-fix-symbol-relative-addresses.patch @@ -0,0 +1,37 @@ +From 5a1e99dd2028e00998d42029be86835d8ef4a46e Mon Sep 17 00:00:00 2001 +From: Ben Hutchings +Date: Mon, 20 May 2013 14:45:26 +0000 +Subject: perf: net_dropmonitor: Fix symbol-relative addresses + +From: Ben Hutchings + +commit 5a1e99dd2028e00998d42029be86835d8ef4a46e upstream. + +The comparison between traced and symbol addresses is backwards: if +the traced address doesn't exactly match a symbol (which we don't +expect it to), we'll show the next symbol and the offset to it, +whereas we should show the previous symbol and the offset from it. + +Signed-off-by: Ben Hutchings +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman + +--- + tools/perf/scripts/python/net_dropmonitor.py | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/tools/perf/scripts/python/net_dropmonitor.py ++++ b/tools/perf/scripts/python/net_dropmonitor.py +@@ -40,9 +40,9 @@ def get_kallsyms_table(): + + def get_sym(sloc): + loc = int(sloc) +- for i in kallsyms: +- if (i['loc'] >= loc): +- return (i['name'], i['loc']-loc) ++ for i in kallsyms[::-1]: ++ if loc >= i['loc']: ++ return (i['name'], loc - i['loc']) + return (None, 0) + + def print_drop_table(): diff --git a/queue-3.4/perf-net_dropmonitor-fix-trace-parameter-order.patch b/queue-3.4/perf-net_dropmonitor-fix-trace-parameter-order.patch new file mode 100644 index 00000000000..ddd19526112 --- /dev/null +++ b/queue-3.4/perf-net_dropmonitor-fix-trace-parameter-order.patch @@ -0,0 +1,30 @@ +From 140c3c6a2bcd2c31e2f7f5a8d59689724776c8e5 Mon Sep 17 00:00:00 2001 +From: Ben Hutchings +Date: Mon, 20 May 2013 14:44:43 +0000 +Subject: perf: net_dropmonitor: Fix trace parameter order + +From: Ben Hutchings + +commit 140c3c6a2bcd2c31e2f7f5a8d59689724776c8e5 upstream. + +This works much better if we don't treat protocol numbers as addresses. + +Signed-off-by: Ben Hutchings +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman + +--- + tools/perf/scripts/python/net_dropmonitor.py | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/tools/perf/scripts/python/net_dropmonitor.py ++++ b/tools/perf/scripts/python/net_dropmonitor.py +@@ -64,7 +64,7 @@ def trace_end(): + + # called from perf, when it finds a correspoinding event + def skb__kfree_skb(name, context, cpu, sec, nsec, pid, comm, +- skbaddr, protocol, location): ++ skbaddr, location, protocol): + slocation = str(location) + try: + drop_log[slocation] = drop_log[slocation] + 1 diff --git a/queue-3.4/series b/queue-3.4/series index 7ffddfa9be3..790f22198a8 100644 --- a/queue-3.4/series +++ b/queue-3.4/series @@ -12,3 +12,17 @@ usb-xhci-override-bogus-bulk-wmaxpacketsize-values.patch usb-uhci-fix-for-suspend-of-virtual-hp-controller.patch cifs-only-set-ops-for-inodes-in-i_new-state.patch fat-fix-possible-overflow-for-fat_clusters.patch +tg3-fix-data-corruption-on-5725-with-tso.patch +perf-net_dropmonitor-fix-trace-parameter-order.patch +perf-net_dropmonitor-fix-symbol-relative-addresses.patch +ocfs2-goto-out_unlock-if-ocfs2_get_clusters_nocache-failed-in-ocfs2_fiemap.patch +kirkwood-enable-pcie-port-1-on-qnap-ts-11x-ts-21x.patch +drivers-leds-leds-ot200.c-fix-error-caused-by-shifted-mask.patch +mm-compaction-fix-of-improper-cache-flush-in-migration-code.patch +klist-del-waiter-from-klist_remove_waiters-before-wakeup-waitting-process.patch +wait-fix-false-timeouts-when-using-wait_event_timeout.patch +nilfs2-fix-issue-of-nilfs_set_page_dirty-for-page-at-eof-boundary.patch +mm-mmu_notifier-re-fix-freed-page-still-mapped-in-secondary-mmu.patch +drivers-block-brd.c-fix-brd_lookup_page-race.patch +mm-pagewalk.c-walk_page_range-should-avoid-vm_pfnmap-areas.patch +mm-thp-use-pmd_populate-to-update-the-pmd-with-pgtable_t-pointer.patch diff --git a/queue-3.4/tg3-fix-data-corruption-on-5725-with-tso.patch b/queue-3.4/tg3-fix-data-corruption-on-5725-with-tso.patch new file mode 100644 index 
00000000000..3baf50c39e4 --- /dev/null +++ b/queue-3.4/tg3-fix-data-corruption-on-5725-with-tso.patch @@ -0,0 +1,55 @@ +From 0f0d15100a8ac875bdd408324c473e16d73d3557 Mon Sep 17 00:00:00 2001 +From: Michael Chan +Date: Mon, 13 May 2013 11:04:16 +0000 +Subject: tg3: Fix data corruption on 5725 with TSO + +From: Michael Chan + +commit 0f0d15100a8ac875bdd408324c473e16d73d3557 upstream. + +The 5725 family of devices (asic rev 5762), corrupts TSO packets where +the buffer is within MSS bytes of a 4G boundary (4G, 8G etc.). Detect +this condition and trigger the workaround path. + +Signed-off-by: Michael Chan +Signed-off-by: Nithin Nayak Sujir +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/net/ethernet/broadcom/tg3.c | 17 +++++++++++++++++ + 1 file changed, 17 insertions(+) + +--- a/drivers/net/ethernet/broadcom/tg3.c ++++ b/drivers/net/ethernet/broadcom/tg3.c +@@ -6622,6 +6622,20 @@ static inline int tg3_4g_overflow_test(d + return (base > 0xffffdcc0) && (base + len + 8 < base); + } + ++/* Test for TSO DMA buffers that cross into regions which are within MSS bytes ++ * of any 4GB boundaries: 4G, 8G, etc ++ */ ++static inline int tg3_4g_tso_overflow_test(struct tg3 *tp, dma_addr_t mapping, ++ u32 len, u32 mss) ++{ ++ if (tg3_asic_rev(tp) == ASIC_REV_5762 && mss) { ++ u32 base = (u32) mapping & 0xffffffff; ++ ++ return ((base + len + (mss & 0x3fff)) < base); ++ } ++ return 0; ++} ++ + /* Test for DMA addresses > 40-bit */ + static inline int tg3_40bit_overflow_test(struct tg3 *tp, dma_addr_t mapping, + int len) +@@ -6658,6 +6672,9 @@ static bool tg3_tx_frag_set(struct tg3_n + if (tg3_4g_overflow_test(map, len)) + hwbug = true; + ++ if (tg3_4g_tso_overflow_test(tp, map, len, mss)) ++ hwbug = true; ++ + if (tg3_40bit_overflow_test(tp, map, len)) + hwbug = true; + diff --git a/queue-3.4/wait-fix-false-timeouts-when-using-wait_event_timeout.patch b/queue-3.4/wait-fix-false-timeouts-when-using-wait_event_timeout.patch new file mode 100644 
index 00000000000..6029a0ab2c6 --- /dev/null +++ b/queue-3.4/wait-fix-false-timeouts-when-using-wait_event_timeout.patch @@ -0,0 +1,91 @@ +From 4c663cfc523a88d97a8309b04a089c27dc57fd7e Mon Sep 17 00:00:00 2001 +From: Imre Deak +Date: Fri, 24 May 2013 15:55:09 -0700 +Subject: wait: fix false timeouts when using wait_event_timeout() + +From: Imre Deak + +commit 4c663cfc523a88d97a8309b04a089c27dc57fd7e upstream. + +Many callers of the wait_event_timeout() and +wait_event_interruptible_timeout() expect that the return value will be +positive if the specified condition becomes true before the timeout +elapses. However, at the moment this isn't guaranteed. If the wake-up +handler is delayed enough, the time remaining until timeout will be +calculated as 0 - and passed back as a return value - even if the +condition became true before the timeout has passed. + +Fix this by returning at least 1 if the condition becomes true. This +semantic is in line with what wait_for_completion_timeout() does; see +commit bb10ed09 ("sched: fix wait_for_completion_timeout() spurious +failure under heavy load"). + +Daniel said "We have 3 instances of this bug in drm/i915. One case even +where we switch between the interruptible and not interruptible +wait_event_timeout variants, foolishly presuming they have the same +semantics. I very much like this." + +One such bug is reported at + https://bugs.freedesktop.org/show_bug.cgi?id=64133 + +Signed-off-by: Imre Deak +Acked-by: Daniel Vetter +Acked-by: David Howells +Acked-by: Jens Axboe +Cc: "Paul E. 
McKenney" +Cc: Dave Jones +Cc: Lukas Czerner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/wait.h | 16 +++++++++++----- + 1 file changed, 11 insertions(+), 5 deletions(-) + +--- a/include/linux/wait.h ++++ b/include/linux/wait.h +@@ -233,6 +233,8 @@ do { \ + if (!ret) \ + break; \ + } \ ++ if (!ret && (condition)) \ ++ ret = 1; \ + finish_wait(&wq, &__wait); \ + } while (0) + +@@ -249,8 +251,9 @@ do { \ + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * +- * The function returns 0 if the @timeout elapsed, and the remaining +- * jiffies if the condition evaluated to true before the timeout elapsed. ++ * The function returns 0 if the @timeout elapsed, or the remaining ++ * jiffies (at least 1) if the @condition evaluated to %true before ++ * the @timeout elapsed. + */ + #define wait_event_timeout(wq, condition, timeout) \ + ({ \ +@@ -318,6 +321,8 @@ do { \ + ret = -ERESTARTSYS; \ + break; \ + } \ ++ if (!ret && (condition)) \ ++ ret = 1; \ + finish_wait(&wq, &__wait); \ + } while (0) + +@@ -334,9 +339,10 @@ do { \ + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * +- * The function returns 0 if the @timeout elapsed, -ERESTARTSYS if it +- * was interrupted by a signal, and the remaining jiffies otherwise +- * if the condition evaluated to true before the timeout elapsed. ++ * Returns: ++ * 0 if the @timeout elapsed, -%ERESTARTSYS if it was interrupted by ++ * a signal, or the remaining jiffies (at least 1) if the @condition ++ * evaluated to %true before the @timeout elapsed. + */ + #define wait_event_interruptible_timeout(wq, condition, timeout) \ + ({ \