From: Greg Kroah-Hartman Date: Mon, 31 Mar 2014 23:59:31 +0000 (-0700) Subject: 3.13-stable patches X-Git-Tag: v3.4.86~2 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=2c9c044bc79e93ef593d344a479acba2162887c9;p=thirdparty%2Fkernel%2Fstable-queue.git 3.13-stable patches added patches: cgroup-protect-modifications-to-cgroup_idr-with-cgroup_mutex.patch don-t-bother-with-propagate_mnt-unless-the-target-is-shared.patch keep-shadowed-vfsmounts-together.patch mm-close-pagetail-race.patch netfilter-nf_conntrack_dccp-fix-skb_header_pointer-api-usages.patch switch-mnt_hash-to-hlist.patch --- diff --git a/queue-3.13/cgroup-protect-modifications-to-cgroup_idr-with-cgroup_mutex.patch b/queue-3.13/cgroup-protect-modifications-to-cgroup_idr-with-cgroup_mutex.patch new file mode 100644 index 00000000000..6f13d3fd069 --- /dev/null +++ b/queue-3.13/cgroup-protect-modifications-to-cgroup_idr-with-cgroup_mutex.patch @@ -0,0 +1,131 @@ +From 0ab02ca8f887908152d1a96db5130fc661d36a1e Mon Sep 17 00:00:00 2001 +From: Li Zefan +Date: Tue, 11 Feb 2014 16:05:46 +0800 +Subject: cgroup: protect modifications to cgroup_idr with cgroup_mutex + +From: Li Zefan + +commit 0ab02ca8f887908152d1a96db5130fc661d36a1e upstream. + +Setup cgroupfs like this: + # mount -t cgroup -o cpuacct xxx /cgroup + # mkdir /cgroup/sub1 + # mkdir /cgroup/sub2 + +Then run these two commands: + # for ((; ;)) { mkdir /cgroup/sub1/tmp && rmdir /mnt/sub1/tmp; } & + # for ((; ;)) { mkdir /cgroup/sub2/tmp && rmdir /mnt/sub2/tmp; } & + +After seconds you may see this warning: + +------------[ cut here ]------------ +WARNING: CPU: 1 PID: 25243 at lib/idr.c:527 sub_remove+0x87/0x1b0() +idr_remove called for id=6 which is not allocated. +... +Call Trace: + [] dump_stack+0x7a/0x96 + [] warn_slowpath_common+0x8c/0xc0 + [] warn_slowpath_fmt+0x46/0x50 + [] sub_remove+0x87/0x1b0 + [] ? css_killed_work_fn+0x32/0x1b0 + [] idr_remove+0x25/0xd0 + [] cgroup_destroy_css_killed+0x5b/0xc0 + [] css_killed_work_fn+0x130/0x1b0 + [] process_one_work+0x26c/0x550 + [] worker_thread+0x12e/0x3b0 + [] kthread+0xe6/0xf0 + [] ret_from_fork+0x7c/0xb0 +---[ end trace 2d1577ec10cf80d0 ]--- + +It's because allocating/removing cgroup ID is not properly synchronized. + +The bug was introduced when we converted cgroup_ida to cgroup_idr. +While synchronization is already done inside ida_simple_{get,remove}(), +users are responsible for concurrent calls to idr_{alloc,remove}(). + +tj: Refreshed on top of b58c89986a77 ("cgroup: fix error return from +cgroup_create()"). + +[mhocko@suse.cz: ported to 3.12] +Fixes: 4e96ee8e981b ("cgroup: convert cgroup_ida to cgroup_idr") +Cc: #3.12+ +Reported-by: Michal Hocko +Signed-off-by: Li Zefan +Signed-off-by: Michal Hocko +Signed-off-by: Jiri Slaby +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/cgroup.h | 2 ++ + kernel/cgroup.c | 26 +++++++++++++------------- + 2 files changed, 15 insertions(+), 13 deletions(-) + +--- a/include/linux/cgroup.h ++++ b/include/linux/cgroup.h +@@ -169,6 +169,8 @@ struct cgroup { + * + * The ID of the root cgroup is always 0, and a new cgroup + * will be assigned with a smallest available ID. ++ * ++ * Allocating/Removing ID must be protected by cgroup_mutex. + */ + int id; + +--- a/kernel/cgroup.c ++++ b/kernel/cgroup.c +@@ -4363,16 +4363,6 @@ static long cgroup_create(struct cgroup + rcu_assign_pointer(cgrp->name, name); + + /* +- * Temporarily set the pointer to NULL, so idr_find() won't return +- * a half-baked cgroup. 
+- */ +- cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); +- if (cgrp->id < 0) { +- err = -ENOMEM; +- goto err_free_name; +- } +- +- /* + * Only live parents can have children. Note that the liveliness + * check isn't strictly necessary because cgroup_mkdir() and + * cgroup_rmdir() are fully synchronized by i_mutex; however, do it +@@ -4381,7 +4371,7 @@ static long cgroup_create(struct cgroup + */ + if (!cgroup_lock_live_group(parent)) { + err = -ENODEV; +- goto err_free_id; ++ goto err_free_name; + } + + /* Grab a reference on the superblock so the hierarchy doesn't +@@ -4391,6 +4381,16 @@ static long cgroup_create(struct cgroup + * fs */ + atomic_inc(&sb->s_active); + ++ /* ++ * Temporarily set the pointer to NULL, so idr_find() won't return ++ * a half-baked cgroup. ++ */ ++ cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); ++ if (cgrp->id < 0) { ++ err = -ENOMEM; ++ goto err_unlock; ++ } ++ + init_cgroup_housekeeping(cgrp); + + dentry->d_fsdata = cgrp; +@@ -4491,11 +4491,11 @@ err_free_all: + ss->css_free(css); + } + } ++ idr_remove(&root->cgroup_idr, cgrp->id); ++err_unlock: + mutex_unlock(&cgroup_mutex); + /* Release the reference count that we took on the superblock */ + deactivate_super(sb); +-err_free_id: +- idr_remove(&root->cgroup_idr, cgrp->id); + err_free_name: + kfree(rcu_dereference_raw(cgrp->name)); + err_free_cgrp: diff --git a/queue-3.13/don-t-bother-with-propagate_mnt-unless-the-target-is-shared.patch b/queue-3.13/don-t-bother-with-propagate_mnt-unless-the-target-is-shared.patch new file mode 100644 index 00000000000..05170c7edc4 --- /dev/null +++ b/queue-3.13/don-t-bother-with-propagate_mnt-unless-the-target-is-shared.patch @@ -0,0 +1,55 @@ +From 0b1b901b5a98bb36943d10820efc796f7cd45ff3 Mon Sep 17 00:00:00 2001 +From: Al Viro +Date: Fri, 21 Mar 2014 10:14:08 -0400 +Subject: don't bother with propagate_mnt() unless the target is shared + +From: Al Viro + +commit 0b1b901b5a98bb36943d10820efc796f7cd45ff3 upstream. + +If the dest_mnt is not shared, propagate_mnt() does nothing - +there's no mounts to propagate to and thus no copies to create. +Might as well don't bother calling it in that case. 
+ +Signed-off-by: Al Viro +Signed-off-by: Greg Kroah-Hartman + +--- + fs/namespace.c | 17 +++++++---------- + 1 file changed, 7 insertions(+), 10 deletions(-) + +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -1653,16 +1653,14 @@ static int attach_recursive_mnt(struct m + err = invent_group_ids(source_mnt, true); + if (err) + goto out; +- } +- err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); +- if (err) +- goto out_cleanup_ids; +- +- lock_mount_hash(); +- +- if (IS_MNT_SHARED(dest_mnt)) { ++ err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); ++ if (err) ++ goto out_cleanup_ids; ++ lock_mount_hash(); + for (p = source_mnt; p; p = next_mnt(p, source_mnt)) + set_mnt_shared(p); ++ } else { ++ lock_mount_hash(); + } + if (parent_path) { + detach_mnt(source_mnt, parent_path); +@@ -1685,8 +1683,7 @@ static int attach_recursive_mnt(struct m + return 0; + + out_cleanup_ids: +- if (IS_MNT_SHARED(dest_mnt)) +- cleanup_group_ids(source_mnt, NULL); ++ cleanup_group_ids(source_mnt, NULL); + out: + return err; + } diff --git a/queue-3.13/keep-shadowed-vfsmounts-together.patch b/queue-3.13/keep-shadowed-vfsmounts-together.patch new file mode 100644 index 00000000000..baec3b1dbdd --- /dev/null +++ b/queue-3.13/keep-shadowed-vfsmounts-together.patch @@ -0,0 +1,92 @@ +From 1d6a32acd70ab18499829c0a9a5dbe2bace72a13 Mon Sep 17 00:00:00 2001 +From: Al Viro +Date: Thu, 20 Mar 2014 20:34:43 -0400 +Subject: keep shadowed vfsmounts together + +From: Al Viro + +commit 1d6a32acd70ab18499829c0a9a5dbe2bace72a13 upstream. + +preparation to switching mnt_hash to hlist + +Signed-off-by: Al Viro +Signed-off-by: Greg Kroah-Hartman + +--- + fs/namespace.c | 32 +++++++++++++++++++++++--------- + 1 file changed, 23 insertions(+), 9 deletions(-) + +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -621,12 +621,20 @@ struct mount *__lookup_mnt(struct vfsmou + struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) + { + struct list_head *head = m_hash(mnt, dentry); +- struct mount *p; ++ struct mount *p, *res = NULL; + +- list_for_each_entry_reverse(p, head, mnt_hash) ++ list_for_each_entry(p, head, mnt_hash) + if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) +- return p; +- return NULL; ++ goto found; ++ return res; ++found: ++ res = p; ++ list_for_each_entry_continue(p, head, mnt_hash) { ++ if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry) ++ break; ++ res = p; ++ } ++ return res; + } + + /* +@@ -769,14 +777,14 @@ static void attach_mnt(struct mount *mnt + struct mountpoint *mp) + { + mnt_set_mountpoint(parent, mp, mnt); +- list_add_tail(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); ++ list_add(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); + } + + /* + * vfsmount lock must be held for write + */ +-static void commit_tree(struct mount *mnt) ++static void commit_tree(struct mount *mnt, struct mount *shadows) + { + struct mount *parent = mnt->mnt_parent; + struct mount *m; +@@ -791,7 +799,10 @@ static void commit_tree(struct mount *mn + + list_splice(&head, n->list.prev); + +- list_add_tail(&mnt->mnt_hash, ++ if (shadows) ++ list_add(&mnt->mnt_hash, &shadows->mnt_hash); ++ else ++ list_add(&mnt->mnt_hash, + m_hash(&parent->mnt, mnt->mnt_mountpoint)); + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); + touch_mnt_namespace(n); +@@ -1659,12 +1670,15 @@ static int attach_recursive_mnt(struct m + touch_mnt_namespace(source_mnt->mnt_ns); + } else { + mnt_set_mountpoint(dest_mnt, dest_mp, 
source_mnt); +- commit_tree(source_mnt); ++ commit_tree(source_mnt, NULL); + } + + list_for_each_entry_safe(child, p, &tree_list, mnt_hash) { ++ struct mount *q; + list_del_init(&child->mnt_hash); +- commit_tree(child); ++ q = __lookup_mnt_last(&child->mnt_parent->mnt, ++ child->mnt_mountpoint); ++ commit_tree(child, q); + } + unlock_mount_hash(); + diff --git a/queue-3.13/mm-close-pagetail-race.patch b/queue-3.13/mm-close-pagetail-race.patch new file mode 100644 index 00000000000..d1cf99b5371 --- /dev/null +++ b/queue-3.13/mm-close-pagetail-race.patch @@ -0,0 +1,222 @@ +From 668f9abbd4334e6c29fa8acd71635c4f9101caa7 Mon Sep 17 00:00:00 2001 +From: David Rientjes +Date: Mon, 3 Mar 2014 15:38:18 -0800 +Subject: mm: close PageTail race + +From: David Rientjes + +commit 668f9abbd4334e6c29fa8acd71635c4f9101caa7 upstream. + +Commit bf6bddf1924e ("mm: introduce compaction and migration for +ballooned pages") introduces page_count(page) into memory compaction +which dereferences page->first_page if PageTail(page). + +This results in a very rare NULL pointer dereference on the +aforementioned page_count(page). Indeed, anything that does +compound_head(), including page_count() is susceptible to racing with +prep_compound_page() and seeing a NULL or dangling page->first_page +pointer. + +This patch uses Andrea's implementation of compound_trans_head() that +deals with such a race and makes it the default compound_head() +implementation. This includes a read memory barrier that ensures that +if PageTail(head) is true that we return a head page that is neither +NULL nor dangling. The patch then adds a store memory barrier to +prep_compound_page() to ensure page->first_page is set. + +This is the safest way to ensure we see the head page that we are +expecting, PageTail(page) is already in the unlikely() path and the +memory barriers are unfortunately required. + +Hugetlbfs is the exception, we don't enforce a store memory barrier +during init since no race is possible. + +Signed-off-by: David Rientjes +Cc: Holger Kiehl +Cc: Christoph Lameter +Cc: Rafael Aquini +Cc: Vlastimil Babka +Cc: Michal Hocko +Cc: Mel Gorman +Cc: Andrea Arcangeli +Cc: Rik van Riel +Cc: "Kirill A. Shutemov" +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + + +--- + drivers/block/aoe/aoecmd.c | 4 ++-- + drivers/vfio/vfio_iommu_type1.c | 4 ++-- + fs/proc/page.c | 2 +- + include/linux/huge_mm.h | 18 ------------------ + include/linux/mm.h | 14 ++++++++++++-- + mm/ksm.c | 2 +- + mm/memory-failure.c | 2 +- + mm/page_alloc.c | 4 +++- + mm/swap.c | 4 ++-- + 9 files changed, 24 insertions(+), 30 deletions(-) + +--- a/drivers/block/aoe/aoecmd.c ++++ b/drivers/block/aoe/aoecmd.c +@@ -905,7 +905,7 @@ bio_pageinc(struct bio *bio) + /* Non-zero page count for non-head members of + * compound pages is no longer allowed by the kernel. 
+ */ +- page = compound_trans_head(bv->bv_page); ++ page = compound_head(bv->bv_page); + atomic_inc(&page->_count); + } + } +@@ -918,7 +918,7 @@ bio_pagedec(struct bio *bio) + int i; + + bio_for_each_segment(bv, bio, i) { +- page = compound_trans_head(bv->bv_page); ++ page = compound_head(bv->bv_page); + atomic_dec(&page->_count); + } + } +--- a/drivers/vfio/vfio_iommu_type1.c ++++ b/drivers/vfio/vfio_iommu_type1.c +@@ -186,12 +186,12 @@ static bool is_invalid_reserved_pfn(unsi + if (pfn_valid(pfn)) { + bool reserved; + struct page *tail = pfn_to_page(pfn); +- struct page *head = compound_trans_head(tail); ++ struct page *head = compound_head(tail); + reserved = !!(PageReserved(head)); + if (head != tail) { + /* + * "head" is not a dangling pointer +- * (compound_trans_head takes care of that) ++ * (compound_head takes care of that) + * but the hugepage may have been split + * from under us (and we may not hold a + * reference count on the head page so it can +--- a/fs/proc/page.c ++++ b/fs/proc/page.c +@@ -121,7 +121,7 @@ u64 stable_page_flags(struct page *page) + * just checks PG_head/PG_tail, so we need to check PageLRU to make + * sure a given page is a thp, not a non-huge compound page. + */ +- else if (PageTransCompound(page) && PageLRU(compound_trans_head(page))) ++ else if (PageTransCompound(page) && PageLRU(compound_head(page))) + u |= 1 << KPF_THP; + + /* +--- a/include/linux/huge_mm.h ++++ b/include/linux/huge_mm.h +@@ -157,23 +157,6 @@ static inline int hpage_nr_pages(struct + return HPAGE_PMD_NR; + return 1; + } +-static inline struct page *compound_trans_head(struct page *page) +-{ +- if (PageTail(page)) { +- struct page *head; +- head = page->first_page; +- smp_rmb(); +- /* +- * head may be a dangling pointer. +- * __split_huge_page_refcount clears PageTail before +- * overwriting first_page, so if PageTail is still +- * there it means the head pointer isn't dangling. +- */ +- if (PageTail(page)) +- return head; +- } +- return page; +-} + + extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, pmd_t *pmdp); +@@ -203,7 +186,6 @@ static inline int split_huge_page(struct + do { } while (0) + #define split_huge_page_pmd_mm(__mm, __address, __pmd) \ + do { } while (0) +-#define compound_trans_head(page) compound_head(page) + static inline int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice) + { +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -389,8 +389,18 @@ static inline void compound_unlock_irqre + + static inline struct page *compound_head(struct page *page) + { +- if (unlikely(PageTail(page))) +- return page->first_page; ++ if (unlikely(PageTail(page))) { ++ struct page *head = page->first_page; ++ ++ /* ++ * page->first_page may be a dangling pointer to an old ++ * compound page, so recheck that it is still a tail ++ * page before returning. ++ */ ++ smp_rmb(); ++ if (likely(PageTail(page))) ++ return head; ++ } + return page; + } + +--- a/mm/ksm.c ++++ b/mm/ksm.c +@@ -444,7 +444,7 @@ static void break_cow(struct rmap_item * + static struct page *page_trans_compound_anon(struct page *page) + { + if (PageTransCompound(page)) { +- struct page *head = compound_trans_head(page); ++ struct page *head = compound_head(page); + /* + * head may actually be splitted and freed from under + * us but it's ok here. 
+--- a/mm/memory-failure.c ++++ b/mm/memory-failure.c +@@ -1645,7 +1645,7 @@ int soft_offline_page(struct page *page, + { + int ret; + unsigned long pfn = page_to_pfn(page); +- struct page *hpage = compound_trans_head(page); ++ struct page *hpage = compound_head(page); + + if (PageHWPoison(page)) { + pr_info("soft offline: %#lx page already poisoned\n", pfn); +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -369,9 +369,11 @@ void prep_compound_page(struct page *pag + __SetPageHead(page); + for (i = 1; i < nr_pages; i++) { + struct page *p = page + i; +- __SetPageTail(p); + set_page_count(p, 0); + p->first_page = page; ++ /* Make sure p->first_page is always valid for PageTail() */ ++ smp_wmb(); ++ __SetPageTail(p); + } + } + +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -84,7 +84,7 @@ static void put_compound_page(struct pag + { + if (unlikely(PageTail(page))) { + /* __split_huge_page_refcount can run under us */ +- struct page *page_head = compound_trans_head(page); ++ struct page *page_head = compound_head(page); + + if (likely(page != page_head && + get_page_unless_zero(page_head))) { +@@ -222,7 +222,7 @@ bool __get_page_tail(struct page *page) + */ + unsigned long flags; + bool got = false; +- struct page *page_head = compound_trans_head(page); ++ struct page *page_head = compound_head(page); + + if (likely(page != page_head && get_page_unless_zero(page_head))) { + /* Ref to put_compound_page() comment. */ diff --git a/queue-3.13/netfilter-nf_conntrack_dccp-fix-skb_header_pointer-api-usages.patch b/queue-3.13/netfilter-nf_conntrack_dccp-fix-skb_header_pointer-api-usages.patch new file mode 100644 index 00000000000..7a39612ef3d --- /dev/null +++ b/queue-3.13/netfilter-nf_conntrack_dccp-fix-skb_header_pointer-api-usages.patch @@ -0,0 +1,62 @@ +From b22f5126a24b3b2f15448c3f2a254fc10cbc2b92 Mon Sep 17 00:00:00 2001 +From: Daniel Borkmann +Date: Mon, 6 Jan 2014 00:57:54 +0100 +Subject: netfilter: nf_conntrack_dccp: fix skb_header_pointer API usages + +From: Daniel Borkmann + +commit b22f5126a24b3b2f15448c3f2a254fc10cbc2b92 upstream. + +Some occurences in the netfilter tree use skb_header_pointer() in +the following way ... + + struct dccp_hdr _dh, *dh; + ... + skb_header_pointer(skb, dataoff, sizeof(_dh), &dh); + +... where dh itself is a pointer that is being passed as the copy +buffer. Instead, we need to use &_dh as the forth argument so that +we're copying the data into an actual buffer that sits on the stack. + +Currently, we probably could overwrite memory on the stack (e.g. +with a possibly mal-formed DCCP packet), but unintentionally, as +we only want the buffer to be placed into _dh variable. 
+ +Fixes: 2bc780499aa3 ("[NETFILTER]: nf_conntrack: add DCCP protocol support") +Signed-off-by: Daniel Borkmann +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Greg Kroah-Hartman + +--- + net/netfilter/nf_conntrack_proto_dccp.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/net/netfilter/nf_conntrack_proto_dccp.c ++++ b/net/netfilter/nf_conntrack_proto_dccp.c +@@ -428,7 +428,7 @@ static bool dccp_new(struct nf_conn *ct, + const char *msg; + u_int8_t state; + +- dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &dh); ++ dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); + BUG_ON(dh == NULL); + + state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE]; +@@ -486,7 +486,7 @@ static int dccp_packet(struct nf_conn *c + u_int8_t type, old_state, new_state; + enum ct_dccp_roles role; + +- dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &dh); ++ dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); + BUG_ON(dh == NULL); + type = dh->dccph_type; + +@@ -577,7 +577,7 @@ static int dccp_error(struct net *net, s + unsigned int cscov; + const char *msg; + +- dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &dh); ++ dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); + if (dh == NULL) { + msg = "nf_ct_dccp: short packet "; + goto out_invalid; diff --git a/queue-3.13/series b/queue-3.13/series index bfcda6e6cad..f16d672c7bf 100644 --- a/queue-3.13/series +++ b/queue-3.13/series @@ -14,3 +14,9 @@ net-mvneta-rename-mvneta_gmac2_psc_enable-to-mvneta_gmac2_pcs_enable.patch net-mvneta-fix-usage-as-a-module-on-rgmii-configurations.patch random32-avoid-attempt-to-late-reseed-if-in-the-middle-of-seeding.patch resizable-namespace.c-hashes.patch +keep-shadowed-vfsmounts-together.patch +don-t-bother-with-propagate_mnt-unless-the-target-is-shared.patch +switch-mnt_hash-to-hlist.patch +mm-close-pagetail-race.patch +cgroup-protect-modifications-to-cgroup_idr-with-cgroup_mutex.patch +netfilter-nf_conntrack_dccp-fix-skb_header_pointer-api-usages.patch diff --git a/queue-3.13/switch-mnt_hash-to-hlist.patch b/queue-3.13/switch-mnt_hash-to-hlist.patch new file mode 100644 index 00000000000..e40c80e6d43 --- /dev/null +++ b/queue-3.13/switch-mnt_hash-to-hlist.patch @@ -0,0 +1,347 @@ +From 38129a13e6e71f666e0468e99fdd932a687b4d7e Mon Sep 17 00:00:00 2001 +From: Al Viro +Date: Thu, 20 Mar 2014 21:10:51 -0400 +Subject: switch mnt_hash to hlist + +From: Al Viro + +commit 38129a13e6e71f666e0468e99fdd932a687b4d7e upstream. + +fixes RCU bug - walking through hlist is safe in face of element moves, +since it's self-terminating. Cyclic lists are not - if we end up jumping +to another hash chain, we'll loop infinitely without ever hitting the +original list head. 
+ +[fix for dumb braino folded] + +Spotted by: Max Kellermann +Signed-off-by: Al Viro +Signed-off-by: Greg Kroah-Hartman + +--- + fs/mount.h | 2 - + fs/namespace.c | 79 +++++++++++++++++++++++++++++++-------------------------- + fs/pnode.c | 26 ++++++++++-------- + fs/pnode.h | 4 +- + 4 files changed, 61 insertions(+), 50 deletions(-) + +--- a/fs/mount.h ++++ b/fs/mount.h +@@ -25,7 +25,7 @@ struct mountpoint { + }; + + struct mount { +- struct list_head mnt_hash; ++ struct hlist_node mnt_hash; + struct mount *mnt_parent; + struct dentry *mnt_mountpoint; + struct vfsmount mnt; +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -59,7 +59,7 @@ static DEFINE_SPINLOCK(mnt_id_lock); + static int mnt_id_start = 0; + static int mnt_group_start = 1; + +-static struct list_head *mount_hashtable __read_mostly; ++static struct hlist_head *mount_hashtable __read_mostly; + static struct hlist_head *mountpoint_hashtable __read_mostly; + static struct kmem_cache *mnt_cache __read_mostly; + static DECLARE_RWSEM(namespace_sem); +@@ -78,7 +78,7 @@ EXPORT_SYMBOL_GPL(fs_kobj); + */ + __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); + +-static inline struct list_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) ++static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) + { + unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); + tmp += ((unsigned long)dentry / L1_CACHE_BYTES); +@@ -217,7 +217,7 @@ static struct mount *alloc_vfsmnt(const + mnt->mnt_writers = 0; + #endif + +- INIT_LIST_HEAD(&mnt->mnt_hash); ++ INIT_HLIST_NODE(&mnt->mnt_hash); + INIT_LIST_HEAD(&mnt->mnt_child); + INIT_LIST_HEAD(&mnt->mnt_mounts); + INIT_LIST_HEAD(&mnt->mnt_list); +@@ -605,10 +605,10 @@ bool legitimize_mnt(struct vfsmount *bas + */ + struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) + { +- struct list_head *head = m_hash(mnt, dentry); ++ struct hlist_head *head = m_hash(mnt, dentry); + struct mount *p; + +- list_for_each_entry_rcu(p, head, mnt_hash) ++ hlist_for_each_entry_rcu(p, head, mnt_hash) + if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) + return p; + return NULL; +@@ -620,20 +620,16 @@ struct mount *__lookup_mnt(struct vfsmou + */ + struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) + { +- struct list_head *head = m_hash(mnt, dentry); +- struct mount *p, *res = NULL; +- +- list_for_each_entry(p, head, mnt_hash) +- if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) +- goto found; +- return res; +-found: +- res = p; +- list_for_each_entry_continue(p, head, mnt_hash) { ++ struct mount *p, *res; ++ res = p = __lookup_mnt(mnt, dentry); ++ if (!p) ++ goto out; ++ hlist_for_each_entry_continue(p, mnt_hash) { + if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry) + break; + res = p; + } ++out: + return res; + } + +@@ -750,7 +746,7 @@ static void detach_mnt(struct mount *mnt + mnt->mnt_parent = mnt; + mnt->mnt_mountpoint = mnt->mnt.mnt_root; + list_del_init(&mnt->mnt_child); +- list_del_init(&mnt->mnt_hash); ++ hlist_del_init_rcu(&mnt->mnt_hash); + put_mountpoint(mnt->mnt_mp); + mnt->mnt_mp = NULL; + } +@@ -777,7 +773,7 @@ static void attach_mnt(struct mount *mnt + struct mountpoint *mp) + { + mnt_set_mountpoint(parent, mp, mnt); +- list_add(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); ++ hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); + } + +@@ -800,9 +796,9 @@ static void commit_tree(struct mount *mn + 
list_splice(&head, n->list.prev); + + if (shadows) +- list_add(&mnt->mnt_hash, &shadows->mnt_hash); ++ hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash); + else +- list_add(&mnt->mnt_hash, ++ hlist_add_head_rcu(&mnt->mnt_hash, + m_hash(&parent->mnt, mnt->mnt_mountpoint)); + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); + touch_mnt_namespace(n); +@@ -1193,26 +1189,28 @@ int may_umount(struct vfsmount *mnt) + + EXPORT_SYMBOL(may_umount); + +-static LIST_HEAD(unmounted); /* protected by namespace_sem */ ++static HLIST_HEAD(unmounted); /* protected by namespace_sem */ + + static void namespace_unlock(void) + { + struct mount *mnt; +- LIST_HEAD(head); ++ struct hlist_head head = unmounted; + +- if (likely(list_empty(&unmounted))) { ++ if (likely(hlist_empty(&head))) { + up_write(&namespace_sem); + return; + } + +- list_splice_init(&unmounted, &head); ++ head.first->pprev = &head.first; ++ INIT_HLIST_HEAD(&unmounted); ++ + up_write(&namespace_sem); + + synchronize_rcu(); + +- while (!list_empty(&head)) { +- mnt = list_first_entry(&head, struct mount, mnt_hash); +- list_del_init(&mnt->mnt_hash); ++ while (!hlist_empty(&head)) { ++ mnt = hlist_entry(head.first, struct mount, mnt_hash); ++ hlist_del_init(&mnt->mnt_hash); + if (mnt->mnt_ex_mountpoint.mnt) + path_put(&mnt->mnt_ex_mountpoint); + mntput(&mnt->mnt); +@@ -1233,16 +1231,19 @@ static inline void namespace_lock(void) + */ + void umount_tree(struct mount *mnt, int how) + { +- LIST_HEAD(tmp_list); ++ HLIST_HEAD(tmp_list); + struct mount *p; ++ struct mount *last = NULL; + +- for (p = mnt; p; p = next_mnt(p, mnt)) +- list_move(&p->mnt_hash, &tmp_list); ++ for (p = mnt; p; p = next_mnt(p, mnt)) { ++ hlist_del_init_rcu(&p->mnt_hash); ++ hlist_add_head(&p->mnt_hash, &tmp_list); ++ } + + if (how) + propagate_umount(&tmp_list); + +- list_for_each_entry(p, &tmp_list, mnt_hash) { ++ hlist_for_each_entry(p, &tmp_list, mnt_hash) { + list_del_init(&p->mnt_expire); + list_del_init(&p->mnt_list); + __touch_mnt_namespace(p->mnt_ns); +@@ -1260,8 +1261,13 @@ void umount_tree(struct mount *mnt, int + p->mnt_mp = NULL; + } + change_mnt_propagation(p, MS_PRIVATE); ++ last = p; ++ } ++ if (last) { ++ last->mnt_hash.next = unmounted.first; ++ unmounted.first = tmp_list.first; ++ unmounted.first->pprev = &unmounted.first; + } +- list_splice(&tmp_list, &unmounted); + } + + static void shrink_submounts(struct mount *mnt); +@@ -1645,8 +1651,9 @@ static int attach_recursive_mnt(struct m + struct mountpoint *dest_mp, + struct path *parent_path) + { +- LIST_HEAD(tree_list); ++ HLIST_HEAD(tree_list); + struct mount *child, *p; ++ struct hlist_node *n; + int err; + + if (IS_MNT_SHARED(dest_mnt)) { +@@ -1671,9 +1678,9 @@ static int attach_recursive_mnt(struct m + commit_tree(source_mnt, NULL); + } + +- list_for_each_entry_safe(child, p, &tree_list, mnt_hash) { ++ hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { + struct mount *q; +- list_del_init(&child->mnt_hash); ++ hlist_del_init(&child->mnt_hash); + q = __lookup_mnt_last(&child->mnt_parent->mnt, + child->mnt_mountpoint); + commit_tree(child, q); +@@ -2818,7 +2825,7 @@ void __init mnt_init(void) + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + + mount_hashtable = alloc_large_system_hash("Mount-cache", +- sizeof(struct list_head), ++ sizeof(struct hlist_head), + mhash_entries, 19, + 0, + &m_hash_shift, &m_hash_mask, 0, 0); +@@ -2832,7 +2839,7 @@ void __init mnt_init(void) + panic("Failed to allocate mount hash table\n"); + + for (u = 0; u <= m_hash_mask; u++) +- INIT_LIST_HEAD(&mount_hashtable[u]); 
++ INIT_HLIST_HEAD(&mount_hashtable[u]); + for (u = 0; u <= mp_hash_mask; u++) + INIT_HLIST_HEAD(&mountpoint_hashtable[u]); + +--- a/fs/pnode.c ++++ b/fs/pnode.c +@@ -220,14 +220,14 @@ static struct mount *get_source(struct m + * @tree_list : list of heads of trees to be attached. + */ + int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, +- struct mount *source_mnt, struct list_head *tree_list) ++ struct mount *source_mnt, struct hlist_head *tree_list) + { + struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; + struct mount *m, *child; + int ret = 0; + struct mount *prev_dest_mnt = dest_mnt; + struct mount *prev_src_mnt = source_mnt; +- LIST_HEAD(tmp_list); ++ HLIST_HEAD(tmp_list); + + for (m = propagation_next(dest_mnt, dest_mnt); m; + m = propagation_next(m, dest_mnt)) { +@@ -246,27 +246,29 @@ int propagate_mnt(struct mount *dest_mnt + child = copy_tree(source, source->mnt.mnt_root, type); + if (IS_ERR(child)) { + ret = PTR_ERR(child); +- list_splice(tree_list, tmp_list.prev); ++ tmp_list = *tree_list; ++ tmp_list.first->pprev = &tmp_list.first; ++ INIT_HLIST_HEAD(tree_list); + goto out; + } + + if (is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) { + mnt_set_mountpoint(m, dest_mp, child); +- list_add_tail(&child->mnt_hash, tree_list); ++ hlist_add_head(&child->mnt_hash, tree_list); + } else { + /* + * This can happen if the parent mount was bind mounted + * on some subdirectory of a shared/slave mount. + */ +- list_add_tail(&child->mnt_hash, &tmp_list); ++ hlist_add_head(&child->mnt_hash, &tmp_list); + } + prev_dest_mnt = m; + prev_src_mnt = child; + } + out: + lock_mount_hash(); +- while (!list_empty(&tmp_list)) { +- child = list_first_entry(&tmp_list, struct mount, mnt_hash); ++ while (!hlist_empty(&tmp_list)) { ++ child = hlist_entry(tmp_list.first, struct mount, mnt_hash); + umount_tree(child, 0); + } + unlock_mount_hash(); +@@ -338,8 +340,10 @@ static void __propagate_umount(struct mo + * umount the child only if the child has no + * other children + */ +- if (child && list_empty(&child->mnt_mounts)) +- list_move_tail(&child->mnt_hash, &mnt->mnt_hash); ++ if (child && list_empty(&child->mnt_mounts)) { ++ hlist_del_init_rcu(&child->mnt_hash); ++ hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash); ++ } + } + } + +@@ -350,11 +354,11 @@ static void __propagate_umount(struct mo + * + * vfsmount lock must be held for write + */ +-int propagate_umount(struct list_head *list) ++int propagate_umount(struct hlist_head *list) + { + struct mount *mnt; + +- list_for_each_entry(mnt, list, mnt_hash) ++ hlist_for_each_entry(mnt, list, mnt_hash) + __propagate_umount(mnt); + return 0; + } +--- a/fs/pnode.h ++++ b/fs/pnode.h +@@ -36,8 +36,8 @@ static inline void set_mnt_shared(struct + + void change_mnt_propagation(struct mount *, int); + int propagate_mnt(struct mount *, struct mountpoint *, struct mount *, +- struct list_head *); +-int propagate_umount(struct list_head *); ++ struct hlist_head *); ++int propagate_umount(struct hlist_head *); + int propagate_mount_busy(struct mount *, int); + void mnt_release_group_id(struct mount *); + int get_dominating_id(struct mount *mnt, const struct path *root);
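
Editor's note: the commit message of the final patch above ("switch mnt_hash to hlist") rests on the observation that a NULL-terminated hlist chain self-terminates even if an RCU walker strays onto another chain after an element move, whereas a cyclic list walk that only stops on returning to the original head can spin forever. The following is a minimal standalone sketch of that property, not kernel code; all structures and names here are invented for illustration and the example is single-threaded, standing in for the concurrent RCU case.

    /*
     * Standalone illustration of why a NULL-terminated chain walk is safe
     * across an element move while a cyclic-list walk keyed on "back at the
     * original head" is not.  Not kernel code; names are made up.
     */
    #include <stdio.h>
    #include <stddef.h>

    struct node {
            int val;
            struct node *next;      /* hlist-style: last node points to NULL */
    };

    /* Walk from 'start' until the NULL terminator; always finishes. */
    static int walk_null_terminated(struct node *start)
    {
            int steps = 0;

            for (struct node *p = start; p; p = p->next)
                    steps++;
            return steps;
    }

    int main(void)
    {
            /* Two hash chains, each ending in NULL. */
            struct node b2 = { .val = 22, .next = NULL };
            struct node b1 = { .val = 21, .next = &b2  };
            struct node a2 = { .val = 12, .next = NULL };
            struct node a1 = { .val = 11, .next = &a2  };

            /*
             * Move a2 from chain A to the head of chain B, as an element move
             * between hash chains would do.  A walker that was standing on a2
             * simply continues down chain B and still reaches NULL.
             */
            a1.next = NULL;         /* unlink a2 from chain A ...            */
            a2.next = &b1;          /* ... and splice it onto chain B        */

            printf("steps from moved node: %d\n", walk_null_terminated(&a2));

            /*
             * The cyclic-list equivalent,
             *     for (p = old_head->next; p != old_head; p = p->next)
             * never exits after the move, because old_head is no longer on
             * the chain the walker ended up on.  That is the RCU lookup bug
             * the hlist conversion in the patch above avoids.
             */
            return 0;
    }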