From: Greg Kroah-Hartman Date: Thu, 3 Nov 2011 19:05:08 +0000 (-0700) Subject: 3.0 patches X-Git-Tag: v3.0.9~35 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=612d75393513d20c8e0530e06dfef17913b797e7;p=thirdparty%2Fkernel%2Fstable-queue.git 3.0 patches --- diff --git a/queue-3.0/binfmt_elf-fix-pie-execution-with-randomization-disabled.patch b/queue-3.0/binfmt_elf-fix-pie-execution-with-randomization-disabled.patch new file mode 100644 index 00000000000..0f6f370b109 --- /dev/null +++ b/queue-3.0/binfmt_elf-fix-pie-execution-with-randomization-disabled.patch @@ -0,0 +1,55 @@ +From a3defbe5c337dbc6da911f8cc49ae3cc3b49b453 Mon Sep 17 00:00:00 2001 +From: Jiri Kosina +Date: Wed, 2 Nov 2011 13:37:41 -0700 +Subject: binfmt_elf: fix PIE execution with randomization disabled + +From: Jiri Kosina + +commit a3defbe5c337dbc6da911f8cc49ae3cc3b49b453 upstream. + +The case of address space randomization being disabled in runtime through +randomize_va_space sysctl is not treated properly in load_elf_binary(), +resulting in SIGKILL coming at exec() time for certain PIE-linked binaries +in case the randomization has been disabled at runtime prior to calling +exec(). + +Handle the randomize_va_space == 0 case the same way as if we were not +supporting .text randomization at all. + +Based on original patch by H.J. Lu and Josh Boyer. + +Signed-off-by: Jiri Kosina +Cc: Ingo Molnar +Cc: Russell King +Cc: H.J. Lu +Cc: +Tested-by: Josh Boyer +Acked-by: Nicolas Pitre +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/binfmt_elf.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +--- a/fs/binfmt_elf.c ++++ b/fs/binfmt_elf.c +@@ -796,7 +796,16 @@ static int load_elf_binary(struct linux_ + * might try to exec. This is because the brk will + * follow the loader, and is not movable. */ + #if defined(CONFIG_X86) || defined(CONFIG_ARM) +- load_bias = 0; ++ /* Memory randomization might have been switched off ++ * in runtime via sysctl. ++ * If that is the case, retain the original non-zero ++ * load_bias value in order to establish proper ++ * non-randomized mappings. ++ */ ++ if (current->flags & PF_RANDOMIZE) ++ load_bias = 0; ++ else ++ load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); + #else + load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); + #endif diff --git a/queue-3.0/drivers-net-rionet.c-fix-ethernet-address-macros-for-le-platforms.patch b/queue-3.0/drivers-net-rionet.c-fix-ethernet-address-macros-for-le-platforms.patch new file mode 100644 index 00000000000..56b713de449 --- /dev/null +++ b/queue-3.0/drivers-net-rionet.c-fix-ethernet-address-macros-for-le-platforms.patch @@ -0,0 +1,37 @@ +From e0c87bd95e8dad455c23bc56513af8dcb1737e55 Mon Sep 17 00:00:00 2001 +From: Alexandre Bounine +Date: Wed, 2 Nov 2011 13:39:15 -0700 +Subject: drivers/net/rionet.c: fix ethernet address macros for LE platforms + +From: Alexandre Bounine + +commit e0c87bd95e8dad455c23bc56513af8dcb1737e55 upstream. 
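A minimal userspace sketch of the byte-order pitfall addressed by this change, using an illustrative 6-byte address rather than the driver's real data: the old test compared the first 32-bit word of the MAC against 0x00010001, which only matches the 00:01:00:01 prefix on big-endian CPUs, while a byte-wise memcmp() and explicit assembly of the destination ID behave identically on both.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative only: models the old and new match logic on a fabricated
 * 6-byte address; this is not the driver's actual code path. */
int main(void)
{
	const uint8_t mac[6] = { 0x00, 0x01, 0x00, 0x01, 0x12, 0x34 };
	uint32_t word;

	memcpy(&word, mac, sizeof(word));	/* also avoids the unaligned cast */

	/* Old test: the numeric value of the first word depends on CPU byte
	 * order -- 0x00010001 on big-endian, 0x01000100 on little-endian. */
	printf("u32 compare matches: %d\n", word == 0x00010001);

	/* New test: byte-wise comparison is byte-order independent. */
	printf("memcmp matches:      %d\n", !memcmp(mac, "\x00\x01\x00\x01", 4));

	/* Destination ID rebuilt explicitly from bytes 4 and 5 (big-endian). */
	printf("destid: 0x%04x\n", (mac[4] << 8) | mac[5]);
	return 0;
}
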
+ +Modify Ethernet addess macros to be compatible with BE/LE platforms + +Signed-off-by: Alexandre Bounine +Cc: Chul Kim +Cc: Kumar Gala +Cc: Matt Porter +Cc: Li Yang +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/net/rionet.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/net/rionet.c ++++ b/drivers/net/rionet.c +@@ -88,8 +88,8 @@ static struct rio_dev **rionet_active; + #define dev_rionet_capable(dev) \ + is_rionet_capable(dev->src_ops, dev->dst_ops) + +-#define RIONET_MAC_MATCH(x) (*(u32 *)x == 0x00010001) +-#define RIONET_GET_DESTID(x) (*(u16 *)(x + 4)) ++#define RIONET_MAC_MATCH(x) (!memcmp((x), "\00\01\00\01", 4)) ++#define RIONET_GET_DESTID(x) ((*((u8 *)x + 4) << 8) | *((u8 *)x + 5)) + + static int rionet_rx_clean(struct net_device *ndev) + { diff --git a/queue-3.0/iwlagn-do-not-use-interruptible-waits.patch b/queue-3.0/iwlagn-do-not-use-interruptible-waits.patch new file mode 100644 index 00000000000..2ed3b4c20a7 --- /dev/null +++ b/queue-3.0/iwlagn-do-not-use-interruptible-waits.patch @@ -0,0 +1,130 @@ +From johannes@sipsolutions.net Thu Nov 3 10:55:59 2011 +From: Johannes Berg +Date: Thu, 03 Nov 2011 13:46:08 +0100 +Subject: iwlagn: do not use interruptible waits +To: stable@vger.kernel.org +Message-ID: <1320324368.3950.44.camel@jlt3.sipsolutions.net> + + +From: Johannes Berg + +Upstream commit effd4d9aece9184f526e6556786a94d335e38b71. + +Since the dawn of its time, iwlwifi has used +interruptible waits to wait for synchronous +commands and firmware loading. + +This leads to "interesting" bugs, because it +can't actually handle the interruptions; for +example when a command sending is interrupted +it will assume the command completed fully, +and then leave it pending, which leads to all +kinds of trouble when the command finishes +later. + +Since there's no easy way to gracefully deal +with interruptions, fix the driver to not use +interruptible waits. + +This at least fixes the error +iwlagn 0000:02:00.0: Error: Response NULL in 'REPLY_SCAN_ABORT_CMD' + +I have seen in P2P testing, but it is likely +that there are other errors caused by this. + +Cc: Stanislaw Gruszka +Signed-off-by: Johannes Berg +Signed-off-by: Wey-Yi Guy +Signed-off-by: John W. 
Linville +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/wireless/iwlwifi/iwl-agn-ucode.c | 9 ++------- + drivers/net/wireless/iwlwifi/iwl-agn.c | 2 +- + drivers/net/wireless/iwlwifi/iwl-core.c | 4 ++-- + drivers/net/wireless/iwlwifi/iwl-hcmd.c | 2 +- + drivers/net/wireless/iwlwifi/iwl-rx.c | 2 +- + drivers/net/wireless/iwlwifi/iwl-tx.c | 2 +- + 6 files changed, 8 insertions(+), 13 deletions(-) + +--- a/drivers/net/wireless/iwlwifi/iwl-agn-ucode.c ++++ b/drivers/net/wireless/iwlwifi/iwl-agn-ucode.c +@@ -144,13 +144,8 @@ static int iwlagn_load_section(struct iw + FH_TCSR_TX_CONFIG_REG_VAL_CIRQ_HOST_ENDTFD); + + IWL_DEBUG_INFO(priv, "%s uCode section being loaded...\n", name); +- ret = wait_event_interruptible_timeout(priv->wait_command_queue, +- priv->ucode_write_complete, 5 * HZ); +- if (ret == -ERESTARTSYS) { +- IWL_ERR(priv, "Could not load the %s uCode section due " +- "to interrupt\n", name); +- return ret; +- } ++ ret = wait_event_timeout(priv->wait_command_queue, ++ priv->ucode_write_complete, 5 * HZ); + if (!ret) { + IWL_ERR(priv, "Could not load the %s uCode section\n", + name); +--- a/drivers/net/wireless/iwlwifi/iwl-agn.c ++++ b/drivers/net/wireless/iwlwifi/iwl-agn.c +@@ -797,7 +797,7 @@ static void iwl_irq_tasklet(struct iwl_p + handled |= CSR_INT_BIT_FH_TX; + /* Wake up uCode load routine, now that load is complete */ + priv->ucode_write_complete = 1; +- wake_up_interruptible(&priv->wait_command_queue); ++ wake_up(&priv->wait_command_queue); + } + + if (inta & ~handled) { +--- a/drivers/net/wireless/iwlwifi/iwl-core.c ++++ b/drivers/net/wireless/iwlwifi/iwl-core.c +@@ -899,7 +899,7 @@ void iwlagn_fw_error(struct iwl_priv *pr + * commands by clearing the ready bit */ + clear_bit(STATUS_READY, &priv->status); + +- wake_up_interruptible(&priv->wait_command_queue); ++ wake_up(&priv->wait_command_queue); + + if (!ondemand) { + /* +@@ -950,7 +950,7 @@ void iwl_irq_handle_error(struct iwl_pri + */ + clear_bit(STATUS_READY, &priv->status); + clear_bit(STATUS_HCMD_ACTIVE, &priv->status); +- wake_up_interruptible(&priv->wait_command_queue); ++ wake_up(&priv->wait_command_queue); + IWL_ERR(priv, "RF is used by WiMAX\n"); + return; + } +--- a/drivers/net/wireless/iwlwifi/iwl-hcmd.c ++++ b/drivers/net/wireless/iwlwifi/iwl-hcmd.c +@@ -194,7 +194,7 @@ int iwl_send_cmd_sync(struct iwl_priv *p + return ret; + } + +- ret = wait_event_interruptible_timeout(priv->wait_command_queue, ++ ret = wait_event_timeout(priv->wait_command_queue, + !test_bit(STATUS_HCMD_ACTIVE, &priv->status), + HOST_COMPLETE_TIMEOUT); + if (!ret) { +--- a/drivers/net/wireless/iwlwifi/iwl-rx.c ++++ b/drivers/net/wireless/iwlwifi/iwl-rx.c +@@ -738,7 +738,7 @@ static void iwl_rx_card_state_notif(stru + wiphy_rfkill_set_hw_state(priv->hw->wiphy, + test_bit(STATUS_RF_KILL_HW, &priv->status)); + else +- wake_up_interruptible(&priv->wait_command_queue); ++ wake_up(&priv->wait_command_queue); + } + + static void iwl_rx_missed_beacon_notif(struct iwl_priv *priv, +--- a/drivers/net/wireless/iwlwifi/iwl-tx.c ++++ b/drivers/net/wireless/iwlwifi/iwl-tx.c +@@ -821,7 +821,7 @@ void iwl_tx_cmd_complete(struct iwl_priv + clear_bit(STATUS_HCMD_ACTIVE, &priv->status); + IWL_DEBUG_INFO(priv, "Clearing HCMD_ACTIVE for command %s\n", + get_cmd_string(cmd->hdr.cmd)); +- wake_up_interruptible(&priv->wait_command_queue); ++ wake_up(&priv->wait_command_queue); + } + + /* Mark as unmapped */ diff --git a/queue-3.0/mm-thp-tail-page-refcounting-fix.patch b/queue-3.0/mm-thp-tail-page-refcounting-fix.patch new file mode 100644 index 
00000000000..983badc0f79 --- /dev/null +++ b/queue-3.0/mm-thp-tail-page-refcounting-fix.patch @@ -0,0 +1,492 @@ +From 70b50f94f1644e2aa7cb374819cfd93f3c28d725 Mon Sep 17 00:00:00 2001 +From: Andrea Arcangeli +Date: Wed, 2 Nov 2011 13:36:59 -0700 +Subject: mm: thp: tail page refcounting fix + +From: Andrea Arcangeli + +commit 70b50f94f1644e2aa7cb374819cfd93f3c28d725 upstream. + +Michel while working on the working set estimation code, noticed that +calling get_page_unless_zero() on a random pfn_to_page(random_pfn) +wasn't safe, if the pfn ended up being a tail page of a transparent +hugepage under splitting by __split_huge_page_refcount(). + +He then found the problem could also theoretically materialize with +page_cache_get_speculative() during the speculative radix tree lookups +that uses get_page_unless_zero() in SMP if the radix tree page is freed +and reallocated and get_user_pages is called on it before +page_cache_get_speculative has a chance to call get_page_unless_zero(). + +So the best way to fix the problem is to keep page_tail->_count zero at +all times. This will guarantee that get_page_unless_zero() can never +succeed on any tail page. page_tail->_mapcount is guaranteed zero and +is unused for all tail pages of a compound page, so we can simply +account the tail page references there and transfer them to +tail_page->_count in __split_huge_page_refcount() (in addition to the +head_page->_mapcount). + +While debugging this s/_count/_mapcount/ change I also noticed get_page is +called by direct-io.c on pages returned by get_user_pages. That wasn't +entirely safe because the two atomic_inc in get_page weren't atomic. As +opposed to other get_user_page users like secondary-MMU page fault to +establish the shadow pagetables would never call any superflous get_page +after get_user_page returns. It's safer to make get_page universally safe +for tail pages and to use get_page_foll() within follow_page (inside +get_user_pages()). get_page_foll() is safe to do the refcounting for tail +pages without taking any locks because it is run within PT lock protected +critical sections (PT lock for pte and page_table_lock for +pmd_trans_huge). + +The standard get_page() as invoked by direct-io instead will now take +the compound_lock but still only for tail pages. The direct-io paths +are usually I/O bound and the compound_lock is per THP so very +finegrined, so there's no risk of scalability issues with it. A simple +direct-io benchmarks with all lockdep prove locking and spinlock +debugging infrastructure enabled shows identical performance and no +overhead. So it's worth it. Ideally direct-io should stop calling +get_page() on pages returned by get_user_pages(). The spinlock in +get_page() is already optimized away for no-THP builds but doing +get_page() on tail pages returned by GUP is generally a rare operation +and usually only run in I/O paths. + +This new refcounting on page_tail->_mapcount in addition to avoiding new +RCU critical sections will also allow the working set estimation code to +work without any further complexity associated to the tail page +refcounting with THP. 
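The invariant the patch builds on can be shown with a small userspace model (not kernel code; fake_page and get_unless_zero() are stand-ins for page->_count, page->_mapcount and get_page_unless_zero()): an atomic inc-not-zero can never succeed on a counter that is held at zero, so parking tail-page pins in _mapcount instead of _count makes speculative lookups on tail pages fail by construction.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for struct page's two counters; illustrative only. */
struct fake_page {
	atomic_int count;	/* head page: real refcount; tail page: kept at 0  */
	atomic_int mapcount;	/* tail page: where the GUP-style pins are parked  */
};

/* Models get_page_unless_zero(): take a reference only if at least one
 * already exists (atomic inc-not-zero). */
static bool get_unless_zero(struct fake_page *p)
{
	int old = atomic_load(&p->count);
	while (old != 0)
		if (atomic_compare_exchange_weak(&p->count, &old, old + 1))
			return true;
	return false;		/* count was (or became) zero: refuse the pin */
}

int main(void)
{
	struct fake_page head = { 1, -1 };	/* one real reference held   */
	struct fake_page tail = { 0, -1 };	/* _count never leaves zero  */

	printf("speculative pin on head: %d\n", get_unless_zero(&head)); /* 1 */
	printf("speculative pin on tail: %d\n", get_unless_zero(&tail)); /* 0 */

	/* A pin taken through follow_page()/GUP on the tail goes to the tail's
	 * mapcount plus the head's count, never to the tail's count, so the
	 * speculative pin above keeps failing. */
	atomic_fetch_add(&tail.mapcount, 1);
	atomic_fetch_add(&head.count, 1);
	printf("tail count still zero:   %d\n", atomic_load(&tail.count) == 0);
	return 0;
}
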
+ +Signed-off-by: Andrea Arcangeli +Reported-by: Michel Lespinasse +Reviewed-by: Michel Lespinasse +Reviewed-by: Minchan Kim +Cc: Peter Zijlstra +Cc: Hugh Dickins +Cc: Johannes Weiner +Cc: Rik van Riel +Cc: Mel Gorman +Cc: KOSAKI Motohiro +Cc: Benjamin Herrenschmidt +Cc: David Gibson +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + arch/powerpc/mm/gup.c | 5 +- + arch/x86/mm/gup.c | 5 +- + include/linux/mm.h | 56 ++++++++++++------------------- + include/linux/mm_types.h | 22 ++++++++++-- + mm/huge_memory.c | 37 ++++++++++++++------ + mm/internal.h | 46 ++++++++++++++++++++++++++ + mm/memory.c | 2 - + mm/swap.c | 83 ++++++++++++++++++++++++++++++----------------- + 8 files changed, 172 insertions(+), 84 deletions(-) + +--- a/arch/powerpc/mm/gup.c ++++ b/arch/powerpc/mm/gup.c +@@ -22,8 +22,9 @@ static inline void get_huge_page_tail(st + * __split_huge_page_refcount() cannot run + * from under us. + */ +- VM_BUG_ON(atomic_read(&page->_count) < 0); +- atomic_inc(&page->_count); ++ VM_BUG_ON(page_mapcount(page) < 0); ++ VM_BUG_ON(atomic_read(&page->_count) != 0); ++ atomic_inc(&page->_mapcount); + } + + /* +--- a/arch/x86/mm/gup.c ++++ b/arch/x86/mm/gup.c +@@ -114,8 +114,9 @@ static inline void get_huge_page_tail(st + * __split_huge_page_refcount() cannot run + * from under us. + */ +- VM_BUG_ON(atomic_read(&page->_count) < 0); +- atomic_inc(&page->_count); ++ VM_BUG_ON(page_mapcount(page) < 0); ++ VM_BUG_ON(atomic_read(&page->_count) != 0); ++ atomic_inc(&page->_mapcount); + } + + static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -355,36 +355,39 @@ static inline struct page *compound_head + return page; + } + ++/* ++ * The atomic page->_mapcount, starts from -1: so that transitions ++ * both from it and to it can be tracked, using atomic_inc_and_test ++ * and atomic_add_negative(-1). ++ */ ++static inline void reset_page_mapcount(struct page *page) ++{ ++ atomic_set(&(page)->_mapcount, -1); ++} ++ ++static inline int page_mapcount(struct page *page) ++{ ++ return atomic_read(&(page)->_mapcount) + 1; ++} ++ + static inline int page_count(struct page *page) + { + return atomic_read(&compound_head(page)->_count); + } + ++extern bool __get_page_tail(struct page *page); ++ + static inline void get_page(struct page *page) + { ++ if (unlikely(PageTail(page))) ++ if (likely(__get_page_tail(page))) ++ return; + /* + * Getting a normal page or the head of a compound page +- * requires to already have an elevated page->_count. Only if +- * we're getting a tail page, the elevated page->_count is +- * required only in the head page, so for tail pages the +- * bugcheck only verifies that the page->_count isn't +- * negative. ++ * requires to already have an elevated page->_count. + */ +- VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page)); ++ VM_BUG_ON(atomic_read(&page->_count) <= 0); + atomic_inc(&page->_count); +- /* +- * Getting a tail page will elevate both the head and tail +- * page->_count(s). +- */ +- if (unlikely(PageTail(page))) { +- /* +- * This is safe only because +- * __split_huge_page_refcount can't run under +- * get_page(). 
+- */ +- VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); +- atomic_inc(&page->first_page->_count); +- } + } + + static inline struct page *virt_to_head_page(const void *x) +@@ -803,21 +806,6 @@ static inline pgoff_t page_index(struct + } + + /* +- * The atomic page->_mapcount, like _count, starts from -1: +- * so that transitions both from it and to it can be tracked, +- * using atomic_inc_and_test and atomic_add_negative(-1). +- */ +-static inline void reset_page_mapcount(struct page *page) +-{ +- atomic_set(&(page)->_mapcount, -1); +-} +- +-static inline int page_mapcount(struct page *page) +-{ +- return atomic_read(&(page)->_mapcount) + 1; +-} +- +-/* + * Return true if this page is mapped into pagetables. + */ + static inline int page_mapped(struct page *page) +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -36,10 +36,24 @@ struct page { + * updated asynchronously */ + atomic_t _count; /* Usage count, see below. */ + union { +- atomic_t _mapcount; /* Count of ptes mapped in mms, +- * to show when page is mapped +- * & limit reverse map searches. +- */ ++ /* ++ * Count of ptes mapped in ++ * mms, to show when page is ++ * mapped & limit reverse map ++ * searches. ++ * ++ * Used also for tail pages ++ * refcounting instead of ++ * _count. Tail pages cannot ++ * be mapped and keeping the ++ * tail page _count zero at ++ * all times guarantees ++ * get_page_unless_zero() will ++ * never succeed on tail ++ * pages. ++ */ ++ atomic_t _mapcount; ++ + struct { /* SLUB */ + u16 inuse; + u16 objects; +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -989,7 +989,7 @@ struct page *follow_trans_huge_pmd(struc + page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; + VM_BUG_ON(!PageCompound(page)); + if (flags & FOLL_GET) +- get_page(page); ++ get_page_foll(page); + + out: + return page; +@@ -1156,6 +1156,7 @@ static void __split_huge_page_refcount(s + unsigned long head_index = page->index; + struct zone *zone = page_zone(page); + int zonestat; ++ int tail_count = 0; + + /* prevent PageLRU to go away from under us, and freeze lru stats */ + spin_lock_irq(&zone->lru_lock); +@@ -1164,11 +1165,27 @@ static void __split_huge_page_refcount(s + for (i = 1; i < HPAGE_PMD_NR; i++) { + struct page *page_tail = page + i; + +- /* tail_page->_count cannot change */ +- atomic_sub(atomic_read(&page_tail->_count), &page->_count); +- BUG_ON(page_count(page) <= 0); +- atomic_add(page_mapcount(page) + 1, &page_tail->_count); +- BUG_ON(atomic_read(&page_tail->_count) <= 0); ++ /* tail_page->_mapcount cannot change */ ++ BUG_ON(page_mapcount(page_tail) < 0); ++ tail_count += page_mapcount(page_tail); ++ /* check for overflow */ ++ BUG_ON(tail_count < 0); ++ BUG_ON(atomic_read(&page_tail->_count) != 0); ++ /* ++ * tail_page->_count is zero and not changing from ++ * under us. But get_page_unless_zero() may be running ++ * from under us on the tail_page. If we used ++ * atomic_set() below instead of atomic_add(), we ++ * would then run atomic_set() concurrently with ++ * get_page_unless_zero(), and atomic_set() is ++ * implemented in C not using locked ops. spin_unlock ++ * on x86 sometime uses locked ops because of PPro ++ * errata 66, 92, so unless somebody can guarantee ++ * atomic_set() here would be safe on all archs (and ++ * not only on x86), it's safer to use atomic_add(). 
++ */ ++ atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1, ++ &page_tail->_count); + + /* after clearing PageTail the gup refcount can be released */ + smp_mb(); +@@ -1186,10 +1203,7 @@ static void __split_huge_page_refcount(s + (1L << PG_uptodate))); + page_tail->flags |= (1L << PG_dirty); + +- /* +- * 1) clear PageTail before overwriting first_page +- * 2) clear PageTail before clearing PageHead for VM_BUG_ON +- */ ++ /* clear PageTail before overwriting first_page */ + smp_wmb(); + + /* +@@ -1206,7 +1220,6 @@ static void __split_huge_page_refcount(s + * status is achieved setting a reserved bit in the + * pmd, not by clearing the present bit. + */ +- BUG_ON(page_mapcount(page_tail)); + page_tail->_mapcount = page->_mapcount; + + BUG_ON(page_tail->mapping); +@@ -1223,6 +1236,8 @@ static void __split_huge_page_refcount(s + + lru_add_page_tail(zone, page, page_tail); + } ++ atomic_sub(tail_count, &page->_count); ++ BUG_ON(atomic_read(&page->_count) <= 0); + + __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -37,6 +37,52 @@ static inline void __put_page(struct pag + atomic_dec(&page->_count); + } + ++static inline void __get_page_tail_foll(struct page *page, ++ bool get_page_head) ++{ ++ /* ++ * If we're getting a tail page, the elevated page->_count is ++ * required only in the head page and we will elevate the head ++ * page->_count and tail page->_mapcount. ++ * ++ * We elevate page_tail->_mapcount for tail pages to force ++ * page_tail->_count to be zero at all times to avoid getting ++ * false positives from get_page_unless_zero() with ++ * speculative page access (like in ++ * page_cache_get_speculative()) on tail pages. ++ */ ++ VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); ++ VM_BUG_ON(atomic_read(&page->_count) != 0); ++ VM_BUG_ON(page_mapcount(page) < 0); ++ if (get_page_head) ++ atomic_inc(&page->first_page->_count); ++ atomic_inc(&page->_mapcount); ++} ++ ++/* ++ * This is meant to be called as the FOLL_GET operation of ++ * follow_page() and it must be called while holding the proper PT ++ * lock while the pte (or pmd_trans_huge) is still mapping the page. ++ */ ++static inline void get_page_foll(struct page *page) ++{ ++ if (unlikely(PageTail(page))) ++ /* ++ * This is safe only because ++ * __split_huge_page_refcount() can't run under ++ * get_page_foll() because we hold the proper PT lock. ++ */ ++ __get_page_tail_foll(page, true); ++ else { ++ /* ++ * Getting a normal page or the head of a compound page ++ * requires to already have an elevated page->_count. ++ */ ++ VM_BUG_ON(atomic_read(&page->_count) <= 0); ++ atomic_inc(&page->_count); ++ } ++} ++ + extern unsigned long highest_memmap_pfn; + + /* +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -1514,7 +1514,7 @@ split_fallthrough: + } + + if (flags & FOLL_GET) +- get_page(page); ++ get_page_foll(page); + if (flags & FOLL_TOUCH) { + if ((flags & FOLL_WRITE) && + !pte_dirty(pte) && !PageDirty(page)) +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -78,39 +78,22 @@ static void put_compound_page(struct pag + { + if (unlikely(PageTail(page))) { + /* __split_huge_page_refcount can run under us */ +- struct page *page_head = page->first_page; +- smp_rmb(); +- /* +- * If PageTail is still set after smp_rmb() we can be sure +- * that the page->first_page we read wasn't a dangling pointer. +- * See __split_huge_page_refcount() smp_wmb(). 
+- */ +- if (likely(PageTail(page) && get_page_unless_zero(page_head))) { ++ struct page *page_head = compound_trans_head(page); ++ ++ if (likely(page != page_head && ++ get_page_unless_zero(page_head))) { + unsigned long flags; + /* +- * Verify that our page_head wasn't converted +- * to a a regular page before we got a +- * reference on it. ++ * page_head wasn't a dangling pointer but it ++ * may not be a head page anymore by the time ++ * we obtain the lock. That is ok as long as it ++ * can't be freed from under us. + */ +- if (unlikely(!PageHead(page_head))) { +- /* PageHead is cleared after PageTail */ +- smp_rmb(); +- VM_BUG_ON(PageTail(page)); +- goto out_put_head; +- } +- /* +- * Only run compound_lock on a valid PageHead, +- * after having it pinned with +- * get_page_unless_zero() above. +- */ +- smp_mb(); +- /* page_head wasn't a dangling pointer */ + flags = compound_lock_irqsave(page_head); + if (unlikely(!PageTail(page))) { + /* __split_huge_page_refcount run before us */ + compound_unlock_irqrestore(page_head, flags); + VM_BUG_ON(PageHead(page_head)); +- out_put_head: + if (put_page_testzero(page_head)) + __put_single_page(page_head); + out_put_single: +@@ -121,16 +104,17 @@ static void put_compound_page(struct pag + VM_BUG_ON(page_head != page->first_page); + /* + * We can release the refcount taken by +- * get_page_unless_zero now that +- * split_huge_page_refcount is blocked on the +- * compound_lock. ++ * get_page_unless_zero() now that ++ * __split_huge_page_refcount() is blocked on ++ * the compound_lock. + */ + if (put_page_testzero(page_head)) + VM_BUG_ON(1); + /* __split_huge_page_refcount will wait now */ +- VM_BUG_ON(atomic_read(&page->_count) <= 0); +- atomic_dec(&page->_count); ++ VM_BUG_ON(page_mapcount(page) <= 0); ++ atomic_dec(&page->_mapcount); + VM_BUG_ON(atomic_read(&page_head->_count) <= 0); ++ VM_BUG_ON(atomic_read(&page->_count) != 0); + compound_unlock_irqrestore(page_head, flags); + if (put_page_testzero(page_head)) { + if (PageHead(page_head)) +@@ -160,6 +144,45 @@ void put_page(struct page *page) + } + EXPORT_SYMBOL(put_page); + ++/* ++ * This function is exported but must not be called by anything other ++ * than get_page(). It implements the slow path of get_page(). ++ */ ++bool __get_page_tail(struct page *page) ++{ ++ /* ++ * This takes care of get_page() if run on a tail page ++ * returned by one of the get_user_pages/follow_page variants. ++ * get_user_pages/follow_page itself doesn't need the compound ++ * lock because it runs __get_page_tail_foll() under the ++ * proper PT lock that already serializes against ++ * split_huge_page(). ++ */ ++ unsigned long flags; ++ bool got = false; ++ struct page *page_head = compound_trans_head(page); ++ ++ if (likely(page != page_head && get_page_unless_zero(page_head))) { ++ /* ++ * page_head wasn't a dangling pointer but it ++ * may not be a head page anymore by the time ++ * we obtain the lock. That is ok as long as it ++ * can't be freed from under us. 
++ */ ++ flags = compound_lock_irqsave(page_head); ++ /* here __split_huge_page_refcount won't run anymore */ ++ if (likely(PageTail(page))) { ++ __get_page_tail_foll(page, false); ++ got = true; ++ } ++ compound_unlock_irqrestore(page_head, flags); ++ if (unlikely(!got)) ++ put_page(page_head); ++ } ++ return got; ++} ++EXPORT_SYMBOL(__get_page_tail); ++ + /** + * put_pages_list() - release a list of pages + * @pages: list of pages threaded on page->lru diff --git a/queue-3.0/proc-fix-races-against-execve-of-proc-pid-fd.patch b/queue-3.0/proc-fix-races-against-execve-of-proc-pid-fd.patch new file mode 100644 index 00000000000..20688cf6371 --- /dev/null +++ b/queue-3.0/proc-fix-races-against-execve-of-proc-pid-fd.patch @@ -0,0 +1,261 @@ +From aa6afca5bcaba8101f3ea09d5c3e4100b2b9f0e5 Mon Sep 17 00:00:00 2001 +From: Vasiliy Kulikov +Date: Wed, 2 Nov 2011 13:38:44 -0700 +Subject: proc: fix races against execve() of /proc/PID/fd** + +From: Vasiliy Kulikov + +commit aa6afca5bcaba8101f3ea09d5c3e4100b2b9f0e5 upstream. + +fd* files are restricted to the task's owner, and other users may not get +direct access to them. But one may open any of these files and run any +setuid program, keeping opened file descriptors. As there are permission +checks on open(), but not on readdir() and read(), operations on the kept +file descriptors will not be checked. It makes it possible to violate +procfs permission model. + +Reading fdinfo/* may disclosure current fds' position and flags, reading +directory contents of fdinfo/ and fd/ may disclosure the number of opened +files by the target task. This information is not sensible per se, but it +can reveal some private information (like length of a password stored in a +file) under certain conditions. + +Used existing (un)lock_trace functions to check for ptrace_may_access(), +but instead of using EPERM return code from it use EACCES to be consistent +with existing proc_pid_follow_link()/proc_pid_readlink() return code. If +they differ, attacker can guess what fds exist by analyzing stat() return +code. Patched handlers: stat() for fd/*, stat() and read() for fdindo/*, +readdir() and lookup() for fd/ and fdinfo/. 
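A short userspace sketch of the information at stake (the file path is only an example): reading /proc/<pid>/fdinfo/<fd> returns the descriptor's current position and flags, which is exactly what the patched handlers now refuse with EACCES unless ptrace_may_access() allows it.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Open an arbitrary file (path is illustrative) and move its offset
	 * so the disclosed position is visible. */
	int fd = open("/etc/hostname", O_RDONLY);
	if (fd < 0) { perror("open"); return 1; }
	lseek(fd, 3, SEEK_SET);

	/* Read back our own fdinfo entry: the per-fd position/flags data the
	 * patch restricts when the target task is not ours to trace. */
	char path[64], buf[256];
	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd);
	FILE *f = fopen(path, "r");
	if (!f) { perror("fopen"); return 1; }
	size_t n = fread(buf, 1, sizeof(buf) - 1, f);
	buf[n] = '\0';
	fputs(buf, stdout);	/* typically "pos:\t3" and "flags:\t0100000" */
	fclose(f);
	close(fd);
	return 0;
}
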
+ +Signed-off-by: Vasiliy Kulikov +Cc: Cyrill Gorcunov +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/proc/base.c | 146 ++++++++++++++++++++++++++++++++++++++++----------------- + 1 file changed, 103 insertions(+), 43 deletions(-) + +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -1666,12 +1666,46 @@ out: + return error; + } + ++static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry, ++ struct kstat *stat) ++{ ++ struct inode *inode = dentry->d_inode; ++ struct task_struct *task = get_proc_task(inode); ++ int rc; ++ ++ if (task == NULL) ++ return -ESRCH; ++ ++ rc = -EACCES; ++ if (lock_trace(task)) ++ goto out_task; ++ ++ generic_fillattr(inode, stat); ++ unlock_trace(task); ++ rc = 0; ++out_task: ++ put_task_struct(task); ++ return rc; ++} ++ + static const struct inode_operations proc_pid_link_inode_operations = { + .readlink = proc_pid_readlink, + .follow_link = proc_pid_follow_link, + .setattr = proc_setattr, + }; + ++static const struct inode_operations proc_fdinfo_link_inode_operations = { ++ .setattr = proc_setattr, ++ .getattr = proc_pid_fd_link_getattr, ++}; ++ ++static const struct inode_operations proc_fd_link_inode_operations = { ++ .readlink = proc_pid_readlink, ++ .follow_link = proc_pid_follow_link, ++ .setattr = proc_setattr, ++ .getattr = proc_pid_fd_link_getattr, ++}; ++ + + /* building an inode */ + +@@ -1903,49 +1937,61 @@ out: + + static int proc_fd_info(struct inode *inode, struct path *path, char *info) + { +- struct task_struct *task = get_proc_task(inode); +- struct files_struct *files = NULL; ++ struct task_struct *task; ++ struct files_struct *files; + struct file *file; + int fd = proc_fd(inode); ++ int rc; + +- if (task) { +- files = get_files_struct(task); +- put_task_struct(task); +- } +- if (files) { +- /* +- * We are not taking a ref to the file structure, so we must +- * hold ->file_lock. +- */ +- spin_lock(&files->file_lock); +- file = fcheck_files(files, fd); +- if (file) { +- unsigned int f_flags; +- struct fdtable *fdt; +- +- fdt = files_fdtable(files); +- f_flags = file->f_flags & ~O_CLOEXEC; +- if (FD_ISSET(fd, fdt->close_on_exec)) +- f_flags |= O_CLOEXEC; +- +- if (path) { +- *path = file->f_path; +- path_get(&file->f_path); +- } +- if (info) +- snprintf(info, PROC_FDINFO_MAX, +- "pos:\t%lli\n" +- "flags:\t0%o\n", +- (long long) file->f_pos, +- f_flags); +- spin_unlock(&files->file_lock); +- put_files_struct(files); +- return 0; ++ task = get_proc_task(inode); ++ if (!task) ++ return -ENOENT; ++ ++ rc = -EACCES; ++ if (lock_trace(task)) ++ goto out_task; ++ ++ rc = -ENOENT; ++ files = get_files_struct(task); ++ if (files == NULL) ++ goto out_unlock; ++ ++ /* ++ * We are not taking a ref to the file structure, so we must ++ * hold ->file_lock. 
++ */ ++ spin_lock(&files->file_lock); ++ file = fcheck_files(files, fd); ++ if (file) { ++ unsigned int f_flags; ++ struct fdtable *fdt; ++ ++ fdt = files_fdtable(files); ++ f_flags = file->f_flags & ~O_CLOEXEC; ++ if (FD_ISSET(fd, fdt->close_on_exec)) ++ f_flags |= O_CLOEXEC; ++ ++ if (path) { ++ *path = file->f_path; ++ path_get(&file->f_path); + } +- spin_unlock(&files->file_lock); +- put_files_struct(files); +- } +- return -ENOENT; ++ if (info) ++ snprintf(info, PROC_FDINFO_MAX, ++ "pos:\t%lli\n" ++ "flags:\t0%o\n", ++ (long long) file->f_pos, ++ f_flags); ++ rc = 0; ++ } else ++ rc = -ENOENT; ++ spin_unlock(&files->file_lock); ++ put_files_struct(files); ++ ++out_unlock: ++ unlock_trace(task); ++out_task: ++ put_task_struct(task); ++ return rc; + } + + static int proc_fd_link(struct inode *inode, struct path *path) +@@ -2040,7 +2086,7 @@ static struct dentry *proc_fd_instantiat + spin_unlock(&files->file_lock); + put_files_struct(files); + +- inode->i_op = &proc_pid_link_inode_operations; ++ inode->i_op = &proc_fd_link_inode_operations; + inode->i_size = 64; + ei->op.proc_get_link = proc_fd_link; + d_set_d_op(dentry, &tid_fd_dentry_operations); +@@ -2072,7 +2118,12 @@ static struct dentry *proc_lookupfd_comm + if (fd == ~0U) + goto out; + ++ result = ERR_PTR(-EACCES); ++ if (lock_trace(task)) ++ goto out; ++ + result = instantiate(dir, dentry, task, &fd); ++ unlock_trace(task); + out: + put_task_struct(task); + out_no_task: +@@ -2092,23 +2143,28 @@ static int proc_readfd_common(struct fil + retval = -ENOENT; + if (!p) + goto out_no_task; ++ ++ retval = -EACCES; ++ if (lock_trace(p)) ++ goto out; ++ + retval = 0; + + fd = filp->f_pos; + switch (fd) { + case 0: + if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) +- goto out; ++ goto out_unlock; + filp->f_pos++; + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) +- goto out; ++ goto out_unlock; + filp->f_pos++; + default: + files = get_files_struct(p); + if (!files) +- goto out; ++ goto out_unlock; + rcu_read_lock(); + for (fd = filp->f_pos-2; + fd < files_fdtable(files)->max_fds; +@@ -2132,6 +2188,9 @@ static int proc_readfd_common(struct fil + rcu_read_unlock(); + put_files_struct(files); + } ++ ++out_unlock: ++ unlock_trace(p); + out: + put_task_struct(p); + out_no_task: +@@ -2209,6 +2268,7 @@ static struct dentry *proc_fdinfo_instan + ei->fd = fd; + inode->i_mode = S_IFREG | S_IRUSR; + inode->i_fop = &proc_fdinfo_file_operations; ++ inode->i_op = &proc_fdinfo_link_inode_operations; + d_set_d_op(dentry, &tid_fd_dentry_operations); + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ diff --git a/queue-3.0/series b/queue-3.0/series index 647b8f7b8e0..1ced05fc8f1 100644 --- a/queue-3.0/series +++ b/queue-3.0/series @@ -145,3 +145,9 @@ tg3-negate-use_phylib-flag-check.patch ipv6-nullify-ipv6_ac_list-and-ipv6_fl_list-when-creating-new-socket.patch make-packet_statistics-getsockopt-report-consistently-between-ring-and-non-ring.patch net-xen-netback-correctly-restart-tx-after-a-vm-restore-migrate.patch +mm-thp-tail-page-refcounting-fix.patch +binfmt_elf-fix-pie-execution-with-randomization-disabled.patch +vfs-show-o_cloexe-bit-properly-in-proc-pid-fdinfo-fd-files.patch +proc-fix-races-against-execve-of-proc-pid-fd.patch +iwlagn-do-not-use-interruptible-waits.patch +drivers-net-rionet.c-fix-ethernet-address-macros-for-le-platforms.patch diff --git a/queue-3.0/vfs-show-o_cloexe-bit-properly-in-proc-pid-fdinfo-fd-files.patch 
b/queue-3.0/vfs-show-o_cloexe-bit-properly-in-proc-pid-fdinfo-fd-files.patch new file mode 100644 index 00000000000..8823d27db35 --- /dev/null +++ b/queue-3.0/vfs-show-o_cloexe-bit-properly-in-proc-pid-fdinfo-fd-files.patch @@ -0,0 +1,59 @@ +From 1117f72ea0217ba0cc19f05adbbd8b9a397f5ab7 Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Sat, 6 Aug 2011 11:51:33 -0700 +Subject: vfs: show O_CLOEXE bit properly in /proc//fdinfo/ files + +From: Linus Torvalds + +commit 1117f72ea0217ba0cc19f05adbbd8b9a397f5ab7 upstream. + +The CLOEXE bit is magical, and for performance (and semantic) reasons we +don't actually maintain it in the file descriptor itself, but in a +separate bit array. Which means that when we show f_flags, the CLOEXE +status is shown incorrectly: we show the status not as it is now, but as +it was when the file was opened. + +Fix that by looking up the bit properly in the 'fdt->close_on_exec' bit +array. + +Uli needs this in order to re-implement the pfiles program: + + "For normal file descriptors (not sockets) this was the last piece of + information which wasn't available. This is all part of my 'give + Solaris users no reason to not switch' effort. I intend to offer the + code to the util-linux-ng maintainers." + +Requested-by: Ulrich Drepper +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/proc/base.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -1920,6 +1920,14 @@ static int proc_fd_info(struct inode *in + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (file) { ++ unsigned int f_flags; ++ struct fdtable *fdt; ++ ++ fdt = files_fdtable(files); ++ f_flags = file->f_flags & ~O_CLOEXEC; ++ if (FD_ISSET(fd, fdt->close_on_exec)) ++ f_flags |= O_CLOEXEC; ++ + if (path) { + *path = file->f_path; + path_get(&file->f_path); +@@ -1929,7 +1937,7 @@ static int proc_fd_info(struct inode *in + "pos:\t%lli\n" + "flags:\t0%o\n", + (long long) file->f_pos, +- file->f_flags); ++ f_flags); + spin_unlock(&files->file_lock); + put_files_struct(files); + return 0;
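
To round out the O_CLOEXEC patch above, a hedged userspace sketch (no kernel code involved) of why the separate close_on_exec bitmap matters: the close-on-exec bit is a per-descriptor flag, so it is not reflected in the open-file status flags that back file->f_flags, and fdinfo has to merge it in explicitly.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/null", O_RDONLY);
	if (fd < 0) { perror("open"); return 1; }

	/* Turn on close-on-exec after the fact -- the case the old fdinfo
	 * code reported wrongly, since it only saw the open()-time flags. */
	fcntl(fd, F_SETFD, FD_CLOEXEC);

	int fdflags = fcntl(fd, F_GETFD);	/* per-descriptor flags (CLOEXEC)   */
	int stflags = fcntl(fd, F_GETFL);	/* open-file status flags (f_flags) */

	printf("FD_CLOEXEC set:        %d\n", !!(fdflags & FD_CLOEXEC));	/* 1 */
	printf("O_CLOEXEC via F_GETFL: %d\n", !!(stflags & O_CLOEXEC));	/* 0 */

	close(fd);
	return 0;
}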