3.0 patches
author    Greg Kroah-Hartman <gregkh@suse.de>
          Thu, 3 Nov 2011 19:05:08 +0000 (12:05 -0700)
committer Greg Kroah-Hartman <gregkh@suse.de>
          Thu, 3 Nov 2011 19:05:08 +0000 (12:05 -0700)
queue-3.0/binfmt_elf-fix-pie-execution-with-randomization-disabled.patch [new file with mode: 0644]
queue-3.0/drivers-net-rionet.c-fix-ethernet-address-macros-for-le-platforms.patch [new file with mode: 0644]
queue-3.0/iwlagn-do-not-use-interruptible-waits.patch [new file with mode: 0644]
queue-3.0/mm-thp-tail-page-refcounting-fix.patch [new file with mode: 0644]
queue-3.0/proc-fix-races-against-execve-of-proc-pid-fd.patch [new file with mode: 0644]
queue-3.0/series
queue-3.0/vfs-show-o_cloexe-bit-properly-in-proc-pid-fdinfo-fd-files.patch [new file with mode: 0644]

diff --git a/queue-3.0/binfmt_elf-fix-pie-execution-with-randomization-disabled.patch b/queue-3.0/binfmt_elf-fix-pie-execution-with-randomization-disabled.patch
new file mode 100644 (file)
index 0000000..0f6f370
--- /dev/null
@@ -0,0 +1,55 @@
+From a3defbe5c337dbc6da911f8cc49ae3cc3b49b453 Mon Sep 17 00:00:00 2001
+From: Jiri Kosina <jkosina@suse.cz>
+Date: Wed, 2 Nov 2011 13:37:41 -0700
+Subject: binfmt_elf: fix PIE execution with randomization disabled
+
+From: Jiri Kosina <jkosina@suse.cz>
+
+commit a3defbe5c337dbc6da911f8cc49ae3cc3b49b453 upstream.
+
+The case of address space randomization being disabled at runtime through
+the randomize_va_space sysctl is not handled properly in load_elf_binary(),
+resulting in a SIGKILL at exec() time for certain PIE-linked binaries when
+randomization has been disabled prior to calling exec().
+
+Handle the randomize_va_space == 0 case the same way as if we were not
+supporting .text randomization at all.
+
+Based on an original patch by H.J. Lu and Josh Boyer.
+
+Signed-off-by: Jiri Kosina <jkosina@suse.cz>
+Cc: Ingo Molnar <mingo@elte.hu>
+Cc: Russell King <rmk@arm.linux.org.uk>
+Cc: H.J. Lu <hongjiu.lu@intel.com>
+Cc: <stable@kernel.org>
+Tested-by: Josh Boyer <jwboyer@redhat.com>
+Acked-by: Nicolas Pitre <nicolas.pitre@linaro.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/binfmt_elf.c |   11 ++++++++++-
+ 1 file changed, 10 insertions(+), 1 deletion(-)
+
+--- a/fs/binfmt_elf.c
++++ b/fs/binfmt_elf.c
+@@ -796,7 +796,16 @@ static int load_elf_binary(struct linux_
+                        * might try to exec.  This is because the brk will
+                        * follow the loader, and is not movable.  */
+ #if defined(CONFIG_X86) || defined(CONFIG_ARM)
+-                      load_bias = 0;
++                      /* Memory randomization might have been switched off
++                       * in runtime via sysctl.
++                       * If that is the case, retain the original non-zero
++                       * load_bias value in order to establish proper
++                       * non-randomized mappings.
++                       */
++                      if (current->flags & PF_RANDOMIZE)
++                              load_bias = 0;
++                      else
++                              load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
+ #else
+                       load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
+ #endif
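
For readers who want to see the failure mode from user space, here is a hedged
reproduction sketch in C (./pie-hello is a hypothetical PIE-linked test binary;
the sysctl write needs root; on unpatched kernels the child can be killed by
SIGKILL during exec):

    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>
    #include <sys/wait.h>

    int main(void)
    {
        int status;

        /* disable VA randomization at runtime: kernel.randomize_va_space = 0 */
        if (system("echo 0 > /proc/sys/kernel/randomize_va_space") != 0)
            return 1;

        if (fork() == 0) {
            execl("./pie-hello", "pie-hello", (char *)NULL);
            _exit(127);              /* exec itself failed */
        }
        wait(&status);
        if (WIFSIGNALED(status))     /* signal 9 (SIGKILL) on unpatched kernels */
            printf("child killed by signal %d\n", WTERMSIG(status));
        return 0;
    }
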
diff --git a/queue-3.0/drivers-net-rionet.c-fix-ethernet-address-macros-for-le-platforms.patch b/queue-3.0/drivers-net-rionet.c-fix-ethernet-address-macros-for-le-platforms.patch
new file mode 100644 (file)
index 0000000..56b713d
--- /dev/null
@@ -0,0 +1,37 @@
+From e0c87bd95e8dad455c23bc56513af8dcb1737e55 Mon Sep 17 00:00:00 2001
+From: Alexandre Bounine <alexandre.bounine@idt.com>
+Date: Wed, 2 Nov 2011 13:39:15 -0700
+Subject: drivers/net/rionet.c: fix ethernet address macros for LE platforms
+
+From: Alexandre Bounine <alexandre.bounine@idt.com>
+
+commit e0c87bd95e8dad455c23bc56513af8dcb1737e55 upstream.
+
+Modify the Ethernet address macros to be compatible with both BE and LE platforms
+
+Signed-off-by: Alexandre Bounine <alexandre.bounine@idt.com>
+Cc: Chul Kim <chul.kim@idt.com>
+Cc: Kumar Gala <galak@kernel.crashing.org>
+Cc: Matt Porter <mporter@kernel.crashing.org>
+Cc: Li Yang <leoli@freescale.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/net/rionet.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/rionet.c
++++ b/drivers/net/rionet.c
+@@ -88,8 +88,8 @@ static struct rio_dev **rionet_active;
+ #define dev_rionet_capable(dev) \
+       is_rionet_capable(dev->src_ops, dev->dst_ops)
+-#define RIONET_MAC_MATCH(x)   (*(u32 *)x == 0x00010001)
+-#define RIONET_GET_DESTID(x)  (*(u16 *)(x + 4))
++#define RIONET_MAC_MATCH(x)   (!memcmp((x), "\00\01\00\01", 4))
++#define RIONET_GET_DESTID(x)  ((*((u8 *)x + 4) << 8) | *((u8 *)x + 5))
+ static int rionet_rx_clean(struct net_device *ndev)
+ {
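
To see why the old macros were unsafe on little-endian machines, here is a
minimal user-space sketch of both versions (hex escapes instead of the kernel's
octal ones, same bytes; everything else is illustration only):

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    /* Old macro: reinterprets the first four MAC bytes as one host-endian
     * 32-bit load.  The bytes 00 01 00 01 read back as 0x00010001 only on
     * big-endian CPUs; a little-endian CPU reads 0x01000100, so the match
     * always fails there. */
    #define OLD_MAC_MATCH(x)  (*(const uint32_t *)(x) == 0x00010001)

    /* New macro: byte-wise compare, same result on any endianness. */
    #define NEW_MAC_MATCH(x)  (!memcmp((x), "\x00\x01\x00\x01", 4))

    int main(void)
    {
        uint8_t mac[6] = { 0x00, 0x01, 0x00, 0x01, 0x12, 0x34 };

        printf("old: %d  new: %d\n", OLD_MAC_MATCH(mac), NEW_MAC_MATCH(mac));
        /* on x86 (little-endian) this prints "old: 0  new: 1" */
        return 0;
    }
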
diff --git a/queue-3.0/iwlagn-do-not-use-interruptible-waits.patch b/queue-3.0/iwlagn-do-not-use-interruptible-waits.patch
new file mode 100644 (file)
index 0000000..2ed3b4c
--- /dev/null
@@ -0,0 +1,130 @@
+From johannes@sipsolutions.net  Thu Nov  3 10:55:59 2011
+From: Johannes Berg <johannes@sipsolutions.net>
+Date: Thu, 03 Nov 2011 13:46:08 +0100
+Subject: iwlagn: do not use interruptible waits
+To: stable@vger.kernel.org
+Message-ID: <1320324368.3950.44.camel@jlt3.sipsolutions.net>
+
+
+From: Johannes Berg <johannes.berg@intel.com>
+
+Upstream commit effd4d9aece9184f526e6556786a94d335e38b71.
+
+Since the dawn of its time, iwlwifi has used
+interruptible waits to wait for synchronous
+commands and firmware loading.
+
+This leads to "interesting" bugs, because it
+can't actually handle the interruptions; for
+example when a command sending is interrupted
+it will assume the command completed fully,
+and then leave it pending, which leads to all
+kinds of trouble when the command finishes
+later.
+
+Since there's no easy way to gracefully deal
+with interruptions, fix the driver to not use
+interruptible waits.
+
+This at least fixes the error
+
+  iwlagn 0000:02:00.0: Error: Response NULL in  'REPLY_SCAN_ABORT_CMD'
+
+that I have seen in P2P testing, but it is likely
+that there are other errors caused by this.
+
+Cc: Stanislaw Gruszka <sgruszka@redhat.com>
+Signed-off-by: Johannes Berg <johannes.berg@intel.com>
+Signed-off-by: Wey-Yi Guy <wey-yi.w.guy@intel.com>
+Signed-off-by: John W. Linville <linville@tuxdriver.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ drivers/net/wireless/iwlwifi/iwl-agn-ucode.c |    9 ++-------
+ drivers/net/wireless/iwlwifi/iwl-agn.c       |    2 +-
+ drivers/net/wireless/iwlwifi/iwl-core.c      |    4 ++--
+ drivers/net/wireless/iwlwifi/iwl-hcmd.c      |    2 +-
+ drivers/net/wireless/iwlwifi/iwl-rx.c        |    2 +-
+ drivers/net/wireless/iwlwifi/iwl-tx.c        |    2 +-
+ 6 files changed, 8 insertions(+), 13 deletions(-)
+
+--- a/drivers/net/wireless/iwlwifi/iwl-agn-ucode.c
++++ b/drivers/net/wireless/iwlwifi/iwl-agn-ucode.c
+@@ -144,13 +144,8 @@ static int iwlagn_load_section(struct iw
+               FH_TCSR_TX_CONFIG_REG_VAL_CIRQ_HOST_ENDTFD);
+       IWL_DEBUG_INFO(priv, "%s uCode section being loaded...\n", name);
+-      ret = wait_event_interruptible_timeout(priv->wait_command_queue,
+-                                      priv->ucode_write_complete, 5 * HZ);
+-      if (ret == -ERESTARTSYS) {
+-              IWL_ERR(priv, "Could not load the %s uCode section due "
+-                      "to interrupt\n", name);
+-              return ret;
+-      }
++      ret = wait_event_timeout(priv->wait_command_queue,
++                               priv->ucode_write_complete, 5 * HZ);
+       if (!ret) {
+               IWL_ERR(priv, "Could not load the %s uCode section\n",
+                       name);
+--- a/drivers/net/wireless/iwlwifi/iwl-agn.c
++++ b/drivers/net/wireless/iwlwifi/iwl-agn.c
+@@ -797,7 +797,7 @@ static void iwl_irq_tasklet(struct iwl_p
+               handled |= CSR_INT_BIT_FH_TX;
+               /* Wake up uCode load routine, now that load is complete */
+               priv->ucode_write_complete = 1;
+-              wake_up_interruptible(&priv->wait_command_queue);
++              wake_up(&priv->wait_command_queue);
+       }
+       if (inta & ~handled) {
+--- a/drivers/net/wireless/iwlwifi/iwl-core.c
++++ b/drivers/net/wireless/iwlwifi/iwl-core.c
+@@ -899,7 +899,7 @@ void iwlagn_fw_error(struct iwl_priv *pr
+        * commands by clearing the ready bit */
+       clear_bit(STATUS_READY, &priv->status);
+-      wake_up_interruptible(&priv->wait_command_queue);
++      wake_up(&priv->wait_command_queue);
+       if (!ondemand) {
+               /*
+@@ -950,7 +950,7 @@ void iwl_irq_handle_error(struct iwl_pri
+                */
+               clear_bit(STATUS_READY, &priv->status);
+               clear_bit(STATUS_HCMD_ACTIVE, &priv->status);
+-              wake_up_interruptible(&priv->wait_command_queue);
++              wake_up(&priv->wait_command_queue);
+               IWL_ERR(priv, "RF is used by WiMAX\n");
+               return;
+       }
+--- a/drivers/net/wireless/iwlwifi/iwl-hcmd.c
++++ b/drivers/net/wireless/iwlwifi/iwl-hcmd.c
+@@ -194,7 +194,7 @@ int iwl_send_cmd_sync(struct iwl_priv *p
+               return ret;
+       }
+-      ret = wait_event_interruptible_timeout(priv->wait_command_queue,
++      ret = wait_event_timeout(priv->wait_command_queue,
+                       !test_bit(STATUS_HCMD_ACTIVE, &priv->status),
+                       HOST_COMPLETE_TIMEOUT);
+       if (!ret) {
+--- a/drivers/net/wireless/iwlwifi/iwl-rx.c
++++ b/drivers/net/wireless/iwlwifi/iwl-rx.c
+@@ -738,7 +738,7 @@ static void iwl_rx_card_state_notif(stru
+               wiphy_rfkill_set_hw_state(priv->hw->wiphy,
+                       test_bit(STATUS_RF_KILL_HW, &priv->status));
+       else
+-              wake_up_interruptible(&priv->wait_command_queue);
++              wake_up(&priv->wait_command_queue);
+ }
+ static void iwl_rx_missed_beacon_notif(struct iwl_priv *priv,
+--- a/drivers/net/wireless/iwlwifi/iwl-tx.c
++++ b/drivers/net/wireless/iwlwifi/iwl-tx.c
+@@ -821,7 +821,7 @@ void iwl_tx_cmd_complete(struct iwl_priv
+               clear_bit(STATUS_HCMD_ACTIVE, &priv->status);
+               IWL_DEBUG_INFO(priv, "Clearing HCMD_ACTIVE for command %s\n",
+                              get_cmd_string(cmd->hdr.cmd));
+-              wake_up_interruptible(&priv->wait_command_queue);
++              wake_up(&priv->wait_command_queue);
+       }
+       /* Mark as unmapped */
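
The difference that matters here is the return contract of the two wait
primitives. A simplified fragment, assuming kernel context and mirroring the
iwl_send_cmd_sync() hunk above:

    /*
     * wait_event_interruptible_timeout(wq, cond, timeout):
     *     > 0           cond became true
     *     == 0          timed out
     *     -ERESTARTSYS  a signal arrived -- but the host command
     *                   is still in flight
     *
     * wait_event_timeout(wq, cond, timeout):
     *     > 0           cond became true
     *     == 0          timed out  (signals cannot abort the wait)
     */
    ret = wait_event_timeout(priv->wait_command_queue,
                             !test_bit(STATUS_HCMD_ACTIVE, &priv->status),
                             HOST_COMPLETE_TIMEOUT);
    if (!ret)                  /* timeout is the only failure mode left */
        IWL_ERR(priv, "command timed out\n");
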
diff --git a/queue-3.0/mm-thp-tail-page-refcounting-fix.patch b/queue-3.0/mm-thp-tail-page-refcounting-fix.patch
new file mode 100644 (file)
index 0000000..983badc
--- /dev/null
@@ -0,0 +1,492 @@
+From 70b50f94f1644e2aa7cb374819cfd93f3c28d725 Mon Sep 17 00:00:00 2001
+From: Andrea Arcangeli <aarcange@redhat.com>
+Date: Wed, 2 Nov 2011 13:36:59 -0700
+Subject: mm: thp: tail page refcounting fix
+
+From: Andrea Arcangeli <aarcange@redhat.com>
+
+commit 70b50f94f1644e2aa7cb374819cfd93f3c28d725 upstream.
+
+Michel, while working on the working set estimation code, noticed that
+calling get_page_unless_zero() on a random pfn_to_page(random_pfn)
+wasn't safe, if the pfn ended up being a tail page of a transparent
+hugepage under splitting by __split_huge_page_refcount().
+
+He then found the problem could also theoretically materialize with
+page_cache_get_speculative() during the speculative radix tree lookups
+that use get_page_unless_zero() on SMP, if the radix tree page is freed
+and reallocated and get_user_pages() is called on it before
+page_cache_get_speculative() has a chance to call get_page_unless_zero().
+
+So the best way to fix the problem is to keep page_tail->_count zero at
+all times.  This will guarantee that get_page_unless_zero() can never
+succeed on any tail page.  page_tail->_mapcount is guaranteed zero and
+is unused for all tail pages of a compound page, so we can simply
+account the tail page references there and transfer them to
+tail_page->_count in __split_huge_page_refcount() (in addition to the
+head_page->_mapcount).
+
+While debugging this s/_count/_mapcount/ change I also noticed that get_page()
+is called by direct-io.c on pages returned by get_user_pages().  That wasn't
+entirely safe because the two atomic_inc() calls in get_page() weren't atomic
+with respect to each other.  Other get_user_pages() users, such as the
+secondary-MMU page fault path that establishes shadow pagetables, never call
+any superfluous get_page() after get_user_pages() returns.  It's safer to make
+get_page() universally safe for tail pages and to use get_page_foll() within
+follow_page() (inside get_user_pages()).  get_page_foll() is safe to do the
+refcounting for tail pages without taking any locks because it is run within
+PT-lock-protected critical sections (PT lock for pte and page_table_lock for
+pmd_trans_huge).
+
+The standard get_page(), as invoked by direct-io, will now take the
+compound_lock instead, but still only for tail pages.  The direct-io
+paths are usually I/O bound and the compound_lock is per THP, so very
+fine-grained; there's no risk of scalability issues with it.  A simple
+direct-io benchmark with the full lockdep prove-locking and spinlock
+debugging infrastructure enabled shows identical performance and no
+overhead, so it's worth it.  Ideally direct-io should stop calling
+get_page() on pages returned by get_user_pages().  The spinlock in
+get_page() is already optimized away for no-THP builds, but doing
+get_page() on tail pages returned by GUP is generally a rare operation
+and usually only run in I/O paths.
+
+This new refcounting on page_tail->_mapcount, in addition to avoiding new
+RCU critical sections, will also allow the working set estimation code to
+work without any further complexity associated with the tail page
+refcounting with THP.
+
+Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
+Reported-by: Michel Lespinasse <walken@google.com>
+Reviewed-by: Michel Lespinasse <walken@google.com>
+Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
+Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Johannes Weiner <jweiner@redhat.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Cc: David Gibson <david@gibson.dropbear.id.au>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ arch/powerpc/mm/gup.c    |    5 +-
+ arch/x86/mm/gup.c        |    5 +-
+ include/linux/mm.h       |   56 ++++++++++++-------------------
+ include/linux/mm_types.h |   22 ++++++++++--
+ mm/huge_memory.c         |   37 ++++++++++++++------
+ mm/internal.h            |   46 ++++++++++++++++++++++++++
+ mm/memory.c              |    2 -
+ mm/swap.c                |   83 ++++++++++++++++++++++++++++++-----------------
+ 8 files changed, 172 insertions(+), 84 deletions(-)
+
+--- a/arch/powerpc/mm/gup.c
++++ b/arch/powerpc/mm/gup.c
+@@ -22,8 +22,9 @@ static inline void get_huge_page_tail(st
+        * __split_huge_page_refcount() cannot run
+        * from under us.
+        */
+-      VM_BUG_ON(atomic_read(&page->_count) < 0);
+-      atomic_inc(&page->_count);
++      VM_BUG_ON(page_mapcount(page) < 0);
++      VM_BUG_ON(atomic_read(&page->_count) != 0);
++      atomic_inc(&page->_mapcount);
+ }
+ /*
+--- a/arch/x86/mm/gup.c
++++ b/arch/x86/mm/gup.c
+@@ -114,8 +114,9 @@ static inline void get_huge_page_tail(st
+        * __split_huge_page_refcount() cannot run
+        * from under us.
+        */
+-      VM_BUG_ON(atomic_read(&page->_count) < 0);
+-      atomic_inc(&page->_count);
++      VM_BUG_ON(page_mapcount(page) < 0);
++      VM_BUG_ON(atomic_read(&page->_count) != 0);
++      atomic_inc(&page->_mapcount);
+ }
+ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -355,36 +355,39 @@ static inline struct page *compound_head
+       return page;
+ }
++/*
++ * The atomic page->_mapcount, starts from -1: so that transitions
++ * both from it and to it can be tracked, using atomic_inc_and_test
++ * and atomic_add_negative(-1).
++ */
++static inline void reset_page_mapcount(struct page *page)
++{
++      atomic_set(&(page)->_mapcount, -1);
++}
++
++static inline int page_mapcount(struct page *page)
++{
++      return atomic_read(&(page)->_mapcount) + 1;
++}
++
+ static inline int page_count(struct page *page)
+ {
+       return atomic_read(&compound_head(page)->_count);
+ }
++extern bool __get_page_tail(struct page *page);
++
+ static inline void get_page(struct page *page)
+ {
++      if (unlikely(PageTail(page)))
++              if (likely(__get_page_tail(page)))
++                      return;
+       /*
+        * Getting a normal page or the head of a compound page
+-       * requires to already have an elevated page->_count. Only if
+-       * we're getting a tail page, the elevated page->_count is
+-       * required only in the head page, so for tail pages the
+-       * bugcheck only verifies that the page->_count isn't
+-       * negative.
++       * requires to already have an elevated page->_count.
+        */
+-      VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page));
++      VM_BUG_ON(atomic_read(&page->_count) <= 0);
+       atomic_inc(&page->_count);
+-      /*
+-       * Getting a tail page will elevate both the head and tail
+-       * page->_count(s).
+-       */
+-      if (unlikely(PageTail(page))) {
+-              /*
+-               * This is safe only because
+-               * __split_huge_page_refcount can't run under
+-               * get_page().
+-               */
+-              VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
+-              atomic_inc(&page->first_page->_count);
+-      }
+ }
+ static inline struct page *virt_to_head_page(const void *x)
+@@ -803,21 +806,6 @@ static inline pgoff_t page_index(struct
+ }
+ /*
+- * The atomic page->_mapcount, like _count, starts from -1:
+- * so that transitions both from it and to it can be tracked,
+- * using atomic_inc_and_test and atomic_add_negative(-1).
+- */
+-static inline void reset_page_mapcount(struct page *page)
+-{
+-      atomic_set(&(page)->_mapcount, -1);
+-}
+-
+-static inline int page_mapcount(struct page *page)
+-{
+-      return atomic_read(&(page)->_mapcount) + 1;
+-}
+-
+-/*
+  * Return true if this page is mapped into pagetables.
+  */
+ static inline int page_mapped(struct page *page)
+--- a/include/linux/mm_types.h
++++ b/include/linux/mm_types.h
+@@ -36,10 +36,24 @@ struct page {
+                                        * updated asynchronously */
+       atomic_t _count;                /* Usage count, see below. */
+       union {
+-              atomic_t _mapcount;     /* Count of ptes mapped in mms,
+-                                       * to show when page is mapped
+-                                       * & limit reverse map searches.
+-                                       */
++              /*
++               * Count of ptes mapped in
++               * mms, to show when page is
++               * mapped & limit reverse map
++               * searches.
++               *
++               * Used also for tail pages
++               * refcounting instead of
++               * _count. Tail pages cannot
++               * be mapped and keeping the
++               * tail page _count zero at
++               * all times guarantees
++               * get_page_unless_zero() will
++               * never succeed on tail
++               * pages.
++               */
++              atomic_t _mapcount;
++
+               struct {                /* SLUB */
+                       u16 inuse;
+                       u16 objects;
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -989,7 +989,7 @@ struct page *follow_trans_huge_pmd(struc
+       page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
+       VM_BUG_ON(!PageCompound(page));
+       if (flags & FOLL_GET)
+-              get_page(page);
++              get_page_foll(page);
+ out:
+       return page;
+@@ -1156,6 +1156,7 @@ static void __split_huge_page_refcount(s
+       unsigned long head_index = page->index;
+       struct zone *zone = page_zone(page);
+       int zonestat;
++      int tail_count = 0;
+       /* prevent PageLRU to go away from under us, and freeze lru stats */
+       spin_lock_irq(&zone->lru_lock);
+@@ -1164,11 +1165,27 @@ static void __split_huge_page_refcount(s
+       for (i = 1; i < HPAGE_PMD_NR; i++) {
+               struct page *page_tail = page + i;
+-              /* tail_page->_count cannot change */
+-              atomic_sub(atomic_read(&page_tail->_count), &page->_count);
+-              BUG_ON(page_count(page) <= 0);
+-              atomic_add(page_mapcount(page) + 1, &page_tail->_count);
+-              BUG_ON(atomic_read(&page_tail->_count) <= 0);
++              /* tail_page->_mapcount cannot change */
++              BUG_ON(page_mapcount(page_tail) < 0);
++              tail_count += page_mapcount(page_tail);
++              /* check for overflow */
++              BUG_ON(tail_count < 0);
++              BUG_ON(atomic_read(&page_tail->_count) != 0);
++              /*
++               * tail_page->_count is zero and not changing from
++               * under us. But get_page_unless_zero() may be running
++               * from under us on the tail_page. If we used
++               * atomic_set() below instead of atomic_add(), we
++               * would then run atomic_set() concurrently with
++               * get_page_unless_zero(), and atomic_set() is
++               * implemented in C not using locked ops. spin_unlock
++               * on x86 sometime uses locked ops because of PPro
++               * errata 66, 92, so unless somebody can guarantee
++               * atomic_set() here would be safe on all archs (and
++               * not only on x86), it's safer to use atomic_add().
++               */
++              atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
++                         &page_tail->_count);
+               /* after clearing PageTail the gup refcount can be released */
+               smp_mb();
+@@ -1186,10 +1203,7 @@ static void __split_huge_page_refcount(s
+                                     (1L << PG_uptodate)));
+               page_tail->flags |= (1L << PG_dirty);
+-              /*
+-               * 1) clear PageTail before overwriting first_page
+-               * 2) clear PageTail before clearing PageHead for VM_BUG_ON
+-               */
++              /* clear PageTail before overwriting first_page */
+               smp_wmb();
+               /*
+@@ -1206,7 +1220,6 @@ static void __split_huge_page_refcount(s
+                * status is achieved setting a reserved bit in the
+                * pmd, not by clearing the present bit.
+               */
+-              BUG_ON(page_mapcount(page_tail));
+               page_tail->_mapcount = page->_mapcount;
+               BUG_ON(page_tail->mapping);
+@@ -1223,6 +1236,8 @@ static void __split_huge_page_refcount(s
+               lru_add_page_tail(zone, page, page_tail);
+       }
++      atomic_sub(tail_count, &page->_count);
++      BUG_ON(atomic_read(&page->_count) <= 0);
+       __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+       __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
+--- a/mm/internal.h
++++ b/mm/internal.h
+@@ -37,6 +37,52 @@ static inline void __put_page(struct pag
+       atomic_dec(&page->_count);
+ }
++static inline void __get_page_tail_foll(struct page *page,
++                                      bool get_page_head)
++{
++      /*
++       * If we're getting a tail page, the elevated page->_count is
++       * required only in the head page and we will elevate the head
++       * page->_count and tail page->_mapcount.
++       *
++       * We elevate page_tail->_mapcount for tail pages to force
++       * page_tail->_count to be zero at all times to avoid getting
++       * false positives from get_page_unless_zero() with
++       * speculative page access (like in
++       * page_cache_get_speculative()) on tail pages.
++       */
++      VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
++      VM_BUG_ON(atomic_read(&page->_count) != 0);
++      VM_BUG_ON(page_mapcount(page) < 0);
++      if (get_page_head)
++              atomic_inc(&page->first_page->_count);
++      atomic_inc(&page->_mapcount);
++}
++
++/*
++ * This is meant to be called as the FOLL_GET operation of
++ * follow_page() and it must be called while holding the proper PT
++ * lock while the pte (or pmd_trans_huge) is still mapping the page.
++ */
++static inline void get_page_foll(struct page *page)
++{
++      if (unlikely(PageTail(page)))
++              /*
++               * This is safe only because
++               * __split_huge_page_refcount() can't run under
++               * get_page_foll() because we hold the proper PT lock.
++               */
++              __get_page_tail_foll(page, true);
++      else {
++              /*
++               * Getting a normal page or the head of a compound page
++               * requires to already have an elevated page->_count.
++               */
++              VM_BUG_ON(atomic_read(&page->_count) <= 0);
++              atomic_inc(&page->_count);
++      }
++}
++
+ extern unsigned long highest_memmap_pfn;
+ /*
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -1514,7 +1514,7 @@ split_fallthrough:
+       }
+       if (flags & FOLL_GET)
+-              get_page(page);
++              get_page_foll(page);
+       if (flags & FOLL_TOUCH) {
+               if ((flags & FOLL_WRITE) &&
+                   !pte_dirty(pte) && !PageDirty(page))
+--- a/mm/swap.c
++++ b/mm/swap.c
+@@ -78,39 +78,22 @@ static void put_compound_page(struct pag
+ {
+       if (unlikely(PageTail(page))) {
+               /* __split_huge_page_refcount can run under us */
+-              struct page *page_head = page->first_page;
+-              smp_rmb();
+-              /*
+-               * If PageTail is still set after smp_rmb() we can be sure
+-               * that the page->first_page we read wasn't a dangling pointer.
+-               * See __split_huge_page_refcount() smp_wmb().
+-               */
+-              if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
++              struct page *page_head = compound_trans_head(page);
++
++              if (likely(page != page_head &&
++                         get_page_unless_zero(page_head))) {
+                       unsigned long flags;
+                       /*
+-                       * Verify that our page_head wasn't converted
+-                       * to a a regular page before we got a
+-                       * reference on it.
++                       * page_head wasn't a dangling pointer but it
++                       * may not be a head page anymore by the time
++                       * we obtain the lock. That is ok as long as it
++                       * can't be freed from under us.
+                        */
+-                      if (unlikely(!PageHead(page_head))) {
+-                              /* PageHead is cleared after PageTail */
+-                              smp_rmb();
+-                              VM_BUG_ON(PageTail(page));
+-                              goto out_put_head;
+-                      }
+-                      /*
+-                       * Only run compound_lock on a valid PageHead,
+-                       * after having it pinned with
+-                       * get_page_unless_zero() above.
+-                       */
+-                      smp_mb();
+-                      /* page_head wasn't a dangling pointer */
+                       flags = compound_lock_irqsave(page_head);
+                       if (unlikely(!PageTail(page))) {
+                               /* __split_huge_page_refcount run before us */
+                               compound_unlock_irqrestore(page_head, flags);
+                               VM_BUG_ON(PageHead(page_head));
+-                      out_put_head:
+                               if (put_page_testzero(page_head))
+                                       __put_single_page(page_head);
+                       out_put_single:
+@@ -121,16 +104,17 @@ static void put_compound_page(struct pag
+                       VM_BUG_ON(page_head != page->first_page);
+                       /*
+                        * We can release the refcount taken by
+-                       * get_page_unless_zero now that
+-                       * split_huge_page_refcount is blocked on the
+-                       * compound_lock.
++                       * get_page_unless_zero() now that
++                       * __split_huge_page_refcount() is blocked on
++                       * the compound_lock.
+                        */
+                       if (put_page_testzero(page_head))
+                               VM_BUG_ON(1);
+                       /* __split_huge_page_refcount will wait now */
+-                      VM_BUG_ON(atomic_read(&page->_count) <= 0);
+-                      atomic_dec(&page->_count);
++                      VM_BUG_ON(page_mapcount(page) <= 0);
++                      atomic_dec(&page->_mapcount);
+                       VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
++                      VM_BUG_ON(atomic_read(&page->_count) != 0);
+                       compound_unlock_irqrestore(page_head, flags);
+                       if (put_page_testzero(page_head)) {
+                               if (PageHead(page_head))
+@@ -160,6 +144,45 @@ void put_page(struct page *page)
+ }
+ EXPORT_SYMBOL(put_page);
++/*
++ * This function is exported but must not be called by anything other
++ * than get_page(). It implements the slow path of get_page().
++ */
++bool __get_page_tail(struct page *page)
++{
++      /*
++       * This takes care of get_page() if run on a tail page
++       * returned by one of the get_user_pages/follow_page variants.
++       * get_user_pages/follow_page itself doesn't need the compound
++       * lock because it runs __get_page_tail_foll() under the
++       * proper PT lock that already serializes against
++       * split_huge_page().
++       */
++      unsigned long flags;
++      bool got = false;
++      struct page *page_head = compound_trans_head(page);
++
++      if (likely(page != page_head && get_page_unless_zero(page_head))) {
++              /*
++               * page_head wasn't a dangling pointer but it
++               * may not be a head page anymore by the time
++               * we obtain the lock. That is ok as long as it
++               * can't be freed from under us.
++               */
++              flags = compound_lock_irqsave(page_head);
++              /* here __split_huge_page_refcount won't run anymore */
++              if (likely(PageTail(page))) {
++                      __get_page_tail_foll(page, false);
++                      got = true;
++              }
++              compound_unlock_irqrestore(page_head, flags);
++              if (unlikely(!got))
++                      put_page(page_head);
++      }
++      return got;
++}
++EXPORT_SYMBOL(__get_page_tail);
++
+ /**
+  * put_pages_list() - release a list of pages
+  * @pages: list of pages threaded on page->lru
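
The invariant the patch establishes (tail page _count pinned at zero, tail
references accounted in _mapcount, which starts at -1) can be summarized with
a small user-space toy model; plain ints stand in for the kernel's atomic_t,
and this is an illustration, not kernel code:

    #include <assert.h>
    #include <stdio.h>

    struct toy_page {
        int _count;      /* real refcount; tail pages keep this at 0 */
        int _mapcount;   /* starts at -1; tail refs are accounted here */
    };

    /* model of get_page_unless_zero(): the speculative path that must
     * never succeed on a tail page */
    static int toy_get_page_unless_zero(struct toy_page *p)
    {
        if (p->_count == 0)
            return 0;                /* tail pages always take this branch */
        p->_count++;
        return 1;
    }

    /* model of __get_page_tail_foll(): pin the head's _count, account
     * the tail reference in _mapcount, leave tail->_count at 0 */
    static void toy_get_page_tail(struct toy_page *head, struct toy_page *tail)
    {
        head->_count++;
        tail->_mapcount++;
        assert(tail->_count == 0);   /* the invariant */
    }

    int main(void)
    {
        struct toy_page head = { ._count = 1, ._mapcount = -1 };
        struct toy_page tail = { ._count = 0, ._mapcount = -1 };

        toy_get_page_tail(&head, &tail);
        printf("tail refs: %d, speculative get: %d\n",
               tail._mapcount + 1, toy_get_page_unless_zero(&tail));
        /* prints "tail refs: 1, speculative get: 0" */
        return 0;
    }
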
diff --git a/queue-3.0/proc-fix-races-against-execve-of-proc-pid-fd.patch b/queue-3.0/proc-fix-races-against-execve-of-proc-pid-fd.patch
new file mode 100644 (file)
index 0000000..20688cf
--- /dev/null
@@ -0,0 +1,261 @@
+From aa6afca5bcaba8101f3ea09d5c3e4100b2b9f0e5 Mon Sep 17 00:00:00 2001
+From: Vasiliy Kulikov <segoon@openwall.com>
+Date: Wed, 2 Nov 2011 13:38:44 -0700
+Subject: proc: fix races against execve() of /proc/PID/fd**
+
+From: Vasiliy Kulikov <segoon@openwall.com>
+
+commit aa6afca5bcaba8101f3ea09d5c3e4100b2b9f0e5 upstream.
+
+fd* files are restricted to the task's owner, and other users may not get
+direct access to them.  But one may open any of these files and then run a
+setuid program, keeping the file descriptors open.  As there are permission
+checks on open(), but not on readdir() and read(), operations on the kept
+file descriptors will not be checked.  This makes it possible to violate
+the procfs permission model.
+
+Reading fdinfo/* may disclose the current fds' positions and flags, and
+reading the directory contents of fdinfo/ and fd/ may disclose the number of
+files opened by the target task.  This information is not sensitive per se,
+but it can reveal some private information (like the length of a password
+stored in a file) under certain conditions.
+
+Used the existing (un)lock_trace functions to check for ptrace_may_access(),
+but instead of the EPERM return code from it used EACCES to be consistent
+with the existing proc_pid_follow_link()/proc_pid_readlink() return code.  If
+they differed, an attacker could guess which fds exist by analyzing the stat()
+return code.  Patched handlers: stat() for fd/*, stat() and read() for
+fdinfo/*, readdir() and lookup() for fd/ and fdinfo/.
+
+Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
+Cc: Cyrill Gorcunov <gorcunov@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/proc/base.c |  146 ++++++++++++++++++++++++++++++++++++++++-----------------
+ 1 file changed, 103 insertions(+), 43 deletions(-)
+
+--- a/fs/proc/base.c
++++ b/fs/proc/base.c
+@@ -1666,12 +1666,46 @@ out:
+       return error;
+ }
++static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry,
++              struct kstat *stat)
++{
++      struct inode *inode = dentry->d_inode;
++      struct task_struct *task = get_proc_task(inode);
++      int rc;
++
++      if (task == NULL)
++              return -ESRCH;
++
++      rc = -EACCES;
++      if (lock_trace(task))
++              goto out_task;
++
++      generic_fillattr(inode, stat);
++      unlock_trace(task);
++      rc = 0;
++out_task:
++      put_task_struct(task);
++      return rc;
++}
++
+ static const struct inode_operations proc_pid_link_inode_operations = {
+       .readlink       = proc_pid_readlink,
+       .follow_link    = proc_pid_follow_link,
+       .setattr        = proc_setattr,
+ };
++static const struct inode_operations proc_fdinfo_link_inode_operations = {
++      .setattr        = proc_setattr,
++      .getattr        = proc_pid_fd_link_getattr,
++};
++
++static const struct inode_operations proc_fd_link_inode_operations = {
++      .readlink       = proc_pid_readlink,
++      .follow_link    = proc_pid_follow_link,
++      .setattr        = proc_setattr,
++      .getattr        = proc_pid_fd_link_getattr,
++};
++
+ /* building an inode */
+@@ -1903,49 +1937,61 @@ out:
+ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
+ {
+-      struct task_struct *task = get_proc_task(inode);
+-      struct files_struct *files = NULL;
++      struct task_struct *task;
++      struct files_struct *files;
+       struct file *file;
+       int fd = proc_fd(inode);
++      int rc;
+-      if (task) {
+-              files = get_files_struct(task);
+-              put_task_struct(task);
+-      }
+-      if (files) {
+-              /*
+-               * We are not taking a ref to the file structure, so we must
+-               * hold ->file_lock.
+-               */
+-              spin_lock(&files->file_lock);
+-              file = fcheck_files(files, fd);
+-              if (file) {
+-                      unsigned int f_flags;
+-                      struct fdtable *fdt;
+-
+-                      fdt = files_fdtable(files);
+-                      f_flags = file->f_flags & ~O_CLOEXEC;
+-                      if (FD_ISSET(fd, fdt->close_on_exec))
+-                              f_flags |= O_CLOEXEC;
+-
+-                      if (path) {
+-                              *path = file->f_path;
+-                              path_get(&file->f_path);
+-                      }
+-                      if (info)
+-                              snprintf(info, PROC_FDINFO_MAX,
+-                                       "pos:\t%lli\n"
+-                                       "flags:\t0%o\n",
+-                                       (long long) file->f_pos,
+-                                       f_flags);
+-                      spin_unlock(&files->file_lock);
+-                      put_files_struct(files);
+-                      return 0;
++      task = get_proc_task(inode);
++      if (!task)
++              return -ENOENT;
++
++      rc = -EACCES;
++      if (lock_trace(task))
++              goto out_task;
++
++      rc = -ENOENT;
++      files = get_files_struct(task);
++      if (files == NULL)
++              goto out_unlock;
++
++      /*
++       * We are not taking a ref to the file structure, so we must
++       * hold ->file_lock.
++       */
++      spin_lock(&files->file_lock);
++      file = fcheck_files(files, fd);
++      if (file) {
++              unsigned int f_flags;
++              struct fdtable *fdt;
++
++              fdt = files_fdtable(files);
++              f_flags = file->f_flags & ~O_CLOEXEC;
++              if (FD_ISSET(fd, fdt->close_on_exec))
++                      f_flags |= O_CLOEXEC;
++
++              if (path) {
++                      *path = file->f_path;
++                      path_get(&file->f_path);
+               }
+-              spin_unlock(&files->file_lock);
+-              put_files_struct(files);
+-      }
+-      return -ENOENT;
++              if (info)
++                      snprintf(info, PROC_FDINFO_MAX,
++                               "pos:\t%lli\n"
++                               "flags:\t0%o\n",
++                               (long long) file->f_pos,
++                               f_flags);
++              rc = 0;
++      } else
++              rc = -ENOENT;
++      spin_unlock(&files->file_lock);
++      put_files_struct(files);
++
++out_unlock:
++      unlock_trace(task);
++out_task:
++      put_task_struct(task);
++      return rc;
+ }
+ static int proc_fd_link(struct inode *inode, struct path *path)
+@@ -2040,7 +2086,7 @@ static struct dentry *proc_fd_instantiat
+       spin_unlock(&files->file_lock);
+       put_files_struct(files);
+-      inode->i_op = &proc_pid_link_inode_operations;
++      inode->i_op = &proc_fd_link_inode_operations;
+       inode->i_size = 64;
+       ei->op.proc_get_link = proc_fd_link;
+       d_set_d_op(dentry, &tid_fd_dentry_operations);
+@@ -2072,7 +2118,12 @@ static struct dentry *proc_lookupfd_comm
+       if (fd == ~0U)
+               goto out;
++      result = ERR_PTR(-EACCES);
++      if (lock_trace(task))
++              goto out;
++
+       result = instantiate(dir, dentry, task, &fd);
++      unlock_trace(task);
+ out:
+       put_task_struct(task);
+ out_no_task:
+@@ -2092,23 +2143,28 @@ static int proc_readfd_common(struct fil
+       retval = -ENOENT;
+       if (!p)
+               goto out_no_task;
++
++      retval = -EACCES;
++      if (lock_trace(p))
++              goto out;
++
+       retval = 0;
+       fd = filp->f_pos;
+       switch (fd) {
+               case 0:
+                       if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
+-                              goto out;
++                              goto out_unlock;
+                       filp->f_pos++;
+               case 1:
+                       ino = parent_ino(dentry);
+                       if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+-                              goto out;
++                              goto out_unlock;
+                       filp->f_pos++;
+               default:
+                       files = get_files_struct(p);
+                       if (!files)
+-                              goto out;
++                              goto out_unlock;
+                       rcu_read_lock();
+                       for (fd = filp->f_pos-2;
+                            fd < files_fdtable(files)->max_fds;
+@@ -2132,6 +2188,9 @@ static int proc_readfd_common(struct fil
+                       rcu_read_unlock();
+                       put_files_struct(files);
+       }
++
++out_unlock:
++      unlock_trace(p);
+ out:
+       put_task_struct(p);
+ out_no_task:
+@@ -2209,6 +2268,7 @@ static struct dentry *proc_fdinfo_instan
+       ei->fd = fd;
+       inode->i_mode = S_IFREG | S_IRUSR;
+       inode->i_fop = &proc_fdinfo_file_operations;
++      inode->i_op = &proc_fdinfo_link_inode_operations;
+       d_set_d_op(dentry, &tid_fd_dentry_operations);
+       d_add(dentry, inode);
+       /* Close the race of the process dying before we return the dentry */
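
A hedged sketch of the race being closed, from user space: the permission
check used to happen only at open() time, so a descriptor opened while the
task was still unprivileged kept working after the same pid exec'd a setuid
program (/usr/bin/passwd below is just a stand-in for any setuid binary):

    #include <stdio.h>
    #include <unistd.h>
    #include <fcntl.h>

    int main(void)
    {
        char buf[256];
        ssize_t n;
        /* "self" resolves to this pid at open() time, which is where
         * the only permission check used to happen */
        int fd = open("/proc/self/fdinfo/0", O_RDONLY);

        if (fd < 0)
            return 1;
        if (fork() == 0) {
            /* child keeps the inherited descriptor; on unpatched
             * kernels read() was never re-checked against the task,
             * which by now is running a setuid image */
            sleep(1);
            while ((n = read(fd, buf, sizeof(buf))) > 0)
                fwrite(buf, 1, (size_t)n, stdout);
            _exit(0);
        }
        /* parent execs a setuid program under the same pid */
        execl("/usr/bin/passwd", "passwd", (char *)NULL);
        return 1;
    }

With the patch applied, the child's read() fails with EACCES instead of
leaking the setuid task's fd positions.
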
diff --git a/queue-3.0/series b/queue-3.0/series
index 647b8f7b8e008a40f159cf300766850d3bd5ecc7..1ced05fc8f1cd8ffcb6e7eb46f0d8dfd718a3b72 100644 (file)
@@ -145,3 +145,9 @@ tg3-negate-use_phylib-flag-check.patch
 ipv6-nullify-ipv6_ac_list-and-ipv6_fl_list-when-creating-new-socket.patch
 make-packet_statistics-getsockopt-report-consistently-between-ring-and-non-ring.patch
 net-xen-netback-correctly-restart-tx-after-a-vm-restore-migrate.patch
+mm-thp-tail-page-refcounting-fix.patch
+binfmt_elf-fix-pie-execution-with-randomization-disabled.patch
+vfs-show-o_cloexe-bit-properly-in-proc-pid-fdinfo-fd-files.patch
+proc-fix-races-against-execve-of-proc-pid-fd.patch
+iwlagn-do-not-use-interruptible-waits.patch
+drivers-net-rionet.c-fix-ethernet-address-macros-for-le-platforms.patch
diff --git a/queue-3.0/vfs-show-o_cloexe-bit-properly-in-proc-pid-fdinfo-fd-files.patch b/queue-3.0/vfs-show-o_cloexe-bit-properly-in-proc-pid-fdinfo-fd-files.patch
new file mode 100644 (file)
index 0000000..8823d27
--- /dev/null
@@ -0,0 +1,59 @@
+From 1117f72ea0217ba0cc19f05adbbd8b9a397f5ab7 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sat, 6 Aug 2011 11:51:33 -0700
+Subject: vfs: show O_CLOEXE bit properly in /proc/<pid>/fdinfo/<fd> files
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit 1117f72ea0217ba0cc19f05adbbd8b9a397f5ab7 upstream.
+
+The O_CLOEXEC bit is magical, and for performance (and semantic) reasons we
+don't actually maintain it in the file descriptor itself, but in a
+separate bit array.  This means that when we show f_flags, the O_CLOEXEC
+status is shown incorrectly: we show the status not as it is now, but as
+it was when the file was opened.
+
+Fix that by looking up the bit properly in the 'fdt->close_on_exec' bit
+array.
+
+Uli needs this in order to re-implement the pfiles program:
+
+  "For normal file descriptors (not sockets) this was the last piece of
+   information which wasn't available.  This is all part of my 'give
+   Solaris users no reason to not switch' effort.  I intend to offer the
+   code to the util-linux-ng maintainers."
+
+Requested-by: Ulrich Drepper <drepper@akkadia.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/proc/base.c |   10 +++++++++-
+ 1 file changed, 9 insertions(+), 1 deletion(-)
+
+--- a/fs/proc/base.c
++++ b/fs/proc/base.c
+@@ -1920,6 +1920,14 @@ static int proc_fd_info(struct inode *in
+               spin_lock(&files->file_lock);
+               file = fcheck_files(files, fd);
+               if (file) {
++                      unsigned int f_flags;
++                      struct fdtable *fdt;
++
++                      fdt = files_fdtable(files);
++                      f_flags = file->f_flags & ~O_CLOEXEC;
++                      if (FD_ISSET(fd, fdt->close_on_exec))
++                              f_flags |= O_CLOEXEC;
++
+                       if (path) {
+                               *path = file->f_path;
+                               path_get(&file->f_path);
+@@ -1929,7 +1937,7 @@ static int proc_fd_info(struct inode *in
+                                        "pos:\t%lli\n"
+                                        "flags:\t0%o\n",
+                                        (long long) file->f_pos,
+-                                       file->f_flags);
++                                       f_flags);
+                       spin_unlock(&files->file_lock);
+                       put_files_struct(files);
+                       return 0;
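
A quick user-space check of the behavior being fixed: flip FD_CLOEXEC after
open() and read the fdinfo file back. On patched kernels the flags line
reflects the current close-on-exec state rather than the open-time flags
(02000000 is the usual Linux octal value of O_CLOEXEC):

    #include <stdio.h>
    #include <unistd.h>
    #include <fcntl.h>

    int main(void)
    {
        char path[64], buf[256];
        FILE *f;
        int fd = open("/dev/null", O_RDONLY);   /* opened WITHOUT O_CLOEXEC */

        if (fd < 0)
            return 1;
        fcntl(fd, F_SETFD, FD_CLOEXEC);         /* set close-on-exec afterwards */

        snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd);
        f = fopen(path, "r");
        while (f && fgets(buf, sizeof(buf), f))
            fputs(buf, stdout);
        /* patched kernels: the "flags:" line includes the 02000000
         * (O_CLOEXEC) bit; unpatched ones show only the stale
         * open(2)-time flags */
        if (f)
            fclose(f);
        return 0;
    }
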