]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
more patches for .25
authorGreg Kroah-Hartman <gregkh@suse.de>
Mon, 30 Jun 2008 16:14:14 +0000 (09:14 -0700)
committerGreg Kroah-Hartman <gregkh@suse.de>
Mon, 30 Jun 2008 16:14:14 +0000 (09:14 -0700)
queue-2.6.25/futexes-fix-fault-handling-in-futex_lock_pi.patch [new file with mode: 0644]
queue-2.6.25/ib-mthca-clear-icm-pages-before-handing-to-fw.patch [new file with mode: 0644]
queue-2.6.25/series

diff --git a/queue-2.6.25/futexes-fix-fault-handling-in-futex_lock_pi.patch b/queue-2.6.25/futexes-fix-fault-handling-in-futex_lock_pi.patch
new file mode 100644 (file)
index 0000000..1e13724
--- /dev/null
@@ -0,0 +1,206 @@
+From stable-bounces@linux.kernel.org Mon Jun 23 16:30:22 2008
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Mon, 23 Jun 2008 23:30:13 GMT
+Subject: futexes: fix fault handling in futex_lock_pi
+To: jejb@kernel.org, stable@kernel.org
+Message-ID: <200806232330.m5NNUDEY010317@hera.kernel.org>
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 1b7558e457ed0de61023cfc913d2c342c7c3d9f2 upstream
+
+This patch addresses a very sporadic pi-futex related failure in
+highly threaded java apps on large SMP systems.
+
+David Holmes reported that the pi_state consistency check in
+lookup_pi_state triggered with his test application. This means that
+the kernel internal pi_state and the user space futex variable are out
+of sync. First we assumed that this is a user space data corruption,
+but deeper investigation revealed that the problem happened because the
+pi-futex code is not handling a fault in the futex_lock_pi path when
+the user space variable needs to be fixed up.
+
+The fault happens when a fork mapped the anon memory which contains
+the futex readonly for COW or the page got swapped out exactly between
+the unlock of the futex and the return of either the new futex owner
+or the task which was the expected owner but failed to acquire the
+kernel internal rtmutex. The current futex_lock_pi() code drops out
+with an inconsistent state in case it faults and returns -EFAULT to user
+space. User space has no way to fixup that state.
+
+When we wrote this code we thought that we could not drop the hash
+bucket lock at this point to handle the fault.
+
+After analysing the code again it turned out to be wrong because there
+are only two tasks involved which might modify the pi_state and the
+user space variable:
+
+ - the task which acquired the rtmutex
+ - the pending owner of the pi_state which did not get the rtmutex
+
+Both tasks drop into the fixup_pi_state() function before returning to
+user space. The first task which acquired the hash bucket lock faults
+in the fixup of the user space variable, drops the spinlock and calls
+futex_handle_fault() to fault in the page. Now the second task could
+acquire the hash bucket lock and tries to fixup the user space
+variable as well. It either faults as well or it succeeds because the
+first task already faulted the page in.
+
+One caveat is to avoid a double fixup. After returning from the fault
+handling we reacquire the hash bucket lock and check whether the
+pi_state owner has been modified already.
+
+Reported-by: David Holmes <david.holmes@sun.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: David Holmes <david.holmes@sun.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ kernel/futex.c |   93 ++++++++++++++++++++++++++++++++++++++++++++-------------
+ 1 file changed, 73 insertions(+), 20 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1118,21 +1118,64 @@ static void unqueue_me_pi(struct futex_q
+  * private futexes.
+  */
+ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+-                              struct task_struct *newowner)
++                              struct task_struct *newowner,
++                              struct rw_semaphore *fshared)
+ {
+       u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
+       struct futex_pi_state *pi_state = q->pi_state;
++      struct task_struct *oldowner = pi_state->owner;
+       u32 uval, curval, newval;
+-      int ret;
++      int ret, attempt = 0;
+       /* Owner died? */
++      if (!pi_state->owner)
++              newtid |= FUTEX_OWNER_DIED;
++
++      /*
++       * We are here either because we stole the rtmutex from the
++       * pending owner or we are the pending owner which failed to
++       * get the rtmutex. We have to replace the pending owner TID
++       * in the user space variable. This must be atomic as we have
++       * to preserve the owner died bit here.
++       *
++       * Note: We write the user space value _before_ changing the
++       * pi_state because we can fault here. Imagine swapped out
++       * pages or a fork, which was running right before we acquired
++       * mmap_sem, that marked all the anonymous memory readonly for
++       * cow.
++       *
++       * Modifying pi_state _before_ the user space value would
++       * leave the pi_state in an inconsistent state when we fault
++       * here, because we need to drop the hash bucket lock to
++       * handle the fault. This might be observed in the PID check
++       * in lookup_pi_state.
++       */
++retry:
++      if (get_futex_value_locked(&uval, uaddr))
++              goto handle_fault;
++
++      while (1) {
++              newval = (uval & FUTEX_OWNER_DIED) | newtid;
++
++              curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
++
++              if (curval == -EFAULT)
++                      goto handle_fault;
++              if (curval == uval)
++                      break;
++              uval = curval;
++      }
++
++      /*
++       * We fixed up user space. Now we need to fix the pi_state
++       * itself.
++       */
+       if (pi_state->owner != NULL) {
+               spin_lock_irq(&pi_state->owner->pi_lock);
+               WARN_ON(list_empty(&pi_state->list));
+               list_del_init(&pi_state->list);
+               spin_unlock_irq(&pi_state->owner->pi_lock);
+-      } else
+-              newtid |= FUTEX_OWNER_DIED;
++      }
+       pi_state->owner = newowner;
+@@ -1140,26 +1183,35 @@ static int fixup_pi_state_owner(u32 __us
+       WARN_ON(!list_empty(&pi_state->list));
+       list_add(&pi_state->list, &newowner->pi_state_list);
+       spin_unlock_irq(&newowner->pi_lock);
++      return 0;
+       /*
+-       * We own it, so we have to replace the pending owner
+-       * TID. This must be atomic as we have preserve the
+-       * owner died bit here.
++       * To handle the page fault we need to drop the hash bucket
++       * lock here. That gives the other task (either the pending
++       * owner itself or the task which stole the rtmutex) the
++       * chance to try the fixup of the pi_state. So once we are
++       * back from handling the fault we need to check the pi_state
++       * after reacquiring the hash bucket lock and before trying to
++       * do another fixup. When the fixup has been done already we
++       * simply return.
+        */
+-      ret = get_futex_value_locked(&uval, uaddr);
++handle_fault:
++      spin_unlock(q->lock_ptr);
+-      while (!ret) {
+-              newval = (uval & FUTEX_OWNER_DIED) | newtid;
++      ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++);
+-              curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
++      spin_lock(q->lock_ptr);
+-              if (curval == -EFAULT)
+-                      ret = -EFAULT;
+-              if (curval == uval)
+-                      break;
+-              uval = curval;
+-      }
+-      return ret;
++      /*
++       * Check if someone else fixed it for us:
++       */
++      if (pi_state->owner != oldowner)
++              return 0;
++
++      if (ret)
++              return ret;
++
++      goto retry;
+ }
+ /*
+@@ -1524,7 +1576,7 @@ static int futex_lock_pi(u32 __user *uad
+                * that case:
+                */
+               if (q.pi_state->owner != curr)
+-                      ret = fixup_pi_state_owner(uaddr, &q, curr);
++                      ret = fixup_pi_state_owner(uaddr, &q, curr, fshared);
+       } else {
+               /*
+                * Catch the rare case, where the lock was released
+@@ -1556,7 +1608,8 @@ static int futex_lock_pi(u32 __user *uad
+                               int res;
+                               owner = rt_mutex_owner(&q.pi_state->pi_mutex);
+-                              res = fixup_pi_state_owner(uaddr, &q, owner);
++                              res = fixup_pi_state_owner(uaddr, &q, owner,
++                                                         fshared);
+                               /* propagate -EFAULT, if the fixup failed */
+                               if (res)
diff --git a/queue-2.6.25/ib-mthca-clear-icm-pages-before-handing-to-fw.patch b/queue-2.6.25/ib-mthca-clear-icm-pages-before-handing-to-fw.patch
new file mode 100644 (file)
index 0000000..13fe43d
--- /dev/null
@@ -0,0 +1,45 @@
+From stable-bounces@linux.kernel.org Mon Jun 23 16:30:22 2008
+From: Eli Cohen <eli@mellanox.co.il>
+Date: Mon, 23 Jun 2008 23:30:09 GMT
+Subject: IB/mthca: Clear ICM pages before handing to FW
+To: jejb@kernel.org, stable@kernel.org
+Message-ID: <200806232330.m5NNU9f6010280@hera.kernel.org>
+
+From: Eli Cohen <eli@mellanox.co.il>
+
+commit 87afd448b186c885d67a08b7417cd46253b6a9d6 upstream
+
+Current memfree FW has a bug which, in some cases, assumes that ICM
+pages passed to it are cleared.  This patch uses __GFP_ZERO to
+allocate all ICM pages passed to the FW.  Once firmware with a fix is
+released, we can make the workaround conditional on firmware version.
+
+This fixes the bug reported by Arthur Kepner <akepner@sgi.com> here:
+http://lists.openfabrics.org/pipermail/general/2008-May/050026.html
+
+[ Rewritten to be a one-liner using __GFP_ZERO instead of vmap()ing
+  ICM memory and memset()ing it to 0. - Roland ]
+
+Signed-off-by: Eli Cohen <eli@mellanox.co.il>
+Signed-off-by: Roland Dreier <rolandd@cisco.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/infiniband/hw/mthca/mthca_memfree.c |    6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/drivers/infiniband/hw/mthca/mthca_memfree.c
++++ b/drivers/infiniband/hw/mthca/mthca_memfree.c
+@@ -109,7 +109,11 @@ static int mthca_alloc_icm_pages(struct 
+ {
+       struct page *page;
+-      page = alloc_pages(gfp_mask, order);
++      /*
++       * Use __GFP_ZERO because buggy firmware assumes ICM pages are
++       * cleared, and subtle failures are seen if they aren't.
++       */
++      page = alloc_pages(gfp_mask | __GFP_ZERO, order);
+       if (!page)
+               return -ENOMEM;
index 1118e330deabffad79fc23724f9fb8b5d5c6abd5..764e4d9a0e57c8f887fd0490ef8410e248e437ce 100644 (file)
@@ -1,2 +1,4 @@
 tty-fix-for-tty-operations-bugs.patch
 xen-mask-unwanted-pte-bits-in-__supported_pte_mask.patch
+futexes-fix-fault-handling-in-futex_lock_pi.patch
+ib-mthca-clear-icm-pages-before-handing-to-fw.patch