From eb26edfe9b5d192a8ae36df9ff755862c27472e5 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 30 Jun 2008 09:14:14 -0700 Subject: [PATCH] more patches for .25 --- ...-fix-fault-handling-in-futex_lock_pi.patch | 206 ++++++++++++++++++ ...clear-icm-pages-before-handing-to-fw.patch | 45 ++++ queue-2.6.25/series | 2 + 3 files changed, 253 insertions(+) create mode 100644 queue-2.6.25/futexes-fix-fault-handling-in-futex_lock_pi.patch create mode 100644 queue-2.6.25/ib-mthca-clear-icm-pages-before-handing-to-fw.patch diff --git a/queue-2.6.25/futexes-fix-fault-handling-in-futex_lock_pi.patch b/queue-2.6.25/futexes-fix-fault-handling-in-futex_lock_pi.patch new file mode 100644 index 00000000000..1e1372431ca --- /dev/null +++ b/queue-2.6.25/futexes-fix-fault-handling-in-futex_lock_pi.patch @@ -0,0 +1,206 @@ +From stable-bounces@linux.kernel.org Mon Jun 23 16:30:22 2008 +From: Thomas Gleixner +Date: Mon, 23 Jun 2008 23:30:13 GMT +Subject: futexes: fix fault handling in futex_lock_pi +To: jejb@kernel.org, stable@kernel.org +Message-ID: <200806232330.m5NNUDEY010317@hera.kernel.org> + +From: Thomas Gleixner + +commit 1b7558e457ed0de61023cfc913d2c342c7c3d9f2 upstream + +This patch addresses a very sporadic pi-futex related failure in +highly threaded java apps on large SMP systems. + +David Holmes reported that the pi_state consistency check in +lookup_pi_state triggered with his test application. This means that +the kernel internal pi_state and the user space futex variable are out +of sync. First we assumed that this is a user space data corruption, +but deeper investigation revieled that the problem happend because the +pi-futex code is not handling a fault in the futex_lock_pi path when +the user space variable needs to be fixed up. + +The fault happens when a fork mapped the anon memory which contains +the futex readonly for COW or the page got swapped out exactly between +the unlock of the futex and the return of either the new futex owner +or the task which was the expected owner but failed to acquire the +kernel internal rtmutex. The current futex_lock_pi() code drops out +with an inconsistent in case it faults and returns -EFAULT to user +space. User space has no way to fixup that state. + +When we wrote this code we thought that we could not drop the hash +bucket lock at this point to handle the fault. + +After analysing the code again it turned out to be wrong because there +are only two tasks involved which might modify the pi_state and the +user space variable: + + - the task which acquired the rtmutex + - the pending owner of the pi_state which did not get the rtmutex + +Both tasks drop into the fixup_pi_state() function before returning to +user space. The first task which acquired the hash bucket lock faults +in the fixup of the user space variable, drops the spinlock and calls +futex_handle_fault() to fault in the page. Now the second task could +acquire the hash bucket lock and tries to fixup the user space +variable as well. It either faults as well or it succeeds because the +first task already faulted the page in. + +One caveat is to avoid a double fixup. After returning from the fault +handling we reacquire the hash bucket lock and check whether the +pi_state owner has been modified already. + +Reported-by: David Holmes +Signed-off-by: Thomas Gleixner +Cc: Andrew Morton +Cc: David Holmes +Cc: Peter Zijlstra +Cc: Linus Torvalds +Cc: Peter Zijlstra +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/futex.c | 93 ++++++++++++++++++++++++++++++++++++++++++++------------- + 1 file changed, 73 insertions(+), 20 deletions(-) + +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -1118,21 +1118,64 @@ static void unqueue_me_pi(struct futex_q + * private futexes. + */ + static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, +- struct task_struct *newowner) ++ struct task_struct *newowner, ++ struct rw_semaphore *fshared) + { + u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; + struct futex_pi_state *pi_state = q->pi_state; ++ struct task_struct *oldowner = pi_state->owner; + u32 uval, curval, newval; +- int ret; ++ int ret, attempt = 0; + + /* Owner died? */ ++ if (!pi_state->owner) ++ newtid |= FUTEX_OWNER_DIED; ++ ++ /* ++ * We are here either because we stole the rtmutex from the ++ * pending owner or we are the pending owner which failed to ++ * get the rtmutex. We have to replace the pending owner TID ++ * in the user space variable. This must be atomic as we have ++ * to preserve the owner died bit here. ++ * ++ * Note: We write the user space value _before_ changing the ++ * pi_state because we can fault here. Imagine swapped out ++ * pages or a fork, which was running right before we acquired ++ * mmap_sem, that marked all the anonymous memory readonly for ++ * cow. ++ * ++ * Modifying pi_state _before_ the user space value would ++ * leave the pi_state in an inconsistent state when we fault ++ * here, because we need to drop the hash bucket lock to ++ * handle the fault. This might be observed in the PID check ++ * in lookup_pi_state. ++ */ ++retry: ++ if (get_futex_value_locked(&uval, uaddr)) ++ goto handle_fault; ++ ++ while (1) { ++ newval = (uval & FUTEX_OWNER_DIED) | newtid; ++ ++ curval = cmpxchg_futex_value_locked(uaddr, uval, newval); ++ ++ if (curval == -EFAULT) ++ goto handle_fault; ++ if (curval == uval) ++ break; ++ uval = curval; ++ } ++ ++ /* ++ * We fixed up user space. Now we need to fix the pi_state ++ * itself. ++ */ + if (pi_state->owner != NULL) { + spin_lock_irq(&pi_state->owner->pi_lock); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + spin_unlock_irq(&pi_state->owner->pi_lock); +- } else +- newtid |= FUTEX_OWNER_DIED; ++ } + + pi_state->owner = newowner; + +@@ -1140,26 +1183,35 @@ static int fixup_pi_state_owner(u32 __us + WARN_ON(!list_empty(&pi_state->list)); + list_add(&pi_state->list, &newowner->pi_state_list); + spin_unlock_irq(&newowner->pi_lock); ++ return 0; + + /* +- * We own it, so we have to replace the pending owner +- * TID. This must be atomic as we have preserve the +- * owner died bit here. ++ * To handle the page fault we need to drop the hash bucket ++ * lock here. That gives the other task (either the pending ++ * owner itself or the task which stole the rtmutex) the ++ * chance to try the fixup of the pi_state. So once we are ++ * back from handling the fault we need to check the pi_state ++ * after reacquiring the hash bucket lock and before trying to ++ * do another fixup. When the fixup has been done already we ++ * simply return. + */ +- ret = get_futex_value_locked(&uval, uaddr); ++handle_fault: ++ spin_unlock(q->lock_ptr); + +- while (!ret) { +- newval = (uval & FUTEX_OWNER_DIED) | newtid; ++ ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); + +- curval = cmpxchg_futex_value_locked(uaddr, uval, newval); ++ spin_lock(q->lock_ptr); + +- if (curval == -EFAULT) +- ret = -EFAULT; +- if (curval == uval) +- break; +- uval = curval; +- } +- return ret; ++ /* ++ * Check if someone else fixed it for us: ++ */ ++ if (pi_state->owner != oldowner) ++ return 0; ++ ++ if (ret) ++ return ret; ++ ++ goto retry; + } + + /* +@@ -1524,7 +1576,7 @@ static int futex_lock_pi(u32 __user *uad + * that case: + */ + if (q.pi_state->owner != curr) +- ret = fixup_pi_state_owner(uaddr, &q, curr); ++ ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); + } else { + /* + * Catch the rare case, where the lock was released +@@ -1556,7 +1608,8 @@ static int futex_lock_pi(u32 __user *uad + int res; + + owner = rt_mutex_owner(&q.pi_state->pi_mutex); +- res = fixup_pi_state_owner(uaddr, &q, owner); ++ res = fixup_pi_state_owner(uaddr, &q, owner, ++ fshared); + + /* propagate -EFAULT, if the fixup failed */ + if (res) diff --git a/queue-2.6.25/ib-mthca-clear-icm-pages-before-handing-to-fw.patch b/queue-2.6.25/ib-mthca-clear-icm-pages-before-handing-to-fw.patch new file mode 100644 index 00000000000..13fe43d8d85 --- /dev/null +++ b/queue-2.6.25/ib-mthca-clear-icm-pages-before-handing-to-fw.patch @@ -0,0 +1,45 @@ +From stable-bounces@linux.kernel.org Mon Jun 23 16:30:22 2008 +From: Eli Cohen +Date: Mon, 23 Jun 2008 23:30:09 GMT +Subject: IB/mthca: Clear ICM pages before handing to FW +To: jejb@kernel.org, stable@kernel.org +Message-ID: <200806232330.m5NNU9f6010280@hera.kernel.org> + +From: Eli Cohen + +commit 87afd448b186c885d67a08b7417cd46253b6a9d6 upstream + +Current memfree FW has a bug which in some cases, assumes that ICM +pages passed to it are cleared. This patch uses __GFP_ZERO to +allocate all ICM pages passed to the FW. Once firmware with a fix is +released, we can make the workaround conditional on firmware version. + +This fixes the bug reported by Arthur Kepner here: +http://lists.openfabrics.org/pipermail/general/2008-May/050026.html + +[ Rewritten to be a one-liner using __GFP_ZERO instead of vmap()ing + ICM memory and memset()ing it to 0. - Roland ] + +Signed-off-by: Eli Cohen +Signed-off-by: Roland Dreier +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/infiniband/hw/mthca/mthca_memfree.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/drivers/infiniband/hw/mthca/mthca_memfree.c ++++ b/drivers/infiniband/hw/mthca/mthca_memfree.c +@@ -109,7 +109,11 @@ static int mthca_alloc_icm_pages(struct + { + struct page *page; + +- page = alloc_pages(gfp_mask, order); ++ /* ++ * Use __GFP_ZERO because buggy firmware assumes ICM pages are ++ * cleared, and subtle failures are seen if they aren't. ++ */ ++ page = alloc_pages(gfp_mask | __GFP_ZERO, order); + if (!page) + return -ENOMEM; + diff --git a/queue-2.6.25/series b/queue-2.6.25/series index 1118e330dea..764e4d9a0e5 100644 --- a/queue-2.6.25/series +++ b/queue-2.6.25/series @@ -1,2 +1,4 @@ tty-fix-for-tty-operations-bugs.patch xen-mask-unwanted-pte-bits-in-__supported_pte_mask.patch +futexes-fix-fault-handling-in-futex_lock_pi.patch +ib-mthca-clear-icm-pages-before-handing-to-fw.patch -- 2.47.3