From eb26edfe9b5d192a8ae36df9ff755862c27472e5 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 30 Jun 2008 09:14:14 -0700
Subject: [PATCH] more patches for .25

---
 ...-fix-fault-handling-in-futex_lock_pi.patch | 206 ++++++++++++++++++
 ...clear-icm-pages-before-handing-to-fw.patch |  45 ++++
 queue-2.6.25/series                           |   2 +
 3 files changed, 253 insertions(+)
 create mode 100644 queue-2.6.25/futexes-fix-fault-handling-in-futex_lock_pi.patch
 create mode 100644 queue-2.6.25/ib-mthca-clear-icm-pages-before-handing-to-fw.patch

diff --git a/queue-2.6.25/futexes-fix-fault-handling-in-futex_lock_pi.patch b/queue-2.6.25/futexes-fix-fault-handling-in-futex_lock_pi.patch
new file mode 100644
index 00000000000..1e1372431ca
--- /dev/null
+++ b/queue-2.6.25/futexes-fix-fault-handling-in-futex_lock_pi.patch
@@ -0,0 +1,206 @@
+From stable-bounces@linux.kernel.org Mon Jun 23 16:30:22 2008
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Mon, 23 Jun 2008 23:30:13 GMT
+Subject: futexes: fix fault handling in futex_lock_pi
+To: jejb@kernel.org, stable@kernel.org
+Message-ID: <200806232330.m5NNUDEY010317@hera.kernel.org>
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 1b7558e457ed0de61023cfc913d2c342c7c3d9f2 upstream
+
+This patch addresses a very sporadic pi-futex related failure in
+highly threaded java apps on large SMP systems.
+
+David Holmes reported that the pi_state consistency check in
+lookup_pi_state triggered with his test application. This means that
+the kernel internal pi_state and the user space futex variable are out
+of sync. First we assumed that this is a user space data corruption,
+but deeper investigation revieled that the problem happend because the
+pi-futex code is not handling a fault in the futex_lock_pi path when
+the user space variable needs to be fixed up.
+
+The fault happens when a fork mapped the anon memory which contains
+the futex readonly for COW or the page got swapped out exactly between
+the unlock of the futex and the return of either the new futex owner
+or the task which was the expected owner but failed to acquire the
+kernel internal rtmutex. The current futex_lock_pi() code drops out
+with an inconsistent in case it faults and returns -EFAULT to user
+space. User space has no way to fixup that state.
+
+When we wrote this code we thought that we could not drop the hash
+bucket lock at this point to handle the fault.
+
+After analysing the code again it turned out to be wrong because there
+are only two tasks involved which might modify the pi_state and the
+user space variable:
+
+ - the task which acquired the rtmutex
+ - the pending owner of the pi_state which did not get the rtmutex
+
+Both tasks drop into the fixup_pi_state() function before returning to
+user space. The first task which acquired the hash bucket lock faults
+in the fixup of the user space variable, drops the spinlock and calls
+futex_handle_fault() to fault in the page. Now the second task could
+acquire the hash bucket lock and tries to fixup the user space
+variable as well. It either faults as well or it succeeds because the
+first task already faulted the page in.
+
+One caveat is to avoid a double fixup. After returning from the fault
+handling we reacquire the hash bucket lock and check whether the
+pi_state owner has been modified already.
+
+Reported-by: David Holmes <david.holmes@sun.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: David Holmes <david.holmes@sun.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ kernel/futex.c |   93 ++++++++++++++++++++++++++++++++++++++++++++-------------
+ 1 file changed, 73 insertions(+), 20 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1118,21 +1118,64 @@ static void unqueue_me_pi(struct futex_q
+  * private futexes.
+  */
+ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+-				struct task_struct *newowner)
++				struct task_struct *newowner,
++				struct rw_semaphore *fshared)
+ {
+ 	u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
+ 	struct futex_pi_state *pi_state = q->pi_state;
++	struct task_struct *oldowner = pi_state->owner;
+ 	u32 uval, curval, newval;
+-	int ret;
++	int ret, attempt = 0;
+ 
+ 	/* Owner died? */
++	if (!pi_state->owner)
++		newtid |= FUTEX_OWNER_DIED;
++
++	/*
++	 * We are here either because we stole the rtmutex from the
++	 * pending owner or we are the pending owner which failed to
++	 * get the rtmutex. We have to replace the pending owner TID
++	 * in the user space variable. This must be atomic as we have
++	 * to preserve the owner died bit here.
++	 *
++	 * Note: We write the user space value _before_ changing the
++	 * pi_state because we can fault here. Imagine swapped out
++	 * pages or a fork, which was running right before we acquired
++	 * mmap_sem, that marked all the anonymous memory readonly for
++	 * cow.
++	 *
++	 * Modifying pi_state _before_ the user space value would
++	 * leave the pi_state in an inconsistent state when we fault
++	 * here, because we need to drop the hash bucket lock to
++	 * handle the fault. This might be observed in the PID check
++	 * in lookup_pi_state.
++	 */
++retry:
++	if (get_futex_value_locked(&uval, uaddr))
++		goto handle_fault;
++
++	while (1) {
++		newval = (uval & FUTEX_OWNER_DIED) | newtid;
++
++		curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
++
++		if (curval == -EFAULT)
++			goto handle_fault;
++		if (curval == uval)
++			break;
++		uval = curval;
++	}
++
++	/*
++	 * We fixed up user space. Now we need to fix the pi_state
++	 * itself.
++	 */
+ 	if (pi_state->owner != NULL) {
+ 		spin_lock_irq(&pi_state->owner->pi_lock);
+ 		WARN_ON(list_empty(&pi_state->list));
+ 		list_del_init(&pi_state->list);
+ 		spin_unlock_irq(&pi_state->owner->pi_lock);
+-	} else
+-		newtid |= FUTEX_OWNER_DIED;
++	}
+ 
+ 	pi_state->owner = newowner;
+ 
+@@ -1140,26 +1183,35 @@ static int fixup_pi_state_owner(u32 __us
+ 	WARN_ON(!list_empty(&pi_state->list));
+ 	list_add(&pi_state->list, &newowner->pi_state_list);
+ 	spin_unlock_irq(&newowner->pi_lock);
++	return 0;
+ 
+ 	/*
+-	 * We own it, so we have to replace the pending owner
+-	 * TID. This must be atomic as we have preserve the
+-	 * owner died bit here.
++	 * To handle the page fault we need to drop the hash bucket
++	 * lock here. That gives the other task (either the pending
++	 * owner itself or the task which stole the rtmutex) the
++	 * chance to try the fixup of the pi_state. So once we are
++	 * back from handling the fault we need to check the pi_state
++	 * after reacquiring the hash bucket lock and before trying to
++	 * do another fixup. When the fixup has been done already we
++	 * simply return.
+ 	 */
+-	ret = get_futex_value_locked(&uval, uaddr);
++handle_fault:
++	spin_unlock(q->lock_ptr);
+ 
+-	while (!ret) {
+-		newval = (uval & FUTEX_OWNER_DIED) | newtid;
++	ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++);
+ 
+-		curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
++	spin_lock(q->lock_ptr);
+ 
+-		if (curval == -EFAULT)
+-			ret = -EFAULT;
+-		if (curval == uval)
+-			break;
+-		uval = curval;
+-	}
+-	return ret;
++	/*
++	 * Check if someone else fixed it for us:
++	 */
++	if (pi_state->owner != oldowner)
++		return 0;
++
++	if (ret)
++		return ret;
++
++	goto retry;
+ }
+ 
+ /*
+@@ -1524,7 +1576,7 @@ static int futex_lock_pi(u32 __user *uad
+ 		 * that case:
+ 		 */
+ 		if (q.pi_state->owner != curr)
+-			ret = fixup_pi_state_owner(uaddr, &q, curr);
++			ret = fixup_pi_state_owner(uaddr, &q, curr, fshared);
+ 	} else {
+ 		/*
+ 		 * Catch the rare case, where the lock was released
+@@ -1556,7 +1608,8 @@ static int futex_lock_pi(u32 __user *uad
+ 				int res;
+ 
+ 				owner = rt_mutex_owner(&q.pi_state->pi_mutex);
+-				res = fixup_pi_state_owner(uaddr, &q, owner);
++				res = fixup_pi_state_owner(uaddr, &q, owner,
++							   fshared);
+ 
+ 				/* propagate -EFAULT, if the fixup failed */
+ 				if (res)
diff --git a/queue-2.6.25/ib-mthca-clear-icm-pages-before-handing-to-fw.patch b/queue-2.6.25/ib-mthca-clear-icm-pages-before-handing-to-fw.patch
new file mode 100644
index 00000000000..13fe43d8d85
--- /dev/null
+++ b/queue-2.6.25/ib-mthca-clear-icm-pages-before-handing-to-fw.patch
@@ -0,0 +1,45 @@
+From stable-bounces@linux.kernel.org Mon Jun 23 16:30:22 2008
+From: Eli Cohen <eli@mellanox.co.il>
+Date: Mon, 23 Jun 2008 23:30:09 GMT
+Subject: IB/mthca: Clear ICM pages before handing to FW
+To: jejb@kernel.org, stable@kernel.org
+Message-ID: <200806232330.m5NNU9f6010280@hera.kernel.org>
+
+From: Eli Cohen <eli@mellanox.co.il>
+
+commit 87afd448b186c885d67a08b7417cd46253b6a9d6 upstream
+
+Current memfree FW has a bug which in some cases, assumes that ICM
+pages passed to it are cleared.  This patch uses __GFP_ZERO to
+allocate all ICM pages passed to the FW.  Once firmware with a fix is
+released, we can make the workaround conditional on firmware version.
+
+This fixes the bug reported by Arthur Kepner <akepner@sgi.com> here:
+http://lists.openfabrics.org/pipermail/general/2008-May/050026.html
+
+[ Rewritten to be a one-liner using __GFP_ZERO instead of vmap()ing
+  ICM memory and memset()ing it to 0. - Roland ]
+
+Signed-off-by: Eli Cohen <eli@mellanox.co.il>
+Signed-off-by: Roland Dreier <rolandd@cisco.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/infiniband/hw/mthca/mthca_memfree.c |    6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/drivers/infiniband/hw/mthca/mthca_memfree.c
++++ b/drivers/infiniband/hw/mthca/mthca_memfree.c
+@@ -109,7 +109,11 @@ static int mthca_alloc_icm_pages(struct 
+ {
+ 	struct page *page;
+ 
+-	page = alloc_pages(gfp_mask, order);
++	/*
++	 * Use __GFP_ZERO because buggy firmware assumes ICM pages are
++	 * cleared, and subtle failures are seen if they aren't.
++	 */
++	page = alloc_pages(gfp_mask | __GFP_ZERO, order);
+ 	if (!page)
+ 		return -ENOMEM;
+ 
diff --git a/queue-2.6.25/series b/queue-2.6.25/series
index 1118e330dea..764e4d9a0e5 100644
--- a/queue-2.6.25/series
+++ b/queue-2.6.25/series
@@ -1,2 +1,4 @@
 tty-fix-for-tty-operations-bugs.patch
 xen-mask-unwanted-pte-bits-in-__supported_pte_mask.patch
+futexes-fix-fault-handling-in-futex_lock_pi.patch
+ib-mthca-clear-icm-pages-before-handing-to-fw.patch
-- 
2.47.3