From: Greg Kroah-Hartman Date: Tue, 10 Oct 2006 06:38:55 +0000 (-0700) Subject: more 2.6.18.1 patches X-Git-Tag: v2.6.17.14~18 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=917f9eb19484f74b091a725ded2055817074f863;p=thirdparty%2Fkernel%2Fstable-queue.git more 2.6.18.1 patches --- diff --git a/queue-2.6.18/alsa-fix-initiailization-of-user-space-controls.patch b/queue-2.6.18/alsa-fix-initiailization-of-user-space-controls.patch new file mode 100644 index 00000000000..f519643b5d2 --- /dev/null +++ b/queue-2.6.18/alsa-fix-initiailization-of-user-space-controls.patch @@ -0,0 +1,33 @@ +From stable-bounces@linux.kernel.org Mon Sep 25 03:51:05 2006 +Date: Mon, 25 Sep 2006 11:49:01 +0200 +Message-ID: +From: Takashi Iwai +To: stable@kernel.org +Subject: ALSA: Fix initiailization of user-space controls + +From: Takashi Iwai + +ALSA: Fix initiailization of user-space controls + +Fix an assertion when accessing a user-defined control due to lack of +initialization (appears only when CONFIG_SND_DEBUg is enabled). + + ALSA sound/core/control.c:660: BUG? (info->access == 0) + +Signed-off-by: Takashi Iwai +Signed-off-by: Greg Kroah-Hartman + +--- + sound/core/control.c | 1 + + 1 file changed, 1 insertion(+) + +--- linux-2.6.18.orig/sound/core/control.c ++++ linux-2.6.18/sound/core/control.c +@@ -997,6 +997,7 @@ static int snd_ctl_elem_add(struct snd_c + if (ue == NULL) + return -ENOMEM; + ue->info = *info; ++ ue->info.access = 0; + ue->elem_data = (char *)ue + sizeof(*ue); + ue->elem_data_size = private_size; + kctl.private_free = snd_ctl_elem_user_free; diff --git a/queue-2.6.18/fix-longstanding-load-balancing-bug-in-the-scheduler.patch b/queue-2.6.18/fix-longstanding-load-balancing-bug-in-the-scheduler.patch new file mode 100644 index 00000000000..e95a777830d --- /dev/null +++ b/queue-2.6.18/fix-longstanding-load-balancing-bug-in-the-scheduler.patch @@ -0,0 +1,215 @@ +From stable-bounces@linux.kernel.org Mon Sep 25 23:34:14 2006 +Message-Id: <200609260630.k8Q6UpbB011991@shell0.pdx.osdl.net> +To: torvalds@osdl.org +From: Christoph Lameter +Date: Mon, 25 Sep 2006 23:30:51 -0700 +Cc: akpm@osdl.org, nickpiggin@yahoo.com.au, suresh.b.siddha@intel.com, + christoph@sgi.com, pwil3058@bigpond.net.au, mingo@elte.hu, + hawkes@sgi.com, stable@kernel.org, clameter@sgi.com +Subject: Fix longstanding load balancing bug in the scheduler + +From: Christoph Lameter + +The scheduler will stop load balancing if the most busy processor contains +processes pinned via processor affinity. + +The scheduler currently only does one search for busiest cpu. If it cannot +pull any tasks away from the busiest cpu because they were pinned then the +scheduler goes into a corner and sulks leaving the idle processors idle. + +F.e. If you have processor 0 busy running four tasks pinned via taskset, +there are none on processor 1 and one just started two processes on +processor 2 then the scheduler will not move one of the two processes away +from processor 2. + +This patch fixes that issue by forcing the scheduler to come out of its +corner and retrying the load balancing by considering other processors for +load balancing. + +This patch was originally developed by John Hawkes and discussed at +http://marc.theaimsgroup.com/?l=linux-kernel&m=113901368523205&w=2. 
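+
+As a rough illustration of that retry idea, here is a toy userspace sketch
+(the cpu count, loads and helper names are invented for illustration only;
+the real change is the redo: path added to kernel/sched.c in the hunk below):
+
+	#include <stdio.h>
+
+	#define NCPUS 4
+
+	/* toy model: per-cpu load and whether every task on that cpu is pinned */
+	static const int load[NCPUS]       = { 4, 0, 2, 0 };
+	static const int all_pinned[NCPUS] = { 1, 0, 0, 0 };
+
+	/* pick the busiest cpu among those still allowed by the mask */
+	static int find_busiest(unsigned int mask)
+	{
+		int i, best = -1, best_load = 0;
+
+		for (i = 0; i < NCPUS; i++)
+			if ((mask & (1u << i)) && load[i] > best_load) {
+				best = i;
+				best_load = load[i];
+			}
+		return best;
+	}
+
+	int main(void)
+	{
+		unsigned int mask = (1u << NCPUS) - 1;	/* like CPU_MASK_ALL */
+		int busiest;
+
+		while ((busiest = find_busiest(mask)) >= 0) {
+			if (!all_pinned[busiest]) {
+				printf("pulling tasks from cpu %d\n", busiest);
+				break;
+			}
+			/* every task on the busiest cpu is pinned:
+			 * drop it from the candidate mask and retry */
+			mask &= ~(1u << busiest);
+		}
+		return 0;
+	}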
+ +I have removed extraneous material and gone back to equipping struct rq +with the cpu the queue is associated with since this makes the patch much +easier and it is likely that others in the future will have the same +difficulty of figuring out which processor owns which runqueue. + +The overhead added through these patches is a single word on the stack if +the kernel is configured to support 32 cpus or less (32 bit). For 32 bit +environments the maximum number of cpus that can be configued is 255 which +would result in the use of 32 bytes additional on the stack. On IA64 up to +1k cpus can be configured which will result in the use of 128 additional +bytes on the stack. The maximum additional cache footprint is one +cacheline. Typically memory use will be much less than a cacheline and the +additional cpumask will be placed on the stack in a cacheline that already +contains other local variable. + + +Signed-off-by: Christoph Lameter +Cc: John Hawkes +Cc: "Siddha, Suresh B" +Cc: Ingo Molnar +Cc: Nick Piggin +Cc: Peter Williams +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/sched.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++-------- + 1 file changed, 46 insertions(+), 8 deletions(-) + +--- linux-2.6.18.orig/kernel/sched.c ++++ linux-2.6.18/kernel/sched.c +@@ -238,6 +238,7 @@ struct rq { + /* For active balancing */ + int active_balance; + int push_cpu; ++ int cpu; /* cpu of this runqueue */ + + struct task_struct *migration_thread; + struct list_head migration_queue; +@@ -267,6 +268,15 @@ struct rq { + + static DEFINE_PER_CPU(struct rq, runqueues); + ++static inline int cpu_of(struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ return rq->cpu; ++#else ++ return 0; ++#endif ++} ++ + /* + * The domain tree (rq->sd) is protected by RCU's quiescent state transition. + * See detach_destroy_domains: synchronize_sched for details. 
+@@ -2211,7 +2221,8 @@ out: + */ + static struct sched_group * + find_busiest_group(struct sched_domain *sd, int this_cpu, +- unsigned long *imbalance, enum idle_type idle, int *sd_idle) ++ unsigned long *imbalance, enum idle_type idle, int *sd_idle, ++ cpumask_t *cpus) + { + struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; + unsigned long max_load, avg_load, total_load, this_load, total_pwr; +@@ -2248,7 +2259,12 @@ find_busiest_group(struct sched_domain * + sum_weighted_load = sum_nr_running = avg_load = 0; + + for_each_cpu_mask(i, group->cpumask) { +- struct rq *rq = cpu_rq(i); ++ struct rq *rq; ++ ++ if (!cpu_isset(i, *cpus)) ++ continue; ++ ++ rq = cpu_rq(i); + + if (*sd_idle && !idle_cpu(i)) + *sd_idle = 0; +@@ -2466,13 +2482,17 @@ ret: + */ + static struct rq * + find_busiest_queue(struct sched_group *group, enum idle_type idle, +- unsigned long imbalance) ++ unsigned long imbalance, cpumask_t *cpus) + { + struct rq *busiest = NULL, *rq; + unsigned long max_load = 0; + int i; + + for_each_cpu_mask(i, group->cpumask) { ++ ++ if (!cpu_isset(i, *cpus)) ++ continue; ++ + rq = cpu_rq(i); + + if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) +@@ -2511,6 +2531,7 @@ static int load_balance(int this_cpu, st + struct sched_group *group; + unsigned long imbalance; + struct rq *busiest; ++ cpumask_t cpus = CPU_MASK_ALL; + + if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && + !sched_smt_power_savings) +@@ -2518,13 +2539,15 @@ static int load_balance(int this_cpu, st + + schedstat_inc(sd, lb_cnt[idle]); + +- group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle); ++redo: ++ group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, ++ &cpus); + if (!group) { + schedstat_inc(sd, lb_nobusyg[idle]); + goto out_balanced; + } + +- busiest = find_busiest_queue(group, idle, imbalance); ++ busiest = find_busiest_queue(group, idle, imbalance, &cpus); + if (!busiest) { + schedstat_inc(sd, lb_nobusyq[idle]); + goto out_balanced; +@@ -2549,8 +2572,12 @@ static int load_balance(int this_cpu, st + double_rq_unlock(this_rq, busiest); + + /* All tasks on this runqueue were pinned by CPU affinity */ +- if (unlikely(all_pinned)) ++ if (unlikely(all_pinned)) { ++ cpu_clear(cpu_of(busiest), cpus); ++ if (!cpus_empty(cpus)) ++ goto redo; + goto out_balanced; ++ } + } + + if (!nr_moved) { +@@ -2639,18 +2666,22 @@ load_balance_newidle(int this_cpu, struc + unsigned long imbalance; + int nr_moved = 0; + int sd_idle = 0; ++ cpumask_t cpus = CPU_MASK_ALL; + + if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) + sd_idle = 1; + + schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); +- group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle); ++redo: ++ group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, ++ &sd_idle, &cpus); + if (!group) { + schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); + goto out_balanced; + } + +- busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance); ++ busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance, ++ &cpus); + if (!busiest) { + schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); + goto out_balanced; +@@ -2668,6 +2699,12 @@ load_balance_newidle(int this_cpu, struc + minus_1_or_zero(busiest->nr_running), + imbalance, sd, NEWLY_IDLE, NULL); + spin_unlock(&busiest->lock); ++ ++ if (!nr_moved) { ++ cpu_clear(cpu_of(busiest), cpus); ++ if (!cpus_empty(cpus)) ++ goto redo; ++ } + } + + if (!nr_moved) { +@@ -6747,6 +6784,7 @@ void __init sched_init(void) + rq->cpu_load[j] = 0; + rq->active_balance = 0; + 
rq->push_cpu = 0; ++ rq->cpu = i; + rq->migration_thread = NULL; + INIT_LIST_HEAD(&rq->migration_queue); + #endif diff --git a/queue-2.6.18/ib-mthca-fix-lid-used-for-sending-traps.patch b/queue-2.6.18/ib-mthca-fix-lid-used-for-sending-traps.patch new file mode 100644 index 00000000000..3307d6c7a19 --- /dev/null +++ b/queue-2.6.18/ib-mthca-fix-lid-used-for-sending-traps.patch @@ -0,0 +1,38 @@ +From stable-bounces@linux.kernel.org Wed Sep 20 15:47:56 2006 +To: stable@kernel.org +From: Roland Dreier +Date: Wed, 20 Sep 2006 15:47:16 -0700 +Message-ID: +Subject: IB/mthca: Fix lid used for sending traps + +From: Jack Morgenstein + +The SM LID used to send traps to is incorrectly set to port LID. This +is a regression from 2.6.17 -- after a PortInfo MAD is received, no +traps are sent to the SM LID. The traps go to the loopback interface +instead, and are dropped there. The SM LID should be taken from the +sm_lid of the PortInfo response. + +The bug was introduced by commit 12bbb2b7be7f5564952ebe0196623e97464b8ac5: + IB/mthca: Add client reregister event generation + +Signed-off-by: Jack Morgenstein +Signed-off-by: Michael S. Tsirkin +Signed-off-by: Roland Dreier +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/infiniband/hw/mthca/mthca_mad.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- linux-2.6.18.orig/drivers/infiniband/hw/mthca/mthca_mad.c ++++ linux-2.6.18/drivers/infiniband/hw/mthca/mthca_mad.c +@@ -119,7 +119,7 @@ static void smp_snoop(struct ib_device * + + mthca_update_rate(to_mdev(ibdev), port_num); + update_sm_ah(to_mdev(ibdev), port_num, +- be16_to_cpu(pinfo->lid), ++ be16_to_cpu(pinfo->sm_lid), + pinfo->neighbormtu_mastersmsl & 0xf); + + event.device = ibdev; diff --git a/queue-2.6.18/jbd-fix-commit-of-ordered-data-buffers.patch b/queue-2.6.18/jbd-fix-commit-of-ordered-data-buffers.patch new file mode 100644 index 00000000000..76e7c0eb09b --- /dev/null +++ b/queue-2.6.18/jbd-fix-commit-of-ordered-data-buffers.patch @@ -0,0 +1,239 @@ +From stable-bounces@linux.kernel.org Mon Sep 11 15:14:16 2006 +Message-Id: <200609112213.k8BMDbPC029844@shell0.pdx.osdl.net> +To: mm-commits@vger.kernel.org +From: akpm@osdl.org +Date: Mon, 11 Sep 2006 15:13:37 -0700 +Cc: pbadari@us.ibm.com, jack@suse.cz, stable@kernel.org +Subject: jbd: fix commit of ordered data buffers + +From: Jan Kara + +Original commit code assumes, that when a buffer on BJ_SyncData list is +locked, it is being written to disk. But this is not true and hence it can +lead to a potential data loss on crash. Also the code didn't count with +the fact that journal_dirty_data() can steal buffers from committing +transaction and hence could write buffers that no longer belong to the +committing transaction. Finally it could possibly happen that we tried +writing out one buffer several times. + +The patch below tries to solve these problems by a complete rewrite of the +data commit code. We go through buffers on t_sync_datalist, lock buffers +needing write out and store them in an array. Buffers are also immediately +refiled to BJ_Locked list or unfiled (if the write out is completed). When +the array is full or we have to block on buffer lock, we submit all +accumulated buffers for IO. 
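+
+As a rough sketch of that batching pattern, consider the toy userspace code
+below (buffer numbers and helper names are invented; the actual implementation
+is journal_submit_data_buffers() in the fs/jbd/commit.c hunk that follows):
+
+	#include <stdio.h>
+
+	#define WBUFSIZE 4	/* stands in for journal->j_wbufsize */
+
+	/* stand-in for calling submit_bh() on each accumulated buffer */
+	static void submit_all(int *wbuf, int bufs)
+	{
+		for (int i = 0; i < bufs; i++)
+			printf("submit buffer %d\n", wbuf[i]);
+	}
+
+	int main(void)
+	{
+		int wbuf[WBUFSIZE];
+		int bufs = 0;
+
+		/* pretend buffers 0..9 are dirty and need writeout */
+		for (int b = 0; b < 10; b++) {
+			wbuf[bufs++] = b;		/* accumulate into the array */
+			if (bufs == WBUFSIZE) {		/* array full: flush the batch */
+				submit_all(wbuf, bufs);
+				bufs = 0;
+			}
+		}
+		submit_all(wbuf, bufs);			/* submit whatever remains */
+		return 0;
+	}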
+ +[suitable for 2.6.18.x around the 2.6.19-rc2 timeframe] + +Signed-off-by: Jan Kara +Cc: Badari Pulavarty +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman + +--- + fs/jbd/commit.c | 182 ++++++++++++++++++++++++++++++++++---------------------- + 1 file changed, 113 insertions(+), 69 deletions(-) + +--- linux-2.6.18.orig/fs/jbd/commit.c ++++ linux-2.6.18/fs/jbd/commit.c +@@ -160,6 +160,117 @@ static int journal_write_commit_record(j + return (ret == -EIO); + } + ++void journal_do_submit_data(struct buffer_head **wbuf, int bufs) ++{ ++ int i; ++ ++ for (i = 0; i < bufs; i++) { ++ wbuf[i]->b_end_io = end_buffer_write_sync; ++ /* We use-up our safety reference in submit_bh() */ ++ submit_bh(WRITE, wbuf[i]); ++ } ++} ++ ++/* ++ * Submit all the data buffers to disk ++ */ ++static void journal_submit_data_buffers(journal_t *journal, ++ transaction_t *commit_transaction) ++{ ++ struct journal_head *jh; ++ struct buffer_head *bh; ++ int locked; ++ int bufs = 0; ++ struct buffer_head **wbuf = journal->j_wbuf; ++ ++ /* ++ * Whenever we unlock the journal and sleep, things can get added ++ * onto ->t_sync_datalist, so we have to keep looping back to ++ * write_out_data until we *know* that the list is empty. ++ * ++ * Cleanup any flushed data buffers from the data list. Even in ++ * abort mode, we want to flush this out as soon as possible. ++ */ ++write_out_data: ++ cond_resched(); ++ spin_lock(&journal->j_list_lock); ++ ++ while (commit_transaction->t_sync_datalist) { ++ jh = commit_transaction->t_sync_datalist; ++ bh = jh2bh(jh); ++ locked = 0; ++ ++ /* Get reference just to make sure buffer does not disappear ++ * when we are forced to drop various locks */ ++ get_bh(bh); ++ /* If the buffer is dirty, we need to submit IO and hence ++ * we need the buffer lock. We try to lock the buffer without ++ * blocking. If we fail, we need to drop j_list_lock and do ++ * blocking lock_buffer(). ++ */ ++ if (buffer_dirty(bh)) { ++ if (test_set_buffer_locked(bh)) { ++ BUFFER_TRACE(bh, "needs blocking lock"); ++ spin_unlock(&journal->j_list_lock); ++ /* Write out all data to prevent deadlocks */ ++ journal_do_submit_data(wbuf, bufs); ++ bufs = 0; ++ lock_buffer(bh); ++ spin_lock(&journal->j_list_lock); ++ } ++ locked = 1; ++ } ++ /* We have to get bh_state lock. Again out of order, sigh. */ ++ if (!inverted_lock(journal, bh)) { ++ jbd_lock_bh_state(bh); ++ spin_lock(&journal->j_list_lock); ++ } ++ /* Someone already cleaned up the buffer? 
*/ ++ if (!buffer_jbd(bh) ++ || jh->b_transaction != commit_transaction ++ || jh->b_jlist != BJ_SyncData) { ++ jbd_unlock_bh_state(bh); ++ if (locked) ++ unlock_buffer(bh); ++ BUFFER_TRACE(bh, "already cleaned up"); ++ put_bh(bh); ++ continue; ++ } ++ if (locked && test_clear_buffer_dirty(bh)) { ++ BUFFER_TRACE(bh, "needs writeout, adding to array"); ++ wbuf[bufs++] = bh; ++ __journal_file_buffer(jh, commit_transaction, ++ BJ_Locked); ++ jbd_unlock_bh_state(bh); ++ if (bufs == journal->j_wbufsize) { ++ spin_unlock(&journal->j_list_lock); ++ journal_do_submit_data(wbuf, bufs); ++ bufs = 0; ++ goto write_out_data; ++ } ++ } ++ else { ++ BUFFER_TRACE(bh, "writeout complete: unfile"); ++ __journal_unfile_buffer(jh); ++ jbd_unlock_bh_state(bh); ++ if (locked) ++ unlock_buffer(bh); ++ journal_remove_journal_head(bh); ++ /* Once for our safety reference, once for ++ * journal_remove_journal_head() */ ++ put_bh(bh); ++ put_bh(bh); ++ } ++ ++ if (lock_need_resched(&journal->j_list_lock)) { ++ spin_unlock(&journal->j_list_lock); ++ goto write_out_data; ++ } ++ } ++ spin_unlock(&journal->j_list_lock); ++ journal_do_submit_data(wbuf, bufs); ++} ++ + /* + * journal_commit_transaction + * +@@ -313,80 +424,13 @@ void journal_commit_transaction(journal_ + * Now start flushing things to disk, in the order they appear + * on the transaction lists. Data blocks go first. + */ +- + err = 0; +- /* +- * Whenever we unlock the journal and sleep, things can get added +- * onto ->t_sync_datalist, so we have to keep looping back to +- * write_out_data until we *know* that the list is empty. +- */ +- bufs = 0; +- /* +- * Cleanup any flushed data buffers from the data list. Even in +- * abort mode, we want to flush this out as soon as possible. +- */ +-write_out_data: +- cond_resched(); +- spin_lock(&journal->j_list_lock); +- +- while (commit_transaction->t_sync_datalist) { +- struct buffer_head *bh; +- +- jh = commit_transaction->t_sync_datalist; +- commit_transaction->t_sync_datalist = jh->b_tnext; +- bh = jh2bh(jh); +- if (buffer_locked(bh)) { +- BUFFER_TRACE(bh, "locked"); +- if (!inverted_lock(journal, bh)) +- goto write_out_data; +- __journal_temp_unlink_buffer(jh); +- __journal_file_buffer(jh, commit_transaction, +- BJ_Locked); +- jbd_unlock_bh_state(bh); +- if (lock_need_resched(&journal->j_list_lock)) { +- spin_unlock(&journal->j_list_lock); +- goto write_out_data; +- } +- } else { +- if (buffer_dirty(bh)) { +- BUFFER_TRACE(bh, "start journal writeout"); +- get_bh(bh); +- wbuf[bufs++] = bh; +- if (bufs == journal->j_wbufsize) { +- jbd_debug(2, "submit %d writes\n", +- bufs); +- spin_unlock(&journal->j_list_lock); +- ll_rw_block(SWRITE, bufs, wbuf); +- journal_brelse_array(wbuf, bufs); +- bufs = 0; +- goto write_out_data; +- } +- } else { +- BUFFER_TRACE(bh, "writeout complete: unfile"); +- if (!inverted_lock(journal, bh)) +- goto write_out_data; +- __journal_unfile_buffer(jh); +- jbd_unlock_bh_state(bh); +- journal_remove_journal_head(bh); +- put_bh(bh); +- if (lock_need_resched(&journal->j_list_lock)) { +- spin_unlock(&journal->j_list_lock); +- goto write_out_data; +- } +- } +- } +- } +- +- if (bufs) { +- spin_unlock(&journal->j_list_lock); +- ll_rw_block(SWRITE, bufs, wbuf); +- journal_brelse_array(wbuf, bufs); +- spin_lock(&journal->j_list_lock); +- } ++ journal_submit_data_buffers(journal, commit_transaction); + + /* + * Wait for all previously submitted IO to complete. 
+ */ ++ spin_lock(&journal->j_list_lock); + while (commit_transaction->t_locked_list) { + struct buffer_head *bh; + diff --git a/queue-2.6.18/s390-user-readable-uninitialised-kernel-memory.patch b/queue-2.6.18/s390-user-readable-uninitialised-kernel-memory.patch new file mode 100644 index 00000000000..f6fe9906144 --- /dev/null +++ b/queue-2.6.18/s390-user-readable-uninitialised-kernel-memory.patch @@ -0,0 +1,69 @@ +From stable-bounces@linux.kernel.org Thu Sep 28 06:32:54 2006 +Date: Thu, 28 Sep 2006 15:31:52 +0200 +From: Martin Schwidefsky +To: gregkh@suse.de, bunk@stusta.de +Message-ID: <20060928133152.GA10672@skybase> +Content-Disposition: inline +Cc: stable@kernel.org +Subject: S390: user readable uninitialised kernel memory (CVE-2006-5174) + +From: Martin Schwidefsky + +[S390] user readable uninitialised kernel memory. + +A user space program can read uninitialised kernel memory +by appending to a file from a bad address and then reading +the result back. The cause is the copy_from_user function +that does not clear the remaining bytes of the kernel +buffer after it got a fault on the user space address. + +Signed-off-by: Martin Schwidefsky +Signed-off-by: Greg Kroah-Hartman + +--- + arch/s390/lib/uaccess.S | 12 +++++++++++- + arch/s390/lib/uaccess64.S | 12 +++++++++++- + 2 files changed, 22 insertions(+), 2 deletions(-) + +--- linux-2.6.18.orig/arch/s390/lib/uaccess.S ++++ linux-2.6.18/arch/s390/lib/uaccess.S +@@ -40,7 +40,17 @@ __copy_from_user_asm: + # move with the reduced length which is < 256 + 5: mvcp 0(%r5,%r2),0(%r4),%r0 + slr %r3,%r5 +-6: lr %r2,%r3 ++ alr %r2,%r5 ++6: lgr %r5,%r3 # copy remaining size ++ ahi %r5,-1 # subtract 1 for xc loop ++ bras %r4,8f ++ xc 0(1,%2),0(%2) ++7: xc 0(256,%2),0(%2) ++ la %r2,256(%r2) ++8: ahji %r5,-256 ++ jnm 7b ++ ex %r5,0(%r2) ++9: lr %r2,%r3 + br %r14 + .section __ex_table,"a" + .long 0b,4b +--- linux-2.6.18.orig/arch/s390/lib/uaccess64.S ++++ linux-2.6.18/arch/s390/lib/uaccess64.S +@@ -40,7 +40,17 @@ __copy_from_user_asm: + # move with the reduced length which is < 256 + 5: mvcp 0(%r5,%r2),0(%r4),%r0 + slgr %r3,%r5 +-6: lgr %r2,%r3 ++ algr %r2,%r5 ++6: lgr %r5,%r3 # copy remaining size ++ aghi %r5,-1 # subtract 1 for xc loop ++ bras %r4,8f ++ xc 0(1,%r2),0(%r2) ++7: xc 0(256,%r2),0(%r2) ++ la %r2,256(%r2) ++8: aghi %r5,-256 ++ jnm 7b ++ ex %r5,0(%r2) ++9: lgr %r2,%r3 + br %r14 + .section __ex_table,"a" + .quad 0b,4b diff --git a/queue-2.6.18/series b/queue-2.6.18/series index e39b176fd70..c3c8e5ba232 100644 --- a/queue-2.6.18/series +++ b/queue-2.6.18/series @@ -9,3 +9,10 @@ video-pvrusb2-improve-24xxx-config-option-description.patch video-pvrusb2-suppress-compiler-warning.patch video-pvrusb2-limit-hor-res-for-24xxx-devices.patch zd1211rw-zd1211b-asic-fwt-not-jointly-decoder.patch +s390-user-readable-uninitialised-kernel-memory.patch +ib-mthca-fix-lid-used-for-sending-traps.patch +usb-allow-compile-in-g_ether-fix-typo.patch +alsa-fix-initiailization-of-user-space-controls.patch +jbd-fix-commit-of-ordered-data-buffers.patch +fix-longstanding-load-balancing-bug-in-the-scheduler.patch +zone_reclaim-dynamic-slab-reclaim.patch diff --git a/queue-2.6.18/usb-allow-compile-in-g_ether-fix-typo.patch b/queue-2.6.18/usb-allow-compile-in-g_ether-fix-typo.patch new file mode 100644 index 00000000000..75107987372 --- /dev/null +++ b/queue-2.6.18/usb-allow-compile-in-g_ether-fix-typo.patch @@ -0,0 +1,39 @@ +From stable-bounces@linux.kernel.org Fri Sep 22 20:18:48 2006 +Date: Fri, 22 Sep 2006 20:17:48 -0700 +To: stable@kernel.org +Cc: Andrew Morton 
+Message-Id: <20060922201748.bf3245d6.akpm@osdl.org> +Subject: USB: Allow compile in g_ether, fix typo + +From: Tony Lindgren + +Allows compiling g_ether in and fixes a typo with MUSB_HDRC + +Signed-off-by: Tony Lindgren +Cc: David Brownell +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/usb/gadget/ether.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- linux-2.6.18.orig/drivers/usb/gadget/ether.c ++++ linux-2.6.18/drivers/usb/gadget/ether.c +@@ -262,7 +262,7 @@ MODULE_PARM_DESC(host_addr, "Host Ethern + #define DEV_CONFIG_CDC + #endif + +-#ifdef CONFIG_USB_GADGET_MUSBHDRC ++#ifdef CONFIG_USB_GADGET_MUSB_HDRC + #define DEV_CONFIG_CDC + #endif + +@@ -2564,7 +2564,7 @@ static struct usb_gadget_driver eth_driv + + .function = (char *) driver_desc, + .bind = eth_bind, +- .unbind = __exit_p(eth_unbind), ++ .unbind = eth_unbind, + + .setup = eth_setup, + .disconnect = eth_disconnect, diff --git a/queue-2.6.18/zone_reclaim-dynamic-slab-reclaim.patch b/queue-2.6.18/zone_reclaim-dynamic-slab-reclaim.patch new file mode 100644 index 00000000000..5cce46b1cc3 --- /dev/null +++ b/queue-2.6.18/zone_reclaim-dynamic-slab-reclaim.patch @@ -0,0 +1,315 @@ +From stable-bounces@linux.kernel.org Mon Oct 2 10:46:22 2006 +Date: Mon, 2 Oct 2006 10:45:24 -0700 (PDT) +From: Christoph Lameter +To: stable@kernel.org +Message-ID: +Subject: zone_reclaim: dynamic slab reclaim + +From: Christoph Lameter + +http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=0ff38490c836dc379ff7ec45b10a15a662f4e5f6 + + +Currently one can enable slab reclaim by setting an explicit option in +/proc/sys/vm/zone_reclaim_mode. Slab reclaim is then used as a final +option if the freeing of unmapped file backed pages is not enough to free +enough pages to allow a local allocation. + +However, that means that the slab can grow excessively and that most memory +of a node may be used by slabs. We have had a case where a machine with +46GB of memory was using 40-42GB for slab. Zone reclaim was effective in +dealing with pagecache pages. However, slab reclaim was only done during +global reclaim (which is a bit rare on NUMA systems). + +This patch implements slab reclaim during zone reclaim. Zone reclaim +occurs if there is a danger of an off node allocation. At that point we + +1. Shrink the per node page cache if the number of pagecache + pages is more than min_unmapped_ratio percent of pages in a zone. + +2. Shrink the slab cache if the number of the nodes reclaimable slab pages + (patch depends on earlier one that implements that counter) + are more than min_slab_ratio (a new /proc/sys/vm tunable). + +The shrinking of the slab cache is a bit problematic since it is not node +specific. So we simply calculate what point in the slab we want to reach +(current per node slab use minus the number of pages that neeed to be +allocated) and then repeately run the global reclaim until that is +unsuccessful or we have reached the limit. I hope we will have zone based +slab reclaim at some point which will make that easier. + +The default for the min_slab_ratio is 5% + +Also remove the slab option from /proc/sys/vm/zone_reclaim_mode. 
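+
+To illustrate the thresholds described above, here is a toy userspace sketch
+(the page counts and the shrink_slab_once() helper are invented; the real
+logic lives in __zone_reclaim() in the mm/vmscan.c hunk further down):
+
+	#include <stdio.h>
+
+	/* pretend per-zone counters, in pages */
+	static unsigned long zone_pages = 1000000;
+	static unsigned long slab_pages = 80000;	/* reclaimable slab pages */
+	static const unsigned long min_slab_ratio = 5;	/* default: 5 percent */
+
+	/* stand-in for one global shrink_slab() pass; returns pages freed */
+	static unsigned long shrink_slab_once(void)
+	{
+		unsigned long freed = slab_pages > 10000 ? 10000 : slab_pages;
+
+		slab_pages -= freed;
+		return freed;
+	}
+
+	int main(void)
+	{
+		unsigned long min_slab_pages = zone_pages * min_slab_ratio / 100;
+		unsigned long nr_pages = 32;	/* size of the allocation that failed locally */
+
+		if (slab_pages > min_slab_pages) {
+			/* target: current slab use minus the pages we need */
+			unsigned long limit = slab_pages - nr_pages;
+
+			while (shrink_slab_once() && slab_pages > limit)
+				;
+		}
+		printf("reclaimable slab pages now: %lu\n", slab_pages);
+		return 0;
+	}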
+ +[akpm@osdl.org: cleanups] +Signed-off-by: Christoph Lameter +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + Documentation/sysctl/vm.txt | 27 +++++++++++++++----- + include/linux/mmzone.h | 3 ++ + include/linux/swap.h | 1 + include/linux/sysctl.h | 1 + kernel/sysctl.c | 11 ++++++++ + mm/page_alloc.c | 17 ++++++++++++ + mm/vmscan.c | 58 ++++++++++++++++++++++++++++---------------- + 7 files changed, 90 insertions(+), 28 deletions(-) + +--- linux-2.6.18.orig/Documentation/sysctl/vm.txt ++++ linux-2.6.18/Documentation/sysctl/vm.txt +@@ -29,6 +29,7 @@ Currently, these files are in /proc/sys/ + - drop-caches + - zone_reclaim_mode + - min_unmapped_ratio ++- min_slab_ratio + - panic_on_oom + + ============================================================== +@@ -138,7 +139,6 @@ This is value ORed together of + 1 = Zone reclaim on + 2 = Zone reclaim writes dirty pages out + 4 = Zone reclaim swaps pages +-8 = Also do a global slab reclaim pass + + zone_reclaim_mode is set during bootup to 1 if it is determined that pages + from remote zones will cause a measurable performance reduction. The +@@ -162,18 +162,13 @@ Allowing regular swap effectively restri + node unless explicitly overridden by memory policies or cpuset + configurations. + +-It may be advisable to allow slab reclaim if the system makes heavy +-use of files and builds up large slab caches. However, the slab +-shrink operation is global, may take a long time and free slabs +-in all nodes of the system. +- + ============================================================= + + min_unmapped_ratio: + + This is available only on NUMA kernels. + +-A percentage of the file backed pages in each zone. Zone reclaim will only ++A percentage of the total pages in each zone. Zone reclaim will only + occur if more than this percentage of pages are file backed and unmapped. + This is to insure that a minimal amount of local pages is still available for + file I/O even if the node is overallocated. +@@ -182,6 +177,24 @@ The default is 1 percent. + + ============================================================= + ++min_slab_ratio: ++ ++This is available only on NUMA kernels. ++ ++A percentage of the total pages in each zone. On Zone reclaim ++(fallback from the local zone occurs) slabs will be reclaimed if more ++than this percentage of pages in a zone are reclaimable slab pages. ++This insures that the slab growth stays under control even in NUMA ++systems that rarely perform global reclaim. ++ ++The default is 5 percent. ++ ++Note that slab reclaim is triggered in a per zone / node fashion. ++The process of reclaiming slab memory is currently not node specific ++and may not be fast. ++ ++============================================================= ++ + panic_on_oom + + This enables or disables panic on out-of-memory feature. If this is set to 1, +--- linux-2.6.18.orig/include/linux/mmzone.h ++++ linux-2.6.18/include/linux/mmzone.h +@@ -155,6 +155,7 @@ struct zone { + * zone reclaim becomes active if more unmapped pages exist. 
+ */ + unsigned long min_unmapped_ratio; ++ unsigned long min_slab_pages; + struct per_cpu_pageset *pageset[NR_CPUS]; + #else + struct per_cpu_pageset pageset[NR_CPUS]; +@@ -421,6 +422,8 @@ int percpu_pagelist_fraction_sysctl_hand + void __user *, size_t *, loff_t *); + int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, + struct file *, void __user *, size_t *, loff_t *); ++int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, ++ struct file *, void __user *, size_t *, loff_t *); + + #include + /* Returns the number of the current Node. */ +--- linux-2.6.18.orig/include/linux/swap.h ++++ linux-2.6.18/include/linux/swap.h +@@ -190,6 +190,7 @@ extern long vm_total_pages; + #ifdef CONFIG_NUMA + extern int zone_reclaim_mode; + extern int sysctl_min_unmapped_ratio; ++extern int sysctl_min_slab_ratio; + extern int zone_reclaim(struct zone *, gfp_t, unsigned int); + #else + #define zone_reclaim_mode 0 +--- linux-2.6.18.orig/include/linux/sysctl.h ++++ linux-2.6.18/include/linux/sysctl.h +@@ -191,6 +191,7 @@ enum + VM_MIN_UNMAPPED=32, /* Set min percent of unmapped pages */ + VM_PANIC_ON_OOM=33, /* panic at out-of-memory */ + VM_VDSO_ENABLED=34, /* map VDSO into new processes? */ ++ VM_MIN_SLAB=35, /* Percent pages ignored by zone reclaim */ + }; + + +--- linux-2.6.18.orig/kernel/sysctl.c ++++ linux-2.6.18/kernel/sysctl.c +@@ -943,6 +943,17 @@ static ctl_table vm_table[] = { + .extra1 = &zero, + .extra2 = &one_hundred, + }, ++ { ++ .ctl_name = VM_MIN_SLAB, ++ .procname = "min_slab_ratio", ++ .data = &sysctl_min_slab_ratio, ++ .maxlen = sizeof(sysctl_min_slab_ratio), ++ .mode = 0644, ++ .proc_handler = &sysctl_min_slab_ratio_sysctl_handler, ++ .strategy = &sysctl_intvec, ++ .extra1 = &zero, ++ .extra2 = &one_hundred, ++ }, + #endif + #ifdef CONFIG_X86_32 + { +--- linux-2.6.18.orig/mm/page_alloc.c ++++ linux-2.6.18/mm/page_alloc.c +@@ -2008,6 +2008,7 @@ static void __meminit free_area_init_cor + #ifdef CONFIG_NUMA + zone->min_unmapped_ratio = (realsize*sysctl_min_unmapped_ratio) + / 100; ++ zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; + #endif + zone->name = zone_names[j]; + spin_lock_init(&zone->lock); +@@ -2318,6 +2319,22 @@ int sysctl_min_unmapped_ratio_sysctl_han + sysctl_min_unmapped_ratio) / 100; + return 0; + } ++ ++int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, ++ struct file *file, void __user *buffer, size_t *length, loff_t *ppos) ++{ ++ struct zone *zone; ++ int rc; ++ ++ rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); ++ if (rc) ++ return rc; ++ ++ for_each_zone(zone) ++ zone->min_slab_pages = (zone->present_pages * ++ sysctl_min_slab_ratio) / 100; ++ return 0; ++} + #endif + + /* +--- linux-2.6.18.orig/mm/vmscan.c ++++ linux-2.6.18/mm/vmscan.c +@@ -1510,7 +1510,6 @@ int zone_reclaim_mode __read_mostly; + #define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ + #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ + #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ +-#define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */ + + /* + * Priority for ZONE_RECLAIM. This determines the fraction of pages +@@ -1526,6 +1525,12 @@ int zone_reclaim_mode __read_mostly; + int sysctl_min_unmapped_ratio = 1; + + /* ++ * If the number of slab pages in a zone grows beyond this percentage then ++ * slab reclaim needs to occur. ++ */ ++int sysctl_min_slab_ratio = 5; ++ ++/* + * Try to free up some pages from this zone through reclaim. 
+ */ + static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +@@ -1556,29 +1561,37 @@ static int __zone_reclaim(struct zone *z + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + +- /* +- * Free memory by calling shrink zone with increasing priorities +- * until we have enough memory freed. +- */ +- priority = ZONE_RECLAIM_PRIORITY; +- do { +- nr_reclaimed += shrink_zone(priority, zone, &sc); +- priority--; +- } while (priority >= 0 && nr_reclaimed < nr_pages); ++ if (zone_page_state(zone, NR_FILE_PAGES) - ++ zone_page_state(zone, NR_FILE_MAPPED) > ++ zone->min_unmapped_ratio) { ++ /* ++ * Free memory by calling shrink zone with increasing ++ * priorities until we have enough memory freed. ++ */ ++ priority = ZONE_RECLAIM_PRIORITY; ++ do { ++ nr_reclaimed += shrink_zone(priority, zone, &sc); ++ priority--; ++ } while (priority >= 0 && nr_reclaimed < nr_pages); ++ } + +- if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { ++ if (zone_page_state(zone, NR_SLAB) > zone->min_slab_pages) { + /* + * shrink_slab() does not currently allow us to determine how +- * many pages were freed in this zone. So we just shake the slab +- * a bit and then go off node for this particular allocation +- * despite possibly having freed enough memory to allocate in +- * this zone. If we freed local memory then the next +- * allocations will be local again. ++ * many pages were freed in this zone. So we take the current ++ * number of slab pages and shake the slab until it is reduced ++ * by the same nr_pages that we used for reclaiming unmapped ++ * pages. + * +- * shrink_slab will free memory on all zones and may take +- * a long time. ++ * Note that shrink_slab will free memory on all zones and may ++ * take a long time. + */ +- shrink_slab(sc.nr_scanned, gfp_mask, order); ++ unsigned long limit = zone_page_state(zone, ++ NR_SLAB) - nr_pages; ++ ++ while (shrink_slab(sc.nr_scanned, gfp_mask, order) && ++ zone_page_state(zone, NR_SLAB) > limit) ++ ; + } + + p->reclaim_state = NULL; +@@ -1592,7 +1605,8 @@ int zone_reclaim(struct zone *zone, gfp_ + int node_id; + + /* +- * Zone reclaim reclaims unmapped file backed pages. ++ * Zone reclaim reclaims unmapped file backed pages and ++ * slab pages if we are over the defined limits. + * + * A small portion of unmapped file backed pages is needed for + * file I/O otherwise pages read by file I/O will be immediately +@@ -1601,7 +1615,9 @@ int zone_reclaim(struct zone *zone, gfp_ + * unmapped file backed pages. + */ + if (zone_page_state(zone, NR_FILE_PAGES) - +- zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_ratio) ++ zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_ratio ++ && zone_page_state(zone, NR_SLAB) ++ <= zone->min_slab_pages) + return 0; + + /*