From: Greg Kroah-Hartman Date: Tue, 10 Oct 2006 06:38:55 +0000 (-0700) Subject: more 2.6.18.1 patches X-Git-Tag: v2.6.17.14~18 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=917f9eb19484f74b091a725ded2055817074f863;p=thirdparty%2Fkernel%2Fstable-queue.git more 2.6.18.1 patches --- diff --git a/queue-2.6.18/alsa-fix-initiailization-of-user-space-controls.patch b/queue-2.6.18/alsa-fix-initiailization-of-user-space-controls.patch new file mode 100644 index 00000000000..f519643b5d2 --- /dev/null +++ b/queue-2.6.18/alsa-fix-initiailization-of-user-space-controls.patch @@ -0,0 +1,33 @@ +From stable-bounces@linux.kernel.org Mon Sep 25 03:51:05 2006 +Date: Mon, 25 Sep 2006 11:49:01 +0200 +Message-ID: +From: Takashi Iwai +To: stable@kernel.org +Subject: ALSA: Fix initiailization of user-space controls + +From: Takashi Iwai + +ALSA: Fix initiailization of user-space controls + +Fix an assertion when accessing a user-defined control due to lack of +initialization (appears only when CONFIG_SND_DEBUg is enabled). + + ALSA sound/core/control.c:660: BUG? (info->access == 0) + +Signed-off-by: Takashi Iwai +Signed-off-by: Greg Kroah-Hartman + +--- + sound/core/control.c | 1 + + 1 file changed, 1 insertion(+) + +--- linux-2.6.18.orig/sound/core/control.c ++++ linux-2.6.18/sound/core/control.c +@@ -997,6 +997,7 @@ static int snd_ctl_elem_add(struct snd_c + if (ue == NULL) + return -ENOMEM; + ue->info = *info; ++ ue->info.access = 0; + ue->elem_data = (char *)ue + sizeof(*ue); + ue->elem_data_size = private_size; + kctl.private_free = snd_ctl_elem_user_free; diff --git a/queue-2.6.18/fix-longstanding-load-balancing-bug-in-the-scheduler.patch b/queue-2.6.18/fix-longstanding-load-balancing-bug-in-the-scheduler.patch new file mode 100644 index 00000000000..e95a777830d --- /dev/null +++ b/queue-2.6.18/fix-longstanding-load-balancing-bug-in-the-scheduler.patch @@ -0,0 +1,215 @@ +From stable-bounces@linux.kernel.org Mon Sep 25 23:34:14 2006 +Message-Id: <200609260630.k8Q6UpbB011991@shell0.pdx.osdl.net> +To: torvalds@osdl.org +From: Christoph Lameter +Date: Mon, 25 Sep 2006 23:30:51 -0700 +Cc: akpm@osdl.org, nickpiggin@yahoo.com.au, suresh.b.siddha@intel.com, + christoph@sgi.com, pwil3058@bigpond.net.au, mingo@elte.hu, + hawkes@sgi.com, stable@kernel.org, clameter@sgi.com +Subject: Fix longstanding load balancing bug in the scheduler + +From: Christoph Lameter + +The scheduler will stop load balancing if the most busy processor contains +processes pinned via processor affinity. + +The scheduler currently only does one search for busiest cpu. If it cannot +pull any tasks away from the busiest cpu because they were pinned then the +scheduler goes into a corner and sulks leaving the idle processors idle. + +F.e. If you have processor 0 busy running four tasks pinned via taskset, +there are none on processor 1 and one just started two processes on +processor 2 then the scheduler will not move one of the two processes away +from processor 2. + +This patch fixes that issue by forcing the scheduler to come out of its +corner and retrying the load balancing by considering other processors for +load balancing. + +This patch was originally developed by John Hawkes and discussed at +http://marc.theaimsgroup.com/?l=linux-kernel&m=113901368523205&w=2. 
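+
+As a rough illustration of that retry idea, here is a toy userspace sketch
+(the cpu count, loads and helper names are invented for illustration only;
+the real change is the redo: path added to kernel/sched.c in the hunk below):
+
+	#include <stdio.h>
+
+	#define NCPUS 4
+
+	/* toy model: per-cpu load and whether every task on that cpu is pinned */
+	static const int load[NCPUS]       = { 4, 0, 2, 0 };
+	static const int all_pinned[NCPUS] = { 1, 0, 0, 0 };
+
+	/* pick the busiest cpu among those still allowed by the mask */
+	static int find_busiest(unsigned int mask)
+	{
+		int i, best = -1, best_load = 0;
+
+		for (i = 0; i < NCPUS; i++)
+			if ((mask & (1u << i)) && load[i] > best_load) {
+				best = i;
+				best_load = load[i];
+			}
+		return best;
+	}
+
+	int main(void)
+	{
+		unsigned int mask = (1u << NCPUS) - 1;	/* like CPU_MASK_ALL */
+		int busiest;
+
+		while ((busiest = find_busiest(mask)) >= 0) {
+			if (!all_pinned[busiest]) {
+				printf("pulling tasks from cpu %d\n", busiest);
+				break;
+			}
+			/* every task on the busiest cpu is pinned:
+			 * drop it from the candidate mask and retry */
+			mask &= ~(1u << busiest);
+		}
+		return 0;
+	}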
+ +I have removed extraneous material and gone back to equipping struct rq +with the cpu the queue is associated with since this makes the patch much +easier and it is likely that others in the future will have the same +difficulty of figuring out which processor owns which runqueue. + +The overhead added through these patches is a single word on the stack if +the kernel is configured to support 32 cpus or less (32 bit). For 32 bit +environments the maximum number of cpus that can be configued is 255 which +would result in the use of 32 bytes additional on the stack. On IA64 up to +1k cpus can be configured which will result in the use of 128 additional +bytes on the stack. The maximum additional cache footprint is one +cacheline. Typically memory use will be much less than a cacheline and the +additional cpumask will be placed on the stack in a cacheline that already +contains other local variable. + + +Signed-off-by: Christoph Lameter +Cc: John Hawkes +Cc: "Siddha, Suresh B" +Cc: Ingo Molnar +Cc: Nick Piggin +Cc: Peter Williams +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/sched.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++-------- + 1 file changed, 46 insertions(+), 8 deletions(-) + +--- linux-2.6.18.orig/kernel/sched.c ++++ linux-2.6.18/kernel/sched.c +@@ -238,6 +238,7 @@ struct rq { + /* For active balancing */ + int active_balance; + int push_cpu; ++ int cpu; /* cpu of this runqueue */ + + struct task_struct *migration_thread; + struct list_head migration_queue; +@@ -267,6 +268,15 @@ struct rq { + + static DEFINE_PER_CPU(struct rq, runqueues); + ++static inline int cpu_of(struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ return rq->cpu; ++#else ++ return 0; ++#endif ++} ++ + /* + * The domain tree (rq->sd) is protected by RCU's quiescent state transition. + * See detach_destroy_domains: synchronize_sched for details. 
+@@ -2211,7 +2221,8 @@ out: + */ + static struct sched_group * + find_busiest_group(struct sched_domain *sd, int this_cpu, +- unsigned long *imbalance, enum idle_type idle, int *sd_idle) ++ unsigned long *imbalance, enum idle_type idle, int *sd_idle, ++ cpumask_t *cpus) + { + struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; + unsigned long max_load, avg_load, total_load, this_load, total_pwr; +@@ -2248,7 +2259,12 @@ find_busiest_group(struct sched_domain * + sum_weighted_load = sum_nr_running = avg_load = 0; + + for_each_cpu_mask(i, group->cpumask) { +- struct rq *rq = cpu_rq(i); ++ struct rq *rq; ++ ++ if (!cpu_isset(i, *cpus)) ++ continue; ++ ++ rq = cpu_rq(i); + + if (*sd_idle && !idle_cpu(i)) + *sd_idle = 0; +@@ -2466,13 +2482,17 @@ ret: + */ + static struct rq * + find_busiest_queue(struct sched_group *group, enum idle_type idle, +- unsigned long imbalance) ++ unsigned long imbalance, cpumask_t *cpus) + { + struct rq *busiest = NULL, *rq; + unsigned long max_load = 0; + int i; + + for_each_cpu_mask(i, group->cpumask) { ++ ++ if (!cpu_isset(i, *cpus)) ++ continue; ++ + rq = cpu_rq(i); + + if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) +@@ -2511,6 +2531,7 @@ static int load_balance(int this_cpu, st + struct sched_group *group; + unsigned long imbalance; + struct rq *busiest; ++ cpumask_t cpus = CPU_MASK_ALL; + + if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && + !sched_smt_power_savings) +@@ -2518,13 +2539,15 @@ static int load_balance(int this_cpu, st + + schedstat_inc(sd, lb_cnt[idle]); + +- group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle); ++redo: ++ group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, ++ &cpus); + if (!group) { + schedstat_inc(sd, lb_nobusyg[idle]); + goto out_balanced; + } + +- busiest = find_busiest_queue(group, idle, imbalance); ++ busiest = find_busiest_queue(group, idle, imbalance, &cpus); + if (!busiest) { + schedstat_inc(sd, lb_nobusyq[idle]); + goto out_balanced; +@@ -2549,8 +2572,12 @@ static int load_balance(int this_cpu, st + double_rq_unlock(this_rq, busiest); + + /* All tasks on this runqueue were pinned by CPU affinity */ +- if (unlikely(all_pinned)) ++ if (unlikely(all_pinned)) { ++ cpu_clear(cpu_of(busiest), cpus); ++ if (!cpus_empty(cpus)) ++ goto redo; + goto out_balanced; ++ } + } + + if (!nr_moved) { +@@ -2639,18 +2666,22 @@ load_balance_newidle(int this_cpu, struc + unsigned long imbalance; + int nr_moved = 0; + int sd_idle = 0; ++ cpumask_t cpus = CPU_MASK_ALL; + + if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) + sd_idle = 1; + + schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); +- group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle); ++redo: ++ group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, ++ &sd_idle, &cpus); + if (!group) { + schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); + goto out_balanced; + } + +- busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance); ++ busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance, ++ &cpus); + if (!busiest) { + schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); + goto out_balanced; +@@ -2668,6 +2699,12 @@ load_balance_newidle(int this_cpu, struc + minus_1_or_zero(busiest->nr_running), + imbalance, sd, NEWLY_IDLE, NULL); + spin_unlock(&busiest->lock); ++ ++ if (!nr_moved) { ++ cpu_clear(cpu_of(busiest), cpus); ++ if (!cpus_empty(cpus)) ++ goto redo; ++ } + } + + if (!nr_moved) { +@@ -6747,6 +6784,7 @@ void __init sched_init(void) + rq->cpu_load[j] = 0; + rq->active_balance = 0; + 
rq->push_cpu = 0; ++ rq->cpu = i; + rq->migration_thread = NULL; + INIT_LIST_HEAD(&rq->migration_queue); + #endif diff --git a/queue-2.6.18/ib-mthca-fix-lid-used-for-sending-traps.patch b/queue-2.6.18/ib-mthca-fix-lid-used-for-sending-traps.patch new file mode 100644 index 00000000000..3307d6c7a19 --- /dev/null +++ b/queue-2.6.18/ib-mthca-fix-lid-used-for-sending-traps.patch @@ -0,0 +1,38 @@ +From stable-bounces@linux.kernel.org Wed Sep 20 15:47:56 2006 +To: stable@kernel.org +From: Roland Dreier +Date: Wed, 20 Sep 2006 15:47:16 -0700 +Message-ID: +Subject: IB/mthca: Fix lid used for sending traps + +From: Jack Morgenstein + +The SM LID used to send traps to is incorrectly set to port LID. This +is a regression from 2.6.17 -- after a PortInfo MAD is received, no +traps are sent to the SM LID. The traps go to the loopback interface +instead, and are dropped there. The SM LID should be taken from the +sm_lid of the PortInfo response. + +The bug was introduced by commit 12bbb2b7be7f5564952ebe0196623e97464b8ac5: + IB/mthca: Add client reregister event generation + +Signed-off-by: Jack Morgenstein +Signed-off-by: Michael S. Tsirkin +Signed-off-by: Roland Dreier +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/infiniband/hw/mthca/mthca_mad.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- linux-2.6.18.orig/drivers/infiniband/hw/mthca/mthca_mad.c ++++ linux-2.6.18/drivers/infiniband/hw/mthca/mthca_mad.c +@@ -119,7 +119,7 @@ static void smp_snoop(struct ib_device * + + mthca_update_rate(to_mdev(ibdev), port_num); + update_sm_ah(to_mdev(ibdev), port_num, +- be16_to_cpu(pinfo->lid), ++ be16_to_cpu(pinfo->sm_lid), + pinfo->neighbormtu_mastersmsl & 0xf); + + event.device = ibdev; diff --git a/queue-2.6.18/jbd-fix-commit-of-ordered-data-buffers.patch b/queue-2.6.18/jbd-fix-commit-of-ordered-data-buffers.patch new file mode 100644 index 00000000000..76e7c0eb09b --- /dev/null +++ b/queue-2.6.18/jbd-fix-commit-of-ordered-data-buffers.patch @@ -0,0 +1,239 @@ +From stable-bounces@linux.kernel.org Mon Sep 11 15:14:16 2006 +Message-Id: <200609112213.k8BMDbPC029844@shell0.pdx.osdl.net> +To: mm-commits@vger.kernel.org +From: akpm@osdl.org +Date: Mon, 11 Sep 2006 15:13:37 -0700 +Cc: pbadari@us.ibm.com, jack@suse.cz, stable@kernel.org +Subject: jbd: fix commit of ordered data buffers + +From: Jan Kara + +Original commit code assumes, that when a buffer on BJ_SyncData list is +locked, it is being written to disk. But this is not true and hence it can +lead to a potential data loss on crash. Also the code didn't count with +the fact that journal_dirty_data() can steal buffers from committing +transaction and hence could write buffers that no longer belong to the +committing transaction. Finally it could possibly happen that we tried +writing out one buffer several times. + +The patch below tries to solve these problems by a complete rewrite of the +data commit code. We go through buffers on t_sync_datalist, lock buffers +needing write out and store them in an array. Buffers are also immediately +refiled to BJ_Locked list or unfiled (if the write out is completed). When +the array is full or we have to block on buffer lock, we submit all +accumulated buffers for IO. 
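+
+As a rough sketch of that batching pattern, consider the toy userspace code
+below (buffer numbers and helper names are invented; the actual implementation
+is journal_submit_data_buffers() in the fs/jbd/commit.c hunk that follows):
+
+	#include <stdio.h>
+
+	#define WBUFSIZE 4	/* stands in for journal->j_wbufsize */
+
+	/* stand-in for calling submit_bh() on each accumulated buffer */
+	static void submit_all(int *wbuf, int bufs)
+	{
+		for (int i = 0; i < bufs; i++)
+			printf("submit buffer %d\n", wbuf[i]);
+	}
+
+	int main(void)
+	{
+		int wbuf[WBUFSIZE];
+		int bufs = 0;
+
+		/* pretend buffers 0..9 are dirty and need writeout */
+		for (int b = 0; b < 10; b++) {
+			wbuf[bufs++] = b;		/* accumulate into the array */
+			if (bufs == WBUFSIZE) {		/* array full: flush the batch */
+				submit_all(wbuf, bufs);
+				bufs = 0;
+			}
+		}
+		submit_all(wbuf, bufs);			/* submit whatever remains */
+		return 0;
+	}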
+ +[suitable for 2.6.18.x around the 2.6.19-rc2 timeframe] + +Signed-off-by: Jan Kara +Cc: Badari Pulavarty +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman + +--- + fs/jbd/commit.c | 182 ++++++++++++++++++++++++++++++++++---------------------- + 1 file changed, 113 insertions(+), 69 deletions(-) + +--- linux-2.6.18.orig/fs/jbd/commit.c ++++ linux-2.6.18/fs/jbd/commit.c +@@ -160,6 +160,117 @@ static int journal_write_commit_record(j + return (ret == -EIO); + } + ++void journal_do_submit_data(struct buffer_head **wbuf, int bufs) ++{ ++ int i; ++ ++ for (i = 0; i < bufs; i++) { ++ wbuf[i]->b_end_io = end_buffer_write_sync; ++ /* We use-up our safety reference in submit_bh() */ ++ submit_bh(WRITE, wbuf[i]); ++ } ++} ++ ++/* ++ * Submit all the data buffers to disk ++ */ ++static void journal_submit_data_buffers(journal_t *journal, ++ transaction_t *commit_transaction) ++{ ++ struct journal_head *jh; ++ struct buffer_head *bh; ++ int locked; ++ int bufs = 0; ++ struct buffer_head **wbuf = journal->j_wbuf; ++ ++ /* ++ * Whenever we unlock the journal and sleep, things can get added ++ * onto ->t_sync_datalist, so we have to keep looping back to ++ * write_out_data until we *know* that the list is empty. ++ * ++ * Cleanup any flushed data buffers from the data list. Even in ++ * abort mode, we want to flush this out as soon as possible. ++ */ ++write_out_data: ++ cond_resched(); ++ spin_lock(&journal->j_list_lock); ++ ++ while (commit_transaction->t_sync_datalist) { ++ jh = commit_transaction->t_sync_datalist; ++ bh = jh2bh(jh); ++ locked = 0; ++ ++ /* Get reference just to make sure buffer does not disappear ++ * when we are forced to drop various locks */ ++ get_bh(bh); ++ /* If the buffer is dirty, we need to submit IO and hence ++ * we need the buffer lock. We try to lock the buffer without ++ * blocking. If we fail, we need to drop j_list_lock and do ++ * blocking lock_buffer(). ++ */ ++ if (buffer_dirty(bh)) { ++ if (test_set_buffer_locked(bh)) { ++ BUFFER_TRACE(bh, "needs blocking lock"); ++ spin_unlock(&journal->j_list_lock); ++ /* Write out all data to prevent deadlocks */ ++ journal_do_submit_data(wbuf, bufs); ++ bufs = 0; ++ lock_buffer(bh); ++ spin_lock(&journal->j_list_lock); ++ } ++ locked = 1; ++ } ++ /* We have to get bh_state lock. Again out of order, sigh. */ ++ if (!inverted_lock(journal, bh)) { ++ jbd_lock_bh_state(bh); ++ spin_lock(&journal->j_list_lock); ++ } ++ /* Someone already cleaned up the buffer? 
*/ ++ if (!buffer_jbd(bh) ++ || jh->b_transaction != commit_transaction ++ || jh->b_jlist != BJ_SyncData) { ++ jbd_unlock_bh_state(bh); ++ if (locked) ++ unlock_buffer(bh); ++ BUFFER_TRACE(bh, "already cleaned up"); ++ put_bh(bh); ++ continue; ++ } ++ if (locked && test_clear_buffer_dirty(bh)) { ++ BUFFER_TRACE(bh, "needs writeout, adding to array"); ++ wbuf[bufs++] = bh; ++ __journal_file_buffer(jh, commit_transaction, ++ BJ_Locked); ++ jbd_unlock_bh_state(bh); ++ if (bufs == journal->j_wbufsize) { ++ spin_unlock(&journal->j_list_lock); ++ journal_do_submit_data(wbuf, bufs); ++ bufs = 0; ++ goto write_out_data; ++ } ++ } ++ else { ++ BUFFER_TRACE(bh, "writeout complete: unfile"); ++ __journal_unfile_buffer(jh); ++ jbd_unlock_bh_state(bh); ++ if (locked) ++ unlock_buffer(bh); ++ journal_remove_journal_head(bh); ++ /* Once for our safety reference, once for ++ * journal_remove_journal_head() */ ++ put_bh(bh); ++ put_bh(bh); ++ } ++ ++ if (lock_need_resched(&journal->j_list_lock)) { ++ spin_unlock(&journal->j_list_lock); ++ goto write_out_data; ++ } ++ } ++ spin_unlock(&journal->j_list_lock); ++ journal_do_submit_data(wbuf, bufs); ++} ++ + /* + * journal_commit_transaction + * +@@ -313,80 +424,13 @@ void journal_commit_transaction(journal_ + * Now start flushing things to disk, in the order they appear + * on the transaction lists. Data blocks go first. + */ +- + err = 0; +- /* +- * Whenever we unlock the journal and sleep, things can get added +- * onto ->t_sync_datalist, so we have to keep looping back to +- * write_out_data until we *know* that the list is empty. +- */ +- bufs = 0; +- /* +- * Cleanup any flushed data buffers from the data list. Even in +- * abort mode, we want to flush this out as soon as possible. +- */ +-write_out_data: +- cond_resched(); +- spin_lock(&journal->j_list_lock); +- +- while (commit_transaction->t_sync_datalist) { +- struct buffer_head *bh; +- +- jh = commit_transaction->t_sync_datalist; +- commit_transaction->t_sync_datalist = jh->b_tnext; +- bh = jh2bh(jh); +- if (buffer_locked(bh)) { +- BUFFER_TRACE(bh, "locked"); +- if (!inverted_lock(journal, bh)) +- goto write_out_data; +- __journal_temp_unlink_buffer(jh); +- __journal_file_buffer(jh, commit_transaction, +- BJ_Locked); +- jbd_unlock_bh_state(bh); +- if (lock_need_resched(&journal->j_list_lock)) { +- spin_unlock(&journal->j_list_lock); +- goto write_out_data; +- } +- } else { +- if (buffer_dirty(bh)) { +- BUFFER_TRACE(bh, "start journal writeout"); +- get_bh(bh); +- wbuf[bufs++] = bh; +- if (bufs == journal->j_wbufsize) { +- jbd_debug(2, "submit %d writes\n", +- bufs); +- spin_unlock(&journal->j_list_lock); +- ll_rw_block(SWRITE, bufs, wbuf); +- journal_brelse_array(wbuf, bufs); +- bufs = 0; +- goto write_out_data; +- } +- } else { +- BUFFER_TRACE(bh, "writeout complete: unfile"); +- if (!inverted_lock(journal, bh)) +- goto write_out_data; +- __journal_unfile_buffer(jh); +- jbd_unlock_bh_state(bh); +- journal_remove_journal_head(bh); +- put_bh(bh); +- if (lock_need_resched(&journal->j_list_lock)) { +- spin_unlock(&journal->j_list_lock); +- goto write_out_data; +- } +- } +- } +- } +- +- if (bufs) { +- spin_unlock(&journal->j_list_lock); +- ll_rw_block(SWRITE, bufs, wbuf); +- journal_brelse_array(wbuf, bufs); +- spin_lock(&journal->j_list_lock); +- } ++ journal_submit_data_buffers(journal, commit_transaction); + + /* + * Wait for all previously submitted IO to complete. 
+ */ ++ spin_lock(&journal->j_list_lock); + while (commit_transaction->t_locked_list) { + struct buffer_head *bh; + diff --git a/queue-2.6.18/s390-user-readable-uninitialised-kernel-memory.patch b/queue-2.6.18/s390-user-readable-uninitialised-kernel-memory.patch new file mode 100644 index 00000000000..f6fe9906144 --- /dev/null +++ b/queue-2.6.18/s390-user-readable-uninitialised-kernel-memory.patch @@ -0,0 +1,69 @@ +From stable-bounces@linux.kernel.org Thu Sep 28 06:32:54 2006 +Date: Thu, 28 Sep 2006 15:31:52 +0200 +From: Martin Schwidefsky +To: gregkh@suse.de, bunk@stusta.de +Message-ID: <20060928133152.GA10672@skybase> +Content-Disposition: inline +Cc: stable@kernel.org +Subject: S390: user readable uninitialised kernel memory (CVE-2006-5174) + +From: Martin Schwidefsky + +[S390] user readable uninitialised kernel memory. + +A user space program can read uninitialised kernel memory +by appending to a file from a bad address and then reading +the result back. The cause is the copy_from_user function +that does not clear the remaining bytes of the kernel +buffer after it got a fault on the user space address. + +Signed-off-by: Martin Schwidefsky +Signed-off-by: Greg Kroah-Hartman + +--- + arch/s390/lib/uaccess.S | 12 +++++++++++- + arch/s390/lib/uaccess64.S | 12 +++++++++++- + 2 files changed, 22 insertions(+), 2 deletions(-) + +--- linux-2.6.18.orig/arch/s390/lib/uaccess.S ++++ linux-2.6.18/arch/s390/lib/uaccess.S +@@ -40,7 +40,17 @@ __copy_from_user_asm: + # move with the reduced length which is < 256 + 5: mvcp 0(%r5,%r2),0(%r4),%r0 + slr %r3,%r5 +-6: lr %r2,%r3 ++ alr %r2,%r5 ++6: lgr %r5,%r3 # copy remaining size ++ ahi %r5,-1 # subtract 1 for xc loop ++ bras %r4,8f ++ xc 0(1,%2),0(%2) ++7: xc 0(256,%2),0(%2) ++ la %r2,256(%r2) ++8: ahji %r5,-256 ++ jnm 7b ++ ex %r5,0(%r2) ++9: lr %r2,%r3 + br %r14 + .section __ex_table,"a" + .long 0b,4b +--- linux-2.6.18.orig/arch/s390/lib/uaccess64.S ++++ linux-2.6.18/arch/s390/lib/uaccess64.S +@@ -40,7 +40,17 @@ __copy_from_user_asm: + # move with the reduced length which is < 256 + 5: mvcp 0(%r5,%r2),0(%r4),%r0 + slgr %r3,%r5 +-6: lgr %r2,%r3 ++ algr %r2,%r5 ++6: lgr %r5,%r3 # copy remaining size ++ aghi %r5,-1 # subtract 1 for xc loop ++ bras %r4,8f ++ xc 0(1,%r2),0(%r2) ++7: xc 0(256,%r2),0(%r2) ++ la %r2,256(%r2) ++8: aghi %r5,-256 ++ jnm 7b ++ ex %r5,0(%r2) ++9: lgr %r2,%r3 + br %r14 + .section __ex_table,"a" + .quad 0b,4b diff --git a/queue-2.6.18/series b/queue-2.6.18/series index e39b176fd70..c3c8e5ba232 100644 --- a/queue-2.6.18/series +++ b/queue-2.6.18/series @@ -9,3 +9,10 @@ video-pvrusb2-improve-24xxx-config-option-description.patch video-pvrusb2-suppress-compiler-warning.patch video-pvrusb2-limit-hor-res-for-24xxx-devices.patch zd1211rw-zd1211b-asic-fwt-not-jointly-decoder.patch +s390-user-readable-uninitialised-kernel-memory.patch +ib-mthca-fix-lid-used-for-sending-traps.patch +usb-allow-compile-in-g_ether-fix-typo.patch +alsa-fix-initiailization-of-user-space-controls.patch +jbd-fix-commit-of-ordered-data-buffers.patch +fix-longstanding-load-balancing-bug-in-the-scheduler.patch +zone_reclaim-dynamic-slab-reclaim.patch diff --git a/queue-2.6.18/usb-allow-compile-in-g_ether-fix-typo.patch b/queue-2.6.18/usb-allow-compile-in-g_ether-fix-typo.patch new file mode 100644 index 00000000000..75107987372 --- /dev/null +++ b/queue-2.6.18/usb-allow-compile-in-g_ether-fix-typo.patch @@ -0,0 +1,39 @@ +From stable-bounces@linux.kernel.org Fri Sep 22 20:18:48 2006 +Date: Fri, 22 Sep 2006 20:17:48 -0700 +To: stable@kernel.org +Cc: Andrew Morton 
+Message-Id: <20060922201748.bf3245d6.akpm@osdl.org> +Subject: USB: Allow compile in g_ether, fix typo + +From: Tony Lindgren + +Allows compiling g_ether in and fixes a typo with MUSB_HDRC + +Signed-off-by: Tony Lindgren +Cc: David Brownell +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/usb/gadget/ether.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- linux-2.6.18.orig/drivers/usb/gadget/ether.c ++++ linux-2.6.18/drivers/usb/gadget/ether.c +@@ -262,7 +262,7 @@ MODULE_PARM_DESC(host_addr, "Host Ethern + #define DEV_CONFIG_CDC + #endif + +-#ifdef CONFIG_USB_GADGET_MUSBHDRC ++#ifdef CONFIG_USB_GADGET_MUSB_HDRC + #define DEV_CONFIG_CDC + #endif + +@@ -2564,7 +2564,7 @@ static struct usb_gadget_driver eth_driv + + .function = (char *) driver_desc, + .bind = eth_bind, +- .unbind = __exit_p(eth_unbind), ++ .unbind = eth_unbind, + + .setup = eth_setup, + .disconnect = eth_disconnect, diff --git a/queue-2.6.18/zone_reclaim-dynamic-slab-reclaim.patch b/queue-2.6.18/zone_reclaim-dynamic-slab-reclaim.patch new file mode 100644 index 00000000000..5cce46b1cc3 --- /dev/null +++ b/queue-2.6.18/zone_reclaim-dynamic-slab-reclaim.patch @@ -0,0 +1,315 @@ +From stable-bounces@linux.kernel.org Mon Oct 2 10:46:22 2006 +Date: Mon, 2 Oct 2006 10:45:24 -0700 (PDT) +From: Christoph Lameter +To: stable@kernel.org +Message-ID: +Subject: zone_reclaim: dynamic slab reclaim + +From: Christoph Lameter + +http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=0ff38490c836dc379ff7ec45b10a15a662f4e5f6 + + +Currently one can enable slab reclaim by setting an explicit option in +/proc/sys/vm/zone_reclaim_mode. Slab reclaim is then used as a final +option if the freeing of unmapped file backed pages is not enough to free +enough pages to allow a local allocation. + +However, that means that the slab can grow excessively and that most memory +of a node may be used by slabs. We have had a case where a machine with +46GB of memory was using 40-42GB for slab. Zone reclaim was effective in +dealing with pagecache pages. However, slab reclaim was only done during +global reclaim (which is a bit rare on NUMA systems). + +This patch implements slab reclaim during zone reclaim. Zone reclaim +occurs if there is a danger of an off node allocation. At that point we + +1. Shrink the per node page cache if the number of pagecache + pages is more than min_unmapped_ratio percent of pages in a zone. + +2. Shrink the slab cache if the number of the nodes reclaimable slab pages + (patch depends on earlier one that implements that counter) + are more than min_slab_ratio (a new /proc/sys/vm tunable). + +The shrinking of the slab cache is a bit problematic since it is not node +specific. So we simply calculate what point in the slab we want to reach +(current per node slab use minus the number of pages that neeed to be +allocated) and then repeately run the global reclaim until that is +unsuccessful or we have reached the limit. I hope we will have zone based +slab reclaim at some point which will make that easier. + +The default for the min_slab_ratio is 5% + +Also remove the slab option from /proc/sys/vm/zone_reclaim_mode. 
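+
+To illustrate the thresholds described above, here is a toy userspace sketch
+(the page counts and the shrink_slab_once() helper are invented; the real
+logic lives in __zone_reclaim() in the mm/vmscan.c hunk further down):
+
+	#include <stdio.h>
+
+	/* pretend per-zone counters, in pages */
+	static unsigned long zone_pages = 1000000;
+	static unsigned long slab_pages = 80000;	/* reclaimable slab pages */
+	static const unsigned long min_slab_ratio = 5;	/* default: 5 percent */
+
+	/* stand-in for one global shrink_slab() pass; returns pages freed */
+	static unsigned long shrink_slab_once(void)
+	{
+		unsigned long freed = slab_pages > 10000 ? 10000 : slab_pages;
+
+		slab_pages -= freed;
+		return freed;
+	}
+
+	int main(void)
+	{
+		unsigned long min_slab_pages = zone_pages * min_slab_ratio / 100;
+		unsigned long nr_pages = 32;	/* size of the allocation that failed locally */
+
+		if (slab_pages > min_slab_pages) {
+			/* target: current slab use minus the pages we need */
+			unsigned long limit = slab_pages - nr_pages;
+
+			while (shrink_slab_once() && slab_pages > limit)
+				;
+		}
+		printf("reclaimable slab pages now: %lu\n", slab_pages);
+		return 0;
+	}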
+ +[akpm@osdl.org: cleanups] +Signed-off-by: Christoph Lameter +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + Documentation/sysctl/vm.txt | 27 +++++++++++++++----- + include/linux/mmzone.h | 3 ++ + include/linux/swap.h | 1 + include/linux/sysctl.h | 1 + kernel/sysctl.c | 11 ++++++++ + mm/page_alloc.c | 17 ++++++++++++ + mm/vmscan.c | 58 ++++++++++++++++++++++++++++---------------- + 7 files changed, 90 insertions(+), 28 deletions(-) + +--- linux-2.6.18.orig/Documentation/sysctl/vm.txt ++++ linux-2.6.18/Documentation/sysctl/vm.txt +@@ -29,6 +29,7 @@ Currently, these files are in /proc/sys/ + - drop-caches + - zone_reclaim_mode + - min_unmapped_ratio ++- min_slab_ratio + - panic_on_oom + + ============================================================== +@@ -138,7 +139,6 @@ This is value ORed together of + 1 = Zone reclaim on + 2 = Zone reclaim writes dirty pages out + 4 = Zone reclaim swaps pages +-8 = Also do a global slab reclaim pass + + zone_reclaim_mode is set during bootup to 1 if it is determined that pages + from remote zones will cause a measurable performance reduction. The +@@ -162,18 +162,13 @@ Allowing regular swap effectively restri + node unless explicitly overridden by memory policies or cpuset + configurations. + +-It may be advisable to allow slab reclaim if the system makes heavy +-use of files and builds up large slab caches. However, the slab +-shrink operation is global, may take a long time and free slabs +-in all nodes of the system. +- + ============================================================= + + min_unmapped_ratio: + + This is available only on NUMA kernels. + +-A percentage of the file backed pages in each zone. Zone reclaim will only ++A percentage of the total pages in each zone. Zone reclaim will only + occur if more than this percentage of pages are file backed and unmapped. + This is to insure that a minimal amount of local pages is still available for + file I/O even if the node is overallocated. +@@ -182,6 +177,24 @@ The default is 1 percent. + + ============================================================= + ++min_slab_ratio: ++ ++This is available only on NUMA kernels. ++ ++A percentage of the total pages in each zone. On Zone reclaim ++(fallback from the local zone occurs) slabs will be reclaimed if more ++than this percentage of pages in a zone are reclaimable slab pages. ++This insures that the slab growth stays under control even in NUMA ++systems that rarely perform global reclaim. ++ ++The default is 5 percent. ++ ++Note that slab reclaim is triggered in a per zone / node fashion. ++The process of reclaiming slab memory is currently not node specific ++and may not be fast. ++ ++============================================================= ++ + panic_on_oom + + This enables or disables panic on out-of-memory feature. If this is set to 1, +--- linux-2.6.18.orig/include/linux/mmzone.h ++++ linux-2.6.18/include/linux/mmzone.h +@@ -155,6 +155,7 @@ struct zone { + * zone reclaim becomes active if more unmapped pages exist. 
+ */ + unsigned long min_unmapped_ratio; ++ unsigned long min_slab_pages; + struct per_cpu_pageset *pageset[NR_CPUS]; + #else + struct per_cpu_pageset pageset[NR_CPUS]; +@@ -421,6 +422,8 @@ int percpu_pagelist_fraction_sysctl_hand + void __user *, size_t *, loff_t *); + int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, + struct file *, void __user *, size_t *, loff_t *); ++int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, ++ struct file *, void __user *, size_t *, loff_t *); + + #include + /* Returns the number of the current Node. */ +--- linux-2.6.18.orig/include/linux/swap.h ++++ linux-2.6.18/include/linux/swap.h +@@ -190,6 +190,7 @@ extern long vm_total_pages; + #ifdef CONFIG_NUMA + extern int zone_reclaim_mode; + extern int sysctl_min_unmapped_ratio; ++extern int sysctl_min_slab_ratio; + extern int zone_reclaim(struct zone *, gfp_t, unsigned int); + #else + #define zone_reclaim_mode 0 +--- linux-2.6.18.orig/include/linux/sysctl.h ++++ linux-2.6.18/include/linux/sysctl.h +@@ -191,6 +191,7 @@ enum + VM_MIN_UNMAPPED=32, /* Set min percent of unmapped pages */ + VM_PANIC_ON_OOM=33, /* panic at out-of-memory */ + VM_VDSO_ENABLED=34, /* map VDSO into new processes? */ ++ VM_MIN_SLAB=35, /* Percent pages ignored by zone reclaim */ + }; + + +--- linux-2.6.18.orig/kernel/sysctl.c ++++ linux-2.6.18/kernel/sysctl.c +@@ -943,6 +943,17 @@ static ctl_table vm_table[] = { + .extra1 = &zero, + .extra2 = &one_hundred, + }, ++ { ++ .ctl_name = VM_MIN_SLAB, ++ .procname = "min_slab_ratio", ++ .data = &sysctl_min_slab_ratio, ++ .maxlen = sizeof(sysctl_min_slab_ratio), ++ .mode = 0644, ++ .proc_handler = &sysctl_min_slab_ratio_sysctl_handler, ++ .strategy = &sysctl_intvec, ++ .extra1 = &zero, ++ .extra2 = &one_hundred, ++ }, + #endif + #ifdef CONFIG_X86_32 + { +--- linux-2.6.18.orig/mm/page_alloc.c ++++ linux-2.6.18/mm/page_alloc.c +@@ -2008,6 +2008,7 @@ static void __meminit free_area_init_cor + #ifdef CONFIG_NUMA + zone->min_unmapped_ratio = (realsize*sysctl_min_unmapped_ratio) + / 100; ++ zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; + #endif + zone->name = zone_names[j]; + spin_lock_init(&zone->lock); +@@ -2318,6 +2319,22 @@ int sysctl_min_unmapped_ratio_sysctl_han + sysctl_min_unmapped_ratio) / 100; + return 0; + } ++ ++int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, ++ struct file *file, void __user *buffer, size_t *length, loff_t *ppos) ++{ ++ struct zone *zone; ++ int rc; ++ ++ rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); ++ if (rc) ++ return rc; ++ ++ for_each_zone(zone) ++ zone->min_slab_pages = (zone->present_pages * ++ sysctl_min_slab_ratio) / 100; ++ return 0; ++} + #endif + + /* +--- linux-2.6.18.orig/mm/vmscan.c ++++ linux-2.6.18/mm/vmscan.c +@@ -1510,7 +1510,6 @@ int zone_reclaim_mode __read_mostly; + #define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ + #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ + #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ +-#define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */ + + /* + * Priority for ZONE_RECLAIM. This determines the fraction of pages +@@ -1526,6 +1525,12 @@ int zone_reclaim_mode __read_mostly; + int sysctl_min_unmapped_ratio = 1; + + /* ++ * If the number of slab pages in a zone grows beyond this percentage then ++ * slab reclaim needs to occur. ++ */ ++int sysctl_min_slab_ratio = 5; ++ ++/* + * Try to free up some pages from this zone through reclaim. 
+ */ + static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +@@ -1556,29 +1561,37 @@ static int __zone_reclaim(struct zone *z + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + +- /* +- * Free memory by calling shrink zone with increasing priorities +- * until we have enough memory freed. +- */ +- priority = ZONE_RECLAIM_PRIORITY; +- do { +- nr_reclaimed += shrink_zone(priority, zone, &sc); +- priority--; +- } while (priority >= 0 && nr_reclaimed < nr_pages); ++ if (zone_page_state(zone, NR_FILE_PAGES) - ++ zone_page_state(zone, NR_FILE_MAPPED) > ++ zone->min_unmapped_ratio) { ++ /* ++ * Free memory by calling shrink zone with increasing ++ * priorities until we have enough memory freed. ++ */ ++ priority = ZONE_RECLAIM_PRIORITY; ++ do { ++ nr_reclaimed += shrink_zone(priority, zone, &sc); ++ priority--; ++ } while (priority >= 0 && nr_reclaimed < nr_pages); ++ } + +- if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { ++ if (zone_page_state(zone, NR_SLAB) > zone->min_slab_pages) { + /* + * shrink_slab() does not currently allow us to determine how +- * many pages were freed in this zone. So we just shake the slab +- * a bit and then go off node for this particular allocation +- * despite possibly having freed enough memory to allocate in +- * this zone. If we freed local memory then the next +- * allocations will be local again. ++ * many pages were freed in this zone. So we take the current ++ * number of slab pages and shake the slab until it is reduced ++ * by the same nr_pages that we used for reclaiming unmapped ++ * pages. + * +- * shrink_slab will free memory on all zones and may take +- * a long time. ++ * Note that shrink_slab will free memory on all zones and may ++ * take a long time. + */ +- shrink_slab(sc.nr_scanned, gfp_mask, order); ++ unsigned long limit = zone_page_state(zone, ++ NR_SLAB) - nr_pages; ++ ++ while (shrink_slab(sc.nr_scanned, gfp_mask, order) && ++ zone_page_state(zone, NR_SLAB) > limit) ++ ; + } + + p->reclaim_state = NULL; +@@ -1592,7 +1605,8 @@ int zone_reclaim(struct zone *zone, gfp_ + int node_id; + + /* +- * Zone reclaim reclaims unmapped file backed pages. ++ * Zone reclaim reclaims unmapped file backed pages and ++ * slab pages if we are over the defined limits. + * + * A small portion of unmapped file backed pages is needed for + * file I/O otherwise pages read by file I/O will be immediately +@@ -1601,7 +1615,9 @@ int zone_reclaim(struct zone *zone, gfp_ + * unmapped file backed pages. + */ + if (zone_page_state(zone, NR_FILE_PAGES) - +- zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_ratio) ++ zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_ratio ++ && zone_page_state(zone, NR_SLAB) ++ <= zone->min_slab_pages) + return 0; + + /*