From: Chris Wright Date: Tue, 31 Oct 2006 11:20:53 +0000 (-0800) Subject: more patches to queue X-Git-Tag: v2.6.18.2~10 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=f4f57af4fa74f1ce07f7863742e967920c13c8a4;p=thirdparty%2Fkernel%2Fstable-queue.git more patches to queue --- diff --git a/queue-2.6.18/check-bio-address-after-mapping-through-partitions.patch b/queue-2.6.18/check-bio-address-after-mapping-through-partitions.patch new file mode 100644 index 00000000000..4e5b8e0bc2b --- /dev/null +++ b/queue-2.6.18/check-bio-address-after-mapping-through-partitions.patch @@ -0,0 +1,74 @@ +From stable-bounces@linux.kernel.org Mon Oct 30 17:54:57 2006 +Date: Tue, 31 Oct 2006 12:51:45 +1100 +From: NeilBrown +To: Andrew Morton +Cc: stable@kernel.org, Jens Axboe +Subject: [stable] [PATCH] Check bio address after mapping through partitions. + +Partitions are not limited to live within a device. So +we should range check after partition mapping. + +Note that 'maxsector' was being used for two different things. I have +split off the second usage into 'old_sector' so that maxsector can +still be used for its primary usage later in the function. + +Cc: Jens Axboe +Signed-off-by: Neil Brown +Signed-off-by: Chris Wright +--- + block/ll_rw_blk.c | 24 ++++++++++++++++++++---- + 1 file changed, 20 insertions(+), 4 deletions(-) + +--- linux-2.6.18.1.orig/block/ll_rw_blk.c ++++ linux-2.6.18.1/block/ll_rw_blk.c +@@ -3021,6 +3021,7 @@ void generic_make_request(struct bio *bi + { + request_queue_t *q; + sector_t maxsector; ++ sector_t old_sector; + int ret, nr_sectors = bio_sectors(bio); + dev_t old_dev; + +@@ -3049,7 +3050,7 @@ void generic_make_request(struct bio *bi + * NOTE: we don't repeat the blk_size check for each new device. + * Stacking drivers are expected to know what they are doing.
+ */ +- maxsector = -1; ++ old_sector = -1; + old_dev = 0; + do { + char b[BDEVNAME_SIZE]; +@@ -3083,15 +3084,30 @@ end_io: + */ + blk_partition_remap(bio); + +- if (maxsector != -1) ++ if (old_sector != -1) + blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, +- maxsector); ++ old_sector); + + blk_add_trace_bio(q, bio, BLK_TA_QUEUE); + +- maxsector = bio->bi_sector; ++ old_sector = bio->bi_sector; + old_dev = bio->bi_bdev->bd_dev; + ++ maxsector = bio->bi_bdev->bd_inode->i_size >> 9; ++ if (maxsector) { ++ sector_t sector = bio->bi_sector; ++ ++ if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { ++ /* ++ * This may well happen - partitions are not checked ++ * to make sure they are within the size of the ++ * whole device. ++ */ ++ handle_bad_sector(bio); ++ goto end_io; ++ } ++ } ++ + ret = q->make_request_fn(q, bio); + } while (ret); + } diff --git a/queue-2.6.18/fill_tgid-fix-task_struct-leak-and-possible-oops.patch b/queue-2.6.18/fill_tgid-fix-task_struct-leak-and-possible-oops.patch new file mode 100644 index 00000000000..6c50711ea34 --- /dev/null +++ b/queue-2.6.18/fill_tgid-fix-task_struct-leak-and-possible-oops.patch @@ -0,0 +1,58 @@ +From fca178c0c6e8d52a1875be36b070f30884ebfae9 Mon Sep 17 00:00:00 2001 +From: Oleg Nesterov +Date: Sat, 28 Oct 2006 10:38:49 -0700 +Subject: fill_tgid: fix task_struct leak and possible oops + +1. fill_tgid() forgets to do put_task_struct(first). + +2. release_task(first) can happen after fill_tgid() drops tasklist_lock, + it is unsafe to dereference first->signal. + +This is a temporary fix, imho the locking should be reworked. 
+ +Signed-off-by: Oleg Nesterov +Cc: Shailabh Nagar +Cc: Balbir Singh +Cc: Jay Lan +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Chris Wright +--- + kernel/taskstats.c | 15 +++++++++------ + 1 file changed, 9 insertions(+), 6 deletions(-) + +--- linux-2.6.18.1.orig/kernel/taskstats.c ++++ linux-2.6.18.1/kernel/taskstats.c +@@ -229,14 +229,17 @@ static int fill_tgid(pid_t tgid, struct + } else + get_task_struct(first); + +- /* Start with stats from dead tasks */ +- spin_lock_irqsave(&first->signal->stats_lock, flags); +- if (first->signal->stats) +- memcpy(stats, first->signal->stats, sizeof(*stats)); +- spin_unlock_irqrestore(&first->signal->stats_lock, flags); + + tsk = first; + read_lock(&tasklist_lock); ++ /* Start with stats from dead tasks */ ++ if (first->signal) { ++ spin_lock_irqsave(&first->signal->stats_lock, flags); ++ if (first->signal->stats) ++ memcpy(stats, first->signal->stats, sizeof(*stats)); ++ spin_unlock_irqrestore(&first->signal->stats_lock, flags); ++ } ++ + do { + if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk)) + continue; +@@ -256,7 +259,7 @@ static int fill_tgid(pid_t tgid, struct + * Accounting subsytems can also add calls here to modify + * fields of taskstats. 
+ */ +- ++ put_task_struct(first); + return 0; + } + diff --git a/queue-2.6.18/ipv6-fix-lockup-via-proc-net-ip6_flowlabel.patch b/queue-2.6.18/ipv6-fix-lockup-via-proc-net-ip6_flowlabel.patch new file mode 100644 index 00000000000..3517201aa08 --- /dev/null +++ b/queue-2.6.18/ipv6-fix-lockup-via-proc-net-ip6_flowlabel.patch @@ -0,0 +1,38 @@ +From stable-bounces@linux.kernel.org Mon Oct 30 15:14:51 2006 +Date: Mon, 30 Oct 2006 15:11:21 -0800 (PST) +From: David Miller +To: stable@kernel.org +Cc: bunk@stusta.de +Subject: IPV6: fix lockup via /proc/net/ip6_flowlabel + +From: James Morris + +There's a bug in the seqfile handling for /proc/net/ip6_flowlabel, where, +after finding a flowlabel, the code will loop forever not finding any +further flowlabels, first traversing the rest of the hash bucket then just +looping. + +This patch fixes the problem by breaking after the hash bucket has been +traversed. + +Note that this bug can cause lockups and oopses, and is trivially invoked +by an unprivileged user. + +Signed-off-by: James Morris +Signed-off-by: David S.
Miller +Signed-off-by: Chris Wright +--- + net/ipv6/ip6_flowlabel.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- linux-2.6.18.1.orig/net/ipv6/ip6_flowlabel.c ++++ linux-2.6.18.1/net/ipv6/ip6_flowlabel.c +@@ -587,6 +587,8 @@ static struct ip6_flowlabel *ip6fl_get_n + while (!fl) { + if (++state->bucket <= FL_HASH_MASK) + fl = fl_ht[state->bucket]; ++ else ++ break; + } + return fl; + } diff --git a/queue-2.6.18/isdn-fix-drivers-by-handling-errors-thrown-by-readstat.patch b/queue-2.6.18/isdn-fix-drivers-by-handling-errors-thrown-by-readstat.patch new file mode 100644 index 00000000000..8c0766cb09d --- /dev/null +++ b/queue-2.6.18/isdn-fix-drivers-by-handling-errors-thrown-by-readstat.patch @@ -0,0 +1,49 @@ +From 04518bfe8eac2e82b476fb2b0093527adc2bc791 Mon Sep 17 00:00:00 2001 +From: Jeff Garzik +Date: Tue, 17 Oct 2006 00:10:39 -0700 +Subject: ISDN: fix drivers, by handling errors thrown by ->readstat() + +This is a particularly ugly on-failure bug, possibly security, since the +lack of error handling here is covering up another class of bug: failure to +handle copy_to_user() return values. + +The I4L API function ->readstat() returns an integer, and by looking at +several existing driver implementations, it is clear that a negative return +value was meant to indicate an error. + +Given that several drivers already return a negative value indicating an +errno-style error, the current code would blindly accept that [negative] +value as a valid amount of bytes read. Obvious damage ensues. + +Correcting ->readstat() handling to properly notice errors fixes the +existing code to work correctly on error, and enables future patches to +more easily indicate errors during operation. 
+ +Signed-off-by: Jeff Garzik +Cc: Karsten Keil +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Chris Wright +--- + drivers/isdn/i4l/isdn_common.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +--- linux-2.6.18.1.orig/drivers/isdn/i4l/isdn_common.c ++++ linux-2.6.18.1/drivers/isdn/i4l/isdn_common.c +@@ -1134,9 +1134,12 @@ isdn_read(struct file *file, char __user + if (dev->drv[drvidx]->interface->readstat) { + if (count > dev->drv[drvidx]->stavail) + count = dev->drv[drvidx]->stavail; +- len = dev->drv[drvidx]->interface-> +- readstat(buf, count, drvidx, +- isdn_minor2chan(minor)); ++ len = dev->drv[drvidx]->interface->readstat(buf, count, ++ drvidx, isdn_minor2chan(minor)); ++ if (len < 0) { ++ retval = len; ++ goto out; ++ } + } else { + len = 0; + } diff --git a/queue-2.6.18/jmb-368-pata-detection.patch b/queue-2.6.18/jmb-368-pata-detection.patch new file mode 100644 index 00000000000..cbf463b53e9 --- /dev/null +++ b/queue-2.6.18/jmb-368-pata-detection.patch @@ -0,0 +1,32 @@ +From c333526f489044be2b93085720eb898f0037b346 Mon Sep 17 00:00:00 2001 +From: Alan Cox +Date: Sat, 28 Oct 2006 10:38:57 -0700 +Subject: JMB 368 PATA detection + +The Jmicron JMB368 is PATA only so has the PATA on function zero. 
Don't +therefore skip function zero on this device when probing + +Signed-off-by: Alan Cox +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Chris Wright +--- + drivers/ide/pci/generic.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +--- linux-2.6.18.1.orig/drivers/ide/pci/generic.c ++++ linux-2.6.18.1/drivers/ide/pci/generic.c +@@ -242,8 +242,10 @@ static int __devinit generic_init_one(st + (!(PCI_FUNC(dev->devfn) & 1))) + goto out; + +- if (dev->vendor == PCI_VENDOR_ID_JMICRON && PCI_FUNC(dev->devfn) != 1) +- goto out; ++ if (dev->vendor == PCI_VENDOR_ID_JMICRON) { ++ if (dev->device != PCI_DEVICE_ID_JMICRON_JMB368 && PCI_FUNC(dev->devfn) != 1) ++ goto out; ++ } + + if (dev->vendor != PCI_VENDOR_ID_JMICRON) { + pci_read_config_word(dev, PCI_COMMAND, &command); diff --git a/queue-2.6.18/nfs-nfs_lookup-don-t-hash-dentry-when-optimising-away-the-lookup.patch b/queue-2.6.18/nfs-nfs_lookup-don-t-hash-dentry-when-optimising-away-the-lookup.patch new file mode 100644 index 00000000000..389ae7cb394 --- /dev/null +++ b/queue-2.6.18/nfs-nfs_lookup-don-t-hash-dentry-when-optimising-away-the-lookup.patch @@ -0,0 +1,53 @@ +From fd6840714d9cf6e93f1d42b904860a94df316b85 Mon Sep 17 00:00:00 2001 +From: Trond Myklebust +Date: Tue, 5 Sep 2006 12:27:44 -0400 +Subject: NFS: nfs_lookup - don't hash dentry when optimising away the lookup + +If the open intents tell us that a given lookup is going to result in an +exclusive create, we currently optimize away the lookup call itself. The +reason is that the lookup would not be atomic with the create RPC call, so +why do it in the first place? + +A problem occurs, however, if the VFS aborts the exclusive create operation +after the lookup, but before the call to create the file/directory: in this +case we will end up with a hashed negative dentry in the dcache that has +never been looked up.
+Fix this by only actually hashing the dentry once the create operation has +been successfully completed. + +Signed-off-by: Trond Myklebust +Signed-off-by: Chris Wright +--- + fs/nfs/dir.c | 14 +++++++++++--- + 1 file changed, 11 insertions(+), 3 deletions(-) + +--- linux-2.6.18.1.orig/fs/nfs/dir.c ++++ linux-2.6.18.1/fs/nfs/dir.c +@@ -902,9 +902,15 @@ static struct dentry *nfs_lookup(struct + + lock_kernel(); + +- /* If we're doing an exclusive create, optimize away the lookup */ +- if (nfs_is_exclusive_create(dir, nd)) +- goto no_entry; ++ /* ++ * If we're doing an exclusive create, optimize away the lookup ++ * but don't hash the dentry. ++ */ ++ if (nfs_is_exclusive_create(dir, nd)) { ++ d_instantiate(dentry, NULL); ++ res = NULL; ++ goto out_unlock; ++ } + + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); + if (error == -ENOENT) +@@ -1156,6 +1162,8 @@ int nfs_instantiate(struct dentry *dentr + if (IS_ERR(inode)) + goto out_err; + d_instantiate(dentry, inode); ++ if (d_unhashed(dentry)) ++ d_rehash(dentry); + return 0; + out_err: + d_drop(dentry); diff --git a/queue-2.6.18/pci-remove-quirk_via_abnormal_poweroff.patch b/queue-2.6.18/pci-remove-quirk_via_abnormal_poweroff.patch new file mode 100644 index 00000000000..d2a645942b9 --- /dev/null +++ b/queue-2.6.18/pci-remove-quirk_via_abnormal_poweroff.patch @@ -0,0 +1,73 @@ +From 3560cc5ec3488b20d927f7160a21a0df1d1fda20 Mon Sep 17 00:00:00 2001 +From: Karsten Wiese +Date: Fri, 20 Oct 2006 14:45:36 -0700 +Subject: PCI: Remove quirk_via_abnormal_poweroff + +My K8T800 mobo resumes fine from suspend to ram with and without patch +applied against 2.6.18. + +quirk_via_abnormal_poweroff makes some boards not boot 2.6.18, so IMO patch +should go to head, 2.6.18.2 and everywhere "ACPI: ACPICA 20060623" has been +applied. + + +Remove quirk_via_abnormal_poweroff + +Obsoleted by "ACPI: ACPICA 20060623": + + Implemented support for "ignored" bits in the ACPI + registers. 
According to the ACPI specification, these + bits should be preserved when writing the registers via + a read/modify/write cycle. There are 3 bits preserved + in this manner: PM1_CONTROL[0] (SCI_EN), PM1_CONTROL[9], + and PM1_STATUS[11]. + http://bugzilla.kernel.org/show_bug.cgi?id=3691 + + +Signed-off-by: Karsten Wiese +Cc: Bob Moore +Cc: "Brown, Len" +Acked-by: Dave Jones +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Chris Wright +--- + drivers/pci/quirks.c | 27 --------------------------- + 1 file changed, 27 deletions(-) + +--- linux-2.6.18.1.orig/drivers/pci/quirks.c ++++ linux-2.6.18.1/drivers/pci/quirks.c +@@ -685,33 +685,6 @@ static void __devinit quirk_vt82c598_id( + } + DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_82C597_0, quirk_vt82c598_id ); + +-#ifdef CONFIG_ACPI_SLEEP +- +-/* +- * Some VIA systems boot with the abnormal status flag set. This can cause +- * the BIOS to re-POST the system on resume rather than passing control +- * back to the OS. Clear the flag on boot +- */ +-static void __devinit quirk_via_abnormal_poweroff(struct pci_dev *dev) +-{ +- u32 reg; +- +- acpi_hw_register_read(ACPI_MTX_DO_NOT_LOCK, ACPI_REGISTER_PM1_STATUS, +- &reg); +- +- if (reg & 0x800) { +- printk("Clearing abnormal poweroff flag\n"); +- acpi_hw_register_write(ACPI_MTX_DO_NOT_LOCK, +- ACPI_REGISTER_PM1_STATUS, +- (u16)0x800); +- } +-} +- +-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, quirk_via_abnormal_poweroff); +-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, quirk_via_abnormal_poweroff); +- +-#endif +- + /* + * CardBus controllers have a legacy base address that enables them + * to respond as i82365 pcmcia controllers.
We don't want them to diff --git a/queue-2.6.18/posix-cpu-timers-prevent-signal-delivery-starvation.patch b/queue-2.6.18/posix-cpu-timers-prevent-signal-delivery-starvation.patch new file mode 100644 index 00000000000..b4b19d3cfa5 --- /dev/null +++ b/queue-2.6.18/posix-cpu-timers-prevent-signal-delivery-starvation.patch @@ -0,0 +1,137 @@ +From stable-bounces@linux.kernel.org Tue Oct 17 00:12:55 2006 +Date: Tue, 17 Oct 2006 00:09:39 -0700 +From: akpm@osdl.org +To: torvalds@osdl.org +Cc: akpm@osdl.org, dwalker@mvista.com, pmattis@google.com, johnstul@us.ibm.com, toyoa@mvista.com, stable@kernel.org, zippel@linux-m68k.org, mbligh@google.com, spark@google.com, rohitseth@google.com, tglx@linutronix.de, mingo@elte.hu, roland@redhat.com +Subject: posix-cpu-timers: prevent signal delivery starvation + +From: Thomas Gleixner + +The integer divisions in the timer accounting code can round the result +down to 0. Adding 0 is without effect and the signal delivery stops. + +Clamp the division result to minimum 1 to avoid this. + +Problem was reported by Seongbae Park, who provided +also an initial patch. + +Roland sayeth: + + I have had some more time to think about the problem, and to reproduce it + using Toyo's test case. For the record, if my understanding of the problem + is correct, this happens only in one very particular case. First, the + expiry time has to be so soon that in cputime_t units (usually 1s/HZ ticks) + it's < nthreads so the division yields zero. Second, it only affects each + thread that is so new that its CPU time accumulation is zero so now+0 is + still zero and ->it_*_expires winds up staying zero. For the VIRT and PROF + clocks when cputime_t is tick granularity (or the SCHED clock on + configurations where sched_clock's value only advances on clock ticks), this + is not hard to arrange with new threads starting up and blocking before they + accumulate a whole tick of CPU time. That's what happens in Toyo's test + case.
+ + Note that in general it is fine for that division to round down to zero, + and set each thread's expiry time to its "now" time. The problem only + arises with threads whose "now" value is still zero, so that now+0 winds up + 0 and is interpreted as "not set" instead of ">= now". So it would be a + sufficient and more precise fix to just use max(ticks, 1) inside the loop + when setting each it_*_expires value. + + But, it does no harm to round the division up to one and always advance + every thread's expiry time. If the thread didn't already fire timers for + the expiry time of "now", there is no expectation that it will do so before + the next tick anyway. So I followed Thomas's patch in lifting the max out + of the loops. + + This patch also covers the reload cases, which are harder to write a test + for (and I didn't try). I've tested it with Toyo's case and it fixes that. + + +[toyoa@mvista.com: fix: min_t -> max_t] +Signed-off-by: Thomas Gleixner +Cc: Ingo Molnar +Signed-off-by: Roland McGrath +Cc: Daniel Walker +Cc: Toyo Abe +Cc: john stultz +Cc: Roman Zippel +Cc: Seongbae Park +Cc: Peter Mattis +Cc: Rohit Seth +Cc: Martin Bligh +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Chris Wright +--- + + kernel/posix-cpu-timers.c | 27 +++++++++++++++++++++------ + 1 file changed, 21 insertions(+), 6 deletions(-) + +--- linux-2.6.18.1.orig/kernel/posix-cpu-timers.c ++++ linux-2.6.18.1/kernel/posix-cpu-timers.c +@@ -88,6 +88,19 @@ static inline union cpu_time_count cpu_t + } + + /* ++ * Divide and limit the result to res >= 1 ++ * ++ * This is necessary to prevent signal delivery starvation, when the result of ++ * the division would be rounded down to 0. ++ */ ++static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div) ++{ ++ cputime_t res = cputime_div(time, div); ++ ++ return max_t(cputime_t, res, 1); ++} ++ ++/* + * Update expiry time from increment, and increase overrun count, + * given the current clock sample.
+ */ +@@ -483,8 +496,8 @@ static void process_timer_rebalance(stru + BUG(); + break; + case CPUCLOCK_PROF: +- left = cputime_div(cputime_sub(expires.cpu, val.cpu), +- nthreads); ++ left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), ++ nthreads); + do { + if (likely(!(t->flags & PF_EXITING))) { + ticks = cputime_add(prof_ticks(t), left); +@@ -498,8 +511,8 @@ static void process_timer_rebalance(stru + } while (t != p); + break; + case CPUCLOCK_VIRT: +- left = cputime_div(cputime_sub(expires.cpu, val.cpu), +- nthreads); ++ left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), ++ nthreads); + do { + if (likely(!(t->flags & PF_EXITING))) { + ticks = cputime_add(virt_ticks(t), left); +@@ -515,6 +528,7 @@ static void process_timer_rebalance(stru + case CPUCLOCK_SCHED: + nsleft = expires.sched - val.sched; + do_div(nsleft, nthreads); ++ nsleft = max_t(unsigned long long, nsleft, 1); + do { + if (likely(!(t->flags & PF_EXITING))) { + ns = t->sched_time + nsleft; +@@ -1159,12 +1173,13 @@ static void check_process_timers(struct + + prof_left = cputime_sub(prof_expires, utime); + prof_left = cputime_sub(prof_left, stime); +- prof_left = cputime_div(prof_left, nthreads); ++ prof_left = cputime_div_non_zero(prof_left, nthreads); + virt_left = cputime_sub(virt_expires, utime); +- virt_left = cputime_div(virt_left, nthreads); ++ virt_left = cputime_div_non_zero(virt_left, nthreads); + if (sched_expires) { + sched_left = sched_expires - sched_time; + do_div(sched_left, nthreads); ++ sched_left = max_t(unsigned long long, sched_left, 1); + } else { + sched_left = 0; + } diff --git a/queue-2.6.18/reintroduce-nodes_span_other_nodes-for-powerpc.patch b/queue-2.6.18/reintroduce-nodes_span_other_nodes-for-powerpc.patch new file mode 100644 index 00000000000..27f45a29d3c --- /dev/null +++ b/queue-2.6.18/reintroduce-nodes_span_other_nodes-for-powerpc.patch @@ -0,0 +1,85 @@ +From 7516795739bd53175629b90fab0ad488d7a6a9f7 Mon Sep 17 00:00:00 2001 +From: Andy Whitcroft 
+Date: Sat, 21 Oct 2006 10:24:14 -0700 +Subject: Reintroduce NODES_SPAN_OTHER_NODES for powerpc + +Revert "[PATCH] Remove SPAN_OTHER_NODES config definition" + This reverts commit f62859bb6871c5e4a8e591c60befc8caaf54db8c. +Revert "[PATCH] mm: remove arch independent NODES_SPAN_OTHER_NODES" + This reverts commit a94b3ab7eab4edcc9b2cb474b188f774c331adf7. + +Also update the comments to indicate that this is still required +and where it's used. + +Signed-off-by: Andy Whitcroft +Cc: Paul Mackerras +Cc: Mike Kravetz +Cc: Benjamin Herrenschmidt +Acked-by: Mel Gorman +Acked-by: Will Schmidt +Cc: Christoph Lameter +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Chris Wright +--- + arch/powerpc/Kconfig | 9 +++++++++ + arch/powerpc/configs/pseries_defconfig | 1 + + include/linux/mmzone.h | 6 ++++++ + mm/page_alloc.c | 2 ++ + 4 files changed, 18 insertions(+) + +--- linux-2.6.18.1.orig/arch/powerpc/Kconfig ++++ linux-2.6.18.1/arch/powerpc/Kconfig +@@ -729,6 +729,15 @@ config ARCH_MEMORY_PROBE + def_bool y + depends on MEMORY_HOTPLUG + ++# Some NUMA nodes have memory ranges that span ++# other nodes. Even though a pfn is valid and ++# between a node's start and end pfns, it may not ++# reside on that node. See memmap_init_zone() ++# for details.
++config NODES_SPAN_OTHER_NODES ++ def_bool y ++ depends on NEED_MULTIPLE_NODES ++ + config PPC_64K_PAGES + bool "64k page size" + depends on PPC64 +--- linux-2.6.18.1.orig/arch/powerpc/configs/pseries_defconfig ++++ linux-2.6.18.1/arch/powerpc/configs/pseries_defconfig +@@ -184,6 +184,7 @@ CONFIG_SPLIT_PTLOCK_CPUS=4 + CONFIG_MIGRATION=y + CONFIG_RESOURCES_64BIT=y + CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y ++CONFIG_NODES_SPAN_OTHER_NODES=y + # CONFIG_PPC_64K_PAGES is not set + CONFIG_SCHED_SMT=y + CONFIG_PROC_DEVICETREE=y +--- linux-2.6.18.1.orig/include/linux/mmzone.h ++++ linux-2.6.18.1/include/linux/mmzone.h +@@ -632,6 +632,12 @@ void sparse_init(void); + #define sparse_index_init(_sec, _nid) do {} while (0) + #endif /* CONFIG_SPARSEMEM */ + ++#ifdef CONFIG_NODES_SPAN_OTHER_NODES ++#define early_pfn_in_nid(pfn, nid) (early_pfn_to_nid(pfn) == (nid)) ++#else ++#define early_pfn_in_nid(pfn, nid) (1) ++#endif ++ + #ifndef early_pfn_valid + #define early_pfn_valid(pfn) (1) + #endif +--- linux-2.6.18.1.orig/mm/page_alloc.c ++++ linux-2.6.18.1/mm/page_alloc.c +@@ -1673,6 +1673,8 @@ void __meminit memmap_init_zone(unsigned + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + if (!early_pfn_valid(pfn)) + continue; ++ if (!early_pfn_in_nid(pfn, nid)) ++ continue; + page = pfn_to_page(pfn); + set_page_links(page, zone, nid, pfn); + init_page_count(page); diff --git a/queue-2.6.18/rtc-max6902-month-conversion-fix.patch b/queue-2.6.18/rtc-max6902-month-conversion-fix.patch new file mode 100644 index 00000000000..8b3a0ff029c --- /dev/null +++ b/queue-2.6.18/rtc-max6902-month-conversion-fix.patch @@ -0,0 +1,39 @@ +From stable-bounces@linux.kernel.org Tue Oct 17 00:12:18 2006 +Date: Tue, 17 Oct 2006 00:09:53 -0700 +From: akpm@osdl.org +To: torvalds@osdl.org +Cc: akpm@osdl.org, a.zummo@towertech.it, flarramendi@gmail.com, raph@raphnet.net, stable@kernel.org +Subject: rtc-max6902: month conversion fix + +From: Francisco Larramendi + +Fix October-only BCD-to-binary conversion bug: + + 
0x08 -> 7 + 0x09 -> 8 + 0x10 -> 15 (!) + 0x11 -> 19 + +Fixes http://bugzilla.kernel.org/show_bug.cgi?id=7361 + +Cc: Raphael Assenat +Cc: Alessandro Zummo +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Chris Wright +--- + + drivers/rtc/rtc-max6902.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- linux-2.6.18.1.orig/drivers/rtc/rtc-max6902.c ++++ linux-2.6.18.1/drivers/rtc/rtc-max6902.c +@@ -137,7 +137,7 @@ static int max6902_get_datetime(struct d + dt->tm_min = BCD2BIN(chip->buf[2]); + dt->tm_hour = BCD2BIN(chip->buf[3]); + dt->tm_mday = BCD2BIN(chip->buf[4]); +- dt->tm_mon = BCD2BIN(chip->buf[5] - 1); ++ dt->tm_mon = BCD2BIN(chip->buf[5]) - 1; + dt->tm_wday = BCD2BIN(chip->buf[6]); + dt->tm_year = BCD2BIN(chip->buf[7]); + diff --git a/queue-2.6.18/series b/queue-2.6.18/series index 85fb75e1645..b9d16f4fd11 100644 --- a/queue-2.6.18/series +++ b/queue-2.6.18/series @@ -44,3 +44,17 @@ uml-remove-warnings-added-by-previous-stable-patch.patch alsa-snd_rtctimer-handle-rtc-interrupts-with-a-tasklet.patch watchdog-sc1200wdt-fix-missing-pnp_unregister_driver.patch fix-intel-rng-detection.patch +posix-cpu-timers-prevent-signal-delivery-starvation.patch +rtc-max6902-month-conversion-fix.patch +isdn-fix-drivers-by-handling-errors-thrown-by-readstat.patch +sparc64-fix-pci-memory-space-root-resource-on-hummingbird.patch +pci-remove-quirk_via_abnormal_poweroff.patch +reintroduce-nodes_span_other_nodes-for-powerpc.patch +nfs-nfs_lookup-don-t-hash-dentry-when-optimising-away-the-lookup.patch +vmscan-fix-temp_priority-race.patch +use-min-of-two-prio-settings-in-calculating-distress-for-reclaim.patch +fill_tgid-fix-task_struct-leak-and-possible-oops.patch +jmb-368-pata-detection.patch +tcp-cubic-scaling-error.patch +ipv6-fix-lockup-via-proc-net-ip6_flowlabel.patch +check-bio-address-after-mapping-through-partitions.patch diff --git a/queue-2.6.18/sparc64-fix-pci-memory-space-root-resource-on-hummingbird.patch 
b/queue-2.6.18/sparc64-fix-pci-memory-space-root-resource-on-hummingbird.patch new file mode 100644 index 00000000000..8d35b58b74f --- /dev/null +++ b/queue-2.6.18/sparc64-fix-pci-memory-space-root-resource-on-hummingbird.patch @@ -0,0 +1,140 @@ +From stable-bounces@linux.kernel.org Wed Oct 18 13:40:37 2006 +Date: Wed, 18 Oct 2006 13:38:49 -0700 (PDT) +From: David Miller +To: stable@kernel.org +Subject: SPARC64: Fix PCI memory space root resource on Hummingbird. + +For Hummingbird PCI controllers, we should create the root +PCI memory space resource as the full 4GB area, and then +allocate the IOMMU DMA translation window out of there. + +The old code just assumed that the IOMMU DMA translation base +to the top of the 4GB area was unusable. This is not true on +many systems such as SB100 and SB150, where the IOMMU DMA +translation window sits at 0xc0000000->0xdfffffff. + +So what would happen is that any device mapped by the firmware +at the top section 0xe0000000->0xffffffff would get remapped +by Linux somewhere else leading to all kinds of problems and +boot failures. + +While we're here, report more cases of OBP resource assignment +conflicts. The only truly valid ones are ROM resource conflicts. + +Signed-off-by: David S. 
Miller +Signed-off-by: Chris Wright +--- + arch/sparc64/kernel/pci_common.c | 29 ++++++++++------------------- + arch/sparc64/kernel/pci_sabre.c | 23 +++++++++++++++++++---- + 2 files changed, 29 insertions(+), 23 deletions(-) + +--- linux-2.6.18.1.orig/arch/sparc64/kernel/pci_common.c ++++ linux-2.6.18.1/arch/sparc64/kernel/pci_common.c +@@ -330,19 +330,6 @@ __init get_device_resource(struct linux_ + return res; + } + +-static int __init pdev_resource_collisions_expected(struct pci_dev *pdev) +-{ +- if (pdev->vendor != PCI_VENDOR_ID_SUN) +- return 0; +- +- if (pdev->device == PCI_DEVICE_ID_SUN_RIO_EBUS || +- pdev->device == PCI_DEVICE_ID_SUN_RIO_1394 || +- pdev->device == PCI_DEVICE_ID_SUN_RIO_USB) +- return 1; +- +- return 0; +-} +- + static void __init pdev_record_assignments(struct pci_pbm_info *pbm, + struct pci_dev *pdev) + { +@@ -400,19 +387,23 @@ static void __init pdev_record_assignmen + pbm->parent->resource_adjust(pdev, res, root); + + if (request_resource(root, res) < 0) { ++ int rnum; ++ + /* OK, there is some conflict. But this is fine + * since we'll reassign it in the fixup pass. + * +- * We notify the user that OBP made an error if it +- * is a case we don't expect. ++ * Do not print the warning for ROM resources ++ * as such a conflict is quite common and ++ * harmless as the ROM bar is disabled. 
+ */ +- if (!pdev_resource_collisions_expected(pdev)) { +- printk(KERN_ERR "PCI: Address space collision on region %ld " ++ rnum = (res - &pdev->resource[0]); ++ if (rnum != PCI_ROM_RESOURCE) ++ printk(KERN_ERR "PCI: Resource collision, " ++ "region %d " + "[%016lx:%016lx] of device %s\n", +- (res - &pdev->resource[0]), ++ rnum, + res->start, res->end, + pci_name(pdev)); +- } + } + } + } +--- linux-2.6.18.1.orig/arch/sparc64/kernel/pci_sabre.c ++++ linux-2.6.18.1/arch/sparc64/kernel/pci_sabre.c +@@ -1196,7 +1196,7 @@ static void pbm_register_toplevel_resour + &pbm->mem_space); + } + +-static void sabre_pbm_init(struct pci_controller_info *p, struct device_node *dp, u32 dma_begin) ++static void sabre_pbm_init(struct pci_controller_info *p, struct device_node *dp, u32 dma_start, u32 dma_end) + { + struct pci_pbm_info *pbm; + struct device_node *node; +@@ -1261,6 +1261,8 @@ static void sabre_pbm_init(struct pci_co + node = node->sibling; + } + if (simbas_found == 0) { ++ struct resource *rp; ++ + /* No APBs underneath, probably this is a hummingbird + * system. 
+ */ +@@ -1302,8 +1304,10 @@ static void sabre_pbm_init(struct pci_co + pbm->io_space.end = pbm->io_space.start + (1UL << 24) - 1UL; + pbm->io_space.flags = IORESOURCE_IO; + +- pbm->mem_space.start = p->pbm_A.controller_regs + SABRE_MEMSPACE; +- pbm->mem_space.end = pbm->mem_space.start + (unsigned long)dma_begin - 1UL; ++ pbm->mem_space.start = ++ (p->pbm_A.controller_regs + SABRE_MEMSPACE); ++ pbm->mem_space.end = ++ (pbm->mem_space.start + ((1UL << 32UL) - 1UL)); + pbm->mem_space.flags = IORESOURCE_MEM; + + if (request_resource(&ioport_resource, &pbm->io_space) < 0) { +@@ -1315,6 +1319,17 @@ static void sabre_pbm_init(struct pci_co + prom_halt(); + } + ++ rp = kmalloc(sizeof(*rp), GFP_KERNEL); ++ if (!rp) { ++ prom_printf("Cannot allocate IOMMU resource.\n"); ++ prom_halt(); ++ } ++ rp->name = "IOMMU"; ++ rp->start = pbm->mem_space.start + (unsigned long) dma_start; ++ rp->end = pbm->mem_space.start + (unsigned long) dma_end - 1UL; ++ rp->flags = IORESOURCE_BUSY; ++ request_resource(&pbm->mem_space, rp); ++ + pci_register_legacy_regions(&pbm->io_space, + &pbm->mem_space); + } +@@ -1450,5 +1465,5 @@ void sabre_init(struct device_node *dp, + /* + * Look for APB underneath. + */ +- sabre_pbm_init(p, dp, vdma[0]); ++ sabre_pbm_init(p, dp, vdma[0], vdma[0] + vdma[1]); + } diff --git a/queue-2.6.18/tcp-cubic-scaling-error.patch b/queue-2.6.18/tcp-cubic-scaling-error.patch new file mode 100644 index 00000000000..ee194368791 --- /dev/null +++ b/queue-2.6.18/tcp-cubic-scaling-error.patch @@ -0,0 +1,56 @@ +From stable-bounces@linux.kernel.org Mon Oct 30 14:50:53 2006 +Date: Mon, 30 Oct 2006 14:47:35 -0800 +From: Stephen Hemminger +To: stable@kernel.org +Subject: tcp: cubic scaling error + +Doug Leith observed a discrepancy between the version of CUBIC described +in the papers and the version in 2.6.18. A math error related to scaling +causes Cubic to grow too slowly. + +Patch is from "Sangtae Ha" . I validated that +it does fix the problems. 
+ +See the following to show behavior over 500ms 100 Mbit link. + +Sender (2.6.19-rc3) --- Bridge (2.6.18-rt7) ------- Receiver (2.6.19-rc3) + 1G [netem] 100M + + http://developer.osdl.org/shemminger/tcp/2.6.19-rc3/cubic-orig.png + http://developer.osdl.org/shemminger/tcp/2.6.19-rc3/cubic-fix.png + +Signed-off-by: Stephen Hemminger +Signed-off-by: Chris Wright +--- + net/ipv4/tcp_cubic.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- linux-2.6.18.1.orig/net/ipv4/tcp_cubic.c ++++ linux-2.6.18.1/net/ipv4/tcp_cubic.c +@@ -190,7 +190,7 @@ static inline void bictcp_update(struct + */ + + /* change the unit from HZ to bictcp_HZ */ +- t = ((tcp_time_stamp + ca->delay_min - ca->epoch_start) ++ t = ((tcp_time_stamp + (ca->delay_min>>3) - ca->epoch_start) + << BICTCP_HZ) / HZ; + + if (t < ca->bic_K) /* t - K */ +@@ -259,7 +259,7 @@ static inline void measure_delay(struct + (s32)(tcp_time_stamp - ca->epoch_start) < HZ) + return; + +- delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr; ++ delay = (tcp_time_stamp - tp->rx_opt.rcv_tsecr)<<3; + if (delay == 0) + delay = 1; + +@@ -366,7 +366,7 @@ static int __init cubictcp_register(void + + beta_scale = 8*(BICTCP_BETA_SCALE+beta)/ 3 / (BICTCP_BETA_SCALE - beta); + +- cube_rtt_scale = (bic_scale << 3) / 10; /* 1024*c/rtt */ ++ cube_rtt_scale = (bic_scale * 10); /* 1024*c/rtt */ + + /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3 + * so K = cubic_root( (wmax-cwnd)*rtt/c ) diff --git a/queue-2.6.18/use-min-of-two-prio-settings-in-calculating-distress-for-reclaim.patch b/queue-2.6.18/use-min-of-two-prio-settings-in-calculating-distress-for-reclaim.patch new file mode 100644 index 00000000000..62b44783b35 --- /dev/null +++ b/queue-2.6.18/use-min-of-two-prio-settings-in-calculating-distress-for-reclaim.patch @@ -0,0 +1,67 @@ +From bbdb396a60b2ebf7de3b717991e5d3e28c8b7bbd Mon Sep 17 00:00:00 2001 +From: Martin Bligh +Date: Sat, 28 Oct 2006 10:38:25 -0700 +Subject: Use min of two prio settings in calculating distress 
for reclaim + +If try_to_free_pages / balance_pgdat are called with a gfp_mask specifying +GFP_IO and/or GFP_FS, they will reclaim the requisite number of pages, and then +reset prev_priority to DEF_PRIORITY (or to some other high (ie: unurgent) +value). + +However, another reclaimer without those gfp_mask flags set (say, GFP_NOIO) +may still be struggling to reclaim pages. The concurrent overwrite of +zone->prev_priority will cause this GFP_NOIO thread to unexpectedly cease +deactivating mapped pages, thus causing reclaim difficulties. + +The fix is to key the distress calculation not only off zone->prev_priority, but +also to take into account the local caller's priority by using +min(zone->prev_priority, sc->priority) + +Signed-off-by: Martin J. Bligh +Cc: Nick Piggin +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Chris Wright +--- + mm/vmscan.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- linux-2.6.18.1.orig/mm/vmscan.c ++++ linux-2.6.18.1/mm/vmscan.c +@@ -727,7 +727,7 @@ static inline void note_zone_scanning_pr + * But we had to alter page->flags anyway. + */ + static void shrink_active_list(unsigned long nr_pages, struct zone *zone, +- struct scan_control *sc) ++ struct scan_control *sc, int priority) + { + unsigned long pgmoved; + int pgdeactivate = 0; +@@ -748,7 +748,7 @@ static void shrink_active_list(unsigned + * `distress' is a measure of how much trouble we're having + * reclaiming pages. 0 -> no problems. 100 -> great trouble. 
+ */ +- distress = 100 >> zone->prev_priority; ++ distress = 100 >> min(zone->prev_priority, priority); + + /* + * The point of this algorithm is to decide when to start +@@ -899,7 +899,7 @@ static unsigned long shrink_zone(int pri + nr_to_scan = min(nr_active, + (unsigned long)sc->swap_cluster_max); + nr_active -= nr_to_scan; +- shrink_active_list(nr_to_scan, zone, sc); ++ shrink_active_list(nr_to_scan, zone, sc, priority); + } + + if (nr_inactive) { +@@ -1341,7 +1341,7 @@ static unsigned long shrink_all_zones(un + if (zone->nr_scan_active >= nr_pages || pass > 3) { + zone->nr_scan_active = 0; + nr_to_scan = min(nr_pages, zone->nr_active); +- shrink_active_list(nr_to_scan, zone, sc); ++ shrink_active_list(nr_to_scan, zone, sc, prio); + } + } + diff --git a/queue-2.6.18/vmscan-fix-temp_priority-race.patch b/queue-2.6.18/vmscan-fix-temp_priority-race.patch new file mode 100644 index 00000000000..593778dcfbe --- /dev/null +++ b/queue-2.6.18/vmscan-fix-temp_priority-race.patch @@ -0,0 +1,221 @@ +From 3bb1a852ab6c9cdf211a2f4a2f502340c8c38eca Mon Sep 17 00:00:00 2001 +From: Martin Bligh +Date: Sat, 28 Oct 2006 10:38:24 -0700 +Subject: vmscan: Fix temp_priority race + +The temp_priority field in zone is racy, as we can walk through a reclaim +path, and just before we copy it into prev_priority, it can be overwritten +(say with DEF_PRIORITY) by another reclaimer. + +The same bug is contained in both try_to_free_pages and balance_pgdat, but +it is fixed slightly differently. In balance_pgdat, we keep a separate +priority record per zone in a local array. In try_to_free_pages there is +no need to do this, as the priority level is the same for all zones that we +reclaim from. + +Impact of this bug is that temp_priority is copied into prev_priority, and +setting this artificially high causes reclaimers to set distress +artificially low. They then fail to reclaim mapped pages, when they are, +in fact, under severe memory pressure (their priority may be as low as 0). 
+This causes the OOM killer to fire incorrectly. + +From: Andrew Morton + +__zone_reclaim() isn't modifying zone->prev_priority. But zone->prev_priority +is used in the decision whether or not to bring mapped pages onto the inactive +list. Hence there's a risk here that __zone_reclaim() will fail because +zone->prev_priority is large (ie: low urgency) and lots of mapped pages end up +stuck on the active list. + +Fix that up by decreasing (ie making more urgent) zone->prev_priority as +__zone_reclaim() scans the zone's pages. + +This bug perhaps explains why ZONE_RECLAIM_PRIORITY was created. It should be +possible to remove that now, and to just start out at DEF_PRIORITY? + +Cc: Nick Piggin +Cc: Christoph Lameter +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Chris Wright +[chrisw: minor wiggle to fit -stable] +--- + include/linux/mmzone.h | 6 ----- + mm/page_alloc.c | 2 - + mm/vmscan.c | 55 ++++++++++++++++++++++++++++++++++++------------- + mm/vmstat.c | 2 - + 4 files changed, 43 insertions(+), 22 deletions(-) + +--- linux-2.6.18.1.orig/include/linux/mmzone.h ++++ linux-2.6.18.1/include/linux/mmzone.h +@@ -200,13 +200,9 @@ struct zone { + * under - it drives the swappiness decision: whether to unmap mapped + * pages. + * +- * temp_priority is used to remember the scanning priority at which +- * this zone was successfully refilled to free_pages == pages_high. +- * +- * Access to both these fields is quite racy even on uniprocessor. But ++ * Access to both this field is quite racy even on uniprocessor. But + * it is expected to average out OK. 
+ */ +- int temp_priority; + int prev_priority; + + +--- linux-2.6.18.1.orig/mm/page_alloc.c ++++ linux-2.6.18.1/mm/page_alloc.c +@@ -2021,7 +2021,7 @@ static void __meminit free_area_init_cor + zone->zone_pgdat = pgdat; + zone->free_pages = 0; + +- zone->temp_priority = zone->prev_priority = DEF_PRIORITY; ++ zone->prev_priority = DEF_PRIORITY; + + zone_pcp_init(zone); + INIT_LIST_HEAD(&zone->active_list); +--- linux-2.6.18.1.orig/mm/vmscan.c ++++ linux-2.6.18.1/mm/vmscan.c +@@ -696,6 +696,20 @@ done: + } + + /* ++ * We are about to scan this zone at a certain priority level. If that priority ++ * level is smaller (ie: more urgent) than the previous priority, then note ++ * that priority level within the zone. This is done so that when the next ++ * process comes in to scan this zone, it will immediately start out at this ++ * priority level rather than having to build up its own scanning priority. ++ * Here, this priority affects only the reclaim-mapped threshold. ++ */ ++static inline void note_zone_scanning_priority(struct zone *zone, int priority) ++{ ++ if (priority < zone->prev_priority) ++ zone->prev_priority = priority; ++} ++ ++/* + * This moves pages from the active list to the inactive list. 
+ * + * We move them the other way if the page is referenced by one or more +@@ -934,9 +948,7 @@ static unsigned long shrink_zones(int pr + if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) + continue; + +- zone->temp_priority = priority; +- if (zone->prev_priority > priority) +- zone->prev_priority = priority; ++ note_zone_scanning_priority(zone, priority); + + if (zone->all_unreclaimable && priority != DEF_PRIORITY) + continue; /* Let kswapd poll it */ +@@ -984,7 +996,6 @@ unsigned long try_to_free_pages(struct z + if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) + continue; + +- zone->temp_priority = DEF_PRIORITY; + lru_pages += zone->nr_active + zone->nr_inactive; + } + +@@ -1022,13 +1033,22 @@ unsigned long try_to_free_pages(struct z + blk_congestion_wait(WRITE, HZ/10); + } + out: ++ /* ++ * Now that we've scanned all the zones at this priority level, note ++ * that level within the zone so that the next thread which performs ++ * scanning of this zone will immediately start out at this priority ++ * level. This affects only the decision whether or not to bring ++ * mapped pages onto the inactive list. ++ */ ++ if (priority < 0) ++ priority = 0; + for (i = 0; zones[i] != 0; i++) { + struct zone *zone = zones[i]; + + if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) + continue; + +- zone->prev_priority = zone->temp_priority; ++ zone->prev_priority = priority; + } + return ret; + } +@@ -1068,6 +1088,11 @@ static unsigned long balance_pgdat(pg_da + .swap_cluster_max = SWAP_CLUSTER_MAX, + .swappiness = vm_swappiness, + }; ++ /* ++ * temp_priority is used to remember the scanning priority at which ++ * this zone was successfully refilled to free_pages == pages_high. 
++ */ ++ int temp_priority[MAX_NR_ZONES]; + + loop_again: + total_scanned = 0; +@@ -1075,11 +1100,8 @@ loop_again: + sc.may_writepage = !laptop_mode; + count_vm_event(PAGEOUTRUN); + +- for (i = 0; i < pgdat->nr_zones; i++) { +- struct zone *zone = pgdat->node_zones + i; +- +- zone->temp_priority = DEF_PRIORITY; +- } ++ for (i = 0; i < pgdat->nr_zones; i++) ++ temp_priority[i] = DEF_PRIORITY; + + for (priority = DEF_PRIORITY; priority >= 0; priority--) { + int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ +@@ -1140,10 +1162,9 @@ scan: + if (!zone_watermark_ok(zone, order, zone->pages_high, + end_zone, 0)) + all_zones_ok = 0; +- zone->temp_priority = priority; +- if (zone->prev_priority > priority) +- zone->prev_priority = priority; ++ temp_priority[i] = priority; + sc.nr_scanned = 0; ++ note_zone_scanning_priority(zone, priority); + nr_reclaimed += shrink_zone(priority, zone, &sc); + reclaim_state->reclaimed_slab = 0; + nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, +@@ -1183,10 +1204,15 @@ scan: + break; + } + out: ++ /* ++ * Note within each zone the priority level at which this zone was ++ * brought into a happy state. So that the next thread which scans this ++ * zone will start out at that priority level. 
++ */ + for (i = 0; i < pgdat->nr_zones; i++) { + struct zone *zone = pgdat->node_zones + i; + +- zone->prev_priority = zone->temp_priority; ++ zone->prev_priority = temp_priority[i]; + } + if (!all_zones_ok) { + cond_resched(); +@@ -1570,6 +1596,7 @@ static int __zone_reclaim(struct zone *z + */ + priority = ZONE_RECLAIM_PRIORITY; + do { ++ note_zone_scanning_priority(zone, priority); + nr_reclaimed += shrink_zone(priority, zone, &sc); + priority--; + } while (priority >= 0 && nr_reclaimed < nr_pages); +--- linux-2.6.18.1.orig/mm/vmstat.c ++++ linux-2.6.18.1/mm/vmstat.c +@@ -586,11 +586,9 @@ static int zoneinfo_show(struct seq_file + seq_printf(m, + "\n all_unreclaimable: %u" + "\n prev_priority: %i" +- "\n temp_priority: %i" + "\n start_pfn: %lu", + zone->all_unreclaimable, + zone->prev_priority, +- zone->temp_priority, + zone->zone_start_pfn); + spin_unlock_irqrestore(&zone->lock, flags); + seq_putc(m, '\n');