]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
more patches to queue
authorChris Wright <chrisw@sous-sol.org>
Tue, 31 Oct 2006 11:20:53 +0000 (03:20 -0800)
committerChris Wright <chrisw@sous-sol.org>
Tue, 31 Oct 2006 11:20:53 +0000 (03:20 -0800)
15 files changed:
queue-2.6.18/check-bio-address-after-mapping-through-partitions.patch [new file with mode: 0644]
queue-2.6.18/fill_tgid-fix-task_struct-leak-and-possible-oops.patch [new file with mode: 0644]
queue-2.6.18/ipv6-fix-lockup-via-proc-net-ip6_flowlabel.patch [new file with mode: 0644]
queue-2.6.18/isdn-fix-drivers-by-handling-errors-thrown-by-readstat.patch [new file with mode: 0644]
queue-2.6.18/jmb-368-pata-detection.patch [new file with mode: 0644]
queue-2.6.18/nfs-nfs_lookup-don-t-hash-dentry-when-optimising-away-the-lookup.patch [new file with mode: 0644]
queue-2.6.18/pci-remove-quirk_via_abnormal_poweroff.patch [new file with mode: 0644]
queue-2.6.18/posix-cpu-timers-prevent-signal-delivery-starvation.patch [new file with mode: 0644]
queue-2.6.18/reintroduce-nodes_span_other_nodes-for-powerpc.patch [new file with mode: 0644]
queue-2.6.18/rtc-max6902-month-conversion-fix.patch [new file with mode: 0644]
queue-2.6.18/series
queue-2.6.18/sparc64-fix-pci-memory-space-root-resource-on-hummingbird.patch [new file with mode: 0644]
queue-2.6.18/tcp-cubic-scaling-error.patch [new file with mode: 0644]
queue-2.6.18/use-min-of-two-prio-settings-in-calculating-distress-for-reclaim.patch [new file with mode: 0644]
queue-2.6.18/vmscan-fix-temp_priority-race.patch [new file with mode: 0644]

diff --git a/queue-2.6.18/check-bio-address-after-mapping-through-partitions.patch b/queue-2.6.18/check-bio-address-after-mapping-through-partitions.patch
new file mode 100644 (file)
index 0000000..4e5b8e0
--- /dev/null
@@ -0,0 +1,74 @@
+From stable-bounces@linux.kernel.org  Mon Oct 30 17:54:57 2006
+Date: Tue, 31 Oct 2006 12:51:45 +1100
+From: NeilBrown <neilb@suse.de>
+To: Andrew Morton <akpm@osdl.org>
+Cc: stable@kernel.org, Jens Axboe <jens.axboe@oracle.com>
+Subject: [stable] [PATCH] Check bio address after mapping through partitions.
+
+Partitions are not limited to live within a device.  So
+we should range check after partition mapping.
+
+Note that 'maxsector' was being used for two different things.  I have
+split off the second usage into 'old_sector' so that maxsector can
+still be used for its primary usage later in the function.
+
+Cc: Jens Axboe <jens.axboe@oracle.com>
+Signed-off-by: Neil Brown <neilb@suse.de>
+Signed-off-by: Chris Wright <chrisw@sous-sol.org>
+---
+ block/ll_rw_blk.c |   24 ++++++++++++++++++++----
+ 1 file changed, 20 insertions(+), 4 deletions(-)
+
+--- linux-2.6.18.1.orig/block/ll_rw_blk.c
++++ linux-2.6.18.1/block/ll_rw_blk.c
+@@ -3021,6 +3021,7 @@ void generic_make_request(struct bio *bi
+ {
+       request_queue_t *q;
+       sector_t maxsector;
++      sector_t old_sector;
+       int ret, nr_sectors = bio_sectors(bio);
+       dev_t old_dev;
+@@ -3049,7 +3050,7 @@ void generic_make_request(struct bio *bi
+        * NOTE: we don't repeat the blk_size check for each new device.
+        * Stacking drivers are expected to know what they are doing.
+        */
+-      maxsector = -1;
++      old_sector = -1;
+       old_dev = 0;
+       do {
+               char b[BDEVNAME_SIZE];
+@@ -3083,15 +3084,30 @@ end_io:
+                */
+               blk_partition_remap(bio);
+-              if (maxsector != -1)
++              if (old_sector != -1)
+                       blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, 
+-                                          maxsector);
++                                          old_sector);
+               blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+-              maxsector = bio->bi_sector;
++              old_sector = bio->bi_sector;
+               old_dev = bio->bi_bdev->bd_dev;
++              maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
++              if (maxsector) {
++                      sector_t sector = bio->bi_sector;
++
++                      if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
++                              /*
++                               * This may well happen - partitions are not checked
++                               * to make sure they are within the size of the
++                               * whole device.
++                               */
++                              handle_bad_sector(bio);
++                              goto end_io;
++                      }
++              }
++
+               ret = q->make_request_fn(q, bio);
+       } while (ret);
+ }
diff --git a/queue-2.6.18/fill_tgid-fix-task_struct-leak-and-possible-oops.patch b/queue-2.6.18/fill_tgid-fix-task_struct-leak-and-possible-oops.patch
new file mode 100644 (file)
index 0000000..6c50711
--- /dev/null
@@ -0,0 +1,58 @@
+From fca178c0c6e8d52a1875be36b070f30884ebfae9 Mon Sep 17 00:00:00 2001
+From: Oleg Nesterov <oleg@tv-sign.ru>
+Date: Sat, 28 Oct 2006 10:38:49 -0700
+Subject: fill_tgid: fix task_struct leak and possible oops
+
+1. fill_tgid() forgets to do put_task_struct(first).
+
+2. release_task(first) can happen after fill_tgid() drops tasklist_lock,
+   it is unsafe to dereference first->signal.
+
+This is a temporary fix, imho the locking should be reworked.
+
+Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
+Cc: Shailabh Nagar <nagar@watson.ibm.com>
+Cc: Balbir Singh <balbir@in.ibm.com>
+Cc: Jay Lan <jlan@sgi.com>
+Cc: <stable@kernel.org>
+Signed-off-by: Andrew Morton <akpm@osdl.org>
+Signed-off-by: Linus Torvalds <torvalds@osdl.org>
+Signed-off-by: Chris Wright <chrisw@sous-sol.org>
+---
+ kernel/taskstats.c |   15 +++++++++------
+ 1 file changed, 9 insertions(+), 6 deletions(-)
+
+--- linux-2.6.18.1.orig/kernel/taskstats.c
++++ linux-2.6.18.1/kernel/taskstats.c
+@@ -229,14 +229,17 @@ static int fill_tgid(pid_t tgid, struct 
+       } else
+               get_task_struct(first);
+-      /* Start with stats from dead tasks */
+-      spin_lock_irqsave(&first->signal->stats_lock, flags);
+-      if (first->signal->stats)
+-              memcpy(stats, first->signal->stats, sizeof(*stats));
+-      spin_unlock_irqrestore(&first->signal->stats_lock, flags);
+       tsk = first;
+       read_lock(&tasklist_lock);
++      /* Start with stats from dead tasks */
++      if (first->signal) {
++              spin_lock_irqsave(&first->signal->stats_lock, flags);
++              if (first->signal->stats)
++                      memcpy(stats, first->signal->stats, sizeof(*stats));
++              spin_unlock_irqrestore(&first->signal->stats_lock, flags);
++      }
++
+       do {
+               if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk))
+                       continue;
+@@ -256,7 +259,7 @@ static int fill_tgid(pid_t tgid, struct 
+        * Accounting subsytems can also add calls here to modify
+        * fields of taskstats.
+        */
+-
++      put_task_struct(first);
+       return 0;
+ }
diff --git a/queue-2.6.18/ipv6-fix-lockup-via-proc-net-ip6_flowlabel.patch b/queue-2.6.18/ipv6-fix-lockup-via-proc-net-ip6_flowlabel.patch
new file mode 100644 (file)
index 0000000..3517201
--- /dev/null
@@ -0,0 +1,38 @@
+From stable-bounces@linux.kernel.org  Mon Oct 30 15:14:51 2006
+Date: Mon, 30 Oct 2006 15:11:21 -0800 (PST)
+From: David Miller <davem@davemloft.net>
+To: stable@kernel.org
+Cc: bunk@stusta.de
+Subject: IPV6: fix lockup via /proc/net/ip6_flowlabel
+
+From: James Morris <jmorris@namei.org>
+
+There's a bug in the seqfile handling for /proc/net/ip6_flowlabel, where, 
+after finding a flowlabel, the code will loop forever not finding any 
+further flowlabels, first traversing the rest of the hash bucket then just 
+looping.
+
+This patch fixes the problem by breaking after the hash bucket has been 
+traversed.
+
+Note that this bug can cause lockups and oopses, and is trivially invoked 
+by an unprivileged user.
+
+Signed-off-by: James Morris <jmorris@namei.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Chris Wright <chrisw@sous-sol.org>
+---
+ net/ipv6/ip6_flowlabel.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- linux-2.6.18.1.orig/net/ipv6/ip6_flowlabel.c
++++ linux-2.6.18.1/net/ipv6/ip6_flowlabel.c
+@@ -587,6 +587,8 @@ static struct ip6_flowlabel *ip6fl_get_n
+       while (!fl) {
+               if (++state->bucket <= FL_HASH_MASK)
+                       fl = fl_ht[state->bucket];
++              else
++                      break;
+       }
+       return fl;
+ }
diff --git a/queue-2.6.18/isdn-fix-drivers-by-handling-errors-thrown-by-readstat.patch b/queue-2.6.18/isdn-fix-drivers-by-handling-errors-thrown-by-readstat.patch
new file mode 100644 (file)
index 0000000..8c0766c
--- /dev/null
@@ -0,0 +1,49 @@
+From 04518bfe8eac2e82b476fb2b0093527adc2bc791 Mon Sep 17 00:00:00 2001
+From: Jeff Garzik <jeff@garzik.org>
+Date: Tue, 17 Oct 2006 00:10:39 -0700
+Subject: ISDN: fix drivers, by handling errors thrown by ->readstat()
+
+This is a particularly ugly on-failure bug, possibly security, since the
+lack of error handling here is covering up another class of bug: failure to
+handle copy_to_user() return values.
+
+The I4L API function ->readstat() returns an integer, and by looking at
+several existing driver implementations, it is clear that a negative return
+value was meant to indicate an error.
+
+Given that several drivers already return a negative value indicating an
+errno-style error, the current code would blindly accept that [negative]
+value as a valid amount of bytes read.  Obvious damage ensues.
+
+Correcting ->readstat() handling to properly notice errors fixes the
+existing code to work correctly on error, and enables future patches to
+more easily indicate errors during operation.
+
+Signed-off-by: Jeff Garzik <jeff@garzik.org>
+Cc: Karsten Keil <kkeil@suse.de>
+Cc: <stable@kernel.org>
+Signed-off-by: Andrew Morton <akpm@osdl.org>
+Signed-off-by: Linus Torvalds <torvalds@osdl.org>
+Signed-off-by: Chris Wright <chrisw@sous-sol.org>
+---
+ drivers/isdn/i4l/isdn_common.c |    9 ++++++---
+ 1 file changed, 6 insertions(+), 3 deletions(-)
+
+--- linux-2.6.18.1.orig/drivers/isdn/i4l/isdn_common.c
++++ linux-2.6.18.1/drivers/isdn/i4l/isdn_common.c
+@@ -1134,9 +1134,12 @@ isdn_read(struct file *file, char __user
+               if (dev->drv[drvidx]->interface->readstat) {
+                       if (count > dev->drv[drvidx]->stavail)
+                               count = dev->drv[drvidx]->stavail;
+-                      len = dev->drv[drvidx]->interface->
+-                              readstat(buf, count, drvidx,
+-                                       isdn_minor2chan(minor));
++                      len = dev->drv[drvidx]->interface->readstat(buf, count,
++                                              drvidx, isdn_minor2chan(minor));
++                      if (len < 0) {
++                              retval = len;
++                              goto out;
++                      }
+               } else {
+                       len = 0;
+               }
diff --git a/queue-2.6.18/jmb-368-pata-detection.patch b/queue-2.6.18/jmb-368-pata-detection.patch
new file mode 100644 (file)
index 0000000..cbf463b
--- /dev/null
@@ -0,0 +1,32 @@
+From c333526f489044be2b93085720eb898f0037b346 Mon Sep 17 00:00:00 2001
+From: Alan Cox <alan@lxorguk.ukuu.org.uk>
+Date: Sat, 28 Oct 2006 10:38:57 -0700
+Subject: JMB 368 PATA detection
+
+The Jmicron JMB368 is PATA only so has the PATA on function zero.  Don't
+therefore skip function zero on this device when probing
+
+Signed-off-by: Alan Cox <alan@redhat.com>
+Cc: <stable@kernel.org>
+Signed-off-by: Andrew Morton <akpm@osdl.org>
+Signed-off-by: Linus Torvalds <torvalds@osdl.org>
+Signed-off-by: Chris Wright <chrisw@sous-sol.org>
+---
+ drivers/ide/pci/generic.c |    6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- linux-2.6.18.1.orig/drivers/ide/pci/generic.c
++++ linux-2.6.18.1/drivers/ide/pci/generic.c
+@@ -242,8 +242,10 @@ static int __devinit generic_init_one(st
+           (!(PCI_FUNC(dev->devfn) & 1)))
+               goto out;
+-      if (dev->vendor == PCI_VENDOR_ID_JMICRON && PCI_FUNC(dev->devfn) != 1)
+-              goto out;
++      if (dev->vendor == PCI_VENDOR_ID_JMICRON) {
++              if (dev->device != PCI_DEVICE_ID_JMICRON_JMB368 && PCI_FUNC(dev->devfn) != 1)
++                      goto out;
++      }
+       if (dev->vendor != PCI_VENDOR_ID_JMICRON) {
+               pci_read_config_word(dev, PCI_COMMAND, &command);
diff --git a/queue-2.6.18/nfs-nfs_lookup-don-t-hash-dentry-when-optimising-away-the-lookup.patch b/queue-2.6.18/nfs-nfs_lookup-don-t-hash-dentry-when-optimising-away-the-lookup.patch
new file mode 100644 (file)
index 0000000..389ae7c
--- /dev/null
@@ -0,0 +1,53 @@
+From fd6840714d9cf6e93f1d42b904860a94df316b85 Mon Sep 17 00:00:00 2001
+From: Trond Myklebust <Trond.Myklebust@netapp.com>
+Date: Tue, 5 Sep 2006 12:27:44 -0400
+Subject: NFS: nfs_lookup - don't hash dentry when optimising away the lookup
+
+If the open intents tell us that a given lookup is going to result in an
+exclusive create, we currently optimize away the lookup call itself. The
+reason is that the lookup would not be atomic with the create RPC call, so
+why do it in the first place?
+
+A problem occurs, however, if the VFS aborts the exclusive create operation
+after the lookup, but before the call to create the file/directory: in this
+case we will end up with a hashed negative dentry in the dcache that has
+never been looked up.
+Fix this by only actually hashing the dentry once the create operation has
+been successfully completed.
+
+Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
+Signed-off-by: Chris Wright <chrisw@sous-sol.org>
+---
+ fs/nfs/dir.c |   14 +++++++++++---
+ 1 file changed, 11 insertions(+), 3 deletions(-)
+
+--- linux-2.6.18.1.orig/fs/nfs/dir.c
++++ linux-2.6.18.1/fs/nfs/dir.c
+@@ -902,9 +902,15 @@ static struct dentry *nfs_lookup(struct 
+       lock_kernel();
+-      /* If we're doing an exclusive create, optimize away the lookup */
+-      if (nfs_is_exclusive_create(dir, nd))
+-              goto no_entry;
++      /*
++       * If we're doing an exclusive create, optimize away the lookup
++       * but don't hash the dentry.
++       */
++      if (nfs_is_exclusive_create(dir, nd)) {
++              d_instantiate(dentry, NULL);
++              res = NULL;
++              goto out_unlock;
++      }
+       error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr);
+       if (error == -ENOENT)
+@@ -1156,6 +1162,8 @@ int nfs_instantiate(struct dentry *dentr
+       if (IS_ERR(inode))
+               goto out_err;
+       d_instantiate(dentry, inode);
++      if (d_unhashed(dentry))
++              d_rehash(dentry);
+       return 0;
+ out_err:
+       d_drop(dentry);
diff --git a/queue-2.6.18/pci-remove-quirk_via_abnormal_poweroff.patch b/queue-2.6.18/pci-remove-quirk_via_abnormal_poweroff.patch
new file mode 100644 (file)
index 0000000..d2a6459
--- /dev/null
@@ -0,0 +1,73 @@
+From 3560cc5ec3488b20d927f7160a21a0df1d1fda20 Mon Sep 17 00:00:00 2001
+From: Karsten Wiese <annabellesgarden@yahoo.de>
+Date: Fri, 20 Oct 2006 14:45:36 -0700
+Subject: PCI: Remove quirk_via_abnormal_poweroff
+
+My K8T800 mobo resumes fine from suspend to ram with and without patch
+applied against 2.6.18.
+
+quirk_via_abnormal_poweroff makes some boards not boot 2.6.18, so IMO patch
+should go to head, 2.6.18.2 and everywhere "ACPI: ACPICA 20060623" has been
+applied.
+
+
+Remove quirk_via_abnormal_poweroff
+
+Obsoleted by "ACPI: ACPICA 20060623":
+<snip>
+    Implemented support for "ignored" bits in the ACPI
+    registers.  According to the ACPI specification, these
+    bits should be preserved when writing the registers via
+    a read/modify/write cycle. There are 3 bits preserved
+    in this manner: PM1_CONTROL[0] (SCI_EN), PM1_CONTROL[9],
+    and PM1_STATUS[11].
+    http://bugzilla.kernel.org/show_bug.cgi?id=3691
+</snip>
+
+Signed-off-by: Karsten Wiese <fzu@wemgehoertderstaat.de>
+Cc: Bob Moore <robert.moore@intel.com>
+Cc: "Brown, Len" <len.brown@intel.com>
+Acked-by: Dave Jones <davej@redhat.com>
+Signed-off-by: Andrew Morton <akpm@osdl.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+Signed-off-by: Chris Wright <chrisw@sous-sol.org>
+---
+ drivers/pci/quirks.c |   27 ---------------------------
+ 1 file changed, 27 deletions(-)
+
+--- linux-2.6.18.1.orig/drivers/pci/quirks.c
++++ linux-2.6.18.1/drivers/pci/quirks.c
+@@ -685,33 +685,6 @@ static void __devinit quirk_vt82c598_id(
+ }
+ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA,   PCI_DEVICE_ID_VIA_82C597_0,     quirk_vt82c598_id );
+-#ifdef CONFIG_ACPI_SLEEP
+-
+-/*
+- * Some VIA systems boot with the abnormal status flag set. This can cause
+- * the BIOS to re-POST the system on resume rather than passing control
+- * back to the OS.  Clear the flag on boot
+- */
+-static void __devinit quirk_via_abnormal_poweroff(struct pci_dev *dev)
+-{
+-      u32 reg;
+-
+-      acpi_hw_register_read(ACPI_MTX_DO_NOT_LOCK, ACPI_REGISTER_PM1_STATUS,
+-                              &reg);
+-
+-      if (reg & 0x800) {
+-              printk("Clearing abnormal poweroff flag\n");
+-              acpi_hw_register_write(ACPI_MTX_DO_NOT_LOCK,
+-                                      ACPI_REGISTER_PM1_STATUS,
+-                                      (u16)0x800);
+-      }
+-}
+-
+-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, quirk_via_abnormal_poweroff);
+-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, quirk_via_abnormal_poweroff);
+-
+-#endif
+-
+ /*
+  * CardBus controllers have a legacy base address that enables them
+  * to respond as i82365 pcmcia controllers.  We don't want them to
diff --git a/queue-2.6.18/posix-cpu-timers-prevent-signal-delivery-starvation.patch b/queue-2.6.18/posix-cpu-timers-prevent-signal-delivery-starvation.patch
new file mode 100644 (file)
index 0000000..b4b19d3
--- /dev/null
@@ -0,0 +1,137 @@
+From stable-bounces@linux.kernel.org  Tue Oct 17 00:12:55 2006
+Date: Tue, 17 Oct 2006 00:09:39 -0700
+From: akpm@osdl.org
+To: torvalds@osdl.org
+Cc: akpm@osdl.org, dwalker@mvista.com, pmattis@google.com, johnstul@us.ibm.com, toyoa@mvista.com, stable@kernel.org, zippel@linux-m68k.org, mbligh@google.com, spark@google.com, rohitseth@google.com, tglx@linutronix.de, mingo@elte.hu, roland@redhat.com
+Subject: posix-cpu-timers: prevent signal delivery starvation
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+The integer divisions in the timer accounting code can round the result
+down to 0.  Adding 0 is without effect and the signal delivery stops.
+
+Clamp the division result to minimum 1 to avoid this.
+
+Problem was reported by Seongbae Park <spark@google.com>, who provided
+also an initial patch.
+
+Roland sayeth:
+
+  I have had some more time to think about the problem, and to reproduce it
+  using Toyo's test case.  For the record, if my understanding of the problem
+  is correct, this happens only in one very particular case.  First, the
+  expiry time has to be so soon that in cputime_t units (usually 1s/HZ ticks)
+  it's < nthreads so the division yields zero.  Second, it only affects each
+  thread that is so new that its CPU time accumulation is zero so now+0 is
+  still zero and ->it_*_expires winds up staying zero.  For the VIRT and PROF
+  clocks when cputime_t is tick granularity (or the SCHED clock on
+  configurations where sched_clock's value only advances on clock ticks), this
+  is not hard to arrange with new threads starting up and blocking before they
+  accumulate a whole tick of CPU time.  That's what happens in Toyo's test
+  case.
+
+  Note that in general it is fine for that division to round down to zero,
+  and set each thread's expiry time to its "now" time.  The problem only
+  arises with threads whose "now" value is still zero, so that now+0 winds up
+  0 and is interpreted as "not set" instead of ">= now".  So it would be a
+  sufficient and more precise fix to just use max(ticks, 1) inside the loop
+  when setting each it_*_expires value.
+
+  But, it does no harm to round the division up to one and always advance
+  every thread's expiry time.  If the thread didn't already fire timers for
+  the expiry time of "now", there is no expectation that it will do so before
+  the next tick anyway.  So I followed Thomas's patch in lifting the max out
+  of the loops.
+
+  This patch also covers the reload cases, which are harder to write a test
+  for (and I didn't try).  I've tested it with Toyo's case and it fixes that.
+
+
+[toyoa@mvista.com: fix: min_t -> max_t]
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Roland McGrath <roland@redhat.com>
+Cc: Daniel Walker <dwalker@mvista.com>
+Cc: Toyo Abe <toyoa@mvista.com>
+Cc: john stultz <johnstul@us.ibm.com>
+Cc: Roman Zippel <zippel@linux-m68k.org>
+Cc: Seongbae Park <spark@google.com>
+Cc: Peter Mattis <pmattis@google.com>
+Cc: Rohit Seth <rohitseth@google.com>
+Cc: Martin Bligh <mbligh@google.com>
+Cc: <stable@kernel.org>
+Signed-off-by: Andrew Morton <akpm@osdl.org>
+Signed-off-by: Chris Wright <chrisw@sous-sol.org>
+---
+
+ kernel/posix-cpu-timers.c |   27 +++++++++++++++++++++------
+ 1 file changed, 21 insertions(+), 6 deletions(-)
+
+--- linux-2.6.18.1.orig/kernel/posix-cpu-timers.c
++++ linux-2.6.18.1/kernel/posix-cpu-timers.c
+@@ -88,6 +88,19 @@ static inline union cpu_time_count cpu_t
+ }
+ /*
++ * Divide and limit the result to res >= 1
++ *
++ * This is necessary to prevent signal delivery starvation, when the result of
++ * the division would be rounded down to 0.
++ */
++static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div)
++{
++      cputime_t res = cputime_div(time, div);
++
++      return max_t(cputime_t, res, 1);
++}
++
++/*
+  * Update expiry time from increment, and increase overrun count,
+  * given the current clock sample.
+  */
+@@ -483,8 +496,8 @@ static void process_timer_rebalance(stru
+               BUG();
+               break;
+       case CPUCLOCK_PROF:
+-              left = cputime_div(cputime_sub(expires.cpu, val.cpu),
+-                                 nthreads);
++              left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
++                                     nthreads);
+               do {
+                       if (likely(!(t->flags & PF_EXITING))) {
+                               ticks = cputime_add(prof_ticks(t), left);
+@@ -498,8 +511,8 @@ static void process_timer_rebalance(stru
+               } while (t != p);
+               break;
+       case CPUCLOCK_VIRT:
+-              left = cputime_div(cputime_sub(expires.cpu, val.cpu),
+-                                 nthreads);
++              left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
++                                     nthreads);
+               do {
+                       if (likely(!(t->flags & PF_EXITING))) {
+                               ticks = cputime_add(virt_ticks(t), left);
+@@ -515,6 +528,7 @@ static void process_timer_rebalance(stru
+       case CPUCLOCK_SCHED:
+               nsleft = expires.sched - val.sched;
+               do_div(nsleft, nthreads);
++              nsleft = max_t(unsigned long long, nsleft, 1);
+               do {
+                       if (likely(!(t->flags & PF_EXITING))) {
+                               ns = t->sched_time + nsleft;
+@@ -1159,12 +1173,13 @@ static void check_process_timers(struct 
+               prof_left = cputime_sub(prof_expires, utime);
+               prof_left = cputime_sub(prof_left, stime);
+-              prof_left = cputime_div(prof_left, nthreads);
++              prof_left = cputime_div_non_zero(prof_left, nthreads);
+               virt_left = cputime_sub(virt_expires, utime);
+-              virt_left = cputime_div(virt_left, nthreads);
++              virt_left = cputime_div_non_zero(virt_left, nthreads);
+               if (sched_expires) {
+                       sched_left = sched_expires - sched_time;
+                       do_div(sched_left, nthreads);
++                      sched_left = max_t(unsigned long long, sched_left, 1);
+               } else {
+                       sched_left = 0;
+               }
diff --git a/queue-2.6.18/reintroduce-nodes_span_other_nodes-for-powerpc.patch b/queue-2.6.18/reintroduce-nodes_span_other_nodes-for-powerpc.patch
new file mode 100644 (file)
index 0000000..27f45a2
--- /dev/null
@@ -0,0 +1,85 @@
+From 7516795739bd53175629b90fab0ad488d7a6a9f7 Mon Sep 17 00:00:00 2001
+From: Andy Whitcroft <apw@shadowen.org>
+Date: Sat, 21 Oct 2006 10:24:14 -0700
+Subject: Reintroduce NODES_SPAN_OTHER_NODES for powerpc
+
+Revert "[PATCH] Remove SPAN_OTHER_NODES config definition"
+    This reverts commit f62859bb6871c5e4a8e591c60befc8caaf54db8c.
+Revert "[PATCH] mm: remove arch independent NODES_SPAN_OTHER_NODES"
+    This reverts commit a94b3ab7eab4edcc9b2cb474b188f774c331adf7.
+
+Also update the comments to indicate that this is still required
+and where it's used.
+
+Signed-off-by: Andy Whitcroft <apw@shadowen.org>
+Cc: Paul Mackerras <paulus@samba.org>
+Cc: Mike Kravetz <kravetz@us.ibm.com>
+Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Acked-by: Mel Gorman <mel@csn.ul.ie>
+Acked-by: Will Schmidt <will_schmidt@vnet.ibm.com>
+Cc: Christoph Lameter <clameter@sgi.com>
+Cc: <stable@kernel.org>
+Signed-off-by: Andrew Morton <akpm@osdl.org>
+Signed-off-by: Linus Torvalds <torvalds@osdl.org>
+Signed-off-by: Chris Wright <chrisw@sous-sol.org>
+---
+ arch/powerpc/Kconfig                   |    9 +++++++++
+ arch/powerpc/configs/pseries_defconfig |    1 +
+ include/linux/mmzone.h                 |    6 ++++++
+ mm/page_alloc.c                        |    2 ++
+ 4 files changed, 18 insertions(+)
+
+--- linux-2.6.18.1.orig/arch/powerpc/Kconfig
++++ linux-2.6.18.1/arch/powerpc/Kconfig
+@@ -729,6 +729,15 @@ config ARCH_MEMORY_PROBE
+       def_bool y
+       depends on MEMORY_HOTPLUG
++# Some NUMA nodes have memory ranges that span
++# other nodes.  Even though a pfn is valid and
++# between a node's start and end pfns, it may not
++# reside on that node.  See memmap_init_zone()
++# for details.
++config NODES_SPAN_OTHER_NODES
++      def_bool y
++      depends on NEED_MULTIPLE_NODES
++
+ config PPC_64K_PAGES
+       bool "64k page size"
+       depends on PPC64
+--- linux-2.6.18.1.orig/arch/powerpc/configs/pseries_defconfig
++++ linux-2.6.18.1/arch/powerpc/configs/pseries_defconfig
+@@ -184,6 +184,7 @@ CONFIG_SPLIT_PTLOCK_CPUS=4
+ CONFIG_MIGRATION=y
+ CONFIG_RESOURCES_64BIT=y
+ CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
++CONFIG_NODES_SPAN_OTHER_NODES=y
+ # CONFIG_PPC_64K_PAGES is not set
+ CONFIG_SCHED_SMT=y
+ CONFIG_PROC_DEVICETREE=y
+--- linux-2.6.18.1.orig/include/linux/mmzone.h
++++ linux-2.6.18.1/include/linux/mmzone.h
+@@ -632,6 +632,12 @@ void sparse_init(void);
+ #define sparse_index_init(_sec, _nid)  do {} while (0)
+ #endif /* CONFIG_SPARSEMEM */
++#ifdef CONFIG_NODES_SPAN_OTHER_NODES
++#define early_pfn_in_nid(pfn, nid)    (early_pfn_to_nid(pfn) == (nid))
++#else
++#define early_pfn_in_nid(pfn, nid)    (1)
++#endif
++
+ #ifndef early_pfn_valid
+ #define early_pfn_valid(pfn)  (1)
+ #endif
+--- linux-2.6.18.1.orig/mm/page_alloc.c
++++ linux-2.6.18.1/mm/page_alloc.c
+@@ -1673,6 +1673,8 @@ void __meminit memmap_init_zone(unsigned
+       for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+               if (!early_pfn_valid(pfn))
+                       continue;
++              if (!early_pfn_in_nid(pfn, nid))
++                      continue;
+               page = pfn_to_page(pfn);
+               set_page_links(page, zone, nid, pfn);
+               init_page_count(page);
diff --git a/queue-2.6.18/rtc-max6902-month-conversion-fix.patch b/queue-2.6.18/rtc-max6902-month-conversion-fix.patch
new file mode 100644 (file)
index 0000000..8b3a0ff
--- /dev/null
@@ -0,0 +1,39 @@
+From stable-bounces@linux.kernel.org  Tue Oct 17 00:12:18 2006
+Date: Tue, 17 Oct 2006 00:09:53 -0700
+From: akpm@osdl.org
+To: torvalds@osdl.org
+Cc: akpm@osdl.org, a.zummo@towertech.it, flarramendi@gmail.com, raph@raphnet.net, stable@kernel.org
+Subject: rtc-max6902: month conversion fix
+
+From: Francisco Larramendi <flarramendi@gmail.com>
+
+Fix October-only BCD-to-binary conversion bug:
+
+       0x08 -> 7
+       0x09 -> 8
+       0x10 -> 15 (!)
+       0x11 -> 19
+
+Fixes http://bugzilla.kernel.org/show_bug.cgi?id=7361
+
+Cc: Raphael Assenat <raph@raphnet.net>
+Cc: Alessandro Zummo <a.zummo@towertech.it>
+Cc: <stable@kernel.org>
+Signed-off-by: Andrew Morton <akpm@osdl.org>
+Signed-off-by: Chris Wright <chrisw@sous-sol.org>
+---
+
+ drivers/rtc/rtc-max6902.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- linux-2.6.18.1.orig/drivers/rtc/rtc-max6902.c
++++ linux-2.6.18.1/drivers/rtc/rtc-max6902.c
+@@ -137,7 +137,7 @@ static int max6902_get_datetime(struct d
+       dt->tm_min      = BCD2BIN(chip->buf[2]);
+       dt->tm_hour     = BCD2BIN(chip->buf[3]);
+       dt->tm_mday     = BCD2BIN(chip->buf[4]);
+-      dt->tm_mon      = BCD2BIN(chip->buf[5] - 1);
++      dt->tm_mon      = BCD2BIN(chip->buf[5]) - 1;
+       dt->tm_wday     = BCD2BIN(chip->buf[6]);
+       dt->tm_year = BCD2BIN(chip->buf[7]);
index 85fb75e1645112679bf59694bb4f13e37a902a8c..b9d16f4fd1177f597e3c7680635169c4266a81c2 100644 (file)
@@ -44,3 +44,17 @@ uml-remove-warnings-added-by-previous-stable-patch.patch
 alsa-snd_rtctimer-handle-rtc-interrupts-with-a-tasklet.patch
 watchdog-sc1200wdt-fix-missing-pnp_unregister_driver.patch
 fix-intel-rng-detection.patch
+posix-cpu-timers-prevent-signal-delivery-starvation.patch
+rtc-max6902-month-conversion-fix.patch
+isdn-fix-drivers-by-handling-errors-thrown-by-readstat.patch
+sparc64-fix-pci-memory-space-root-resource-on-hummingbird.patch
+pci-remove-quirk_via_abnormal_poweroff.patch
+reintroduce-nodes_span_other_nodes-for-powerpc.patch
+nfs-nfs_lookup-don-t-hash-dentry-when-optimising-away-the-lookup.patch
+vmscan-fix-temp_priority-race.patch
+use-min-of-two-prio-settings-in-calculating-distress-for-reclaim.patch
+fill_tgid-fix-task_struct-leak-and-possible-oops.patch
+jmb-368-pata-detection.patch
+tcp-cubic-scaling-error.patch
+ipv6-fix-lockup-via-proc-net-ip6_flowlabel.patch
+check-bio-address-after-mapping-through-partitions.patch
diff --git a/queue-2.6.18/sparc64-fix-pci-memory-space-root-resource-on-hummingbird.patch b/queue-2.6.18/sparc64-fix-pci-memory-space-root-resource-on-hummingbird.patch
new file mode 100644 (file)
index 0000000..8d35b58
--- /dev/null
@@ -0,0 +1,140 @@
+From stable-bounces@linux.kernel.org  Wed Oct 18 13:40:37 2006
+Date: Wed, 18 Oct 2006 13:38:49 -0700 (PDT)
+From: David Miller <davem@davemloft.net>
+To: stable@kernel.org
+Subject: SPARC64: Fix PCI memory space root resource on Hummingbird.
+
+For Hummingbird PCI controllers, we should create the root
+PCI memory space resource as the full 4GB area, and then
+allocate the IOMMU DMA translation window out of there.
+
+The old code just assumed that the region from the IOMMU DMA
+translation base to the top of the 4GB area was unusable.  This is
+not true on many systems such as SB100 and SB150, where the IOMMU
+DMA translation window sits at 0xc0000000->0xdfffffff.
+
+So what would happen is that any device mapped by the firmware
+at the top section 0xe0000000->0xffffffff would get remapped
+by Linux somewhere else leading to all kinds of problems and
+boot failures.
+
+While we're here, report more cases of OBP resource assignment
+conflicts.  The only truly valid ones are ROM resource conflicts.
+
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Chris Wright <chrisw@sous-sol.org>
+---
+ arch/sparc64/kernel/pci_common.c |   29 ++++++++++-------------------
+ arch/sparc64/kernel/pci_sabre.c  |   23 +++++++++++++++++++----
+ 2 files changed, 29 insertions(+), 23 deletions(-)
+
+--- linux-2.6.18.1.orig/arch/sparc64/kernel/pci_common.c
++++ linux-2.6.18.1/arch/sparc64/kernel/pci_common.c
+@@ -330,19 +330,6 @@ __init get_device_resource(struct linux_
+       return res;
+ }
+-static int __init pdev_resource_collisions_expected(struct pci_dev *pdev)
+-{
+-      if (pdev->vendor != PCI_VENDOR_ID_SUN)
+-              return 0;
+-
+-      if (pdev->device == PCI_DEVICE_ID_SUN_RIO_EBUS ||
+-          pdev->device == PCI_DEVICE_ID_SUN_RIO_1394 ||
+-          pdev->device == PCI_DEVICE_ID_SUN_RIO_USB)
+-              return 1;
+-
+-      return 0;
+-}
+-
+ static void __init pdev_record_assignments(struct pci_pbm_info *pbm,
+                                          struct pci_dev *pdev)
+ {
+@@ -400,19 +387,23 @@ static void __init pdev_record_assignmen
+               pbm->parent->resource_adjust(pdev, res, root);
+               if (request_resource(root, res) < 0) {
++                      int rnum;
++
+                       /* OK, there is some conflict.  But this is fine
+                        * since we'll reassign it in the fixup pass.
+                        *
+-                       * We notify the user that OBP made an error if it
+-                       * is a case we don't expect.
++                       * Do not print the warning for ROM resources
++                       * as such a conflict is quite common and
++                       * harmless as the ROM bar is disabled.
+                        */
+-                      if (!pdev_resource_collisions_expected(pdev)) {
+-                              printk(KERN_ERR "PCI: Address space collision on region %ld "
++                      rnum = (res - &pdev->resource[0]);
++                      if (rnum != PCI_ROM_RESOURCE)
++                              printk(KERN_ERR "PCI: Resource collision, "
++                                     "region %d "
+                                      "[%016lx:%016lx] of device %s\n",
+-                                     (res - &pdev->resource[0]),
++                                     rnum,
+                                      res->start, res->end,
+                                      pci_name(pdev));
+-                      }
+               }
+       }
+ }
+--- linux-2.6.18.1.orig/arch/sparc64/kernel/pci_sabre.c
++++ linux-2.6.18.1/arch/sparc64/kernel/pci_sabre.c
+@@ -1196,7 +1196,7 @@ static void pbm_register_toplevel_resour
+                                           &pbm->mem_space);
+ }
+-static void sabre_pbm_init(struct pci_controller_info *p, struct device_node *dp, u32 dma_begin)
++static void sabre_pbm_init(struct pci_controller_info *p, struct device_node *dp, u32 dma_start, u32 dma_end)
+ {
+       struct pci_pbm_info *pbm;
+       struct device_node *node;
+@@ -1261,6 +1261,8 @@ static void sabre_pbm_init(struct pci_co
+               node = node->sibling;
+       }
+       if (simbas_found == 0) {
++              struct resource *rp;
++
+               /* No APBs underneath, probably this is a hummingbird
+                * system.
+                */
+@@ -1302,8 +1304,10 @@ static void sabre_pbm_init(struct pci_co
+               pbm->io_space.end   = pbm->io_space.start + (1UL << 24) - 1UL;
+               pbm->io_space.flags = IORESOURCE_IO;
+-              pbm->mem_space.start = p->pbm_A.controller_regs + SABRE_MEMSPACE;
+-              pbm->mem_space.end   = pbm->mem_space.start + (unsigned long)dma_begin - 1UL;
++              pbm->mem_space.start =
++                      (p->pbm_A.controller_regs + SABRE_MEMSPACE);
++              pbm->mem_space.end =
++                      (pbm->mem_space.start + ((1UL << 32UL) - 1UL));
+               pbm->mem_space.flags = IORESOURCE_MEM;
+               if (request_resource(&ioport_resource, &pbm->io_space) < 0) {
+@@ -1315,6 +1319,17 @@ static void sabre_pbm_init(struct pci_co
+                       prom_halt();
+               }
++              rp = kmalloc(sizeof(*rp), GFP_KERNEL);
++              if (!rp) {
++                      prom_printf("Cannot allocate IOMMU resource.\n");
++                      prom_halt();
++              }
++              rp->name = "IOMMU";
++              rp->start = pbm->mem_space.start + (unsigned long) dma_start;
++              rp->end = pbm->mem_space.start + (unsigned long) dma_end - 1UL;
++              rp->flags = IORESOURCE_BUSY;
++              request_resource(&pbm->mem_space, rp);
++
+               pci_register_legacy_regions(&pbm->io_space,
+                                           &pbm->mem_space);
+       }
+@@ -1450,5 +1465,5 @@ void sabre_init(struct device_node *dp, 
+       /*
+        * Look for APB underneath.
+        */
+-      sabre_pbm_init(p, dp, vdma[0]);
++      sabre_pbm_init(p, dp, vdma[0], vdma[0] + vdma[1]);
+ }
diff --git a/queue-2.6.18/tcp-cubic-scaling-error.patch b/queue-2.6.18/tcp-cubic-scaling-error.patch
new file mode 100644 (file)
index 0000000..ee19436
--- /dev/null
@@ -0,0 +1,56 @@
+From stable-bounces@linux.kernel.org  Mon Oct 30 14:50:53 2006
+Date: Mon, 30 Oct 2006 14:47:35 -0800
+From: Stephen Hemminger <shemminger@osdl.org>
+To: stable@kernel.org
+Subject: tcp: cubic scaling error
+
+Doug Leith observed a discrepancy between the version of CUBIC described
+in the papers and the version in 2.6.18. A math error related to scaling
+causes Cubic to grow too slowly.
+
+Patch is from "Sangtae Ha" <sha2@ncsu.edu>. I validated that
+it does fix the problems.
+
+See the following graphs showing behavior over a 500ms, 100 Mbit link.
+
+Sender (2.6.19-rc3) ---  Bridge (2.6.18-rt7) ------- Receiver (2.6.19-rc3)
+                    1G      [netem]           100M
+
+       http://developer.osdl.org/shemminger/tcp/2.6.19-rc3/cubic-orig.png
+       http://developer.osdl.org/shemminger/tcp/2.6.19-rc3/cubic-fix.png
+
+Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
+Signed-off-by: Chris Wright <chrisw@sous-sol.org>
+---
+ net/ipv4/tcp_cubic.c |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- linux-2.6.18.1.orig/net/ipv4/tcp_cubic.c
++++ linux-2.6.18.1/net/ipv4/tcp_cubic.c
+@@ -190,7 +190,7 @@ static inline void bictcp_update(struct 
+          */
+       /* change the unit from HZ to bictcp_HZ */
+-        t = ((tcp_time_stamp + ca->delay_min - ca->epoch_start)
++        t = ((tcp_time_stamp + (ca->delay_min>>3) - ca->epoch_start)
+            << BICTCP_HZ) / HZ;
+         if (t < ca->bic_K)            /* t - K */
+@@ -259,7 +259,7 @@ static inline void measure_delay(struct 
+           (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
+               return;
+-      delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
++      delay = (tcp_time_stamp - tp->rx_opt.rcv_tsecr)<<3;
+       if (delay == 0)
+               delay = 1;
+@@ -366,7 +366,7 @@ static int __init cubictcp_register(void
+       beta_scale = 8*(BICTCP_BETA_SCALE+beta)/ 3 / (BICTCP_BETA_SCALE - beta);
+-      cube_rtt_scale = (bic_scale << 3) / 10; /* 1024*c/rtt */
++      cube_rtt_scale = (bic_scale * 10);      /* 1024*c/rtt */
+       /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
+        *  so K = cubic_root( (wmax-cwnd)*rtt/c )
diff --git a/queue-2.6.18/use-min-of-two-prio-settings-in-calculating-distress-for-reclaim.patch b/queue-2.6.18/use-min-of-two-prio-settings-in-calculating-distress-for-reclaim.patch
new file mode 100644 (file)
index 0000000..62b4478
--- /dev/null
@@ -0,0 +1,67 @@
+From bbdb396a60b2ebf7de3b717991e5d3e28c8b7bbd Mon Sep 17 00:00:00 2001
+From: Martin Bligh <mbligh@google.com>
+Date: Sat, 28 Oct 2006 10:38:25 -0700
+Subject: Use min of two prio settings in calculating distress for reclaim
+
+If try_to_free_pages / balance_pgdat are called with a gfp_mask specifying
+GFP_IO and/or GFP_FS, they will reclaim the requisite number of pages, and then
+reset prev_priority to DEF_PRIORITY (or to some other high (ie: unurgent)
+value).
+
+However, another reclaimer without those gfp_mask flags set (say, GFP_NOIO)
+may still be struggling to reclaim pages.  The concurrent overwrite of
+zone->prev_priority will cause this GFP_NOIO thread to unexpectedly cease
+deactivating mapped pages, thus causing reclaim difficulties.
+
+The fix is to key the distress calculation not only off zone->prev_priority, but
+also to take into account the local caller's priority by using
+min(zone->prev_priority, sc->priority)
+
+Signed-off-by: Martin J. Bligh <mbligh@google.com>
+Cc: Nick Piggin <nickpiggin@yahoo.com.au>
+Cc: <stable@kernel.org>
+Signed-off-by: Andrew Morton <akpm@osdl.org>
+Signed-off-by: Linus Torvalds <torvalds@osdl.org>
+Signed-off-by: Chris Wright <chrisw@sous-sol.org>
+---
+ mm/vmscan.c |    8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+--- linux-2.6.18.1.orig/mm/vmscan.c
++++ linux-2.6.18.1/mm/vmscan.c
+@@ -727,7 +727,7 @@ static inline void note_zone_scanning_pr
+  * But we had to alter page->flags anyway.
+  */
+ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
+-                              struct scan_control *sc)
++                              struct scan_control *sc, int priority)
+ {
+       unsigned long pgmoved;
+       int pgdeactivate = 0;
+@@ -748,7 +748,7 @@ static void shrink_active_list(unsigned 
+                * `distress' is a measure of how much trouble we're having
+                * reclaiming pages.  0 -> no problems.  100 -> great trouble.
+                */
+-              distress = 100 >> zone->prev_priority;
++              distress = 100 >> min(zone->prev_priority, priority);
+               /*
+                * The point of this algorithm is to decide when to start
+@@ -899,7 +899,7 @@ static unsigned long shrink_zone(int pri
+                       nr_to_scan = min(nr_active,
+                                       (unsigned long)sc->swap_cluster_max);
+                       nr_active -= nr_to_scan;
+-                      shrink_active_list(nr_to_scan, zone, sc);
++                      shrink_active_list(nr_to_scan, zone, sc, priority);
+               }
+               if (nr_inactive) {
+@@ -1341,7 +1341,7 @@ static unsigned long shrink_all_zones(un
+                       if (zone->nr_scan_active >= nr_pages || pass > 3) {
+                               zone->nr_scan_active = 0;
+                               nr_to_scan = min(nr_pages, zone->nr_active);
+-                              shrink_active_list(nr_to_scan, zone, sc);
++                              shrink_active_list(nr_to_scan, zone, sc, prio);
+                       }
+               }
diff --git a/queue-2.6.18/vmscan-fix-temp_priority-race.patch b/queue-2.6.18/vmscan-fix-temp_priority-race.patch
new file mode 100644 (file)
index 0000000..593778d
--- /dev/null
@@ -0,0 +1,221 @@
+From 3bb1a852ab6c9cdf211a2f4a2f502340c8c38eca Mon Sep 17 00:00:00 2001
+From: Martin Bligh <mbligh@mbligh.org>
+Date: Sat, 28 Oct 2006 10:38:24 -0700
+Subject: vmscan: Fix temp_priority race
+
+The temp_priority field in zone is racy, as we can walk through a reclaim
+path, and just before we copy it into prev_priority, it can be overwritten
+(say with DEF_PRIORITY) by another reclaimer.
+
+The same bug is contained in both try_to_free_pages and balance_pgdat, but
+it is fixed slightly differently.  In balance_pgdat, we keep a separate
+priority record per zone in a local array.  In try_to_free_pages there is
+no need to do this, as the priority level is the same for all zones that we
+reclaim from.
+
+Impact of this bug is that temp_priority is copied into prev_priority, and
+setting this artificially high causes reclaimers to set distress
+artificially low.  They then fail to reclaim mapped pages, when they are,
+in fact, under severe memory pressure (their priority may be as low as 0).
+This causes the OOM killer to fire incorrectly.
+
+From: Andrew Morton <akpm@osdl.org>
+
+__zone_reclaim() isn't modifying zone->prev_priority.  But zone->prev_priority
+is used in the decision whether or not to bring mapped pages onto the inactive
+list.  Hence there's a risk here that __zone_reclaim() will fail because
+zone->prev_priority is large (ie: low urgency) and lots of mapped pages end up
+stuck on the active list.
+
+Fix that up by decreasing (ie making more urgent) zone->prev_priority as
+__zone_reclaim() scans the zone's pages.
+
+This bug perhaps explains why ZONE_RECLAIM_PRIORITY was created.  It should be
+possible to remove that now, and to just start out at DEF_PRIORITY?
+
+Cc: Nick Piggin <nickpiggin@yahoo.com.au>
+Cc: Christoph Lameter <clameter@engr.sgi.com>
+Cc: <stable@kernel.org>
+Signed-off-by: Andrew Morton <akpm@osdl.org>
+Signed-off-by: Linus Torvalds <torvalds@osdl.org>
+Signed-off-by: Chris Wright <chrisw@sous-sol.org>
+[chrisw: minor wiggle to fit -stable]
+---
+ include/linux/mmzone.h |    6 -----
+ mm/page_alloc.c        |    2 -
+ mm/vmscan.c            |   55 ++++++++++++++++++++++++++++++++++++-------------
+ mm/vmstat.c            |    2 -
+ 4 files changed, 43 insertions(+), 22 deletions(-)
+
+--- linux-2.6.18.1.orig/include/linux/mmzone.h
++++ linux-2.6.18.1/include/linux/mmzone.h
+@@ -200,13 +200,9 @@ struct zone {
+        * under - it drives the swappiness decision: whether to unmap mapped
+        * pages.
+        *
+-       * temp_priority is used to remember the scanning priority at which
+-       * this zone was successfully refilled to free_pages == pages_high.
+-       *
+-       * Access to both these fields is quite racy even on uniprocessor.  But
++       * Access to both this field is quite racy even on uniprocessor.  But
+        * it is expected to average out OK.
+        */
+-      int temp_priority;
+       int prev_priority;
+--- linux-2.6.18.1.orig/mm/page_alloc.c
++++ linux-2.6.18.1/mm/page_alloc.c
+@@ -2021,7 +2021,7 @@ static void __meminit free_area_init_cor
+               zone->zone_pgdat = pgdat;
+               zone->free_pages = 0;
+-              zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
++              zone->prev_priority = DEF_PRIORITY;
+               zone_pcp_init(zone);
+               INIT_LIST_HEAD(&zone->active_list);
+--- linux-2.6.18.1.orig/mm/vmscan.c
++++ linux-2.6.18.1/mm/vmscan.c
+@@ -696,6 +696,20 @@ done:
+ }
+ /*
++ * We are about to scan this zone at a certain priority level.  If that priority
++ * level is smaller (ie: more urgent) than the previous priority, then note
++ * that priority level within the zone.  This is done so that when the next
++ * process comes in to scan this zone, it will immediately start out at this
++ * priority level rather than having to build up its own scanning priority.
++ * Here, this priority affects only the reclaim-mapped threshold.
++ */
++static inline void note_zone_scanning_priority(struct zone *zone, int priority)
++{
++      if (priority < zone->prev_priority)
++              zone->prev_priority = priority;
++}
++
++/*
+  * This moves pages from the active list to the inactive list.
+  *
+  * We move them the other way if the page is referenced by one or more
+@@ -934,9 +948,7 @@ static unsigned long shrink_zones(int pr
+               if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+                       continue;
+-              zone->temp_priority = priority;
+-              if (zone->prev_priority > priority)
+-                      zone->prev_priority = priority;
++              note_zone_scanning_priority(zone, priority);
+               if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+                       continue;       /* Let kswapd poll it */
+@@ -984,7 +996,6 @@ unsigned long try_to_free_pages(struct z
+               if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+                       continue;
+-              zone->temp_priority = DEF_PRIORITY;
+               lru_pages += zone->nr_active + zone->nr_inactive;
+       }
+@@ -1022,13 +1033,22 @@ unsigned long try_to_free_pages(struct z
+                       blk_congestion_wait(WRITE, HZ/10);
+       }
+ out:
++      /*
++       * Now that we've scanned all the zones at this priority level, note
++       * that level within the zone so that the next thread which performs
++       * scanning of this zone will immediately start out at this priority
++       * level.  This affects only the decision whether or not to bring
++       * mapped pages onto the inactive list.
++       */
++      if (priority < 0)
++              priority = 0;
+       for (i = 0; zones[i] != 0; i++) {
+               struct zone *zone = zones[i];
+               if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+                       continue;
+-              zone->prev_priority = zone->temp_priority;
++              zone->prev_priority = priority;
+       }
+       return ret;
+ }
+@@ -1068,6 +1088,11 @@ static unsigned long balance_pgdat(pg_da
+               .swap_cluster_max = SWAP_CLUSTER_MAX,
+               .swappiness = vm_swappiness,
+       };
++      /*
++       * temp_priority is used to remember the scanning priority at which
++       * this zone was successfully refilled to free_pages == pages_high.
++       */
++      int temp_priority[MAX_NR_ZONES];
+ loop_again:
+       total_scanned = 0;
+@@ -1075,11 +1100,8 @@ loop_again:
+       sc.may_writepage = !laptop_mode;
+       count_vm_event(PAGEOUTRUN);
+-      for (i = 0; i < pgdat->nr_zones; i++) {
+-              struct zone *zone = pgdat->node_zones + i;
+-
+-              zone->temp_priority = DEF_PRIORITY;
+-      }
++      for (i = 0; i < pgdat->nr_zones; i++)
++              temp_priority[i] = DEF_PRIORITY;
+       for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+               int end_zone = 0;       /* Inclusive.  0 = ZONE_DMA */
+@@ -1140,10 +1162,9 @@ scan:
+                       if (!zone_watermark_ok(zone, order, zone->pages_high,
+                                              end_zone, 0))
+                               all_zones_ok = 0;
+-                      zone->temp_priority = priority;
+-                      if (zone->prev_priority > priority)
+-                              zone->prev_priority = priority;
++                      temp_priority[i] = priority;
+                       sc.nr_scanned = 0;
++                      note_zone_scanning_priority(zone, priority);
+                       nr_reclaimed += shrink_zone(priority, zone, &sc);
+                       reclaim_state->reclaimed_slab = 0;
+                       nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
+@@ -1183,10 +1204,15 @@ scan:
+                       break;
+       }
+ out:
++      /*
++       * Note within each zone the priority level at which this zone was
++       * brought into a happy state.  So that the next thread which scans this
++       * zone will start out at that priority level.
++       */
+       for (i = 0; i < pgdat->nr_zones; i++) {
+               struct zone *zone = pgdat->node_zones + i;
+-              zone->prev_priority = zone->temp_priority;
++              zone->prev_priority = temp_priority[i];
+       }
+       if (!all_zones_ok) {
+               cond_resched();
+@@ -1570,6 +1596,7 @@ static int __zone_reclaim(struct zone *z
+                */
+               priority = ZONE_RECLAIM_PRIORITY;
+               do {
++                      note_zone_scanning_priority(zone, priority);
+                       nr_reclaimed += shrink_zone(priority, zone, &sc);
+                       priority--;
+               } while (priority >= 0 && nr_reclaimed < nr_pages);
+--- linux-2.6.18.1.orig/mm/vmstat.c
++++ linux-2.6.18.1/mm/vmstat.c
+@@ -586,11 +586,9 @@ static int zoneinfo_show(struct seq_file
+               seq_printf(m,
+                          "\n  all_unreclaimable: %u"
+                          "\n  prev_priority:     %i"
+-                         "\n  temp_priority:     %i"
+                          "\n  start_pfn:         %lu",
+                          zone->all_unreclaimable,
+                          zone->prev_priority,
+-                         zone->temp_priority,
+                          zone->zone_start_pfn);
+               spin_unlock_irqrestore(&zone->lock, flags);
+               seq_putc(m, '\n');