]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
more .30 patches
authorGreg Kroah-Hartman <gregkh@suse.de>
Tue, 30 Jun 2009 23:11:00 +0000 (16:11 -0700)
committerGreg Kroah-Hartman <gregkh@suse.de>
Tue, 30 Jun 2009 23:11:00 +0000 (16:11 -0700)
queue-2.6.30/config_file_locking-should-not-depend-on-config_block.patch [new file with mode: 0644]
queue-2.6.30/drm-i915-correct-suspend-resume-ordering.patch [new file with mode: 0644]
queue-2.6.30/ide-cd-prevent-null-pointer-deref-via-cdrom_newpc_intr.patch [new file with mode: 0644]
queue-2.6.30/kvm-x86-silence-preempt-warning-on-kvm_write_guest_time.patch [new file with mode: 0644]
queue-2.6.30/lib-genalloc.c-remove-unmatched-write_lock-in-gen_pool_destroy.patch [new file with mode: 0644]
queue-2.6.30/ocfs2-fix-ocfs2_osb_dump.patch [new file with mode: 0644]
queue-2.6.30/serial-bfin_5xx-fix-building-as-module-when-early-printk-is-enabled.patch [new file with mode: 0644]
queue-2.6.30/series
queue-2.6.30/vmscan-count-the-number-of-times-zone_reclaim-scans-and-fails.patch [new file with mode: 0644]
queue-2.6.30/vmscan-properly-account-for-the-number-of-page-cache-pages-zone_reclaim-can-reclaim.patch [new file with mode: 0644]

diff --git a/queue-2.6.30/config_file_locking-should-not-depend-on-config_block.patch b/queue-2.6.30/config_file_locking-should-not-depend-on-config_block.patch
new file mode 100644 (file)
index 0000000..023a852
--- /dev/null
@@ -0,0 +1,52 @@
+From 69050eee8e08a6234f29fe71a56f8c7c7d4d7186 Mon Sep 17 00:00:00 2001
+From: Tomas Szepe <szepe@pinerecords.com>
+Date: Tue, 16 Jun 2009 15:33:56 -0700
+Subject: CONFIG_FILE_LOCKING should not depend on CONFIG_BLOCK
+
+From: Tomas Szepe <szepe@pinerecords.com>
+
+commit 69050eee8e08a6234f29fe71a56f8c7c7d4d7186 upstream.
+
+CONFIG_FILE_LOCKING should not depend on CONFIG_BLOCK.
+
+This makes it possible to run complete systems out of a CONFIG_BLOCK=n
+initramfs on current kernels again (this last worked on 2.6.27.*).
+
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/Kconfig |   14 +++++++-------
+ 1 file changed, 7 insertions(+), 7 deletions(-)
+
+--- a/fs/Kconfig
++++ b/fs/Kconfig
+@@ -39,6 +39,13 @@ config FS_POSIX_ACL
+       bool
+       default n
++source "fs/xfs/Kconfig"
++source "fs/gfs2/Kconfig"
++source "fs/ocfs2/Kconfig"
++source "fs/btrfs/Kconfig"
++
++endif # BLOCK
++
+ config FILE_LOCKING
+       bool "Enable POSIX file locking API" if EMBEDDED
+       default y
+@@ -47,13 +54,6 @@ config FILE_LOCKING
+           for filesystems like NFS and for the flock() system
+           call. Disabling this option saves about 11k.
+-source "fs/xfs/Kconfig"
+-source "fs/gfs2/Kconfig"
+-source "fs/ocfs2/Kconfig"
+-source "fs/btrfs/Kconfig"
+-
+-endif # BLOCK
+-
+ source "fs/notify/Kconfig"
+ source "fs/quota/Kconfig"
diff --git a/queue-2.6.30/drm-i915-correct-suspend-resume-ordering.patch b/queue-2.6.30/drm-i915-correct-suspend-resume-ordering.patch
new file mode 100644 (file)
index 0000000..8ac2ba8
--- /dev/null
@@ -0,0 +1,48 @@
+From 9e06dd39f2b6d7e35981e0d7aded618686b32ccb Mon Sep 17 00:00:00 2001
+From: Jesse Barnes <jbarnes@virtuousgeek.org>
+Date: Mon, 22 Jun 2009 18:05:12 -0700
+Subject: drm/i915: correct suspend/resume ordering
+
+From: Jesse Barnes <jbarnes@virtuousgeek.org>
+
+commit 9e06dd39f2b6d7e35981e0d7aded618686b32ccb upstream.
+
+We need to save register state *after* idling GEM, clearing the ring,
+and uninstalling the IRQ handler, or we might end up saving bogus
+fence regs, for one.  Our restore ordering should already be correct,
+since we do GEM, ring and IRQ init after restoring the last register
+state, which prevents us from clobbering things.
+
+I put this together to potentially address a bug, but I haven't heard
+back if it fixes it yet.  However I think it stands on its own, so I'm
+sending it in.
+
+Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
+Signed-off-by: Eric Anholt <eric@anholt.net>
+Cc: Jie Luo <clotho67@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/gpu/drm/i915/i915_drv.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/gpu/drm/i915/i915_drv.c
++++ b/drivers/gpu/drm/i915/i915_drv.c
+@@ -67,8 +67,6 @@ static int i915_suspend(struct drm_devic
+       pci_save_state(dev->pdev);
+-      i915_save_state(dev);
+-
+       /* If KMS is active, we do the leavevt stuff here */
+       if (drm_core_check_feature(dev, DRIVER_MODESET)) {
+               if (i915_gem_idle(dev))
+@@ -77,6 +75,8 @@ static int i915_suspend(struct drm_devic
+               drm_irq_uninstall(dev);
+       }
++      i915_save_state(dev);
++
+       intel_opregion_free(dev, 1);
+       if (state.event == PM_EVENT_SUSPEND) {
diff --git a/queue-2.6.30/ide-cd-prevent-null-pointer-deref-via-cdrom_newpc_intr.patch b/queue-2.6.30/ide-cd-prevent-null-pointer-deref-via-cdrom_newpc_intr.patch
new file mode 100644 (file)
index 0000000..53f14b3
--- /dev/null
@@ -0,0 +1,44 @@
+From 39c58f37a10198054c656c28202fb1e6d22fd505 Mon Sep 17 00:00:00 2001
+From: Rainer Weikusat <rweikusat@mssgmbh.com>
+Date: Thu, 18 Jun 2009 17:04:00 +0200
+Subject: ide-cd: prevent null pointer deref via cdrom_newpc_intr
+
+From: Rainer Weikusat <rweikusat@mssgmbh.com>
+
+commit 39c58f37a10198054c656c28202fb1e6d22fd505 upstream.
+
+With 2.6.30, the error handling code in cdrom_newpc_intr was changed
+to deal with partial request failures by normally completing the 'good'
+parts of a request and only 'error' the last (and presumably,
+incompletely transferred) bio associated with a particular
+request. In order to do this, ide_complete_rq is called over
+ide_cd_error_cmd() to partially complete the rq. The block layer
+does partial completion only for requests with bio's and if the
+rq doesn't have one (eg 'GPCMD_READ_DISC_INFO') the request is
+completed as a whole and the drive->hwif->rq pointer set to NULL
+afterwards. When calling ide_complete_rq again to report
+the error, this null pointer is dereferenced, resulting in a kernel
+crash.
+
+This fixes http://bugzilla.kernel.org/show_bug.cgi?id=13399.
+
+Signed-off-by: Rainer Weikusat <rweikusat@mssgmbh.com>
+Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
+Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/ide/ide-cd.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/ide/ide-cd.c
++++ b/drivers/ide/ide-cd.c
+@@ -758,7 +758,7 @@ out_end:
+                               rq->errors = -EIO;
+               }
+-              if (uptodate == 0)
++              if (uptodate == 0 && rq->bio)
+                       ide_cd_error_cmd(drive, cmd);
+               /* make sure it's fully ended */
diff --git a/queue-2.6.30/kvm-x86-silence-preempt-warning-on-kvm_write_guest_time.patch b/queue-2.6.30/kvm-x86-silence-preempt-warning-on-kvm_write_guest_time.patch
new file mode 100644 (file)
index 0000000..2b9fff8
--- /dev/null
@@ -0,0 +1,69 @@
+From 2dea4c84bc936731668b5a7a9fba5b436a422668 Mon Sep 17 00:00:00 2001
+From: Matt T. Yourst <yourst@users.sourceforge.net>
+Date: Tue, 24 Feb 2009 15:28:00 -0300
+Subject: KVM: x86: silence preempt warning on kvm_write_guest_time
+
+From: Matt T. Yourst <yourst@users.sourceforge.net>
+
+commit 2dea4c84bc936731668b5a7a9fba5b436a422668 upstream.
+
+This issue just appeared in kvm-84 when running on 2.6.28.7 (x86-64)
+with PREEMPT enabled.
+
+We're getting syslog warnings like this many (but not all) times qemu
+tells KVM to run the VCPU:
+
+BUG: using smp_processor_id() in preemptible [00000000] code:
+qemu-system-x86/28938
+caller is kvm_arch_vcpu_ioctl_run+0x5d1/0xc70 [kvm]
+Pid: 28938, comm: qemu-system-x86 2.6.28.7-mtyrel-64bit
+Call Trace:
+debug_smp_processor_id+0xf7/0x100
+kvm_arch_vcpu_ioctl_run+0x5d1/0xc70 [kvm]
+? __wake_up+0x4e/0x70
+? wake_futex+0x27/0x40
+kvm_vcpu_ioctl+0x2e9/0x5a0 [kvm]
+enqueue_hrtimer+0x8a/0x110
+_spin_unlock_irqrestore+0x27/0x50
+vfs_ioctl+0x31/0xa0
+do_vfs_ioctl+0x74/0x480
+sys_futex+0xb4/0x140
+sys_ioctl+0x99/0xa0
+system_call_fastpath+0x16/0x1b
+
+As it turns out, the call trace is messed up due to gcc's inlining, but
+I isolated the problem anyway: kvm_write_guest_time() is being used in a
+non-thread-safe manner on preemptable kernels.
+
+Basically kvm_write_guest_time()'s body needs to be surrounded by
+preempt_disable() and preempt_enable(), since the kernel won't let us
+query any per-CPU data (indirectly using smp_processor_id()) without
+preemption disabled. The attached patch fixes this issue by disabling
+preemption inside kvm_write_guest_time().
+
+[marcelo: surround only __get_cpu_var calls since the warning
+is harmless]
+
+Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
+Signed-off-by: Avi Kivity <avi@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ arch/x86/kvm/x86.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -634,10 +634,12 @@ static void kvm_write_guest_time(struct 
+       if ((!vcpu->time_page))
+               return;
++      preempt_disable();
+       if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) {
+               kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock);
+               vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz);
+       }
++      preempt_enable();
+       /* Keep irq disabled to prevent changes to the clock */
+       local_irq_save(flags);
diff --git a/queue-2.6.30/lib-genalloc.c-remove-unmatched-write_lock-in-gen_pool_destroy.patch b/queue-2.6.30/lib-genalloc.c-remove-unmatched-write_lock-in-gen_pool_destroy.patch
new file mode 100644 (file)
index 0000000..1b43bb5
--- /dev/null
@@ -0,0 +1,36 @@
+From 8e8a2dea0ca91fe2cb7de7ea212124cfe8c82c35 Mon Sep 17 00:00:00 2001
+From: Zygo Blaxell <zygo.blaxell@xandros.com>
+Date: Tue, 16 Jun 2009 15:33:57 -0700
+Subject: lib/genalloc.c: remove unmatched write_lock() in gen_pool_destroy
+
+From: Zygo Blaxell <zygo.blaxell@xandros.com>
+
+commit 8e8a2dea0ca91fe2cb7de7ea212124cfe8c82c35 upstream.
+
+There is a call to write_lock() in gen_pool_destroy which is not balanced
+by any corresponding write_unlock().  This causes problems with preemption
+because the preemption-disable counter is incremented in the write_lock()
+call, but never decremented by any call to write_unlock().  This bug is
+gen_pool_destroy, and one of them is non-x86 arch-specific code.
+
+Signed-off-by: Zygo Blaxell <zygo.blaxell@xandros.com>
+Cc: Jiri Kosina <trivial@kernel.org>
+Cc: Steve Wise <swise@opengridcomputing.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ lib/genalloc.c |    1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/lib/genalloc.c
++++ b/lib/genalloc.c
+@@ -85,7 +85,6 @@ void gen_pool_destroy(struct gen_pool *p
+       int bit, end_bit;
+-      write_lock(&pool->lock);
+       list_for_each_safe(_chunk, _next_chunk, &pool->chunks) {
+               chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk);
+               list_del(&chunk->next_chunk);
diff --git a/queue-2.6.30/ocfs2-fix-ocfs2_osb_dump.patch b/queue-2.6.30/ocfs2-fix-ocfs2_osb_dump.patch
new file mode 100644 (file)
index 0000000..f2961e3
--- /dev/null
@@ -0,0 +1,83 @@
+From c3d38840abaa45c1c5a5fabbb8ffc9a0d1a764d1 Mon Sep 17 00:00:00 2001
+From: Sunil Mushran <sunil.mushran@oracle.com>
+Date: Fri, 19 Jun 2009 14:45:55 -0700
+Subject: ocfs2: Fix ocfs2_osb_dump()
+
+From: Sunil Mushran <sunil.mushran@oracle.com>
+
+commit c3d38840abaa45c1c5a5fabbb8ffc9a0d1a764d1 upstream.
+
+Skip printing information that is not valid for local mounts.
+
+Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
+Signed-off-by: Joel Becker <joel.becker@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ocfs2/super.c |   32 ++++++++++++++++++--------------
+ 1 file changed, 18 insertions(+), 14 deletions(-)
+
+--- a/fs/ocfs2/super.c
++++ b/fs/ocfs2/super.c
+@@ -232,20 +232,24 @@ static int ocfs2_osb_dump(struct ocfs2_s
+                       "%10s => Opts: 0x%lX  AtimeQuanta: %u\n", "Mount",
+                       osb->s_mount_opt, osb->s_atime_quantum);
+-      out += snprintf(buf + out, len - out,
+-                      "%10s => Stack: %s  Name: %*s  Version: %d.%d\n",
+-                      "Cluster",
+-                      (*osb->osb_cluster_stack == '\0' ?
+-                       "o2cb" : osb->osb_cluster_stack),
+-                      cconn->cc_namelen, cconn->cc_name,
+-                      cconn->cc_version.pv_major, cconn->cc_version.pv_minor);
++      if (cconn) {
++              out += snprintf(buf + out, len - out,
++                              "%10s => Stack: %s  Name: %*s  "
++                              "Version: %d.%d\n", "Cluster",
++                              (*osb->osb_cluster_stack == '\0' ?
++                               "o2cb" : osb->osb_cluster_stack),
++                              cconn->cc_namelen, cconn->cc_name,
++                              cconn->cc_version.pv_major,
++                              cconn->cc_version.pv_minor);
++      }
+       spin_lock(&osb->dc_task_lock);
+       out += snprintf(buf + out, len - out,
+                       "%10s => Pid: %d  Count: %lu  WakeSeq: %lu  "
+                       "WorkSeq: %lu\n", "DownCnvt",
+-                      task_pid_nr(osb->dc_task), osb->blocked_lock_count,
+-                      osb->dc_wake_sequence, osb->dc_work_sequence);
++                      (osb->dc_task ?  task_pid_nr(osb->dc_task) : -1),
++                      osb->blocked_lock_count, osb->dc_wake_sequence,
++                      osb->dc_work_sequence);
+       spin_unlock(&osb->dc_task_lock);
+       spin_lock(&osb->osb_lock);
+@@ -265,14 +269,15 @@ static int ocfs2_osb_dump(struct ocfs2_s
+       out += snprintf(buf + out, len - out,
+                       "%10s => Pid: %d  Interval: %lu  Needs: %d\n", "Commit",
+-                      task_pid_nr(osb->commit_task), osb->osb_commit_interval,
++                      (osb->commit_task ? task_pid_nr(osb->commit_task) : -1),
++                      osb->osb_commit_interval,
+                       atomic_read(&osb->needs_checkpoint));
+       out += snprintf(buf + out, len - out,
+-                      "%10s => State: %d  NumTxns: %d  TxnId: %lu\n",
++                      "%10s => State: %d  TxnId: %lu  NumTxns: %d\n",
+                       "Journal", osb->journal->j_state,
+-                      atomic_read(&osb->journal->j_num_trans),
+-                      osb->journal->j_trans_id);
++                      osb->journal->j_trans_id,
++                      atomic_read(&osb->journal->j_num_trans));
+       out += snprintf(buf + out, len - out,
+                       "%10s => GlobalAllocs: %d  LocalAllocs: %d  "
+@@ -300,7 +305,6 @@ static int ocfs2_osb_dump(struct ocfs2_s
+       out += snprintf(buf + out, len - out, "%10s => %3s  %10s\n",
+                       "Slots", "Num", "RecoGen");
+-
+       for (i = 0; i < osb->max_slots; ++i) {
+               out += snprintf(buf + out, len - out,
+                               "%10s  %c %3d  %10d\n",
diff --git a/queue-2.6.30/serial-bfin_5xx-fix-building-as-module-when-early-printk-is-enabled.patch b/queue-2.6.30/serial-bfin_5xx-fix-building-as-module-when-early-printk-is-enabled.patch
new file mode 100644 (file)
index 0000000..30d59ab
--- /dev/null
@@ -0,0 +1,36 @@
+From 607c268ef9a4675287e77f732071e426e62c2d86 Mon Sep 17 00:00:00 2001
+From: Mike Frysinger <vapier@gentoo.org>
+Date: Mon, 22 Jun 2009 18:41:47 +0100
+Subject: serial: bfin_5xx: fix building as module when early printk is enabled
+
+From: Mike Frysinger <vapier@gentoo.org>
+
+commit 607c268ef9a4675287e77f732071e426e62c2d86 upstream.
+
+Since early printk only makes sense/works when the serial driver is built
+into the kernel, disable the option for this driver when it is going to be
+built as a module.  Otherwise we get build failures due to the ifdef
+handling.
+
+Signed-off-by: Mike Frysinger <vapier@gentoo.org>
+Signed-off-by: Alan Cox <alan@linux.intel.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/serial/bfin_5xx.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/drivers/serial/bfin_5xx.c
++++ b/drivers/serial/bfin_5xx.c
+@@ -38,6 +38,10 @@
+ #include <asm/cacheflush.h>
+ #endif
++#ifdef CONFIG_SERIAL_BFIN_MODULE
++# undef CONFIG_EARLY_PRINTK
++#endif
++
+ /* UART name and device definitions */
+ #define BFIN_SERIAL_NAME      "ttyBF"
+ #define BFIN_SERIAL_MAJOR     204
index 8dddc540b595551bee1767907127c5b9888cab8e..4e706752b4a80321e46b799405ccb814f20ec2a7 100644 (file)
@@ -97,3 +97,12 @@ dm-sysfs-skip-output-when-device-is-being-destroyed.patch
 dm-mpath-flush-keventd-queue-in-destructor.patch
 dm-exception-store-fix-exstore-lookup-to-be-case-insensitive.patch
 dm-use-i_size_read.patch
+vmscan-properly-account-for-the-number-of-page-cache-pages-zone_reclaim-can-reclaim.patch
+vmscan-count-the-number-of-times-zone_reclaim-scans-and-fails.patch
+lib-genalloc.c-remove-unmatched-write_lock-in-gen_pool_destroy.patch
+config_file_locking-should-not-depend-on-config_block.patch
+serial-bfin_5xx-fix-building-as-module-when-early-printk-is-enabled.patch
+ocfs2-fix-ocfs2_osb_dump.patch
+ide-cd-prevent-null-pointer-deref-via-cdrom_newpc_intr.patch
+drm-i915-correct-suspend-resume-ordering.patch
+kvm-x86-silence-preempt-warning-on-kvm_write_guest_time.patch
diff --git a/queue-2.6.30/vmscan-count-the-number-of-times-zone_reclaim-scans-and-fails.patch b/queue-2.6.30/vmscan-count-the-number-of-times-zone_reclaim-scans-and-fails.patch
new file mode 100644 (file)
index 0000000..5fb8ae0
--- /dev/null
@@ -0,0 +1,75 @@
+From 24cf72518c79cdcda486ed26074ff8151291cf65 Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mel@csn.ul.ie>
+Date: Tue, 16 Jun 2009 15:33:23 -0700
+Subject: vmscan: count the number of times zone_reclaim() scans and fails
+
+From: Mel Gorman <mel@csn.ul.ie>
+
+commit 24cf72518c79cdcda486ed26074ff8151291cf65 upstream.
+
+On NUMA machines, the administrator can configure zone_reclaim_mode that
+is a more targeted form of direct reclaim.  On machines with large NUMA
+distances for example, a zone_reclaim_mode defaults to 1 meaning that
+clean unmapped pages will be reclaimed if the zone watermarks are not
+being met.
+
+There is a heuristic that determines if the scan is worthwhile but it is
+possible that the heuristic will fail and the CPU gets tied up scanning
+uselessly.  Detecting the situation requires some guesswork and
+experimentation so this patch adds a counter "zreclaim_failed" to
+/proc/vmstat.  If during high CPU utilisation this counter is increasing
+rapidly, then the resolution to the problem may be to set
+/proc/sys/vm/zone_reclaim_mode to 0.
+
+[akpm@linux-foundation.org: name things consistently]
+Signed-off-by: Mel Gorman <mel@csn.ul.ie>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: Christoph Lameter <cl@linux-foundation.org>
+Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Cc: Wu Fengguang <fengguang.wu@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ include/linux/vmstat.h |    3 +++
+ mm/vmscan.c            |    3 +++
+ mm/vmstat.c            |    3 +++
+ 3 files changed, 9 insertions(+)
+
+--- a/include/linux/vmstat.h
++++ b/include/linux/vmstat.h
+@@ -36,6 +36,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PS
+               FOR_ALL_ZONES(PGSTEAL),
+               FOR_ALL_ZONES(PGSCAN_KSWAPD),
+               FOR_ALL_ZONES(PGSCAN_DIRECT),
++#ifdef CONFIG_NUMA
++              PGSCAN_ZONE_RECLAIM_FAILED,
++#endif
+               PGINODESTEAL, SLABS_SCANNED, KSWAPD_STEAL, KSWAPD_INODESTEAL,
+               PAGEOUTRUN, ALLOCSTALL, PGROTATED,
+ #ifdef CONFIG_HUGETLB_PAGE
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2452,6 +2452,9 @@ int zone_reclaim(struct zone *zone, gfp_
+       ret = __zone_reclaim(zone, gfp_mask, order);
+       zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
++      if (!ret)
++              count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
++
+       return ret;
+ }
+ #endif
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -675,6 +675,9 @@ static const char * const vmstat_text[] 
+       TEXTS_FOR_ZONES("pgscan_kswapd")
+       TEXTS_FOR_ZONES("pgscan_direct")
++#ifdef CONFIG_NUMA
++      "zone_reclaim_failed",
++#endif
+       "pginodesteal",
+       "slabs_scanned",
+       "kswapd_steal",
diff --git a/queue-2.6.30/vmscan-properly-account-for-the-number-of-page-cache-pages-zone_reclaim-can-reclaim.patch b/queue-2.6.30/vmscan-properly-account-for-the-number-of-page-cache-pages-zone_reclaim-can-reclaim.patch
new file mode 100644 (file)
index 0000000..c406d0f
--- /dev/null
@@ -0,0 +1,185 @@
+From 90afa5de6f3fa89a733861e843377302479fcf7e Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mel@csn.ul.ie>
+Date: Tue, 16 Jun 2009 15:33:20 -0700
+Subject: vmscan: properly account for the number of page cache pages zone_reclaim() can reclaim
+
+From: Mel Gorman <mel@csn.ul.ie>
+
+commit 90afa5de6f3fa89a733861e843377302479fcf7e upstream.
+
+A bug was brought to my attention against a distro kernel but it affects
+mainline and I believe problems like this have been reported in various
+guises on the mailing lists although I don't have specific examples at the
+moment.
+
+The reported problem was that malloc() stalled for a long time (minutes in
+some cases) if a large tmpfs mount was occupying a large percentage of
+memory overall.  The pages did not get cleaned or reclaimed by
+zone_reclaim() because the zone_reclaim_mode was unsuitable, but the lists
+are uselessly scanned frequently, making the CPU spin at near 100%.
+
+This patchset intends to address that bug and bring the behaviour of
+zone_reclaim() more in line with expectations which were noticed during
+investigation.  It is based on top of mmotm and takes advantage of
+Kosaki's work with respect to zone_reclaim().
+
+Patch 1 fixes the heuristics that zone_reclaim() uses to determine if the
+       scan should go ahead. The broken heuristic is what was causing the
+       malloc() stall as it uselessly scanned the LRU constantly. Currently,
+       zone_reclaim is assuming zone_reclaim_mode is 1 and historically it
+       could not deal with tmpfs pages at all. This fixes up the heuristic so
+       that an unnecessary scan is more likely to be correctly avoided.
+
+Patch 2 notes that zone_reclaim() returning a failure automatically means
+       the zone is marked full. This is not always true. It could have
+       failed because the GFP mask or zone_reclaim_mode were unsuitable.
+
+Patch 3 introduces a counter zreclaim_failed that will increment each
+       time the zone_reclaim scan-avoidance heuristics fail. If that
+       counter is rapidly increasing, then zone_reclaim_mode should be
+       set to 0 as a temporarily resolution and a bug reported because
+       the scan-avoidance heuristic is still broken.
+
+This patch:
+
+On NUMA machines, the administrator can configure zone_reclaim_mode that
+is a more targeted form of direct reclaim.  On machines with large NUMA
+distances for example, a zone_reclaim_mode defaults to 1 meaning that
+clean unmapped pages will be reclaimed if the zone watermarks are not
+being met.
+
+There is a heuristic that determines if the scan is worthwhile but the
+problem is that the heuristic is not being properly applied and is
+basically assuming zone_reclaim_mode is 1 if it is enabled.  The lack of
+proper detection can manifest as high CPU usage as the LRU list is scanned
+uselessly.
+
+Historically, once enabled it was depending on NR_FILE_PAGES which may
+include swapcache pages that the reclaim_mode cannot deal with.  Patch
+vmscan-change-the-number-of-the-unmapped-files-in-zone-reclaim.patch by
+Kosaki Motohiro noted that zone_page_state(zone, NR_FILE_PAGES) included
+pages that were not file-backed such as swapcache and made a calculation
+based on the inactive, active and mapped files.  This is far superior when
+zone_reclaim==1 but if RECLAIM_SWAP is set, then NR_FILE_PAGES is a
+reasonable starting figure.
+
+This patch alters how zone_reclaim() works out how many pages it might be
+able to reclaim given the current reclaim_mode.  If RECLAIM_SWAP is set in
+the reclaim_mode it will either consider NR_FILE_PAGES as potential
+candidates or else use NR_{IN}ACTIVE}_PAGES-NR_FILE_MAPPED to discount
+swapcache and other non-file-backed pages.  If RECLAIM_WRITE is not set,
+then NR_FILE_DIRTY number of pages are not candidates.  If RECLAIM_SWAP is
+not set, then NR_FILE_MAPPED are not.
+
+[kosaki.motohiro@jp.fujitsu.com: Estimate unmapped pages minus tmpfs pages]
+[fengguang.wu@intel.com: Fix underflow problem in Kosaki's estimate]
+Signed-off-by: Mel Gorman <mel@csn.ul.ie>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Acked-by: Christoph Lameter <cl@linux-foundation.org>
+Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Cc: Wu Fengguang <fengguang.wu@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ Documentation/sysctl/vm.txt |   12 ++++++----
+ mm/vmscan.c                 |   52 ++++++++++++++++++++++++++++++++++++++------
+ 2 files changed, 53 insertions(+), 11 deletions(-)
+
+--- a/Documentation/sysctl/vm.txt
++++ b/Documentation/sysctl/vm.txt
+@@ -314,10 +314,14 @@ min_unmapped_ratio:
+ This is available only on NUMA kernels.
+-A percentage of the total pages in each zone.  Zone reclaim will only
+-occur if more than this percentage of pages are file backed and unmapped.
+-This is to insure that a minimal amount of local pages is still available for
+-file I/O even if the node is overallocated.
++This is a percentage of the total pages in each zone. Zone reclaim will
++only occur if more than this percentage of pages are in a state that
++zone_reclaim_mode allows to be reclaimed.
++
++If zone_reclaim_mode has the value 4 OR'd, then the percentage is compared
++against all file-backed unmapped pages including swapcache pages and tmpfs
++files. Otherwise, only unmapped pages backed by normal files but not tmpfs
++files and similar are considered.
+ The default is 1 percent.
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2290,6 +2290,48 @@ int sysctl_min_unmapped_ratio = 1;
+  */
+ int sysctl_min_slab_ratio = 5;
++static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
++{
++      unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
++      unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
++              zone_page_state(zone, NR_ACTIVE_FILE);
++
++      /*
++       * It's possible for there to be more file mapped pages than
++       * accounted for by the pages on the file LRU lists because
++       * tmpfs pages accounted for as ANON can also be FILE_MAPPED
++       */
++      return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
++}
++
++/* Work out how many page cache pages we can reclaim in this reclaim_mode */
++static long zone_pagecache_reclaimable(struct zone *zone)
++{
++      long nr_pagecache_reclaimable;
++      long delta = 0;
++
++      /*
++       * If RECLAIM_SWAP is set, then all file pages are considered
++       * potentially reclaimable. Otherwise, we have to worry about
++       * pages like swapcache and zone_unmapped_file_pages() provides
++       * a better estimate
++       */
++      if (zone_reclaim_mode & RECLAIM_SWAP)
++              nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
++      else
++              nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
++
++      /* If we can't clean pages, remove dirty pages from consideration */
++      if (!(zone_reclaim_mode & RECLAIM_WRITE))
++              delta += zone_page_state(zone, NR_FILE_DIRTY);
++
++      /* Watch for any possible underflows due to delta */
++      if (unlikely(delta > nr_pagecache_reclaimable))
++              delta = nr_pagecache_reclaimable;
++
++      return nr_pagecache_reclaimable - delta;
++}
++
+ /*
+  * Try to free up some pages from this zone through reclaim.
+  */
+@@ -2324,9 +2366,7 @@ static int __zone_reclaim(struct zone *z
+       reclaim_state.reclaimed_slab = 0;
+       p->reclaim_state = &reclaim_state;
+-      if (zone_page_state(zone, NR_FILE_PAGES) -
+-              zone_page_state(zone, NR_FILE_MAPPED) >
+-              zone->min_unmapped_pages) {
++      if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
+               /*
+                * Free memory by calling shrink zone with increasing
+                * priorities until we have enough memory freed.
+@@ -2384,10 +2424,8 @@ int zone_reclaim(struct zone *zone, gfp_
+        * if less than a specified percentage of the zone is used by
+        * unmapped file backed pages.
+        */
+-      if (zone_page_state(zone, NR_FILE_PAGES) -
+-          zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
+-          && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
+-                      <= zone->min_slab_pages)
++      if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
++          zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
+               return 0;
+       if (zone_is_all_unreclaimable(zone))