--- /dev/null
+From 69050eee8e08a6234f29fe71a56f8c7c7d4d7186 Mon Sep 17 00:00:00 2001
+From: Tomas Szepe <szepe@pinerecords.com>
+Date: Tue, 16 Jun 2009 15:33:56 -0700
+Subject: CONFIG_FILE_LOCKING should not depend on CONFIG_BLOCK
+
+From: Tomas Szepe <szepe@pinerecords.com>
+
+commit 69050eee8e08a6234f29fe71a56f8c7c7d4d7186 upstream.
+
+CONFIG_FILE_LOCKING should not depend on CONFIG_BLOCK.
+
+This makes it possible to run complete systems out of a CONFIG_BLOCK=n
+initramfs on current kernels again (this last worked on 2.6.27.*).
+
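+For illustration, a config fragment along these lines (hypothetical,
+not taken from the report) becomes buildable again with this change:
+
+    # CONFIG_BLOCK is not set
+    CONFIG_FILE_LOCKING=y
+    CONFIG_NFS_FS=y
+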
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/Kconfig | 14 +++++++-------
+ 1 file changed, 7 insertions(+), 7 deletions(-)
+
+--- a/fs/Kconfig
++++ b/fs/Kconfig
+@@ -39,6 +39,13 @@ config FS_POSIX_ACL
+ bool
+ default n
+
++source "fs/xfs/Kconfig"
++source "fs/gfs2/Kconfig"
++source "fs/ocfs2/Kconfig"
++source "fs/btrfs/Kconfig"
++
++endif # BLOCK
++
+ config FILE_LOCKING
+ bool "Enable POSIX file locking API" if EMBEDDED
+ default y
+@@ -47,13 +54,6 @@ config FILE_LOCKING
+ for filesystems like NFS and for the flock() system
+ call. Disabling this option saves about 11k.
+
+-source "fs/xfs/Kconfig"
+-source "fs/gfs2/Kconfig"
+-source "fs/ocfs2/Kconfig"
+-source "fs/btrfs/Kconfig"
+-
+-endif # BLOCK
+-
+ source "fs/notify/Kconfig"
+
+ source "fs/quota/Kconfig"
--- /dev/null
+From 9e06dd39f2b6d7e35981e0d7aded618686b32ccb Mon Sep 17 00:00:00 2001
+From: Jesse Barnes <jbarnes@virtuousgeek.org>
+Date: Mon, 22 Jun 2009 18:05:12 -0700
+Subject: drm/i915: correct suspend/resume ordering
+
+From: Jesse Barnes <jbarnes@virtuousgeek.org>
+
+commit 9e06dd39f2b6d7e35981e0d7aded618686b32ccb upstream.
+
+We need to save register state *after* idling GEM, clearing the ring,
+and uninstalling the IRQ handler, or we might end up saving bogus
+fence regs, for one. Our restore ordering should already be correct,
+since we do GEM, ring and IRQ init after restoring the last register
+state, which prevents us from clobbering things.
+
+I put this together to potentially address a bug, but I haven't heard
+back on whether it fixes it yet. However, I think the change stands on
+its own, so I'm sending it in.
+
+Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
+Signed-off-by: Eric Anholt <eric@anholt.net>
+Cc: Jie Luo <clotho67@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/gpu/drm/i915/i915_drv.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/gpu/drm/i915/i915_drv.c
++++ b/drivers/gpu/drm/i915/i915_drv.c
+@@ -67,8 +67,6 @@ static int i915_suspend(struct drm_devic
+
+ pci_save_state(dev->pdev);
+
+- i915_save_state(dev);
+-
+ /* If KMS is active, we do the leavevt stuff here */
+ if (drm_core_check_feature(dev, DRIVER_MODESET)) {
+ if (i915_gem_idle(dev))
+@@ -77,6 +75,8 @@ static int i915_suspend(struct drm_devic
+ drm_irq_uninstall(dev);
+ }
+
++ i915_save_state(dev);
++
+ intel_opregion_free(dev, 1);
+
+ if (state.event == PM_EVENT_SUSPEND) {
--- /dev/null
+From 39c58f37a10198054c656c28202fb1e6d22fd505 Mon Sep 17 00:00:00 2001
+From: Rainer Weikusat <rweikusat@mssgmbh.com>
+Date: Thu, 18 Jun 2009 17:04:00 +0200
+Subject: ide-cd: prevent null pointer deref via cdrom_newpc_intr
+
+From: Rainer Weikusat <rweikusat@mssgmbh.com>
+
+commit 39c58f37a10198054c656c28202fb1e6d22fd505 upstream.
+
+With 2.6.30, the error handling code in cdrom_newpc_intr was changed
+to deal with partial request failures by completing the 'good' parts
+of a request normally and erroring only the last (and presumably,
+incompletely transferred) bio associated with a particular request.
+In order to do this, ide_complete_rq is called over ide_cd_error_cmd()
+to partially complete the rq. The block layer does partial completion
+only for requests with bios; if the rq doesn't have one (eg
+'GPCMD_READ_DISC_INFO'), the request is completed as a whole and the
+drive->hwif->rq pointer is set to NULL afterwards. When ide_complete_rq
+is called again to report the error, this null pointer is dereferenced,
+resulting in a kernel crash.
+
+This fixes http://bugzilla.kernel.org/show_bug.cgi?id=13399.
+
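+In outline, the failing sequence looks something like this (a sketch
+with the call chain simplified, not a verbatim trace):
+
+    /* rq carries no bio, e.g. GPCMD_READ_DISC_INFO */
+    ide_cd_error_cmd(drive, cmd);   /* partial completion path: the  */
+                                    /* block layer ends the rq as a  */
+                                    /* whole, drive->hwif->rq = NULL */
+    ide_complete_rq(drive, ...);    /* error reporting now           */
+                                    /* dereferences the NULL rq      */
+
+With the added rq->bio check, bio-less requests skip the partial
+completion step and are ended exactly once.
+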
+Signed-off-by: Rainer Weikusat <rweikusat@mssgmbh.com>
+Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
+Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/ide/ide-cd.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/ide/ide-cd.c
++++ b/drivers/ide/ide-cd.c
+@@ -758,7 +758,7 @@ out_end:
+ rq->errors = -EIO;
+ }
+
+- if (uptodate == 0)
++ if (uptodate == 0 && rq->bio)
+ ide_cd_error_cmd(drive, cmd);
+
+ /* make sure it's fully ended */
--- /dev/null
+From 2dea4c84bc936731668b5a7a9fba5b436a422668 Mon Sep 17 00:00:00 2001
+From: Matt T. Yourst <yourst@users.sourceforge.net>
+Date: Tue, 24 Feb 2009 15:28:00 -0300
+Subject: KVM: x86: silence preempt warning on kvm_write_guest_time
+
+From: Matt T. Yourst <yourst@users.sourceforge.net>
+
+commit 2dea4c84bc936731668b5a7a9fba5b436a422668 upstream.
+
+This issue just appeared in kvm-84 when running on 2.6.28.7 (x86-64)
+with PREEMPT enabled.
+
+We're getting syslog warnings like this many (but not all) times qemu
+tells KVM to run the VCPU:
+
+BUG: using smp_processor_id() in preemptible [00000000] code:
+qemu-system-x86/28938
+caller is kvm_arch_vcpu_ioctl_run+0x5d1/0xc70 [kvm]
+Pid: 28938, comm: qemu-system-x86 2.6.28.7-mtyrel-64bit
+Call Trace:
+debug_smp_processor_id+0xf7/0x100
+kvm_arch_vcpu_ioctl_run+0x5d1/0xc70 [kvm]
+? __wake_up+0x4e/0x70
+? wake_futex+0x27/0x40
+kvm_vcpu_ioctl+0x2e9/0x5a0 [kvm]
+enqueue_hrtimer+0x8a/0x110
+_spin_unlock_irqrestore+0x27/0x50
+vfs_ioctl+0x31/0xa0
+do_vfs_ioctl+0x74/0x480
+sys_futex+0xb4/0x140
+sys_ioctl+0x99/0xa0
+system_call_fastpath+0x16/0x1b
+
+As it turns out, the call trace is messed up due to gcc's inlining, but
+I isolated the problem anyway: kvm_write_guest_time() is being used in a
+non-thread-safe manner on preemptable kernels.
+
+Basically, kvm_write_guest_time()'s body needs to be surrounded by
+preempt_disable() and preempt_enable(), since the kernel won't let us
+query any per-CPU data (indirectly, via smp_processor_id()) without
+preemption disabled. The attached patch fixes this issue by disabling
+preemption inside kvm_write_guest_time().
+
+[marcelo: surround only __get_cpu_var calls since the warning
+is harmless]
+
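+A minimal sketch of the general pattern (illustrative, not the patch
+itself):
+
+    /* racy on CONFIG_PREEMPT: the task may migrate CPUs between */
+    /* looking up the CPU id and using its per-CPU data          */
+    khz = per_cpu(cpu_tsc_khz, smp_processor_id());
+
+    /* safe: preemption stays off across the per-CPU access */
+    preempt_disable();
+    khz = __get_cpu_var(cpu_tsc_khz);
+    preempt_enable();
+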
+Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
+Signed-off-by: Avi Kivity <avi@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ arch/x86/kvm/x86.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -634,10 +634,12 @@ static void kvm_write_guest_time(struct
+ if ((!vcpu->time_page))
+ return;
+
++ preempt_disable();
+ if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) {
+ kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock);
+ vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz);
+ }
++ preempt_enable();
+
+ /* Keep irq disabled to prevent changes to the clock */
+ local_irq_save(flags);
--- /dev/null
+From 8e8a2dea0ca91fe2cb7de7ea212124cfe8c82c35 Mon Sep 17 00:00:00 2001
+From: Zygo Blaxell <zygo.blaxell@xandros.com>
+Date: Tue, 16 Jun 2009 15:33:57 -0700
+Subject: lib/genalloc.c: remove unmatched write_lock() in gen_pool_destroy
+
+From: Zygo Blaxell <zygo.blaxell@xandros.com>
+
+commit 8e8a2dea0ca91fe2cb7de7ea212124cfe8c82c35 upstream.
+
+There is a call to write_lock() in gen_pool_destroy which is not balanced
+by any corresponding write_unlock(). This causes problems with preemption
+because the preemption-disable counter is incremented in the write_lock()
+call, but never decremented by any call to write_unlock(). The bug is
+easy to miss in practice because there are few callers of
+gen_pool_destroy, and one of them is non-x86 arch-specific code.
+
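+A sketch of the imbalance being removed (illustrative):
+
+    write_lock(&pool->lock);    /* implies preempt_disable()      */
+    /* ... free all chunks, then kfree(pool) ...                  */
+    /* no write_unlock() anywhere: the preempt-disable count      */
+    /* never drops back to zero for this task                     */
+
+Since gen_pool_destroy() frees the pool itself, no other user can
+legitimately hold or take the lock at this point, so dropping the
+unmatched write_lock() is the simplest fix.
+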
+Signed-off-by: Zygo Blaxell <zygo.blaxell@xandros.com>
+Cc: Jiri Kosina <trivial@kernel.org>
+Cc: Steve Wise <swise@opengridcomputing.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ lib/genalloc.c | 1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/lib/genalloc.c
++++ b/lib/genalloc.c
+@@ -85,7 +85,6 @@ void gen_pool_destroy(struct gen_pool *p
+ int bit, end_bit;
+
+
+- write_lock(&pool->lock);
+ list_for_each_safe(_chunk, _next_chunk, &pool->chunks) {
+ chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk);
+ list_del(&chunk->next_chunk);
--- /dev/null
+From c3d38840abaa45c1c5a5fabbb8ffc9a0d1a764d1 Mon Sep 17 00:00:00 2001
+From: Sunil Mushran <sunil.mushran@oracle.com>
+Date: Fri, 19 Jun 2009 14:45:55 -0700
+Subject: ocfs2: Fix ocfs2_osb_dump()
+
+From: Sunil Mushran <sunil.mushran@oracle.com>
+
+commit c3d38840abaa45c1c5a5fabbb8ffc9a0d1a764d1 upstream.
+
+Skip printing information that is not valid for local mounts.
+
+Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
+Signed-off-by: Joel Becker <joel.becker@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ocfs2/super.c | 32 ++++++++++++++++++--------------
+ 1 file changed, 18 insertions(+), 14 deletions(-)
+
+--- a/fs/ocfs2/super.c
++++ b/fs/ocfs2/super.c
+@@ -232,20 +232,24 @@ static int ocfs2_osb_dump(struct ocfs2_s
+ "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount",
+ osb->s_mount_opt, osb->s_atime_quantum);
+
+- out += snprintf(buf + out, len - out,
+- "%10s => Stack: %s Name: %*s Version: %d.%d\n",
+- "Cluster",
+- (*osb->osb_cluster_stack == '\0' ?
+- "o2cb" : osb->osb_cluster_stack),
+- cconn->cc_namelen, cconn->cc_name,
+- cconn->cc_version.pv_major, cconn->cc_version.pv_minor);
++ if (cconn) {
++ out += snprintf(buf + out, len - out,
++ "%10s => Stack: %s Name: %*s "
++ "Version: %d.%d\n", "Cluster",
++ (*osb->osb_cluster_stack == '\0' ?
++ "o2cb" : osb->osb_cluster_stack),
++ cconn->cc_namelen, cconn->cc_name,
++ cconn->cc_version.pv_major,
++ cconn->cc_version.pv_minor);
++ }
+
+ spin_lock(&osb->dc_task_lock);
+ out += snprintf(buf + out, len - out,
+ "%10s => Pid: %d Count: %lu WakeSeq: %lu "
+ "WorkSeq: %lu\n", "DownCnvt",
+- task_pid_nr(osb->dc_task), osb->blocked_lock_count,
+- osb->dc_wake_sequence, osb->dc_work_sequence);
++ (osb->dc_task ? task_pid_nr(osb->dc_task) : -1),
++ osb->blocked_lock_count, osb->dc_wake_sequence,
++ osb->dc_work_sequence);
+ spin_unlock(&osb->dc_task_lock);
+
+ spin_lock(&osb->osb_lock);
+@@ -265,14 +269,15 @@ static int ocfs2_osb_dump(struct ocfs2_s
+
+ out += snprintf(buf + out, len - out,
+ "%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit",
+- task_pid_nr(osb->commit_task), osb->osb_commit_interval,
++ (osb->commit_task ? task_pid_nr(osb->commit_task) : -1),
++ osb->osb_commit_interval,
+ atomic_read(&osb->needs_checkpoint));
+
+ out += snprintf(buf + out, len - out,
+- "%10s => State: %d NumTxns: %d TxnId: %lu\n",
++ "%10s => State: %d TxnId: %lu NumTxns: %d\n",
+ "Journal", osb->journal->j_state,
+- atomic_read(&osb->journal->j_num_trans),
+- osb->journal->j_trans_id);
++ osb->journal->j_trans_id,
++ atomic_read(&osb->journal->j_num_trans));
+
+ out += snprintf(buf + out, len - out,
+ "%10s => GlobalAllocs: %d LocalAllocs: %d "
+@@ -300,7 +305,6 @@ static int ocfs2_osb_dump(struct ocfs2_s
+
+ out += snprintf(buf + out, len - out, "%10s => %3s %10s\n",
+ "Slots", "Num", "RecoGen");
+-
+ for (i = 0; i < osb->max_slots; ++i) {
+ out += snprintf(buf + out, len - out,
+ "%10s %c %3d %10d\n",
--- /dev/null
+From 607c268ef9a4675287e77f732071e426e62c2d86 Mon Sep 17 00:00:00 2001
+From: Mike Frysinger <vapier@gentoo.org>
+Date: Mon, 22 Jun 2009 18:41:47 +0100
+Subject: serial: bfin_5xx: fix building as module when early printk is enabled
+
+From: Mike Frysinger <vapier@gentoo.org>
+
+commit 607c268ef9a4675287e77f732071e426e62c2d86 upstream.
+
+Since early printk only makes sense/works when the serial driver is built
+into the kernel, disable the option for this driver when it is going to be
+built as a module. Otherwise we get build failures due to the ifdef
+handling.
+
+Signed-off-by: Mike Frysinger <vapier@gentoo.org>
+Signed-off-by: Alan Cox <alan@linux.intel.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/serial/bfin_5xx.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/drivers/serial/bfin_5xx.c
++++ b/drivers/serial/bfin_5xx.c
+@@ -38,6 +38,10 @@
+ #include <asm/cacheflush.h>
+ #endif
+
++#ifdef CONFIG_SERIAL_BFIN_MODULE
++# undef CONFIG_EARLY_PRINTK
++#endif
++
+ /* UART name and device definitions */
+ #define BFIN_SERIAL_NAME "ttyBF"
+ #define BFIN_SERIAL_MAJOR 204
dm-mpath-flush-keventd-queue-in-destructor.patch
dm-exception-store-fix-exstore-lookup-to-be-case-insensitive.patch
dm-use-i_size_read.patch
+vmscan-properly-account-for-the-number-of-page-cache-pages-zone_reclaim-can-reclaim.patch
+vmscan-count-the-number-of-times-zone_reclaim-scans-and-fails.patch
+lib-genalloc.c-remove-unmatched-write_lock-in-gen_pool_destroy.patch
+config_file_locking-should-not-depend-on-config_block.patch
+serial-bfin_5xx-fix-building-as-module-when-early-printk-is-enabled.patch
+ocfs2-fix-ocfs2_osb_dump.patch
+ide-cd-prevent-null-pointer-deref-via-cdrom_newpc_intr.patch
+drm-i915-correct-suspend-resume-ordering.patch
+kvm-x86-silence-preempt-warning-on-kvm_write_guest_time.patch
--- /dev/null
+From 24cf72518c79cdcda486ed26074ff8151291cf65 Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mel@csn.ul.ie>
+Date: Tue, 16 Jun 2009 15:33:23 -0700
+Subject: vmscan: count the number of times zone_reclaim() scans and fails
+
+From: Mel Gorman <mel@csn.ul.ie>
+
+commit 24cf72518c79cdcda486ed26074ff8151291cf65 upstream.
+
+On NUMA machines, the administrator can configure zone_reclaim_mode,
+a more targeted form of direct reclaim. On machines with large NUMA
+distances, for example, zone_reclaim_mode defaults to 1, meaning that
+clean unmapped pages will be reclaimed if the zone watermarks are not
+being met.
+
+There is a heuristic that determines if the scan is worthwhile, but it
+is possible for the heuristic to fail, leaving the CPU tied up scanning
+uselessly. Detecting the situation requires some guesswork and
+experimentation, so this patch adds a counter "zone_reclaim_failed" to
+/proc/vmstat. If this counter is increasing rapidly during high CPU
+utilisation, the resolution may be to set
+/proc/sys/vm/zone_reclaim_mode to 0.
+
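+For example (commands illustrative), watch the counter and, if it
+climbs quickly under load, disable the reclaim mode:
+
+    grep zone_reclaim_failed /proc/vmstat
+    echo 0 > /proc/sys/vm/zone_reclaim_mode
+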
+[akpm@linux-foundation.org: name things consistently]
+Signed-off-by: Mel Gorman <mel@csn.ul.ie>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: Christoph Lameter <cl@linux-foundation.org>
+Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Cc: Wu Fengguang <fengguang.wu@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ include/linux/vmstat.h | 3 +++
+ mm/vmscan.c | 3 +++
+ mm/vmstat.c | 3 +++
+ 3 files changed, 9 insertions(+)
+
+--- a/include/linux/vmstat.h
++++ b/include/linux/vmstat.h
+@@ -36,6 +36,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PS
+ FOR_ALL_ZONES(PGSTEAL),
+ FOR_ALL_ZONES(PGSCAN_KSWAPD),
+ FOR_ALL_ZONES(PGSCAN_DIRECT),
++#ifdef CONFIG_NUMA
++ PGSCAN_ZONE_RECLAIM_FAILED,
++#endif
+ PGINODESTEAL, SLABS_SCANNED, KSWAPD_STEAL, KSWAPD_INODESTEAL,
+ PAGEOUTRUN, ALLOCSTALL, PGROTATED,
+ #ifdef CONFIG_HUGETLB_PAGE
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2452,6 +2452,9 @@ int zone_reclaim(struct zone *zone, gfp_
+ ret = __zone_reclaim(zone, gfp_mask, order);
+ zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
+
++ if (!ret)
++ count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
++
+ return ret;
+ }
+ #endif
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -675,6 +675,9 @@ static const char * const vmstat_text[]
+ TEXTS_FOR_ZONES("pgscan_kswapd")
+ TEXTS_FOR_ZONES("pgscan_direct")
+
++#ifdef CONFIG_NUMA
++ "zone_reclaim_failed",
++#endif
+ "pginodesteal",
+ "slabs_scanned",
+ "kswapd_steal",
--- /dev/null
+From 90afa5de6f3fa89a733861e843377302479fcf7e Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mel@csn.ul.ie>
+Date: Tue, 16 Jun 2009 15:33:20 -0700
+Subject: vmscan: properly account for the number of page cache pages zone_reclaim() can reclaim
+
+From: Mel Gorman <mel@csn.ul.ie>
+
+commit 90afa5de6f3fa89a733861e843377302479fcf7e upstream.
+
+A bug was brought to my attention against a distro kernel, but it
+affects mainline, and I believe problems like this have been reported
+in various guises on the mailing lists, although I don't have specific
+examples at the moment.
+
+The reported problem was that malloc() stalled for a long time (minutes
+in some cases) if a large tmpfs mount was occupying a large percentage
+of memory overall. The pages did not get cleaned or reclaimed by
+zone_reclaim() because the zone_reclaim_mode was unsuitable, but the
+lists were nonetheless scanned frequently, leaving the CPU spinning at
+near 100%.
+
+This patchset intends to address that bug and bring the behaviour of
+zone_reclaim() more in line with expectations which were noticed during
+investigation. It is based on top of mmotm and takes advantage of
+Kosaki's work with respect to zone_reclaim().
+
+Patch 1 fixes the heuristics that zone_reclaim() uses to determine if the
+ scan should go ahead. The broken heuristic is what was causing the
+ malloc() stall as it uselessly scanned the LRU constantly. Currently,
+ zone_reclaim is assuming zone_reclaim_mode is 1 and historically it
+ could not deal with tmpfs pages at all. This fixes up the heuristic so
+ that an unnecessary scan is more likely to be correctly avoided.
+
+Patch 2 notes that zone_reclaim() returning a failure automatically means
+ the zone is marked full. This is not always true. It could have
+ failed because the GFP mask or zone_reclaim_mode were unsuitable.
+
+Patch 3 introduces a counter zone_reclaim_failed that will increment
+ each time the zone_reclaim scan-avoidance heuristics fail. If that
+ counter is rapidly increasing, then zone_reclaim_mode should be set
+ to 0 as a temporary resolution and a bug reported, because the
+ scan-avoidance heuristic is still broken.
+
+This patch:
+
+On NUMA machines, the administrator can configure zone_reclaim_mode,
+a more targeted form of direct reclaim. On machines with large NUMA
+distances, for example, zone_reclaim_mode defaults to 1, meaning that
+clean unmapped pages will be reclaimed if the zone watermarks are not
+being met.
+
+There is a heuristic that determines if the scan is worthwhile, but
+the problem is that the heuristic is not being properly applied and
+basically assumes zone_reclaim_mode is 1 if it is enabled. The lack of
+proper detection can manifest as high CPU usage as the LRU list is
+scanned uselessly.
+
+Historically, once enabled, zone_reclaim() depended on NR_FILE_PAGES,
+which may include swapcache pages that the reclaim_mode cannot deal
+with. Patch
+vmscan-change-the-number-of-the-unmapped-files-in-zone-reclaim.patch by
+Kosaki Motohiro noted that zone_page_state(zone, NR_FILE_PAGES) included
+pages that were not file-backed, such as swapcache, and made a
+calculation based on the inactive, active and mapped files. This is far
+superior when zone_reclaim_mode==1, but if RECLAIM_SWAP is set, then
+NR_FILE_PAGES is a reasonable starting figure.
+
+This patch alters how zone_reclaim() works out how many pages it might
+be able to reclaim given the current reclaim_mode. If RECLAIM_SWAP is
+set in the reclaim_mode, it will consider NR_FILE_PAGES as potential
+candidates; otherwise it uses NR_ACTIVE_FILE + NR_INACTIVE_FILE -
+NR_FILE_MAPPED to discount swapcache and other non-file-backed pages.
+If RECLAIM_WRITE is not set, then NR_FILE_DIRTY pages are not
+candidates. If RECLAIM_SWAP is not set, then NR_FILE_MAPPED pages are
+not.
+
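+As a worked example (figures invented for illustration): with
+zone_reclaim_mode == 1 (neither RECLAIM_WRITE nor RECLAIM_SWAP set), a
+zone with 1000 file-LRU pages, 300 of them mapped and 100 dirty, is
+treated as having (1000 - 300) - 100 = 600 reclaimable page cache
+pages, and the scan only proceeds if that exceeds
+zone->min_unmapped_pages.
+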
+[kosaki.motohiro@jp.fujitsu.com: Estimate unmapped pages minus tmpfs pages]
+[fengguang.wu@intel.com: Fix underflow problem in Kosaki's estimate]
+Signed-off-by: Mel Gorman <mel@csn.ul.ie>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Acked-by: Christoph Lameter <cl@linux-foundation.org>
+Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Cc: Wu Fengguang <fengguang.wu@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ Documentation/sysctl/vm.txt | 12 ++++++----
+ mm/vmscan.c | 52 ++++++++++++++++++++++++++++++++++++++------
+ 2 files changed, 53 insertions(+), 11 deletions(-)
+
+--- a/Documentation/sysctl/vm.txt
++++ b/Documentation/sysctl/vm.txt
+@@ -314,10 +314,14 @@ min_unmapped_ratio:
+
+ This is available only on NUMA kernels.
+
+-A percentage of the total pages in each zone. Zone reclaim will only
+-occur if more than this percentage of pages are file backed and unmapped.
+-This is to insure that a minimal amount of local pages is still available for
+-file I/O even if the node is overallocated.
++This is a percentage of the total pages in each zone. Zone reclaim will
++only occur if more than this percentage of pages are in a state that
++zone_reclaim_mode allows to be reclaimed.
++
++If zone_reclaim_mode has the value 4 OR'd, then the percentage is compared
++against all file-backed unmapped pages including swapcache pages and tmpfs
++files. Otherwise, only unmapped pages backed by normal files but not tmpfs
++files and similar are considered.
+
+ The default is 1 percent.
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2290,6 +2290,48 @@ int sysctl_min_unmapped_ratio = 1;
+ */
+ int sysctl_min_slab_ratio = 5;
+
++static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
++{
++ unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
++ unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
++ zone_page_state(zone, NR_ACTIVE_FILE);
++
++ /*
++ * It's possible for there to be more file mapped pages than
++ * accounted for by the pages on the file LRU lists because
++ * tmpfs pages accounted for as ANON can also be FILE_MAPPED
++ */
++ return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
++}
++
++/* Work out how many page cache pages we can reclaim in this reclaim_mode */
++static long zone_pagecache_reclaimable(struct zone *zone)
++{
++ long nr_pagecache_reclaimable;
++ long delta = 0;
++
++ /*
++ * If RECLAIM_SWAP is set, then all file pages are considered
++ * potentially reclaimable. Otherwise, we have to worry about
++ * pages like swapcache and zone_unmapped_file_pages() provides
++ * a better estimate
++ */
++ if (zone_reclaim_mode & RECLAIM_SWAP)
++ nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
++ else
++ nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
++
++ /* If we can't clean pages, remove dirty pages from consideration */
++ if (!(zone_reclaim_mode & RECLAIM_WRITE))
++ delta += zone_page_state(zone, NR_FILE_DIRTY);
++
++ /* Watch for any possible underflows due to delta */
++ if (unlikely(delta > nr_pagecache_reclaimable))
++ delta = nr_pagecache_reclaimable;
++
++ return nr_pagecache_reclaimable - delta;
++}
++
+ /*
+ * Try to free up some pages from this zone through reclaim.
+ */
+@@ -2324,9 +2366,7 @@ static int __zone_reclaim(struct zone *z
+ reclaim_state.reclaimed_slab = 0;
+ p->reclaim_state = &reclaim_state;
+
+- if (zone_page_state(zone, NR_FILE_PAGES) -
+- zone_page_state(zone, NR_FILE_MAPPED) >
+- zone->min_unmapped_pages) {
++ if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
+ /*
+ * Free memory by calling shrink zone with increasing
+ * priorities until we have enough memory freed.
+@@ -2384,10 +2424,8 @@ int zone_reclaim(struct zone *zone, gfp_
+ * if less than a specified percentage of the zone is used by
+ * unmapped file backed pages.
+ */
+- if (zone_page_state(zone, NR_FILE_PAGES) -
+- zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
+- && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
+- <= zone->min_slab_pages)
++ if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
++ zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
+ return 0;
+
+ if (zone_is_all_unreclaimable(zone))