From: Greg Kroah-Hartman Date: Tue, 30 Jun 2009 23:11:00 +0000 (-0700) Subject: more .30 patches X-Git-Tag: v2.6.27.26~10 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=495b3fd1b4bf8cf4ce9212b852af0ac92965faa8;p=thirdparty%2Fkernel%2Fstable-queue.git more .30 patches --- diff --git a/queue-2.6.30/config_file_locking-should-not-depend-on-config_block.patch b/queue-2.6.30/config_file_locking-should-not-depend-on-config_block.patch new file mode 100644 index 00000000000..023a852a023 --- /dev/null +++ b/queue-2.6.30/config_file_locking-should-not-depend-on-config_block.patch @@ -0,0 +1,52 @@ +From 69050eee8e08a6234f29fe71a56f8c7c7d4d7186 Mon Sep 17 00:00:00 2001 +From: Tomas Szepe +Date: Tue, 16 Jun 2009 15:33:56 -0700 +Subject: CONFIG_FILE_LOCKING should not depend on CONFIG_BLOCK + +From: Tomas Szepe + +commit 69050eee8e08a6234f29fe71a56f8c7c7d4d7186 upstream. + +CONFIG_FILE_LOCKING should not depend on CONFIG_BLOCK. + +This makes it possible to run complete systems out of a CONFIG_BLOCK=n +initramfs on current kernels again (this last worked on 2.6.27.*). + +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/Kconfig | 14 +++++++------- + 1 file changed, 7 insertions(+), 7 deletions(-) + +--- a/fs/Kconfig ++++ b/fs/Kconfig +@@ -39,6 +39,13 @@ config FS_POSIX_ACL + bool + default n + ++source "fs/xfs/Kconfig" ++source "fs/gfs2/Kconfig" ++source "fs/ocfs2/Kconfig" ++source "fs/btrfs/Kconfig" ++ ++endif # BLOCK ++ + config FILE_LOCKING + bool "Enable POSIX file locking API" if EMBEDDED + default y +@@ -47,13 +54,6 @@ config FILE_LOCKING + for filesystems like NFS and for the flock() system + call. Disabling this option saves about 11k. + +-source "fs/xfs/Kconfig" +-source "fs/gfs2/Kconfig" +-source "fs/ocfs2/Kconfig" +-source "fs/btrfs/Kconfig" +- +-endif # BLOCK +- + source "fs/notify/Kconfig" + + source "fs/quota/Kconfig" diff --git a/queue-2.6.30/drm-i915-correct-suspend-resume-ordering.patch b/queue-2.6.30/drm-i915-correct-suspend-resume-ordering.patch new file mode 100644 index 00000000000..8ac2ba83b04 --- /dev/null +++ b/queue-2.6.30/drm-i915-correct-suspend-resume-ordering.patch @@ -0,0 +1,48 @@ +From 9e06dd39f2b6d7e35981e0d7aded618686b32ccb Mon Sep 17 00:00:00 2001 +From: Jesse Barnes +Date: Mon, 22 Jun 2009 18:05:12 -0700 +Subject: drm/i915: correct suspend/resume ordering + +From: Jesse Barnes + +commit 9e06dd39f2b6d7e35981e0d7aded618686b32ccb upstream. + +We need to save register state *after* idling GEM, clearing the ring, +and uninstalling the IRQ handler, or we might end up saving bogus +fence regs, for one. Our restore ordering should already be correct, +since we do GEM, ring and IRQ init after restoring the last register +state, which prevents us from clobbering things. + +I put this together to potentially address a bug, but I haven't heard +back if it fixes it yet. However I think it stands on its own, so I'm +sending it in. 
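+
+In other words, the suspend path should quiesce the device before
+snapshotting registers. A simplified sketch of the intended ordering
+(an editor's illustration; see the diff below for the real code):
+
+	pci_save_state(dev->pdev);
+	if (drm_core_check_feature(dev, DRIVER_MODESET)) {
+		i915_gem_idle(dev);	/* idle GEM and clear the ring */
+		drm_irq_uninstall(dev);	/* remove the IRQ handler */
+	}
+	i915_save_state(dev);	/* fence regs are now stable */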
+
+Signed-off-by: Jesse Barnes
+Signed-off-by: Eric Anholt
+Cc: Jie Luo
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/gpu/drm/i915/i915_drv.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/gpu/drm/i915/i915_drv.c
++++ b/drivers/gpu/drm/i915/i915_drv.c
+@@ -67,8 +67,6 @@ static int i915_suspend(struct drm_devic
+ 
+ 	pci_save_state(dev->pdev);
+ 
+-	i915_save_state(dev);
+-
+ 	/* If KMS is active, we do the leavevt stuff here */
+ 	if (drm_core_check_feature(dev, DRIVER_MODESET)) {
+ 		if (i915_gem_idle(dev))
+@@ -77,6 +75,8 @@ static int i915_suspend(struct drm_devic
+ 		drm_irq_uninstall(dev);
+ 	}
+ 
++	i915_save_state(dev);
++
+ 	intel_opregion_free(dev, 1);
+ 
+ 	if (state.event == PM_EVENT_SUSPEND) {
diff --git a/queue-2.6.30/ide-cd-prevent-null-pointer-deref-via-cdrom_newpc_intr.patch b/queue-2.6.30/ide-cd-prevent-null-pointer-deref-via-cdrom_newpc_intr.patch
new file mode 100644
index 00000000000..53f14b39f5f
--- /dev/null
+++ b/queue-2.6.30/ide-cd-prevent-null-pointer-deref-via-cdrom_newpc_intr.patch
@@ -0,0 +1,44 @@
+From 39c58f37a10198054c656c28202fb1e6d22fd505 Mon Sep 17 00:00:00 2001
+From: Rainer Weikusat
+Date: Thu, 18 Jun 2009 17:04:00 +0200
+Subject: ide-cd: prevent null pointer deref via cdrom_newpc_intr
+
+From: Rainer Weikusat
+
+commit 39c58f37a10198054c656c28202fb1e6d22fd505 upstream.
+
+With 2.6.30, the error handling code in cdrom_newpc_intr was changed
+to deal with partial request failures by normally completing the 'good'
+parts of a request and only 'error' the last (and presumably,
+incompletely transferred) bio associated with a particular
+request. In order to do this, ide_complete_rq is called over
+ide_cd_error_cmd() to partially complete the rq. The block layer
+does partial completion only for requests with bio's and if the
+rq doesn't have one (eg 'GPCMD_READ_DISC_INFO') the request is
+completed as a whole and the drive->hwif->rq pointer set to NULL
+afterwards. When calling ide_complete_rq again to report
+the error, this null pointer is dereferenced, resulting in a kernel
+crash.
+
+This fixes http://bugzilla.kernel.org/show_bug.cgi?id=13399.
+
+Signed-off-by: Rainer Weikusat
+Signed-off-by: Borislav Petkov
+Signed-off-by: Bartlomiej Zolnierkiewicz
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/ide/ide-cd.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/ide/ide-cd.c
++++ b/drivers/ide/ide-cd.c
+@@ -758,7 +758,7 @@ out_end:
+ 		rq->errors = -EIO;
+ 	}
+ 
+-	if (uptodate == 0)
++	if (uptodate == 0 && rq->bio)
+ 		ide_cd_error_cmd(drive, cmd);
+ 
+ 	/* make sure it's fully ended */
diff --git a/queue-2.6.30/kvm-x86-silence-preempt-warning-on-kvm_write_guest_time.patch b/queue-2.6.30/kvm-x86-silence-preempt-warning-on-kvm_write_guest_time.patch
new file mode 100644
index 00000000000..2b9fff857aa
--- /dev/null
+++ b/queue-2.6.30/kvm-x86-silence-preempt-warning-on-kvm_write_guest_time.patch
@@ -0,0 +1,69 @@
+From 2dea4c84bc936731668b5a7a9fba5b436a422668 Mon Sep 17 00:00:00 2001
+From: Matt T. Yourst
+Date: Tue, 24 Feb 2009 15:28:00 -0300
+Subject: KVM: x86: silence preempt warning on kvm_write_guest_time
+
+From: Matt T. Yourst
+
+commit 2dea4c84bc936731668b5a7a9fba5b436a422668 upstream.
+
+This issue just appeared in kvm-84 when running on 2.6.28.7 (x86-64)
+with PREEMPT enabled.
+
+We're getting syslog warnings like this many (but not all) times qemu
+tells KVM to run the VCPU:
+
+BUG: using smp_processor_id() in preemptible [00000000] code:
+qemu-system-x86/28938
+caller is kvm_arch_vcpu_ioctl_run+0x5d1/0xc70 [kvm]
+Pid: 28938, comm: qemu-system-x86 2.6.28.7-mtyrel-64bit
+Call Trace:
+debug_smp_processor_id+0xf7/0x100
+kvm_arch_vcpu_ioctl_run+0x5d1/0xc70 [kvm]
+? __wake_up+0x4e/0x70
+? wake_futex+0x27/0x40
+kvm_vcpu_ioctl+0x2e9/0x5a0 [kvm]
+enqueue_hrtimer+0x8a/0x110
+_spin_unlock_irqrestore+0x27/0x50
+vfs_ioctl+0x31/0xa0
+do_vfs_ioctl+0x74/0x480
+sys_futex+0xb4/0x140
+sys_ioctl+0x99/0xa0
+system_call_fastpath+0x16/0x1b
+
+As it turns out, the call trace is messed up due to gcc's inlining, but
+I isolated the problem anyway: kvm_write_guest_time() is being used in a
+non-thread-safe manner on preemptible kernels.
+
+Basically kvm_write_guest_time()'s body needs to be surrounded by
+preempt_disable() and preempt_enable(), since the kernel won't let us
+query any per-CPU data (indirectly using smp_processor_id()) without
+preemption disabled. The attached patch fixes this issue by disabling
+preemption inside kvm_write_guest_time().
+
+[marcelo: surround only __get_cpu_var calls since the warning
+is harmless]
+
+Signed-off-by: Marcelo Tosatti
+Signed-off-by: Avi Kivity
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/x86/kvm/x86.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -634,10 +634,12 @@ static void kvm_write_guest_time(struct
+ 	if ((!vcpu->time_page))
+ 		return;
+ 
++	preempt_disable();
+ 	if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) {
+ 		kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock);
+ 		vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz);
+ 	}
++	preempt_enable();
+ 
+ 	/* Keep irq disabled to prevent changes to the clock */
+ 	local_irq_save(flags);
diff --git a/queue-2.6.30/lib-genalloc.c-remove-unmatched-write_lock-in-gen_pool_destroy.patch b/queue-2.6.30/lib-genalloc.c-remove-unmatched-write_lock-in-gen_pool_destroy.patch
new file mode 100644
index 00000000000..1b43bb5f30f
--- /dev/null
+++ b/queue-2.6.30/lib-genalloc.c-remove-unmatched-write_lock-in-gen_pool_destroy.patch
@@ -0,0 +1,36 @@
+From 8e8a2dea0ca91fe2cb7de7ea212124cfe8c82c35 Mon Sep 17 00:00:00 2001
+From: Zygo Blaxell
+Date: Tue, 16 Jun 2009 15:33:57 -0700
+Subject: lib/genalloc.c: remove unmatched write_lock() in gen_pool_destroy
+
+From: Zygo Blaxell
+
+commit 8e8a2dea0ca91fe2cb7de7ea212124cfe8c82c35 upstream.
+
+There is a call to write_lock() in gen_pool_destroy which is not balanced
+by any corresponding write_unlock(). This causes problems with preemption
+because the preemption-disable counter is incremented in the write_lock()
+call, but never decremented by any call to write_unlock(). The bug is
+rarely triggered in practice, because very few in-tree drivers call
+gen_pool_destroy, and one of them is non-x86 arch-specific code.
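+
+A simplified sketch of the imbalance (an editor's illustration, not the
+actual code):
+
+	write_lock(&pool->lock);	/* preempt count goes up */
+	/* ... all chunks are freed, the pool itself is freed ... */
+	/* no write_unlock() anywhere, so the count never comes down */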
+
+Signed-off-by: Zygo Blaxell
+Cc: Jiri Kosina
+Cc: Steve Wise
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ lib/genalloc.c | 1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/lib/genalloc.c
++++ b/lib/genalloc.c
+@@ -85,7 +85,6 @@ void gen_pool_destroy(struct gen_pool *p
+ 	int bit, end_bit;
+ 
+ 
+-	write_lock(&pool->lock);
+ 	list_for_each_safe(_chunk, _next_chunk, &pool->chunks) {
+ 		chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk);
+ 		list_del(&chunk->next_chunk);
diff --git a/queue-2.6.30/ocfs2-fix-ocfs2_osb_dump.patch b/queue-2.6.30/ocfs2-fix-ocfs2_osb_dump.patch
new file mode 100644
index 00000000000..f2961e3261f
--- /dev/null
+++ b/queue-2.6.30/ocfs2-fix-ocfs2_osb_dump.patch
@@ -0,0 +1,83 @@
+From c3d38840abaa45c1c5a5fabbb8ffc9a0d1a764d1 Mon Sep 17 00:00:00 2001
+From: Sunil Mushran
+Date: Fri, 19 Jun 2009 14:45:55 -0700
+Subject: ocfs2: Fix ocfs2_osb_dump()
+
+From: Sunil Mushran
+
+commit c3d38840abaa45c1c5a5fabbb8ffc9a0d1a764d1 upstream.
+
+Skip printing information that is not valid for local mounts.
+
+Signed-off-by: Sunil Mushran
+Signed-off-by: Joel Becker
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/ocfs2/super.c | 32 ++++++++++++++++++--------------
+ 1 file changed, 18 insertions(+), 14 deletions(-)
+
+--- a/fs/ocfs2/super.c
++++ b/fs/ocfs2/super.c
+@@ -232,20 +232,24 @@ static int ocfs2_osb_dump(struct ocfs2_s
+ 			"%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount",
+ 			osb->s_mount_opt, osb->s_atime_quantum);
+ 
+-	out += snprintf(buf + out, len - out,
+-			"%10s => Stack: %s Name: %*s Version: %d.%d\n",
+-			"Cluster",
+-			(*osb->osb_cluster_stack == '\0' ?
+-			 "o2cb" : osb->osb_cluster_stack),
+-			cconn->cc_namelen, cconn->cc_name,
+-			cconn->cc_version.pv_major, cconn->cc_version.pv_minor);
++	if (cconn) {
++		out += snprintf(buf + out, len - out,
++				"%10s => Stack: %s Name: %*s "
++				"Version: %d.%d\n", "Cluster",
++				(*osb->osb_cluster_stack == '\0' ?
++				 "o2cb" : osb->osb_cluster_stack),
++				cconn->cc_namelen, cconn->cc_name,
++				cconn->cc_version.pv_major,
++				cconn->cc_version.pv_minor);
++	}
+ 
+ 	spin_lock(&osb->dc_task_lock);
+ 	out += snprintf(buf + out, len - out,
+ 			"%10s => Pid: %d Count: %lu WakeSeq: %lu "
+ 			"WorkSeq: %lu\n", "DownCnvt",
+-			task_pid_nr(osb->dc_task), osb->blocked_lock_count,
+-			osb->dc_wake_sequence, osb->dc_work_sequence);
++			(osb->dc_task ? task_pid_nr(osb->dc_task) : -1),
++			osb->blocked_lock_count, osb->dc_wake_sequence,
++			osb->dc_work_sequence);
+ 	spin_unlock(&osb->dc_task_lock);
+ 
+ 	spin_lock(&osb->osb_lock);
+@@ -265,14 +269,15 @@ static int ocfs2_osb_dump(struct ocfs2_s
+ 
+ 	out += snprintf(buf + out, len - out,
+ 			"%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit",
+-			task_pid_nr(osb->commit_task), osb->osb_commit_interval,
++			(osb->commit_task ? task_pid_nr(osb->commit_task) : -1),
++			osb->osb_commit_interval,
+ 			atomic_read(&osb->needs_checkpoint));
+ 
+ 	out += snprintf(buf + out, len - out,
+-			"%10s => State: %d NumTxns: %d TxnId: %lu\n",
++			"%10s => State: %d TxnId: %lu NumTxns: %d\n",
+ 			"Journal", osb->journal->j_state,
+-			atomic_read(&osb->journal->j_num_trans),
+-			osb->journal->j_trans_id);
++			osb->journal->j_trans_id,
++			atomic_read(&osb->journal->j_num_trans));
+ 
+ 	out += snprintf(buf + out, len - out,
+ 			"%10s => GlobalAllocs: %d LocalAllocs: %d "
+@@ -300,7 +305,6 @@ static int ocfs2_osb_dump(struct ocfs2_s
+ 
+ 	out += snprintf(buf + out, len - out, "%10s => %3s %10s\n",
+ 			"Slots", "Num", "RecoGen");
+-
+ 	for (i = 0; i < osb->max_slots; ++i) {
+ 		out += snprintf(buf + out, len - out,
+ 				"%10s %c %3d %10d\n",
diff --git a/queue-2.6.30/serial-bfin_5xx-fix-building-as-module-when-early-printk-is-enabled.patch b/queue-2.6.30/serial-bfin_5xx-fix-building-as-module-when-early-printk-is-enabled.patch
new file mode 100644
index 00000000000..30d59abde8b
--- /dev/null
+++ b/queue-2.6.30/serial-bfin_5xx-fix-building-as-module-when-early-printk-is-enabled.patch
@@ -0,0 +1,36 @@
+From 607c268ef9a4675287e77f732071e426e62c2d86 Mon Sep 17 00:00:00 2001
+From: Mike Frysinger
+Date: Mon, 22 Jun 2009 18:41:47 +0100
+Subject: serial: bfin_5xx: fix building as module when early printk is enabled
+
+From: Mike Frysinger
+
+commit 607c268ef9a4675287e77f732071e426e62c2d86 upstream.
+
+Since early printk only makes sense/works when the serial driver is built
+into the kernel, disable the option for this driver when it is going to be
+built as a module. Otherwise we get build failures due to the ifdef
+handling.
+
+Signed-off-by: Mike Frysinger
+Signed-off-by: Alan Cox
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/serial/bfin_5xx.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/drivers/serial/bfin_5xx.c
++++ b/drivers/serial/bfin_5xx.c
+@@ -38,6 +38,10 @@
+ #include
+ #endif
+ 
++#ifdef CONFIG_SERIAL_BFIN_MODULE
++# undef CONFIG_EARLY_PRINTK
++#endif
++
+ /* UART name and device definitions */
+ #define BFIN_SERIAL_NAME	"ttyBF"
+ #define BFIN_SERIAL_MAJOR	204
diff --git a/queue-2.6.30/series b/queue-2.6.30/series
index 8dddc540b59..4e706752b4a 100644
--- a/queue-2.6.30/series
+++ b/queue-2.6.30/series
@@ -97,3 +97,12 @@ dm-sysfs-skip-output-when-device-is-being-destroyed.patch
 dm-mpath-flush-keventd-queue-in-destructor.patch
 dm-exception-store-fix-exstore-lookup-to-be-case-insensitive.patch
 dm-use-i_size_read.patch
+vmscan-properly-account-for-the-number-of-page-cache-pages-zone_reclaim-can-reclaim.patch
+vmscan-count-the-number-of-times-zone_reclaim-scans-and-fails.patch
+lib-genalloc.c-remove-unmatched-write_lock-in-gen_pool_destroy.patch
+config_file_locking-should-not-depend-on-config_block.patch
+serial-bfin_5xx-fix-building-as-module-when-early-printk-is-enabled.patch
+ocfs2-fix-ocfs2_osb_dump.patch
+ide-cd-prevent-null-pointer-deref-via-cdrom_newpc_intr.patch
+drm-i915-correct-suspend-resume-ordering.patch
+kvm-x86-silence-preempt-warning-on-kvm_write_guest_time.patch
diff --git a/queue-2.6.30/vmscan-count-the-number-of-times-zone_reclaim-scans-and-fails.patch b/queue-2.6.30/vmscan-count-the-number-of-times-zone_reclaim-scans-and-fails.patch
new file mode 100644
index 00000000000..5fb8ae04313
--- /dev/null
+++ b/queue-2.6.30/vmscan-count-the-number-of-times-zone_reclaim-scans-and-fails.patch
@@ -0,0 +1,75 @@
+From 24cf72518c79cdcda486ed26074ff8151291cf65 Mon Sep 17 00:00:00 2001
+From: Mel Gorman
+Date: Tue, 16 Jun 2009 15:33:23 -0700
+Subject: vmscan: count the number of times zone_reclaim() scans and fails
+
+From: Mel Gorman
+
+commit 24cf72518c79cdcda486ed26074ff8151291cf65 upstream.
+
+On NUMA machines, the administrator can configure zone_reclaim_mode, which
+is a more targeted form of direct reclaim. On machines with large NUMA
+distances, for example, zone_reclaim_mode defaults to 1, meaning that
+clean unmapped pages will be reclaimed if the zone watermarks are not
+being met.
+
+There is a heuristic that determines if the scan is worthwhile but it is
+possible that the heuristic will fail and the CPU gets tied up scanning
+uselessly. Detecting the situation requires some guesswork and
+experimentation, so this patch adds a counter "zreclaim_failed" to
+/proc/vmstat. If during high CPU utilisation this counter is increasing
+rapidly, then the resolution to the problem may be to set
+/proc/sys/vm/zone_reclaim_mode to 0.
+
+[akpm@linux-foundation.org: name things consistently]
+Signed-off-by: Mel Gorman
+Reviewed-by: Rik van Riel
+Cc: Christoph Lameter
+Reviewed-by: KOSAKI Motohiro
+Cc: Wu Fengguang
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ include/linux/vmstat.h | 3 +++
+ mm/vmscan.c | 3 +++
+ mm/vmstat.c | 3 +++
+ 3 files changed, 9 insertions(+)
+
+--- a/include/linux/vmstat.h
++++ b/include/linux/vmstat.h
+@@ -36,6 +36,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PS
+ 		FOR_ALL_ZONES(PGSTEAL),
+ 		FOR_ALL_ZONES(PGSCAN_KSWAPD),
+ 		FOR_ALL_ZONES(PGSCAN_DIRECT),
++#ifdef CONFIG_NUMA
++		PGSCAN_ZONE_RECLAIM_FAILED,
++#endif
+ 		PGINODESTEAL, SLABS_SCANNED, KSWAPD_STEAL, KSWAPD_INODESTEAL,
+ 		PAGEOUTRUN, ALLOCSTALL, PGROTATED,
+ #ifdef CONFIG_HUGETLB_PAGE
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2452,6 +2452,9 @@ int zone_reclaim(struct zone *zone, gfp_
+ 	ret = __zone_reclaim(zone, gfp_mask, order);
+ 	zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
+ 
++	if (!ret)
++		count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
++
+ 	return ret;
+ }
+ #endif
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -675,6 +675,9 @@ static const char * const vmstat_text[]
+ 	TEXTS_FOR_ZONES("pgscan_kswapd")
+ 	TEXTS_FOR_ZONES("pgscan_direct")
+ 
++#ifdef CONFIG_NUMA
++	"zone_reclaim_failed",
++#endif
+ 	"pginodesteal",
+ 	"slabs_scanned",
+ 	"kswapd_steal",
diff --git a/queue-2.6.30/vmscan-properly-account-for-the-number-of-page-cache-pages-zone_reclaim-can-reclaim.patch b/queue-2.6.30/vmscan-properly-account-for-the-number-of-page-cache-pages-zone_reclaim-can-reclaim.patch
new file mode 100644
index 00000000000..c406d0fbda7
--- /dev/null
+++ b/queue-2.6.30/vmscan-properly-account-for-the-number-of-page-cache-pages-zone_reclaim-can-reclaim.patch
@@ -0,0 +1,185 @@
+From 90afa5de6f3fa89a733861e843377302479fcf7e Mon Sep 17 00:00:00 2001
+From: Mel Gorman
+Date: Tue, 16 Jun 2009 15:33:20 -0700
+Subject: vmscan: properly account for the number of page cache pages zone_reclaim() can reclaim
+
+From: Mel Gorman
+
+commit 90afa5de6f3fa89a733861e843377302479fcf7e upstream.
+
+A bug was brought to my attention against a distro kernel but it affects
+mainline and I believe problems like this have been reported in various
+guises on the mailing lists although I don't have specific examples at the
+moment.
+
+The reported problem was that malloc() stalled for a long time (minutes in
+some cases) if a large tmpfs mount was occupying a large percentage of
+memory overall. The pages did not get cleaned or reclaimed by
+zone_reclaim() because the zone_reclaim_mode was unsuitable, but the lists
+are uselessly scanned frequently, making the CPU spin at near 100%.
+
+This patchset intends to address that bug and bring the behaviour of
+zone_reclaim() more in line with the expectations that surfaced during
+investigation. It is based on top of mmotm and takes advantage of
+Kosaki's work with respect to zone_reclaim().
+
+Patch 1 fixes the heuristics that zone_reclaim() uses to determine if the
+   scan should go ahead. The broken heuristic is what was causing the
+   malloc() stall as it uselessly scanned the LRU constantly. Currently,
+   zone_reclaim is assuming zone_reclaim_mode is 1 and historically it
+   could not deal with tmpfs pages at all. This fixes up the heuristic so
+   that an unnecessary scan is more likely to be correctly avoided.
+
+Patch 2 notes that zone_reclaim() returning a failure automatically means
+   the zone is marked full. This is not always true. It could have
+   failed because the GFP mask or zone_reclaim_mode were unsuitable.
+
+Patch 3 introduces a counter zreclaim_failed that will increment each
+   time the zone_reclaim scan-avoidance heuristics fail. If that
+   counter is rapidly increasing, then zone_reclaim_mode should be
+   set to 0 as a temporary resolution and a bug reported, because
+   the scan-avoidance heuristic is still broken.
+
+This patch:
+
+On NUMA machines, the administrator can configure zone_reclaim_mode, which
+is a more targeted form of direct reclaim. On machines with large NUMA
+distances, for example, zone_reclaim_mode defaults to 1, meaning that
+clean unmapped pages will be reclaimed if the zone watermarks are not
+being met.
+
+There is a heuristic that determines if the scan is worthwhile but the
+problem is that the heuristic is not being properly applied and is
+basically assuming zone_reclaim_mode is 1 if it is enabled. The lack of
+proper detection can manifest as high CPU usage as the LRU list is scanned
+uselessly.
+
+Historically, once enabled it was depending on NR_FILE_PAGES which may
+include swapcache pages that the reclaim_mode cannot deal with. Patch
+vmscan-change-the-number-of-the-unmapped-files-in-zone-reclaim.patch by
+Kosaki Motohiro noted that zone_page_state(zone, NR_FILE_PAGES) included
+pages that were not file-backed such as swapcache and made a calculation
+based on the inactive, active and mapped files. This is far superior when
+zone_reclaim==1 but if RECLAIM_SWAP is set, then NR_FILE_PAGES is a
+reasonable starting figure.
+
+This patch alters how zone_reclaim() works out how many pages it might be
+able to reclaim given the current reclaim_mode. If RECLAIM_SWAP is set in
+the reclaim_mode it will consider NR_FILE_PAGES as potential candidates;
+otherwise it uses NR_INACTIVE_FILE + NR_ACTIVE_FILE - NR_FILE_MAPPED to
+discount swapcache and other non-file-backed pages. If RECLAIM_WRITE is
+not set, then NR_FILE_DIRTY pages are not candidates. If RECLAIM_SWAP is
+not set, then NR_FILE_MAPPED pages are not.
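+
+Worked example with made-up numbers (an editor's illustration): on a
+zone with 1000 file LRU pages, of which 300 are mapped and 100 are
+dirty, a reclaim_mode without RECLAIM_SWAP and without RECLAIM_WRITE
+would consider (1000 - 300) - 100 = 600 pages as reclaimable page cache.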
+ +[kosaki.motohiro@jp.fujitsu.com: Estimate unmapped pages minus tmpfs pages] +[fengguang.wu@intel.com: Fix underflow problem in Kosaki's estimate] +Signed-off-by: Mel Gorman +Reviewed-by: Rik van Riel +Acked-by: Christoph Lameter +Cc: KOSAKI Motohiro +Cc: Wu Fengguang +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + Documentation/sysctl/vm.txt | 12 ++++++---- + mm/vmscan.c | 52 ++++++++++++++++++++++++++++++++++++++------ + 2 files changed, 53 insertions(+), 11 deletions(-) + +--- a/Documentation/sysctl/vm.txt ++++ b/Documentation/sysctl/vm.txt +@@ -314,10 +314,14 @@ min_unmapped_ratio: + + This is available only on NUMA kernels. + +-A percentage of the total pages in each zone. Zone reclaim will only +-occur if more than this percentage of pages are file backed and unmapped. +-This is to insure that a minimal amount of local pages is still available for +-file I/O even if the node is overallocated. ++This is a percentage of the total pages in each zone. Zone reclaim will ++only occur if more than this percentage of pages are in a state that ++zone_reclaim_mode allows to be reclaimed. ++ ++If zone_reclaim_mode has the value 4 OR'd, then the percentage is compared ++against all file-backed unmapped pages including swapcache pages and tmpfs ++files. Otherwise, only unmapped pages backed by normal files but not tmpfs ++files and similar are considered. + + The default is 1 percent. + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2290,6 +2290,48 @@ int sysctl_min_unmapped_ratio = 1; + */ + int sysctl_min_slab_ratio = 5; + ++static inline unsigned long zone_unmapped_file_pages(struct zone *zone) ++{ ++ unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); ++ unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + ++ zone_page_state(zone, NR_ACTIVE_FILE); ++ ++ /* ++ * It's possible for there to be more file mapped pages than ++ * accounted for by the pages on the file LRU lists because ++ * tmpfs pages accounted for as ANON can also be FILE_MAPPED ++ */ ++ return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; ++} ++ ++/* Work out how many page cache pages we can reclaim in this reclaim_mode */ ++static long zone_pagecache_reclaimable(struct zone *zone) ++{ ++ long nr_pagecache_reclaimable; ++ long delta = 0; ++ ++ /* ++ * If RECLAIM_SWAP is set, then all file pages are considered ++ * potentially reclaimable. Otherwise, we have to worry about ++ * pages like swapcache and zone_unmapped_file_pages() provides ++ * a better estimate ++ */ ++ if (zone_reclaim_mode & RECLAIM_SWAP) ++ nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); ++ else ++ nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); ++ ++ /* If we can't clean pages, remove dirty pages from consideration */ ++ if (!(zone_reclaim_mode & RECLAIM_WRITE)) ++ delta += zone_page_state(zone, NR_FILE_DIRTY); ++ ++ /* Watch for any possible underflows due to delta */ ++ if (unlikely(delta > nr_pagecache_reclaimable)) ++ delta = nr_pagecache_reclaimable; ++ ++ return nr_pagecache_reclaimable - delta; ++} ++ + /* + * Try to free up some pages from this zone through reclaim. 
+ */ +@@ -2324,9 +2366,7 @@ static int __zone_reclaim(struct zone *z + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + +- if (zone_page_state(zone, NR_FILE_PAGES) - +- zone_page_state(zone, NR_FILE_MAPPED) > +- zone->min_unmapped_pages) { ++ if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { + /* + * Free memory by calling shrink zone with increasing + * priorities until we have enough memory freed. +@@ -2384,10 +2424,8 @@ int zone_reclaim(struct zone *zone, gfp_ + * if less than a specified percentage of the zone is used by + * unmapped file backed pages. + */ +- if (zone_page_state(zone, NR_FILE_PAGES) - +- zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages +- && zone_page_state(zone, NR_SLAB_RECLAIMABLE) +- <= zone->min_slab_pages) ++ if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && ++ zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) + return 0; + + if (zone_is_all_unreclaimable(zone))