+++ /dev/null
-From f01f17d3705bb6081c9e5728078f64067982be36 Mon Sep 17 00:00:00 2001
-From: Michal Hocko <mhocko@suse.com>
-Date: Fri, 5 Feb 2016 15:36:24 -0800
-Subject: mm, vmstat: make quiet_vmstat lighter
-
-From: Michal Hocko <mhocko@suse.com>
-
-commit f01f17d3705bb6081c9e5728078f64067982be36 upstream.
-
-Mike has reported a considerable overhead of refresh_cpu_vm_stats from
-the idle entry during pipe test:
-
- 12.89% [kernel] [k] refresh_cpu_vm_stats.isra.12
- 4.75% [kernel] [k] __schedule
- 4.70% [kernel] [k] mutex_unlock
- 3.14% [kernel] [k] __switch_to
-
-This is caused by commit 0eb77e988032 ("vmstat: make vmstat_updater
-deferrable again and shut down on idle") which has placed quiet_vmstat
-into cpu_idle_loop. The main reason here seems to be that the idle
-entry has to get over all zones and perform atomic operations for each
-vmstat entry even though there might be no per cpu diffs. This is a
-pointless overhead for _each_ idle entry.
-
-Make sure that quiet_vmstat is as light as possible.
-
-First of all it doesn't make any sense to do any local sync if the
-current cpu is already set in oncpu_stat_off because vmstat_update puts
-itself there only if there is nothing to do.
-
-Then we can check need_update which should be a cheap way to check for
-potential per-cpu diffs and only then do refresh_cpu_vm_stats.
-
-The original patch also did cancel_delayed_work which we are not doing
-here. There are two reasons for that. Firstly cancel_delayed_work from
-idle context will blow up on RT kernels (reported by Mike):
-
- CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.5.0-rt3 #7
- Hardware name: MEDION MS-7848/MS-7848, BIOS M7848W08.20C 09/23/2013
- Call Trace:
- dump_stack+0x49/0x67
- ___might_sleep+0xf5/0x180
- rt_spin_lock+0x20/0x50
- try_to_grab_pending+0x69/0x240
- cancel_delayed_work+0x26/0xe0
- quiet_vmstat+0x75/0xa0
- cpu_idle_loop+0x38/0x3e0
- cpu_startup_entry+0x13/0x20
- start_secondary+0x114/0x140
-
-And secondly, even on !RT kernels it might add some non trivial overhead
-which is not necessary. Even if the vmstat worker wakes up and preempts
-idle then it will be most likely a single shot noop because the stats
-were already synced and so it would end up on the oncpu_stat_off anyway.
-We just need to teach both vmstat_shepherd and vmstat_update to stop
-scheduling the worker if there is nothing to do.
-
-[mgalbraith@suse.de: cancel pending work of the cpu_stat_off CPU]
-Signed-off-by: Michal Hocko <mhocko@suse.com>
-Reported-by: Mike Galbraith <umgwanakikbuti@gmail.com>
-Acked-by: Christoph Lameter <cl@linux.com>
-Signed-off-by: Mike Galbraith <mgalbraith@suse.de>
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
-Signed-off-by: Daniel Wagner <wagi@monom.org>
-
----
- mm/vmstat.c | 68 ++++++++++++++++++++++++++++++++++++++++--------------------
- 1 file changed, 46 insertions(+), 22 deletions(-)
-
---- a/mm/vmstat.c
-+++ b/mm/vmstat.c
-@@ -1395,10 +1395,15 @@ static void vmstat_update(struct work_st
- * Counters were updated so we expect more updates
- * to occur in the future. Keep on running the
- * update worker thread.
-+ * If we were marked on cpu_stat_off clear the flag
-+ * so that vmstat_shepherd doesn't schedule us again.
- */
-- queue_delayed_work_on(smp_processor_id(), vmstat_wq,
-- this_cpu_ptr(&vmstat_work),
-- round_jiffies_relative(sysctl_stat_interval));
-+ if (!cpumask_test_and_clear_cpu(smp_processor_id(),
-+ cpu_stat_off)) {
-+ queue_delayed_work_on(smp_processor_id(), vmstat_wq,
-+ this_cpu_ptr(&vmstat_work),
-+ round_jiffies_relative(sysctl_stat_interval));
-+ }
- } else {
- /*
- * We did not update any counters so the app may be in
-@@ -1426,18 +1431,6 @@ static void vmstat_update(struct work_st
- * until the diffs stay at zero. The function is used by NOHZ and can only be
- * invoked when tick processing is not active.
- */
--void quiet_vmstat(void)
--{
-- if (system_state != SYSTEM_RUNNING)
-- return;
--
-- do {
-- if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
-- cancel_delayed_work(this_cpu_ptr(&vmstat_work));
--
-- } while (refresh_cpu_vm_stats(false));
--}
--
- /*
- * Check if the diffs for a certain cpu indicate that
- * an update is needed.
-@@ -1461,6 +1454,30 @@ static bool need_update(int cpu)
- return false;
- }
-
-+void quiet_vmstat(void)
-+{
-+ if (system_state != SYSTEM_RUNNING)
-+ return;
-+
-+ /*
-+ * If we are already in hands of the shepherd then there
-+ * is nothing for us to do here.
-+ */
-+ if (cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
-+ return;
-+
-+ if (!need_update(smp_processor_id()))
-+ return;
-+
-+ /*
-+ * Just refresh counters and do not care about the pending delayed
-+ * vmstat_update. It doesn't fire that often to matter and canceling
-+ * it would be too expensive from this path.
-+ * vmstat_shepherd will take care about that for us.
-+ */
-+ refresh_cpu_vm_stats(false);
-+}
-+
-
- /*
- * Shepherd worker thread that checks the
-@@ -1478,18 +1495,25 @@ static void vmstat_shepherd(struct work_
-
- get_online_cpus();
- /* Check processors whose vmstat worker threads have been disabled */
-- for_each_cpu(cpu, cpu_stat_off)
-- if (need_update(cpu) &&
-- cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
--
-- queue_delayed_work_on(cpu, vmstat_wq,
-- &per_cpu(vmstat_work, cpu), 0);
-+ for_each_cpu(cpu, cpu_stat_off) {
-+ struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
-
-+ if (need_update(cpu)) {
-+ if (cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
-+ queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
-+ } else {
-+ /*
-+ * Cancel the work if quiet_vmstat has put this
-+ * cpu on cpu_stat_off because the work item might
-+ * be still scheduled
-+ */
-+ cancel_delayed_work(dw);
-+ }
-+ }
- put_online_cpus();
-
- schedule_delayed_work(&shepherd,
- round_jiffies_relative(sysctl_stat_interval));
--
- }
-
- static void __init start_shepherd_timer(void)