psi: Reduce calls to sched_clock() in psi
author    Shakeel Butt <shakeelb@google.com>
          Sun, 21 Mar 2021 20:51:56 +0000 (13:51 -0700)
committer Peter Zijlstra <peterz@infradead.org>
          Tue, 23 Mar 2021 15:01:58 +0000 (16:01 +0100)
We noticed that the cost of psi increases with the depth of the cgroup
hierarchy. In particular, the cost of cpu_clock() (a thin wrapper around
sched_clock()) stands out, because the kernel calls it once per level as
it traverses up the cgroup tree. This patch reduces the calls to
cpu_clock() to one per state change.
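
The fix is a classic hoisting optimization: read the clock once in the
callers (psi_task_change() and psi_task_switch()) and pass the timestamp
down through psi_group_change() to record_times(), instead of reading it
again at every cgroup level. A minimal userspace sketch of the pattern
follows; the names and the clock_gettime() stand-in are illustrative
only, not the actual psi code.

/* sketch.c - hoist a per-level clock read out of a hierarchy walk.
 * Hypothetical stand-ins: read_clock() plays the role of cpu_clock(),
 * record_times() the role of the per-group psi accounting.
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct group {
	uint64_t state_start;
	uint64_t total;
};

static uint64_t read_clock(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
}

/* As in the patched record_times(): the caller supplies `now`,
 * so this function never reads the clock itself. */
static void record_times(struct group *g, uint64_t now)
{
	g->total += now - g->state_start;
	g->state_start = now;
}

int main(void)
{
	struct group levels[3] = {{0}};	/* three cgroup levels, as benchmarked */
	uint64_t now = read_clock();	/* hoisted: one read per state change */
	int i;

	for (i = 0; i < 3; i++)		/* walk up the "hierarchy" */
		record_times(&levels[i], now);

	printf("recorded state change at %llu ns\n", (unsigned long long)now);
	return 0;
}

Besides saving N-1 clock reads per state change on an N-level hierarchy,
passing a single timestamp down the walk also gives every level the same
state_start for one state change, rather than slightly skewed readings.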

Ran perf bench on an Intel Broadwell machine with a 3-level cgroup
hierarchy.

Before the patch:

$ perf bench sched all
 # Running sched/messaging benchmark...
 # 20 sender and receiver processes per group
 # 10 groups == 400 processes run

     Total time: 0.747 [sec]

 # Running sched/pipe benchmark...
 # Executed 1000000 pipe operations between two processes

     Total time: 3.516 [sec]

       3.516689 usecs/op
         284358 ops/sec

After the patch:

$ perf bench sched all
 # Running sched/messaging benchmark...
 # 20 sender and receiver processes per group
 # 10 groups == 400 processes run

     Total time: 0.640 [sec]

 # Running sched/pipe benchmark...
 # Executed 1000000 pipe operations between two processes

     Total time: 3.329 [sec]

       3.329820 usecs/op
         300316 ops/sec
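
That is roughly a 14% improvement on sched/messaging (0.747 s -> 0.640 s)
and about a 5% improvement on sched/pipe (3.516 -> 3.329 usecs/op, i.e.
284358 -> 300316 ops/sec).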

Signed-off-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lkml.kernel.org/r/20210321205156.4186483-1-shakeelb@google.com
kernel/sched/psi.c

index c8480d785987b8dbb3c4c65ae9a26e82afbe2e95..b1b00e9bd7edc01e09cd15eec928e2e529804cac 100644
@@ -644,12 +644,10 @@ static void poll_timer_fn(struct timer_list *t)
        wake_up_interruptible(&group->poll_wait);
 }
 
-static void record_times(struct psi_group_cpu *groupc, int cpu)
+static void record_times(struct psi_group_cpu *groupc, u64 now)
 {
        u32 delta;
-       u64 now;
 
-       now = cpu_clock(cpu);
        delta = now - groupc->state_start;
        groupc->state_start = now;
 
@@ -676,7 +674,7 @@ static void record_times(struct psi_group_cpu *groupc, int cpu)
 }
 
 static void psi_group_change(struct psi_group *group, int cpu,
-                            unsigned int clear, unsigned int set,
+                            unsigned int clear, unsigned int set, u64 now,
                             bool wake_clock)
 {
        struct psi_group_cpu *groupc;
@@ -696,7 +694,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
         */
        write_seqcount_begin(&groupc->seq);
 
-       record_times(groupc, cpu);
+       record_times(groupc, now);
 
        for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
                if (!(m & (1 << t)))
@@ -788,12 +786,14 @@ void psi_task_change(struct task_struct *task, int clear, int set)
        struct psi_group *group;
        bool wake_clock = true;
        void *iter = NULL;
+       u64 now;
 
        if (!task->pid)
                return;
 
        psi_flags_change(task, clear, set);
 
+       now = cpu_clock(cpu);
        /*
         * Periodic aggregation shuts off if there is a period of no
         * task changes, so we wake it back up if necessary. However,
@@ -806,7 +806,7 @@ void psi_task_change(struct task_struct *task, int clear, int set)
                wake_clock = false;
 
        while ((group = iterate_groups(task, &iter)))
-               psi_group_change(group, cpu, clear, set, wake_clock);
+               psi_group_change(group, cpu, clear, set, now, wake_clock);
 }
 
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
@@ -815,6 +815,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
        struct psi_group *group, *common = NULL;
        int cpu = task_cpu(prev);
        void *iter;
+       u64 now = cpu_clock(cpu);
 
        if (next->pid) {
                bool identical_state;
@@ -836,7 +837,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
                                break;
                        }
 
-                       psi_group_change(group, cpu, 0, TSK_ONCPU, true);
+                       psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
                }
        }
 
@@ -858,7 +859,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 
                iter = NULL;
                while ((group = iterate_groups(prev, &iter)) && group != common)
-                       psi_group_change(group, cpu, clear, set, true);
+                       psi_group_change(group, cpu, clear, set, now, true);
 
                /*
                 * TSK_ONCPU is handled up to the common ancestor. If we're tasked
@@ -867,7 +868,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
                if (sleep) {
                        clear &= ~TSK_ONCPU;
                        for (; group; group = iterate_groups(prev, &iter))
-                               psi_group_change(group, cpu, clear, set, true);
+                               psi_group_change(group, cpu, clear, set, now, true);
                }
        }
 }