Subject: sched: revert back to per-rq vruntime
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
References: 437171 - LTC47404

Vatsa rightly points out that having the runqueue weight in the vruntime
calculations can cause unfairness in the face of task joins/leaves.

Suppose: dv = dt * rw / w

Then take 10 tasks t_n, each of equal weight w. If the first task runs
for 1 unit of time, its vruntime increases by 10 (since rw = 10w). Now,
if the next 8 tasks each run their 1 and leave, the last task runs with
rw = 2w and so gets a vruntime increase of only 2 for the same 1 unit
of runtime.

That leaves us with 2 tasks of equal weight and equal runtime, one of
which will not be scheduled for 8/2 = 4 units of time: the 8-unit
vruntime gap closes at rate rw/w = 2 while it runs alone.

Ergo, we cannot do that and must use: dv = dt / w.
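
As an aside (not part of the patch), a minimal user-space sketch that
replays the scenario above under the old rule dv = dt * rw / w, with
every weight set to 1 so that rw is just the number of runnable tasks:

#include <stdio.h>

int main(void)
{
	double w = 1.0;
	double rw = 10 * w;		/* 10 runnable tasks of weight 1 */

	double v_first = 1.0 * rw / w;	/* first task runs 1: dv = 10 */

	rw -= 8 * w;			/* tasks 2..9 run their 1 and leave */

	double v_last = 1.0 * rw / w;	/* last task runs 1: dv = 2 */

	/* equal weight and equal runtime, yet an 8-unit vruntime gap */
	printf("v_first=%.0f v_last=%.0f\n", v_first, v_last);
	return 0;
}

Under the fixed rule dv = dt / w both tasks would end with vruntime 1,
independent of who joined or left in between.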

This means we cannot have a global vruntime based on effective priority, but
must instead go back to the per-rq vruntime model we started out with.

This patch was lightly tested by starting while loops on each nice level
and observing their execution time, and by a simple group scenario of
1:2:3 pinned to a single cpu.
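
(Illustrative only: a hypothetical reconstruction of that smoke test,
one busy loop per nice level. Negative nice levels need CAP_SYS_NICE,
and the 1:2:3 group/pinning setup is omitted.)

#include <sys/resource.h>
#include <unistd.h>

int main(void)
{
	for (int nice_val = -20; nice_val <= 19; nice_val++) {
		if (fork() == 0) {
			setpriority(PRIO_PROCESS, 0, nice_val);
			for (;;)
				;	/* the while loop under test */
		}
	}
	pause();	/* leave the children spinning; watch them in top(1) */
	return 0;
}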

Reported-by: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Olaf Hering <olh@suse.de>
---
 kernel/sched_fair.c |   32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -334,7 +334,7 @@ int sched_nr_latency_handler(struct ctl_
 #endif
 
 /*
- * delta *= w / rw
+ * delta *= P[w / rw]
  */
 static inline unsigned long
 calc_delta_weight(unsigned long delta, struct sched_entity *se)
@@ -348,15 +348,13 @@ calc_delta_weight(unsigned long delta, s
 }
 
 /*
- * delta *= rw / w
+ * delta /= w
  */
 static inline unsigned long
 calc_delta_fair(unsigned long delta, struct sched_entity *se)
 {
-	for_each_sched_entity(se) {
-		delta = calc_delta_mine(delta,
-				cfs_rq_of(se)->load.weight, &se->load);
-	}
+	if (unlikely(se->load.weight != NICE_0_LOAD))
+		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
 
 	return delta;
 }
@@ -386,26 +384,26 @@ static u64 __sched_period(unsigned long
  * We calculate the wall-time slice from the period by taking a part
  * proportional to the weight.
  *
- * s = p*w/rw
+ * s = p*P[w/rw]
  */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
+	unsigned long nr_running = cfs_rq->nr_running;
+
+	if (unlikely(!se->on_rq))
+		nr_running++;
+
+	return calc_delta_weight(__sched_period(nr_running), se);
 }
 
 /*
  * We calculate the vruntime slice of a to be inserted task
  *
- * vs = s*rw/w = p
+ * vs = s/w
  */
-static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	unsigned long nr_running = cfs_rq->nr_running;
-
-	if (!se->on_rq)
-		nr_running++;
-
-	return __sched_period(nr_running);
+	return calc_delta_fair(sched_slice(cfs_rq, se), se);
 }
 
 /*
@@ -683,7 +681,7 @@ place_entity(struct cfs_rq *cfs_rq, stru
 	 * stays open at the end.
 	 */
 	if (initial && sched_feat(START_DEBIT))
-		vruntime += sched_vslice_add(cfs_rq, se);
+		vruntime += sched_vslice(cfs_rq, se);
 
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
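
As a closing illustration (again, not part of the patch): with the new
rule, calc_delta_fair() only rescales when the entity's weight differs
from NICE_0_LOAD, so vruntime advances at NICE_0_LOAD/weight times wall
time. A stand-alone sketch, assuming NICE_0_LOAD = 1024 and this
kernel's prio_to_weight[] values (nice -5 = 3121, nice 0 = 1024,
nice 5 = 335):

#include <stdio.h>

#define NICE_0_LOAD 1024ULL

/* Mirrors the new calc_delta_fair() rule dv = dt / w, with w measured
 * in NICE_0_LOAD units; the kernel's calc_delta_mine() performs the
 * same scaling with overflow-safe fixed-point arithmetic. */
static unsigned long long fair_delta(unsigned long long delta,
				     unsigned long long weight)
{
	if (weight != NICE_0_LOAD)
		delta = delta * NICE_0_LOAD / weight;
	return delta;
}

int main(void)
{
	unsigned long long dt = 10000000ULL;	/* 10ms of wall time, in ns */

	printf("nice -5: vruntime += %llu\n", fair_delta(dt, 3121));
	printf("nice  0: vruntime += %llu\n", fair_delta(dt, 1024));
	printf("nice  5: vruntime += %llu\n", fair_delta(dt, 335));
	return 0;
}

Heavier (lower-nice) entities accrue vruntime more slowly and therefore
get proportionally more wall time, without any reference to the total
runqueue weight rw.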