Subject: sched: revert back to per-rq vruntime
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
References: 437171 - LTC47404

Vatsa rightly points out that having the runqueue weight in the vruntime
calculations can cause unfairness in the face of task joins/leaves.

Suppose: dv = dt * rw / w

Then take 10 tasks t_n, each of equal weight. If the first one runs for 1 time
unit, its vruntime will increase by 10. Now, if the next 8 tasks leave after
having run their 1 unit each, then the last task will get a vruntime increase
of only 2 after having run its 1 unit.

That leaves us with 2 tasks of equal weight and equal runtime, one of which
will not be scheduled for 8/2 = 4 units of time.
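
Spelling out the arithmetic (every task has weight w, so initially rw = 10w):

  first task runs 1:            dv = 1 * 10w / w = 10
  last task runs 1 (rw = 2w):   dv = 1 *  2w / w =  2
  vruntime lag between them:    10 - 2 = 8
  time to close the lag:        dv/dt = 2, so dt = 8/2 = 4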

Ergo, we cannot do that and must use: dv = dt / w.

This means we cannot have a global vruntime based on effective priority, but
must instead go back to the per-rq vruntime model we started out with.
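
Purely as an illustration (not part of the patch), a user-space C sketch that
reproduces the numbers above under both update rules, assuming unit weights
and the 10-task scenario described earlier:

#include <stdio.h>

int main(void)
{
	double w = 1.0;		/* all tasks have the same weight */
	double dt = 1.0;	/* each task runs for 1 unit of wall time */
	double v_first, v_last;

	/* old rule: dv = dt * rw / w, with rw the runqueue weight */
	v_first = dt * (10.0 * w) / w;	/* first task, 10 tasks queued */
	v_last  = dt * ( 2.0 * w) / w;	/* last task, only 2 tasks left */
	printf("old: v_first=%2.0f v_last=%2.0f lag=%2.0f\n",
	       v_first, v_last, v_first - v_last);

	/* new rule: dv = dt / w, independent of who else is queued */
	v_first = dt / w;
	v_last  = dt / w;
	printf("new: v_first=%2.0f v_last=%2.0f lag=%2.0f\n",
	       v_first, v_last, v_first - v_last);

	return 0;
}

With the old rule the two remaining tasks end up 8 vruntime units apart even
though they ran for the same wall time; with the new rule they stay in sync.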

This patch was lightly tested by starting while loops on each nice level and
observing their execution time, and by running a simple 1:2:3 group scenario
pinned to a single cpu.

Reported-by: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Olaf Hering <olh@suse.de>
---
 kernel/sched_fair.c |   32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -334,7 +334,7 @@ int sched_nr_latency_handler(struct ctl_
 #endif
 
 /*
- * delta *= w / rw
+ * delta *= P[w / rw]
  */
 static inline unsigned long
 calc_delta_weight(unsigned long delta, struct sched_entity *se)
@@ -348,15 +348,13 @@ calc_delta_weight(unsigned long delta, s
 }
 
 /*
- * delta *= rw / w
+ * delta /= w
  */
 static inline unsigned long
 calc_delta_fair(unsigned long delta, struct sched_entity *se)
 {
-	for_each_sched_entity(se) {
-		delta = calc_delta_mine(delta,
-				cfs_rq_of(se)->load.weight, &se->load);
-	}
+	if (unlikely(se->load.weight != NICE_0_LOAD))
+		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
 
 	return delta;
 }
@@ -386,26 +384,26 @@ static u64 __sched_period(unsigned long
  * We calculate the wall-time slice from the period by taking a part
  * proportional to the weight.
  *
- * s = p*w/rw
+ * s = p*P[w/rw]
  */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
+	unsigned long nr_running = cfs_rq->nr_running;
+
+	if (unlikely(!se->on_rq))
+		nr_running++;
+
+	return calc_delta_weight(__sched_period(nr_running), se);
 }
 
 /*
  * We calculate the vruntime slice of a to be inserted task
  *
- * vs = s*rw/w = p
+ * vs = s/w
  */
-static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	unsigned long nr_running = cfs_rq->nr_running;
-
-	if (!se->on_rq)
-		nr_running++;
-
-	return __sched_period(nr_running);
+	return calc_delta_fair(sched_slice(cfs_rq, se), se);
 }
 
 /*
@@ -683,7 +681,7 @@ place_entity(struct cfs_rq *cfs_rq, stru
 	 * stays open at the end.
 	 */
 	if (initial && sched_feat(START_DEBIT))
-		vruntime += sched_vslice_add(cfs_rq, se);
+		vruntime += sched_vslice(cfs_rq, se);
 
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */