git.ipfire.org Git - thirdparty/kernel/stable-queue.git/blob

   1 From 4929a4e6faa0f13289a67cae98139e727f0d4a97 Mon Sep 17 00:00:00 2001
   2 From: Xuewei Zhang <xueweiz@google.com>
   3 Date: Thu, 3 Oct 2019 17:12:43 -0700
   4 Subject: sched/fair: Scale bandwidth quota and period without losing quota/period ratio precision
   5
   6 From: Xuewei Zhang <xueweiz@google.com>
   7
   8 commit 4929a4e6faa0f13289a67cae98139e727f0d4a97 upstream.
   9
  10 The quota/period ratio is used to ensure a child task group won't get
  11 more bandwidth than the parent task group, and is calculated as:
  12
  13   normalized_cfs_quota() = [(quota_us << 20) / period_us]
  14
  15 If the quota/period ratio changes due to precision loss when the period
  16 timer scales quota and period up (by 147/128), the parent and child task
  17 groups become inconsistent.
  18
  19 See below example:
  20
  21 A userspace container manager (kubelet) does three operations:
  22
  23  1) Create a parent cgroup, set quota to 1,000us and period to 10,000us.
  24  2) Create a few children cgroups.
  25  3) Set quota to 1,000us and period to 10,000us on a child cgroup.
  26
  27 These operations are expected to succeed. However, if the scaling of
  28 147/128 happens before step 3, quota and period of the parent cgroup
  29 will be changed:
  30
  31   new_quota: 1148437ns,   1148us
  32  new_period: 11484375ns, 11484us
  33
  34 When step 3 then comes in, the ratio of the child cgroup will be
  35 104857, which is larger than the parent cgroup ratio (104821), so
  36 step 3 will fail.
  37
  38 Scaling both quota and period by a factor of 2 instead will fix the problem.
  39
  40 Tested-by: Phil Auld <pauld@redhat.com>
  41 Signed-off-by: Xuewei Zhang <xueweiz@google.com>
  42 Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
  43 Acked-by: Phil Auld <pauld@redhat.com>
  44 Cc: Anton Blanchard <anton@ozlabs.org>
  45 Cc: Ben Segall <bsegall@google.com>
  46 Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
  47 Cc: Juri Lelli <juri.lelli@redhat.com>
  48 Cc: Linus Torvalds <torvalds@linux-foundation.org>
  49 Cc: Mel Gorman <mgorman@suse.de>
  50 Cc: Peter Zijlstra <peterz@infradead.org>
  51 Cc: Steven Rostedt <rostedt@goodmis.org>
  52 Cc: Thomas Gleixner <tglx@linutronix.de>
  53 Cc: Vincent Guittot <vincent.guittot@linaro.org>
  54 Fixes: 2e8e19226398 ("sched/fair: Limit sched_cfs_period_timer() loop to avoid hard lockup")
  55 Link: https://lkml.kernel.org/r/20191004001243.140897-1-xueweiz@google.com
  56 Signed-off-by: Ingo Molnar <mingo@kernel.org>
  57 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
  58
  59
  60 ---
  61  kernel/sched/fair.c |   34 +++++++++++++++++++++-------------
  62  1 file changed, 21 insertions(+), 13 deletions(-)
  63
  64 --- a/kernel/sched/fair.c
  65 +++ b/kernel/sched/fair.c
  66 @@ -4391,20 +4391,28 @@ static enum hrtimer_restart sched_cfs_pe
  67                 if (++count > 3) {
  68                         u64 new, old = ktime_to_ns(cfs_b->period);
  69
  70 -                       new = (old * 147) / 128; /* ~115% */
  71 -                       new = min(new, max_cfs_quota_period);
  72 +                       /*
  73 +                        * Grow period by a factor of 2 to avoid losing precision.
  74 +                        * Precision loss in the quota/period ratio can cause __cfs_schedulable
  75 +                        * to fail.
  76 +                        */
  77 +                       new = old * 2;
  78 +                       if (new < max_cfs_quota_period) {
  79 +                               cfs_b->period = ns_to_ktime(new);
  80 +                               cfs_b->quota *= 2;
  81
  82 -                       cfs_b->period = ns_to_ktime(new);
  83 -
  84 -                       /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
  85 -                       cfs_b->quota *= new;
  86 -                       cfs_b->quota = div64_u64(cfs_b->quota, old);
  87 -
  88 -                       pr_warn_ratelimited(
  89 -        "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
  90 -                               smp_processor_id(),
  91 -                               div_u64(new, NSEC_PER_USEC),
  92 -                                div_u64(cfs_b->quota, NSEC_PER_USEC));
  93 +                               pr_warn_ratelimited(
  94 +       "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
  95 +                                       smp_processor_id(),
  96 +                                       div_u64(new, NSEC_PER_USEC),
  97 +                                       div_u64(cfs_b->quota, NSEC_PER_USEC));
  98 +                       } else {
  99 +                               pr_warn_ratelimited(
 100 +       "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
 101 +                                       smp_processor_id(),
 102 +                                       div_u64(old, NSEC_PER_USEC),
 103 +                                       div_u64(cfs_b->quota, NSEC_PER_USEC));
 104 +                       }
 105
 106                         /* reset count so we don't come right back in here */
 107                         count = 0;