From 4929a4e6faa0f13289a67cae98139e727f0d4a97 Mon Sep 17 00:00:00 2001
From: Xuewei Zhang <xueweiz@google.com>
Date: Thu, 3 Oct 2019 17:12:43 -0700
Subject: sched/fair: Scale bandwidth quota and period without losing quota/period ratio precision

From: Xuewei Zhang <xueweiz@google.com>

commit 4929a4e6faa0f13289a67cae98139e727f0d4a97 upstream.

The quota/period ratio is used to ensure a child task group won't get
more bandwidth than the parent task group, and is calculated as:

    normalized_cfs_quota() = [(quota_us << 20) / period_us]

If the quota/period ratio is changed by the scale-up in
sched_cfs_period_timer() due to precision loss, it will cause an
inconsistency between parent and child task groups.

See the example below:

A userspace container manager (kubelet) does three operations:

 1) Create a parent cgroup, and set its quota to 1,000us and period to 10,000us.
 2) Create a few child cgroups.
 3) Set quota to 1,000us and period to 10,000us on a child cgroup.

These operations are expected to succeed. However, if the 147/128
scale-up happens before step 3, the quota and period of the parent
cgroup are changed:

     new_quota:  1148437ns,  1148us
    new_period: 11484375ns, 11484us

When step 3 comes in, the ratio of the child cgroup will be 104857,
which is larger than the parent cgroup's ratio (104821), so the
operation will fail.

Scaling quota and period by the same factor of 2 keeps the ratio
intact and fixes the problem.
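
For reference, the arithmetic above can be checked with a minimal
standalone userspace sketch (this is not kernel code; normalized() is
a made-up helper mirroring normalized_cfs_quota(), and the values are
taken from the example above):

    #include <stdio.h>
    #include <stdint.h>

    /* Hypothetical mirror of normalized_cfs_quota(): (quota_us << 20) / period_us */
    static uint64_t normalized(uint64_t quota_us, uint64_t period_us)
    {
            return (quota_us << 20) / period_us;
    }

    int main(void)
    {
            uint64_t quota_ns = 1000000, period_ns = 10000000;

            /* old behaviour: period *= 147/128, quota rescaled to match */
            uint64_t p147 = period_ns * 147 / 128;       /* 11484375ns -> 11484us */
            uint64_t q147 = quota_ns * p147 / period_ns; /*  1148437ns ->  1148us */

            /* child cgroup set to 1000us/10000us */
            printf("child:            %llu\n",
                   (unsigned long long)normalized(1000, 10000));              /* 104857 */
            /* parent after 147/128 scaling: ratio dropped below the child's */
            printf("parent (147/128): %llu\n",
                   (unsigned long long)normalized(q147 / 1000, p147 / 1000)); /* 104821 */
            /* parent after x2 scaling: ratio preserved exactly */
            printf("parent (x2):      %llu\n",
                   (unsigned long long)normalized(2000, 20000));              /* 104857 */
            return 0;
    }

Multiplying quota and period by the same integer factor cannot change
the ratio, which is why the x2 case matches the child's ratio exactly.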

Tested-by: Phil Auld <pauld@redhat.com>
Signed-off-by: Xuewei Zhang <xueweiz@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Phil Auld <pauld@redhat.com>
Cc: Anton Blanchard <anton@ozlabs.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Fixes: 2e8e19226398 ("sched/fair: Limit sched_cfs_period_timer() loop to avoid hard lockup")
Link: https://lkml.kernel.org/r/20191004001243.140897-1-xueweiz@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 kernel/sched/fair.c |   34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4391,20 +4391,28 @@ static enum hrtimer_restart sched_cfs_pe
 		if (++count > 3) {
 			u64 new, old = ktime_to_ns(cfs_b->period);
 
-			new = (old * 147) / 128; /* ~115% */
-			new = min(new, max_cfs_quota_period);
+			/*
+			 * Grow period by a factor of 2 to avoid losing precision.
+			 * Precision loss in the quota/period ratio can cause __cfs_schedulable
+			 * to fail.
+			 */
+			new = old * 2;
+			if (new < max_cfs_quota_period) {
+				cfs_b->period = ns_to_ktime(new);
+				cfs_b->quota *= 2;
 
-			cfs_b->period = ns_to_ktime(new);
-
-			/* since max is 1s, this is limited to 1e9^2, which fits in u64 */
-			cfs_b->quota *= new;
-			cfs_b->quota = div64_u64(cfs_b->quota, old);
-
-			pr_warn_ratelimited(
-	"cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
-				smp_processor_id(),
-				div_u64(new, NSEC_PER_USEC),
-				div_u64(cfs_b->quota, NSEC_PER_USEC));
+				pr_warn_ratelimited(
+	"cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+					smp_processor_id(),
+					div_u64(new, NSEC_PER_USEC),
+					div_u64(cfs_b->quota, NSEC_PER_USEC));
+			} else {
+				pr_warn_ratelimited(
+	"cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+					smp_processor_id(),
+					div_u64(old, NSEC_PER_USEC),
+					div_u64(cfs_b->quota, NSEC_PER_USEC));
+			}
 
 			/* reset count so we don't come right back in here */
 			count = 0;