]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/blob
bedc7e3f0da7f50f317f8e92d87b033f711764d4
[thirdparty/kernel/stable-queue.git] /
1 From de53fd7aedb100f03e5d2231cfce0e4993282425 Mon Sep 17 00:00:00 2001
2 From: Dave Chiluk <chiluk+linux@indeed.com>
3 Date: Tue, 23 Jul 2019 11:44:26 -0500
4 Subject: sched/fair: Fix low cpu usage with high throttling by removing expiration of cpu-local slices
5
6 From: Dave Chiluk <chiluk+linux@indeed.com>
7
8 commit de53fd7aedb100f03e5d2231cfce0e4993282425 upstream.
9
10 It has been observed, that highly-threaded, non-cpu-bound applications
11 running under cpu.cfs_quota_us constraints can hit a high percentage of
12 periods throttled while simultaneously not consuming the allocated
13 amount of quota. This use case is typical of user-interactive non-cpu
14 bound applications, such as those running in kubernetes or mesos when
15 run on multiple cpu cores.
16
17 This has been root caused to cpu-local run queue being allocated per cpu
18 bandwidth slices, and then not fully using that slice within the period.
19 At that point the slice and quota expire. This expiration of unused
20 slice results in applications not being able to utilize the quota for
21 which they are allocated.
22
23 The non-expiration of per-cpu slices was recently fixed by
24 'commit 512ac999d275 ("sched/fair: Fix bandwidth timer clock drift
25 condition")'. Prior to that it appears that this had been broken since
26 at least 'commit 51f2176d74ac ("sched/fair: Fix unlocked reads of some
27 cfs_b->quota/period")' which was introduced in v3.16-rc1 in 2014. That
28 added the following conditional which resulted in slices never being
29 expired.
30
31 if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
32 /* extend local deadline, drift is bounded above by 2 ticks */
33 cfs_rq->runtime_expires += TICK_NSEC;
34
35 Because this was broken for nearly 5 years, and has recently been fixed
36 and is now being noticed by many users running kubernetes
37 (https://github.com/kubernetes/kubernetes/issues/67577) it is my opinion
38 that the mechanisms around expiring runtime should be removed
39 altogether.
40
41 This allows quota already allocated to per-cpu run-queues to live longer
42 than the period boundary. This allows threads on runqueues that do not
43 use much CPU to continue to use their remaining slice over a longer
44 period of time than cpu.cfs_period_us. However, this helps prevent the
45 above condition of hitting throttling while also not fully utilizing
46 your cpu quota.
47
48 This theoretically allows a machine to use slightly more than its
49 allotted quota in some periods. This overflow would be bounded by the
50 remaining quota left on each per-cpu runqueue. This is typically no
51 more than min_cfs_rq_runtime=1ms per cpu. For CPU bound tasks this will
52 change nothing, as they should theoretically fully utilize all of their
53 quota in each period. For user-interactive tasks as described above this
54 provides a much better user/application experience as their cpu
55 utilization will more closely match the amount they requested when they
56 hit throttling. This means that cpu limits no longer strictly apply per
57 period for non-cpu bound applications, but that they are still accurate
58 over longer timeframes.
59
60 This greatly improves performance of high-thread-count, non-cpu bound
61 applications with low cfs_quota_us allocation on high-core-count
62 machines. In the case of an artificial testcase (10ms/100ms of quota on
63 80 CPU machine), this commit resulted in almost 30x performance
64 improvement, while still maintaining correct cpu quota restrictions.
65 That testcase is available at https://github.com/indeedeng/fibtest.
66
67 Fixes: 512ac999d275 ("sched/fair: Fix bandwidth timer clock drift condition")
68 Signed-off-by: Dave Chiluk <chiluk+linux@indeed.com>
69 Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
70 Reviewed-by: Phil Auld <pauld@redhat.com>
71 Reviewed-by: Ben Segall <bsegall@google.com>
72 Cc: Ingo Molnar <mingo@redhat.com>
73 Cc: John Hammond <jhammond@indeed.com>
74 Cc: Jonathan Corbet <corbet@lwn.net>
75 Cc: Kyle Anderson <kwa@yelp.com>
76 Cc: Gabriel Munos <gmunoz@netflix.com>
77 Cc: Peter Oskolkov <posk@posk.io>
78 Cc: Cong Wang <xiyou.wangcong@gmail.com>
79 Cc: Brendan Gregg <bgregg@netflix.com>
80 Link: https://lkml.kernel.org/r/1563900266-19734-2-git-send-email-chiluk+linux@indeed.com
81 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
82
83
84 ---
85 Documentation/scheduler/sched-bwc.txt | 45 +++++++++++++++++++++
86 kernel/sched/fair.c | 72 +++-------------------------------
87 kernel/sched/sched.h | 4 -
88 3 files changed, 52 insertions(+), 69 deletions(-)
89
90 --- a/Documentation/scheduler/sched-bwc.txt
91 +++ b/Documentation/scheduler/sched-bwc.txt
92 @@ -90,6 +90,51 @@ There are two ways in which a group may
93 In case b) above, even though the child may have runtime remaining it will not
94 be allowed to until the parent's runtime is refreshed.
95
96 +CFS Bandwidth Quota Caveats
97 +---------------------------
98 +Once a slice is assigned to a cpu it does not expire. However all but 1ms of
99 +the slice may be returned to the global pool if all threads on that cpu become
100 +unrunnable. This is configured at compile time by the min_cfs_rq_runtime
101 +variable. This is a performance tweak that helps prevent added contention on
102 +the global lock.
103 +
104 +The fact that cpu-local slices do not expire results in some interesting corner
105 +cases that should be understood.
106 +
107 +For cgroup cpu constrained applications that are cpu limited this is a
108 +relatively moot point because they will naturally consume the entirety of their
109 +quota as well as the entirety of each cpu-local slice in each period. As a
110 +result it is expected that nr_periods roughly equal nr_throttled, and that
111 +cpuacct.usage will increase roughly equal to cfs_quota_us in each period.
112 +
113 +For highly-threaded, non-cpu bound applications this non-expiration nuance
114 +allows applications to briefly burst past their quota limits by the amount of
115 +unused slice on each cpu that the task group is running on (typically at most
116 +1ms per cpu or as defined by min_cfs_rq_runtime). This slight burst only
117 +applies if quota had been assigned to a cpu and then not fully used or returned
118 +in previous periods. This burst amount will not be transferred between cores.
119 +As a result, this mechanism still strictly limits the task group to quota
120 +average usage, albeit over a longer time window than a single period. This
121 +also limits the burst ability to no more than 1ms per cpu. This provides
122 +better, more predictable user experience for highly threaded applications with
123 +small quota limits on high core count machines. It also eliminates the
124 +propensity to throttle these applications while simultaneously using less than
125 +quota amounts of cpu. Another way to say this, is that by allowing the unused
126 +portion of a slice to remain valid across periods we have decreased the
127 +possibility of wastefully expiring quota on cpu-local silos that don't need a
128 +full slice's amount of cpu time.
129 +
130 +The interaction between cpu-bound and non-cpu-bound-interactive applications
131 +should also be considered, especially when single core usage hits 100%. If you
132 +gave each of these applications half of a cpu-core and they both got scheduled
133 +on the same CPU it is theoretically possible that the non-cpu bound application
134 +will use up to 1ms additional quota in some periods, thereby preventing the
135 +cpu-bound application from fully using its quota by that same amount. In these
136 +instances it will be up to the CFS algorithm (see sched-design-CFS.rst) to
137 +decide which application is chosen to run, as they will both be runnable and
138 +have remaining quota. This runtime discrepancy will be made up in the following
139 +periods when the interactive application idles.
140 +
141 Examples
142 --------
143 1. Limit a group to 1 CPU worth of runtime.
144 --- a/kernel/sched/fair.c
145 +++ b/kernel/sched/fair.c
146 @@ -4320,8 +4320,6 @@ void __refill_cfs_bandwidth_runtime(stru
147
148 now = sched_clock_cpu(smp_processor_id());
149 cfs_b->runtime = cfs_b->quota;
150 - cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
151 - cfs_b->expires_seq++;
152 }
153
154 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
155 @@ -4343,8 +4341,7 @@ static int assign_cfs_rq_runtime(struct
156 {
157 struct task_group *tg = cfs_rq->tg;
158 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
159 - u64 amount = 0, min_amount, expires;
160 - int expires_seq;
161 + u64 amount = 0, min_amount;
162
163 /* note: this is a positive sum as runtime_remaining <= 0 */
164 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
165 @@ -4361,61 +4358,17 @@ static int assign_cfs_rq_runtime(struct
166 cfs_b->idle = 0;
167 }
168 }
169 - expires_seq = cfs_b->expires_seq;
170 - expires = cfs_b->runtime_expires;
171 raw_spin_unlock(&cfs_b->lock);
172
173 cfs_rq->runtime_remaining += amount;
174 - /*
175 - * we may have advanced our local expiration to account for allowed
176 - * spread between our sched_clock and the one on which runtime was
177 - * issued.
178 - */
179 - if (cfs_rq->expires_seq != expires_seq) {
180 - cfs_rq->expires_seq = expires_seq;
181 - cfs_rq->runtime_expires = expires;
182 - }
183
184 return cfs_rq->runtime_remaining > 0;
185 }
186
187 -/*
188 - * Note: This depends on the synchronization provided by sched_clock and the
189 - * fact that rq->clock snapshots this value.
190 - */
191 -static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
192 -{
193 - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
194 -
195 - /* if the deadline is ahead of our clock, nothing to do */
196 - if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
197 - return;
198 -
199 - if (cfs_rq->runtime_remaining < 0)
200 - return;
201 -
202 - /*
203 - * If the local deadline has passed we have to consider the
204 - * possibility that our sched_clock is 'fast' and the global deadline
205 - * has not truly expired.
206 - *
207 - * Fortunately we can check determine whether this the case by checking
208 - * whether the global deadline(cfs_b->expires_seq) has advanced.
209 - */
210 - if (cfs_rq->expires_seq == cfs_b->expires_seq) {
211 - /* extend local deadline, drift is bounded above by 2 ticks */
212 - cfs_rq->runtime_expires += TICK_NSEC;
213 - } else {
214 - /* global deadline is ahead, expiration has passed */
215 - cfs_rq->runtime_remaining = 0;
216 - }
217 -}
218 -
219 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
220 {
221 /* dock delta_exec before expiring quota (as it could span periods) */
222 cfs_rq->runtime_remaining -= delta_exec;
223 - expire_cfs_rq_runtime(cfs_rq);
224
225 if (likely(cfs_rq->runtime_remaining > 0))
226 return;
227 @@ -4600,8 +4553,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cf
228 resched_curr(rq);
229 }
230
231 -static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
232 - u64 remaining, u64 expires)
233 +static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
234 {
235 struct cfs_rq *cfs_rq;
236 u64 runtime;
237 @@ -4626,7 +4578,6 @@ static u64 distribute_cfs_runtime(struct
238 remaining -= runtime;
239
240 cfs_rq->runtime_remaining += runtime;
241 - cfs_rq->runtime_expires = expires;
242
243 /* we check whether we're throttled above */
244 if (cfs_rq->runtime_remaining > 0)
245 @@ -4651,7 +4602,7 @@ next:
246 */
247 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
248 {
249 - u64 runtime, runtime_expires;
250 + u64 runtime;
251 int throttled;
252
253 /* no need to continue the timer with no bandwidth constraint */
254 @@ -4679,8 +4630,6 @@ static int do_sched_cfs_period_timer(str
255 /* account preceding periods in which throttling occurred */
256 cfs_b->nr_throttled += overrun;
257
258 - runtime_expires = cfs_b->runtime_expires;
259 -
260 /*
261 * This check is repeated as we are holding onto the new bandwidth while
262 * we unthrottle. This can potentially race with an unthrottled group
263 @@ -4693,8 +4642,7 @@ static int do_sched_cfs_period_timer(str
264 cfs_b->distribute_running = 1;
265 raw_spin_unlock(&cfs_b->lock);
266 /* we can't nest cfs_b->lock while distributing bandwidth */
267 - runtime = distribute_cfs_runtime(cfs_b, runtime,
268 - runtime_expires);
269 + runtime = distribute_cfs_runtime(cfs_b, runtime);
270 raw_spin_lock(&cfs_b->lock);
271
272 cfs_b->distribute_running = 0;
273 @@ -4771,8 +4719,7 @@ static void __return_cfs_rq_runtime(stru
274 return;
275
276 raw_spin_lock(&cfs_b->lock);
277 - if (cfs_b->quota != RUNTIME_INF &&
278 - cfs_rq->runtime_expires == cfs_b->runtime_expires) {
279 + if (cfs_b->quota != RUNTIME_INF) {
280 cfs_b->runtime += slack_runtime;
281
282 /* we are under rq->lock, defer unthrottling using a timer */
283 @@ -4804,7 +4751,6 @@ static __always_inline void return_cfs_r
284 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
285 {
286 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
287 - u64 expires;
288
289 /* confirm we're still not at a refresh boundary */
290 raw_spin_lock(&cfs_b->lock);
291 @@ -4821,7 +4767,6 @@ static void do_sched_cfs_slack_timer(str
292 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
293 runtime = cfs_b->runtime;
294
295 - expires = cfs_b->runtime_expires;
296 if (runtime)
297 cfs_b->distribute_running = 1;
298
299 @@ -4830,11 +4775,10 @@ static void do_sched_cfs_slack_timer(str
300 if (!runtime)
301 return;
302
303 - runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
304 + runtime = distribute_cfs_runtime(cfs_b, runtime);
305
306 raw_spin_lock(&cfs_b->lock);
307 - if (expires == cfs_b->runtime_expires)
308 - cfs_b->runtime -= min(runtime, cfs_b->runtime);
309 + cfs_b->runtime -= min(runtime, cfs_b->runtime);
310 cfs_b->distribute_running = 0;
311 raw_spin_unlock(&cfs_b->lock);
312 }
313 @@ -4989,8 +4933,6 @@ void start_cfs_bandwidth(struct cfs_band
314
315 cfs_b->period_active = 1;
316 overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
317 - cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
318 - cfs_b->expires_seq++;
319 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
320 }
321
322 --- a/kernel/sched/sched.h
323 +++ b/kernel/sched/sched.h
324 @@ -334,8 +334,6 @@ struct cfs_bandwidth {
325 u64 quota;
326 u64 runtime;
327 s64 hierarchical_quota;
328 - u64 runtime_expires;
329 - int expires_seq;
330
331 short idle;
332 short period_active;
333 @@ -555,8 +553,6 @@ struct cfs_rq {
334
335 #ifdef CONFIG_CFS_BANDWIDTH
336 int runtime_enabled;
337 - int expires_seq;
338 - u64 runtime_expires;
339 s64 runtime_remaining;
340
341 u64 throttled_clock;