From: Greg Kroah-Hartman Date: Mon, 4 Nov 2019 11:10:25 +0000 (+0100) Subject: 5.3-stable patches X-Git-Tag: v4.4.199~13 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=dda9c14546db2172bb6c27cd1dbdf89ce3f3e80c;p=thirdparty%2Fkernel%2Fstable-queue.git 5.3-stable patches added patches: iwlwifi-exclude-geo-sar-support-for-3168.patch sched-fair-fix-low-cpu-usage-with-high-throttling-by-removing-expiration-of-cpu-local-slices.patch --- diff --git a/queue-5.3/iwlwifi-exclude-geo-sar-support-for-3168.patch b/queue-5.3/iwlwifi-exclude-geo-sar-support-for-3168.patch new file mode 100644 index 00000000000..8d1b3af7c4e --- /dev/null +++ b/queue-5.3/iwlwifi-exclude-geo-sar-support-for-3168.patch @@ -0,0 +1,49 @@ +From 12e36d98d3e5acf5fc57774e0a15906d55f30cb9 Mon Sep 17 00:00:00 2001 +From: Luca Coelho +Date: Tue, 8 Oct 2019 13:10:53 +0300 +Subject: iwlwifi: exclude GEO SAR support for 3168 + +From: Luca Coelho + +commit 12e36d98d3e5acf5fc57774e0a15906d55f30cb9 upstream. + +We currently support two NICs in FW version 29, namely 7265D and 3168. +Out of these, only 7265D supports GEO SAR, so adjust the function that +checks for it accordingly. + +Signed-off-by: Luca Coelho +Fixes: f5a47fae6aa3 ("iwlwifi: mvm: fix version check for GEO_TX_POWER_LIMIT support") +Signed-off-by: Luca Coelho +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/net/wireless/intel/iwlwifi/mvm/fw.c | 16 +++++++++------- + 1 file changed, 9 insertions(+), 7 deletions(-) + +--- a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c ++++ b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c +@@ -887,15 +887,17 @@ static bool iwl_mvm_sar_geo_support(stru + * firmware versions. Unfortunately, we don't have a TLV API + * flag to rely on, so rely on the major version which is in + * the first byte of ucode_ver. This was implemented +- * initially on version 38 and then backported to29 and 17. +- * The intention was to have it in 36 as well, but not all +- * 8000 family got this feature enabled. The 8000 family is +- * the only one using version 36, so skip this version +- * entirely. ++ * initially on version 38 and then backported to 17. It was ++ * also backported to 29, but only for 7265D devices. The ++ * intention was to have it in 36 as well, but not all 8000 ++ * family got this feature enabled. The 8000 family is the ++ * only one using version 36, so skip this version entirely. + */ + return IWL_UCODE_SERIAL(mvm->fw->ucode_ver) >= 38 || +- IWL_UCODE_SERIAL(mvm->fw->ucode_ver) == 29 || +- IWL_UCODE_SERIAL(mvm->fw->ucode_ver) == 17; ++ IWL_UCODE_SERIAL(mvm->fw->ucode_ver) == 17 || ++ (IWL_UCODE_SERIAL(mvm->fw->ucode_ver) == 29 && ++ ((mvm->trans->hw_rev & CSR_HW_REV_TYPE_MSK) == ++ CSR_HW_REV_TYPE_7265D)); + } + + int iwl_mvm_get_sar_geo_profile(struct iwl_mvm *mvm) diff --git a/queue-5.3/sched-fair-fix-low-cpu-usage-with-high-throttling-by-removing-expiration-of-cpu-local-slices.patch b/queue-5.3/sched-fair-fix-low-cpu-usage-with-high-throttling-by-removing-expiration-of-cpu-local-slices.patch new file mode 100644 index 00000000000..00226f56dcb --- /dev/null +++ b/queue-5.3/sched-fair-fix-low-cpu-usage-with-high-throttling-by-removing-expiration-of-cpu-local-slices.patch @@ -0,0 +1,393 @@ +From de53fd7aedb100f03e5d2231cfce0e4993282425 Mon Sep 17 00:00:00 2001 +From: Dave Chiluk +Date: Tue, 23 Jul 2019 11:44:26 -0500 +Subject: sched/fair: Fix low cpu usage with high throttling by removing expiration of cpu-local slices + +From: Dave Chiluk + +commit de53fd7aedb100f03e5d2231cfce0e4993282425 upstream. 
+
+It has been observed that highly-threaded, non-cpu-bound applications
+running under cpu.cfs_quota_us constraints can hit a high percentage of
+periods throttled while simultaneously not consuming the allocated
+amount of quota. This use case is typical of user-interactive non-cpu
+bound applications, such as those running in kubernetes or mesos when
+run on multiple cpu cores.
+
+This has been root-caused to cpu-local run queues being allocated
+per-cpu bandwidth slices and then not fully using those slices within
+the period, at which point the slice and quota expire. This expiration
+of unused slice results in applications not being able to utilize the
+quota for which they are allocated.
+
+The non-expiration of per-cpu slices was recently fixed by
+'commit 512ac999d275 ("sched/fair: Fix bandwidth timer clock drift
+condition")'. Prior to that it appears that this had been broken since
+at least 'commit 51f2176d74ac ("sched/fair: Fix unlocked reads of some
+cfs_b->quota/period")', which was introduced in v3.16-rc1 in 2014. That
+commit added the following conditional, which resulted in slices never
+being expired:
+
+if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
+	/* extend local deadline, drift is bounded above by 2 ticks */
+	cfs_rq->runtime_expires += TICK_NSEC;
+
+Because this was broken for nearly 5 years, has recently been fixed,
+and is now being noticed by many users running kubernetes
+(https://github.com/kubernetes/kubernetes/issues/67577), it is my
+opinion that the mechanisms around expiring runtime should be removed
+altogether.
+
+This allows quota already allocated to per-cpu run-queues to live longer
+than the period boundary. Threads on runqueues that do not use much CPU
+can therefore continue to use their remaining slice over a longer period
+of time than cpu.cfs_period_us. In turn, this helps prevent the above
+condition of hitting throttling while also not fully utilizing the cpu
+quota.
+
+This theoretically allows a machine to use slightly more than its
+allotted quota in some periods. This overflow would be bounded by the
+remaining quota left on each per-cpu runqueue. This is typically no
+more than min_cfs_rq_runtime=1ms per cpu. For CPU-bound tasks this will
+change nothing, as they should theoretically fully utilize all of their
+quota in each period. For user-interactive tasks as described above this
+provides a much better user/application experience, as their cpu
+utilization will more closely match the amount they requested when they
+hit throttling. This means that cpu limits no longer strictly apply per
+period for non-cpu bound applications, but that they are still accurate
+over longer timeframes.
+
+This greatly improves performance of high-thread-count, non-cpu bound
+applications with low cfs_quota_us allocation on high-core-count
+machines. In the case of an artificial testcase (10ms/100ms of quota on
+an 80-CPU machine), this commit resulted in almost 30x performance
+improvement, while still maintaining correct cpu quota restrictions.
+That testcase is available at https://github.com/indeedeng/fibtest.
+ +Fixes: 512ac999d275 ("sched/fair: Fix bandwidth timer clock drift condition") +Signed-off-by: Dave Chiluk +Signed-off-by: Peter Zijlstra (Intel) +Reviewed-by: Phil Auld +Reviewed-by: Ben Segall +Cc: Ingo Molnar +Cc: John Hammond +Cc: Jonathan Corbet +Cc: Kyle Anderson +Cc: Gabriel Munos +Cc: Peter Oskolkov +Cc: Cong Wang +Cc: Brendan Gregg +Link: https://lkml.kernel.org/r/1563900266-19734-2-git-send-email-chiluk+linux@indeed.com +Signed-off-by: Greg Kroah-Hartman + +--- + Documentation/scheduler/sched-bwc.rst | 74 +++++++++++++++++++++++++++------- + kernel/sched/fair.c | 72 +++------------------------------ + kernel/sched/sched.h | 4 - + 3 files changed, 67 insertions(+), 83 deletions(-) + +--- a/Documentation/scheduler/sched-bwc.rst ++++ b/Documentation/scheduler/sched-bwc.rst +@@ -9,15 +9,16 @@ CFS bandwidth control is a CONFIG_FAIR_G + specification of the maximum CPU bandwidth available to a group or hierarchy. + + The bandwidth allowed for a group is specified using a quota and period. Within +-each given "period" (microseconds), a group is allowed to consume only up to +-"quota" microseconds of CPU time. When the CPU bandwidth consumption of a +-group exceeds this limit (for that period), the tasks belonging to its +-hierarchy will be throttled and are not allowed to run again until the next +-period. +- +-A group's unused runtime is globally tracked, being refreshed with quota units +-above at each period boundary. As threads consume this bandwidth it is +-transferred to cpu-local "silos" on a demand basis. The amount transferred ++each given "period" (microseconds), a task group is allocated up to "quota" ++microseconds of CPU time. That quota is assigned to per-cpu run queues in ++slices as threads in the cgroup become runnable. Once all quota has been ++assigned any additional requests for quota will result in those threads being ++throttled. Throttled threads will not be able to run again until the next ++period when the quota is replenished. ++ ++A group's unassigned quota is globally tracked, being refreshed back to ++cfs_quota units at each period boundary. As threads consume this bandwidth it ++is transferred to cpu-local "silos" on a demand basis. The amount transferred + within each of these updates is tunable and described as the "slice". + + Management +@@ -35,12 +36,12 @@ The default values are:: + + A value of -1 for cpu.cfs_quota_us indicates that the group does not have any + bandwidth restriction in place, such a group is described as an unconstrained +-bandwidth group. This represents the traditional work-conserving behavior for ++bandwidth group. This represents the traditional work-conserving behavior for + CFS. + + Writing any (valid) positive value(s) will enact the specified bandwidth limit. +-The minimum quota allowed for the quota or period is 1ms. There is also an +-upper bound on the period length of 1s. Additional restrictions exist when ++The minimum quota allowed for the quota or period is 1ms. There is also an ++upper bound on the period length of 1s. Additional restrictions exist when + bandwidth limits are used in a hierarchical fashion, these are explained in + more detail below. + +@@ -53,8 +54,8 @@ unthrottled if it is in a constrained st + System wide settings + -------------------- + For efficiency run-time is transferred between the global pool and CPU local +-"silos" in a batch fashion. This greatly reduces global accounting pressure +-on large systems. 
The amount transferred each time such an update is required ++"silos" in a batch fashion. This greatly reduces global accounting pressure ++on large systems. The amount transferred each time such an update is required + is described as the "slice". + + This is tunable via procfs:: +@@ -97,6 +98,51 @@ There are two ways in which a group may + In case b) above, even though the child may have runtime remaining it will not + be allowed to until the parent's runtime is refreshed. + ++CFS Bandwidth Quota Caveats ++--------------------------- ++Once a slice is assigned to a cpu it does not expire. However all but 1ms of ++the slice may be returned to the global pool if all threads on that cpu become ++unrunnable. This is configured at compile time by the min_cfs_rq_runtime ++variable. This is a performance tweak that helps prevent added contention on ++the global lock. ++ ++The fact that cpu-local slices do not expire results in some interesting corner ++cases that should be understood. ++ ++For cgroup cpu constrained applications that are cpu limited this is a ++relatively moot point because they will naturally consume the entirety of their ++quota as well as the entirety of each cpu-local slice in each period. As a ++result it is expected that nr_periods roughly equal nr_throttled, and that ++cpuacct.usage will increase roughly equal to cfs_quota_us in each period. ++ ++For highly-threaded, non-cpu bound applications this non-expiration nuance ++allows applications to briefly burst past their quota limits by the amount of ++unused slice on each cpu that the task group is running on (typically at most ++1ms per cpu or as defined by min_cfs_rq_runtime). This slight burst only ++applies if quota had been assigned to a cpu and then not fully used or returned ++in previous periods. This burst amount will not be transferred between cores. ++As a result, this mechanism still strictly limits the task group to quota ++average usage, albeit over a longer time window than a single period. This ++also limits the burst ability to no more than 1ms per cpu. This provides ++better more predictable user experience for highly threaded applications with ++small quota limits on high core count machines. It also eliminates the ++propensity to throttle these applications while simultanously using less than ++quota amounts of cpu. Another way to say this, is that by allowing the unused ++portion of a slice to remain valid across periods we have decreased the ++possibility of wastefully expiring quota on cpu-local silos that don't need a ++full slice's amount of cpu time. ++ ++The interaction between cpu-bound and non-cpu-bound-interactive applications ++should also be considered, especially when single core usage hits 100%. If you ++gave each of these applications half of a cpu-core and they both got scheduled ++on the same CPU it is theoretically possible that the non-cpu bound application ++will use up to 1ms additional quota in some periods, thereby preventing the ++cpu-bound application from fully using its quota by that same amount. In these ++instances it will be up to the CFS algorithm (see sched-design-CFS.rst) to ++decide which application is chosen to run, as they will both be runnable and ++have remaining quota. This runtime discrepancy will be made up in the following ++periods when the interactive application idles. ++ + Examples + -------- + 1. 
Limit a group to 1 CPU worth of runtime:: +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -4370,8 +4370,6 @@ void __refill_cfs_bandwidth_runtime(stru + + now = sched_clock_cpu(smp_processor_id()); + cfs_b->runtime = cfs_b->quota; +- cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); +- cfs_b->expires_seq++; + } + + static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) +@@ -4393,8 +4391,7 @@ static int assign_cfs_rq_runtime(struct + { + struct task_group *tg = cfs_rq->tg; + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); +- u64 amount = 0, min_amount, expires; +- int expires_seq; ++ u64 amount = 0, min_amount; + + /* note: this is a positive sum as runtime_remaining <= 0 */ + min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; +@@ -4411,61 +4408,17 @@ static int assign_cfs_rq_runtime(struct + cfs_b->idle = 0; + } + } +- expires_seq = cfs_b->expires_seq; +- expires = cfs_b->runtime_expires; + raw_spin_unlock(&cfs_b->lock); + + cfs_rq->runtime_remaining += amount; +- /* +- * we may have advanced our local expiration to account for allowed +- * spread between our sched_clock and the one on which runtime was +- * issued. +- */ +- if (cfs_rq->expires_seq != expires_seq) { +- cfs_rq->expires_seq = expires_seq; +- cfs_rq->runtime_expires = expires; +- } + + return cfs_rq->runtime_remaining > 0; + } + +-/* +- * Note: This depends on the synchronization provided by sched_clock and the +- * fact that rq->clock snapshots this value. +- */ +-static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) +-{ +- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); +- +- /* if the deadline is ahead of our clock, nothing to do */ +- if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0)) +- return; +- +- if (cfs_rq->runtime_remaining < 0) +- return; +- +- /* +- * If the local deadline has passed we have to consider the +- * possibility that our sched_clock is 'fast' and the global deadline +- * has not truly expired. +- * +- * Fortunately we can check determine whether this the case by checking +- * whether the global deadline(cfs_b->expires_seq) has advanced. 
+- */ +- if (cfs_rq->expires_seq == cfs_b->expires_seq) { +- /* extend local deadline, drift is bounded above by 2 ticks */ +- cfs_rq->runtime_expires += TICK_NSEC; +- } else { +- /* global deadline is ahead, expiration has passed */ +- cfs_rq->runtime_remaining = 0; +- } +-} +- + static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) + { + /* dock delta_exec before expiring quota (as it could span periods) */ + cfs_rq->runtime_remaining -= delta_exec; +- expire_cfs_rq_runtime(cfs_rq); + + if (likely(cfs_rq->runtime_remaining > 0)) + return; +@@ -4658,8 +4611,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cf + resched_curr(rq); + } + +-static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, +- u64 remaining, u64 expires) ++static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) + { + struct cfs_rq *cfs_rq; + u64 runtime; +@@ -4684,7 +4636,6 @@ static u64 distribute_cfs_runtime(struct + remaining -= runtime; + + cfs_rq->runtime_remaining += runtime; +- cfs_rq->runtime_expires = expires; + + /* we check whether we're throttled above */ + if (cfs_rq->runtime_remaining > 0) +@@ -4709,7 +4660,7 @@ next: + */ + static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags) + { +- u64 runtime, runtime_expires; ++ u64 runtime; + int throttled; + + /* no need to continue the timer with no bandwidth constraint */ +@@ -4737,8 +4688,6 @@ static int do_sched_cfs_period_timer(str + /* account preceding periods in which throttling occurred */ + cfs_b->nr_throttled += overrun; + +- runtime_expires = cfs_b->runtime_expires; +- + /* + * This check is repeated as we are holding onto the new bandwidth while + * we unthrottle. This can potentially race with an unthrottled group +@@ -4751,8 +4700,7 @@ static int do_sched_cfs_period_timer(str + cfs_b->distribute_running = 1; + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); + /* we can't nest cfs_b->lock while distributing bandwidth */ +- runtime = distribute_cfs_runtime(cfs_b, runtime, +- runtime_expires); ++ runtime = distribute_cfs_runtime(cfs_b, runtime); + raw_spin_lock_irqsave(&cfs_b->lock, flags); + + cfs_b->distribute_running = 0; +@@ -4834,8 +4782,7 @@ static void __return_cfs_rq_runtime(stru + return; + + raw_spin_lock(&cfs_b->lock); +- if (cfs_b->quota != RUNTIME_INF && +- cfs_rq->runtime_expires == cfs_b->runtime_expires) { ++ if (cfs_b->quota != RUNTIME_INF) { + cfs_b->runtime += slack_runtime; + + /* we are under rq->lock, defer unthrottling using a timer */ +@@ -4868,7 +4815,6 @@ static void do_sched_cfs_slack_timer(str + { + u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); + unsigned long flags; +- u64 expires; + + /* confirm we're still not at a refresh boundary */ + raw_spin_lock_irqsave(&cfs_b->lock, flags); +@@ -4886,7 +4832,6 @@ static void do_sched_cfs_slack_timer(str + if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) + runtime = cfs_b->runtime; + +- expires = cfs_b->runtime_expires; + if (runtime) + cfs_b->distribute_running = 1; + +@@ -4895,11 +4840,10 @@ static void do_sched_cfs_slack_timer(str + if (!runtime) + return; + +- runtime = distribute_cfs_runtime(cfs_b, runtime, expires); ++ runtime = distribute_cfs_runtime(cfs_b, runtime); + + raw_spin_lock_irqsave(&cfs_b->lock, flags); +- if (expires == cfs_b->runtime_expires) +- lsub_positive(&cfs_b->runtime, runtime); ++ lsub_positive(&cfs_b->runtime, runtime); + cfs_b->distribute_running = 0; + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); + } +@@ -5064,8 +5008,6 @@ void 
start_cfs_bandwidth(struct cfs_band + + cfs_b->period_active = 1; + overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); +- cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period); +- cfs_b->expires_seq++; + hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); + } + +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -335,8 +335,6 @@ struct cfs_bandwidth { + u64 quota; + u64 runtime; + s64 hierarchical_quota; +- u64 runtime_expires; +- int expires_seq; + + u8 idle; + u8 period_active; +@@ -556,8 +554,6 @@ struct cfs_rq { + + #ifdef CONFIG_CFS_BANDWIDTH + int runtime_enabled; +- int expires_seq; +- u64 runtime_expires; + s64 runtime_remaining; + + u64 throttled_clock; diff --git a/queue-5.3/series b/queue-5.3/series index f409a0799c1..ac778b267b0 100644 --- a/queue-5.3/series +++ b/queue-5.3/series @@ -155,3 +155,5 @@ blackhole_netdev-fix-syzkaller-reported-issue.patch net-usb-sr9800-fix-uninitialized-local-variable.patch sch_netem-fix-rcu-splat-in-netem_enqueue.patch net-sched-sch_sfb-don-t-call-qdisc_put-while-holding-tree-lock.patch +iwlwifi-exclude-geo-sar-support-for-3168.patch +sched-fair-fix-low-cpu-usage-with-high-throttling-by-removing-expiration-of-cpu-local-slices.patch
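
The sched-bwc.rst text and the changelog above describe the cgroup-v1 CPU bandwidth
interface (cpu.cfs_quota_us, cpu.cfs_period_us, cpu.stat). A minimal userspace sketch
of exercising that interface, assuming a cgroup-v1 cpu controller mounted at
/sys/fs/cgroup/cpu and an already-created group named "bwtest" (both the mount point
and the group name are assumptions, not taken from the patches), could look roughly
like this:

/*
 * Illustrative sketch only, not part of the queued patches: configure CFS
 * bandwidth control for a cgroup-v1 "cpu" controller group and dump its
 * throttling statistics. Assumes the controller is mounted at
 * /sys/fs/cgroup/cpu and that the "bwtest" group directory already exists.
 */
#include <stdio.h>
#include <stdlib.h>

#define GRP "/sys/fs/cgroup/cpu/bwtest"

/* Write a decimal value to a cgroup control file such as cpu.cfs_quota_us. */
static int write_val(const char *file, long val)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "%s/%s", GRP, file);
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%ld\n", val);
	return fclose(f);
}

int main(void)
{
	char line[256];
	FILE *f;

	/* 10ms of quota per 100ms period: the artificial case from the changelog. */
	if (write_val("cpu.cfs_period_us", 100000) ||
	    write_val("cpu.cfs_quota_us", 10000))
		return EXIT_FAILURE;

	/* cpu.stat reports nr_periods, nr_throttled and throttled_time. */
	f = fopen(GRP "/cpu.stat", "r");
	if (!f) {
		perror(GRP "/cpu.stat");
		return EXIT_FAILURE;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return EXIT_SUCCESS;
}

On a pre-fix kernel, a bursty, mostly idle multi-threaded workload in such a group
tends to show nr_throttled approaching nr_periods in cpu.stat even though
cpuacct.usage stays well below the allotted quota, which is the symptom the changelog
describes; the sched/fair patch removes the per-cpu slice expiration responsible for it.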