From: Greg Kroah-Hartman
Date: Fri, 11 Jan 2019 12:40:32 +0000 (+0100)
Subject: 4.14-stable patches
X-Git-Tag: v4.20.2~16
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=ea52754f411fa6a84b595a2e69f858a985775a61;p=thirdparty%2Fkernel%2Fstable-queue.git

4.14-stable patches

added patches:
	iommu-vt-d-handle-domain-agaw-being-less-than-iommu-agaw.patch
	sched-fair-fix-infinite-loop-in-update_blocked_averages-by-reverting-a9e7f6544b9c.patch
---

diff --git a/queue-4.14/iommu-vt-d-handle-domain-agaw-being-less-than-iommu-agaw.patch b/queue-4.14/iommu-vt-d-handle-domain-agaw-being-less-than-iommu-agaw.patch
new file mode 100644
index 00000000000..6bf860785a6
--- /dev/null
+++ b/queue-4.14/iommu-vt-d-handle-domain-agaw-being-less-than-iommu-agaw.patch
@@ -0,0 +1,59 @@
+From 3569dd07aaad71920c5ea4da2d5cc9a167c1ffd4 Mon Sep 17 00:00:00 2001
+From: Sohil Mehta
+Date: Wed, 21 Nov 2018 15:29:33 -0800
+Subject: iommu/vt-d: Handle domain agaw being less than iommu agaw
+
+From: Sohil Mehta
+
+commit 3569dd07aaad71920c5ea4da2d5cc9a167c1ffd4 upstream.
+
+The Intel IOMMU driver opportunistically skips a few top level page
+tables from the domain paging directory while programming the IOMMU
+context entry. However there is an implicit assumption in the code that
+domain's adjusted guest address width (agaw) would always be greater
+than IOMMU's agaw.
+
+The IOMMU capabilities in an upcoming platform cause the domain's agaw
+to be lower than IOMMU's agaw. The issue is seen when the IOMMU supports
+both 4-level and 5-level paging. The domain builds a 4-level page table
+based on agaw of 2. However the IOMMU's agaw is set as 3 (5-level). In
+this case the code incorrectly tries to skip page table levels. This
+causes the IOMMU driver to avoid programming the context entry. The
+fix handles this case and programs the context entry accordingly.
+
+Fixes: de24e55395698 ("iommu/vt-d: Simplify domain_context_mapping_one")
+Cc:
+Cc: Ashok Raj
+Cc: Jacob Pan
+Cc: Lu Baolu
+Reviewed-by: Lu Baolu
+Reported-by: Ramos Falcon, Ernesto R
+Tested-by: Ricardo Neri
+Signed-off-by: Sohil Mehta
+Signed-off-by: Joerg Roedel
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/iommu/intel-iommu.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/iommu/intel-iommu.c
++++ b/drivers/iommu/intel-iommu.c
+@@ -2093,7 +2093,7 @@ static int domain_context_mapping_one(st
+ 	 * than default. Unnecessary for PT mode.
+ 	 */
+ 	if (translation != CONTEXT_TT_PASS_THROUGH) {
+-		for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
++		for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
+ 			ret = -ENOMEM;
+ 			pgd = phys_to_virt(dma_pte_addr(pgd));
+ 			if (!dma_pte_present(pgd))
+@@ -2107,7 +2107,7 @@ static int domain_context_mapping_one(st
+ 		translation = CONTEXT_TT_MULTI_LEVEL;
+
+ 		context_set_address_root(context, virt_to_phys(pgd));
+-		context_set_address_width(context, iommu->agaw);
++		context_set_address_width(context, agaw);
+ 	} else {
+ 		/*
+ 		 * In pass through mode, AW must be programmed to
diff --git a/queue-4.14/sched-fair-fix-infinite-loop-in-update_blocked_averages-by-reverting-a9e7f6544b9c.patch b/queue-4.14/sched-fair-fix-infinite-loop-in-update_blocked_averages-by-reverting-a9e7f6544b9c.patch
new file mode 100644
index 00000000000..dd061683c5b
--- /dev/null
+++ b/queue-4.14/sched-fair-fix-infinite-loop-in-update_blocked_averages-by-reverting-a9e7f6544b9c.patch
@@ -0,0 +1,185 @@
+From c40f7d74c741a907cfaeb73a7697081881c497d0 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds
+Date: Thu, 27 Dec 2018 13:46:17 -0800
+Subject: sched/fair: Fix infinite loop in update_blocked_averages() by reverting a9e7f6544b9c
+
+From: Linus Torvalds
+
+commit c40f7d74c741a907cfaeb73a7697081881c497d0 upstream.
+
+Zhipeng Xie, Xie XiuQi and Sargun Dhillon reported lockups in the
+scheduler under high loads, starting at around the v4.18 time frame,
+and Zhipeng Xie tracked it down to bugs in the rq->leaf_cfs_rq_list
+manipulation.
+
+Do a (manual) revert of:
+
+  a9e7f6544b9c ("sched/fair: Fix O(nr_cgroups) in load balance path")
+
+It turns out that the list_del_leaf_cfs_rq() introduced by this commit
+has a surprising property that was not considered in followup commits
+such as:
+
+  9c2791f936ef ("sched/fair: Fix hierarchical order in rq->leaf_cfs_rq_list")
+
+As Vincent Guittot explains:
+
+ "I think that there is a bigger problem with commit a9e7f6544b9c and
+  cfs_rq throttling:
+
+  Let's take the example of the following topology TG2 --> TG1 --> root:
+
+  1) The 1st time a task is enqueued, we will add TG2 cfs_rq then TG1
+     cfs_rq to leaf_cfs_rq_list and we are sure to do the whole branch in
+     one path because it has never been used and can't be throttled so
+     tmp_alone_branch will point to leaf_cfs_rq_list at the end.
+
+  2) Then TG1 is throttled
+
+  3) and we add TG3 as a new child of TG1.
+
+  4) The 1st enqueue of a task on TG3 will add TG3 cfs_rq just before TG1
+     cfs_rq and tmp_alone_branch will stay on rq->leaf_cfs_rq_list.
+
+  With commit a9e7f6544b9c, we can del a cfs_rq from rq->leaf_cfs_rq_list.
+  So if the load of TG1 cfs_rq becomes NULL before step 2) above, TG1
+  cfs_rq is removed from the list.
+  Then at step 4), TG3 cfs_rq is added at the beginning of rq->leaf_cfs_rq_list
+  but tmp_alone_branch still points to TG3 cfs_rq because its throttled
+  parent can't be enqueued when the lock is released.
+  tmp_alone_branch doesn't point to rq->leaf_cfs_rq_list whereas it should.
+
+  So if TG3 cfs_rq is removed or destroyed before tmp_alone_branch
+  points to another TG cfs_rq, the next TG cfs_rq that will be added,
+  will be linked outside rq->leaf_cfs_rq_list - which is bad.
+
+  In addition, we can break the ordering of the cfs_rq in
+  rq->leaf_cfs_rq_list but this ordering is used to update and
+  propagate the update from leaf down to root."
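
The failure mode Vincent describes above, a saved insertion cursor that keeps
pointing at a list entry after that entry has been deleted, can be sketched
with a plain circular doubly-linked list, independent of the scheduler. This
is only an abstract illustration and not part of the patch: link_after(),
unlink_entry() and on_list() below are simplified local helpers rather than
the kernel's list.h primitives (unlink_entry() behaves like list_del_init(),
leaving the removed entry pointing at itself), and "cursor" stands in for
tmp_alone_branch.

#include <stdio.h>

struct node {
	struct node *prev, *next;
	const char *name;
};

static void list_init(struct node *head)
{
	head->prev = head->next = head;
}

/* Insert n right after prev. */
static void link_after(struct node *n, struct node *prev)
{
	n->next = prev->next;
	n->prev = prev;
	prev->next->prev = n;
	prev->next = n;
}

/* Remove n and leave it pointing at itself (as list_del_init() would). */
static void unlink_entry(struct node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	n->prev = n->next = n;
}

/* Is n reachable by walking the list from head? */
static int on_list(struct node *head, struct node *n)
{
	struct node *p;

	for (p = head->next; p != head; p = p->next)
		if (p == n)
			return 1;
	return 0;
}

int main(void)
{
	struct node head, tg1 = { .name = "TG1" }, tg3 = { .name = "TG3" };
	struct node *cursor;

	list_init(&head);
	link_after(&tg1, &head);	/* TG1's entry is on the leaf list     */
	cursor = &tg1;			/* cursor left parked on TG1's entry   */

	unlink_entry(&tg1);		/* entry deleted while the cursor      */
					/* still references it                 */

	link_after(&tg3, cursor);	/* next entry linked via the stale     */
					/* cursor                              */

	printf("TG3 reachable from the list head? %s\n",
	       on_list(&head, &tg3) ? "yes" : "no");
	return 0;
}

Compiled as-is, this prints "no": the new entry is reachable only through the
stale cursor, never from the list head, which is the abstract version of a
cfs_rq ending up linked outside rq->leaf_cfs_rq_list.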
+
+Instead of trying to work through all these cases and trying to reproduce
+the very high loads that produced the lockup to begin with, simplify
+the code temporarily by reverting a9e7f6544b9c - which change was clearly
+not thought through completely.
+
+This (hopefully) gives us a kernel that doesn't lock up so people
+can continue to enjoy their holidays without worrying about regressions. ;-)
+
+[ mingo: Wrote changelog, fixed weird spelling in code comment while at it. ]
+
+Analyzed-by: Xie XiuQi
+Analyzed-by: Vincent Guittot
+Reported-by: Zhipeng Xie
+Reported-by: Sargun Dhillon
+Reported-by: Xie XiuQi
+Tested-by: Zhipeng Xie
+Tested-by: Sargun Dhillon
+Signed-off-by: Linus Torvalds
+Acked-by: Vincent Guittot
+Cc: # v4.13+
+Cc: Bin Li
+Cc: Mike Galbraith
+Cc: Peter Zijlstra
+Cc: Tejun Heo
+Cc: Thomas Gleixner
+Fixes: a9e7f6544b9c ("sched/fair: Fix O(nr_cgroups) in load balance path")
+Link: http://lkml.kernel.org/r/1545879866-27809-1-git-send-email-xiexiuqi@huawei.com
+Signed-off-by: Ingo Molnar
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ kernel/sched/fair.c |   43 +++++++++----------------------------------
+ 1 file changed, 9 insertions(+), 34 deletions(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -369,10 +369,9 @@ static inline void list_del_leaf_cfs_rq(
+ 	}
+ }
+
+-/* Iterate thr' all leaf cfs_rq's on a runqueue */
+-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
+-	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
+-				 leaf_cfs_rq_list)
++/* Iterate through all leaf cfs_rq's on a runqueue: */
++#define for_each_leaf_cfs_rq(rq, cfs_rq) \
++	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+
+ /* Do the two (enqueued) entities belong to the same group ? */
+ static inline struct cfs_rq *
+@@ -465,8 +464,8 @@ static inline void list_del_leaf_cfs_rq(
+ {
+ }
+
+-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
+-	for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
++#define for_each_leaf_cfs_rq(rq, cfs_rq) \
++	for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+
+ static inline struct sched_entity *parent_entity(struct sched_entity *se)
+ {
+@@ -6970,27 +6969,10 @@ static void attach_tasks(struct lb_env *
+
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+
+-static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+-{
+-	if (cfs_rq->load.weight)
+-		return false;
+-
+-	if (cfs_rq->avg.load_sum)
+-		return false;
+-
+-	if (cfs_rq->avg.util_sum)
+-		return false;
+-
+-	if (cfs_rq->runnable_load_sum)
+-		return false;
+-
+-	return true;
+-}
+-
+ static void update_blocked_averages(int cpu)
+ {
+ 	struct rq *rq = cpu_rq(cpu);
+-	struct cfs_rq *cfs_rq, *pos;
++	struct cfs_rq *cfs_rq;
+ 	struct rq_flags rf;
+
+ 	rq_lock_irqsave(rq, &rf);
+@@ -7000,7 +6982,7 @@ static void update_blocked_averages(int
+ 	 * Iterates the task_group tree in a bottom up fashion, see
+ 	 * list_add_leaf_cfs_rq() for details.
+ 	 */
+-	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
++	for_each_leaf_cfs_rq(rq, cfs_rq) {
+ 		struct sched_entity *se;
+
+ 		/* throttled entities do not contribute to load */
+@@ -7014,13 +6996,6 @@ static void update_blocked_averages(int
+ 		se = cfs_rq->tg->se[cpu];
+ 		if (se && !skip_blocked_update(se))
+ 			update_load_avg(se, 0);
+-
+-		/*
+-		 * There can be a lot of idle CPU cgroups. Don't let fully
+-		 * decayed cfs_rqs linger on the list.
+-		 */
+-		if (cfs_rq_is_decayed(cfs_rq))
+-			list_del_leaf_cfs_rq(cfs_rq);
+ 	}
+ 	rq_unlock_irqrestore(rq, &rf);
+ }
+@@ -9580,10 +9555,10 @@ const struct sched_class fair_sched_clas
+ #ifdef CONFIG_SCHED_DEBUG
+ void print_cfs_stats(struct seq_file *m, int cpu)
+ {
+-	struct cfs_rq *cfs_rq, *pos;
++	struct cfs_rq *cfs_rq;
+
+ 	rcu_read_lock();
+-	for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
++	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+ 		print_cfs_rq(m, cpu, cfs_rq);
+ 	rcu_read_unlock();
+ }
diff --git a/queue-4.14/series b/queue-4.14/series
index d6e41c99927..3113e59810f 100644
--- a/queue-4.14/series
+++ b/queue-4.14/series
@@ -95,3 +95,5 @@ arm64-relocatable-fix-inconsistencies-in-linker-script-and-options.patch
 powerpc-tm-set-msr-just-prior-to-recheckpoint.patch
 9p-net-put-a-lower-bound-on-msize.patch
 rxe-fix-error-completion-wr_id-and-qp_num.patch
+iommu-vt-d-handle-domain-agaw-being-less-than-iommu-agaw.patch
+sched-fair-fix-infinite-loop-in-update_blocked_averages-by-reverting-a9e7f6544b9c.patch
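
The loop-bound change in the first patch above (domain_context_mapping_one()
in intel-iommu.c) is easier to see with concrete numbers. The sketch below is
a standalone toy, not driver code: levels_skipped() is a made-up helper that
models the agaw walk with plain integers, using the values from the commit
message (domain agaw 2 for a 4-level table, IOMMU agaw 3 for 5-level
capability). It only covers the loop bound; the second hunk, which programs
the context address width from the final agaw, is not modelled here.

#include <stdio.h>

/*
 * Toy model of the top-level page-table skipping loop.  Returns how many
 * levels get skipped, or -1 if the walk reaches level 0 without ever hitting
 * iommu_agaw.  In the real driver such a runaway walk ends up failing the
 * !dma_pte_present() check, so the context entry is never programmed, which
 * matches the failure described in the commit message.
 */
static int levels_skipped(int domain_agaw, int iommu_agaw, int use_fixed_bound)
{
	int agaw, skipped = 0;

	for (agaw = domain_agaw;
	     use_fixed_bound ? (agaw > iommu_agaw)	/* bound after the fix  */
			     : (agaw != iommu_agaw);	/* bound before the fix */
	     agaw--) {
		skipped++;			/* one level walked down here */
		if (agaw <= 0)			/* guard so the sketch stops  */
			return -1;
	}
	return skipped;
}

int main(void)
{
	printf("old bound ('!='): %d\n", levels_skipped(2, 3, 0));	/* -1 */
	printf("new bound ('>') : %d\n", levels_skipped(2, 3, 1));	/*  0 */
	return 0;
}

For the usual case where the domain's agaw is at least the IOMMU's, both
bounds skip the same number of levels; they only diverge in the new
smaller-domain-agaw configuration that the patch handles.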