4.20-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 11 Jan 2019 12:41:05 +0000 (13:41 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 11 Jan 2019 12:41:05 +0000 (13:41 +0100)
added patches:
iommu-vt-d-handle-domain-agaw-being-less-than-iommu-agaw.patch
sched-fair-fix-infinite-loop-in-update_blocked_averages-by-reverting-a9e7f6544b9c.patch

queue-4.20/iommu-vt-d-handle-domain-agaw-being-less-than-iommu-agaw.patch [new file with mode: 0644]
queue-4.20/sched-fair-fix-infinite-loop-in-update_blocked_averages-by-reverting-a9e7f6544b9c.patch [new file with mode: 0644]
queue-4.20/series

diff --git a/queue-4.20/iommu-vt-d-handle-domain-agaw-being-less-than-iommu-agaw.patch b/queue-4.20/iommu-vt-d-handle-domain-agaw-being-less-than-iommu-agaw.patch
new file mode 100644
index 0000000..ea10f55
--- /dev/null
@@ -0,0 +1,59 @@
+From 3569dd07aaad71920c5ea4da2d5cc9a167c1ffd4 Mon Sep 17 00:00:00 2001
+From: Sohil Mehta <sohil.mehta@intel.com>
+Date: Wed, 21 Nov 2018 15:29:33 -0800
+Subject: iommu/vt-d: Handle domain agaw being less than iommu agaw
+
+From: Sohil Mehta <sohil.mehta@intel.com>
+
+commit 3569dd07aaad71920c5ea4da2d5cc9a167c1ffd4 upstream.
+
+The Intel IOMMU driver opportunistically skips a few top level page
+tables from the domain paging directory while programming the IOMMU
+context entry. However, there is an implicit assumption in the code that
+the domain's adjusted guest address width (agaw) would always be greater
+than the IOMMU's agaw.
+
+The IOMMU capabilities in an upcoming platform cause the domain's agaw
+to be lower than the IOMMU's agaw. The issue is seen when the IOMMU
+supports both 4-level and 5-level paging. The domain builds a 4-level
+page table based on an agaw of 2, while the IOMMU's agaw is set to 3
+(5-level). In this case the code incorrectly tries to skip page table
+levels. This causes the IOMMU driver to avoid programming the context
+entry. The
+fix handles this case and programs the context entry accordingly.
+
+Fixes: de24e55395698 ("iommu/vt-d: Simplify domain_context_mapping_one")
+Cc: <stable@vger.kernel.org>
+Cc: Ashok Raj <ashok.raj@intel.com>
+Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
+Cc: Lu Baolu <baolu.lu@linux.intel.com>
+Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
+Reported-by: Ramos Falcon, Ernesto R <ernesto.r.ramos.falcon@intel.com>
+Tested-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Signed-off-by: Sohil Mehta <sohil.mehta@intel.com>
+Signed-off-by: Joerg Roedel <jroedel@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/iommu/intel-iommu.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/iommu/intel-iommu.c
++++ b/drivers/iommu/intel-iommu.c
+@@ -2044,7 +2044,7 @@ static int domain_context_mapping_one(st
+        * than default.  Unnecessary for PT mode.
+        */
+       if (translation != CONTEXT_TT_PASS_THROUGH) {
+-              for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
++              for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
+                       ret = -ENOMEM;
+                       pgd = phys_to_virt(dma_pte_addr(pgd));
+                       if (!dma_pte_present(pgd))
+@@ -2058,7 +2058,7 @@ static int domain_context_mapping_one(st
+                       translation = CONTEXT_TT_MULTI_LEVEL;
+               context_set_address_root(context, virt_to_phys(pgd));
+-              context_set_address_width(context, iommu->agaw);
++              context_set_address_width(context, agaw);
+       } else {
+               /*
+                * In pass through mode, AW must be programmed to
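The two hunks above are compact; the following minimal, standalone C sketch shows the corrected behaviour. The types and helpers here (struct pte_sim, program_context_sim) are simplified stand-ins for illustration, not the real intel-iommu definitions: the walk only descends while the domain's agaw is still greater than the IOMMU's, and the context is programmed with the width the walk actually ended on.

/*
 * Standalone sketch only: simplified stand-in types, not the driver's real
 * dma_pte/context structures.  agaw 2 corresponds to 4-level paging,
 * agaw 3 to 5-level paging.
 */
#include <stdio.h>

struct pte_sim {
        struct pte_sim *lower;          /* next lower-level page table, if any */
};

static int program_context_sim(int domain_agaw, int iommu_agaw,
                               struct pte_sim *pgd)
{
        int agaw;

        /*
         * The old condition was "agaw != iommu_agaw": when the domain's agaw
         * is already smaller than the IOMMU's, that walks past the bottom of
         * the table.  Using ">" stops at the domain's own depth.
         */
        for (agaw = domain_agaw; agaw > iommu_agaw; agaw--) {
                if (!pgd->lower)
                        return -1;      /* -ENOMEM in the real driver */
                pgd = pgd->lower;
        }

        /* Program the width the walk ended on, not iommu_agaw unconditionally. */
        printf("context address width: agaw %d\n", agaw);
        return 0;
}

int main(void)
{
        struct pte_sim l4 = { .lower = NULL };
        struct pte_sim l5 = { .lower = &l4 };

        program_context_sim(2, 3, &l4); /* 4-level domain on a 5-level IOMMU */
        program_context_sim(3, 2, &l5); /* wider domain: skip one level */
        return 0;
}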
diff --git a/queue-4.20/sched-fair-fix-infinite-loop-in-update_blocked_averages-by-reverting-a9e7f6544b9c.patch b/queue-4.20/sched-fair-fix-infinite-loop-in-update_blocked_averages-by-reverting-a9e7f6544b9c.patch
new file mode 100644
index 0000000..ea85977
--- /dev/null
@@ -0,0 +1,185 @@
+From c40f7d74c741a907cfaeb73a7697081881c497d0 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Thu, 27 Dec 2018 13:46:17 -0800
+Subject: sched/fair: Fix infinite loop in update_blocked_averages() by reverting a9e7f6544b9c
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit c40f7d74c741a907cfaeb73a7697081881c497d0 upstream.
+
+Zhipeng Xie, Xie XiuQi and Sargun Dhillon reported lockups in the
+scheduler under high loads, starting at around the v4.18 time frame,
+and Zhipeng Xie tracked it down to bugs in the rq->leaf_cfs_rq_list
+manipulation.
+
+Do a (manual) revert of:
+
+  a9e7f6544b9c ("sched/fair: Fix O(nr_cgroups) in load balance path")
+
+It turns out that the list_del_leaf_cfs_rq() introduced by this commit
+has a surprising property that was not considered in followup commits
+such as:
+
+  9c2791f936ef ("sched/fair: Fix hierarchical order in rq->leaf_cfs_rq_list")
+
+As Vincent Guittot explains:
+
+ "I think that there is a bigger problem with commit a9e7f6544b9c and
+  cfs_rq throttling:
+
+  Let's take the example of the following topology, TG2 --> TG1 --> root:
+
+   1) The 1st time a task is enqueued, we will add TG2 cfs_rq then TG1
+      cfs_rq to leaf_cfs_rq_list and we are sure to do the whole branch in
+      one path because it has never been used and can't be throttled so
+      tmp_alone_branch will point to leaf_cfs_rq_list at the end.
+
+   2) Then TG1 is throttled
+
+   3) and we add TG3 as a new child of TG1.
+
+   4) The 1st enqueue of a task on TG3 will add TG3 cfs_rq just before TG1
+      cfs_rq and tmp_alone_branch will stay on rq->leaf_cfs_rq_list.
+
+  With commit a9e7f6544b9c, we can del a cfs_rq from rq->leaf_cfs_rq_list.
+  So if the load of TG1 cfs_rq becomes NULL before step 2) above, TG1
+  cfs_rq is removed from the list.
+  Then at step 4), TG3 cfs_rq is added at the beginning of rq->leaf_cfs_rq_list
+  but tmp_alone_branch still points to TG3 cfs_rq because its throttled
+  parent can't be enqueued when the lock is released.
+  tmp_alone_branch doesn't point to rq->leaf_cfs_rq_list whereas it should.
+
+  So if TG3 cfs_rq is removed or destroyed before tmp_alone_branch
+  points to another TG cfs_rq, the next TG cfs_rq that is added
+  will be linked outside rq->leaf_cfs_rq_list - which is bad.
+
+  In addition, we can break the ordering of the cfs_rq in
+  rq->leaf_cfs_rq_list but this ordering is used to update and
+  propagate the update from leaf down to root."
+
+Instead of trying to work through all these cases and trying to reproduce
+the very high loads that produced the lockup to begin with, simplify
+the code temporarily by reverting a9e7f6544b9c - a change that was clearly
+not thought through completely.
+
+This (hopefully) gives us a kernel that doesn't lock up so people
+can continue to enjoy their holidays without worrying about regressions. ;-)
+
+[ mingo: Wrote changelog, fixed weird spelling in code comment while at it. ]
+
+Analyzed-by: Xie XiuQi <xiexiuqi@huawei.com>
+Analyzed-by: Vincent Guittot <vincent.guittot@linaro.org>
+Reported-by: Zhipeng Xie <xiezhipeng1@huawei.com>
+Reported-by: Sargun Dhillon <sargun@sargun.me>
+Reported-by: Xie XiuQi <xiexiuqi@huawei.com>
+Tested-by: Zhipeng Xie <xiezhipeng1@huawei.com>
+Tested-by: Sargun Dhillon <sargun@sargun.me>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
+Cc: <stable@vger.kernel.org> # v4.13+
+Cc: Bin Li <huawei.libin@huawei.com>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Fixes: a9e7f6544b9c ("sched/fair: Fix O(nr_cgroups) in load balance path")
+Link: http://lkml.kernel.org/r/1545879866-27809-1-git-send-email-xiexiuqi@huawei.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/fair.c |   43 +++++++++----------------------------------
+ 1 file changed, 9 insertions(+), 34 deletions(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -352,10 +352,9 @@ static inline void list_del_leaf_cfs_rq(
+       }
+ }
+-/* Iterate thr' all leaf cfs_rq's on a runqueue */
+-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)                    \
+-      list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,    \
+-                               leaf_cfs_rq_list)
++/* Iterate through all leaf cfs_rq's on a runqueue: */
++#define for_each_leaf_cfs_rq(rq, cfs_rq) \
++      list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+ /* Do the two (enqueued) entities belong to the same group ? */
+ static inline struct cfs_rq *
+@@ -447,8 +446,8 @@ static inline void list_del_leaf_cfs_rq(
+ {
+ }
+-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)    \
+-              for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
++#define for_each_leaf_cfs_rq(rq, cfs_rq)      \
++              for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+ static inline struct sched_entity *parent_entity(struct sched_entity *se)
+ {
+@@ -7387,27 +7386,10 @@ static inline bool others_have_blocked(s
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+-static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+-{
+-      if (cfs_rq->load.weight)
+-              return false;
+-
+-      if (cfs_rq->avg.load_sum)
+-              return false;
+-
+-      if (cfs_rq->avg.util_sum)
+-              return false;
+-
+-      if (cfs_rq->avg.runnable_load_sum)
+-              return false;
+-
+-      return true;
+-}
+-
+ static void update_blocked_averages(int cpu)
+ {
+       struct rq *rq = cpu_rq(cpu);
+-      struct cfs_rq *cfs_rq, *pos;
++      struct cfs_rq *cfs_rq;
+       const struct sched_class *curr_class;
+       struct rq_flags rf;
+       bool done = true;
+@@ -7419,7 +7401,7 @@ static void update_blocked_averages(int
+        * Iterates the task_group tree in a bottom up fashion, see
+        * list_add_leaf_cfs_rq() for details.
+        */
+-      for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
++      for_each_leaf_cfs_rq(rq, cfs_rq) {
+               struct sched_entity *se;
+               /* throttled entities do not contribute to load */
+@@ -7434,13 +7416,6 @@ static void update_blocked_averages(int
+               if (se && !skip_blocked_update(se))
+                       update_load_avg(cfs_rq_of(se), se, 0);
+-              /*
+-               * There can be a lot of idle CPU cgroups.  Don't let fully
+-               * decayed cfs_rqs linger on the list.
+-               */
+-              if (cfs_rq_is_decayed(cfs_rq))
+-                      list_del_leaf_cfs_rq(cfs_rq);
+-
+               /* Don't need periodic decay once load/util_avg are null */
+               if (cfs_rq_has_blocked(cfs_rq))
+                       done = false;
+@@ -10289,10 +10264,10 @@ const struct sched_class fair_sched_clas
+ #ifdef CONFIG_SCHED_DEBUG
+ void print_cfs_stats(struct seq_file *m, int cpu)
+ {
+-      struct cfs_rq *cfs_rq, *pos;
++      struct cfs_rq *cfs_rq;
+       rcu_read_lock();
+-      for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
++      for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+               print_cfs_rq(m, cpu, cfs_rq);
+       rcu_read_unlock();
+ }
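For context on the iterator change in the hunks above: for_each_leaf_cfs_rq_safe() existed because a9e7f6544b9c deleted cfs_rq entries from rq->leaf_cfs_rq_list while walking it, and after the revert nothing is removed during the walk, so the plain for_each_leaf_cfs_rq() suffices. The standalone C sketch below illustrates that difference on an ordinary singly linked list; the names (struct cfs_rq_sim, prune_decayed, walk) are made up for illustration and are not the kernel's list API.

/*
 * Standalone illustration only: an ordinary singly linked list, not the
 * kernel's <linux/list.h>.
 */
#include <stdio.h>
#include <stdlib.h>

struct cfs_rq_sim {
        int decayed;                    /* 1 if fully decayed (no load left) */
        struct cfs_rq_sim *next;
};

/*
 * Pre-revert pattern: entries may be freed while iterating, so the walk
 * must save the next pointer first (the role of "pos" in
 * for_each_leaf_cfs_rq_safe()).
 */
static struct cfs_rq_sim *prune_decayed(struct cfs_rq_sim *head)
{
        struct cfs_rq_sim **link = &head;
        struct cfs_rq_sim *cur, *next;

        for (cur = head; cur; cur = next) {
                next = cur->next;       /* grab before a possible free() */
                if (cur->decayed) {
                        *link = next;   /* unlink the current entry */
                        free(cur);
                } else {
                        link = &cur->next;
                }
        }
        return head;
}

/*
 * Post-revert pattern: nothing is removed during the walk, so a plain
 * iteration (for_each_leaf_cfs_rq() in the kernel) is sufficient.
 */
static void walk(struct cfs_rq_sim *head)
{
        for (struct cfs_rq_sim *cur = head; cur; cur = cur->next)
                printf("cfs_rq: decayed=%d\n", cur->decayed);
}

int main(void)
{
        struct cfs_rq_sim *c = calloc(1, sizeof(*c));
        struct cfs_rq_sim *b = calloc(1, sizeof(*b));
        struct cfs_rq_sim *a = calloc(1, sizeof(*a));

        b->decayed = 1;                 /* middle entry has fully decayed */
        a->next = b;
        b->next = c;

        a = prune_decayed(a);           /* drops b; a and c survive */
        walk(a);

        free(a->next);
        free(a);
        return 0;
}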
diff --git a/queue-4.20/series b/queue-4.20/series
index 7ca0e5c7a039908494b6552ac553bfbe5eacc3f4..813ca7cd70645a31b3264a1a13ec727627d6e73b 100644
@@ -47,3 +47,5 @@ rxe-fix-error-completion-wr_id-and-qp_num.patch
 stm-class-fix-a-module-refcount-leak-in-policy-creation-error-path.patch
 rdma-srpt-fix-a-use-after-free-in-the-channel-release-code.patch
 rdma-iwcm-don-t-copy-past-the-end-of-dev_name-string.patch
+iommu-vt-d-handle-domain-agaw-being-less-than-iommu-agaw.patch
+sched-fair-fix-infinite-loop-in-update_blocked_averages-by-reverting-a9e7f6544b9c.patch