From: Greg Kroah-Hartman
Date: Fri, 11 Jan 2019 12:40:32 +0000 (+0100)
Subject: 4.14-stable patches
X-Git-Tag: v4.20.2~16
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=ea52754f411fa6a84b595a2e69f858a985775a61;p=thirdparty%2Fkernel%2Fstable-queue.git

4.14-stable patches

added patches:
	iommu-vt-d-handle-domain-agaw-being-less-than-iommu-agaw.patch
	sched-fair-fix-infinite-loop-in-update_blocked_averages-by-reverting-a9e7f6544b9c.patch
---

diff --git a/queue-4.14/iommu-vt-d-handle-domain-agaw-being-less-than-iommu-agaw.patch b/queue-4.14/iommu-vt-d-handle-domain-agaw-being-less-than-iommu-agaw.patch
new file mode 100644
index 00000000000..6bf860785a6
--- /dev/null
+++ b/queue-4.14/iommu-vt-d-handle-domain-agaw-being-less-than-iommu-agaw.patch
@@ -0,0 +1,59 @@
+From 3569dd07aaad71920c5ea4da2d5cc9a167c1ffd4 Mon Sep 17 00:00:00 2001
+From: Sohil Mehta
+Date: Wed, 21 Nov 2018 15:29:33 -0800
+Subject: iommu/vt-d: Handle domain agaw being less than iommu agaw
+
+From: Sohil Mehta
+
+commit 3569dd07aaad71920c5ea4da2d5cc9a167c1ffd4 upstream.
+
+The Intel IOMMU driver opportunistically skips a few top level page
+tables from the domain paging directory while programming the IOMMU
+context entry. However there is an implicit assumption in the code that
+domain's adjusted guest address width (agaw) would always be greater
+than IOMMU's agaw.
+
+The IOMMU capabilities in an upcoming platform cause the domain's agaw
+to be lower than IOMMU's agaw. The issue is seen when the IOMMU supports
+both 4-level and 5-level paging. The domain builds a 4-level page table
+based on agaw of 2. However the IOMMU's agaw is set as 3 (5-level). In
+this case the code incorrectly tries to skip page table levels. This
+causes the IOMMU driver to avoid programming the context entry. The
+fix handles this case and programs the context entry accordingly.
+
+Fixes: de24e55395698 ("iommu/vt-d: Simplify domain_context_mapping_one")
+Cc:
+Cc: Ashok Raj
+Cc: Jacob Pan
+Cc: Lu Baolu
+Reviewed-by: Lu Baolu
+Reported-by: Ramos Falcon, Ernesto R
+Tested-by: Ricardo Neri
+Signed-off-by: Sohil Mehta
+Signed-off-by: Joerg Roedel
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/iommu/intel-iommu.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/iommu/intel-iommu.c
++++ b/drivers/iommu/intel-iommu.c
+@@ -2093,7 +2093,7 @@ static int domain_context_mapping_one(st
+ 	 * than default. Unnecessary for PT mode.
+ 	 */
+ 	if (translation != CONTEXT_TT_PASS_THROUGH) {
+-		for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
++		for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
+ 			ret = -ENOMEM;
+ 			pgd = phys_to_virt(dma_pte_addr(pgd));
+ 			if (!dma_pte_present(pgd))
+@@ -2107,7 +2107,7 @@ static int domain_context_mapping_one(st
+ 		translation = CONTEXT_TT_MULTI_LEVEL;
+
+ 		context_set_address_root(context, virt_to_phys(pgd));
+-		context_set_address_width(context, iommu->agaw);
++		context_set_address_width(context, agaw);
+ 	} else {
+ 		/*
+ 		 * In pass through mode, AW must be programmed to
diff --git a/queue-4.14/sched-fair-fix-infinite-loop-in-update_blocked_averages-by-reverting-a9e7f6544b9c.patch b/queue-4.14/sched-fair-fix-infinite-loop-in-update_blocked_averages-by-reverting-a9e7f6544b9c.patch
new file mode 100644
index 00000000000..dd061683c5b
--- /dev/null
+++ b/queue-4.14/sched-fair-fix-infinite-loop-in-update_blocked_averages-by-reverting-a9e7f6544b9c.patch
@@ -0,0 +1,185 @@
+From c40f7d74c741a907cfaeb73a7697081881c497d0 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds
+Date: Thu, 27 Dec 2018 13:46:17 -0800
+Subject: sched/fair: Fix infinite loop in update_blocked_averages() by reverting a9e7f6544b9c
+
+From: Linus Torvalds
+
+commit c40f7d74c741a907cfaeb73a7697081881c497d0 upstream.
+
+Zhipeng Xie, Xie XiuQi and Sargun Dhillon reported lockups in the
+scheduler under high loads, starting at around the v4.18 time frame,
+and Zhipeng Xie tracked it down to bugs in the rq->leaf_cfs_rq_list
+manipulation.
+
+Do a (manual) revert of:
+
+  a9e7f6544b9c ("sched/fair: Fix O(nr_cgroups) in load balance path")
+
+It turns out that the list_del_leaf_cfs_rq() introduced by this commit
+has a surprising property that was not considered in followup commits
+such as:
+
+  9c2791f936ef ("sched/fair: Fix hierarchical order in rq->leaf_cfs_rq_list")
+
+As Vincent Guittot explains:
+
+ "I think that there is a bigger problem with commit a9e7f6544b9c and
+  cfs_rq throttling:
+
+  Let's take the example of the following topology TG2 --> TG1 --> root:
+
+  1) The 1st time a task is enqueued, we will add TG2 cfs_rq then TG1
+     cfs_rq to leaf_cfs_rq_list and we are sure to do the whole branch in
+     one path because it has never been used and can't be throttled so
+     tmp_alone_branch will point to leaf_cfs_rq_list at the end.
+
+  2) Then TG1 is throttled
+
+  3) and we add TG3 as a new child of TG1.
+
+  4) The 1st enqueue of a task on TG3 will add TG3 cfs_rq just before TG1
+     cfs_rq and tmp_alone_branch will stay on rq->leaf_cfs_rq_list.
+
+  With commit a9e7f6544b9c, we can del a cfs_rq from rq->leaf_cfs_rq_list.
+  So if the load of TG1 cfs_rq becomes NULL before step 2) above, TG1
+  cfs_rq is removed from the list.
+  Then at step 4), TG3 cfs_rq is added at the beginning of rq->leaf_cfs_rq_list
+  but tmp_alone_branch still points to TG3 cfs_rq because its throttled
+  parent can't be enqueued when the lock is released.
+  tmp_alone_branch doesn't point to rq->leaf_cfs_rq_list whereas it should.
+
+  So if TG3 cfs_rq is removed or destroyed before tmp_alone_branch
+  points to another TG cfs_rq, the next TG cfs_rq that will be added,
+  will be linked outside rq->leaf_cfs_rq_list - which is bad.
+
+  In addition, we can break the ordering of the cfs_rq in
+  rq->leaf_cfs_rq_list but this ordering is used to update and
+  propagate the update from leaf down to root."
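
The failure mode Vincent describes above, a saved insertion cursor that keeps
pointing at a list entry after that entry has been deleted, can be sketched
with a plain circular doubly-linked list, independent of the scheduler. This
is only an abstract illustration and not part of the patch: link_after(),
unlink_entry() and on_list() below are simplified local helpers rather than
the kernel's list.h primitives (unlink_entry() behaves like list_del_init(),
leaving the removed entry pointing at itself), and "cursor" stands in for
tmp_alone_branch.

#include <stdio.h>

struct node {
	struct node *prev, *next;
	const char *name;
};

static void list_init(struct node *head)
{
	head->prev = head->next = head;
}

/* Insert n right after prev. */
static void link_after(struct node *n, struct node *prev)
{
	n->next = prev->next;
	n->prev = prev;
	prev->next->prev = n;
	prev->next = n;
}

/* Remove n and leave it pointing at itself (as list_del_init() would). */
static void unlink_entry(struct node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	n->prev = n->next = n;
}

/* Is n reachable by walking the list from head? */
static int on_list(struct node *head, struct node *n)
{
	struct node *p;

	for (p = head->next; p != head; p = p->next)
		if (p == n)
			return 1;
	return 0;
}

int main(void)
{
	struct node head, tg1 = { .name = "TG1" }, tg3 = { .name = "TG3" };
	struct node *cursor;

	list_init(&head);
	link_after(&tg1, &head);	/* TG1's entry is on the leaf list     */
	cursor = &tg1;			/* cursor left parked on TG1's entry   */

	unlink_entry(&tg1);		/* entry deleted while the cursor      */
					/* still references it                 */

	link_after(&tg3, cursor);	/* next entry linked via the stale     */
					/* cursor                              */

	printf("TG3 reachable from the list head? %s\n",
	       on_list(&head, &tg3) ? "yes" : "no");
	return 0;
}

Compiled as-is, this prints "no": the new entry is reachable only through the
stale cursor, never from the list head, which is the abstract version of a
cfs_rq ending up linked outside rq->leaf_cfs_rq_list.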
+
+Instead of trying to work through all these cases and trying to reproduce
+the very high loads that produced the lockup to begin with, simplify
+the code temporarily by reverting a9e7f6544b9c - which change was clearly
+not thought through completely.
+
+This (hopefully) gives us a kernel that doesn't lock up so people
+can continue to enjoy their holidays without worrying about regressions. ;-)
+
+[ mingo: Wrote changelog, fixed weird spelling in code comment while at it. ]
+
+Analyzed-by: Xie XiuQi
+Analyzed-by: Vincent Guittot
+Reported-by: Zhipeng Xie
+Reported-by: Sargun Dhillon
+Reported-by: Xie XiuQi
+Tested-by: Zhipeng Xie
+Tested-by: Sargun Dhillon
+Signed-off-by: Linus Torvalds
+Acked-by: Vincent Guittot
+Cc: # v4.13+
+Cc: Bin Li
+Cc: Mike Galbraith
+Cc: Peter Zijlstra
+Cc: Tejun Heo
+Cc: Thomas Gleixner
+Fixes: a9e7f6544b9c ("sched/fair: Fix O(nr_cgroups) in load balance path")
+Link: http://lkml.kernel.org/r/1545879866-27809-1-git-send-email-xiexiuqi@huawei.com
+Signed-off-by: Ingo Molnar
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ kernel/sched/fair.c |   43 +++++++++----------------------------------
+ 1 file changed, 9 insertions(+), 34 deletions(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -369,10 +369,9 @@ static inline void list_del_leaf_cfs_rq(
+ 	}
+ }
+
+-/* Iterate thr' all leaf cfs_rq's on a runqueue */
+-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
+-	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
+-				 leaf_cfs_rq_list)
++/* Iterate through all leaf cfs_rq's on a runqueue: */
++#define for_each_leaf_cfs_rq(rq, cfs_rq) \
++	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+
+ /* Do the two (enqueued) entities belong to the same group ? */
+ static inline struct cfs_rq *
+@@ -465,8 +464,8 @@ static inline void list_del_leaf_cfs_rq(
+ {
+ }
+
+-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
+-	for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
++#define for_each_leaf_cfs_rq(rq, cfs_rq) \
++	for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+
+ static inline struct sched_entity *parent_entity(struct sched_entity *se)
+ {
+@@ -6970,27 +6969,10 @@ static void attach_tasks(struct lb_env *
+
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+
+-static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+-{
+-	if (cfs_rq->load.weight)
+-		return false;
+-
+-	if (cfs_rq->avg.load_sum)
+-		return false;
+-
+-	if (cfs_rq->avg.util_sum)
+-		return false;
+-
+-	if (cfs_rq->runnable_load_sum)
+-		return false;
+-
+-	return true;
+-}
+-
+ static void update_blocked_averages(int cpu)
+ {
+ 	struct rq *rq = cpu_rq(cpu);
+-	struct cfs_rq *cfs_rq, *pos;
++	struct cfs_rq *cfs_rq;
+ 	struct rq_flags rf;
+
+ 	rq_lock_irqsave(rq, &rf);
+@@ -7000,7 +6982,7 @@ static void update_blocked_averages(int
+ 	 * Iterates the task_group tree in a bottom up fashion, see
+ 	 * list_add_leaf_cfs_rq() for details.
+ 	 */
+-	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
++	for_each_leaf_cfs_rq(rq, cfs_rq) {
+ 		struct sched_entity *se;
+
+ 		/* throttled entities do not contribute to load */
+@@ -7014,13 +6996,6 @@ static void update_blocked_averages(int
+ 		se = cfs_rq->tg->se[cpu];
+ 		if (se && !skip_blocked_update(se))
+ 			update_load_avg(se, 0);
+-
+-		/*
+-		 * There can be a lot of idle CPU cgroups. Don't let fully
+-		 * decayed cfs_rqs linger on the list.
+-		 */
+-		if (cfs_rq_is_decayed(cfs_rq))
+-			list_del_leaf_cfs_rq(cfs_rq);
+ 	}
+ 	rq_unlock_irqrestore(rq, &rf);
+ }
+@@ -9580,10 +9555,10 @@ const struct sched_class fair_sched_clas
+ #ifdef CONFIG_SCHED_DEBUG
+ void print_cfs_stats(struct seq_file *m, int cpu)
+ {
+-	struct cfs_rq *cfs_rq, *pos;
++	struct cfs_rq *cfs_rq;
+
+ 	rcu_read_lock();
+-	for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
++	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+ 		print_cfs_rq(m, cpu, cfs_rq);
+ 	rcu_read_unlock();
+ }
diff --git a/queue-4.14/series b/queue-4.14/series
index d6e41c99927..3113e59810f 100644
--- a/queue-4.14/series
+++ b/queue-4.14/series
@@ -95,3 +95,5 @@ arm64-relocatable-fix-inconsistencies-in-linker-script-and-options.patch
 powerpc-tm-set-msr-just-prior-to-recheckpoint.patch
 9p-net-put-a-lower-bound-on-msize.patch
 rxe-fix-error-completion-wr_id-and-qp_num.patch
+iommu-vt-d-handle-domain-agaw-being-less-than-iommu-agaw.patch
+sched-fair-fix-infinite-loop-in-update_blocked_averages-by-reverting-a9e7f6544b9c.patch
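
The loop-bound change in the first patch above (domain_context_mapping_one()
in intel-iommu.c) is easier to see with concrete numbers. The sketch below is
a standalone toy, not driver code: levels_skipped() is a made-up helper that
models the agaw walk with plain integers, using the values from the commit
message (domain agaw 2 for a 4-level table, IOMMU agaw 3 for 5-level
capability). It only covers the loop bound; the second hunk, which programs
the context address width from the final agaw, is not modelled here.

#include <stdio.h>

/*
 * Toy model of the top-level page-table skipping loop.  Returns how many
 * levels get skipped, or -1 if the walk reaches level 0 without ever hitting
 * iommu_agaw.  In the real driver such a runaway walk ends up failing the
 * !dma_pte_present() check, so the context entry is never programmed, which
 * matches the failure described in the commit message.
 */
static int levels_skipped(int domain_agaw, int iommu_agaw, int use_fixed_bound)
{
	int agaw, skipped = 0;

	for (agaw = domain_agaw;
	     use_fixed_bound ? (agaw > iommu_agaw)	/* bound after the fix  */
			     : (agaw != iommu_agaw);	/* bound before the fix */
	     agaw--) {
		skipped++;			/* one level walked down here */
		if (agaw <= 0)			/* guard so the sketch stops  */
			return -1;
	}
	return skipped;
}

int main(void)
{
	printf("old bound ('!='): %d\n", levels_skipped(2, 3, 0));	/* -1 */
	printf("new bound ('>') : %d\n", levels_skipped(2, 3, 1));	/*  0 */
	return 0;
}

For the usual case where the domain's agaw is at least the IOMMU's, both
bounds skip the same number of levels; they only diverge in the new
smaller-domain-agaw configuration that the patch handles.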