--- /dev/null
+From 3569dd07aaad71920c5ea4da2d5cc9a167c1ffd4 Mon Sep 17 00:00:00 2001
+From: Sohil Mehta <sohil.mehta@intel.com>
+Date: Wed, 21 Nov 2018 15:29:33 -0800
+Subject: iommu/vt-d: Handle domain agaw being less than iommu agaw
+
+From: Sohil Mehta <sohil.mehta@intel.com>
+
+commit 3569dd07aaad71920c5ea4da2d5cc9a167c1ffd4 upstream.
+
+The Intel IOMMU driver opportunistically skips a few top level page
+tables from the domain paging directory while programming the IOMMU
+context entry. However there is an implicit assumption in the code that
+domain's adjusted guest address width (agaw) would always be greater
+than or equal to IOMMU's agaw.
+
+The IOMMU capabilities in an upcoming platform cause the domain's agaw
+to be lower than IOMMU's agaw. The issue is seen when the IOMMU supports
+both 4-level and 5-level paging. The domain builds a 4-level page table
+based on agaw of 2. However the IOMMU's agaw is set as 3 (5-level). In
+this case the code incorrectly tries to skip page table levels.
+This causes the IOMMU driver to avoid programming the context entry. The
+fix handles this case and programs the context entry accordingly.
+
+Fixes: de24e55395698 ("iommu/vt-d: Simplify domain_context_mapping_one")
+Cc: <stable@vger.kernel.org>
+Cc: Ashok Raj <ashok.raj@intel.com>
+Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
+Cc: Lu Baolu <baolu.lu@linux.intel.com>
+Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
+Reported-by: Ramos Falcon, Ernesto R <ernesto.r.ramos.falcon@intel.com>
+Tested-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Signed-off-by: Sohil Mehta <sohil.mehta@intel.com>
+Signed-off-by: Joerg Roedel <jroedel@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/iommu/intel-iommu.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/iommu/intel-iommu.c
++++ b/drivers/iommu/intel-iommu.c
+@@ -2093,7 +2093,7 @@ static int domain_context_mapping_one(st
+ * than default. Unnecessary for PT mode.
+ */
+ if (translation != CONTEXT_TT_PASS_THROUGH) {
+- for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
++ for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
+ ret = -ENOMEM;
+ pgd = phys_to_virt(dma_pte_addr(pgd));
+ if (!dma_pte_present(pgd))
+@@ -2107,7 +2107,7 @@ static int domain_context_mapping_one(st
+ translation = CONTEXT_TT_MULTI_LEVEL;
+
+ context_set_address_root(context, virt_to_phys(pgd));
+- context_set_address_width(context, iommu->agaw);
++ context_set_address_width(context, agaw);
+ } else {
+ /*
+ * In pass through mode, AW must be programmed to
--- /dev/null
+From c40f7d74c741a907cfaeb73a7697081881c497d0 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Thu, 27 Dec 2018 13:46:17 -0800
+Subject: sched/fair: Fix infinite loop in update_blocked_averages() by reverting a9e7f6544b9c
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit c40f7d74c741a907cfaeb73a7697081881c497d0 upstream.
+
+Zhipeng Xie, Xie XiuQi and Sargun Dhillon reported lockups in the
+scheduler under high loads, starting at around the v4.18 time frame,
+and Zhipeng Xie tracked it down to bugs in the rq->leaf_cfs_rq_list
+manipulation.
+
+Do a (manual) revert of:
+
+ a9e7f6544b9c ("sched/fair: Fix O(nr_cgroups) in load balance path")
+
+It turns out that the list_del_leaf_cfs_rq() introduced by this commit
+is a surprising property that was not considered in followup commits
+such as:
+
+ 9c2791f936ef ("sched/fair: Fix hierarchical order in rq->leaf_cfs_rq_list")
+
+As Vincent Guittot explains:
+
+ "I think that there is a bigger problem with commit a9e7f6544b9c and
+ cfs_rq throttling:
+
+ Let's take the example of the following topology TG2 --> TG1 --> root:
+
+ 1) The 1st time a task is enqueued, we will add TG2 cfs_rq then TG1
+ cfs_rq to leaf_cfs_rq_list and we are sure to do the whole branch in
+ one path because it has never been used and can't be throttled so
+ tmp_alone_branch will point to leaf_cfs_rq_list at the end.
+
+ 2) Then TG1 is throttled
+
+ 3) and we add TG3 as a new child of TG1.
+
+ 4) The 1st enqueue of a task on TG3 will add TG3 cfs_rq just before TG1
+ cfs_rq and tmp_alone_branch will stay on rq->leaf_cfs_rq_list.
+
+ With commit a9e7f6544b9c, we can del a cfs_rq from rq->leaf_cfs_rq_list.
+ So if the load of TG1 cfs_rq becomes NULL before step 2) above, TG1
+ cfs_rq is removed from the list.
+ Then at step 4), TG3 cfs_rq is added at the beginning of rq->leaf_cfs_rq_list
+ but tmp_alone_branch still points to TG3 cfs_rq because its throttled
+ parent can't be enqueued when the lock is released.
+ tmp_alone_branch doesn't point to rq->leaf_cfs_rq_list whereas it should.
+
+ So if TG3 cfs_rq is removed or destroyed before tmp_alone_branch
+ points on another TG cfs_rq, the next TG cfs_rq that will be added,
+ will be linked outside rq->leaf_cfs_rq_list - which is bad.
+
+ In addition, we can break the ordering of the cfs_rq in
+ rq->leaf_cfs_rq_list but this ordering is used to update and
+ propagate the update from leaf down to root."
+
+Instead of trying to work through all these cases and trying to reproduce
+the very high loads that produced the lockup to begin with, simplify
+the code temporarily by reverting a9e7f6544b9c - a change which was clearly
+not thought through completely.
+
+This (hopefully) gives us a kernel that doesn't lock up so people
+can continue to enjoy their holidays without worrying about regressions. ;-)
+
+[ mingo: Wrote changelog, fixed weird spelling in code comment while at it. ]
+
+Analyzed-by: Xie XiuQi <xiexiuqi@huawei.com>
+Analyzed-by: Vincent Guittot <vincent.guittot@linaro.org>
+Reported-by: Zhipeng Xie <xiezhipeng1@huawei.com>
+Reported-by: Sargun Dhillon <sargun@sargun.me>
+Reported-by: Xie XiuQi <xiexiuqi@huawei.com>
+Tested-by: Zhipeng Xie <xiezhipeng1@huawei.com>
+Tested-by: Sargun Dhillon <sargun@sargun.me>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
+Cc: <stable@vger.kernel.org> # v4.13+
+Cc: Bin Li <huawei.libin@huawei.com>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Fixes: a9e7f6544b9c ("sched/fair: Fix O(nr_cgroups) in load balance path")
+Link: http://lkml.kernel.org/r/1545879866-27809-1-git-send-email-xiexiuqi@huawei.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/fair.c | 43 +++++++++----------------------------------
+ 1 file changed, 9 insertions(+), 34 deletions(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -369,10 +369,9 @@ static inline void list_del_leaf_cfs_rq(
+ }
+ }
+
+-/* Iterate thr' all leaf cfs_rq's on a runqueue */
+-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
+- list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
+- leaf_cfs_rq_list)
++/* Iterate through all leaf cfs_rq's on a runqueue: */
++#define for_each_leaf_cfs_rq(rq, cfs_rq) \
++ list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+
+ /* Do the two (enqueued) entities belong to the same group ? */
+ static inline struct cfs_rq *
+@@ -465,8 +464,8 @@ static inline void list_del_leaf_cfs_rq(
+ {
+ }
+
+-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
+- for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
++#define for_each_leaf_cfs_rq(rq, cfs_rq) \
++ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+
+ static inline struct sched_entity *parent_entity(struct sched_entity *se)
+ {
+@@ -6970,27 +6969,10 @@ static void attach_tasks(struct lb_env *
+
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+
+-static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+-{
+- if (cfs_rq->load.weight)
+- return false;
+-
+- if (cfs_rq->avg.load_sum)
+- return false;
+-
+- if (cfs_rq->avg.util_sum)
+- return false;
+-
+- if (cfs_rq->runnable_load_sum)
+- return false;
+-
+- return true;
+-}
+-
+ static void update_blocked_averages(int cpu)
+ {
+ struct rq *rq = cpu_rq(cpu);
+- struct cfs_rq *cfs_rq, *pos;
++ struct cfs_rq *cfs_rq;
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+@@ -7000,7 +6982,7 @@ static void update_blocked_averages(int
+ * Iterates the task_group tree in a bottom up fashion, see
+ * list_add_leaf_cfs_rq() for details.
+ */
+- for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
++ for_each_leaf_cfs_rq(rq, cfs_rq) {
+ struct sched_entity *se;
+
+ /* throttled entities do not contribute to load */
+@@ -7014,13 +6996,6 @@ static void update_blocked_averages(int
+ se = cfs_rq->tg->se[cpu];
+ if (se && !skip_blocked_update(se))
+ update_load_avg(se, 0);
+-
+- /*
+- * There can be a lot of idle CPU cgroups. Don't let fully
+- * decayed cfs_rqs linger on the list.
+- */
+- if (cfs_rq_is_decayed(cfs_rq))
+- list_del_leaf_cfs_rq(cfs_rq);
+ }
+ rq_unlock_irqrestore(rq, &rf);
+ }
+@@ -9580,10 +9555,10 @@ const struct sched_class fair_sched_clas
+ #ifdef CONFIG_SCHED_DEBUG
+ void print_cfs_stats(struct seq_file *m, int cpu)
+ {
+- struct cfs_rq *cfs_rq, *pos;
++ struct cfs_rq *cfs_rq;
+
+ rcu_read_lock();
+- for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
++ for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+ print_cfs_rq(m, cpu, cfs_rq);
+ rcu_read_unlock();
+ }