From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 24 Jan 2022 18:32:58 +0000 (+0100)
Subject: 4.9-stable patches
X-Git-Tag: v4.4.300~16
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=c769fb4d1741dd67694459da21e1c10bfe0b10f6;p=thirdparty%2Fkernel%2Fstable-queue.git

4.9-stable patches

added patches:
	cipso-calipso-resolve-a-number-of-problems-with-the-doi-refcounts.patch
	kvm-do-not-allow-mapping-valid-but-non-reference-counted-pages.patch
	kvm-do-not-assume-pte-is-writable-after-follow_pfn.patch
	kvm-use-kvm_pfn_t-for-local-pfn-variable-in-hva_to_pfn_remapped.patch
	lib-timerqueue-rely-on-rbtree-semantics-for-next-timer.patch
	mm-add-follow_pte_pmd.patch
	rbtree-cache-leftmost-node-internally.patch
---

diff --git a/queue-4.9/cipso-calipso-resolve-a-number-of-problems-with-the-doi-refcounts.patch b/queue-4.9/cipso-calipso-resolve-a-number-of-problems-with-the-doi-refcounts.patch
new file mode 100644
index 00000000000..8c3006b93c7
--- /dev/null
+++ b/queue-4.9/cipso-calipso-resolve-a-number-of-problems-with-the-doi-refcounts.patch
@@ -0,0 +1,140 @@
+From foo@baz Mon Jan 24 07:28:36 PM CET 2022
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Mon, 24 Jan 2022 17:32:21 +0100
+Subject: cipso,calipso: resolve a number of problems with the DOI refcounts
+To: stable@vger.kernel.org
+Cc: Paul Moore <paul@paul-moore.com>
+Message-ID: <Ye7UlciYUYBcfI31@decadent.org.uk>
+Content-Disposition: inline
+
+From: Paul Moore <paul@paul-moore.com>
+
+commit ad5d07f4a9cd671233ae20983848874731102c08 upstream.
+
+The current CIPSO and CALIPSO refcounting scheme for the DOI
+definitions is a bit flawed in that we:
+
+1. Don't correctly match gets/puts in netlbl_cipsov4_list().
+2. Decrement the refcount on each attempt to remove the DOI from the
+   DOI list, only removing it from the list once the refcount drops
+   to zero.
+
+This patch fixes these problems by adding the missing "puts" to
+netlbl_cipsov4_list() and introduces a more conventional, i.e.
+not-buggy, refcounting mechanism to the DOI definitions.  Upon the
+addition of a DOI to the DOI list, it is initialized with a refcount
+of one, removing a DOI from the list removes it from the list and
+drops the refcount by one; "gets" and "puts" behave as expected with
+respect to refcounts, increasing and decreasing the DOI's refcount by
+one.
+
+Fixes: b1edeb102397 ("netlabel: Replace protocol/NetLabel linking with refrerence counts")
+Fixes: d7cce01504a0 ("netlabel: Add support for removing a CALIPSO DOI.")
+Reported-by: syzbot+9ec037722d2603a9f52e@syzkaller.appspotmail.com
+Signed-off-by: Paul Moore <paul@paul-moore.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+[bwh: Backported to 4.9: adjust context]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/cipso_ipv4.c            |   11 +----------
+ net/ipv6/calipso.c               |   14 +++++---------
+ net/netlabel/netlabel_cipso_v4.c |    3 +++
+ 3 files changed, 9 insertions(+), 19 deletions(-)
+
+--- a/net/ipv4/cipso_ipv4.c
++++ b/net/ipv4/cipso_ipv4.c
+@@ -534,16 +534,10 @@ int cipso_v4_doi_remove(u32 doi, struct
+ 		ret_val = -ENOENT;
+ 		goto doi_remove_return;
+ 	}
+-	if (!atomic_dec_and_test(&doi_def->refcount)) {
+-		spin_unlock(&cipso_v4_doi_list_lock);
+-		ret_val = -EBUSY;
+-		goto doi_remove_return;
+-	}
+ 	list_del_rcu(&doi_def->list);
+ 	spin_unlock(&cipso_v4_doi_list_lock);
+ 
+-	cipso_v4_cache_invalidate();
+-	call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu);
++	cipso_v4_doi_putdef(doi_def);
+ 	ret_val = 0;
+ 
+ doi_remove_return:
+@@ -600,9 +594,6 @@ void cipso_v4_doi_putdef(struct cipso_v4
+ 
+ 	if (!atomic_dec_and_test(&doi_def->refcount))
+ 		return;
+-	spin_lock(&cipso_v4_doi_list_lock);
+-	list_del_rcu(&doi_def->list);
+-	spin_unlock(&cipso_v4_doi_list_lock);
+ 
+ 	cipso_v4_cache_invalidate();
+ 	call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu);
+--- a/net/ipv6/calipso.c
++++ b/net/ipv6/calipso.c
+@@ -97,6 +97,9 @@ struct calipso_map_cache_entry {
+ 
+ static struct calipso_map_cache_bkt *calipso_cache;
+ 
++static void calipso_cache_invalidate(void);
++static void calipso_doi_putdef(struct calipso_doi *doi_def);
++
+ /* Label Mapping Cache Functions
+  */
+ 
+@@ -458,15 +461,10 @@ static int calipso_doi_remove(u32 doi, s
+ 		ret_val = -ENOENT;
+ 		goto doi_remove_return;
+ 	}
+-	if (!atomic_dec_and_test(&doi_def->refcount)) {
+-		spin_unlock(&calipso_doi_list_lock);
+-		ret_val = -EBUSY;
+-		goto doi_remove_return;
+-	}
+ 	list_del_rcu(&doi_def->list);
+ 	spin_unlock(&calipso_doi_list_lock);
+ 
+-	call_rcu(&doi_def->rcu, calipso_doi_free_rcu);
++	calipso_doi_putdef(doi_def);
+ 	ret_val = 0;
+ 
+ doi_remove_return:
+@@ -522,10 +520,8 @@ static void calipso_doi_putdef(struct ca
+ 
+ 	if (!atomic_dec_and_test(&doi_def->refcount))
+ 		return;
+-	spin_lock(&calipso_doi_list_lock);
+-	list_del_rcu(&doi_def->list);
+-	spin_unlock(&calipso_doi_list_lock);
+ 
++	calipso_cache_invalidate();
+ 	call_rcu(&doi_def->rcu, calipso_doi_free_rcu);
+ }
+ 
+--- a/net/netlabel/netlabel_cipso_v4.c
++++ b/net/netlabel/netlabel_cipso_v4.c
+@@ -587,6 +587,7 @@ list_start:
+ 
+ 		break;
+ 	}
++	cipso_v4_doi_putdef(doi_def);
+ 	rcu_read_unlock();
+ 
+ 	genlmsg_end(ans_skb, data);
+@@ -595,12 +596,14 @@ list_start:
+ list_retry:
+ 	/* XXX - this limit is a guesstimate */
+ 	if (nlsze_mult < 4) {
++		cipso_v4_doi_putdef(doi_def);
+ 		rcu_read_unlock();
+ 		kfree_skb(ans_skb);
+ 		nlsze_mult *= 2;
+ 		goto list_start;
+ 	}
+ list_failure_lock:
++	cipso_v4_doi_putdef(doi_def);
+ 	rcu_read_unlock();
+ list_failure:
+ 	kfree_skb(ans_skb);
diff --git a/queue-4.9/kvm-do-not-allow-mapping-valid-but-non-reference-counted-pages.patch b/queue-4.9/kvm-do-not-allow-mapping-valid-but-non-reference-counted-pages.patch
new file mode 100644
index 00000000000..6bb114d59cd
--- /dev/null
+++ b/queue-4.9/kvm-do-not-allow-mapping-valid-but-non-reference-counted-pages.patch
@@ -0,0 +1,75 @@
+From foo@baz Mon Jan 24 07:28:36 PM CET 2022
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Mon, 24 Jan 2022 17:45:00 +0100
+Subject: KVM: do not allow mapping valid but non-reference-counted pages
+To: stable@vger.kernel.org
+Cc: Paolo Bonzini <pbonzini@redhat.com>, Nicholas Piggin <npiggin@gmail.com>
+Message-ID: <Ye7XjLxf61gjp8w2@decadent.org.uk>
+Content-Disposition: inline
+
+From: Nicholas Piggin <npiggin@gmail.com>
+
+commit f8be156be163a052a067306417cd0ff679068c97 upstream.
+
+It's possible to create a region which maps valid but non-refcounted
+pages (e.g., tail pages of non-compound higher order allocations). These
+host pages can then be returned by gfn_to_page, gfn_to_pfn, etc., family
+of APIs, which take a reference to the page, which takes it from 0 to 1.
+When the reference is dropped, this will free the page incorrectly.
+
+Fix this by only taking a reference on valid pages if it was non-zero,
+which indicates it is participating in normal refcounting (and can be
+released with put_page).
+
+This addresses CVE-2021-22543.
+
+Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
+Tested-by: Paolo Bonzini <pbonzini@redhat.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ virt/kvm/kvm_main.c |   19 +++++++++++++++++--
+ 1 file changed, 17 insertions(+), 2 deletions(-)
+
+--- a/virt/kvm/kvm_main.c
++++ b/virt/kvm/kvm_main.c
+@@ -1513,6 +1513,13 @@ static bool vma_is_valid(struct vm_area_
+ 	return true;
+ }
+ 
++static int kvm_try_get_pfn(kvm_pfn_t pfn)
++{
++	if (kvm_is_reserved_pfn(pfn))
++		return 1;
++	return get_page_unless_zero(pfn_to_page(pfn));
++}
++
+ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
+ 			       unsigned long addr, bool *async,
+ 			       bool write_fault, bool *writable,
+@@ -1562,13 +1569,21 @@ static int hva_to_pfn_remapped(struct vm
+ 	 * Whoever called remap_pfn_range is also going to call e.g.
+ 	 * unmap_mapping_range before the underlying pages are freed,
+ 	 * causing a call to our MMU notifier.
++	 *
++	 * Certain IO or PFNMAP mappings can be backed with valid
++	 * struct pages, but be allocated without refcounting e.g.,
++	 * tail pages of non-compound higher order allocations, which
++	 * would then underflow the refcount when the caller does the
++	 * required put_page. Don't allow those pages here.
+ 	 */ 
+-	kvm_get_pfn(pfn);
++	if (!kvm_try_get_pfn(pfn))
++		r = -EFAULT;
+ 
+ out:
+ 	pte_unmap_unlock(ptep, ptl);
+ 	*p_pfn = pfn;
+-	return 0;
++
++	return r;
+ }
+ 
+ /*
diff --git a/queue-4.9/kvm-do-not-assume-pte-is-writable-after-follow_pfn.patch b/queue-4.9/kvm-do-not-assume-pte-is-writable-after-follow_pfn.patch
new file mode 100644
index 00000000000..66900a67c25
--- /dev/null
+++ b/queue-4.9/kvm-do-not-assume-pte-is-writable-after-follow_pfn.patch
@@ -0,0 +1,95 @@
+From foo@baz Mon Jan 24 07:28:36 PM CET 2022
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Mon, 24 Jan 2022 17:43:47 +0100
+Subject: KVM: do not assume PTE is writable after follow_pfn
+To: stable@vger.kernel.org
+Cc: Paolo Bonzini <pbonzini@redhat.com>, David Stevens <stevensd@google.com>, Ovidiu Panait <ovidiu.panait@windriver.com>, Ross Zwisler <ross.zwisler@linux.intel.com>, Andrew Morton <akpm@linux-foundation.org>
+Message-ID: <Ye7XQ+uWAtNM+OlG@decadent.org.uk>
+Content-Disposition: inline
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit bd2fae8da794b55bf2ac02632da3a151b10e664c upstream.
+
+In order to convert an HVA to a PFN, KVM usually tries to use
+the get_user_pages family of functions.  This however is not
+possible for VM_IO vmas; in that case, KVM instead uses follow_pfn.
+
+In doing this however KVM loses the information on whether the
+PFN is writable.  That is usually not a problem because the main
+use of VM_IO vmas with KVM is for BARs in PCI device assignment,
+however it is a bug.  To fix it, use follow_pte and check pte_write
+while under the protection of the PTE lock.  The information can
+be used to fail hva_to_pfn_remapped or passed back to the
+caller via *writable.
+
+Usage of follow_pfn was introduced in commit add6a0cd1c5b ("KVM: MMU: try to fix
+up page faults before giving up", 2016-07-05); however, even older versions
+have the same issue, all the way back to commit 2e2e3738af33 ("KVM:
+Handle vma regions with no backing page", 2008-07-20), as they also did
+not check whether the PFN was writable.
+
+Fixes: 2e2e3738af33 ("KVM: Handle vma regions with no backing page")
+Reported-by: David Stevens <stevensd@google.com>
+Cc: 3pvd@google.com
+Cc: Jann Horn <jannh@google.com>
+Cc: Jason Gunthorpe <jgg@ziepe.ca>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+[OP: backport to 4.19, adjust follow_pte() -> follow_pte_pmd()]
+Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+[bwh: Backport to 4.9: follow_pte_pmd() does not take start or end
+ parameters]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ virt/kvm/kvm_main.c |   15 ++++++++++++---
+ 1 file changed, 12 insertions(+), 3 deletions(-)
+
+--- a/virt/kvm/kvm_main.c
++++ b/virt/kvm/kvm_main.c
+@@ -1519,9 +1519,11 @@ static int hva_to_pfn_remapped(struct vm
+ 			       kvm_pfn_t *p_pfn)
+ {
+ 	unsigned long pfn;
++	pte_t *ptep;
++	spinlock_t *ptl;
+ 	int r;
+ 
+-	r = follow_pfn(vma, addr, &pfn);
++	r = follow_pte_pmd(vma->vm_mm, addr, &ptep, NULL, &ptl);
+ 	if (r) {
+ 		/*
+ 		 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
+@@ -1536,14 +1538,19 @@ static int hva_to_pfn_remapped(struct vm
+ 		if (r)
+ 			return r;
+ 
+-		r = follow_pfn(vma, addr, &pfn);
++		r = follow_pte_pmd(vma->vm_mm, addr, &ptep, NULL, &ptl);
+ 		if (r)
+ 			return r;
++	}
+ 
++	if (write_fault && !pte_write(*ptep)) {
++		pfn = KVM_PFN_ERR_RO_FAULT;
++		goto out;
+ 	}
+ 
+ 	if (writable)
+-		*writable = true;
++		*writable = pte_write(*ptep);
++	pfn = pte_pfn(*ptep);
+ 
+ 	/*
+ 	 * Get a reference here because callers of *hva_to_pfn* and
+@@ -1558,6 +1565,8 @@ static int hva_to_pfn_remapped(struct vm
+ 	 */ 
+ 	kvm_get_pfn(pfn);
+ 
++out:
++	pte_unmap_unlock(ptep, ptl);
+ 	*p_pfn = pfn;
+ 	return 0;
+ }
diff --git a/queue-4.9/kvm-use-kvm_pfn_t-for-local-pfn-variable-in-hva_to_pfn_remapped.patch b/queue-4.9/kvm-use-kvm_pfn_t-for-local-pfn-variable-in-hva_to_pfn_remapped.patch
new file mode 100644
index 00000000000..d8c21beef7c
--- /dev/null
+++ b/queue-4.9/kvm-use-kvm_pfn_t-for-local-pfn-variable-in-hva_to_pfn_remapped.patch
@@ -0,0 +1,51 @@
+From foo@baz Mon Jan 24 07:28:36 PM CET 2022
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Mon, 24 Jan 2022 17:44:27 +0100
+Subject: KVM: Use kvm_pfn_t for local PFN variable in hva_to_pfn_remapped()
+To: stable@vger.kernel.org
+Cc: Paolo Bonzini <pbonzini@redhat.com>, Sean Christopherson <seanjc@google.com>, David Stevens <stevensd@google.com>
+Message-ID: <Ye7Xa599VFy2WEkC@decadent.org.uk>
+Content-Disposition: inline
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit a9545779ee9e9e103648f6f2552e73cfe808d0f4 upstream.
+
+Use kvm_pfn_t, a.k.a. u64, for the local 'pfn' variable when retrieving
+a so called "remapped" hva/pfn pair.  In theory, the hva could resolve to
+a pfn in high memory on a 32-bit kernel.
+
+This bug was inadvertently exposed by commit bd2fae8da794 ("KVM: do not
+assume PTE is writable after follow_pfn"), which added an error PFN value
+to the mix, causing gcc to complain about overflowing the unsigned long.
+
+  arch/x86/kvm/../../../virt/kvm/kvm_main.c: In function 'hva_to_pfn_remapped':
+  include/linux/kvm_host.h:89:30: error: conversion from 'long long unsigned int'
+                                  to 'long unsigned int' changes value from
+                                  '9218868437227405314' to '2' [-Werror=overflow]
+   89 | #define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 2)
+      |                              ^
+virt/kvm/kvm_main.c:1935:9: note: in expansion of macro 'KVM_PFN_ERR_RO_FAULT'
+
+Cc: stable@vger.kernel.org
+Fixes: add6a0cd1c5b ("KVM: MMU: try to fix up page faults before giving up")
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20210208201940.1258328-1-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ virt/kvm/kvm_main.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/virt/kvm/kvm_main.c
++++ b/virt/kvm/kvm_main.c
+@@ -1518,7 +1518,7 @@ static int hva_to_pfn_remapped(struct vm
+ 			       bool write_fault, bool *writable,
+ 			       kvm_pfn_t *p_pfn)
+ {
+-	unsigned long pfn;
++	kvm_pfn_t pfn;
+ 	pte_t *ptep;
+ 	spinlock_t *ptl;
+ 	int r;
diff --git a/queue-4.9/lib-timerqueue-rely-on-rbtree-semantics-for-next-timer.patch b/queue-4.9/lib-timerqueue-rely-on-rbtree-semantics-for-next-timer.patch
new file mode 100644
index 00000000000..a33088f9232
--- /dev/null
+++ b/queue-4.9/lib-timerqueue-rely-on-rbtree-semantics-for-next-timer.patch
@@ -0,0 +1,141 @@
+From foo@baz Mon Jan 24 07:28:36 PM CET 2022
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Mon, 24 Jan 2022 17:33:46 +0100
+Subject: lib/timerqueue: Rely on rbtree semantics for next timer
+To: stable@vger.kernel.org
+Cc: Davidlohr Bueso <dave@stgolabs.net>, Thomas Gleixner <tglx@linutronix.de>
+Message-ID: <Ye7U6m6TkFmoM5iZ@decadent.org.uk>
+Content-Disposition: inline
+
+From: Davidlohr Bueso <dave@stgolabs.net>
+
+commit 511885d7061eda3eb1faf3f57dcc936ff75863f1 upstream.
+
+Simplify the timerqueue code by using cached rbtrees and rely on the tree
+leftmost node semantics to get the timer with earliest expiration time.
+This is a drop in conversion, and therefore semantics remain untouched.
+
+The runtime overhead of cached rbtrees is pretty much the same as the
+current head->next method, noting that when removing the leftmost node,
+a common operation for the timerqueue, the rb_next(leftmost) is O(1) as
+well, so the next timer will either be the right node or its parent.
+Therefore no extra pointer chasing. Finally, the size of the struct
+timerqueue_head remains the same.
+
+Passes several hours of rcutorture.
+
+Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Link: https://lkml.kernel.org/r/20190724152323.bojciei3muvfxalm@linux-r8p5
+[bwh: While this was supposed to be just refactoring, it also fixed a
+ security flaw (CVE-2021-20317).  Backported to 4.9:
+ - Deleted code in timerqueue_del() is different before commit d852d39432f5
+   "timerqueue: Use rb_entry_safe() instead of open-coding it"
+ - Adjust context]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/timerqueue.h |   13 ++++++-------
+ lib/timerqueue.c           |   31 ++++++++++++-------------------
+ 2 files changed, 18 insertions(+), 26 deletions(-)
+
+--- a/include/linux/timerqueue.h
++++ b/include/linux/timerqueue.h
+@@ -11,8 +11,7 @@ struct timerqueue_node {
+ };
+ 
+ struct timerqueue_head {
+-	struct rb_root head;
+-	struct timerqueue_node *next;
++	struct rb_root_cached rb_root;
+ };
+ 
+ 
+@@ -28,13 +27,14 @@ extern struct timerqueue_node *timerqueu
+  *
+  * @head: head of timerqueue
+  *
+- * Returns a pointer to the timer node that has the
+- * earliest expiration time.
++ * Returns a pointer to the timer node that has the earliest expiration time.
+  */
+ static inline
+ struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head)
+ {
+-	return head->next;
++	struct rb_node *leftmost = rb_first_cached(&head->rb_root);
++
++	return rb_entry(leftmost, struct timerqueue_node, node);
+ }
+ 
+ static inline void timerqueue_init(struct timerqueue_node *node)
+@@ -44,7 +44,6 @@ static inline void timerqueue_init(struc
+ 
+ static inline void timerqueue_init_head(struct timerqueue_head *head)
+ {
+-	head->head = RB_ROOT;
+-	head->next = NULL;
++	head->rb_root = RB_ROOT_CACHED;
+ }
+ #endif /* _LINUX_TIMERQUEUE_H */
+--- a/lib/timerqueue.c
++++ b/lib/timerqueue.c
+@@ -38,9 +38,10 @@
+  */
+ bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
+ {
+-	struct rb_node **p = &head->head.rb_node;
++	struct rb_node **p = &head->rb_root.rb_root.rb_node;
+ 	struct rb_node *parent = NULL;
+-	struct timerqueue_node  *ptr;
++	struct timerqueue_node *ptr;
++	bool leftmost = true;
+ 
+ 	/* Make sure we don't add nodes that are already added */
+ 	WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node));
+@@ -48,19 +49,17 @@ bool timerqueue_add(struct timerqueue_he
+ 	while (*p) {
+ 		parent = *p;
+ 		ptr = rb_entry(parent, struct timerqueue_node, node);
+-		if (node->expires.tv64 < ptr->expires.tv64)
++		if (node->expires.tv64 < ptr->expires.tv64) {
+ 			p = &(*p)->rb_left;
+-		else
++		} else {
+ 			p = &(*p)->rb_right;
++			leftmost = false;
++		}
+ 	}
+ 	rb_link_node(&node->node, parent, p);
+-	rb_insert_color(&node->node, &head->head);
++	rb_insert_color_cached(&node->node, &head->rb_root, leftmost);
+ 
+-	if (!head->next || node->expires.tv64 < head->next->expires.tv64) {
+-		head->next = node;
+-		return true;
+-	}
+-	return false;
++	return leftmost;
+ }
+ EXPORT_SYMBOL_GPL(timerqueue_add);
+ 
+@@ -76,16 +75,10 @@ bool timerqueue_del(struct timerqueue_he
+ {
+ 	WARN_ON_ONCE(RB_EMPTY_NODE(&node->node));
+ 
+-	/* update next pointer */
+-	if (head->next == node) {
+-		struct rb_node *rbn = rb_next(&node->node);
+-
+-		head->next = rbn ?
+-			rb_entry(rbn, struct timerqueue_node, node) : NULL;
+-	}
+-	rb_erase(&node->node, &head->head);
++	rb_erase_cached(&node->node, &head->rb_root);
+ 	RB_CLEAR_NODE(&node->node);
+-	return head->next != NULL;
++
++	return !RB_EMPTY_ROOT(&head->rb_root.rb_root);
+ }
+ EXPORT_SYMBOL_GPL(timerqueue_del);
+ 
diff --git a/queue-4.9/mm-add-follow_pte_pmd.patch b/queue-4.9/mm-add-follow_pte_pmd.patch
new file mode 100644
index 00000000000..49f1636141f
--- /dev/null
+++ b/queue-4.9/mm-add-follow_pte_pmd.patch
@@ -0,0 +1,123 @@
+From foo@baz Mon Jan 24 07:28:36 PM CET 2022
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Mon, 24 Jan 2022 17:41:12 +0100
+Subject: mm: add follow_pte_pmd()
+To: stable@vger.kernel.org
+Cc: Paolo Bonzini <pbonzini@redhat.com>, Ross Zwisler <ross.zwisler@linux.intel.com>, ". Andrew Morton" <akpm@linux-foundation.org>
+Message-ID: <Ye7WqMoYNyxOqWId@decadent.org.uk>
+Content-Disposition: inline
+
+From: Ross Zwisler <ross.zwisler@linux.intel.com>
+
+commit 097963959594c5eccaba42510f7033f703211bda upstream.
+
+Patch series "Write protect DAX PMDs in *sync path".
+
+Currently dax_mapping_entry_mkclean() fails to clean and write protect
+the pmd_t of a DAX PMD entry during an *sync operation.  This can result
+in data loss, as detailed in patch 2.
+
+This series is based on Dan's "libnvdimm-pending" branch, which is the
+current home for Jan's "dax: Page invalidation fixes" series.  You can
+find a working tree here:
+
+  https://git.kernel.org/cgit/linux/kernel/git/zwisler/linux.git/log/?h=dax_pmd_clean
+
+This patch (of 2):
+
+Similar to follow_pte(), follow_pte_pmd() allows either a PTE leaf or a
+huge page PMD leaf to be found and returned.
+
+Link: http://lkml.kernel.org/r/1482272586-21177-2-git-send-email-ross.zwisler@linux.intel.com
+Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
+Suggested-by: Dave Hansen <dave.hansen@intel.com>
+Cc: Alexander Viro <viro@zeniv.linux.org.uk>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Dave Chinner <david@fromorbit.com>
+Cc: Jan Kara <jack@suse.cz>
+Cc: Matthew Wilcox <mawilcox@microsoft.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[bwh: Backported to 4.9: adjust context]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/mm.h |    2 ++
+ mm/memory.c        |   37 ++++++++++++++++++++++++++++++-------
+ 2 files changed, 32 insertions(+), 7 deletions(-)
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -1269,6 +1269,8 @@ int copy_page_range(struct mm_struct *ds
+ 			struct vm_area_struct *vma);
+ void unmap_mapping_range(struct address_space *mapping,
+ 		loff_t const holebegin, loff_t const holelen, int even_cows);
++int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
++			     pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp);
+ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
+ 	unsigned long *pfn);
+ int follow_phys(struct vm_area_struct *vma, unsigned long address,
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -3780,8 +3780,8 @@ int __pmd_alloc(struct mm_struct *mm, pu
+ }
+ #endif /* __PAGETABLE_PMD_FOLDED */
+ 
+-static int __follow_pte(struct mm_struct *mm, unsigned long address,
+-		pte_t **ptepp, spinlock_t **ptlp)
++static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
++		pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
+ {
+ 	pgd_t *pgd;
+ 	pud_t *pud;
+@@ -3798,11 +3798,20 @@ static int __follow_pte(struct mm_struct
+ 
+ 	pmd = pmd_offset(pud, address);
+ 	VM_BUG_ON(pmd_trans_huge(*pmd));
+-	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+-		goto out;
+ 
+-	/* We cannot handle huge page PFN maps. Luckily they don't exist. */
+-	if (pmd_huge(*pmd))
++	if (pmd_huge(*pmd)) {
++		if (!pmdpp)
++			goto out;
++
++		*ptlp = pmd_lock(mm, pmd);
++		if (pmd_huge(*pmd)) {
++			*pmdpp = pmd;
++			return 0;
++		}
++		spin_unlock(*ptlp);
++	}
++
++	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+ 		goto out;
+ 
+ 	ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
+@@ -3825,9 +3834,23 @@ static inline int follow_pte(struct mm_s
+ 
+ 	/* (void) is needed to make gcc happy */
+ 	(void) __cond_lock(*ptlp,
+-			   !(res = __follow_pte(mm, address, ptepp, ptlp)));
++			   !(res = __follow_pte_pmd(mm, address, ptepp, NULL,
++					   ptlp)));
++	return res;
++}
++
++int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
++			     pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
++{
++	int res;
++
++	/* (void) is needed to make gcc happy */
++	(void) __cond_lock(*ptlp,
++			   !(res = __follow_pte_pmd(mm, address, ptepp, pmdpp,
++					   ptlp)));
+ 	return res;
+ }
++EXPORT_SYMBOL(follow_pte_pmd);
+ 
+ /**
+  * follow_pfn - look up PFN at a user virtual address
diff --git a/queue-4.9/rbtree-cache-leftmost-node-internally.patch b/queue-4.9/rbtree-cache-leftmost-node-internally.patch
new file mode 100644
index 00000000000..eb6db9687a7
--- /dev/null
+++ b/queue-4.9/rbtree-cache-leftmost-node-internally.patch
@@ -0,0 +1,317 @@
+From foo@baz Mon Jan 24 07:28:36 PM CET 2022
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Mon, 24 Jan 2022 17:33:03 +0100
+Subject: rbtree: cache leftmost node internally
+To: stable@vger.kernel.org
+Cc: Davidlohr Bueso <dbueso@suse.de>
+Message-ID: <Ye7Uv7mOe8NWdbqP@decadent.org.uk>
+Content-Disposition: inline
+
+From: Davidlohr Bueso <dave@stgolabs.net>
+
+commit cd9e61ed1eebbcd5dfad59475d41ec58d9b64b6a upstream.
+
+Patch series "rbtree: Cache leftmost node internally", v4.
+
+A series extending rbtrees to internally cache the leftmost node such
+that we can have fast overlap check optimization for all interval tree
+users[1].  The benefits of this series are that:
+
+(i)   Unify users that do internal leftmost node caching.
+(ii)  Optimize all interval tree users.
+(iii) Convert at least two new users (epoll and procfs) to the new interface.
+
+This patch (of 16):
+
+Red-black tree semantics imply that nodes with smaller or greater (or
+equal for duplicates) keys always be to the left and right,
+respectively.  For the kernel this is extremely evident when considering
+our rb_first() semantics.  Enabling lookups for the smallest node in the
+tree in O(1) can save a good chunk of cycles in not having to walk down
+the tree each time.  To this end there are a few core users that
+explicitly do this, such as the scheduler and rtmutexes.  There is also
+the desire for interval trees to have this optimization allowing faster
+overlap checking.
+
+This patch introduces a new 'struct rb_root_cached' which is just the
+root with a cached pointer to the leftmost node.  The reason why the
+regular rb_root was not extended instead of adding a new structure was
+that this allows the user to have the choice between memory footprint
+and actual tree performance.  The new wrappers on top of the regular
+rb_root calls are:
+
+ - rb_first_cached(cached_root) -- which is a fast replacement
+     for rb_first.
+
+ - rb_insert_color_cached(node, cached_root, new)
+
+ - rb_erase_cached(node, cached_root)
+
+In addition, augmented cached interfaces are also added for basic
+insertion and deletion operations; which becomes important for the
+interval tree changes.
+
+With the exception of the inserts, which adds a bool for updating the
+new leftmost, the interfaces are kept the same.  To this end, porting rb
+users to the cached version becomes really trivial, and keeping current
+rbtree semantics for users that don't care about the optimization
+requires zero overhead.
+
+Link: http://lkml.kernel.org/r/20170719014603.19029-2-dave@stgolabs.net
+Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Documentation/rbtree.txt         |   33 +++++++++++++++++++++++++++++++++
+ include/linux/rbtree.h           |   21 +++++++++++++++++++++
+ include/linux/rbtree_augmented.h |   33 ++++++++++++++++++++++++++++++---
+ lib/rbtree.c                     |   34 +++++++++++++++++++++++++++++-----
+ 4 files changed, 113 insertions(+), 8 deletions(-)
+
+--- a/Documentation/rbtree.txt
++++ b/Documentation/rbtree.txt
+@@ -190,6 +190,39 @@ Example:
+   for (node = rb_first(&mytree); node; node = rb_next(node))
+ 	printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring);
+ 
++Cached rbtrees
++--------------
++
++Computing the leftmost (smallest) node is quite a common task for binary
++search trees, such as for traversals or users relying on a the particular
++order for their own logic. To this end, users can use 'struct rb_root_cached'
++to optimize O(logN) rb_first() calls to a simple pointer fetch avoiding
++potentially expensive tree iterations. This is done at negligible runtime
++overhead for maintanence; albeit larger memory footprint.
++
++Similar to the rb_root structure, cached rbtrees are initialized to be
++empty via:
++
++  struct rb_root_cached mytree = RB_ROOT_CACHED;
++
++Cached rbtree is simply a regular rb_root with an extra pointer to cache the
++leftmost node. This allows rb_root_cached to exist wherever rb_root does,
++which permits augmented trees to be supported as well as only a few extra
++interfaces:
++
++  struct rb_node *rb_first_cached(struct rb_root_cached *tree);
++  void rb_insert_color_cached(struct rb_node *, struct rb_root_cached *, bool);
++  void rb_erase_cached(struct rb_node *node, struct rb_root_cached *);
++
++Both insert and erase calls have their respective counterpart of augmented
++trees:
++
++  void rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *,
++				  bool, struct rb_augment_callbacks *);
++  void rb_erase_augmented_cached(struct rb_node *, struct rb_root_cached *,
++				 struct rb_augment_callbacks *);
++
++
+ Support for Augmented rbtrees
+ -----------------------------
+ 
+--- a/include/linux/rbtree.h
++++ b/include/linux/rbtree.h
+@@ -44,10 +44,25 @@ struct rb_root {
+ 	struct rb_node *rb_node;
+ };
+ 
++/*
++ * Leftmost-cached rbtrees.
++ *
++ * We do not cache the rightmost node based on footprint
++ * size vs number of potential users that could benefit
++ * from O(1) rb_last(). Just not worth it, users that want
++ * this feature can always implement the logic explicitly.
++ * Furthermore, users that want to cache both pointers may
++ * find it a bit asymmetric, but that's ok.
++ */
++struct rb_root_cached {
++	struct rb_root rb_root;
++	struct rb_node *rb_leftmost;
++};
+ 
+ #define rb_parent(r)   ((struct rb_node *)((r)->__rb_parent_color & ~3))
+ 
+ #define RB_ROOT	(struct rb_root) { NULL, }
++#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }
+ #define	rb_entry(ptr, type, member) container_of(ptr, type, member)
+ 
+ #define RB_EMPTY_ROOT(root)  (READ_ONCE((root)->rb_node) == NULL)
+@@ -69,6 +84,12 @@ extern struct rb_node *rb_prev(const str
+ extern struct rb_node *rb_first(const struct rb_root *);
+ extern struct rb_node *rb_last(const struct rb_root *);
+ 
++extern void rb_insert_color_cached(struct rb_node *,
++				   struct rb_root_cached *, bool);
++extern void rb_erase_cached(struct rb_node *node, struct rb_root_cached *);
++/* Same as rb_first(), but O(1) */
++#define rb_first_cached(root) (root)->rb_leftmost
++
+ /* Postorder iteration - always visit the parent after its children */
+ extern struct rb_node *rb_first_postorder(const struct rb_root *);
+ extern struct rb_node *rb_next_postorder(const struct rb_node *);
+--- a/include/linux/rbtree_augmented.h
++++ b/include/linux/rbtree_augmented.h
+@@ -41,7 +41,9 @@ struct rb_augment_callbacks {
+ 	void (*rotate)(struct rb_node *old, struct rb_node *new);
+ };
+ 
+-extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
++extern void __rb_insert_augmented(struct rb_node *node,
++				  struct rb_root *root,
++				  bool newleft, struct rb_node **leftmost,
+ 	void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
+ /*
+  * Fixup the rbtree and update the augmented information when rebalancing.
+@@ -57,7 +59,16 @@ static inline void
+ rb_insert_augmented(struct rb_node *node, struct rb_root *root,
+ 		    const struct rb_augment_callbacks *augment)
+ {
+-	__rb_insert_augmented(node, root, augment->rotate);
++	__rb_insert_augmented(node, root, false, NULL, augment->rotate);
++}
++
++static inline void
++rb_insert_augmented_cached(struct rb_node *node,
++			   struct rb_root_cached *root, bool newleft,
++			   const struct rb_augment_callbacks *augment)
++{
++	__rb_insert_augmented(node, &root->rb_root,
++			      newleft, &root->rb_leftmost, augment->rotate);
+ }
+ 
+ #define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield,	\
+@@ -148,6 +159,7 @@ extern void __rb_erase_color(struct rb_n
+ 
+ static __always_inline struct rb_node *
+ __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
++		     struct rb_node **leftmost,
+ 		     const struct rb_augment_callbacks *augment)
+ {
+ 	struct rb_node *child = node->rb_right;
+@@ -155,6 +167,9 @@ __rb_erase_augmented(struct rb_node *nod
+ 	struct rb_node *parent, *rebalance;
+ 	unsigned long pc;
+ 
++	if (leftmost && node == *leftmost)
++		*leftmost = rb_next(node);
++
+ 	if (!tmp) {
+ 		/*
+ 		 * Case 1: node to erase has no more than 1 child (easy!)
+@@ -254,9 +269,21 @@ static __always_inline void
+ rb_erase_augmented(struct rb_node *node, struct rb_root *root,
+ 		   const struct rb_augment_callbacks *augment)
+ {
+-	struct rb_node *rebalance = __rb_erase_augmented(node, root, augment);
++	struct rb_node *rebalance = __rb_erase_augmented(node, root,
++							 NULL, augment);
+ 	if (rebalance)
+ 		__rb_erase_color(rebalance, root, augment->rotate);
+ }
+ 
++static __always_inline void
++rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root,
++			  const struct rb_augment_callbacks *augment)
++{
++	struct rb_node *rebalance = __rb_erase_augmented(node, &root->rb_root,
++							 &root->rb_leftmost,
++							 augment);
++	if (rebalance)
++		__rb_erase_color(rebalance, &root->rb_root, augment->rotate);
++}
++
+ #endif	/* _LINUX_RBTREE_AUGMENTED_H */
+--- a/lib/rbtree.c
++++ b/lib/rbtree.c
+@@ -95,10 +95,14 @@ __rb_rotate_set_parents(struct rb_node *
+ 
+ static __always_inline void
+ __rb_insert(struct rb_node *node, struct rb_root *root,
++	    bool newleft, struct rb_node **leftmost,
+ 	    void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
+ {
+ 	struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;
+ 
++	if (newleft)
++		*leftmost = node;
++
+ 	while (true) {
+ 		/*
+ 		 * Loop invariant: node is red
+@@ -417,19 +421,38 @@ static const struct rb_augment_callbacks
+ 
+ void rb_insert_color(struct rb_node *node, struct rb_root *root)
+ {
+-	__rb_insert(node, root, dummy_rotate);
++	__rb_insert(node, root, false, NULL, dummy_rotate);
+ }
+ EXPORT_SYMBOL(rb_insert_color);
+ 
+ void rb_erase(struct rb_node *node, struct rb_root *root)
+ {
+ 	struct rb_node *rebalance;
+-	rebalance = __rb_erase_augmented(node, root, &dummy_callbacks);
++	rebalance = __rb_erase_augmented(node, root,
++					 NULL, &dummy_callbacks);
+ 	if (rebalance)
+ 		____rb_erase_color(rebalance, root, dummy_rotate);
+ }
+ EXPORT_SYMBOL(rb_erase);
+ 
++void rb_insert_color_cached(struct rb_node *node,
++			    struct rb_root_cached *root, bool leftmost)
++{
++	__rb_insert(node, &root->rb_root, leftmost,
++		    &root->rb_leftmost, dummy_rotate);
++}
++EXPORT_SYMBOL(rb_insert_color_cached);
++
++void rb_erase_cached(struct rb_node *node, struct rb_root_cached *root)
++{
++	struct rb_node *rebalance;
++	rebalance = __rb_erase_augmented(node, &root->rb_root,
++					 &root->rb_leftmost, &dummy_callbacks);
++	if (rebalance)
++		____rb_erase_color(rebalance, &root->rb_root, dummy_rotate);
++}
++EXPORT_SYMBOL(rb_erase_cached);
++
+ /*
+  * Augmented rbtree manipulation functions.
+  *
+@@ -438,9 +461,10 @@ EXPORT_SYMBOL(rb_erase);
+  */
+ 
+ void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
++			   bool newleft, struct rb_node **leftmost,
+ 	void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
+ {
+-	__rb_insert(node, root, augment_rotate);
++	__rb_insert(node, root, newleft, leftmost, augment_rotate);
+ }
+ EXPORT_SYMBOL(__rb_insert_augmented);
+ 
+@@ -485,7 +509,7 @@ struct rb_node *rb_next(const struct rb_
+ 	 * as we can.
+ 	 */
+ 	if (node->rb_right) {
+-		node = node->rb_right; 
++		node = node->rb_right;
+ 		while (node->rb_left)
+ 			node=node->rb_left;
+ 		return (struct rb_node *)node;
+@@ -517,7 +541,7 @@ struct rb_node *rb_prev(const struct rb_
+ 	 * as we can.
+ 	 */
+ 	if (node->rb_left) {
+-		node = node->rb_left; 
++		node = node->rb_left;
+ 		while (node->rb_right)
+ 			node=node->rb_right;
+ 		return (struct rb_node *)node;
diff --git a/queue-4.9/series b/queue-4.9/series
index a845b740e4f..cd4fd938cc0 100644
--- a/queue-4.9/series
+++ b/queue-4.9/series
@@ -148,3 +148,10 @@ gup-document-and-work-around-cow-can-break-either-way-issue.patch
 drm-ttm-nouveau-don-t-call-tt-destroy-callback-on-alloc-failure.patch
 gianfar-simplify-fcs-handling-and-fix-memory-leak.patch
 gianfar-fix-jumbo-packets-napi-rx-overrun-crash.patch
+cipso-calipso-resolve-a-number-of-problems-with-the-doi-refcounts.patch
+rbtree-cache-leftmost-node-internally.patch
+lib-timerqueue-rely-on-rbtree-semantics-for-next-timer.patch
+mm-add-follow_pte_pmd.patch
+kvm-do-not-assume-pte-is-writable-after-follow_pfn.patch
+kvm-use-kvm_pfn_t-for-local-pfn-variable-in-hva_to_pfn_remapped.patch
+kvm-do-not-allow-mapping-valid-but-non-reference-counted-pages.patch