From: Greg Kroah-Hartman Date: Mon, 24 Jan 2022 18:32:58 +0000 (+0100) Subject: 4.9-stable patches X-Git-Tag: v4.4.300~16 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=c769fb4d1741dd67694459da21e1c10bfe0b10f6;p=thirdparty%2Fkernel%2Fstable-queue.git 4.9-stable patches added patches: cipso-calipso-resolve-a-number-of-problems-with-the-doi-refcounts.patch kvm-do-not-allow-mapping-valid-but-non-reference-counted-pages.patch kvm-do-not-assume-pte-is-writable-after-follow_pfn.patch kvm-use-kvm_pfn_t-for-local-pfn-variable-in-hva_to_pfn_remapped.patch lib-timerqueue-rely-on-rbtree-semantics-for-next-timer.patch mm-add-follow_pte_pmd.patch rbtree-cache-leftmost-node-internally.patch --- diff --git a/queue-4.9/cipso-calipso-resolve-a-number-of-problems-with-the-doi-refcounts.patch b/queue-4.9/cipso-calipso-resolve-a-number-of-problems-with-the-doi-refcounts.patch new file mode 100644 index 00000000000..8c3006b93c7 --- /dev/null +++ b/queue-4.9/cipso-calipso-resolve-a-number-of-problems-with-the-doi-refcounts.patch @@ -0,0 +1,140 @@ +From foo@baz Mon Jan 24 07:28:36 PM CET 2022 +From: Ben Hutchings +Date: Mon, 24 Jan 2022 17:32:21 +0100 +Subject: cipso,calipso: resolve a number of problems with the DOI refcounts +To: stable@vger.kernel.org +Cc: Paul Moore +Message-ID: +Content-Disposition: inline + +From: Paul Moore + +commit ad5d07f4a9cd671233ae20983848874731102c08 upstream. + +The current CIPSO and CALIPSO refcounting scheme for the DOI +definitions is a bit flawed in that we: + +1. Don't correctly match gets/puts in netlbl_cipsov4_list(). +2. Decrement the refcount on each attempt to remove the DOI from the + DOI list, only removing it from the list once the refcount drops + to zero. + +This patch fixes these problems by adding the missing "puts" to +netlbl_cipsov4_list() and introduces a more conventional, i.e. +not-buggy, refcounting mechanism to the DOI definitions. 
Upon the +addition of a DOI to the DOI list, it is initialized with a refcount +of one, removing a DOI from the list removes it from the list and +drops the refcount by one; "gets" and "puts" behave as expected with +respect to refcounts, increasing and decreasing the DOI's refcount by +one. + +Fixes: b1edeb102397 ("netlabel: Replace protocol/NetLabel linking with refrerence counts") +Fixes: d7cce01504a0 ("netlabel: Add support for removing a CALIPSO DOI.") +Reported-by: syzbot+9ec037722d2603a9f52e@syzkaller.appspotmail.com +Signed-off-by: Paul Moore +Signed-off-by: David S. Miller +[bwh: Backported to 4.9: adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/cipso_ipv4.c | 11 +---------- + net/ipv6/calipso.c | 14 +++++--------- + net/netlabel/netlabel_cipso_v4.c | 3 +++ + 3 files changed, 9 insertions(+), 19 deletions(-) + +--- a/net/ipv4/cipso_ipv4.c ++++ b/net/ipv4/cipso_ipv4.c +@@ -534,16 +534,10 @@ int cipso_v4_doi_remove(u32 doi, struct + ret_val = -ENOENT; + goto doi_remove_return; + } +- if (!atomic_dec_and_test(&doi_def->refcount)) { +- spin_unlock(&cipso_v4_doi_list_lock); +- ret_val = -EBUSY; +- goto doi_remove_return; +- } + list_del_rcu(&doi_def->list); + spin_unlock(&cipso_v4_doi_list_lock); + +- cipso_v4_cache_invalidate(); +- call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu); ++ cipso_v4_doi_putdef(doi_def); + ret_val = 0; + + doi_remove_return: +@@ -600,9 +594,6 @@ void cipso_v4_doi_putdef(struct cipso_v4 + + if (!atomic_dec_and_test(&doi_def->refcount)) + return; +- spin_lock(&cipso_v4_doi_list_lock); +- list_del_rcu(&doi_def->list); +- spin_unlock(&cipso_v4_doi_list_lock); + + cipso_v4_cache_invalidate(); + call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu); +--- a/net/ipv6/calipso.c ++++ b/net/ipv6/calipso.c +@@ -97,6 +97,9 @@ struct calipso_map_cache_entry { + + static struct calipso_map_cache_bkt *calipso_cache; + ++static void calipso_cache_invalidate(void); ++static void calipso_doi_putdef(struct 
calipso_doi *doi_def); ++ + /* Label Mapping Cache Functions + */ + +@@ -458,15 +461,10 @@ static int calipso_doi_remove(u32 doi, s + ret_val = -ENOENT; + goto doi_remove_return; + } +- if (!atomic_dec_and_test(&doi_def->refcount)) { +- spin_unlock(&calipso_doi_list_lock); +- ret_val = -EBUSY; +- goto doi_remove_return; +- } + list_del_rcu(&doi_def->list); + spin_unlock(&calipso_doi_list_lock); + +- call_rcu(&doi_def->rcu, calipso_doi_free_rcu); ++ calipso_doi_putdef(doi_def); + ret_val = 0; + + doi_remove_return: +@@ -522,10 +520,8 @@ static void calipso_doi_putdef(struct ca + + if (!atomic_dec_and_test(&doi_def->refcount)) + return; +- spin_lock(&calipso_doi_list_lock); +- list_del_rcu(&doi_def->list); +- spin_unlock(&calipso_doi_list_lock); + ++ calipso_cache_invalidate(); + call_rcu(&doi_def->rcu, calipso_doi_free_rcu); + } + +--- a/net/netlabel/netlabel_cipso_v4.c ++++ b/net/netlabel/netlabel_cipso_v4.c +@@ -587,6 +587,7 @@ list_start: + + break; + } ++ cipso_v4_doi_putdef(doi_def); + rcu_read_unlock(); + + genlmsg_end(ans_skb, data); +@@ -595,12 +596,14 @@ list_start: + list_retry: + /* XXX - this limit is a guesstimate */ + if (nlsze_mult < 4) { ++ cipso_v4_doi_putdef(doi_def); + rcu_read_unlock(); + kfree_skb(ans_skb); + nlsze_mult *= 2; + goto list_start; + } + list_failure_lock: ++ cipso_v4_doi_putdef(doi_def); + rcu_read_unlock(); + list_failure: + kfree_skb(ans_skb); diff --git a/queue-4.9/kvm-do-not-allow-mapping-valid-but-non-reference-counted-pages.patch b/queue-4.9/kvm-do-not-allow-mapping-valid-but-non-reference-counted-pages.patch new file mode 100644 index 00000000000..6bb114d59cd --- /dev/null +++ b/queue-4.9/kvm-do-not-allow-mapping-valid-but-non-reference-counted-pages.patch @@ -0,0 +1,75 @@ +From foo@baz Mon Jan 24 07:28:36 PM CET 2022 +From: Ben Hutchings +Date: Mon, 24 Jan 2022 17:45:00 +0100 +Subject: KVM: do not allow mapping valid but non-reference-counted pages +To: stable@vger.kernel.org +Cc: Paolo Bonzini , Nicholas Piggin 
+Message-ID: +Content-Disposition: inline + +From: Nicholas Piggin + +commit f8be156be163a052a067306417cd0ff679068c97 upstream. + +It's possible to create a region which maps valid but non-refcounted +pages (e.g., tail pages of non-compound higher order allocations). These +host pages can then be returned by gfn_to_page, gfn_to_pfn, etc., family +of APIs, which take a reference to the page, which takes it from 0 to 1. +When the reference is dropped, this will free the page incorrectly. + +Fix this by only taking a reference on valid pages if it was non-zero, +which indicates it is participating in normal refcounting (and can be +released with put_page). + +This addresses CVE-2021-22543. + +Signed-off-by: Nicholas Piggin +Tested-by: Paolo Bonzini +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + virt/kvm/kvm_main.c | 19 +++++++++++++++++-- + 1 file changed, 17 insertions(+), 2 deletions(-) + +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -1513,6 +1513,13 @@ static bool vma_is_valid(struct vm_area_ + return true; + } + ++static int kvm_try_get_pfn(kvm_pfn_t pfn) ++{ ++ if (kvm_is_reserved_pfn(pfn)) ++ return 1; ++ return get_page_unless_zero(pfn_to_page(pfn)); ++} ++ + static int hva_to_pfn_remapped(struct vm_area_struct *vma, + unsigned long addr, bool *async, + bool write_fault, bool *writable, +@@ -1562,13 +1569,21 @@ static int hva_to_pfn_remapped(struct vm + * Whoever called remap_pfn_range is also going to call e.g. + * unmap_mapping_range before the underlying pages are freed, + * causing a call to our MMU notifier. ++ * ++ * Certain IO or PFNMAP mappings can be backed with valid ++ * struct pages, but be allocated without refcounting e.g., ++ * tail pages of non-compound higher order allocations, which ++ * would then underflow the refcount when the caller does the ++ * required put_page. Don't allow those pages here. 
+ */ +- kvm_get_pfn(pfn); + if (!kvm_try_get_pfn(pfn)) + r = -EFAULT; + + out: + pte_unmap_unlock(ptep, ptl); + *p_pfn = pfn; +- return 0; ++ ++ return r; + } + + /* diff --git a/queue-4.9/kvm-do-not-assume-pte-is-writable-after-follow_pfn.patch b/queue-4.9/kvm-do-not-assume-pte-is-writable-after-follow_pfn.patch new file mode 100644 index 00000000000..66900a67c25 --- /dev/null +++ b/queue-4.9/kvm-do-not-assume-pte-is-writable-after-follow_pfn.patch @@ -0,0 +1,95 @@ +From foo@baz Mon Jan 24 07:28:36 PM CET 2022 +From: Ben Hutchings +Date: Mon, 24 Jan 2022 17:43:47 +0100 +Subject: KVM: do not assume PTE is writable after follow_pfn +To: stable@vger.kernel.org +Cc: Paolo Bonzini , David Stevens , Ovidiu Panait , Ross Zwisler , Andrew Morton +Message-ID: +Content-Disposition: inline + +From: Paolo Bonzini + +commit bd2fae8da794b55bf2ac02632da3a151b10e664c upstream. + +In order to convert an HVA to a PFN, KVM usually tries to use +the get_user_pages family of functions. This however is not +possible for VM_IO vmas; in that case, KVM instead uses follow_pfn. + +In doing this however KVM loses the information on whether the +PFN is writable. That is usually not a problem because the main +use of VM_IO vmas with KVM is for BARs in PCI device assignment, +however it is a bug. To fix it, use follow_pte and check pte_write +while under the protection of the PTE lock. The information can +be used to fail hva_to_pfn_remapped or passed back to the +caller via *writable. + +Usage of follow_pfn was introduced in commit add6a0cd1c5b ("KVM: MMU: try to fix +up page faults before giving up", 2016-07-05); however, even older versions +have the same issue, all the way back to commit 2e2e3738af33 ("KVM: +Handle vma regions with no backing page", 2008-07-20), as they also did +not check whether the PFN was writable.
+ +Fixes: 2e2e3738af33 ("KVM: Handle vma regions with no backing page") +Reported-by: David Stevens +Cc: 3pvd@google.com +Cc: Jann Horn +Cc: Jason Gunthorpe +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Bonzini +[OP: backport to 4.19, adjust follow_pte() -> follow_pte_pmd()] +Signed-off-by: Ovidiu Panait +Signed-off-by: Greg Kroah-Hartman +[bwh: Backport to 4.9: follow_pte_pmd() does not take start or end + parameters] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + virt/kvm/kvm_main.c | 15 ++++++++++++--- + 1 file changed, 12 insertions(+), 3 deletions(-) + +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -1519,9 +1519,11 @@ static int hva_to_pfn_remapped(struct vm + kvm_pfn_t *p_pfn) + { + unsigned long pfn; ++ pte_t *ptep; ++ spinlock_t *ptl; + int r; + +- r = follow_pfn(vma, addr, &pfn); ++ r = follow_pte_pmd(vma->vm_mm, addr, &ptep, NULL, &ptl); + if (r) { + /* + * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does +@@ -1536,14 +1538,19 @@ static int hva_to_pfn_remapped(struct vm + if (r) + return r; + +- r = follow_pfn(vma, addr, &pfn); ++ r = follow_pte_pmd(vma->vm_mm, addr, &ptep, NULL, &ptl); + if (r) + return r; ++ } + ++ if (write_fault && !pte_write(*ptep)) { ++ pfn = KVM_PFN_ERR_RO_FAULT; ++ goto out; + } + + if (writable) +- *writable = true; ++ *writable = pte_write(*ptep); ++ pfn = pte_pfn(*ptep); + + /* + * Get a reference here because callers of *hva_to_pfn* and +@@ -1558,6 +1565,8 @@ static int hva_to_pfn_remapped(struct vm + */ + kvm_get_pfn(pfn); + ++out: ++ pte_unmap_unlock(ptep, ptl); + *p_pfn = pfn; + return 0; + } diff --git a/queue-4.9/kvm-use-kvm_pfn_t-for-local-pfn-variable-in-hva_to_pfn_remapped.patch b/queue-4.9/kvm-use-kvm_pfn_t-for-local-pfn-variable-in-hva_to_pfn_remapped.patch new file mode 100644 index 00000000000..d8c21beef7c --- /dev/null +++ b/queue-4.9/kvm-use-kvm_pfn_t-for-local-pfn-variable-in-hva_to_pfn_remapped.patch @@ -0,0 +1,51 @@ +From foo@baz Mon Jan 24 07:28:36 PM CET 2022 
+From: Ben Hutchings +Date: Mon, 24 Jan 2022 17:44:27 +0100 +Subject: KVM: Use kvm_pfn_t for local PFN variable in hva_to_pfn_remapped() +To: stable@vger.kernel.org +Cc: Paolo Bonzini , Sean Christopherson , David Stevens +Message-ID: +Content-Disposition: inline + +From: Sean Christopherson + +commit a9545779ee9e9e103648f6f2552e73cfe808d0f4 upstream. + +Use kvm_pfn_t, a.k.a. u64, for the local 'pfn' variable when retrieving +a so called "remapped" hva/pfn pair. In theory, the hva could resolve to +a pfn in high memory on a 32-bit kernel. + +This bug was inadvertently exposed by commit bd2fae8da794 ("KVM: do not +assume PTE is writable after follow_pfn"), which added an error PFN value +to the mix, causing gcc to complain about overflowing the unsigned long. + + arch/x86/kvm/../../../virt/kvm/kvm_main.c: In function ‘hva_to_pfn_remapped’: + include/linux/kvm_host.h:89:30: error: conversion from ‘long long unsigned int’ + to ‘long unsigned int’ changes value from + ‘9218868437227405314’ to ‘2’ [-Werror=overflow] + 89 | #define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 2) + | ^ +virt/kvm/kvm_main.c:1935:9: note: in expansion of macro ‘KVM_PFN_ERR_RO_FAULT’ + +Cc: stable@vger.kernel.org +Fixes: add6a0cd1c5b ("KVM: MMU: try to fix up page faults before giving up") +Signed-off-by: Sean Christopherson +Message-Id: <20210208201940.1258328-1-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + virt/kvm/kvm_main.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -1518,7 +1518,7 @@ static int hva_to_pfn_remapped(struct vm + bool write_fault, bool *writable, + kvm_pfn_t *p_pfn) + { +- unsigned long pfn; ++ kvm_pfn_t pfn; + pte_t *ptep; + spinlock_t *ptl; + int r; diff --git a/queue-4.9/lib-timerqueue-rely-on-rbtree-semantics-for-next-timer.patch b/queue-4.9/lib-timerqueue-rely-on-rbtree-semantics-for-next-timer.patch new file mode 100644 index
00000000000..a33088f9232 --- /dev/null +++ b/queue-4.9/lib-timerqueue-rely-on-rbtree-semantics-for-next-timer.patch @@ -0,0 +1,141 @@ +From foo@baz Mon Jan 24 07:28:36 PM CET 2022 +From: Ben Hutchings +Date: Mon, 24 Jan 2022 17:33:46 +0100 +Subject: lib/timerqueue: Rely on rbtree semantics for next timer +To: stable@vger.kernel.org +Cc: Davidlohr Bueso , Thomas Gleixner +Message-ID: +Content-Disposition: inline + +From: Davidlohr Bueso + +commit 511885d7061eda3eb1faf3f57dcc936ff75863f1 upstream. + +Simplify the timerqueue code by using cached rbtrees and rely on the tree +leftmost node semantics to get the timer with earliest expiration time. +This is a drop in conversion, and therefore semantics remain untouched. + +The runtime overhead of cached rbtrees is pretty much the same as the +current head->next method, noting that when removing the leftmost node, +a common operation for the timerqueue, the rb_next(leftmost) is O(1) as +well, so the next timer will either be the right node or its parent. +Therefore no extra pointer chasing. Finally, the size of the struct +timerqueue_head remains the same. + +Passes several hours of rcutorture. + +Signed-off-by: Davidlohr Bueso +Signed-off-by: Thomas Gleixner +Link: https://lkml.kernel.org/r/20190724152323.bojciei3muvfxalm@linux-r8p5 +[bwh: While this was supposed to be just refactoring, it also fixed a + security flaw (CVE-2021-20317).
Backported to 4.9: + - Deleted code in timerqueue_del() is different before commit d852d39432f5 + "timerqueue: Use rb_entry_safe() instead of open-coding it" + - Adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/timerqueue.h | 13 ++++++------- + lib/timerqueue.c | 31 ++++++++++++------------------- + 2 files changed, 18 insertions(+), 26 deletions(-) + +--- a/include/linux/timerqueue.h ++++ b/include/linux/timerqueue.h +@@ -11,8 +11,7 @@ struct timerqueue_node { + }; + + struct timerqueue_head { +- struct rb_root head; +- struct timerqueue_node *next; ++ struct rb_root_cached rb_root; + }; + + +@@ -28,13 +27,14 @@ extern struct timerqueue_node *timerqueu + * + * @head: head of timerqueue + * +- * Returns a pointer to the timer node that has the +- * earliest expiration time. ++ * Returns a pointer to the timer node that has the earliest expiration time. + */ + static inline + struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) + { +- return head->next; ++ struct rb_node *leftmost = rb_first_cached(&head->rb_root); ++ ++ return rb_entry(leftmost, struct timerqueue_node, node); + } + + static inline void timerqueue_init(struct timerqueue_node *node) +@@ -44,7 +44,6 @@ static inline void timerqueue_init(struc + + static inline void timerqueue_init_head(struct timerqueue_head *head) + { +- head->head = RB_ROOT; +- head->next = NULL; ++ head->rb_root = RB_ROOT_CACHED; + } + #endif /* _LINUX_TIMERQUEUE_H */ +--- a/lib/timerqueue.c ++++ b/lib/timerqueue.c +@@ -38,9 +38,10 @@ + */ + bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) + { +- struct rb_node **p = &head->head.rb_node; ++ struct rb_node **p = &head->rb_root.rb_root.rb_node; + struct rb_node *parent = NULL; +- struct timerqueue_node *ptr; ++ struct timerqueue_node *ptr; ++ bool leftmost = true; + + /* Make sure we don't add nodes that are already added */ + WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node)); +@@ -48,19 
+49,17 @@ bool timerqueue_add(struct timerqueue_he + while (*p) { + parent = *p; + ptr = rb_entry(parent, struct timerqueue_node, node); +- if (node->expires.tv64 < ptr->expires.tv64) ++ if (node->expires.tv64 < ptr->expires.tv64) { + p = &(*p)->rb_left; +- else ++ } else { + p = &(*p)->rb_right; ++ leftmost = false; ++ } + } + rb_link_node(&node->node, parent, p); +- rb_insert_color(&node->node, &head->head); ++ rb_insert_color_cached(&node->node, &head->rb_root, leftmost); + +- if (!head->next || node->expires.tv64 < head->next->expires.tv64) { +- head->next = node; +- return true; +- } +- return false; ++ return leftmost; + } + EXPORT_SYMBOL_GPL(timerqueue_add); + +@@ -76,16 +75,10 @@ bool timerqueue_del(struct timerqueue_he + { + WARN_ON_ONCE(RB_EMPTY_NODE(&node->node)); + +- /* update next pointer */ +- if (head->next == node) { +- struct rb_node *rbn = rb_next(&node->node); +- +- head->next = rbn ? +- rb_entry(rbn, struct timerqueue_node, node) : NULL; +- } +- rb_erase(&node->node, &head->head); ++ rb_erase_cached(&node->node, &head->rb_root); + RB_CLEAR_NODE(&node->node); +- return head->next != NULL; ++ ++ return !RB_EMPTY_ROOT(&head->rb_root.rb_root); + } + EXPORT_SYMBOL_GPL(timerqueue_del); + diff --git a/queue-4.9/mm-add-follow_pte_pmd.patch b/queue-4.9/mm-add-follow_pte_pmd.patch new file mode 100644 index 00000000000..49f1636141f --- /dev/null +++ b/queue-4.9/mm-add-follow_pte_pmd.patch @@ -0,0 +1,123 @@ +From foo@baz Mon Jan 24 07:28:36 PM CET 2022 +From: Ben Hutchings +Date: Mon, 24 Jan 2022 17:41:12 +0100 +Subject: mm: add follow_pte_pmd() +To: stable@vger.kernel.org +Cc: Paolo Bonzini , Ross Zwisler , ". Andrew Morton" +Message-ID: +Content-Disposition: inline + +From: Ross Zwisler + +commit 097963959594c5eccaba42510f7033f703211bda upstream. + +Patch series "Write protect DAX PMDs in *sync path". + +Currently dax_mapping_entry_mkclean() fails to clean and write protect +the pmd_t of a DAX PMD entry during an *sync operation. 
This can result +in data loss, as detailed in patch 2. + +This series is based on Dan's "libnvdimm-pending" branch, which is the +current home for Jan's "dax: Page invalidation fixes" series. You can +find a working tree here: + + https://git.kernel.org/cgit/linux/kernel/git/zwisler/linux.git/log/?h=dax_pmd_clean + +This patch (of 2): + +Similar to follow_pte(), follow_pte_pmd() allows either a PTE leaf or a +huge page PMD leaf to be found and returned. + +Link: http://lkml.kernel.org/r/1482272586-21177-2-git-send-email-ross.zwisler@linux.intel.com +Signed-off-by: Ross Zwisler +Suggested-by: Dave Hansen +Cc: Alexander Viro +Cc: Christoph Hellwig +Cc: Dan Williams +Cc: Dave Chinner +Cc: Jan Kara +Cc: Matthew Wilcox +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +[bwh: Backported to 4.9: adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/mm.h | 2 ++ + mm/memory.c | 37 ++++++++++++++++++++++++++++++------- + 2 files changed, 32 insertions(+), 7 deletions(-) + +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1269,6 +1269,8 @@ int copy_page_range(struct mm_struct *ds + struct vm_area_struct *vma); + void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, int even_cows); ++int follow_pte_pmd(struct mm_struct *mm, unsigned long address, ++ pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); + int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn); + int follow_phys(struct vm_area_struct *vma, unsigned long address, +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -3780,8 +3780,8 @@ int __pmd_alloc(struct mm_struct *mm, pu + } + #endif /* __PAGETABLE_PMD_FOLDED */ + +-static int __follow_pte(struct mm_struct *mm, unsigned long address, +- pte_t **ptepp, spinlock_t **ptlp) ++static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, ++ pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) + { + pgd_t *pgd; + pud_t *pud; 
+@@ -3798,11 +3798,20 @@ static int __follow_pte(struct mm_struct + + pmd = pmd_offset(pud, address); + VM_BUG_ON(pmd_trans_huge(*pmd)); +- if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) +- goto out; + +- /* We cannot handle huge page PFN maps. Luckily they don't exist. */ +- if (pmd_huge(*pmd)) ++ if (pmd_huge(*pmd)) { ++ if (!pmdpp) ++ goto out; ++ ++ *ptlp = pmd_lock(mm, pmd); ++ if (pmd_huge(*pmd)) { ++ *pmdpp = pmd; ++ return 0; ++ } ++ spin_unlock(*ptlp); ++ } ++ ++ if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + goto out; + + ptep = pte_offset_map_lock(mm, pmd, address, ptlp); +@@ -3825,9 +3834,23 @@ static inline int follow_pte(struct mm_s + + /* (void) is needed to make gcc happy */ + (void) __cond_lock(*ptlp, +- !(res = __follow_pte(mm, address, ptepp, ptlp))); ++ !(res = __follow_pte_pmd(mm, address, ptepp, NULL, ++ ptlp))); ++ return res; ++} ++ ++int follow_pte_pmd(struct mm_struct *mm, unsigned long address, ++ pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) ++{ ++ int res; ++ ++ /* (void) is needed to make gcc happy */ ++ (void) __cond_lock(*ptlp, ++ !(res = __follow_pte_pmd(mm, address, ptepp, pmdpp, ++ ptlp))); + return res; + } ++EXPORT_SYMBOL(follow_pte_pmd); + + /** + * follow_pfn - look up PFN at a user virtual address diff --git a/queue-4.9/rbtree-cache-leftmost-node-internally.patch b/queue-4.9/rbtree-cache-leftmost-node-internally.patch new file mode 100644 index 00000000000..eb6db9687a7 --- /dev/null +++ b/queue-4.9/rbtree-cache-leftmost-node-internally.patch @@ -0,0 +1,317 @@ +From foo@baz Mon Jan 24 07:28:36 PM CET 2022 +From: Ben Hutchings +Date: Mon, 24 Jan 2022 17:33:03 +0100 +Subject: rbtree: cache leftmost node internally +To: stable@vger.kernel.org +Cc: Davidlohr Bueso +Message-ID: +Content-Disposition: inline + +From: Davidlohr Bueso + +commit cd9e61ed1eebbcd5dfad59475d41ec58d9b64b6a upstream. + +Patch series "rbtree: Cache leftmost node internally", v4. 
+ +A series extending rbtrees to internally cache the leftmost node such +that we can have fast overlap check optimization for all interval tree +users[1]. The benefits of this series are that: + +(i) Unify users that do internal leftmost node caching. +(ii) Optimize all interval tree users. +(iii) Convert at least two new users (epoll and procfs) to the new interface. + +This patch (of 16): + +Red-black tree semantics imply that nodes with smaller or greater (or +equal for duplicates) keys will always be to the left and right, +respectively. For the kernel this is extremely evident when considering +our rb_first() semantics. Enabling lookups for the smallest node in the +tree in O(1) can save a good chunk of cycles in not having to walk down +the tree each time. To this end there are a few core users that +explicitly do this, such as the scheduler and rtmutexes. There is also +the desire for interval trees to have this optimization allowing faster +overlap checking. + +This patch introduces a new 'struct rb_root_cached' which is just the +root with a cached pointer to the leftmost node. The reason why the +regular rb_root was not extended instead of adding a new structure was +that this allows the user to have the choice between memory footprint +and actual tree performance. The new wrappers on top of the regular +rb_root calls are: + + - rb_first_cached(cached_root) -- which is a fast replacement + for rb_first. + + - rb_insert_color_cached(node, cached_root, new) + + - rb_erase_cached(node, cached_root) + +In addition, augmented cached interfaces are also added for basic +insertion and deletion operations; which becomes important for the +interval tree changes. + +With the exception of the inserts, which adds a bool for updating the +new leftmost, the interfaces are kept the same.
To this end, porting rb +users to the cached version becomes really trivial, and keeping current +rbtree semantics for users that don't care about the optimization +requires zero overhead. + +Link: http://lkml.kernel.org/r/20170719014603.19029-2-dave@stgolabs.net +Signed-off-by: Davidlohr Bueso +Reviewed-by: Jan Kara +Acked-by: Peter Zijlstra (Intel) +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + Documentation/rbtree.txt | 33 +++++++++++++++++++++++++++++++++ + include/linux/rbtree.h | 21 +++++++++++++++++++++ + include/linux/rbtree_augmented.h | 33 ++++++++++++++++++++++++++++++--- + lib/rbtree.c | 34 +++++++++++++++++++++++++++++----- + 4 files changed, 113 insertions(+), 8 deletions(-) + +--- a/Documentation/rbtree.txt ++++ b/Documentation/rbtree.txt +@@ -190,6 +190,39 @@ Example: + for (node = rb_first(&mytree); node; node = rb_next(node)) + printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring); + ++Cached rbtrees ++-------------- ++ ++Computing the leftmost (smallest) node is quite a common task for binary ++search trees, such as for traversals or users relying on a the particular ++order for their own logic. To this end, users can use 'struct rb_root_cached' ++to optimize O(logN) rb_first() calls to a simple pointer fetch avoiding ++potentially expensive tree iterations. This is done at negligible runtime ++overhead for maintanence; albeit larger memory footprint. ++ ++Similar to the rb_root structure, cached rbtrees are initialized to be ++empty via: ++ ++ struct rb_root_cached mytree = RB_ROOT_CACHED; ++ ++Cached rbtree is simply a regular rb_root with an extra pointer to cache the ++leftmost node. 
This allows rb_root_cached to exist wherever rb_root does, ++which permits augmented trees to be supported as well as only a few extra ++interfaces: ++ ++ struct rb_node *rb_first_cached(struct rb_root_cached *tree); ++ void rb_insert_color_cached(struct rb_node *, struct rb_root_cached *, bool); ++ void rb_erase_cached(struct rb_node *node, struct rb_root_cached *); ++ ++Both insert and erase calls have their respective counterpart of augmented ++trees: ++ ++ void rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *, ++ bool, struct rb_augment_callbacks *); ++ void rb_erase_augmented_cached(struct rb_node *, struct rb_root_cached *, ++ struct rb_augment_callbacks *); ++ ++ + Support for Augmented rbtrees + ----------------------------- + +--- a/include/linux/rbtree.h ++++ b/include/linux/rbtree.h +@@ -44,10 +44,25 @@ struct rb_root { + struct rb_node *rb_node; + }; + ++/* ++ * Leftmost-cached rbtrees. ++ * ++ * We do not cache the rightmost node based on footprint ++ * size vs number of potential users that could benefit ++ * from O(1) rb_last(). Just not worth it, users that want ++ * this feature can always implement the logic explicitly. ++ * Furthermore, users that want to cache both pointers may ++ * find it a bit asymmetric, but that's ok. 
++ */ ++struct rb_root_cached { ++ struct rb_root rb_root; ++ struct rb_node *rb_leftmost; ++}; + + #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) + + #define RB_ROOT (struct rb_root) { NULL, } ++#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL } + #define rb_entry(ptr, type, member) container_of(ptr, type, member) + + #define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL) +@@ -69,6 +84,12 @@ extern struct rb_node *rb_prev(const str + extern struct rb_node *rb_first(const struct rb_root *); + extern struct rb_node *rb_last(const struct rb_root *); + ++extern void rb_insert_color_cached(struct rb_node *, ++ struct rb_root_cached *, bool); ++extern void rb_erase_cached(struct rb_node *node, struct rb_root_cached *); ++/* Same as rb_first(), but O(1) */ ++#define rb_first_cached(root) (root)->rb_leftmost ++ + /* Postorder iteration - always visit the parent after its children */ + extern struct rb_node *rb_first_postorder(const struct rb_root *); + extern struct rb_node *rb_next_postorder(const struct rb_node *); +--- a/include/linux/rbtree_augmented.h ++++ b/include/linux/rbtree_augmented.h +@@ -41,7 +41,9 @@ struct rb_augment_callbacks { + void (*rotate)(struct rb_node *old, struct rb_node *new); + }; + +-extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, ++extern void __rb_insert_augmented(struct rb_node *node, ++ struct rb_root *root, ++ bool newleft, struct rb_node **leftmost, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); + /* + * Fixup the rbtree and update the augmented information when rebalancing. 
+@@ -57,7 +59,16 @@ static inline void + rb_insert_augmented(struct rb_node *node, struct rb_root *root, + const struct rb_augment_callbacks *augment) + { +- __rb_insert_augmented(node, root, augment->rotate); ++ __rb_insert_augmented(node, root, false, NULL, augment->rotate); ++} ++ ++static inline void ++rb_insert_augmented_cached(struct rb_node *node, ++ struct rb_root_cached *root, bool newleft, ++ const struct rb_augment_callbacks *augment) ++{ ++ __rb_insert_augmented(node, &root->rb_root, ++ newleft, &root->rb_leftmost, augment->rotate); + } + + #define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ +@@ -148,6 +159,7 @@ extern void __rb_erase_color(struct rb_n + + static __always_inline struct rb_node * + __rb_erase_augmented(struct rb_node *node, struct rb_root *root, ++ struct rb_node **leftmost, + const struct rb_augment_callbacks *augment) + { + struct rb_node *child = node->rb_right; +@@ -155,6 +167,9 @@ __rb_erase_augmented(struct rb_node *nod + struct rb_node *parent, *rebalance; + unsigned long pc; + ++ if (leftmost && node == *leftmost) ++ *leftmost = rb_next(node); ++ + if (!tmp) { + /* + * Case 1: node to erase has no more than 1 child (easy!) 
+@@ -254,9 +269,21 @@ static __always_inline void + rb_erase_augmented(struct rb_node *node, struct rb_root *root, + const struct rb_augment_callbacks *augment) + { +- struct rb_node *rebalance = __rb_erase_augmented(node, root, augment); ++ struct rb_node *rebalance = __rb_erase_augmented(node, root, ++ NULL, augment); + if (rebalance) + __rb_erase_color(rebalance, root, augment->rotate); + } + ++static __always_inline void ++rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root, ++ const struct rb_augment_callbacks *augment) ++{ ++ struct rb_node *rebalance = __rb_erase_augmented(node, &root->rb_root, ++ &root->rb_leftmost, ++ augment); ++ if (rebalance) ++ __rb_erase_color(rebalance, &root->rb_root, augment->rotate); ++} ++ + #endif /* _LINUX_RBTREE_AUGMENTED_H */ +--- a/lib/rbtree.c ++++ b/lib/rbtree.c +@@ -95,10 +95,14 @@ __rb_rotate_set_parents(struct rb_node * + + static __always_inline void + __rb_insert(struct rb_node *node, struct rb_root *root, ++ bool newleft, struct rb_node **leftmost, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) + { + struct rb_node *parent = rb_red_parent(node), *gparent, *tmp; + ++ if (newleft) ++ *leftmost = node; ++ + while (true) { + /* + * Loop invariant: node is red +@@ -417,19 +421,38 @@ static const struct rb_augment_callbacks + + void rb_insert_color(struct rb_node *node, struct rb_root *root) + { +- __rb_insert(node, root, dummy_rotate); ++ __rb_insert(node, root, false, NULL, dummy_rotate); + } + EXPORT_SYMBOL(rb_insert_color); + + void rb_erase(struct rb_node *node, struct rb_root *root) + { + struct rb_node *rebalance; +- rebalance = __rb_erase_augmented(node, root, &dummy_callbacks); ++ rebalance = __rb_erase_augmented(node, root, ++ NULL, &dummy_callbacks); + if (rebalance) + ____rb_erase_color(rebalance, root, dummy_rotate); + } + EXPORT_SYMBOL(rb_erase); + ++void rb_insert_color_cached(struct rb_node *node, ++ struct rb_root_cached *root, bool leftmost) ++{ ++ 
__rb_insert(node, &root->rb_root, leftmost, ++ &root->rb_leftmost, dummy_rotate); ++} ++EXPORT_SYMBOL(rb_insert_color_cached); ++ ++void rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) ++{ ++ struct rb_node *rebalance; ++ rebalance = __rb_erase_augmented(node, &root->rb_root, ++ &root->rb_leftmost, &dummy_callbacks); ++ if (rebalance) ++ ____rb_erase_color(rebalance, &root->rb_root, dummy_rotate); ++} ++EXPORT_SYMBOL(rb_erase_cached); ++ + /* + * Augmented rbtree manipulation functions. + * +@@ -438,9 +461,10 @@ EXPORT_SYMBOL(rb_erase); + */ + + void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, ++ bool newleft, struct rb_node **leftmost, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) + { +- __rb_insert(node, root, augment_rotate); ++ __rb_insert(node, root, newleft, leftmost, augment_rotate); + } + EXPORT_SYMBOL(__rb_insert_augmented); + +@@ -485,7 +509,7 @@ struct rb_node *rb_next(const struct rb_ + * as we can. + */ + if (node->rb_right) { +- node = node->rb_right; ++ node = node->rb_right; + while (node->rb_left) + node=node->rb_left; + return (struct rb_node *)node; +@@ -517,7 +541,7 @@ struct rb_node *rb_prev(const struct rb_ + * as we can. 
+ */ + if (node->rb_left) { +- node = node->rb_left; ++ node = node->rb_left; + while (node->rb_right) + node=node->rb_right; + return (struct rb_node *)node; diff --git a/queue-4.9/series b/queue-4.9/series index a845b740e4f..cd4fd938cc0 100644 --- a/queue-4.9/series +++ b/queue-4.9/series @@ -148,3 +148,10 @@ gup-document-and-work-around-cow-can-break-either-way-issue.patch drm-ttm-nouveau-don-t-call-tt-destroy-callback-on-alloc-failure.patch gianfar-simplify-fcs-handling-and-fix-memory-leak.patch gianfar-fix-jumbo-packets-napi-rx-overrun-crash.patch +cipso-calipso-resolve-a-number-of-problems-with-the-doi-refcounts.patch +rbtree-cache-leftmost-node-internally.patch +lib-timerqueue-rely-on-rbtree-semantics-for-next-timer.patch +mm-add-follow_pte_pmd.patch +kvm-do-not-assume-pte-is-writable-after-follow_pfn.patch +kvm-use-kvm_pfn_t-for-local-pfn-variable-in-hva_to_pfn_remapped.patch +kvm-do-not-allow-mapping-valid-but-non-reference-counted-pages.patch