--- /dev/null
+From stable-owner@vger.kernel.org Tue Apr 11 17:13:37 2023
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+Date: Tue, 11 Apr 2023 11:10:54 -0400
+Subject: maple_tree: add RCU lock checking to rcu callback functions
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, stable@vger.kernel.org
+Cc: maple-tree@lists.infradead.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Liam R. Howlett" <Liam.Howlett@Oracle.com>, Suren Baghdasaryan <surenb@google.com>, "Liam R . Howlett" <Liam.Howlett@oracle.com>
+Message-ID: <20230411151055.2910579-14-Liam.Howlett@oracle.com>
+
+From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
+
+commit 790e1fa86b340c2bd4a327e01c161f7a1ad885f6 upstream.
+
+Dereferencing RCU objects within the RCU callback without the RCU check
+has caused lockdep to complain. Fix the RCU dereferencing by using the
+RCU callback lock to ensure the operation is safe.
+
+Also stop creating a new lock to use for dereferencing during destruction
+of the tree or subtree. Instead, pass through a pointer to the tree whose
+lock is held so it can be used for the RCU dereference checking. It also does
+not make sense to use the maple state in the freeing scenario as the tree
+walk is a special case where the tree no longer has the normal encodings
+and parent pointers.
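+
+As a minimal illustrative sketch (mirroring the helpers in the diff below,
+not a new API), the two dereference conditions look like:
+
+	/* Writer side: the tree lock itself justifies the dereference. */
+	entry = rcu_dereference_protected(slots[offset], mt_locked(mt));
+
+	/* RCU callback side: no tree lock is taken; running inside a
+	 * call_rcu() callback is the protection, which lockdep models as
+	 * rcu_callback_map being held. */
+	entry = rcu_dereference_protected(slots[offset],
+					  lock_is_held(&rcu_callback_map));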
+
+Link: https://lkml.kernel.org/r/20230227173632.3292573-8-surenb@google.com
+Fixes: 54a611b60590 ("Maple Tree: add new data structure")
+Cc: stable@vger.kernel.org
+Reported-by: Suren Baghdasaryan <surenb@google.com>
+Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ lib/maple_tree.c | 188 ++++++++++++++++++++++++++++---------------------------
+ 1 file changed, 96 insertions(+), 92 deletions(-)
+
+--- a/lib/maple_tree.c
++++ b/lib/maple_tree.c
+@@ -819,6 +819,11 @@ static inline void *mt_slot(const struct
+ return rcu_dereference_check(slots[offset], mt_locked(mt));
+ }
+
++static inline void *mt_slot_locked(struct maple_tree *mt, void __rcu **slots,
++ unsigned char offset)
++{
++ return rcu_dereference_protected(slots[offset], mt_locked(mt));
++}
+ /*
+ * mas_slot_locked() - Get the slot value when holding the maple tree lock.
+ * @mas: The maple state
+@@ -830,7 +835,7 @@ static inline void *mt_slot(const struct
+ static inline void *mas_slot_locked(struct ma_state *mas, void __rcu **slots,
+ unsigned char offset)
+ {
+- return rcu_dereference_protected(slots[offset], mt_locked(mas->tree));
++ return mt_slot_locked(mas->tree, slots, offset);
+ }
+
+ /*
+@@ -902,34 +907,35 @@ static inline void ma_set_meta(struct ma
+ }
+
+ /*
+- * mas_clear_meta() - clear the metadata information of a node, if it exists
+- * @mas: The maple state
++ * mt_clear_meta() - clear the metadata information of a node, if it exists
++ * @mt: The maple tree
+ * @mn: The maple node
+- * @mt: The maple node type
++ * @type: The maple node type
+ * @offset: The offset of the highest sub-gap in this node.
+ * @end: The end of the data in this node.
+ */
+-static inline void mas_clear_meta(struct ma_state *mas, struct maple_node *mn,
+- enum maple_type mt)
++static inline void mt_clear_meta(struct maple_tree *mt, struct maple_node *mn,
++ enum maple_type type)
+ {
+ struct maple_metadata *meta;
+ unsigned long *pivots;
+ void __rcu **slots;
+ void *next;
+
+- switch (mt) {
++ switch (type) {
+ case maple_range_64:
+ pivots = mn->mr64.pivot;
+ if (unlikely(pivots[MAPLE_RANGE64_SLOTS - 2])) {
+ slots = mn->mr64.slot;
+- next = mas_slot_locked(mas, slots,
+- MAPLE_RANGE64_SLOTS - 1);
+- if (unlikely((mte_to_node(next) && mte_node_type(next))))
+- return; /* The last slot is a node, no metadata */
++ next = mt_slot_locked(mt, slots,
++ MAPLE_RANGE64_SLOTS - 1);
++ if (unlikely((mte_to_node(next) &&
++ mte_node_type(next))))
++ return; /* no metadata, could be node */
+ }
+ fallthrough;
+ case maple_arange_64:
+- meta = ma_meta(mn, mt);
++ meta = ma_meta(mn, type);
+ break;
+ default:
+ return;
+@@ -5477,7 +5483,7 @@ no_gap:
+ }
+
+ /*
+- * mas_dead_leaves() - Mark all leaves of a node as dead.
++ * mte_dead_leaves() - Mark all leaves of a node as dead.
+ * @mas: The maple state
+ * @slots: Pointer to the slot array
+ * @type: The maple node type
+@@ -5487,16 +5493,16 @@ no_gap:
+ * Return: The number of leaves marked as dead.
+ */
+ static inline
+-unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots,
+- enum maple_type mt)
++unsigned char mte_dead_leaves(struct maple_enode *enode, struct maple_tree *mt,
++ void __rcu **slots)
+ {
+ struct maple_node *node;
+ enum maple_type type;
+ void *entry;
+ int offset;
+
+- for (offset = 0; offset < mt_slots[mt]; offset++) {
+- entry = mas_slot_locked(mas, slots, offset);
++ for (offset = 0; offset < mt_slot_count(enode); offset++) {
++ entry = mt_slot(mt, slots, offset);
+ type = mte_node_type(entry);
+ node = mte_to_node(entry);
+ /* Use both node and type to catch LE & BE metadata */
+@@ -5511,162 +5517,160 @@ unsigned char mas_dead_leaves(struct ma_
+ return offset;
+ }
+
+-static void __rcu **mas_dead_walk(struct ma_state *mas, unsigned char offset)
++/**
++ * mte_dead_walk() - Walk down a dead tree to just before the leaves
++ * @enode: The maple encoded node
++ * @offset: The starting offset
++ *
++ * Note: This can only be used from the RCU callback context.
++ */
++static void __rcu **mte_dead_walk(struct maple_enode **enode, unsigned char offset)
+ {
+- struct maple_node *next;
++ struct maple_node *node, *next;
+ void __rcu **slots = NULL;
+
+- next = mas_mn(mas);
++ next = mte_to_node(*enode);
+ do {
+- mas->node = mt_mk_node(next, next->type);
+- slots = ma_slots(next, next->type);
+- next = mas_slot_locked(mas, slots, offset);
++ *enode = ma_enode_ptr(next);
++ node = mte_to_node(*enode);
++ slots = ma_slots(node, node->type);
++ next = rcu_dereference_protected(slots[offset],
++ lock_is_held(&rcu_callback_map));
+ offset = 0;
+ } while (!ma_is_leaf(next->type));
+
+ return slots;
+ }
+
++/**
++ * mt_free_walk() - Walk & free a tree in the RCU callback context
++ * @head: The RCU head that's within the node.
++ *
++ * Note: This can only be used from the RCU callback context.
++ */
+ static void mt_free_walk(struct rcu_head *head)
+ {
+ void __rcu **slots;
+ struct maple_node *node, *start;
+- struct maple_tree mt;
++ struct maple_enode *enode;
+ unsigned char offset;
+ enum maple_type type;
+- MA_STATE(mas, &mt, 0, 0);
+
+ node = container_of(head, struct maple_node, rcu);
+
+ if (ma_is_leaf(node->type))
+ goto free_leaf;
+
+- mt_init_flags(&mt, node->ma_flags);
+- mas_lock(&mas);
+ start = node;
+- mas.node = mt_mk_node(node, node->type);
+- slots = mas_dead_walk(&mas, 0);
+- node = mas_mn(&mas);
++ enode = mt_mk_node(node, node->type);
++ slots = mte_dead_walk(&enode, 0);
++ node = mte_to_node(enode);
+ do {
+ mt_free_bulk(node->slot_len, slots);
+ offset = node->parent_slot + 1;
+- mas.node = node->piv_parent;
+- if (mas_mn(&mas) == node)
+- goto start_slots_free;
+-
+- type = mte_node_type(mas.node);
+- slots = ma_slots(mte_to_node(mas.node), type);
+- if ((offset < mt_slots[type]) && (slots[offset]))
+- slots = mas_dead_walk(&mas, offset);
+-
+- node = mas_mn(&mas);
++ enode = node->piv_parent;
++ if (mte_to_node(enode) == node)
++ goto free_leaf;
++
++ type = mte_node_type(enode);
++ slots = ma_slots(mte_to_node(enode), type);
++ if ((offset < mt_slots[type]) &&
++ rcu_dereference_protected(slots[offset],
++ lock_is_held(&rcu_callback_map)))
++ slots = mte_dead_walk(&enode, offset);
++ node = mte_to_node(enode);
+ } while ((node != start) || (node->slot_len < offset));
+
+ slots = ma_slots(node, node->type);
+ mt_free_bulk(node->slot_len, slots);
+
+-start_slots_free:
+- mas_unlock(&mas);
+ free_leaf:
+ mt_free_rcu(&node->rcu);
+ }
+
+-static inline void __rcu **mas_destroy_descend(struct ma_state *mas,
+- struct maple_enode *prev, unsigned char offset)
++static inline void __rcu **mte_destroy_descend(struct maple_enode **enode,
++ struct maple_tree *mt, struct maple_enode *prev, unsigned char offset)
+ {
+ struct maple_node *node;
+- struct maple_enode *next = mas->node;
++ struct maple_enode *next = *enode;
+ void __rcu **slots = NULL;
++ enum maple_type type;
++ unsigned char next_offset = 0;
+
+ do {
+- mas->node = next;
+- node = mas_mn(mas);
+- slots = ma_slots(node, mte_node_type(mas->node));
+- next = mas_slot_locked(mas, slots, 0);
+- if ((mte_dead_node(next))) {
+- mte_to_node(next)->type = mte_node_type(next);
+- next = mas_slot_locked(mas, slots, 1);
+- }
++ *enode = next;
++ node = mte_to_node(*enode);
++ type = mte_node_type(*enode);
++ slots = ma_slots(node, type);
++ next = mt_slot_locked(mt, slots, next_offset);
++ if ((mte_dead_node(next)))
++ next = mt_slot_locked(mt, slots, ++next_offset);
+
+- mte_set_node_dead(mas->node);
+- node->type = mte_node_type(mas->node);
+- mas_clear_meta(mas, node, node->type);
++ mte_set_node_dead(*enode);
++ node->type = type;
+ node->piv_parent = prev;
+ node->parent_slot = offset;
+- offset = 0;
+- prev = mas->node;
++ offset = next_offset;
++ next_offset = 0;
++ prev = *enode;
+ } while (!mte_is_leaf(next));
+
+ return slots;
+ }
+
+-static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags,
++static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt,
+ bool free)
+ {
+ void __rcu **slots;
+ struct maple_node *node = mte_to_node(enode);
+ struct maple_enode *start;
+- struct maple_tree mt;
+
+- MA_STATE(mas, &mt, 0, 0);
+-
+- mas.node = enode;
+ if (mte_is_leaf(enode)) {
+ node->type = mte_node_type(enode);
+ goto free_leaf;
+ }
+
+- ma_flags &= ~MT_FLAGS_LOCK_MASK;
+- mt_init_flags(&mt, ma_flags);
+- mas_lock(&mas);
+-
+- mte_to_node(enode)->ma_flags = ma_flags;
+ start = enode;
+- slots = mas_destroy_descend(&mas, start, 0);
+- node = mas_mn(&mas);
++ slots = mte_destroy_descend(&enode, mt, start, 0);
++ node = mte_to_node(enode); // Updated in the above call.
+ do {
+ enum maple_type type;
+ unsigned char offset;
+ struct maple_enode *parent, *tmp;
+
+- node->type = mte_node_type(mas.node);
+- node->slot_len = mas_dead_leaves(&mas, slots, node->type);
++ node->slot_len = mte_dead_leaves(enode, mt, slots);
+ if (free)
+ mt_free_bulk(node->slot_len, slots);
+ offset = node->parent_slot + 1;
+- mas.node = node->piv_parent;
+- if (mas_mn(&mas) == node)
+- goto start_slots_free;
++ enode = node->piv_parent;
++ if (mte_to_node(enode) == node)
++ goto free_leaf;
+
+- type = mte_node_type(mas.node);
+- slots = ma_slots(mte_to_node(mas.node), type);
++ type = mte_node_type(enode);
++ slots = ma_slots(mte_to_node(enode), type);
+ if (offset >= mt_slots[type])
+ goto next;
+
+- tmp = mas_slot_locked(&mas, slots, offset);
++ tmp = mt_slot_locked(mt, slots, offset);
+ if (mte_node_type(tmp) && mte_to_node(tmp)) {
+- parent = mas.node;
+- mas.node = tmp;
+- slots = mas_destroy_descend(&mas, parent, offset);
++ parent = enode;
++ enode = tmp;
++ slots = mte_destroy_descend(&enode, mt, parent, offset);
+ }
+ next:
+- node = mas_mn(&mas);
+- } while (start != mas.node);
++ node = mte_to_node(enode);
++ } while (start != enode);
+
+- node = mas_mn(&mas);
+- node->type = mte_node_type(mas.node);
+- node->slot_len = mas_dead_leaves(&mas, slots, node->type);
++ node = mte_to_node(enode);
++ node->slot_len = mte_dead_leaves(enode, mt, slots);
+ if (free)
+ mt_free_bulk(node->slot_len, slots);
+
+-start_slots_free:
+- mas_unlock(&mas);
+-
+ free_leaf:
+ if (free)
+ mt_free_rcu(&node->rcu);
+ else
+- mas_clear_meta(&mas, node, node->type);
++ mt_clear_meta(mt, node, node->type);
+ }
+
+ /*
+@@ -5682,10 +5686,10 @@ static inline void mte_destroy_walk(stru
+ struct maple_node *node = mte_to_node(enode);
+
+ if (mt_in_rcu(mt)) {
+- mt_destroy_walk(enode, mt->ma_flags, false);
++ mt_destroy_walk(enode, mt, false);
+ call_rcu(&node->rcu, mt_free_walk);
+ } else {
+- mt_destroy_walk(enode, mt->ma_flags, true);
++ mt_destroy_walk(enode, mt, true);
+ }
+ }
+
--- /dev/null
+From stable-owner@vger.kernel.org Tue Apr 11 17:15:03 2023
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+Date: Tue, 11 Apr 2023 11:10:53 -0400
+Subject: maple_tree: add smp_rmb() to dead node detection
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, stable@vger.kernel.org
+Cc: maple-tree@lists.infradead.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Liam R. Howlett" <Liam.Howlett@Oracle.com>, "Liam R . Howlett" <Liam.Howlett@oracle.com>
+Message-ID: <20230411151055.2910579-13-Liam.Howlett@oracle.com>
+
+From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
+
+commit 0a2b18d948838e16912b3b627b504ab062b7d02a upstream.
+
+Add an smp_rmb() before reading the parent pointer so that reads of the
+node's contents issued before the parent check cannot be reordered past
+that check.
+
+This is necessary for RCU mode.
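+
+Illustrative reader pattern (a sketch of the callers, not new code): the
+node's contents are loaded first and only trusted if the dead-node check
+that follows passes, so those loads must not be reordered past the check:
+
+	pivots = ma_pivots(node, type);
+	end = ma_data_end(node, type, pivots, mas->max);
+	if (unlikely(ma_dead_node(node)))	/* smp_rmb() happens in here */
+		return NULL;			/* restart the walk */
+	/* node was still live when checked: pivots/end are usable */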
+
+Link: https://lkml.kernel.org/r/20230227173632.3292573-7-surenb@google.com
+Fixes: 54a611b60590 ("Maple Tree: add new data structure")
+Cc: stable@vger.kernel.org
+Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ lib/maple_tree.c | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+--- a/lib/maple_tree.c
++++ b/lib/maple_tree.c
+@@ -534,9 +534,11 @@ static inline struct maple_node *mte_par
+ */
+ static inline bool ma_dead_node(const struct maple_node *node)
+ {
+- struct maple_node *parent = (void *)((unsigned long)
+- node->parent & ~MAPLE_NODE_MASK);
++ struct maple_node *parent;
+
++ /* Do not reorder reads from the node prior to the parent check */
++ smp_rmb();
++ parent = (void *)((unsigned long) node->parent & ~MAPLE_NODE_MASK);
+ return (parent == node);
+ }
+
+@@ -551,6 +553,8 @@ static inline bool mte_dead_node(const s
+ struct maple_node *parent, *node;
+
+ node = mte_to_node(enode);
++ /* Do not reorder reads from the node prior to the parent check */
++ smp_rmb();
+ parent = mte_parent(enode);
+ return (parent == node);
+ }
--- /dev/null
+From stable-owner@vger.kernel.org Tue Apr 11 17:13:00 2023
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+Date: Tue, 11 Apr 2023 11:10:48 -0400
+Subject: maple_tree: be more cautious about dead nodes
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, stable@vger.kernel.org
+Cc: maple-tree@lists.infradead.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Liam R. Howlett" <Liam.Howlett@Oracle.com>, Stable@vger.kernel.org, Liam Howlett <Liam.Howlett@oracle.com>
+Message-ID: <20230411151055.2910579-8-Liam.Howlett@oracle.com>
+
+From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
+
+commit 39d0bd86c499ecd6abae42a9b7112056c5560691 upstream.
+
+ma_pivots() and ma_data_end() may be called with a dead node. Ensure
+that the node isn't dead before using the returned values.
+
+This is necessary for RCU mode of the maple tree.
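+
+A sketch of the resulting caller pattern (names as in the diff below):
+
+	slots = ma_slots(node, mt);
+	pivots = ma_pivots(node, mt);	/* may be stale, or NULL for a dead node */
+	if (unlikely(ma_dead_node(node)))
+		return 1;		/* caller restarts the walk */
+
+	mas->max = pivots[offset];	/* only trusted after the check */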
+
+Link: https://lkml.kernel.org/r/20230227173632.3292573-1-surenb@google.com
+Link: https://lkml.kernel.org/r/20230227173632.3292573-2-surenb@google.com
+Fixes: 54a611b60590 ("Maple Tree: add new data structure")
+Cc: <Stable@vger.kernel.org>
+Signed-off-by: Liam Howlett <Liam.Howlett@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ lib/maple_tree.c | 52 +++++++++++++++++++++++++++++++++++++++++++---------
+ 1 file changed, 43 insertions(+), 9 deletions(-)
+
+--- a/lib/maple_tree.c
++++ b/lib/maple_tree.c
+@@ -539,6 +539,7 @@ static inline bool ma_dead_node(const st
+
+ return (parent == node);
+ }
++
+ /*
+ * mte_dead_node() - check if the @enode is dead.
+ * @enode: The encoded maple node
+@@ -620,6 +621,8 @@ static inline unsigned int mas_alloc_req
+ * @node - the maple node
+ * @type - the node type
+ *
++ * In the event of a dead node, this array may be %NULL
++ *
+ * Return: A pointer to the maple node pivots
+ */
+ static inline unsigned long *ma_pivots(struct maple_node *node,
+@@ -1091,8 +1094,11 @@ static int mas_ascend(struct ma_state *m
+ a_type = mas_parent_enum(mas, p_enode);
+ a_node = mte_parent(p_enode);
+ a_slot = mte_parent_slot(p_enode);
+- pivots = ma_pivots(a_node, a_type);
+ a_enode = mt_mk_node(a_node, a_type);
++ pivots = ma_pivots(a_node, a_type);
++
++ if (unlikely(ma_dead_node(a_node)))
++ return 1;
+
+ if (!set_min && a_slot) {
+ set_min = true;
+@@ -1398,6 +1404,9 @@ static inline unsigned char ma_data_end(
+ {
+ unsigned char offset;
+
++ if (!pivots)
++ return 0;
++
+ if (type == maple_arange_64)
+ return ma_meta_end(node, type);
+
+@@ -1433,6 +1442,9 @@ static inline unsigned char mas_data_end
+ return ma_meta_end(node, type);
+
+ pivots = ma_pivots(node, type);
++ if (unlikely(ma_dead_node(node)))
++ return 0;
++
+ offset = mt_pivots[type] - 1;
+ if (likely(!pivots[offset]))
+ return ma_meta_end(node, type);
+@@ -4498,6 +4510,9 @@ static inline int mas_prev_node(struct m
+ node = mas_mn(mas);
+ slots = ma_slots(node, mt);
+ pivots = ma_pivots(node, mt);
++ if (unlikely(ma_dead_node(node)))
++ return 1;
++
+ mas->max = pivots[offset];
+ if (offset)
+ mas->min = pivots[offset - 1] + 1;
+@@ -4519,6 +4534,9 @@ static inline int mas_prev_node(struct m
+ slots = ma_slots(node, mt);
+ pivots = ma_pivots(node, mt);
+ offset = ma_data_end(node, mt, pivots, mas->max);
++ if (unlikely(ma_dead_node(node)))
++ return 1;
++
+ if (offset)
+ mas->min = pivots[offset - 1] + 1;
+
+@@ -4567,6 +4585,7 @@ static inline int mas_next_node(struct m
+ struct maple_enode *enode;
+ int level = 0;
+ unsigned char offset;
++ unsigned char node_end;
+ enum maple_type mt;
+ void __rcu **slots;
+
+@@ -4590,7 +4609,11 @@ static inline int mas_next_node(struct m
+ node = mas_mn(mas);
+ mt = mte_node_type(mas->node);
+ pivots = ma_pivots(node, mt);
+- } while (unlikely(offset == ma_data_end(node, mt, pivots, mas->max)));
++ node_end = ma_data_end(node, mt, pivots, mas->max);
++ if (unlikely(ma_dead_node(node)))
++ return 1;
++
++ } while (unlikely(offset == node_end));
+
+ slots = ma_slots(node, mt);
+ pivot = mas_safe_pivot(mas, pivots, ++offset, mt);
+@@ -4606,6 +4629,9 @@ static inline int mas_next_node(struct m
+ mt = mte_node_type(mas->node);
+ slots = ma_slots(node, mt);
+ pivots = ma_pivots(node, mt);
++ if (unlikely(ma_dead_node(node)))
++ return 1;
++
+ offset = 0;
+ pivot = pivots[0];
+ }
+@@ -4652,11 +4678,14 @@ static inline void *mas_next_nentry(stru
+ return NULL;
+ }
+
+- pivots = ma_pivots(node, type);
+ slots = ma_slots(node, type);
+- mas->index = mas_safe_min(mas, pivots, mas->offset);
++ pivots = ma_pivots(node, type);
+ count = ma_data_end(node, type, pivots, mas->max);
+- if (ma_dead_node(node))
++ if (unlikely(ma_dead_node(node)))
++ return NULL;
++
++ mas->index = mas_safe_min(mas, pivots, mas->offset);
++ if (unlikely(ma_dead_node(node)))
+ return NULL;
+
+ if (mas->index > max)
+@@ -4814,6 +4843,11 @@ retry:
+
+ slots = ma_slots(mn, mt);
+ pivots = ma_pivots(mn, mt);
++ if (unlikely(ma_dead_node(mn))) {
++ mas_rewalk(mas, index);
++ goto retry;
++ }
++
+ if (offset == mt_pivots[mt])
+ pivot = mas->max;
+ else
+@@ -6616,11 +6650,11 @@ static inline void *mas_first_entry(stru
+ while (likely(!ma_is_leaf(mt))) {
+ MT_BUG_ON(mas->tree, mte_dead_node(mas->node));
+ slots = ma_slots(mn, mt);
+- pivots = ma_pivots(mn, mt);
+- max = pivots[0];
+ entry = mas_slot(mas, slots, 0);
++ pivots = ma_pivots(mn, mt);
+ if (unlikely(ma_dead_node(mn)))
+ return NULL;
++ max = pivots[0];
+ mas->node = entry;
+ mn = mas_mn(mas);
+ mt = mte_node_type(mas->node);
+@@ -6640,13 +6674,13 @@ static inline void *mas_first_entry(stru
+ if (likely(entry))
+ return entry;
+
+- pivots = ma_pivots(mn, mt);
+- mas->index = pivots[0] + 1;
+ mas->offset = 1;
+ entry = mas_slot(mas, slots, 1);
++ pivots = ma_pivots(mn, mt);
+ if (unlikely(ma_dead_node(mn)))
+ return NULL;
+
++ mas->index = pivots[0] + 1;
+ if (mas->index > limit)
+ goto none;
+
--- /dev/null
+From stable-owner@vger.kernel.org Tue Apr 11 17:13:23 2023
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+Date: Tue, 11 Apr 2023 11:10:50 -0400
+Subject: maple_tree: detect dead nodes in mas_start()
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, stable@vger.kernel.org
+Cc: maple-tree@lists.infradead.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Liam R. Howlett" <Liam.Howlett@Oracle.com>, Stable@vger.kernel.org, Liam Howlett <Liam.Howlett@oracle.com>
+Message-ID: <20230411151055.2910579-10-Liam.Howlett@oracle.com>
+
+From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
+
+commit a7b92d59c885018cb7bb88539892278e4fd64b29 upstream.
+
+When initially starting a search, the root node may already be in the
+process of being replaced in RCU mode. Detect and restart the walk if
+this is the case. This is necessary for RCU mode of the maple tree.
+
+Link: https://lkml.kernel.org/r/20230227173632.3292573-3-surenb@google.com
+Cc: <Stable@vger.kernel.org>
+Fixes: 54a611b60590 ("Maple Tree: add new data structure")
+Signed-off-by: Liam Howlett <Liam.Howlett@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ lib/maple_tree.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/lib/maple_tree.c
++++ b/lib/maple_tree.c
+@@ -1357,12 +1357,16 @@ static inline struct maple_enode *mas_st
+ mas->max = ULONG_MAX;
+ mas->depth = 0;
+
++retry:
+ root = mas_root(mas);
+ /* Tree with nodes */
+ if (likely(xa_is_node(root))) {
+ mas->depth = 1;
+ mas->node = mte_safe_root(root);
+ mas->offset = 0;
++ if (mte_dead_node(mas->node))
++ goto retry;
++
+ return NULL;
+ }
+
--- /dev/null
+From stable-owner@vger.kernel.org Tue Apr 11 17:13:39 2023
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+Date: Tue, 11 Apr 2023 11:10:51 -0400
+Subject: maple_tree: fix freeing of nodes in rcu mode
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, stable@vger.kernel.org
+Cc: maple-tree@lists.infradead.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Liam R. Howlett" <Liam.Howlett@Oracle.com>, Stable@vger.kernel.org, Liam Howlett <Liam.Howlett@oracle.com>
+Message-ID: <20230411151055.2910579-11-Liam.Howlett@oracle.com>
+
+From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
+
+commit 2e5b4921f8efc9e845f4f04741797d16f36847eb upstream.
+
+The walk to destroy the nodes was not always setting the node type, which
+could result in the destroy method treating stored values as nodes.
+Avoid this by setting the correct node types. This is necessary for the
+RCU mode of the maple tree.
+
+Link: https://lkml.kernel.org/r/20230227173632.3292573-4-surenb@google.com
+Cc: <Stable@vger.kernel.org>
+Fixes: 54a611b60590 ("Maple Tree: add new data structure")
+Signed-off-by: Liam Howlett <Liam.Howlett@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ lib/maple_tree.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++---------
+ 1 file changed, 62 insertions(+), 11 deletions(-)
+
+--- a/lib/maple_tree.c
++++ b/lib/maple_tree.c
+@@ -898,6 +898,44 @@ static inline void ma_set_meta(struct ma
+ }
+
+ /*
++ * mas_clear_meta() - clear the metadata information of a node, if it exists
++ * @mas: The maple state
++ * @mn: The maple node
++ * @mt: The maple node type
++ * @offset: The offset of the highest sub-gap in this node.
++ * @end: The end of the data in this node.
++ */
++static inline void mas_clear_meta(struct ma_state *mas, struct maple_node *mn,
++ enum maple_type mt)
++{
++ struct maple_metadata *meta;
++ unsigned long *pivots;
++ void __rcu **slots;
++ void *next;
++
++ switch (mt) {
++ case maple_range_64:
++ pivots = mn->mr64.pivot;
++ if (unlikely(pivots[MAPLE_RANGE64_SLOTS - 2])) {
++ slots = mn->mr64.slot;
++ next = mas_slot_locked(mas, slots,
++ MAPLE_RANGE64_SLOTS - 1);
++ if (unlikely((mte_to_node(next) && mte_node_type(next))))
++ return; /* The last slot is a node, no metadata */
++ }
++ fallthrough;
++ case maple_arange_64:
++ meta = ma_meta(mn, mt);
++ break;
++ default:
++ return;
++ }
++
++ meta->gap = 0;
++ meta->end = 0;
++}
++
++/*
+ * ma_meta_end() - Get the data end of a node from the metadata
+ * @mn: The maple node
+ * @mt: The maple node type
+@@ -5438,20 +5476,22 @@ no_gap:
+ * mas_dead_leaves() - Mark all leaves of a node as dead.
+ * @mas: The maple state
+ * @slots: Pointer to the slot array
++ * @type: The maple node type
+ *
+ * Must hold the write lock.
+ *
+ * Return: The number of leaves marked as dead.
+ */
+ static inline
+-unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots)
++unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots,
++ enum maple_type mt)
+ {
+ struct maple_node *node;
+ enum maple_type type;
+ void *entry;
+ int offset;
+
+- for (offset = 0; offset < mt_slot_count(mas->node); offset++) {
++ for (offset = 0; offset < mt_slots[mt]; offset++) {
+ entry = mas_slot_locked(mas, slots, offset);
+ type = mte_node_type(entry);
+ node = mte_to_node(entry);
+@@ -5470,14 +5510,13 @@ unsigned char mas_dead_leaves(struct ma_
+
+ static void __rcu **mas_dead_walk(struct ma_state *mas, unsigned char offset)
+ {
+- struct maple_node *node, *next;
++ struct maple_node *next;
+ void __rcu **slots = NULL;
+
+ next = mas_mn(mas);
+ do {
+- mas->node = ma_enode_ptr(next);
+- node = mas_mn(mas);
+- slots = ma_slots(node, node->type);
++ mas->node = mt_mk_node(next, next->type);
++ slots = ma_slots(next, next->type);
+ next = mas_slot_locked(mas, slots, offset);
+ offset = 0;
+ } while (!ma_is_leaf(next->type));
+@@ -5541,11 +5580,14 @@ static inline void __rcu **mas_destroy_d
+ node = mas_mn(mas);
+ slots = ma_slots(node, mte_node_type(mas->node));
+ next = mas_slot_locked(mas, slots, 0);
+- if ((mte_dead_node(next)))
++ if ((mte_dead_node(next))) {
++ mte_to_node(next)->type = mte_node_type(next);
+ next = mas_slot_locked(mas, slots, 1);
++ }
+
+ mte_set_node_dead(mas->node);
+ node->type = mte_node_type(mas->node);
++ mas_clear_meta(mas, node, node->type);
+ node->piv_parent = prev;
+ node->parent_slot = offset;
+ offset = 0;
+@@ -5565,13 +5607,18 @@ static void mt_destroy_walk(struct maple
+
+ MA_STATE(mas, &mt, 0, 0);
+
+- if (mte_is_leaf(enode))
++ mas.node = enode;
++ if (mte_is_leaf(enode)) {
++ node->type = mte_node_type(enode);
+ goto free_leaf;
++ }
+
++ ma_flags &= ~MT_FLAGS_LOCK_MASK;
+ mt_init_flags(&mt, ma_flags);
+ mas_lock(&mas);
+
+- mas.node = start = enode;
++ mte_to_node(enode)->ma_flags = ma_flags;
++ start = enode;
+ slots = mas_destroy_descend(&mas, start, 0);
+ node = mas_mn(&mas);
+ do {
+@@ -5579,7 +5626,8 @@ static void mt_destroy_walk(struct maple
+ unsigned char offset;
+ struct maple_enode *parent, *tmp;
+
+- node->slot_len = mas_dead_leaves(&mas, slots);
++ node->type = mte_node_type(mas.node);
++ node->slot_len = mas_dead_leaves(&mas, slots, node->type);
+ if (free)
+ mt_free_bulk(node->slot_len, slots);
+ offset = node->parent_slot + 1;
+@@ -5603,7 +5651,8 @@ next:
+ } while (start != mas.node);
+
+ node = mas_mn(&mas);
+- node->slot_len = mas_dead_leaves(&mas, slots);
++ node->type = mte_node_type(mas.node);
++ node->slot_len = mas_dead_leaves(&mas, slots, node->type);
+ if (free)
+ mt_free_bulk(node->slot_len, slots);
+
+@@ -5613,6 +5662,8 @@ start_slots_free:
+ free_leaf:
+ if (free)
+ mt_free_rcu(&node->rcu);
++ else
++ mas_clear_meta(&mas, node, node->type);
+ }
+
+ /*
--- /dev/null
+From stable-owner@vger.kernel.org Tue Apr 11 17:13:01 2023
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+Date: Tue, 11 Apr 2023 11:10:45 -0400
+Subject: maple_tree: fix handle of invalidated state in mas_wr_store_setup()
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, stable@vger.kernel.org
+Cc: maple-tree@lists.infradead.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Liam R. Howlett" <Liam.Howlett@Oracle.com>, Stable@vger.kernel.org, "Liam R . Howlett" <Liam.Howlett@oracle.com>, SeongJae Park <sj@kernel.org>
+Message-ID: <20230411151055.2910579-5-Liam.Howlett@oracle.com>
+
+From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
+
+commit 1202700c3f8cc5f7e4646c3cf05ee6f7c8bc6ccf upstream.
+
+If an invalidated maple state is encountered during write, reset the maple
+state to MAS_START. This will result in a re-walk of the tree to the
+correct location for the write.
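+
+Hypothetical usage sketch showing how a paused state can reach a write
+('tree', 'entry', 'target' and 'new_entry' are placeholders):
+
+	MA_STATE(mas, &tree, 0, 0);
+
+	mas_lock(&mas);
+	mas_for_each(&mas, entry, ULONG_MAX) {
+		if (entry == target) {
+			mas_pause(&mas);	/* state is now "paused" */
+			break;
+		}
+	}
+	/* A store through a paused state now goes through
+	 * mas_wr_store_setup(), which resets it and re-walks the tree
+	 * instead of writing via a stale position. */
+	mas_store_gfp(&mas, new_entry, GFP_KERNEL);
+	mas_unlock(&mas);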
+
+Link: https://lore.kernel.org/all/20230107020126.1627-1-sj@kernel.org/
+Link: https://lkml.kernel.org/r/20230120162650.984577-6-Liam.Howlett@oracle.com
+Cc: <Stable@vger.kernel.org>
+Fixes: 54a611b60590 ("Maple Tree: add new data structure")
+Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Reported-by: SeongJae Park <sj@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ lib/maple_tree.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/lib/maple_tree.c
++++ b/lib/maple_tree.c
+@@ -5599,6 +5599,9 @@ static inline void mte_destroy_walk(stru
+
+ static void mas_wr_store_setup(struct ma_wr_state *wr_mas)
+ {
++ if (unlikely(mas_is_paused(wr_mas->mas)))
++ mas_reset(wr_mas->mas);
++
+ if (!mas_is_start(wr_mas->mas)) {
+ if (mas_is_none(wr_mas->mas)) {
+ mas_reset(wr_mas->mas);
--- /dev/null
+From stable-owner@vger.kernel.org Tue Apr 11 17:13:01 2023
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+Date: Tue, 11 Apr 2023 11:10:46 -0400
+Subject: maple_tree: fix mas_prev() and mas_find() state handling
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, stable@vger.kernel.org
+Cc: maple-tree@lists.infradead.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Liam R. Howlett" <Liam.Howlett@Oracle.com>, Stable@vger.kernel.org, "Liam R . Howlett" <Liam.Howlett@oracle.com>, syzbot+502859d610c661e56545@syzkaller.appspotmail.com
+Message-ID: <20230411151055.2910579-6-Liam.Howlett@oracle.com>
+
+From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
+
+commit 17dc622c7b0f94e49bed030726df4db12ecaa6b5 upstream.
+
+When mas_prev() does not find anything, set the state to MAS_NONE.
+
+Handle the MAS_NONE in mas_find() like a MAS_START.
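+
+Hypothetical usage sketch ('tree' and 'entry' are placeholders):
+
+	MA_STATE(mas, &tree, 5, 5);
+
+	entry = mas_prev(&mas, 0);
+	/* If nothing is found down to the minimum, mas_prev() returns
+	 * NULL and now leaves the state as MAS_NONE. */
+
+	entry = mas_find(&mas, ULONG_MAX);
+	/* mas_find() now treats MAS_NONE like MAS_START and re-walks the
+	 * tree instead of operating on a stale position. */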
+
+Link: https://lkml.kernel.org/r/20230120162650.984577-7-Liam.Howlett@oracle.com
+Cc: <Stable@vger.kernel.org>
+Fixes: 54a611b60590 ("Maple Tree: add new data structure")
+Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Reported-by: <syzbot+502859d610c661e56545@syzkaller.appspotmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ lib/maple_tree.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/lib/maple_tree.c
++++ b/lib/maple_tree.c
+@@ -4849,7 +4849,7 @@ static inline void *mas_prev_entry(struc
+
+ if (mas->index < min) {
+ mas->index = mas->last = min;
+- mas_pause(mas);
++ mas->node = MAS_NONE;
+ return NULL;
+ }
+ retry:
+@@ -5911,6 +5911,7 @@ void *mas_prev(struct ma_state *mas, uns
+ if (!mas->index) {
+ /* Nothing comes before 0 */
+ mas->last = 0;
++ mas->node = MAS_NONE;
+ return NULL;
+ }
+
+@@ -6001,6 +6002,9 @@ void *mas_find(struct ma_state *mas, uns
+ mas->index = ++mas->last;
+ }
+
++ if (unlikely(mas_is_none(mas)))
++ mas->node = MAS_START;
++
+ if (unlikely(mas_is_start(mas))) {
+ /* First run or continue */
+ void *entry;
--- /dev/null
+From stable-owner@vger.kernel.org Tue Apr 11 17:12:28 2023
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+Date: Tue, 11 Apr 2023 11:10:43 -0400
+Subject: maple_tree: fix potential rcu issue
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, stable@vger.kernel.org
+Cc: maple-tree@lists.infradead.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Liam R. Howlett" <Liam.Howlett@Oracle.com>, Stable@vger.kernel.org, "Liam R . Howlett" <Liam.Howlett@oracle.com>
+Message-ID: <20230411151055.2910579-3-Liam.Howlett@oracle.com>
+
+From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
+
+commit 65be6f058b0eba98dc6c6f197ea9f62c9b6a519f upstream.
+
+Ensure the node isn't dead after reading the node end.
+
+Link: https://lkml.kernel.org/r/20230120162650.984577-3-Liam.Howlett@oracle.com
+Cc: <Stable@vger.kernel.org>
+Fixes: 54a611b60590 ("Maple Tree: add new data structure")
+Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ lib/maple_tree.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/lib/maple_tree.c
++++ b/lib/maple_tree.c
+@@ -4655,13 +4655,13 @@ static inline void *mas_next_nentry(stru
+ pivots = ma_pivots(node, type);
+ slots = ma_slots(node, type);
+ mas->index = mas_safe_min(mas, pivots, mas->offset);
++ count = ma_data_end(node, type, pivots, mas->max);
+ if (ma_dead_node(node))
+ return NULL;
+
+ if (mas->index > max)
+ return NULL;
+
+- count = ma_data_end(node, type, pivots, mas->max);
+ if (mas->offset > count)
+ return NULL;
+
--- /dev/null
+From stable-owner@vger.kernel.org Tue Apr 11 17:12:40 2023
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+Date: Tue, 11 Apr 2023 11:10:44 -0400
+Subject: maple_tree: reduce user error potential
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, stable@vger.kernel.org
+Cc: maple-tree@lists.infradead.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Liam R. Howlett" <Liam.Howlett@Oracle.com>, Stable@vger.kernel.org, "Liam R . Howlett" <Liam.Howlett@oracle.com>
+Message-ID: <20230411151055.2910579-4-Liam.Howlett@oracle.com>
+
+From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
+
+commit 50e81c82ad947045c7ed26ddc9acb17276b653b6 upstream.
+
+When iterating, a user may operate on the tree and cause the maple state
+to be altered and left in an unintuitive state. Detect this scenario and
+correct it by setting the index to the limit and invalidating the state.
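+
+Hypothetical sketch of the scenario being guarded against ('entry' and
+'new_entry' are placeholders):
+
+	mas_for_each(&mas, entry, 0x1000) {
+		/* Operating on the tree here can move the state past the
+		 * iteration limit, e.g.: */
+		mas_set_range(&mas, 0x2000, 0x2fff);
+		mas_store_gfp(&mas, new_entry, GFP_KERNEL);
+		/* The next iteration now sees index > limit, clamps
+		 * index/last to the limit, invalidates (pauses) the state
+		 * and stops, instead of walking past the limit. */
+	}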
+
+Link: https://lkml.kernel.org/r/20230120162650.984577-4-Liam.Howlett@oracle.com
+Cc: <Stable@vger.kernel.org>
+Fixes: 54a611b60590 ("Maple Tree: add new data structure")
+Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ lib/maple_tree.c | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+--- a/lib/maple_tree.c
++++ b/lib/maple_tree.c
+@@ -4736,6 +4736,11 @@ static inline void *mas_next_entry(struc
+ unsigned long last;
+ enum maple_type mt;
+
++ if (mas->index > limit) {
++ mas->index = mas->last = limit;
++ mas_pause(mas);
++ return NULL;
++ }
+ last = mas->last;
+ retry:
+ offset = mas->offset;
+@@ -4842,6 +4847,11 @@ static inline void *mas_prev_entry(struc
+ {
+ void *entry;
+
++ if (mas->index < min) {
++ mas->index = mas->last = min;
++ mas_pause(mas);
++ return NULL;
++ }
+ retry:
+ while (likely(!mas_is_none(mas))) {
+ entry = mas_prev_nentry(mas, min, mas->index);
--- /dev/null
+From stable-owner@vger.kernel.org Tue Apr 11 17:13:00 2023
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+Date: Tue, 11 Apr 2023 11:10:49 -0400
+Subject: maple_tree: refine ma_state init from mas_start()
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, stable@vger.kernel.org
+Cc: maple-tree@lists.infradead.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Liam R. Howlett" <Liam.Howlett@Oracle.com>, Stable@vger.kernel.org, Vernon Yang <vernon2gm@gmail.com>, "Liam R . Howlett" <Liam.Howlett@oracle.com>
+Message-ID: <20230411151055.2910579-9-Liam.Howlett@oracle.com>
+
+From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
+
+commit 46b345848261009477552d654cb2f65000c30e4d upstream.
+
+If mas->node is MAS_START, there are three cases, and they all assign
+different values to mas->node and mas->offset. So there is no need to set
+them to a default value before updating.
+
+Update them directly to make them easier to understand and for better
+readability.
+
+Link: https://lkml.kernel.org/r/20221221060058.609003-7-vernon2gm@gmail.com
+Cc: <Stable@vger.kernel.org>
+Fixes: 54a611b60590 ("Maple Tree: add new data structure")
+Signed-off-by: Vernon Yang <vernon2gm@gmail.com>
+Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ lib/maple_tree.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/lib/maple_tree.c
++++ b/lib/maple_tree.c
+@@ -1339,7 +1339,7 @@ static void mas_node_count(struct ma_sta
+ * mas_start() - Sets up maple state for operations.
+ * @mas: The maple state.
+ *
+- * If mas->node == MAS_START, then set the min, max, depth, and offset to
++ * If mas->node == MAS_START, then set the min, max and depth to
+ * defaults.
+ *
+ * Return:
+@@ -1353,22 +1353,22 @@ static inline struct maple_enode *mas_st
+ if (likely(mas_is_start(mas))) {
+ struct maple_enode *root;
+
+- mas->node = MAS_NONE;
+ mas->min = 0;
+ mas->max = ULONG_MAX;
+ mas->depth = 0;
+- mas->offset = 0;
+
+ root = mas_root(mas);
+ /* Tree with nodes */
+ if (likely(xa_is_node(root))) {
+ mas->depth = 1;
+ mas->node = mte_safe_root(root);
++ mas->offset = 0;
+ return NULL;
+ }
+
+ /* empty tree */
+ if (unlikely(!root)) {
++ mas->node = MAS_NONE;
+ mas->offset = MAPLE_NODE_SLOTS;
+ return NULL;
+ }
--- /dev/null
+From stable-owner@vger.kernel.org Tue Apr 11 17:13:37 2023
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+Date: Tue, 11 Apr 2023 11:10:52 -0400
+Subject: maple_tree: remove extra smp_wmb() from mas_dead_leaves()
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, stable@vger.kernel.org
+Cc: maple-tree@lists.infradead.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Liam R. Howlett" <Liam.Howlett@Oracle.com>, Liam Howlett <Liam.Howlett@oracle.com>
+Message-ID: <20230411151055.2910579-12-Liam.Howlett@oracle.com>
+
+From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
+
+commit 8372f4d83f96f35915106093cde4565836587123 upstream.
+
+The call to mte_set_node_dead() just before this smp_wmb() already issues
+an smp_wmb() of its own, so the extra barrier is not needed. This is an
+optimization for the RCU mode
+of the maple tree.
+
+Link: https://lkml.kernel.org/r/20230227173632.3292573-5-surenb@google.com
+Fixes: 54a611b60590 ("Maple Tree: add new data structure")
+Cc: stable@vger.kernel.org
+Signed-off-by: Liam Howlett <Liam.Howlett@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ lib/maple_tree.c | 1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/lib/maple_tree.c
++++ b/lib/maple_tree.c
+@@ -5500,7 +5500,6 @@ unsigned char mas_dead_leaves(struct ma_
+ break;
+
+ mte_set_node_dead(entry);
+- smp_wmb(); /* Needed for RCU */
+ node->type = type;
+ rcu_assign_pointer(slots[offset], node);
+ }
--- /dev/null
+From stable-owner@vger.kernel.org Tue Apr 11 17:13:02 2023
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+Date: Tue, 11 Apr 2023 11:10:42 -0400
+Subject: maple_tree: remove GFP_ZERO from kmem_cache_alloc() and kmem_cache_alloc_bulk()
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, stable@vger.kernel.org
+Cc: maple-tree@lists.infradead.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Liam R. Howlett" <Liam.Howlett@Oracle.com>, Liam Howlett <Liam.Howlett@oracle.com>, Jirka Hladky <jhladky@redhat.com>, Matthew Wilcox <willy@infradead.org>
+Message-ID: <20230411151055.2910579-2-Liam.Howlett@oracle.com>
+
+From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
+
+commit 541e06b772c1aaffb3b6a245ccface36d7107af2 upstream.
+
+Preallocations are common in the VMA code to avoid allocating under
+certain locking conditions. The preallocations must also cover the
+worst-case scenario. Removing the GFP_ZERO flag from the
+kmem_cache_alloc() (and bulk variant) calls will reduce the amount of time
+spent zeroing memory that may not be used. Only zero out the necessary
+area to keep track of the allocations in the maple state. Zero the entire
+node prior to using it in the tree.
+
+This required internal changes to node counting on allocation, so the test
+code is also updated.
+
+This restores some micro-benchmark performance: up to +9% in mmtests mmap1
+in my testing, and +10% to +20% in the mmap, mmapaddr and mmapmany tests
+reported by Red Hat.
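+
+A condensed sketch of the resulting allocation strategy (mirroring the
+diff below):
+
+	struct maple_alloc *node;
+
+	/* Preallocation: no __GFP_ZERO, so nodes that end up unused are
+	 * never zeroed; only the allocation-list bookkeeping is set up. */
+	node = (struct maple_alloc *)kmem_cache_alloc(maple_node_cache, gfp);
+	node->node_count = 0;
+	node->request_count = 0;
+
+	/* Later, when the node is popped for actual use in the tree, it
+	 * is zeroed in full (see mas_pop_node()): */
+	memset(node, 0, sizeof(*node));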
+
+Link: https://bugzilla.redhat.com/show_bug.cgi?id=2149636
+Link: https://lkml.kernel.org/r/20230105160427.2988454-1-Liam.Howlett@oracle.com
+Cc: stable@vger.kernel.org
+Fixes: 54a611b60590 ("Maple Tree: add new data structure")
+Signed-off-by: Liam Howlett <Liam.Howlett@oracle.com>
+Reported-by: Jirka Hladky <jhladky@redhat.com>
+Suggested-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ lib/maple_tree.c | 80 ++++++++++++++++++++-------------------
+ tools/testing/radix-tree/maple.c | 18 ++++----
+ 2 files changed, 52 insertions(+), 46 deletions(-)
+
+--- a/lib/maple_tree.c
++++ b/lib/maple_tree.c
+@@ -149,13 +149,12 @@ struct maple_subtree_state {
+ /* Functions */
+ static inline struct maple_node *mt_alloc_one(gfp_t gfp)
+ {
+- return kmem_cache_alloc(maple_node_cache, gfp | __GFP_ZERO);
++ return kmem_cache_alloc(maple_node_cache, gfp);
+ }
+
+ static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes)
+ {
+- return kmem_cache_alloc_bulk(maple_node_cache, gfp | __GFP_ZERO, size,
+- nodes);
++ return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes);
+ }
+
+ static inline void mt_free_bulk(size_t size, void __rcu **nodes)
+@@ -1128,9 +1127,10 @@ static inline struct maple_node *mas_pop
+ {
+ struct maple_alloc *ret, *node = mas->alloc;
+ unsigned long total = mas_allocated(mas);
++ unsigned int req = mas_alloc_req(mas);
+
+ /* nothing or a request pending. */
+- if (unlikely(!total))
++ if (WARN_ON(!total))
+ return NULL;
+
+ if (total == 1) {
+@@ -1140,27 +1140,25 @@ static inline struct maple_node *mas_pop
+ goto single_node;
+ }
+
+- if (!node->node_count) {
++ if (node->node_count == 1) {
+ /* Single allocation in this node. */
+ mas->alloc = node->slot[0];
+- node->slot[0] = NULL;
+ mas->alloc->total = node->total - 1;
+ ret = node;
+ goto new_head;
+ }
+-
+ node->total--;
+- ret = node->slot[node->node_count];
+- node->slot[node->node_count--] = NULL;
++ ret = node->slot[--node->node_count];
++ node->slot[node->node_count] = NULL;
+
+ single_node:
+ new_head:
+- ret->total = 0;
+- ret->node_count = 0;
+- if (ret->request_count) {
+- mas_set_alloc_req(mas, ret->request_count + 1);
+- ret->request_count = 0;
++ if (req) {
++ req++;
++ mas_set_alloc_req(mas, req);
+ }
++
++ memset(ret, 0, sizeof(*ret));
+ return (struct maple_node *)ret;
+ }
+
+@@ -1179,21 +1177,20 @@ static inline void mas_push_node(struct
+ unsigned long count;
+ unsigned int requested = mas_alloc_req(mas);
+
+- memset(reuse, 0, sizeof(*reuse));
+ count = mas_allocated(mas);
+
+- if (count && (head->node_count < MAPLE_ALLOC_SLOTS - 1)) {
+- if (head->slot[0])
+- head->node_count++;
+- head->slot[head->node_count] = reuse;
++ reuse->request_count = 0;
++ reuse->node_count = 0;
++ if (count && (head->node_count < MAPLE_ALLOC_SLOTS)) {
++ head->slot[head->node_count++] = reuse;
+ head->total++;
+ goto done;
+ }
+
+ reuse->total = 1;
+ if ((head) && !((unsigned long)head & 0x1)) {
+- head->request_count = 0;
+ reuse->slot[0] = head;
++ reuse->node_count = 1;
+ reuse->total += head->total;
+ }
+
+@@ -1212,7 +1209,6 @@ static inline void mas_alloc_nodes(struc
+ {
+ struct maple_alloc *node;
+ unsigned long allocated = mas_allocated(mas);
+- unsigned long success = allocated;
+ unsigned int requested = mas_alloc_req(mas);
+ unsigned int count;
+ void **slots = NULL;
+@@ -1228,24 +1224,29 @@ static inline void mas_alloc_nodes(struc
+ WARN_ON(!allocated);
+ }
+
+- if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS - 1) {
++ if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) {
+ node = (struct maple_alloc *)mt_alloc_one(gfp);
+ if (!node)
+ goto nomem_one;
+
+- if (allocated)
++ if (allocated) {
+ node->slot[0] = mas->alloc;
++ node->node_count = 1;
++ } else {
++ node->node_count = 0;
++ }
+
+- success++;
+ mas->alloc = node;
++ node->total = ++allocated;
+ requested--;
+ }
+
+ node = mas->alloc;
++ node->request_count = 0;
+ while (requested) {
+ max_req = MAPLE_ALLOC_SLOTS;
+- if (node->slot[0]) {
+- unsigned int offset = node->node_count + 1;
++ if (node->node_count) {
++ unsigned int offset = node->node_count;
+
+ slots = (void **)&node->slot[offset];
+ max_req -= offset;
+@@ -1259,15 +1260,13 @@ static inline void mas_alloc_nodes(struc
+ goto nomem_bulk;
+
+ node->node_count += count;
+- /* zero indexed. */
+- if (slots == (void **)&node->slot)
+- node->node_count--;
+-
+- success += count;
++ allocated += count;
+ node = node->slot[0];
++ node->node_count = 0;
++ node->request_count = 0;
+ requested -= count;
+ }
+- mas->alloc->total = success;
++ mas->alloc->total = allocated;
+ return;
+
+ nomem_bulk:
+@@ -1276,7 +1275,7 @@ nomem_bulk:
+ nomem_one:
+ mas_set_alloc_req(mas, requested);
+ if (mas->alloc && !(((unsigned long)mas->alloc & 0x1)))
+- mas->alloc->total = success;
++ mas->alloc->total = allocated;
+ mas_set_err(mas, -ENOMEM);
+ return;
+
+@@ -5725,6 +5724,7 @@ int mas_preallocate(struct ma_state *mas
+ void mas_destroy(struct ma_state *mas)
+ {
+ struct maple_alloc *node;
++ unsigned long total;
+
+ /*
+ * When using mas_for_each() to insert an expected number of elements,
+@@ -5747,14 +5747,20 @@ void mas_destroy(struct ma_state *mas)
+ }
+ mas->mas_flags &= ~(MA_STATE_BULK|MA_STATE_PREALLOC);
+
+- while (mas->alloc && !((unsigned long)mas->alloc & 0x1)) {
++ total = mas_allocated(mas);
++ while (total) {
+ node = mas->alloc;
+ mas->alloc = node->slot[0];
+- if (node->node_count > 0)
+- mt_free_bulk(node->node_count,
+- (void __rcu **)&node->slot[1]);
++ if (node->node_count > 1) {
++ size_t count = node->node_count - 1;
++
++ mt_free_bulk(count, (void __rcu **)&node->slot[1]);
++ total -= count;
++ }
+ kmem_cache_free(maple_node_cache, node);
++ total--;
+ }
++
+ mas->alloc = NULL;
+ }
+ EXPORT_SYMBOL_GPL(mas_destroy);
+--- a/tools/testing/radix-tree/maple.c
++++ b/tools/testing/radix-tree/maple.c
+@@ -173,11 +173,11 @@ static noinline void check_new_node(stru
+
+ if (!MAPLE_32BIT) {
+ if (i >= 35)
+- e = i - 35;
++ e = i - 34;
+ else if (i >= 5)
+- e = i - 5;
++ e = i - 4;
+ else if (i >= 2)
+- e = i - 2;
++ e = i - 1;
+ } else {
+ if (i >= 4)
+ e = i - 4;
+@@ -305,17 +305,17 @@ static noinline void check_new_node(stru
+ MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM));
+ MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL));
+ MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1);
+- MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1);
++ MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS);
+
+ mn = mas_pop_node(&mas); /* get the next node. */
+ MT_BUG_ON(mt, mn == NULL);
+ MT_BUG_ON(mt, not_empty(mn));
+ MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS);
+- MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 2);
++ MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1);
+
+ mas_push_node(&mas, mn);
+ MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1);
+- MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1);
++ MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS);
+
+ /* Check the limit of pop/push/pop */
+ mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 2); /* Request */
+@@ -323,14 +323,14 @@ static noinline void check_new_node(stru
+ MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM));
+ MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL));
+ MT_BUG_ON(mt, mas_alloc_req(&mas));
+- MT_BUG_ON(mt, mas.alloc->node_count);
++ MT_BUG_ON(mt, mas.alloc->node_count != 1);
+ MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2);
+ mn = mas_pop_node(&mas);
+ MT_BUG_ON(mt, not_empty(mn));
+ MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1);
+- MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1);
++ MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS);
+ mas_push_node(&mas, mn);
+- MT_BUG_ON(mt, mas.alloc->node_count);
++ MT_BUG_ON(mt, mas.alloc->node_count != 1);
+ MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2);
+ mn = mas_pop_node(&mas);
+ MT_BUG_ON(mt, not_empty(mn));
--- /dev/null
+From stable-owner@vger.kernel.org Tue Apr 11 17:14:26 2023
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+Date: Tue, 11 Apr 2023 11:10:55 -0400
+Subject: mm: enable maple tree RCU mode by default.
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, stable@vger.kernel.org
+Cc: maple-tree@lists.infradead.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Liam R. Howlett" <Liam.Howlett@Oracle.com>, "Liam R . Howlett" <Liam.Howlett@oracle.com>, syzbot+8d95422d3537159ca390@syzkaller.appspotmail.com
+Message-ID: <20230411151055.2910579-15-Liam.Howlett@oracle.com>
+
+From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
+
+commit 3dd4432549415f3c65dd52d5c687629efbf4ece1 upstream.
+
+Use the maple tree in RCU mode for VMA tracking.
+
+The maple tree tracks the stack and is able to update the pivot
+(lower/upper boundary) in-place to allow the page fault handler to write
+to the tree while holding just the mmap read lock. This is safe as the
+writes to the stack have a guard VMA which ensures there will always be
+a NULL in the direction of the growth and thus will only update a pivot.
+
+It is possible, but not recommended, to have VMAs that grow up/down
+without guard VMAs. syzbot has constructed a testcase which sets up a
+VMA to grow and consume the empty space. Overwriting the entire NULL
+entry causes the tree to be altered in a way that is not safe for
+concurrent readers; the readers may see a node being rewritten or one
+that does not match the maple state they are using.
+
+Enabling RCU mode allows the concurrent readers to see a stable node and
+will return the expected result.
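+
+A condensed sketch of the mechanism (mirroring the diff below): the VMA
+tree asks for RCU mode up front, and bulk operations that are not RCU
+safe temporarily drop it while holding the write lock:
+
+	#define MM_MT_FLAGS	(MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \
+				 MT_FLAGS_USE_RCU)
+
+	mt_clear_in_rcu(mas.tree);	/* e.g. around dup_mmap()'s bulk copy */
+	/* ... copy every VMA ... */
+	if (!retval)
+		mt_set_in_rcu(mas.tree);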
+
+Link: https://lkml.kernel.org/r/20230227173632.3292573-9-surenb@google.com
+Cc: stable@vger.kernel.org
+Fixes: d4af56c5c7c6 ("mm: start tracking VMAs with maple tree")
+Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Reported-by: syzbot+8d95422d3537159ca390@syzkaller.appspotmail.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/mm_types.h | 3 ++-
+ kernel/fork.c | 3 +++
+ mm/mmap.c | 3 ++-
+ 3 files changed, 7 insertions(+), 2 deletions(-)
+
+--- a/include/linux/mm_types.h
++++ b/include/linux/mm_types.h
+@@ -810,7 +810,8 @@ struct mm_struct {
+ unsigned long cpu_bitmap[];
+ };
+
+-#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN)
++#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \
++ MT_FLAGS_USE_RCU)
+ extern struct mm_struct init_mm;
+
+ /* Pointer magic because the dynamic array size confuses some compilers. */
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -617,6 +617,7 @@ static __latent_entropy int dup_mmap(str
+ if (retval)
+ goto out;
+
++ mt_clear_in_rcu(mas.tree);
+ mas_for_each(&old_mas, mpnt, ULONG_MAX) {
+ struct file *file;
+
+@@ -703,6 +704,8 @@ static __latent_entropy int dup_mmap(str
+ retval = arch_dup_mmap(oldmm, mm);
+ loop_out:
+ mas_destroy(&mas);
++ if (!retval)
++ mt_set_in_rcu(mas.tree);
+ out:
+ mmap_write_unlock(mm);
+ flush_tlb_mm(oldmm);
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -2308,7 +2308,7 @@ do_mas_align_munmap(struct ma_state *mas
+ int count = 0;
+ int error = -ENOMEM;
+ MA_STATE(mas_detach, &mt_detach, 0, 0);
+- mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN);
++ mt_init_flags(&mt_detach, mas->tree->ma_flags & MT_FLAGS_LOCK_MASK);
+ mt_set_external_lock(&mt_detach, &mm->mmap_lock);
+
+ if (mas_preallocate(mas, vma, GFP_KERNEL))
+@@ -3095,6 +3095,7 @@ void exit_mmap(struct mm_struct *mm)
+ */
+ set_bit(MMF_OOM_SKIP, &mm->flags);
+ mmap_write_lock(mm);
++ mt_clear_in_rcu(&mm->mm_mt);
+ free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS,
+ USER_PGTABLES_CEILING);
+ tlb_finish_mmu(&tlb);
--- /dev/null
+From 7c7b962938ddda6a9cd095de557ee5250706ea88 Mon Sep 17 00:00:00 2001
+From: Alistair Popple <apopple@nvidia.com>
+Date: Thu, 30 Mar 2023 12:25:19 +1100
+Subject: mm: take a page reference when removing device exclusive entries
+
+From: Alistair Popple <apopple@nvidia.com>
+
+commit 7c7b962938ddda6a9cd095de557ee5250706ea88 upstream.
+
+Device exclusive page table entries are used to prevent CPU access to a
+page whilst it is being accessed from a device. Typically this is used to
+implement atomic operations when the underlying bus does not support
+atomic access. When a CPU thread encounters a device exclusive entry it
+locks the page and restores the original entry after calling mmu notifiers
+to signal drivers that exclusive access is no longer available.
+
+The device exclusive entry holds a reference to the page making it safe to
+access the struct page whilst the entry is present. However the fault
+handling code does not hold the PTL when taking the page lock. This means
+if there are multiple threads faulting concurrently on the device
+exclusive entry one will remove the entry whilst others will wait on the
+page lock without holding a reference.
+
+This can lead to threads locking or waiting on a folio with a zero
+refcount. Whilst mmap_lock prevents the pages getting freed via munmap()
+they may still be freed by a migration. This leads to warnings such as
+PAGE_FLAGS_CHECK_AT_FREE due to the page being locked when the refcount
+drops to zero.
+
+Fix this by trying to take a reference on the folio before locking it.
+The code already checks the PTE under the PTL and aborts if the entry is
+no longer there. It is also possible the folio has been unmapped, freed
+and re-allocated allowing a reference to be taken on an unrelated folio.
+This case is also detected by the PTE check and the folio is unlocked
+without further changes.
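+
+The resulting pattern, condensed from the diff below:
+
+	if (!folio_try_get(folio))	/* folio already freed: the device
+					 * exclusive entry must be gone too */
+		return 0;
+
+	if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) {
+		folio_put(folio);
+		return VM_FAULT_RETRY;
+	}
+	/* ... the PTE is re-checked under the PTL; on any mismatch the
+	 * folio is simply unlocked and the reference dropped ... */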
+
+Link: https://lkml.kernel.org/r/20230330012519.804116-1-apopple@nvidia.com
+Fixes: b756a3b5e7ea ("mm: device exclusive memory access")
+Signed-off-by: Alistair Popple <apopple@nvidia.com>
+Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
+Reviewed-by: John Hubbard <jhubbard@nvidia.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Christoph Hellwig <hch@infradead.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memory.c | 16 +++++++++++++++-
+ 1 file changed, 15 insertions(+), 1 deletion(-)
+
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -3580,8 +3580,21 @@ static vm_fault_t remove_device_exclusiv
+ struct vm_area_struct *vma = vmf->vma;
+ struct mmu_notifier_range range;
+
+- if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags))
++ /*
++ * We need a reference to lock the folio because we don't hold
++ * the PTL so a racing thread can remove the device-exclusive
++ * entry and unmap it. If the folio is free the entry must
++ * have been removed already. If it happens to have already
++ * been re-allocated after being freed all we do is lock and
++ * unlock it.
++ */
++ if (!folio_try_get(folio))
++ return 0;
++
++ if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) {
++ folio_put(folio);
+ return VM_FAULT_RETRY;
++ }
+ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
+ vma->vm_mm, vmf->address & PAGE_MASK,
+ (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
+@@ -3594,6 +3607,7 @@ static vm_fault_t remove_device_exclusiv
+
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ folio_unlock(folio);
++ folio_put(folio);
+
+ mmu_notifier_invalidate_range_end(&range);
+ return 0;
drm-amdgpu-for-s0ix-skip-sdma-5.x-suspend-resume.patch
drm-amdgpu-skip-psp-suspend-for-imu-enabled-asics-mode2-reset.patch
drm-bridge-lt9611-fix-pll-being-unable-to-lock.patch
+mm-take-a-page-reference-when-removing-device-exclusive-entries.patch
+maple_tree-remove-gfp_zero-from-kmem_cache_alloc-and-kmem_cache_alloc_bulk.patch
+maple_tree-fix-potential-rcu-issue.patch
+maple_tree-reduce-user-error-potential.patch
+maple_tree-fix-handle-of-invalidated-state-in-mas_wr_store_setup.patch
+maple_tree-fix-mas_prev-and-mas_find-state-handling.patch
+maple_tree-be-more-cautious-about-dead-nodes.patch
+maple_tree-refine-ma_state-init-from-mas_start.patch
+maple_tree-detect-dead-nodes-in-mas_start.patch
+maple_tree-fix-freeing-of-nodes-in-rcu-mode.patch
+maple_tree-remove-extra-smp_wmb-from-mas_dead_leaves.patch
+maple_tree-add-smp_rmb-to-dead-node-detection.patch
+maple_tree-add-rcu-lock-checking-to-rcu-callback-functions.patch
+mm-enable-maple-tree-rcu-mode-by-default.patch