git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
6.0-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 7 Nov 2022 12:28:18 +0000 (13:28 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 7 Nov 2022 12:28:18 +0000 (13:28 +0100)
added patches:
acpi-numa-add-cxl-cfmws-nodes-to-the-possible-nodes-set.patch
btrfs-don-t-use-btrfs_chunk-sub_stripes-from-disk.patch
btrfs-fix-a-memory-allocation-failure-test-in-btrfs_submit_direct.patch
btrfs-fix-tree-mod-log-mishandling-of-reallocated-nodes.patch
btrfs-fix-type-of-parameter-generation-in-btrfs_get_dentry.patch
cxl-pmem-fix-cxl_pmem_region-and-cxl_memdev-leak.patch
cxl-region-fix-cxl_region-leak-cleanup-targets-at-region-delete.patch
cxl-region-fix-decoder-allocation-crash.patch
cxl-region-fix-distance-calculation-with-passthrough-ports.patch
cxl-region-fix-region-hpa-ordering-validation.patch
ftrace-fix-use-after-free-for-dynamic-ftrace_ops.patch

12 files changed:
queue-6.0/acpi-numa-add-cxl-cfmws-nodes-to-the-possible-nodes-set.patch [new file with mode: 0644]
queue-6.0/btrfs-don-t-use-btrfs_chunk-sub_stripes-from-disk.patch [new file with mode: 0644]
queue-6.0/btrfs-fix-a-memory-allocation-failure-test-in-btrfs_submit_direct.patch [new file with mode: 0644]
queue-6.0/btrfs-fix-tree-mod-log-mishandling-of-reallocated-nodes.patch [new file with mode: 0644]
queue-6.0/btrfs-fix-type-of-parameter-generation-in-btrfs_get_dentry.patch [new file with mode: 0644]
queue-6.0/cxl-pmem-fix-cxl_pmem_region-and-cxl_memdev-leak.patch [new file with mode: 0644]
queue-6.0/cxl-region-fix-cxl_region-leak-cleanup-targets-at-region-delete.patch [new file with mode: 0644]
queue-6.0/cxl-region-fix-decoder-allocation-crash.patch [new file with mode: 0644]
queue-6.0/cxl-region-fix-distance-calculation-with-passthrough-ports.patch [new file with mode: 0644]
queue-6.0/cxl-region-fix-region-hpa-ordering-validation.patch [new file with mode: 0644]
queue-6.0/ftrace-fix-use-after-free-for-dynamic-ftrace_ops.patch [new file with mode: 0644]
queue-6.0/series

diff --git a/queue-6.0/acpi-numa-add-cxl-cfmws-nodes-to-the-possible-nodes-set.patch b/queue-6.0/acpi-numa-add-cxl-cfmws-nodes-to-the-possible-nodes-set.patch
new file mode 100644 (file)
index 0000000..5917b2a
--- /dev/null
@@ -0,0 +1,42 @@
+From 24f0692bfd41fd207d99c993a5785c3426762046 Mon Sep 17 00:00:00 2001
+From: Dan Williams <dan.j.williams@intel.com>
+Date: Thu, 20 Oct 2022 16:54:55 -0700
+Subject: ACPI: NUMA: Add CXL CFMWS 'nodes' to the possible nodes set
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+commit 24f0692bfd41fd207d99c993a5785c3426762046 upstream.
+
+The ACPI CEDT.CFMWS indicates a range of possible addresses where new
+CXL regions can appear. Each range is associated with a QTG id (QoS
+Throttling Group id). For each range + QTG pair that is not covered by
+a proximity domain in the SRAT, Linux creates a new NUMA node. However,
+the commit that added the new ranges missed updating the node_possible
+mask, which causes memory_group_register() to fail. Add the new nodes
+to the nodes_possible mask.
+
+Cc: <stable@vger.kernel.org>
+Fixes: fd49f99c1809 ("ACPI: NUMA: Add a node and memblk for each CFMWS not in SRAT")
+Cc: Alison Schofield <alison.schofield@intel.com>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Reported-by: Vishal Verma <vishal.l.verma@intel.com>
+Tested-by: Vishal Verma <vishal.l.verma@intel.com>
+Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Reviewed-by: Vishal Verma <vishal.l.verma@intel.com>
+Link: https://lore.kernel.org/r/166631003537.1167078.9373680312035292395.stgit@dwillia2-xfh.jf.intel.com
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/acpi/numa/srat.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/acpi/numa/srat.c
++++ b/drivers/acpi/numa/srat.c
+@@ -327,6 +327,7 @@ static int __init acpi_parse_cfmws(union
+               pr_warn("ACPI NUMA: Failed to add memblk for CFMWS node %d [mem %#llx-%#llx]\n",
+                       node, start, end);
+       }
++      node_set(node, numa_nodes_parsed);
+       /* Set the next available fake_pxm value */
+       (*fake_pxm)++;
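
For illustration, the idea behind this fix can be sketched in plain user-space C: a node that is never added to the possible-nodes mask cannot be registered later. The mask and the node_set_possible()/register_memory_group() helpers below are stand-ins, not the kernel's node_set()/node_possible_map/memory_group_register() APIs.

#include <stdio.h>

/* Illustrative stand-in for the kernel's possible-nodes mask. */
static unsigned long long possible_mask;

static void node_set_possible(int node)
{
        possible_mask |= 1ULL << node;
}

/*
 * Later registration fails for any node that was never marked possible,
 * mirroring how memory_group_register() rejects such nodes.
 */
static int register_memory_group(int node)
{
        if (!(possible_mask & (1ULL << node)))
                return -1;
        return 0;
}

int main(void)
{
        int cfmws_node = 3;     /* fake proximity domain from a CFMWS entry */

        /* Without the fix: the node exists but was never marked possible. */
        printf("before: %d\n", register_memory_group(cfmws_node));

        /* The fix: mark the node possible when the CFMWS entry is parsed. */
        node_set_possible(cfmws_node);
        printf("after: %d\n", register_memory_group(cfmws_node));
        return 0;
}
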
diff --git a/queue-6.0/btrfs-don-t-use-btrfs_chunk-sub_stripes-from-disk.patch b/queue-6.0/btrfs-don-t-use-btrfs_chunk-sub_stripes-from-disk.patch
new file mode 100644 (file)
index 0000000..550d4a7
--- /dev/null
@@ -0,0 +1,91 @@
+From 76a66ba101329316a5d7f4275070be22eb85fdf2 Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Fri, 21 Oct 2022 08:43:45 +0800
+Subject: btrfs: don't use btrfs_chunk::sub_stripes from disk
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 76a66ba101329316a5d7f4275070be22eb85fdf2 upstream.
+
+[BUG]
+There are two reports (the earliest one from LKP, a more recent one from
+kernel bugzilla) that we can have some chunks with 0 as sub_stripes.
+
+This will cause divide-by-zero errors at btrfs_rmap_block, which was
+introduced by a recent kernel patch ac0677348f3c ("btrfs: merge
+calculations for simple striped profiles in btrfs_rmap_block"):
+
+               if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+                                BTRFS_BLOCK_GROUP_RAID10)) {
+                       stripe_nr = stripe_nr * map->num_stripes + i;
+                       stripe_nr = div_u64(stripe_nr, map->sub_stripes); <<<
+               }
+
+[CAUSE]
+From the more recent report, it has been proven that we have some chunks
+with 0 as sub_stripes, mostly caused by older mkfs.
+
+It turns out that the mkfs.btrfs fix was only introduced in 6718ab4d33aa
+("btrfs-progs: Initialize sub_stripes to 1 in btrfs_alloc_data_chunk"),
+which is included in the v5.4 btrfs-progs release.
+
+So there may still be quite a few old filesystems with 0 as sub_stripes.
+
+[FIX]
+Just don't trust the sub_stripes values from disk.
+
+We have a trusted btrfs_raid_array[] to fetch the correct sub_stripes
+numbers for each profile, and those values are fixed.
+
+This keeps compatibility with older filesystems while still avoiding
+divide-by-zero bugs.
+
+Reported-by: kernel test robot <oliver.sang@intel.com>
+Reported-by: Viktor Kuzmin <kvaster@gmail.com>
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=216559
+Fixes: ac0677348f3c ("btrfs: merge calculations for simple striped profiles in btrfs_rmap_block")
+CC: stable@vger.kernel.org # 6.0
+Reviewed-by: Su Yue <glass@fydeos.io>
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/volumes.c |   12 +++++++++++-
+ 1 file changed, 11 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -7029,6 +7029,7 @@ static int read_one_chunk(struct btrfs_k
+       u64 devid;
+       u64 type;
+       u8 uuid[BTRFS_UUID_SIZE];
++      int index;
+       int num_stripes;
+       int ret;
+       int i;
+@@ -7036,6 +7037,7 @@ static int read_one_chunk(struct btrfs_k
+       logical = key->offset;
+       length = btrfs_chunk_length(leaf, chunk);
+       type = btrfs_chunk_type(leaf, chunk);
++      index = btrfs_bg_flags_to_raid_index(type);
+       num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ #if BITS_PER_LONG == 32
+@@ -7089,7 +7091,15 @@ static int read_one_chunk(struct btrfs_k
+       map->io_align = btrfs_chunk_io_align(leaf, chunk);
+       map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+       map->type = type;
+-      map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
++      /*
++       * We can't use the sub_stripes value, as for profiles other than
++       * RAID10, they may have 0 as sub_stripes for filesystems created by
++       * older mkfs (<v5.4).
++       * In that case, it can cause divide-by-zero errors later.
++       * Since currently sub_stripes is fixed for each profile, let's
++       * use the trusted value instead.
++       */
++      map->sub_stripes = btrfs_raid_array[index].sub_stripes;
+       map->verified_stripes = 0;
+       em->orig_block_len = btrfs_calc_stripe_length(em);
+       for (i = 0; i < num_stripes; i++) {
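
The [FIX] above boils down to a table lookup keyed by the RAID profile rather than trusting an on-disk field. A minimal user-space sketch of that idea follows; the table and names are illustrative (the real btrfs_raid_array[] in fs/btrfs/volumes.c has more profiles and fields):

#include <stdio.h>

/* Illustrative per-profile table; sub_stripes is fixed per profile. */
enum raid_index { RAID_SINGLE, RAID_RAID0, RAID_RAID10 };

static const int trusted_sub_stripes[] = {
        [RAID_SINGLE] = 1,
        [RAID_RAID0]  = 1,
        [RAID_RAID10] = 2,
};

/*
 * Pick sub_stripes from the fixed table instead of the on-disk value,
 * which an old mkfs could have left at 0.
 */
static int chunk_sub_stripes(enum raid_index index, int on_disk_value)
{
        (void)on_disk_value;    /* deliberately ignored */
        return trusted_sub_stripes[index];
}

int main(void)
{
        int stripe_nr = 8, num_stripes = 4, i = 1;
        int sub = chunk_sub_stripes(RAID_RAID10, 0);    /* bogus 0 on disk */

        /* The kind of division that crashed when sub_stripes was 0. */
        printf("%d\n", (stripe_nr * num_stripes + i) / sub);
        return 0;
}
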
diff --git a/queue-6.0/btrfs-fix-a-memory-allocation-failure-test-in-btrfs_submit_direct.patch b/queue-6.0/btrfs-fix-a-memory-allocation-failure-test-in-btrfs_submit_direct.patch
new file mode 100644 (file)
index 0000000..a8825ae
--- /dev/null
@@ -0,0 +1,33 @@
+From 063b1f21cc9be07291a1f5e227436f353c6d1695 Mon Sep 17 00:00:00 2001
+From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
+Date: Sun, 30 Oct 2022 08:35:28 +0100
+Subject: btrfs: fix a memory allocation failure test in btrfs_submit_direct
+
+From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
+
+commit 063b1f21cc9be07291a1f5e227436f353c6d1695 upstream.
+
+After allocation, 'dip' is tested instead of 'dip->csums'.  Fix it.
+
+Fixes: 642c5d34da53 ("btrfs: allocate the btrfs_dio_private as part of the iomap dio bio")
+CC: stable@vger.kernel.org # 5.19+
+Reviewed-by: Nikolay Borisov <nborisov@suse.com>
+Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/inode.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -8142,7 +8142,7 @@ static void btrfs_submit_direct(const st
+                */
+               status = BLK_STS_RESOURCE;
+               dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS);
+-              if (!dip)
++              if (!dip->csums)
+                       goto out_err;
+               status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums);
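
The one-liner is the classic "check the pointer you just allocated" pattern; a small stand-alone sketch, with illustrative struct and function names:

#include <stdlib.h>

struct dio_private {
        unsigned char *csums;
};

/*
 * Returns 0 on success, -1 on allocation failure.  The point is that the
 * NULL check must test the member that was just allocated (dip->csums),
 * not the already-valid container (dip).
 */
static int alloc_csums(struct dio_private *dip, size_t nr_sectors,
                       size_t csum_size)
{
        dip->csums = calloc(nr_sectors, csum_size);
        if (!dip->csums)        /* not: if (!dip) */
                return -1;
        return 0;
}

int main(void)
{
        struct dio_private dip = { 0 };

        if (alloc_csums(&dip, 16, 32) == 0)
                free(dip.csums);
        return 0;
}
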
diff --git a/queue-6.0/btrfs-fix-tree-mod-log-mishandling-of-reallocated-nodes.patch b/queue-6.0/btrfs-fix-tree-mod-log-mishandling-of-reallocated-nodes.patch
new file mode 100644 (file)
index 0000000..2059e9f
--- /dev/null
@@ -0,0 +1,189 @@
+From 968b71583130b6104c9f33ba60446d598e327a8b Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Fri, 14 Oct 2022 08:52:46 -0400
+Subject: btrfs: fix tree mod log mishandling of reallocated nodes
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit 968b71583130b6104c9f33ba60446d598e327a8b upstream.
+
+We have been seeing the following panic in production
+
+  kernel BUG at fs/btrfs/tree-mod-log.c:677!
+  invalid opcode: 0000 [#1] SMP
+  RIP: 0010:tree_mod_log_rewind+0x1b4/0x200
+  RSP: 0000:ffffc9002c02f890 EFLAGS: 00010293
+  RAX: 0000000000000003 RBX: ffff8882b448c700 RCX: 0000000000000000
+  RDX: 0000000000008000 RSI: 00000000000000a7 RDI: ffff88877d831c00
+  RBP: 0000000000000002 R08: 000000000000009f R09: 0000000000000000
+  R10: 0000000000000000 R11: 0000000000100c40 R12: 0000000000000001
+  R13: ffff8886c26d6a00 R14: ffff88829f5424f8 R15: ffff88877d831a00
+  FS:  00007fee1d80c780(0000) GS:ffff8890400c0000(0000) knlGS:0000000000000000
+  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+  CR2: 00007fee1963a020 CR3: 0000000434f33002 CR4: 00000000007706e0
+  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+  PKRU: 55555554
+  Call Trace:
+   btrfs_get_old_root+0x12b/0x420
+   btrfs_search_old_slot+0x64/0x2f0
+   ? tree_mod_log_oldest_root+0x3d/0xf0
+   resolve_indirect_ref+0xfd/0x660
+   ? ulist_alloc+0x31/0x60
+   ? kmem_cache_alloc_trace+0x114/0x2c0
+   find_parent_nodes+0x97a/0x17e0
+   ? ulist_alloc+0x30/0x60
+   btrfs_find_all_roots_safe+0x97/0x150
+   iterate_extent_inodes+0x154/0x370
+   ? btrfs_search_path_in_tree+0x240/0x240
+   iterate_inodes_from_logical+0x98/0xd0
+   ? btrfs_search_path_in_tree+0x240/0x240
+   btrfs_ioctl_logical_to_ino+0xd9/0x180
+   btrfs_ioctl+0xe2/0x2ec0
+   ? __mod_memcg_lruvec_state+0x3d/0x280
+   ? do_sys_openat2+0x6d/0x140
+   ? kretprobe_dispatcher+0x47/0x70
+   ? kretprobe_rethook_handler+0x38/0x50
+   ? rethook_trampoline_handler+0x82/0x140
+   ? arch_rethook_trampoline_callback+0x3b/0x50
+   ? kmem_cache_free+0xfb/0x270
+   ? do_sys_openat2+0xd5/0x140
+   __x64_sys_ioctl+0x71/0xb0
+   do_syscall_64+0x2d/0x40
+
+Which is this code in tree_mod_log_rewind()
+
+       switch (tm->op) {
+        case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
+               BUG_ON(tm->slot < n);
+
+This occurs because we replay the nodes in the order that they happened, and
+when we do a REPLACE we will log a REMOVE_WHILE_FREEING for every slot,
+starting at 0.  'n' here is the number of items in this block, which in
+this case was 1, but we had 2 REMOVE_WHILE_FREEING operations.
+
+The actual root cause of this was that we were replaying operations for
+a block that shouldn't have been replayed.  Consider the following
+sequence of events
+
+1. We have an already modified root, and we do a btrfs_get_tree_mod_seq().
+2. We begin removing items from this root, triggering KEY_REPLACE for
+   its child slots.
+3. We remove one of the 2 children this root node points to, thus triggering
+   the root node promotion of the remaining child, and freeing this node.
+4. We modify a new root, and re-allocate the above node as the root node of
+   this other root.
+
+The tree mod log looks something like this
+
+       logical 0       op KEY_REPLACE (slot 1)                 seq 2
+       logical 0       op KEY_REMOVE (slot 1)                  seq 3
+       logical 0       op KEY_REMOVE_WHILE_FREEING (slot 0)    seq 4
+       logical 4096    op LOG_ROOT_REPLACE (old logical 0)     seq 5
+       logical 8192    op KEY_REMOVE_WHILE_FREEING (slot 1)    seq 6
+       logical 8192    op KEY_REMOVE_WHILE_FREEING (slot 0)    seq 7
+       logical 0       op LOG_ROOT_REPLACE (old logical 8192)  seq 8
+
+From here the bug is triggered by the following steps
+
+1.  Call btrfs_get_old_root() on the new_root.
+2.  We call tree_mod_log_oldest_root(btrfs_root_node(new_root)), which is
+    currently logical 0.
+3.  tree_mod_log_oldest_root() calls tree_mod_log_search_oldest(), which
+    gives us the KEY_REPLACE seq 2, and since that's not a
+    LOG_ROOT_REPLACE we incorrectly believe that we don't have an old
+    root, because we expect that the most recent change should be a
+    LOG_ROOT_REPLACE.
+4.  Back in tree_mod_log_oldest_root() we don't have a LOG_ROOT_REPLACE,
+    so we don't set old_root, we simply use our existing extent buffer.
+5.  Since we're using our existing extent buffer (logical 0) we call
+    tree_mod_log_search(0) in order to get the newest change to start the
+    rewind from, which ends up being the LOG_ROOT_REPLACE at seq 8.
+6.  Again since we didn't find an old_root we simply clone logical 0 at
+    its current state.
+7.  We call tree_mod_log_rewind() with the cloned extent buffer.
+8.  Set n = btrfs_header_nritems(logical 0), which would be whatever the
+    original nritems was when we COWed the original root, say for this
+    example it's 2.
+9.  We start from the newest operation and work our way forward, so we
+    see LOG_ROOT_REPLACE which we ignore.
+10. Next we see KEY_REMOVE_WHILE_FREEING for slot 0, which triggers the
+    BUG_ON(tm->slot < n), because it expects if we've done this we have a
+    completely empty extent buffer to replay completely.
+
+The correct thing would be to find the first LOG_ROOT_REPLACE, and then
+get the old_root set to logical 8192.  In fact making that change fixes
+this particular problem.
+
+However consider the much more complicated case.  We have a child node
+in this tree and the above situation.  In the above case we freed one
+of the child blocks at the seq 3 operation.  If this block was also
+re-allocated and got new tree mod log operations we would have a
+different problem.  btrfs_search_old_slot(orig root) would get down to
+the logical 0 root that still pointed at that node.  However in
+btrfs_search_old_slot() we call tree_mod_log_rewind(buf) directly.  This
+is not context aware enough to know which operations we should be
+replaying.  If the block was re-allocated multiple times we may only
+want to replay a range of operations, and determining what that range
+should be isn't possible.
+
+We could maybe solve this by keeping track of which root the node
+belonged to at every tree mod log operation, and then passing this
+around to make sure we're only replaying operations that relate to the
+root we're trying to rewind.
+
+However there's a simpler way to solve this problem: simply disallow
+reallocations if we have currently running tree mod log users.  We
+already do this for leaves, so we're simply expanding this to nodes as
+well.  This is a relatively uncommon occurrence, and the problem is
+complicated enough I'm worried that we will still have corner cases in
+the reallocation case.  So fix this in the most straightforward way
+possible.
+
+Fixes: bd989ba359f2 ("Btrfs: add tree modification log functions")
+CC: stable@vger.kernel.org # 3.3+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent-tree.c |   25 +++++++++++++------------
+ 1 file changed, 13 insertions(+), 12 deletions(-)
+
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -3294,21 +3294,22 @@ void btrfs_free_tree_block(struct btrfs_
+               }
+               /*
+-               * If this is a leaf and there are tree mod log users, we may
+-               * have recorded mod log operations that point to this leaf.
+-               * So we must make sure no one reuses this leaf's extent before
+-               * mod log operations are applied to a node, otherwise after
+-               * rewinding a node using the mod log operations we get an
+-               * inconsistent btree, as the leaf's extent may now be used as
+-               * a node or leaf for another different btree.
++               * If there are tree mod log users we may have recorded mod log
++               * operations for this node.  If we re-allocate this node we
++               * could replay operations on this node that happened when it
++               * existed in a completely different root.  For example if it
++               * was part of root A, then was reallocated to root B, and we
++               * are doing a btrfs_old_search_slot(root b), we could replay
++               * operations that happened when the block was part of root A,
++               * giving us an inconsistent view of the btree.
++               *
+                * We are safe from races here because at this point no other
+                * node or root points to this extent buffer, so if after this
+-               * check a new tree mod log user joins, it will not be able to
+-               * find a node pointing to this leaf and record operations that
+-               * point to this leaf.
++               * check a new tree mod log user joins we will not have an
++               * existing log of operations on this node that we have to
++               * contend with.
+                */
+-              if (btrfs_header_level(buf) == 0 &&
+-                  test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
++              if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
+                       must_pin = true;
+               if (must_pin || btrfs_is_zoned(fs_info)) {
diff --git a/queue-6.0/btrfs-fix-type-of-parameter-generation-in-btrfs_get_dentry.patch b/queue-6.0/btrfs-fix-type-of-parameter-generation-in-btrfs_get_dentry.patch
new file mode 100644 (file)
index 0000000..2643cfb
--- /dev/null
@@ -0,0 +1,44 @@
+From 2398091f9c2c8e0040f4f9928666787a3e8108a7 Mon Sep 17 00:00:00 2001
+From: David Sterba <dsterba@suse.com>
+Date: Tue, 18 Oct 2022 16:05:52 +0200
+Subject: btrfs: fix type of parameter generation in btrfs_get_dentry
+
+From: David Sterba <dsterba@suse.com>
+
+commit 2398091f9c2c8e0040f4f9928666787a3e8108a7 upstream.
+
+The type of the parameter generation has been u32 since the beginning;
+however, all callers pass a u64 generation, so unify the types to prevent
+potential loss.
+
+CC: stable@vger.kernel.org # 4.9+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/export.c |    2 +-
+ fs/btrfs/export.h |    2 +-
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/export.c
++++ b/fs/btrfs/export.c
+@@ -58,7 +58,7 @@ static int btrfs_encode_fh(struct inode
+ }
+ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
+-                              u64 root_objectid, u32 generation,
++                              u64 root_objectid, u64 generation,
+                               int check_generation)
+ {
+       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+--- a/fs/btrfs/export.h
++++ b/fs/btrfs/export.h
+@@ -19,7 +19,7 @@ struct btrfs_fid {
+ } __attribute__ ((packed));
+ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
+-                              u64 root_objectid, u32 generation,
++                              u64 root_objectid, u64 generation,
+                               int check_generation);
+ struct dentry *btrfs_get_parent(struct dentry *child);
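
For illustration, the "potential loss" is ordinary integer truncation: a u64 generation passed into a u32 parameter is silently reduced modulo 2^32. A tiny user-space demo (the function names are made up):

#include <inttypes.h>
#include <stdio.h>

/* A u32 parameter silently truncates any value above UINT32_MAX. */
static uint32_t narrow(uint32_t generation)  { return generation; }
static uint64_t widened(uint64_t generation) { return generation; }

int main(void)
{
        uint64_t generation = 0x100000002ULL;   /* larger than UINT32_MAX */

        printf("u32 parameter: %" PRIu64 "\n", (uint64_t)narrow(generation));  /* 2 */
        printf("u64 parameter: %" PRIu64 "\n", widened(generation));
        return 0;
}
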
diff --git a/queue-6.0/cxl-pmem-fix-cxl_pmem_region-and-cxl_memdev-leak.patch b/queue-6.0/cxl-pmem-fix-cxl_pmem_region-and-cxl_memdev-leak.patch
new file mode 100644 (file)
index 0000000..480d345
--- /dev/null
@@ -0,0 +1,269 @@
+From 4d07ae22e79ebc2d7528bbc69daa53b86981cb3a Mon Sep 17 00:00:00 2001
+From: Dan Williams <dan.j.williams@intel.com>
+Date: Thu, 3 Nov 2022 17:30:36 -0700
+Subject: cxl/pmem: Fix cxl_pmem_region and cxl_memdev leak
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+commit 4d07ae22e79ebc2d7528bbc69daa53b86981cb3a upstream.
+
+When a cxl_nvdimm object goes through a ->remove() event (device
+physically removed, nvdimm-bridge disabled, or nvdimm device disabled),
+then any associated regions must also be disabled. As highlighted by the
+cxl-create-region.sh test [1], a single device may host multiple
+regions, but the driver was only tracking one region at a time. This
+leads to a situation where only the last enabled region per nvdimm
+device is cleaned up properly. Other regions are leaked, and this also
+causes cxl_memdev reference leaks.
+
+Fix the tracking by allowing cxl_nvdimm objects to track multiple region
+associations.
+
+Cc: <stable@vger.kernel.org>
+Link: https://github.com/pmem/ndctl/blob/main/test/cxl-create-region.sh [1]
+Reported-by: Vishal Verma <vishal.l.verma@intel.com>
+Fixes: 04ad63f086d1 ("cxl/region: Introduce cxl_pmem_region objects")
+Reviewed-by: Dave Jiang <dave.jiang@intel.com>
+Reviewed-by: Vishal Verma <vishal.l.verma@intel.com>
+Link: https://lore.kernel.org/r/166752183647.947915.2045230911503793901.stgit@dwillia2-xfh.jf.intel.com
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/cxl/core/pmem.c |    2 
+ drivers/cxl/cxl.h       |    2 
+ drivers/cxl/pmem.c      |  101 ++++++++++++++++++++++++++++++------------------
+ 3 files changed, 68 insertions(+), 37 deletions(-)
+
+--- a/drivers/cxl/core/pmem.c
++++ b/drivers/cxl/core/pmem.c
+@@ -188,6 +188,7 @@ static void cxl_nvdimm_release(struct de
+ {
+       struct cxl_nvdimm *cxl_nvd = to_cxl_nvdimm(dev);
++      xa_destroy(&cxl_nvd->pmem_regions);
+       kfree(cxl_nvd);
+ }
+@@ -230,6 +231,7 @@ static struct cxl_nvdimm *cxl_nvdimm_all
+       dev = &cxl_nvd->dev;
+       cxl_nvd->cxlmd = cxlmd;
++      xa_init(&cxl_nvd->pmem_regions);
+       device_initialize(dev);
+       lockdep_set_class(&dev->mutex, &cxl_nvdimm_key);
+       device_set_pm_not_required(dev);
+--- a/drivers/cxl/cxl.h
++++ b/drivers/cxl/cxl.h
+@@ -423,7 +423,7 @@ struct cxl_nvdimm {
+       struct device dev;
+       struct cxl_memdev *cxlmd;
+       struct cxl_nvdimm_bridge *bridge;
+-      struct cxl_pmem_region *region;
++      struct xarray pmem_regions;
+ };
+ struct cxl_pmem_region_mapping {
+--- a/drivers/cxl/pmem.c
++++ b/drivers/cxl/pmem.c
+@@ -30,17 +30,20 @@ static void unregister_nvdimm(void *nvdi
+       struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+       struct cxl_nvdimm_bridge *cxl_nvb = cxl_nvd->bridge;
+       struct cxl_pmem_region *cxlr_pmem;
++      unsigned long index;
+       device_lock(&cxl_nvb->dev);
+-      cxlr_pmem = cxl_nvd->region;
+       dev_set_drvdata(&cxl_nvd->dev, NULL);
+-      cxl_nvd->region = NULL;
+-      device_unlock(&cxl_nvb->dev);
++      xa_for_each(&cxl_nvd->pmem_regions, index, cxlr_pmem) {
++              get_device(&cxlr_pmem->dev);
++              device_unlock(&cxl_nvb->dev);
+-      if (cxlr_pmem) {
+               device_release_driver(&cxlr_pmem->dev);
+               put_device(&cxlr_pmem->dev);
++
++              device_lock(&cxl_nvb->dev);
+       }
++      device_unlock(&cxl_nvb->dev);
+       nvdimm_delete(nvdimm);
+       cxl_nvd->bridge = NULL;
+@@ -366,25 +369,49 @@ static int match_cxl_nvdimm(struct devic
+ static void unregister_nvdimm_region(void *nd_region)
+ {
+-      struct cxl_nvdimm_bridge *cxl_nvb;
+-      struct cxl_pmem_region *cxlr_pmem;
++      nvdimm_region_delete(nd_region);
++}
++
++static int cxl_nvdimm_add_region(struct cxl_nvdimm *cxl_nvd,
++                               struct cxl_pmem_region *cxlr_pmem)
++{
++      int rc;
++
++      rc = xa_insert(&cxl_nvd->pmem_regions, (unsigned long)cxlr_pmem,
++                     cxlr_pmem, GFP_KERNEL);
++      if (rc)
++              return rc;
++
++      get_device(&cxlr_pmem->dev);
++      return 0;
++}
++
++static void cxl_nvdimm_del_region(struct cxl_nvdimm *cxl_nvd,
++                                struct cxl_pmem_region *cxlr_pmem)
++{
++      /*
++       * It is possible this is called without a corresponding
++       * cxl_nvdimm_add_region for @cxlr_pmem
++       */
++      cxlr_pmem = xa_erase(&cxl_nvd->pmem_regions, (unsigned long)cxlr_pmem);
++      if (cxlr_pmem)
++              put_device(&cxlr_pmem->dev);
++}
++
++static void release_mappings(void *data)
++{
+       int i;
++      struct cxl_pmem_region *cxlr_pmem = data;
++      struct cxl_nvdimm_bridge *cxl_nvb = cxlr_pmem->bridge;
+-      cxlr_pmem = nd_region_provider_data(nd_region);
+-      cxl_nvb = cxlr_pmem->bridge;
+       device_lock(&cxl_nvb->dev);
+       for (i = 0; i < cxlr_pmem->nr_mappings; i++) {
+               struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i];
+               struct cxl_nvdimm *cxl_nvd = m->cxl_nvd;
+-              if (cxl_nvd->region) {
+-                      put_device(&cxlr_pmem->dev);
+-                      cxl_nvd->region = NULL;
+-              }
++              cxl_nvdimm_del_region(cxl_nvd, cxlr_pmem);
+       }
+       device_unlock(&cxl_nvb->dev);
+-
+-      nvdimm_region_delete(nd_region);
+ }
+ static void cxlr_pmem_remove_resource(void *res)
+@@ -422,7 +449,7 @@ static int cxl_pmem_region_probe(struct
+       if (!cxl_nvb->nvdimm_bus) {
+               dev_dbg(dev, "nvdimm bus not found\n");
+               rc = -ENXIO;
+-              goto err;
++              goto out_nvb;
+       }
+       memset(&mappings, 0, sizeof(mappings));
+@@ -431,7 +458,7 @@ static int cxl_pmem_region_probe(struct
+       res = devm_kzalloc(dev, sizeof(*res), GFP_KERNEL);
+       if (!res) {
+               rc = -ENOMEM;
+-              goto err;
++              goto out_nvb;
+       }
+       res->name = "Persistent Memory";
+@@ -442,11 +469,11 @@ static int cxl_pmem_region_probe(struct
+       rc = insert_resource(&iomem_resource, res);
+       if (rc)
+-              goto err;
++              goto out_nvb;
+       rc = devm_add_action_or_reset(dev, cxlr_pmem_remove_resource, res);
+       if (rc)
+-              goto err;
++              goto out_nvb;
+       ndr_desc.res = res;
+       ndr_desc.provider_data = cxlr_pmem;
+@@ -462,7 +489,7 @@ static int cxl_pmem_region_probe(struct
+       nd_set = devm_kzalloc(dev, sizeof(*nd_set), GFP_KERNEL);
+       if (!nd_set) {
+               rc = -ENOMEM;
+-              goto err;
++              goto out_nvb;
+       }
+       ndr_desc.memregion = cxlr->id;
+@@ -472,9 +499,13 @@ static int cxl_pmem_region_probe(struct
+       info = kmalloc_array(cxlr_pmem->nr_mappings, sizeof(*info), GFP_KERNEL);
+       if (!info) {
+               rc = -ENOMEM;
+-              goto err;
++              goto out_nvb;
+       }
++      rc = devm_add_action_or_reset(dev, release_mappings, cxlr_pmem);
++      if (rc)
++              goto out_nvd;
++
+       for (i = 0; i < cxlr_pmem->nr_mappings; i++) {
+               struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i];
+               struct cxl_memdev *cxlmd = m->cxlmd;
+@@ -486,7 +517,7 @@ static int cxl_pmem_region_probe(struct
+                       dev_dbg(dev, "[%d]: %s: no cxl_nvdimm found\n", i,
+                               dev_name(&cxlmd->dev));
+                       rc = -ENODEV;
+-                      goto err;
++                      goto out_nvd;
+               }
+               /* safe to drop ref now with bridge lock held */
+@@ -498,10 +529,17 @@ static int cxl_pmem_region_probe(struct
+                       dev_dbg(dev, "[%d]: %s: no nvdimm found\n", i,
+                               dev_name(&cxlmd->dev));
+                       rc = -ENODEV;
+-                      goto err;
++                      goto out_nvd;
+               }
+-              cxl_nvd->region = cxlr_pmem;
+-              get_device(&cxlr_pmem->dev);
++
++              /*
++               * Pin the region per nvdimm device as those may be released
++               * out-of-order with respect to the region, and a single nvdimm
++               * may be associated with multiple regions
++               */
++              rc = cxl_nvdimm_add_region(cxl_nvd, cxlr_pmem);
++              if (rc)
++                      goto out_nvd;
+               m->cxl_nvd = cxl_nvd;
+               mappings[i] = (struct nd_mapping_desc) {
+                       .nvdimm = nvdimm,
+@@ -527,27 +565,18 @@ static int cxl_pmem_region_probe(struct
+               nvdimm_pmem_region_create(cxl_nvb->nvdimm_bus, &ndr_desc);
+       if (!cxlr_pmem->nd_region) {
+               rc = -ENOMEM;
+-              goto err;
++              goto out_nvd;
+       }
+       rc = devm_add_action_or_reset(dev, unregister_nvdimm_region,
+                                     cxlr_pmem->nd_region);
+-out:
++out_nvd:
+       kfree(info);
++out_nvb:
+       device_unlock(&cxl_nvb->dev);
+       put_device(&cxl_nvb->dev);
+       return rc;
+-
+-err:
+-      dev_dbg(dev, "failed to create nvdimm region\n");
+-      for (i--; i >= 0; i--) {
+-              nvdimm = mappings[i].nvdimm;
+-              cxl_nvd = nvdimm_provider_data(nvdimm);
+-              put_device(&cxl_nvd->region->dev);
+-              cxl_nvd->region = NULL;
+-      }
+-      goto out;
+ }
+ static struct cxl_driver cxl_pmem_region_driver = {
diff --git a/queue-6.0/cxl-region-fix-cxl_region-leak-cleanup-targets-at-region-delete.patch b/queue-6.0/cxl-region-fix-cxl_region-leak-cleanup-targets-at-region-delete.patch
new file mode 100644 (file)
index 0000000..df8a3bd
--- /dev/null
@@ -0,0 +1,49 @@
+From 0d9e734018d70cecf79e2e4c6082167160a0f13f Mon Sep 17 00:00:00 2001
+From: Dan Williams <dan.j.williams@intel.com>
+Date: Thu, 3 Nov 2022 17:30:30 -0700
+Subject: cxl/region: Fix cxl_region leak, cleanup targets at region delete
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+commit 0d9e734018d70cecf79e2e4c6082167160a0f13f upstream.
+
+When a region is deleted, any targets that have been previously assigned
+to that region hold references to it. Trigger those references to
+drop by detaching all targets at unregister_region() time.
+
+Otherwise that region object will leak as userspace has lost the ability
+to detach targets once region sysfs is torn down.
+
+Cc: <stable@vger.kernel.org>
+Fixes: b9686e8c8e39 ("cxl/region: Enable the assignment of endpoint decoders to regions")
+Reviewed-by: Dave Jiang <dave.jiang@intel.com>
+Reviewed-by: Vishal Verma <vishal.l.verma@intel.com>
+Link: https://lore.kernel.org/r/166752183055.947915.17681995648556534844.stgit@dwillia2-xfh.jf.intel.com
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/cxl/core/region.c |   11 +++++++++++
+ 1 file changed, 11 insertions(+)
+
+--- a/drivers/cxl/core/region.c
++++ b/drivers/cxl/core/region.c
+@@ -1556,8 +1556,19 @@ static struct cxl_region *to_cxl_region(
+ static void unregister_region(void *dev)
+ {
+       struct cxl_region *cxlr = to_cxl_region(dev);
++      struct cxl_region_params *p = &cxlr->params;
++      int i;
+       device_del(dev);
++
++      /*
++       * Now that region sysfs is shutdown, the parameter block is now
++       * read-only, so no need to hold the region rwsem to access the
++       * region parameters.
++       */
++      for (i = 0; i < p->interleave_ways; i++)
++              detach_target(cxlr, i);
++
+       cxl_region_iomem_release(cxlr);
+       put_device(dev);
+ }
diff --git a/queue-6.0/cxl-region-fix-decoder-allocation-crash.patch b/queue-6.0/cxl-region-fix-decoder-allocation-crash.patch
new file mode 100644 (file)
index 0000000..3a3a27e
--- /dev/null
@@ -0,0 +1,144 @@
+From 71ee71d7adcba648077997a29a91158d20c40b09 Mon Sep 17 00:00:00 2001
+From: Vishal Verma <vishal.l.verma@intel.com>
+Date: Tue, 1 Nov 2022 01:41:00 -0600
+Subject: cxl/region: Fix decoder allocation crash
+
+From: Vishal Verma <vishal.l.verma@intel.com>
+
+commit 71ee71d7adcba648077997a29a91158d20c40b09 upstream.
+
+When an intermediate port's decoders have been exhausted by existing
+regions, and creating a new region with the port in question in its
+hierarchical path is attempted, cxl_port_attach_region() fails to find a
+port decoder (as would be expected), and drops into the failure / cleanup
+path.
+
+However, during cleanup of the region reference, a sanity check attempts
+to dereference the decoder, which in the above case didn't exist. This
+causes a NULL pointer dereference BUG.
+
+To fix this, refactor the decoder allocation and de-allocation into
+helper routines, and in this 'free' routine, check that the decoder,
+@cxld, is valid before attempting any operations on it.
+
+Cc: <stable@vger.kernel.org>
+Suggested-by: Dan Williams <dan.j.williams@intel.com>
+Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
+Reviewed-by: Dave Jiang <dave.jiang@intel.com>
+Fixes: 384e624bb211 ("cxl/region: Attach endpoint decoders")
+Link: https://lore.kernel.org/r/20221101074100.1732003-1-vishal.l.verma@intel.com
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/cxl/core/region.c |   67 ++++++++++++++++++++++++++++------------------
+ 1 file changed, 41 insertions(+), 26 deletions(-)
+
+--- a/drivers/cxl/core/region.c
++++ b/drivers/cxl/core/region.c
+@@ -686,18 +686,27 @@ static struct cxl_region_ref *alloc_regi
+       return cxl_rr;
+ }
+-static void free_region_ref(struct cxl_region_ref *cxl_rr)
++static void cxl_rr_free_decoder(struct cxl_region_ref *cxl_rr)
+ {
+-      struct cxl_port *port = cxl_rr->port;
+       struct cxl_region *cxlr = cxl_rr->region;
+       struct cxl_decoder *cxld = cxl_rr->decoder;
++      if (!cxld)
++              return;
++
+       dev_WARN_ONCE(&cxlr->dev, cxld->region != cxlr, "region mismatch\n");
+       if (cxld->region == cxlr) {
+               cxld->region = NULL;
+               put_device(&cxlr->dev);
+       }
++}
++static void free_region_ref(struct cxl_region_ref *cxl_rr)
++{
++      struct cxl_port *port = cxl_rr->port;
++      struct cxl_region *cxlr = cxl_rr->region;
++
++      cxl_rr_free_decoder(cxl_rr);
+       xa_erase(&port->regions, (unsigned long)cxlr);
+       xa_destroy(&cxl_rr->endpoints);
+       kfree(cxl_rr);
+@@ -728,6 +737,33 @@ static int cxl_rr_ep_add(struct cxl_regi
+       return 0;
+ }
++static int cxl_rr_alloc_decoder(struct cxl_port *port, struct cxl_region *cxlr,
++                              struct cxl_endpoint_decoder *cxled,
++                              struct cxl_region_ref *cxl_rr)
++{
++      struct cxl_decoder *cxld;
++
++      if (port == cxled_to_port(cxled))
++              cxld = &cxled->cxld;
++      else
++              cxld = cxl_region_find_decoder(port, cxlr);
++      if (!cxld) {
++              dev_dbg(&cxlr->dev, "%s: no decoder available\n",
++                      dev_name(&port->dev));
++              return -EBUSY;
++      }
++
++      if (cxld->region) {
++              dev_dbg(&cxlr->dev, "%s: %s already attached to %s\n",
++                      dev_name(&port->dev), dev_name(&cxld->dev),
++                      dev_name(&cxld->region->dev));
++              return -EBUSY;
++      }
++
++      cxl_rr->decoder = cxld;
++      return 0;
++}
++
+ /**
+  * cxl_port_attach_region() - track a region's interest in a port by endpoint
+  * @port: port to add a new region reference 'struct cxl_region_ref'
+@@ -794,12 +830,6 @@ static int cxl_port_attach_region(struct
+                       cxl_rr->nr_targets++;
+                       nr_targets_inc = true;
+               }
+-
+-              /*
+-               * The decoder for @cxlr was allocated when the region was first
+-               * attached to @port.
+-               */
+-              cxld = cxl_rr->decoder;
+       } else {
+               cxl_rr = alloc_region_ref(port, cxlr);
+               if (IS_ERR(cxl_rr)) {
+@@ -810,26 +840,11 @@ static int cxl_port_attach_region(struct
+               }
+               nr_targets_inc = true;
+-              if (port == cxled_to_port(cxled))
+-                      cxld = &cxled->cxld;
+-              else
+-                      cxld = cxl_region_find_decoder(port, cxlr);
+-              if (!cxld) {
+-                      dev_dbg(&cxlr->dev, "%s: no decoder available\n",
+-                              dev_name(&port->dev));
+-                      goto out_erase;
+-              }
+-
+-              if (cxld->region) {
+-                      dev_dbg(&cxlr->dev, "%s: %s already attached to %s\n",
+-                              dev_name(&port->dev), dev_name(&cxld->dev),
+-                              dev_name(&cxld->region->dev));
+-                      rc = -EBUSY;
++              rc = cxl_rr_alloc_decoder(port, cxlr, cxled, cxl_rr);
++              if (rc)
+                       goto out_erase;
+-              }
+-
+-              cxl_rr->decoder = cxld;
+       }
++      cxld = cxl_rr->decoder;
+       rc = cxl_rr_ep_add(cxl_rr, cxled);
+       if (rc) {
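
The crash comes down to a cleanup helper dereferencing a decoder pointer that was never assigned. A tiny stand-alone sketch of the guarded-free pattern the refactor introduces (types and names are illustrative, not the cxl core's):

#include <stdio.h>

struct decoder { int id; };

struct region_ref {
        struct decoder *decoder;        /* may never have been assigned */
};

/*
 * Cleanup helper: bail out early when no decoder was ever allocated for
 * this reference instead of dereferencing a NULL pointer.
 */
static void free_decoder(struct region_ref *ref)
{
        if (!ref->decoder)
                return;
        printf("releasing decoder %d\n", ref->decoder->id);
        ref->decoder = NULL;
}

int main(void)
{
        struct region_ref ref = { NULL };       /* allocation never happened */

        free_decoder(&ref);     /* safe no-op rather than a crash */
        return 0;
}
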
diff --git a/queue-6.0/cxl-region-fix-distance-calculation-with-passthrough-ports.patch b/queue-6.0/cxl-region-fix-distance-calculation-with-passthrough-ports.patch
new file mode 100644 (file)
index 0000000..049b207
--- /dev/null
@@ -0,0 +1,114 @@
+From e4f6dfa9ef756a3934a4caf618b1e86e9e8e21d0 Mon Sep 17 00:00:00 2001
+From: Dan Williams <dan.j.williams@intel.com>
+Date: Thu, 3 Nov 2022 17:30:54 -0700
+Subject: cxl/region: Fix 'distance' calculation with passthrough ports
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+commit e4f6dfa9ef756a3934a4caf618b1e86e9e8e21d0 upstream.
+
+When programming port decode targets, the algorithm wants to ensure that
+two devices are compatible to be programmed as peers beneath a given
+port. A compatible peer is a target that shares the same dport, and
+where that target's interleave position also routes it to the same
+dport. Compatibility is determined by the device's interleave position
+being >= to distance. For example, if a given dport can only map every
+Nth position then positions less than N away from the last target
+programmed are incompatible.
+
+The @distance for the host-bridge's cxl_port in a simple dual-ported
+host-bridge configuration with 2 direct-attached devices is 1, i.e. an
+x2 region divided by 2 dports to reach 2 region targets.
+
+An x4 region under an x2 host-bridge would need 2 intervening switches
+where the @distance at the host bridge level is 2 (x4 region divided by
+2 switches to reach 4 devices).
+
+However, the distance between peers underneath a single ported
+host-bridge is always zero because there is no limit to the number of
+devices that can be mapped. In other words, there are no decoders to
+program in a passthrough, all descendants are mapped, and distance only
+starts to matter for the intervening descendant ports of the passthrough
+port.
+
+Add tracking for the number of dports mapped to a port, and use that to
+detect the passthrough case for calculating @distance.
+
+Cc: <stable@vger.kernel.org>
+Reported-by: Bobo WL <lmw.bobo@gmail.com>
+Reported-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
+Link: http://lore.kernel.org/r/20221010172057.00001559@huawei.com
+Fixes: 27b3f8d13830 ("cxl/region: Program target lists")
+Reviewed-by: Vishal Verma <vishal.l.verma@intel.com>
+Link: https://lore.kernel.org/r/166752185440.947915.6617495912508299445.stgit@dwillia2-xfh.jf.intel.com
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/cxl/core/port.c   |   11 +++++++++--
+ drivers/cxl/core/region.c |    9 ++++++++-
+ drivers/cxl/cxl.h         |    2 ++
+ 3 files changed, 19 insertions(+), 3 deletions(-)
+
+--- a/drivers/cxl/core/port.c
++++ b/drivers/cxl/core/port.c
+@@ -811,6 +811,7 @@ static struct cxl_dport *find_dport(stru
+ static int add_dport(struct cxl_port *port, struct cxl_dport *new)
+ {
+       struct cxl_dport *dup;
++      int rc;
+       device_lock_assert(&port->dev);
+       dup = find_dport(port, new->port_id);
+@@ -821,8 +822,14 @@ static int add_dport(struct cxl_port *po
+                       dev_name(dup->dport));
+               return -EBUSY;
+       }
+-      return xa_insert(&port->dports, (unsigned long)new->dport, new,
+-                       GFP_KERNEL);
++
++      rc = xa_insert(&port->dports, (unsigned long)new->dport, new,
++                     GFP_KERNEL);
++      if (rc)
++              return rc;
++
++      port->nr_dports++;
++      return 0;
+ }
+ /*
+--- a/drivers/cxl/core/region.c
++++ b/drivers/cxl/core/region.c
+@@ -989,7 +989,14 @@ static int cxl_port_setup_targets(struct
+       if (cxl_rr->nr_targets_set) {
+               int i, distance;
+-              distance = p->nr_targets / cxl_rr->nr_targets;
++              /*
++               * Passthrough ports impose no distance requirements between
++               * peers
++               */
++              if (port->nr_dports == 1)
++                      distance = 0;
++              else
++                      distance = p->nr_targets / cxl_rr->nr_targets;
+               for (i = 0; i < cxl_rr->nr_targets_set; i++)
+                       if (ep->dport == cxlsd->target[i]) {
+                               rc = check_last_peer(cxled, ep, cxl_rr,
+--- a/drivers/cxl/cxl.h
++++ b/drivers/cxl/cxl.h
+@@ -457,6 +457,7 @@ struct cxl_pmem_region {
+  * @regions: cxl_region_ref instances, regions mapped by this port
+  * @parent_dport: dport that points to this port in the parent
+  * @decoder_ida: allocator for decoder ids
++ * @nr_dports: number of entries in @dports
+  * @hdm_end: track last allocated HDM decoder instance for allocation ordering
+  * @commit_end: cursor to track highest committed decoder for commit ordering
+  * @component_reg_phys: component register capability base address (optional)
+@@ -475,6 +476,7 @@ struct cxl_port {
+       struct xarray regions;
+       struct cxl_dport *parent_dport;
+       struct ida decoder_ida;
++      int nr_dports;
+       int hdm_end;
+       int commit_end;
+       resource_size_t component_reg_phys;
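
To make the arithmetic in the changelog concrete, here is a small user-space sketch of the distance rule; the parameters mirror p->nr_targets and cxl_rr->nr_targets from the patch, and the helper name is made up:

#include <stdio.h>

/*
 * Distance between peer targets beneath a port:
 *   - passthrough (a single dport): nothing to program, distance is 0
 *   - otherwise: region targets divided by the targets reached via this port
 */
static int peer_distance(int nr_dports, int region_targets, int port_targets)
{
        if (nr_dports == 1)
                return 0;
        return region_targets / port_targets;
}

int main(void)
{
        /* x2 region, dual-ported host bridge, 2 direct-attached devices */
        printf("%d\n", peer_distance(2, 2, 2));         /* 1 */
        /* x4 region, x2 host bridge, 2 intervening switches */
        printf("%d\n", peer_distance(2, 4, 2));         /* 2 */
        /* passthrough: single-ported host bridge */
        printf("%d\n", peer_distance(1, 4, 4));         /* 0 */
        return 0;
}
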
diff --git a/queue-6.0/cxl-region-fix-region-hpa-ordering-validation.patch b/queue-6.0/cxl-region-fix-region-hpa-ordering-validation.patch
new file mode 100644 (file)
index 0000000..691261b
--- /dev/null
@@ -0,0 +1,54 @@
+From a90accb358ae33ea982a35595573f7a045993f8b Mon Sep 17 00:00:00 2001
+From: Dan Williams <dan.j.williams@intel.com>
+Date: Thu, 3 Nov 2022 17:30:24 -0700
+Subject: cxl/region: Fix region HPA ordering validation
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+commit a90accb358ae33ea982a35595573f7a045993f8b upstream.
+
+Some regions may not have any address space allocated. Skip them when
+validating HPA order, otherwise a crash like the following may result:
+
+ devm_cxl_add_region: cxl_acpi cxl_acpi.0: decoder3.4: created region9
+ BUG: kernel NULL pointer dereference, address: 0000000000000000
+ [..]
+ RIP: 0010:store_targetN+0x655/0x1740 [cxl_core]
+ [..]
+ Call Trace:
+  <TASK>
+  kernfs_fop_write_iter+0x144/0x200
+  vfs_write+0x24a/0x4d0
+  ksys_write+0x69/0xf0
+  do_syscall_64+0x3a/0x90
+
+store_targetN+0x655/0x1740:
+alloc_region_ref at drivers/cxl/core/region.c:676
+(inlined by) cxl_port_attach_region at drivers/cxl/core/region.c:850
+(inlined by) cxl_region_attach at drivers/cxl/core/region.c:1290
+(inlined by) attach_target at drivers/cxl/core/region.c:1410
+(inlined by) store_targetN at drivers/cxl/core/region.c:1453
+
+Cc: <stable@vger.kernel.org>
+Fixes: 384e624bb211 ("cxl/region: Attach endpoint decoders")
+Reviewed-by: Vishal Verma <vishal.l.verma@intel.com>
+Reviewed-by: Dave Jiang <dave.jiang@intel.com>
+Link: https://lore.kernel.org/r/166752182461.947915.497032805239915067.stgit@dwillia2-xfh.jf.intel.com
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/cxl/core/region.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/drivers/cxl/core/region.c
++++ b/drivers/cxl/core/region.c
+@@ -657,6 +657,9 @@ static struct cxl_region_ref *alloc_regi
+       xa_for_each(&port->regions, index, iter) {
+               struct cxl_region_params *ip = &iter->region->params;
++              if (!ip->res)
++                      continue;
++
+               if (ip->res->start > p->res->start) {
+                       dev_dbg(&cxlr->dev,
+                               "%s: HPA order violation %s:%pr vs %pr\n",
diff --git a/queue-6.0/ftrace-fix-use-after-free-for-dynamic-ftrace_ops.patch b/queue-6.0/ftrace-fix-use-after-free-for-dynamic-ftrace_ops.patch
new file mode 100644 (file)
index 0000000..e0b1c9c
--- /dev/null
@@ -0,0 +1,139 @@
+From 0e792b89e6800cd9cb4757a76a96f7ef3e8b6294 Mon Sep 17 00:00:00 2001
+From: Li Huafei <lihuafei1@huawei.com>
+Date: Thu, 3 Nov 2022 11:10:10 +0800
+Subject: ftrace: Fix use-after-free for dynamic ftrace_ops
+
+From: Li Huafei <lihuafei1@huawei.com>
+
+commit 0e792b89e6800cd9cb4757a76a96f7ef3e8b6294 upstream.
+
+KASAN reported a use-after-free with ftrace ops [1]. It was found from
+vmcore that perf had registered two ops with the same content
+successively, both dynamic. After unregistering the second ops, a
+use-after-free occurred.
+
+In ftrace_shutdown(), when the second ops is unregistered, the
+FTRACE_UPDATE_CALLS command is not set because there is another enabled
+ops with the same content.  Also, both ops are dynamic and the ftrace
+callback function is ftrace_ops_list_func, so the
+FTRACE_UPDATE_TRACE_FUNC command will not be set. Eventually the value
+of 'command' will be 0 and ftrace_shutdown() will skip the rcu
+synchronization.
+
+However, ftrace may be activated. When the ops is released, another CPU
+may be accessing the ops.  Add the missing synchronization to fix this
+problem.
+
+[1]
+BUG: KASAN: use-after-free in __ftrace_ops_list_func kernel/trace/ftrace.c:7020 [inline]
+BUG: KASAN: use-after-free in ftrace_ops_list_func+0x2b0/0x31c kernel/trace/ftrace.c:7049
+Read of size 8 at addr ffff56551965bbc8 by task syz-executor.2/14468
+
+CPU: 1 PID: 14468 Comm: syz-executor.2 Not tainted 5.10.0 #7
+Hardware name: linux,dummy-virt (DT)
+Call trace:
+ dump_backtrace+0x0/0x40c arch/arm64/kernel/stacktrace.c:132
+ show_stack+0x30/0x40 arch/arm64/kernel/stacktrace.c:196
+ __dump_stack lib/dump_stack.c:77 [inline]
+ dump_stack+0x1b4/0x248 lib/dump_stack.c:118
+ print_address_description.constprop.0+0x28/0x48c mm/kasan/report.c:387
+ __kasan_report mm/kasan/report.c:547 [inline]
+ kasan_report+0x118/0x210 mm/kasan/report.c:564
+ check_memory_region_inline mm/kasan/generic.c:187 [inline]
+ __asan_load8+0x98/0xc0 mm/kasan/generic.c:253
+ __ftrace_ops_list_func kernel/trace/ftrace.c:7020 [inline]
+ ftrace_ops_list_func+0x2b0/0x31c kernel/trace/ftrace.c:7049
+ ftrace_graph_call+0x0/0x4
+ __might_sleep+0x8/0x100 include/linux/perf_event.h:1170
+ __might_fault mm/memory.c:5183 [inline]
+ __might_fault+0x58/0x70 mm/memory.c:5171
+ do_strncpy_from_user lib/strncpy_from_user.c:41 [inline]
+ strncpy_from_user+0x1f4/0x4b0 lib/strncpy_from_user.c:139
+ getname_flags+0xb0/0x31c fs/namei.c:149
+ getname+0x2c/0x40 fs/namei.c:209
+ [...]
+
+Allocated by task 14445:
+ kasan_save_stack+0x24/0x50 mm/kasan/common.c:48
+ kasan_set_track mm/kasan/common.c:56 [inline]
+ __kasan_kmalloc mm/kasan/common.c:479 [inline]
+ __kasan_kmalloc.constprop.0+0x110/0x13c mm/kasan/common.c:449
+ kasan_kmalloc+0xc/0x14 mm/kasan/common.c:493
+ kmem_cache_alloc_trace+0x440/0x924 mm/slub.c:2950
+ kmalloc include/linux/slab.h:563 [inline]
+ kzalloc include/linux/slab.h:675 [inline]
+ perf_event_alloc.part.0+0xb4/0x1350 kernel/events/core.c:11230
+ perf_event_alloc kernel/events/core.c:11733 [inline]
+ __do_sys_perf_event_open kernel/events/core.c:11831 [inline]
+ __se_sys_perf_event_open+0x550/0x15f4 kernel/events/core.c:11723
+ __arm64_sys_perf_event_open+0x6c/0x80 kernel/events/core.c:11723
+ [...]
+
+Freed by task 14445:
+ kasan_save_stack+0x24/0x50 mm/kasan/common.c:48
+ kasan_set_track+0x24/0x34 mm/kasan/common.c:56
+ kasan_set_free_info+0x20/0x40 mm/kasan/generic.c:358
+ __kasan_slab_free.part.0+0x11c/0x1b0 mm/kasan/common.c:437
+ __kasan_slab_free mm/kasan/common.c:445 [inline]
+ kasan_slab_free+0x2c/0x40 mm/kasan/common.c:446
+ slab_free_hook mm/slub.c:1569 [inline]
+ slab_free_freelist_hook mm/slub.c:1608 [inline]
+ slab_free mm/slub.c:3179 [inline]
+ kfree+0x12c/0xc10 mm/slub.c:4176
+ perf_event_alloc.part.0+0xa0c/0x1350 kernel/events/core.c:11434
+ perf_event_alloc kernel/events/core.c:11733 [inline]
+ __do_sys_perf_event_open kernel/events/core.c:11831 [inline]
+ __se_sys_perf_event_open+0x550/0x15f4 kernel/events/core.c:11723
+ [...]
+
+Link: https://lore.kernel.org/linux-trace-kernel/20221103031010.166498-1-lihuafei1@huawei.com
+
+Fixes: edb096e00724f ("ftrace: Fix memleak when unregistering dynamic ops when tracing disabled")
+Cc: stable@vger.kernel.org
+Suggested-by: Steven Rostedt <rostedt@goodmis.org>
+Signed-off-by: Li Huafei <lihuafei1@huawei.com>
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/ftrace.c |   16 +++-------------
+ 1 file changed, 3 insertions(+), 13 deletions(-)
+
+--- a/kernel/trace/ftrace.c
++++ b/kernel/trace/ftrace.c
+@@ -3031,18 +3031,8 @@ int ftrace_shutdown(struct ftrace_ops *o
+               command |= FTRACE_UPDATE_TRACE_FUNC;
+       }
+-      if (!command || !ftrace_enabled) {
+-              /*
+-               * If these are dynamic or per_cpu ops, they still
+-               * need their data freed. Since, function tracing is
+-               * not currently active, we can just free them
+-               * without synchronizing all CPUs.
+-               */
+-              if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
+-                      goto free_ops;
+-
+-              return 0;
+-      }
++      if (!command || !ftrace_enabled)
++              goto out;
+       /*
+        * If the ops uses a trampoline, then it needs to be
+@@ -3079,6 +3069,7 @@ int ftrace_shutdown(struct ftrace_ops *o
+       removed_ops = NULL;
+       ops->flags &= ~FTRACE_OPS_FL_REMOVING;
++out:
+       /*
+        * Dynamic ops may be freed, we must make sure that all
+        * callers are done before leaving this function.
+@@ -3106,7 +3097,6 @@ int ftrace_shutdown(struct ftrace_ops *o
+               if (IS_ENABLED(CONFIG_PREEMPTION))
+                       synchronize_rcu_tasks();
+- free_ops:
+               ftrace_trampoline_free(ops);
+       }
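
The essence of the fix is ordering: unpublish the ops, wait for every in-flight caller to finish, and only then free it, even when no trace-function update command was needed. Below is a rough user-space analogy with an atomic reader count standing in for the RCU synchronization the kernel relies on; it only illustrates the ordering, not ftrace internals:

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>
#include <unistd.h>

/* Toy stand-in for a dynamically allocated ftrace_ops. */
struct ops { int pattern; };

static struct ops *_Atomic current_ops;
static atomic_int readers;      /* stand-in for an RCU grace period */

static void *reader(void *arg)
{
        (void)arg;
        for (int i = 0; i < 100000; i++) {
                atomic_fetch_add(&readers, 1);
                struct ops *ops = atomic_load(&current_ops);
                if (ops)
                        (void)ops->pattern;     /* "call" into the ops */
                atomic_fetch_sub(&readers, 1);
        }
        return NULL;
}

int main(void)
{
        pthread_t t;
        struct ops *ops = malloc(sizeof(*ops));

        if (!ops)
                return 1;
        ops->pattern = 42;
        atomic_store(&current_ops, ops);
        pthread_create(&t, NULL, reader, NULL);

        /* Unregister: unpublish first ... */
        atomic_store(&current_ops, (struct ops *)NULL);

        /*
         * ... then wait for every caller that may still hold the old
         * pointer before freeing it.  Skipping this wait when no update
         * command was issued is the use-after-free the patch closes.
         */
        while (atomic_load(&readers))
                usleep(100);

        free(ops);
        pthread_join(t, NULL);
        return 0;
}
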
diff --git a/queue-6.0/series b/queue-6.0/series
index 6ac76c59ad772c4362d86b914a392523403898bb..4104a121e2f6201ee61798944d70c8ea45469bfc 100644 (file)
--- a/queue-6.0/series
@@ -127,3 +127,14 @@ fscrypt-stop-using-keyrings-subsystem-for-fscrypt_master_key.patch
 fscrypt-fix-keyring-memory-leak-on-mount-failure.patch
 clk-renesas-r8a779g0-add-sasyncper-clocks.patch
 btrfs-fix-lost-file-sync-on-direct-io-write-with-nowait-and-dsync-iocb.patch
+btrfs-fix-tree-mod-log-mishandling-of-reallocated-nodes.patch
+btrfs-fix-type-of-parameter-generation-in-btrfs_get_dentry.patch
+btrfs-don-t-use-btrfs_chunk-sub_stripes-from-disk.patch
+btrfs-fix-a-memory-allocation-failure-test-in-btrfs_submit_direct.patch
+acpi-numa-add-cxl-cfmws-nodes-to-the-possible-nodes-set.patch
+cxl-pmem-fix-cxl_pmem_region-and-cxl_memdev-leak.patch
+cxl-region-fix-decoder-allocation-crash.patch
+cxl-region-fix-region-hpa-ordering-validation.patch
+cxl-region-fix-cxl_region-leak-cleanup-targets-at-region-delete.patch
+cxl-region-fix-distance-calculation-with-passthrough-ports.patch
+ftrace-fix-use-after-free-for-dynamic-ftrace_ops.patch