git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
Fixes for 5.15
author Sasha Levin <sashal@kernel.org>
Mon, 29 Nov 2021 03:16:54 +0000 (22:16 -0500)
committer Sasha Levin <sashal@kernel.org>
Mon, 29 Nov 2021 03:16:54 +0000 (22:16 -0500)
Signed-off-by: Sasha Levin <sashal@kernel.org>
12 files changed:
queue-5.15/ceph-properly-handle-statfs-on-multifs-setups.patch [new file with mode: 0644]
queue-5.15/cifs-nosharesock-should-not-share-socket-with-future.patch [new file with mode: 0644]
queue-5.15/f2fs-quota-fix-potential-deadlock.patch [new file with mode: 0644]
queue-5.15/f2fs-set-sbi_need_fsck-flag-when-inconsistent-node-b.patch [new file with mode: 0644]
queue-5.15/iommu-rockchip-fix-page_desc_hi_masks-for-rk3568.patch [new file with mode: 0644]
queue-5.15/iommu-vt-d-fix-unmap_pages-support.patch [new file with mode: 0644]
queue-5.15/locking-rwsem-make-handoff-bit-handling-more-consist.patch [new file with mode: 0644]
queue-5.15/perf-ignore-sigtrap-for-tracepoints-destined-for-oth.patch [new file with mode: 0644]
queue-5.15/riscv-dts-microchip-drop-duplicated-mmc-sdhc-node.patch [new file with mode: 0644]
queue-5.15/riscv-dts-microchip-fix-board-compatible.patch [new file with mode: 0644]
queue-5.15/sched-scs-reset-task-stack-state-in-bringup_cpu.patch [new file with mode: 0644]
queue-5.15/series

diff --git a/queue-5.15/ceph-properly-handle-statfs-on-multifs-setups.patch b/queue-5.15/ceph-properly-handle-statfs-on-multifs-setups.patch
new file mode 100644 (file)
index 0000000..7289c56
--- /dev/null
@@ -0,0 +1,67 @@
+From 07f330a82e38cf1b6740332b89dc2a2e22e5703e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 5 Oct 2021 11:12:58 -0400
+Subject: ceph: properly handle statfs on multifs setups
+
+From: Jeff Layton <jlayton@kernel.org>
+
+[ Upstream commit 8cfc0c7ed34f7929ce7e5d7c6eecf4d01ba89a84 ]
+
+ceph_statfs currently stuffs the cluster fsid into the f_fsid field.
+This was fine when we only had a single filesystem per cluster, but now
+that we have multiples we need to use something that will vary between
+them.
+
+Change ceph_statfs to xor each 32-bit chunk of the fsid (aka cluster id)
+into the lower bits of the statfs->f_fsid. Change the upper bits to hold
+the fscid (filesystem ID within the cluster).
+
+That should give us a value that is guaranteed to be unique between
+filesystems within a cluster, and should minimize the chance of
+collisions between mounts of different clusters.
+
+URL: https://tracker.ceph.com/issues/52812
+Reported-by: Sachin Prabhu <sprabhu@redhat.com>
+Signed-off-by: Jeff Layton <jlayton@kernel.org>
+Reviewed-by: Xiubo Li <xiubli@redhat.com>
+Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ceph/super.c | 11 ++++++-----
+ 1 file changed, 6 insertions(+), 5 deletions(-)
+
+diff --git a/fs/ceph/super.c b/fs/ceph/super.c
+index fd8742bae8471..202ddde3d62ad 100644
+--- a/fs/ceph/super.c
++++ b/fs/ceph/super.c
+@@ -52,8 +52,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
+       struct ceph_fs_client *fsc = ceph_inode_to_client(d_inode(dentry));
+       struct ceph_mon_client *monc = &fsc->client->monc;
+       struct ceph_statfs st;
+-      u64 fsid;
+-      int err;
++      int i, err;
+       u64 data_pool;
+       if (fsc->mdsc->mdsmap->m_num_data_pg_pools == 1) {
+@@ -99,12 +98,14 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
+       buf->f_namelen = NAME_MAX;
+       /* Must convert the fsid, for consistent values across arches */
++      buf->f_fsid.val[0] = 0;
+       mutex_lock(&monc->mutex);
+-      fsid = le64_to_cpu(*(__le64 *)(&monc->monmap->fsid)) ^
+-             le64_to_cpu(*((__le64 *)&monc->monmap->fsid + 1));
++      for (i = 0 ; i < sizeof(monc->monmap->fsid) / sizeof(__le32) ; ++i)
++              buf->f_fsid.val[0] ^= le32_to_cpu(((__le32 *)&monc->monmap->fsid)[i]);
+       mutex_unlock(&monc->mutex);
+-      buf->f_fsid = u64_to_fsid(fsid);
++      /* fold the fs_cluster_id into the upper bits */
++      buf->f_fsid.val[1] = monc->fs_cluster_id;
+       return 0;
+ }
+-- 
+2.33.0
+
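For illustration, the folding is easy to reproduce in userspace. The sketch
below is not the kernel code: the struct and function names are made up, and
le32_to_cpu() is assumed to be a no-op (i.e. a little-endian host). It XORs
each 32-bit chunk of a 16-byte fsid into val[0] and puts the fscid in val[1],
the layout the patch describes.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct fake_fsid { uint32_t val[2]; };

static struct fake_fsid fold_fsid(const uint8_t cluster_fsid[16], uint32_t fscid)
{
    struct fake_fsid out = { { 0, 0 } };
    uint32_t chunk;
    size_t i;

    /* XOR each 32-bit chunk of the cluster fsid into the lower word */
    for (i = 0; i < 16; i += sizeof(chunk)) {
        memcpy(&chunk, cluster_fsid + i, sizeof(chunk));
        out.val[0] ^= chunk;
    }
    /* fold the filesystem ID within the cluster into the upper word */
    out.val[1] = fscid;
    return out;
}

int main(void)
{
    const uint8_t fsid[16] = { 0xde, 0xad, 0xbe, 0xef, 1, 2, 3, 4,
                               5, 6, 7, 8, 9, 10, 11, 12 };
    struct fake_fsid f = fold_fsid(fsid, 3);

    printf("f_fsid = %08x:%08x\n", f.val[0], f.val[1]);
    return 0;
}
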
diff --git a/queue-5.15/cifs-nosharesock-should-not-share-socket-with-future.patch b/queue-5.15/cifs-nosharesock-should-not-share-socket-with-future.patch
new file mode 100644 (file)
index 0000000..66c39dd
--- /dev/null
@@ -0,0 +1,73 @@
+From 28192917a4520ee82ab741cefdba79ea2c39cbb7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 6 Nov 2021 11:31:53 +0000
+Subject: cifs: nosharesock should not share socket with future sessions
+
+From: Shyam Prasad N <sprasad@microsoft.com>
+
+[ Upstream commit c9f1c19cf7c50949885fa5afdb2cb242d61a7fac ]
+
+Today, when a new mount is done with nosharesock, we ensure
+that we don't select an existing matching session. However,
+we don't mark the connection as nosharesock, which means that
+it could be shared with future sessions.
+
+Fix it with this commit. Also print this info in DebugData.
+
+Signed-off-by: Shyam Prasad N <sprasad@microsoft.com>
+Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz>
+Signed-off-by: Steve French <stfrench@microsoft.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/cifs/cifs_debug.c | 2 ++
+ fs/cifs/cifsglob.h   | 1 +
+ fs/cifs/connect.c    | 8 +++++++-
+ 3 files changed, 10 insertions(+), 1 deletion(-)
+
+diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
+index de2c12bcfa4bc..905a901f7f80b 100644
+--- a/fs/cifs/cifs_debug.c
++++ b/fs/cifs/cifs_debug.c
+@@ -358,6 +358,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
+                       seq_printf(m, " signed");
+               if (server->posix_ext_supported)
+                       seq_printf(m, " posix");
++              if (server->nosharesock)
++                      seq_printf(m, " nosharesock");
+               if (server->rdma)
+                       seq_printf(m, "\nRDMA ");
+diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
+index dea4c929d3f46..3e5b8e177cfa7 100644
+--- a/fs/cifs/cifsglob.h
++++ b/fs/cifs/cifsglob.h
+@@ -592,6 +592,7 @@ struct TCP_Server_Info {
+       struct list_head pending_mid_q;
+       bool noblocksnd;                /* use blocking sendmsg */
+       bool noautotune;                /* do not autotune send buf sizes */
++      bool nosharesock;
+       bool tcp_nodelay;
+       unsigned int credits;  /* send no more requests at once */
+       unsigned int max_credits; /* can override large 32000 default at mnt */
+diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
+index e757ee52cc777..d26703a05c6b4 100644
+--- a/fs/cifs/connect.c
++++ b/fs/cifs/connect.c
+@@ -1217,7 +1217,13 @@ static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context *
+ {
+       struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr;
+-      if (ctx->nosharesock)
++      if (ctx->nosharesock) {
++              server->nosharesock = true;
++              return 0;
++      }
++
++      /* this server does not share socket */
++      if (server->nosharesock)
+               return 0;
+       /* If multidialect negotiation see if existing sessions match one */
+-- 
+2.33.0
+
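The matching rule reduces to a one-way flag. Below is a minimal sketch of
that rule only - hypothetical types, not the cifs structures - showing how a
nosharesock mount both refuses existing sockets and marks the connection so
future mounts skip it too.

#include <stdbool.h>
#include <stdio.h>

struct server { bool nosharesock; };
struct mount_ctx { bool nosharesock; };

/* Returns true when an existing server connection may be reused. */
static bool may_share(struct server *srv, struct mount_ctx *ctx)
{
    if (ctx->nosharesock) {
        /* remember the request so later mounts skip this socket too */
        srv->nosharesock = true;
        return false;
    }
    /* this server was created for a nosharesock mount: never share it */
    if (srv->nosharesock)
        return false;
    return true; /* ...address/dialect matching would continue here */
}

int main(void)
{
    struct server srv = { .nosharesock = false };
    struct mount_ctx first = { .nosharesock = true };
    struct mount_ctx later = { .nosharesock = false };

    printf("first mount may share: %d\n", may_share(&srv, &first)); /* 0 */
    printf("later mount may share: %d\n", may_share(&srv, &later)); /* 0 */
    return 0;
}
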
diff --git a/queue-5.15/f2fs-quota-fix-potential-deadlock.patch b/queue-5.15/f2fs-quota-fix-potential-deadlock.patch
new file mode 100644 (file)
index 0000000..2a9d640
--- /dev/null
@@ -0,0 +1,65 @@
+From 8ae24aacc1020dac21b216841199f68f8cf1e5ec Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 3 Sep 2021 10:38:11 +0800
+Subject: f2fs: quota: fix potential deadlock
+
+From: Chao Yu <chao@kernel.org>
+
+[ Upstream commit a5c0042200b28fff3bde6fa128ddeaef97990f8d ]
+
+As Yi Zhuang reported in bugzilla:
+
+https://bugzilla.kernel.org/show_bug.cgi?id=214299
+
+There is a potential deadlock during quota data flush, as shown below:
+
+Thread A:                      Thread B:
+f2fs_dquot_acquire
+down_read(&sbi->quota_sem)
+                               f2fs_write_checkpoint
+                               block_operations
+                               f2fs_lock_all
+                               down_write(&sbi->cp_rwsem)
+f2fs_quota_write
+f2fs_write_begin
+__do_map_lock
+f2fs_lock_op
+down_read(&sbi->cp_rwsem)
+                               __need_flush_quota
+                               down_write(&sbi->quota_sem)
+
+This patch changes block_operations() to use trylock; if it fails,
+it means there is a potential quota data updater, so in this condition,
+let's flush quota data first and then trylock again to check the dirty
+status of quota data.
+
+The side effect is: in a heavy race condition (e.g. multiple quota data
+updaters vs. the quota data flusher), it may decrease the probability of
+synchronizing quota data successfully in checkpoint() due to the limited
+retry count of the quota flush.
+
+Reported-by: Yi Zhuang <zhuangyi1@huawei.com>
+Signed-off-by: Chao Yu <chao@kernel.org>
+Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/f2fs/checkpoint.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
+index 83e9bc0f91ffd..7b02827242312 100644
+--- a/fs/f2fs/checkpoint.c
++++ b/fs/f2fs/checkpoint.c
+@@ -1162,7 +1162,8 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi)
+       if (!is_journalled_quota(sbi))
+               return false;
+-      down_write(&sbi->quota_sem);
++      if (!down_write_trylock(&sbi->quota_sem))
++              return true;
+       if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) {
+               ret = false;
+       } else if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) {
+-- 
+2.33.0
+
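The general pattern - replace a blocking lock acquisition with a trylock
whose failure is treated as "assume dirty, flush first, retry" - can be
sketched standalone with POSIX rwlocks (hypothetical names, not the f2fs
code):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t quota_sem = PTHREAD_RWLOCK_INITIALIZER;
static bool quota_dirty = true;

/* Sketch: report "needs flush" instead of blocking when the lock is
 * contended, so the caller flushes first and retries - no ABBA deadlock. */
static bool need_flush_quota(void)
{
    bool ret;

    if (pthread_rwlock_trywrlock(&quota_sem) != 0)
        return true;    /* a quota updater holds it: flush first */
    ret = quota_dirty;
    pthread_rwlock_unlock(&quota_sem);
    return ret;
}

int main(void)
{
    printf("need flush: %d\n", need_flush_quota());
    return 0;
}
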
diff --git a/queue-5.15/f2fs-set-sbi_need_fsck-flag-when-inconsistent-node-b.patch b/queue-5.15/f2fs-set-sbi_need_fsck-flag-when-inconsistent-node-b.patch
new file mode 100644 (file)
index 0000000..e089cf5
--- /dev/null
@@ -0,0 +1,38 @@
+From 06215e5f1d7114ed82e2a1f7156119ecc5b5f1d3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 18 Sep 2021 20:46:36 +0800
+Subject: f2fs: set SBI_NEED_FSCK flag when inconsistent node block found
+
+From: Weichao Guo <guoweichao@oppo.com>
+
+[ Upstream commit 6663b138ded1a59e630c9e605e42aa7fde490cdc ]
+
+An inconsistent node block will cause a file to fail to open or read,
+which could make the user process crash or get stuck. Let's set the
+SBI_NEED_FSCK flag to trigger a repair at the next fsck time. After
+unlinking the corrupted file, the user process could regenerate a
+new one and work correctly.
+
+Signed-off-by: Weichao Guo <guoweichao@oppo.com>
+Reviewed-by: Chao Yu <chao@kernel.org>
+Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/f2fs/node.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
+index e863136081b47..556fcd8457f3f 100644
+--- a/fs/f2fs/node.c
++++ b/fs/f2fs/node.c
+@@ -1443,6 +1443,7 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
+                         nid, nid_of_node(page), ino_of_node(page),
+                         ofs_of_node(page), cpver_of_node(page),
+                         next_blkaddr_of_node(page));
++              set_sbi_flag(sbi, SBI_NEED_FSCK);
+               err = -EINVAL;
+ out_err:
+               ClearPageUptodate(page);
+-- 
+2.33.0
+
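The pattern is small but worth spelling out: on detecting an on-disk
inconsistency, record a persistent "needs fsck" flag and fail the single
access instead of letting the process crash. A userspace sketch with made-up
names (the kernel's set_sbi_flag() uses an atomic set_bit):

#include <errno.h>
#include <stdio.h>

/* Hypothetical flag word standing in for the f2fs_sb_info flags. */
enum { NEED_FSCK = 1u << 0 };
static unsigned int sbi_flags;

static int check_node(unsigned int nid, unsigned int nid_on_disk)
{
    if (nid != nid_on_disk) {
        fprintf(stderr, "inconsistent node block: %u vs %u\n",
                nid, nid_on_disk);
        sbi_flags |= NEED_FSCK; /* remember it for the next fsck run */
        return -EINVAL;         /* fail this access, don't crash */
    }
    return 0;
}

int main(void)
{
    check_node(42, 17);
    printf("need fsck: %d\n", !!(sbi_flags & NEED_FSCK));
    return 0;
}
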
diff --git a/queue-5.15/iommu-rockchip-fix-page_desc_hi_masks-for-rk3568.patch b/queue-5.15/iommu-rockchip-fix-page_desc_hi_masks-for-rk3568.patch
new file mode 100644 (file)
index 0000000..e5359b7
--- /dev/null
@@ -0,0 +1,47 @@
+From 5e789ed4bbb3bd860bca4a9a8db249ce67f9c63a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Nov 2021 03:13:25 +0100
+Subject: iommu/rockchip: Fix PAGE_DESC_HI_MASKs for RK3568
+
+From: Alex Bee <knaerzche@gmail.com>
+
+[ Upstream commit f7ff3cff3527ff1e70cad8d2fe7c0c7b6f83120a ]
+
+With the submission of the iommu driver for RK3568 a subtle bug was
+introduced: PAGE_DESC_HI_MASK1 and PAGE_DESC_HI_MASK2 have to be
+the other way around - that leads to random errors, especially when
+addresses beyond 32 bits are used.
+
+Fix it.
+
+Fixes: c55356c534aa ("iommu: rockchip: Add support for iommu v2")
+Signed-off-by: Alex Bee <knaerzche@gmail.com>
+Tested-by: Peter Geis <pgwipeout@gmail.com>
+Reviewed-by: Heiko Stuebner <heiko@sntech.de>
+Tested-by: Dan Johansen <strit@manjaro.org>
+Reviewed-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
+Link: https://lore.kernel.org/r/20211124021325.858139-1-knaerzche@gmail.com
+Signed-off-by: Joerg Roedel <jroedel@suse.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/iommu/rockchip-iommu.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c
+index 5cb260820eda6..7f23ad61c094f 100644
+--- a/drivers/iommu/rockchip-iommu.c
++++ b/drivers/iommu/rockchip-iommu.c
+@@ -200,8 +200,8 @@ static inline phys_addr_t rk_dte_pt_address(u32 dte)
+ #define DTE_HI_MASK2  GENMASK(7, 4)
+ #define DTE_HI_SHIFT1 24 /* shift bit 8 to bit 32 */
+ #define DTE_HI_SHIFT2 32 /* shift bit 4 to bit 36 */
+-#define PAGE_DESC_HI_MASK1    GENMASK_ULL(39, 36)
+-#define PAGE_DESC_HI_MASK2    GENMASK_ULL(35, 32)
++#define PAGE_DESC_HI_MASK1    GENMASK_ULL(35, 32)
++#define PAGE_DESC_HI_MASK2    GENMASK_ULL(39, 36)
+ static inline phys_addr_t rk_dte_pt_address_v2(u32 dte)
+ {
+-- 
+2.33.0
+
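The mask pairing can be checked with a round trip in plain C. In the sketch
below, GENMASK_ULL() is re-derived locally, PAGE_DESC_LO_MASK is assumed to
be bits 31:12, and pack_dte() is a hypothetical inverse of the unpack in
rk_dte_pt_address_v2(). Per the defines shown in the patch, dte bits 11:8
carry address bits 35:32 (shift 24) and dte bits 7:4 carry bits 39:36
(shift 32); with the masks crossed, any address above 4 GiB is scrambled.

#include <stdint.h>
#include <stdio.h>

#define GENMASK_ULL(h, l) \
    (((~0ULL) << (l)) & (~0ULL >> (63 - (h))))

#define DTE_HI_MASK1        GENMASK_ULL(11, 8)
#define DTE_HI_MASK2        GENMASK_ULL(7, 4)
#define DTE_HI_SHIFT1       24  /* dte bit 8 -> addr bit 32 */
#define DTE_HI_SHIFT2       32  /* dte bit 4 -> addr bit 36 */
#define PAGE_DESC_HI_MASK1  GENMASK_ULL(35, 32) /* pairs with SHIFT1 */
#define PAGE_DESC_HI_MASK2  GENMASK_ULL(39, 36) /* pairs with SHIFT2 */
#define PAGE_DESC_LO_MASK   GENMASK_ULL(31, 12)

/* Hypothetical: pack a 40-bit table address into a 32-bit descriptor. */
static uint32_t pack_dte(uint64_t addr)
{
    return (uint32_t)(((addr & PAGE_DESC_HI_MASK1) >> DTE_HI_SHIFT1) |
                      ((addr & PAGE_DESC_HI_MASK2) >> DTE_HI_SHIFT2) |
                      (addr & PAGE_DESC_LO_MASK));
}

static uint64_t unpack_dte(uint32_t dte)
{
    uint64_t v = dte;

    return ((v & DTE_HI_MASK1) << DTE_HI_SHIFT1) |
           ((v & DTE_HI_MASK2) << DTE_HI_SHIFT2) |
           (v & PAGE_DESC_LO_MASK);
}

int main(void)
{
    uint64_t addr = 0x9ABCDE000ULL; /* above 4 GiB: bits 35:32 = 0x9 */

    printf("round trip: %#llx -> %#llx\n",
           (unsigned long long)addr,
           (unsigned long long)unpack_dte(pack_dte(addr)));
    return 0;
}
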
diff --git a/queue-5.15/iommu-vt-d-fix-unmap_pages-support.patch b/queue-5.15/iommu-vt-d-fix-unmap_pages-support.patch
new file mode 100644 (file)
index 0000000..e1fc43b
--- /dev/null
@@ -0,0 +1,88 @@
+From 6d2de354d125782cd23228ad93e2bfd28dc0ee24 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 26 Nov 2021 21:55:56 +0800
+Subject: iommu/vt-d: Fix unmap_pages support
+
+From: Alex Williamson <alex.williamson@redhat.com>
+
+[ Upstream commit 86dc40c7ea9c22f64571e0e45f695de73a0e2644 ]
+
+When supporting only the .map and .unmap callbacks of iommu_ops,
+the IOMMU driver can make assumptions about the size and alignment
+used for mappings based on the driver provided pgsize_bitmap.  VT-d
+previously used essentially PAGE_MASK for this bitmap as any power
+of two mapping was acceptably filled by native page sizes.
+
+However, with the .map_pages and .unmap_pages interface we're now
+getting page-size and count arguments.  If we simply combine these
+as (page-size * count) and make use of the previous map/unmap
+functions internally, any size and alignment assumptions are very
+different.
+
+As an example, a given vfio device assignment VM will often create
+a 4MB mapping at IOVA pfn [0x3fe00 - 0x401ff].  On a system that
+does not support IOMMU super pages, the unmap_pages interface will
+ask to unmap 1024 4KB pages at the base IOVA.  dma_pte_clear_level()
+will recurse down to level 2 of the page table where the first half
+of the pfn range exactly matches the entire pte level.  We clear the
+pte, increment the pfn by the level size, but (oops) the next pte is
+on a new page, so we exit the loop and pop back up a level.  When we
+then update the pfn based on that higher level, we seem to assume
+that the previous pfn value was at the start of the level.  In this
+case the level size is 256K pfns, which we add to the base pfn and
+get a result of 0x7fe00, which is clearly greater than 0x401ff,
+so we're done.  Meanwhile we never cleared the ptes for the remainder
+of the range.  When the VM remaps this range, we're overwriting valid
+ptes and the VT-d driver complains loudly, as reported by the user
+report linked below.
+
+The fix for this seems relatively simple, if each iteration of the
+loop in dma_pte_clear_level() is assumed to clear to the end of the
+level pte page, then our next pfn should be calculated from level_pfn
+rather than our working pfn.
+
+Fixes: 3f34f1259776 ("iommu/vt-d: Implement map/unmap_pages() iommu_ops callback")
+Reported-by: Ajay Garg <ajaygargnsit@gmail.com>
+Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
+Tested-by: Giovanni Cabiddu <giovanni.cabiddu@intel.com>
+Link: https://lore.kernel.org/all/20211002124012.18186-1-ajaygargnsit@gmail.com/
+Link: https://lore.kernel.org/r/163659074748.1617923.12716161410774184024.stgit@omen
+Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
+Link: https://lore.kernel.org/r/20211126135556.397932-3-baolu.lu@linux.intel.com
+Signed-off-by: Joerg Roedel <jroedel@suse.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/iommu/intel/iommu.c | 6 ++----
+ 1 file changed, 2 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
+index 9a356075d3450..78f8c8e6803e9 100644
+--- a/drivers/iommu/intel/iommu.c
++++ b/drivers/iommu/intel/iommu.c
+@@ -1226,13 +1226,11 @@ static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
+       pte = &pte[pfn_level_offset(pfn, level)];
+       do {
+-              unsigned long level_pfn;
++              unsigned long level_pfn = pfn & level_mask(level);
+               if (!dma_pte_present(pte))
+                       goto next;
+-              level_pfn = pfn & level_mask(level);
+-
+               /* If range covers entire pagetable, free it */
+               if (start_pfn <= level_pfn &&
+                   last_pfn >= level_pfn + level_size(level) - 1) {
+@@ -1253,7 +1251,7 @@ static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
+                                                      freelist);
+               }
+ next:
+-              pfn += level_size(level);
++              pfn = level_pfn + level_size(level);
+       } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
+       if (first_pte)
+-- 
+2.33.0
+
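The arithmetic is easy to replay in isolation. The sketch below models only
the pfn update from the commit message's example (a level where each entry
covers 256K pfns); everything else about dma_pte_clear_level() - recursion,
pte pointers, freeing - is stripped away.

#include <stdio.h>

#define LEVEL_SIZE 0x40000UL        /* pfns per entry at this level (256K) */
#define LEVEL_MASK (~(LEVEL_SIZE - 1))

/* Count which entries get visited while "unmapping" pfn start..last,
 * with the buggy and the fixed pfn update. */
static void walk(unsigned long start, unsigned long last, int fixed)
{
    unsigned long pfn = start;

    while (pfn <= last) {
        unsigned long level_pfn = pfn & LEVEL_MASK;

        printf("  visit entry covering [%#lx, %#lx]\n",
               level_pfn, level_pfn + LEVEL_SIZE - 1);
        pfn = fixed ? level_pfn + LEVEL_SIZE    /* start of next entry */
                    : pfn + LEVEL_SIZE;         /* buggy: overshoots */
    }
}

int main(void)
{
    printf("buggy update:\n");
    walk(0x3fe00, 0x401ff, 0);  /* stops early, misses [0x40000, 0x7ffff] */
    printf("fixed update:\n");
    walk(0x3fe00, 0x401ff, 1);  /* visits both entries in the range */
    return 0;
}
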
diff --git a/queue-5.15/locking-rwsem-make-handoff-bit-handling-more-consist.patch b/queue-5.15/locking-rwsem-make-handoff-bit-handling-more-consist.patch
new file mode 100644 (file)
index 0000000..482ccec
--- /dev/null
@@ -0,0 +1,381 @@
+From 4c5eb65ede27e046b344a36354403c098e1cb2a6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 15 Nov 2021 20:29:12 -0500
+Subject: locking/rwsem: Make handoff bit handling more consistent
+
+From: Waiman Long <longman@redhat.com>
+
+[ Upstream commit d257cc8cb8d5355ffc43a96bab94db7b5a324803 ]
+
+There are some inconsistencies in the way that the handoff bit is
+handled in readers and writers that lead to a race condition.
+
+Firstly, when a queue head writer sets the handoff bit, it will clear
+it when the writer is killed or interrupted on its way out
+without acquiring the lock. That is not the case for a queue head
+reader: the handoff bit will simply be inherited by the next waiter.
+
+Secondly, in the out_nolock path of rwsem_down_read_slowpath(), both
+the waiter and handoff bits are cleared if the wait queue becomes
+empty.  For rwsem_down_write_slowpath(), however, the handoff bit is
+not checked and cleared if the wait queue is empty. This can
+potentially leave the handoff bit set with an empty wait queue.
+
+Worse, the situation in rwsem_down_write_slowpath() relies on wstate,
+a variable set outside of the critical section containing the ->count
+manipulation; this leads to a race condition where RWSEM_FLAG_HANDOFF
+can be double subtracted, corrupting ->count.
+
+To make the handoff bit handling more consistent and robust, extract
+the handoff bit clearing code into the new rwsem_del_waiter() helper
+function. Also, completely eradicate wstate; always evaluate
+everything inside the same critical section.
+
+The common function will only use atomic_long_andnot() to clear bits
+when the wait queue is empty to avoid possible race condition.  If the
+first waiter with handoff bit set is killed or interrupted to exit the
+slowpath without acquiring the lock, the next waiter will inherit the
+handoff bit.
+
+While at it, simplify the trylock for loop in
+rwsem_down_write_slowpath() to make it easier to read.
+
+Fixes: 4f23dbc1e657 ("locking/rwsem: Implement lock handoff to prevent lock starvation")
+Reported-by: Zhenhua Ma <mazhenhua@xiaomi.com>
+Suggested-by: Peter Zijlstra <peterz@infradead.org>
+Signed-off-by: Waiman Long <longman@redhat.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20211116012912.723980-1-longman@redhat.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/locking/rwsem.c | 171 ++++++++++++++++++++---------------------
+ 1 file changed, 85 insertions(+), 86 deletions(-)
+
+diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
+index 29eea50a3e678..e63f740c2cc84 100644
+--- a/kernel/locking/rwsem.c
++++ b/kernel/locking/rwsem.c
+@@ -106,9 +106,9 @@
+  * atomic_long_cmpxchg() will be used to obtain writer lock.
+  *
+  * There are three places where the lock handoff bit may be set or cleared.
+- * 1) rwsem_mark_wake() for readers.
+- * 2) rwsem_try_write_lock() for writers.
+- * 3) Error path of rwsem_down_write_slowpath().
++ * 1) rwsem_mark_wake() for readers           -- set, clear
++ * 2) rwsem_try_write_lock() for writers      -- set, clear
++ * 3) rwsem_del_waiter()                      -- clear
+  *
+  * For all the above cases, wait_lock will be held. A writer must also
+  * be the first one in the wait_list to be eligible for setting the handoff
+@@ -335,6 +335,9 @@ struct rwsem_waiter {
+       struct task_struct *task;
+       enum rwsem_waiter_type type;
+       unsigned long timeout;
++
++      /* Writer only, not initialized in reader */
++      bool handoff_set;
+ };
+ #define rwsem_first_waiter(sem) \
+       list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
+@@ -345,12 +348,6 @@ enum rwsem_wake_type {
+       RWSEM_WAKE_READ_OWNED   /* Waker thread holds the read lock */
+ };
+-enum writer_wait_state {
+-      WRITER_NOT_FIRST,       /* Writer is not first in wait list */
+-      WRITER_FIRST,           /* Writer is first in wait list     */
+-      WRITER_HANDOFF          /* Writer is first & handoff needed */
+-};
+-
+ /*
+  * The typical HZ value is either 250 or 1000. So set the minimum waiting
+  * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
+@@ -366,6 +363,31 @@ enum writer_wait_state {
+  */
+ #define MAX_READERS_WAKEUP    0x100
++static inline void
++rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
++{
++      lockdep_assert_held(&sem->wait_lock);
++      list_add_tail(&waiter->list, &sem->wait_list);
++      /* caller will set RWSEM_FLAG_WAITERS */
++}
++
++/*
++ * Remove a waiter from the wait_list and clear flags.
++ *
++ * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
++ * this function. Modify with care.
++ */
++static inline void
++rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
++{
++      lockdep_assert_held(&sem->wait_lock);
++      list_del(&waiter->list);
++      if (likely(!list_empty(&sem->wait_list)))
++              return;
++
++      atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
++}
++
+ /*
+  * handle the lock release when processes blocked on it that can now run
+  * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
+@@ -377,6 +399,8 @@ enum writer_wait_state {
+  *   preferably when the wait_lock is released
+  * - woken process blocks are discarded from the list after having task zeroed
+  * - writers are only marked woken if downgrading is false
++ *
++ * Implies rwsem_del_waiter() for all woken readers.
+  */
+ static void rwsem_mark_wake(struct rw_semaphore *sem,
+                           enum rwsem_wake_type wake_type,
+@@ -491,18 +515,25 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
+       adjustment = woken * RWSEM_READER_BIAS - adjustment;
+       lockevent_cond_inc(rwsem_wake_reader, woken);
++
++      oldcount = atomic_long_read(&sem->count);
+       if (list_empty(&sem->wait_list)) {
+-              /* hit end of list above */
++              /*
++               * Combined with list_move_tail() above, this implies
++               * rwsem_del_waiter().
++               */
+               adjustment -= RWSEM_FLAG_WAITERS;
++              if (oldcount & RWSEM_FLAG_HANDOFF)
++                      adjustment -= RWSEM_FLAG_HANDOFF;
++      } else if (woken) {
++              /*
++               * When we've woken a reader, we no longer need to force
++               * writers to give up the lock and we can clear HANDOFF.
++               */
++              if (oldcount & RWSEM_FLAG_HANDOFF)
++                      adjustment -= RWSEM_FLAG_HANDOFF;
+       }
+-      /*
+-       * When we've woken a reader, we no longer need to force writers
+-       * to give up the lock and we can clear HANDOFF.
+-       */
+-      if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF))
+-              adjustment -= RWSEM_FLAG_HANDOFF;
+-
+       if (adjustment)
+               atomic_long_add(adjustment, &sem->count);
+@@ -533,12 +564,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
+  * race conditions between checking the rwsem wait list and setting the
+  * sem->count accordingly.
+  *
+- * If wstate is WRITER_HANDOFF, it will make sure that either the handoff
+- * bit is set or the lock is acquired with handoff bit cleared.
++ * Implies rwsem_del_waiter() on success.
+  */
+ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
+-                                      enum writer_wait_state wstate)
++                                      struct rwsem_waiter *waiter)
+ {
++      bool first = rwsem_first_waiter(sem) == waiter;
+       long count, new;
+       lockdep_assert_held(&sem->wait_lock);
+@@ -547,13 +578,19 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
+       do {
+               bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
+-              if (has_handoff && wstate == WRITER_NOT_FIRST)
+-                      return false;
++              if (has_handoff) {
++                      if (!first)
++                              return false;
++
++                      /* First waiter inherits a previously set handoff bit */
++                      waiter->handoff_set = true;
++              }
+               new = count;
+               if (count & RWSEM_LOCK_MASK) {
+-                      if (has_handoff || (wstate != WRITER_HANDOFF))
++                      if (has_handoff || (!rt_task(waiter->task) &&
++                                          !time_after(jiffies, waiter->timeout)))
+                               return false;
+                       new |= RWSEM_FLAG_HANDOFF;
+@@ -570,9 +607,17 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
+        * We have either acquired the lock with handoff bit cleared or
+        * set the handoff bit.
+        */
+-      if (new & RWSEM_FLAG_HANDOFF)
++      if (new & RWSEM_FLAG_HANDOFF) {
++              waiter->handoff_set = true;
++              lockevent_inc(rwsem_wlock_handoff);
+               return false;
++      }
++      /*
++       * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
++       * success.
++       */
++      list_del(&waiter->list);
+       rwsem_set_owner(sem);
+       return true;
+ }
+@@ -953,7 +998,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
+               }
+               adjustment += RWSEM_FLAG_WAITERS;
+       }
+-      list_add_tail(&waiter.list, &sem->wait_list);
++      rwsem_add_waiter(sem, &waiter);
+       /* we're now waiting on the lock, but no longer actively locking */
+       count = atomic_long_add_return(adjustment, &sem->count);
+@@ -999,11 +1044,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
+       return sem;
+ out_nolock:
+-      list_del(&waiter.list);
+-      if (list_empty(&sem->wait_list)) {
+-              atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF,
+-                                 &sem->count);
+-      }
++      rwsem_del_waiter(sem, &waiter);
+       raw_spin_unlock_irq(&sem->wait_lock);
+       __set_current_state(TASK_RUNNING);
+       lockevent_inc(rwsem_rlock_fail);
+@@ -1017,9 +1058,7 @@ static struct rw_semaphore *
+ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
+ {
+       long count;
+-      enum writer_wait_state wstate;
+       struct rwsem_waiter waiter;
+-      struct rw_semaphore *ret = sem;
+       DEFINE_WAKE_Q(wake_q);
+       /* do optimistic spinning and steal lock if possible */
+@@ -1035,16 +1074,13 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
+       waiter.task = current;
+       waiter.type = RWSEM_WAITING_FOR_WRITE;
+       waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
++      waiter.handoff_set = false;
+       raw_spin_lock_irq(&sem->wait_lock);
+-
+-      /* account for this before adding a new element to the list */
+-      wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST;
+-
+-      list_add_tail(&waiter.list, &sem->wait_list);
++      rwsem_add_waiter(sem, &waiter);
+       /* we're now waiting on the lock */
+-      if (wstate == WRITER_NOT_FIRST) {
++      if (rwsem_first_waiter(sem) != &waiter) {
+               count = atomic_long_read(&sem->count);
+               /*
+@@ -1080,13 +1116,16 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
+       /* wait until we successfully acquire the lock */
+       set_current_state(state);
+       for (;;) {
+-              if (rwsem_try_write_lock(sem, wstate)) {
++              if (rwsem_try_write_lock(sem, &waiter)) {
+                       /* rwsem_try_write_lock() implies ACQUIRE on success */
+                       break;
+               }
+               raw_spin_unlock_irq(&sem->wait_lock);
++              if (signal_pending_state(state, current))
++                      goto out_nolock;
++
+               /*
+                * After setting the handoff bit and failing to acquire
+                * the lock, attempt to spin on owner to accelerate lock
+@@ -1095,7 +1134,7 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
+                * In this case, we attempt to acquire the lock again
+                * without sleeping.
+                */
+-              if (wstate == WRITER_HANDOFF) {
++              if (waiter.handoff_set) {
+                       enum owner_state owner_state;
+                       preempt_disable();
+@@ -1106,66 +1145,26 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
+                               goto trylock_again;
+               }
+-              /* Block until there are no active lockers. */
+-              for (;;) {
+-                      if (signal_pending_state(state, current))
+-                              goto out_nolock;
+-
+-                      schedule();
+-                      lockevent_inc(rwsem_sleep_writer);
+-                      set_current_state(state);
+-                      /*
+-                       * If HANDOFF bit is set, unconditionally do
+-                       * a trylock.
+-                       */
+-                      if (wstate == WRITER_HANDOFF)
+-                              break;
+-
+-                      if ((wstate == WRITER_NOT_FIRST) &&
+-                          (rwsem_first_waiter(sem) == &waiter))
+-                              wstate = WRITER_FIRST;
+-
+-                      count = atomic_long_read(&sem->count);
+-                      if (!(count & RWSEM_LOCK_MASK))
+-                              break;
+-
+-                      /*
+-                       * The setting of the handoff bit is deferred
+-                       * until rwsem_try_write_lock() is called.
+-                       */
+-                      if ((wstate == WRITER_FIRST) && (rt_task(current) ||
+-                          time_after(jiffies, waiter.timeout))) {
+-                              wstate = WRITER_HANDOFF;
+-                              lockevent_inc(rwsem_wlock_handoff);
+-                              break;
+-                      }
+-              }
++              schedule();
++              lockevent_inc(rwsem_sleep_writer);
++              set_current_state(state);
+ trylock_again:
+               raw_spin_lock_irq(&sem->wait_lock);
+       }
+       __set_current_state(TASK_RUNNING);
+-      list_del(&waiter.list);
+       raw_spin_unlock_irq(&sem->wait_lock);
+       lockevent_inc(rwsem_wlock);
+-
+-      return ret;
++      return sem;
+ out_nolock:
+       __set_current_state(TASK_RUNNING);
+       raw_spin_lock_irq(&sem->wait_lock);
+-      list_del(&waiter.list);
+-
+-      if (unlikely(wstate == WRITER_HANDOFF))
+-              atomic_long_add(-RWSEM_FLAG_HANDOFF,  &sem->count);
+-
+-      if (list_empty(&sem->wait_list))
+-              atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);
+-      else
++      rwsem_del_waiter(sem, &waiter);
++      if (!list_empty(&sem->wait_list))
+               rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
+       raw_spin_unlock_irq(&sem->wait_lock);
+       wake_up_q(&wake_q);
+       lockevent_inc(rwsem_wlock_fail);
+-
+       return ERR_PTR(-EINTR);
+ }
+-- 
+2.33.0
+
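The invariant the new helper enforces - WAITERS and HANDOFF are cleared
together, in one atomic operation, and only when the queue actually empties -
can be sketched with C11 atomics (made-up flag values, and a counter standing
in for the wait_list):

#include <stdatomic.h>
#include <stdio.h>

#define FLAG_WAITERS (1L << 1)
#define FLAG_HANDOFF (1L << 2)

struct fake_sem {
    atomic_long count;
    int nr_waiters;     /* stands in for the wait_list */
};

/* Sketch of rwsem_del_waiter(): drop a waiter and, only when the queue
 * becomes empty, clear both flag bits in a single atomic op. */
static void del_waiter(struct fake_sem *sem)
{
    if (--sem->nr_waiters > 0)
        return;
    atomic_fetch_and(&sem->count, ~(FLAG_WAITERS | FLAG_HANDOFF));
}

int main(void)
{
    struct fake_sem sem = { .nr_waiters = 2 };

    atomic_store(&sem.count, FLAG_WAITERS | FLAG_HANDOFF);
    del_waiter(&sem);   /* one waiter left: flags stay set */
    del_waiter(&sem);   /* last waiter gone: both flags cleared at once */
    printf("count = %ld\n", atomic_load(&sem.count)); /* prints 0 */
    return 0;
}
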
diff --git a/queue-5.15/perf-ignore-sigtrap-for-tracepoints-destined-for-oth.patch b/queue-5.15/perf-ignore-sigtrap-for-tracepoints-destined-for-oth.patch
new file mode 100644 (file)
index 0000000..4d151eb
--- /dev/null
@@ -0,0 +1,86 @@
+From 61f4ecf2f145a940ee7b764cd4584134cf249034 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 9 Nov 2021 13:22:32 +0100
+Subject: perf: Ignore sigtrap for tracepoints destined for other tasks
+
+From: Marco Elver <elver@google.com>
+
+[ Upstream commit 73743c3b092277febbf69b250ce8ebbca0525aa2 ]
+
+syzbot reported that the warning in perf_sigtrap() fires, saying that
+the event's task does not match current:
+
+ | WARNING: CPU: 0 PID: 9090 at kernel/events/core.c:6446 perf_pending_event+0x40d/0x4b0 kernel/events/core.c:6513
+ | Modules linked in:
+ | CPU: 0 PID: 9090 Comm: syz-executor.1 Not tainted 5.15.0-syzkaller #0
+ | Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
+ | RIP: 0010:perf_sigtrap kernel/events/core.c:6446 [inline]
+ | RIP: 0010:perf_pending_event_disable kernel/events/core.c:6470 [inline]
+ | RIP: 0010:perf_pending_event+0x40d/0x4b0 kernel/events/core.c:6513
+ | ...
+ | Call Trace:
+ |  <IRQ>
+ |  irq_work_single+0x106/0x220 kernel/irq_work.c:211
+ |  irq_work_run_list+0x6a/0x90 kernel/irq_work.c:242
+ |  irq_work_run+0x4f/0xd0 kernel/irq_work.c:251
+ |  __sysvec_irq_work+0x95/0x3d0 arch/x86/kernel/irq_work.c:22
+ |  sysvec_irq_work+0x8e/0xc0 arch/x86/kernel/irq_work.c:17
+ |  </IRQ>
+ |  <TASK>
+ |  asm_sysvec_irq_work+0x12/0x20 arch/x86/include/asm/idtentry.h:664
+ | RIP: 0010:__raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:152 [inline]
+ | RIP: 0010:_raw_spin_unlock_irqrestore+0x38/0x70 kernel/locking/spinlock.c:194
+ | ...
+ |  coredump_task_exit kernel/exit.c:371 [inline]
+ |  do_exit+0x1865/0x25c0 kernel/exit.c:771
+ |  do_group_exit+0xe7/0x290 kernel/exit.c:929
+ |  get_signal+0x3b0/0x1ce0 kernel/signal.c:2820
+ |  arch_do_signal_or_restart+0x2a9/0x1c40 arch/x86/kernel/signal.c:868
+ |  handle_signal_work kernel/entry/common.c:148 [inline]
+ |  exit_to_user_mode_loop kernel/entry/common.c:172 [inline]
+ |  exit_to_user_mode_prepare+0x17d/0x290 kernel/entry/common.c:207
+ |  __syscall_exit_to_user_mode_work kernel/entry/common.c:289 [inline]
+ |  syscall_exit_to_user_mode+0x19/0x60 kernel/entry/common.c:300
+ |  do_syscall_64+0x42/0xb0 arch/x86/entry/common.c:86
+ |  entry_SYSCALL_64_after_hwframe+0x44/0xae
+
+This shouldn't happen on x86, which has arch_irq_work_raise().
+
+The test program sets up a perf event with sigtrap set to fire on the
+'sched_wakeup' tracepoint, which fired in ttwu_do_wakeup().
+
+This happened because the 'sched_wakeup' tracepoint also takes a task
+argument passed on to perf_tp_event(), which is used to deliver the
+event to that other task.
+
+Since we cannot deliver synchronous signals to other tasks, skip an event if
+perf_tp_event() is targeted at another task and perf_event_attr::sigtrap is
+set, which will avoid ever entering perf_sigtrap() for such events.
+
+Fixes: 97ba62b27867 ("perf: Add support for SIGTRAP on perf events")
+Reported-by: syzbot+663359e32ce6f1a305ad@syzkaller.appspotmail.com
+Signed-off-by: Marco Elver <elver@google.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/YYpoCOBmC/kJWfmI@elver.google.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/events/core.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/kernel/events/core.c b/kernel/events/core.c
+index 7162b600e7eaa..2931faf92a76f 100644
+--- a/kernel/events/core.c
++++ b/kernel/events/core.c
+@@ -9729,6 +9729,9 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
+                               continue;
+                       if (event->attr.config != entry->type)
+                               continue;
++                      /* Cannot deliver synchronous signal to other task. */
++                      if (event->attr.sigtrap)
++                              continue;
+                       if (perf_tp_event_match(event, &data, regs))
+                               perf_swevent_event(event, count, &data, regs);
+               }
+-- 
+2.33.0
+
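Conceptually the fix is a delivery-time filter. A sketch with hypothetical
types (the real check sits in perf_tp_event()'s loop over another task's
events): a sigtrap event is only deliverable when the tracepoint fires in the
task that owns the event.

#include <stdbool.h>
#include <stdio.h>

struct task { int pid; };
struct event { bool sigtrap; struct task *owner; };

/* Tracepoint events may be delivered on behalf of another task (e.g.
 * sched_wakeup passes the woken task); synchronous signals can only go
 * to current, so skip sigtrap events aimed elsewhere. */
static bool should_deliver(const struct event *ev, const struct task *current)
{
    if (ev->sigtrap && ev->owner->pid != current->pid)
        return false;   /* would trip the perf_sigtrap() warning */
    return true;
}

int main(void)
{
    struct task waker = { .pid = 1 }, wakee = { .pid = 2 };
    struct event ev = { .sigtrap = true, .owner = &wakee };

    printf("deliver in waker: %d\n", should_deliver(&ev, &waker)); /* 0 */
    printf("deliver in wakee: %d\n", should_deliver(&ev, &wakee)); /* 1 */
    return 0;
}
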
diff --git a/queue-5.15/riscv-dts-microchip-drop-duplicated-mmc-sdhc-node.patch b/queue-5.15/riscv-dts-microchip-drop-duplicated-mmc-sdhc-node.patch
new file mode 100644 (file)
index 0000000..626433e
--- /dev/null
@@ -0,0 +1,98 @@
+From 1082fccdaaae28cc12878411b956d4470f7385ff Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 27 Sep 2021 14:50:42 +0200
+Subject: riscv: dts: microchip: drop duplicated MMC/SDHC node
+
+From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
+
+[ Upstream commit 42a57a47bb0c0f531321a7001972a3ca121409bd ]
+
+Devicetree source is a description of the hardware, and the hardware has
+only one block @20008000, which can be configured either as eMMC or SDHC.
+Having two nodes for different modes is an obscure, unusual and confusing
+way to configure it.  Instead, the board file is supposed to customize the
+block to its needs, e.g. to SDHC mode.
+
+This fixes a dtbs_check warning:
+  arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dt.yaml: sdhc@20008000: $nodename:0: 'sdhc@20008000' does not match '^mmc(@.*)?$'
+
+Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
+Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../microchip/microchip-mpfs-icicle-kit.dts   | 11 ++++++-
+ .../boot/dts/microchip/microchip-mpfs.dtsi    | 29 ++-----------------
+ 2 files changed, 12 insertions(+), 28 deletions(-)
+
+diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts b/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts
+index be0d77624cf53..cce5eca31f257 100644
+--- a/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts
++++ b/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts
+@@ -56,8 +56,17 @@ &serial3 {
+       status = "okay";
+ };
+-&sdcard {
++&mmc {
+       status = "okay";
++
++      bus-width = <4>;
++      disable-wp;
++      cap-sd-highspeed;
++      card-detect-delay = <200>;
++      sd-uhs-sdr12;
++      sd-uhs-sdr25;
++      sd-uhs-sdr50;
++      sd-uhs-sdr104;
+ };
+ &emac0 {
+diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi
+index 446f41d6a87e9..b12fd594e7172 100644
+--- a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi
++++ b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi
+@@ -262,39 +262,14 @@ serial3: serial@20104000 {
+                       status = "disabled";
+               };
+-              emmc: mmc@20008000 {
++              /* Common node entry for emmc/sd */
++              mmc: mmc@20008000 {
+                       compatible = "cdns,sd4hc";
+                       reg = <0x0 0x20008000 0x0 0x1000>;
+                       interrupt-parent = <&plic>;
+                       interrupts = <88 89>;
+                       pinctrl-names = "default";
+                       clocks = <&clkcfg 6>;
+-                      bus-width = <4>;
+-                      cap-mmc-highspeed;
+-                      mmc-ddr-3_3v;
+-                      max-frequency = <200000000>;
+-                      non-removable;
+-                      no-sd;
+-                      no-sdio;
+-                      voltage-ranges = <3300 3300>;
+-                      status = "disabled";
+-              };
+-
+-              sdcard: sdhc@20008000 {
+-                      compatible = "cdns,sd4hc";
+-                      reg = <0x0 0x20008000 0x0 0x1000>;
+-                      interrupt-parent = <&plic>;
+-                      interrupts = <88>;
+-                      pinctrl-names = "default";
+-                      clocks = <&clkcfg 6>;
+-                      bus-width = <4>;
+-                      disable-wp;
+-                      cap-sd-highspeed;
+-                      card-detect-delay = <200>;
+-                      sd-uhs-sdr12;
+-                      sd-uhs-sdr25;
+-                      sd-uhs-sdr50;
+-                      sd-uhs-sdr104;
+                       max-frequency = <200000000>;
+                       status = "disabled";
+               };
+-- 
+2.33.0
+
diff --git a/queue-5.15/riscv-dts-microchip-fix-board-compatible.patch b/queue-5.15/riscv-dts-microchip-fix-board-compatible.patch
new file mode 100644 (file)
index 0000000..f13d478
--- /dev/null
@@ -0,0 +1,55 @@
+From f8aa3780c08969f271101c4fef63a5cabc19dd9e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 27 Sep 2021 14:50:41 +0200
+Subject: riscv: dts: microchip: fix board compatible
+
+From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
+
+[ Upstream commit fd86dd2a5dc5ff1044423c19fef3907862f591c4 ]
+
+According to the bindings, the compatible must include microchip,mpfs.
+This fixes a dtbs_check warning:
+
+  arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dt.yaml: /: compatible: ['microchip,mpfs-icicle-kit'] is too short
+
+Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
+Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
+Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
+Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts | 2 +-
+ arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi           | 4 ++--
+ 2 files changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts b/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts
+index b254c60589a1c..be0d77624cf53 100644
+--- a/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts
++++ b/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts
+@@ -12,7 +12,7 @@ / {
+       #address-cells = <2>;
+       #size-cells = <2>;
+       model = "Microchip PolarFire-SoC Icicle Kit";
+-      compatible = "microchip,mpfs-icicle-kit";
++      compatible = "microchip,mpfs-icicle-kit", "microchip,mpfs";
+       aliases {
+               ethernet0 = &emac1;
+diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi
+index 9d2fbbc1f7778..446f41d6a87e9 100644
+--- a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi
++++ b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi
+@@ -6,8 +6,8 @@
+ / {
+       #address-cells = <2>;
+       #size-cells = <2>;
+-      model = "Microchip MPFS Icicle Kit";
+-      compatible = "microchip,mpfs-icicle-kit";
++      model = "Microchip PolarFire SoC";
++      compatible = "microchip,mpfs";
+       chosen {
+       };
+-- 
+2.33.0
+
diff --git a/queue-5.15/sched-scs-reset-task-stack-state-in-bringup_cpu.patch b/queue-5.15/sched-scs-reset-task-stack-state-in-bringup_cpu.patch
new file mode 100644 (file)
index 0000000..3b4277f
--- /dev/null
@@ -0,0 +1,133 @@
+From 58af3afbaf6ca63292a78772935ae24054fc8065 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 23 Nov 2021 11:40:47 +0000
+Subject: sched/scs: Reset task stack state in bringup_cpu()
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+[ Upstream commit dce1ca0525bfdc8a69a9343bc714fbc19a2f04b3 ]
+
+To hot unplug a CPU, the idle task on that CPU calls a few layers of C
+code before finally leaving the kernel. When KASAN is in use, poisoned
+shadow is left around for each of the active stack frames, and when
+shadow call stacks (SCS) are in use, the task's saved SCS SP is left
+pointing at an arbitrary point within the task's shadow call stack.
+
+When a CPU is offlined and then onlined back into the kernel, this stale
+state can adversely affect execution. Stale KASAN shadow can alias new
+stackframes and result in bogus KASAN warnings. A stale SCS SP is
+effectively a memory leak, and prevents a portion of the shadow call
+stack being used. Across a number of hotplug cycles the idle task's
+entire shadow call stack can become unusable.
+
+We previously fixed the KASAN issue in commit:
+
+  e1b77c92981a5222 ("sched/kasan: remove stale KASAN poison after hotplug")
+
+... by removing any stale KASAN stack poison immediately prior to
+onlining a CPU.
+
+Subsequently in commit:
+
+  f1a0a376ca0c4ef1 ("sched/core: Initialize the idle task with preemption disabled")
+
+... the refactoring left the KASAN and SCS cleanup in one-time idle
+thread initialization code rather than something invoked prior to each
+CPU being onlined, breaking both as above.
+
+We fixed SCS (but not KASAN) in commit:
+
+  63acd42c0d4942f7 ("sched/scs: Reset the shadow stack when idle_task_exit")
+
+... but as this runs in the context of the idle task being offlined it's
+potentially fragile.
+
+To fix these consistently and more robustly, reset the SCS SP and KASAN
+shadow of a CPU's idle task immediately before we online that CPU in
+bringup_cpu(). This ensures the idle task always has a consistent state
+when it is running, and removes the need to do so when exiting an idle
+task.
+
+Whenever any thread is created, dup_task_struct() will give the task a
+stack which is free of KASAN shadow, and initialize the task's SCS SP,
+so there's no need to specially initialize either for the idle thread within
+init_idle(), as this was only necessary to handle hotplug cycles.
+
+I've tested this on arm64 with:
+
+* gcc 11.1.0, defconfig +KASAN_INLINE, KASAN_STACK
+* clang 12.0.0, defconfig +KASAN_INLINE, KASAN_STACK, SHADOW_CALL_STACK
+
+... offlining and onlining CPUS with:
+
+| while true; do
+|   for C in /sys/devices/system/cpu/cpu*/online; do
+|     echo 0 > $C;
+|     echo 1 > $C;
+|   done
+| done
+
+Fixes: f1a0a376ca0c4ef1 ("sched/core: Initialize the idle task with preemption disabled")
+Reported-by: Qian Cai <quic_qiancai@quicinc.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
+Tested-by: Qian Cai <quic_qiancai@quicinc.com>
+Link: https://lore.kernel.org/lkml/20211115113310.35693-1-mark.rutland@arm.com/
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/cpu.c        | 7 +++++++
+ kernel/sched/core.c | 4 ----
+ 2 files changed, 7 insertions(+), 4 deletions(-)
+
+diff --git a/kernel/cpu.c b/kernel/cpu.c
+index 192e43a874076..407a2568f35eb 100644
+--- a/kernel/cpu.c
++++ b/kernel/cpu.c
+@@ -31,6 +31,7 @@
+ #include <linux/smpboot.h>
+ #include <linux/relay.h>
+ #include <linux/slab.h>
++#include <linux/scs.h>
+ #include <linux/percpu-rwsem.h>
+ #include <linux/cpuset.h>
+@@ -587,6 +588,12 @@ static int bringup_cpu(unsigned int cpu)
+       struct task_struct *idle = idle_thread_get(cpu);
+       int ret;
++      /*
++       * Reset stale stack state from the last time this CPU was online.
++       */
++      scs_task_reset(idle);
++      kasan_unpoison_task_stack(idle);
++
+       /*
+        * Some architectures have to walk the irq descriptors to
+        * setup the vector space for the cpu which comes online.
+diff --git a/kernel/sched/core.c b/kernel/sched/core.c
+index 779f27a4b46ac..6f4625f8276f1 100644
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -8641,9 +8641,6 @@ void __init init_idle(struct task_struct *idle, int cpu)
+       idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY;
+       kthread_set_per_cpu(idle, cpu);
+-      scs_task_reset(idle);
+-      kasan_unpoison_task_stack(idle);
+-
+ #ifdef CONFIG_SMP
+       /*
+        * It's possible that init_idle() gets called multiple times on a task,
+@@ -8799,7 +8796,6 @@ void idle_task_exit(void)
+               finish_arch_post_lock_switch();
+       }
+-      scs_task_reset(current);
+       /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
+ }
+-- 
+2.33.0
+
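The structural point - scrub the idle task's stack state in the bringup
path, every time, rather than in one-shot init or in the fragile offline
path - can be sketched with stub reset functions (all names hypothetical):

#include <stdio.h>

struct task { const char *name; int scs_sp; int kasan_poison; };

/* Stand-ins for scs_task_reset() / kasan_unpoison_task_stack(). */
static void scs_reset(struct task *t)      { t->scs_sp = 0; }
static void kasan_unpoison(struct task *t) { t->kasan_poison = 0; }

/* Sketch of bringup_cpu(): the idle task may carry stale stack state
 * from the last time this CPU was online, so scrub it on every bringup. */
static int bringup_cpu(struct task *idle)
{
    scs_reset(idle);
    kasan_unpoison(idle);
    printf("onlining with %s: scs_sp=%d poison=%d\n",
           idle->name, idle->scs_sp, idle->kasan_poison);
    return 0;
}

int main(void)
{
    struct task idle = { "idle/1", 42, 1 }; /* stale state from offline */

    bringup_cpu(&idle); /* clean again before the CPU runs */
    return 0;
}
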
diff --git a/queue-5.15/series b/queue-5.15/series
index b7b86f03c4884a3a25f09b6a114fa0734dceb81e..f8294bbe0a7ea742d9ae169451325230aa488d98 100644 (file)
--- a/queue-5.15/series
@@ -152,3 +152,14 @@ net-hns3-fix-vf-rss-failed-problem-after-pf-enable-m.patch
 net-hns3-fix-incorrect-components-info-of-ethtool-re.patch
 net-mscc-ocelot-don-t-downgrade-timestamping-rx-filt.patch
 net-mscc-ocelot-correctly-report-the-timestamping-rx.patch
+locking-rwsem-make-handoff-bit-handling-more-consist.patch
+perf-ignore-sigtrap-for-tracepoints-destined-for-oth.patch
+sched-scs-reset-task-stack-state-in-bringup_cpu.patch
+iommu-rockchip-fix-page_desc_hi_masks-for-rk3568.patch
+iommu-vt-d-fix-unmap_pages-support.patch
+f2fs-quota-fix-potential-deadlock.patch
+f2fs-set-sbi_need_fsck-flag-when-inconsistent-node-b.patch
+riscv-dts-microchip-fix-board-compatible.patch
+riscv-dts-microchip-drop-duplicated-mmc-sdhc-node.patch
+cifs-nosharesock-should-not-share-socket-with-future.patch
+ceph-properly-handle-statfs-on-multifs-setups.patch