From: Sasha Levin
Date: Mon, 29 Nov 2021 03:16:54 +0000 (-0500)
Subject: Fixes for 5.15
X-Git-Tag: v5.15.6~44
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=b8c3ca378b64f13d28a992035221cf8b400773f5;p=thirdparty%2Fkernel%2Fstable-queue.git

Fixes for 5.15

Signed-off-by: Sasha Levin
---

diff --git a/queue-5.15/ceph-properly-handle-statfs-on-multifs-setups.patch b/queue-5.15/ceph-properly-handle-statfs-on-multifs-setups.patch
new file mode 100644
index 00000000000..7289c56b9be
--- /dev/null
+++ b/queue-5.15/ceph-properly-handle-statfs-on-multifs-setups.patch
@@ -0,0 +1,67 @@
+From 07f330a82e38cf1b6740332b89dc2a2e22e5703e Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Tue, 5 Oct 2021 11:12:58 -0400
+Subject: ceph: properly handle statfs on multifs setups
+
+From: Jeff Layton
+
+[ Upstream commit 8cfc0c7ed34f7929ce7e5d7c6eecf4d01ba89a84 ]
+
+ceph_statfs currently stuffs the cluster fsid into the f_fsid field.
+This was fine when we only had a single filesystem per cluster, but now
+that we have multiples we need to use something that will vary between
+them.
+
+Change ceph_statfs to xor each 32-bit chunk of the fsid (aka cluster id)
+into the lower bits of the statfs->f_fsid. Change the upper bits to hold
+the fscid (filesystem ID within the cluster).
+
+That should give us a value that is guaranteed to be unique between
+filesystems within a cluster, and should minimize the chance of
+collisions between mounts of different clusters.
+
+URL: https://tracker.ceph.com/issues/52812
+Reported-by: Sachin Prabhu
+Signed-off-by: Jeff Layton
+Reviewed-by: Xiubo Li
+Signed-off-by: Ilya Dryomov
+Signed-off-by: Sasha Levin
+---
+ fs/ceph/super.c | 11 ++++++-----
+ 1 file changed, 6 insertions(+), 5 deletions(-)
+
+diff --git a/fs/ceph/super.c b/fs/ceph/super.c
+index fd8742bae8471..202ddde3d62ad 100644
+--- a/fs/ceph/super.c
++++ b/fs/ceph/super.c
+@@ -52,8 +52,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
+ 	struct ceph_fs_client *fsc = ceph_inode_to_client(d_inode(dentry));
+ 	struct ceph_mon_client *monc = &fsc->client->monc;
+ 	struct ceph_statfs st;
+-	u64 fsid;
+-	int err;
++	int i, err;
+ 	u64 data_pool;
+ 
+ 	if (fsc->mdsc->mdsmap->m_num_data_pg_pools == 1) {
+@@ -99,12 +98,14 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
+ 	buf->f_namelen = NAME_MAX;
+ 
+ 	/* Must convert the fsid, for consistent values across arches */
++	buf->f_fsid.val[0] = 0;
+ 	mutex_lock(&monc->mutex);
+-	fsid = le64_to_cpu(*(__le64 *)(&monc->monmap->fsid)) ^
+-	       le64_to_cpu(*((__le64 *)&monc->monmap->fsid + 1));
++	for (i = 0 ; i < sizeof(monc->monmap->fsid) / sizeof(__le32) ; ++i)
++		buf->f_fsid.val[0] ^= le32_to_cpu(((__le32 *)&monc->monmap->fsid)[i]);
+ 	mutex_unlock(&monc->mutex);
+ 
+-	buf->f_fsid = u64_to_fsid(fsid);
++	/* fold the fs_cluster_id into the upper bits */
++	buf->f_fsid.val[1] = monc->fs_cluster_id;
+ 
+ 	return 0;
+ }
+-- 
+2.33.0
+
diff --git a/queue-5.15/cifs-nosharesock-should-not-share-socket-with-future.patch b/queue-5.15/cifs-nosharesock-should-not-share-socket-with-future.patch
new file mode 100644
index 00000000000..66c39dd647a
--- /dev/null
+++ b/queue-5.15/cifs-nosharesock-should-not-share-socket-with-future.patch
@@ -0,0 +1,73 @@
+From 28192917a4520ee82ab741cefdba79ea2c39cbb7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Sat, 6 Nov 2021 11:31:53 +0000
+Subject: cifs: nosharesock should not share socket with future sessions
+
+From: Shyam Prasad N
+
+[ Upstream commit c9f1c19cf7c50949885fa5afdb2cb242d61a7fac ]
+
+Today, when a new mount is done
with nosharesock, we ensure
+that we don't select an existing matching session. However,
+we don't mark the connection as nosharesock, which means that
+it could still be shared with future sessions.
+
+Fix it with this commit. Also print this info in DebugData.
+
+Signed-off-by: Shyam Prasad N
+Reviewed-by: Paulo Alcantara (SUSE)
+Signed-off-by: Steve French
+Signed-off-by: Sasha Levin
+---
+ fs/cifs/cifs_debug.c | 2 ++
+ fs/cifs/cifsglob.h   | 1 +
+ fs/cifs/connect.c    | 8 +++++++-
+ 3 files changed, 10 insertions(+), 1 deletion(-)
+
+diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
+index de2c12bcfa4bc..905a901f7f80b 100644
+--- a/fs/cifs/cifs_debug.c
++++ b/fs/cifs/cifs_debug.c
+@@ -358,6 +358,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
+ 			seq_printf(m, " signed");
+ 		if (server->posix_ext_supported)
+ 			seq_printf(m, " posix");
++		if (server->nosharesock)
++			seq_printf(m, " nosharesock");
+ 
+ 		if (server->rdma)
+ 			seq_printf(m, "\nRDMA ");
+diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
+index dea4c929d3f46..3e5b8e177cfa7 100644
+--- a/fs/cifs/cifsglob.h
++++ b/fs/cifs/cifsglob.h
+@@ -592,6 +592,7 @@ struct TCP_Server_Info {
+ 	struct list_head pending_mid_q;
+ 	bool noblocksnd;		/* use blocking sendmsg */
+ 	bool noautotune;		/* do not autotune send buf sizes */
++	bool nosharesock;
+ 	bool tcp_nodelay;
+ 	unsigned int credits;  /* send no more requests at once */
+ 	unsigned int max_credits; /* can override large 32000 default at mnt */
+diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
+index e757ee52cc777..d26703a05c6b4 100644
+--- a/fs/cifs/connect.c
++++ b/fs/cifs/connect.c
+@@ -1217,7 +1217,13 @@ static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context *
+ {
+ 	struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr;
+ 
+-	if (ctx->nosharesock)
++	if (ctx->nosharesock) {
++		server->nosharesock = true;
++		return 0;
++	}
++
++	/* this server does not share socket */
++	if (server->nosharesock)
+ 		return 0;
+ 
+ 	/* If multidialect negotiation see if existing sessions match one */
+-- 
+2.33.0
+
diff --git a/queue-5.15/f2fs-quota-fix-potential-deadlock.patch b/queue-5.15/f2fs-quota-fix-potential-deadlock.patch
new file mode 100644
index 00000000000..2a9d64019ab
--- /dev/null
+++ b/queue-5.15/f2fs-quota-fix-potential-deadlock.patch
@@ -0,0 +1,65 @@
+From 8ae24aacc1020dac21b216841199f68f8cf1e5ec Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Fri, 3 Sep 2021 10:38:11 +0800
+Subject: f2fs: quota: fix potential deadlock
+
+From: Chao Yu
+
+[ Upstream commit a5c0042200b28fff3bde6fa128ddeaef97990f8d ]
+
+As Yi Zhuang reported in bugzilla:
+
+https://bugzilla.kernel.org/show_bug.cgi?id=214299
+
+There is a potential deadlock during quota data flush, as below:
+
+Thread A:			Thread B:
+f2fs_dquot_acquire
+down_read(&sbi->quota_sem)
+				f2fs_write_checkpoint
+				block_operations
+				f2fs_lock_all
+				down_write(&sbi->cp_rwsem)
+f2fs_quota_write
+f2fs_write_begin
+__do_map_lock
+f2fs_lock_op
+down_read(&sbi->cp_rwsem)
+				__need_flush_quota
+				down_write(&sbi->quota_sem)
+
+This patch changes block_operations() to use trylock; if it fails,
+it means there is a potential quota data updater. In this condition,
+let's flush quota data first and then trylock again to check the
+dirty status of quota data.
+
+The side effect is: under a heavy race condition (e.g. multiple quota
+data updaters vs. the quota data flusher), it may decrease the
+probability of synchronizing quota data successfully in checkpoint()
+due to the limited retry count of the quota flush.
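+
+To illustrate the ordering inversion and the trylock back-off that
+resolves it, here is a standalone userspace sketch (illustrative only,
+not kernel code: both rwsems are modeled as plain pthread mutexes and
+the quota flush as a sleep); build with cc -pthread:
+
+	#include <pthread.h>
+	#include <stdio.h>
+	#include <unistd.h>
+
+	static pthread_mutex_t quota_sem = PTHREAD_MUTEX_INITIALIZER;
+	static pthread_mutex_t cp_rwsem = PTHREAD_MUTEX_INITIALIZER;
+
+	static void *quota_updater(void *arg)
+	{
+		pthread_mutex_lock(&quota_sem);		/* A, then B */
+		usleep(1000);
+		pthread_mutex_lock(&cp_rwsem);
+		puts("quota updater made progress");
+		pthread_mutex_unlock(&cp_rwsem);
+		pthread_mutex_unlock(&quota_sem);
+		return NULL;
+	}
+
+	static void *checkpoint(void *arg)
+	{
+		pthread_mutex_lock(&cp_rwsem);		/* B, then trylock A */
+		while (pthread_mutex_trylock(&quota_sem)) {
+			/* a quota updater holds the lock: back off
+			 * instead of blocking, flush, then retry */
+			pthread_mutex_unlock(&cp_rwsem);
+			usleep(1000);			/* "flush quota data" */
+			pthread_mutex_lock(&cp_rwsem);
+		}
+		puts("checkpoint made progress");
+		pthread_mutex_unlock(&quota_sem);
+		pthread_mutex_unlock(&cp_rwsem);
+		return NULL;
+	}
+
+	int main(void)
+	{
+		pthread_t a, b;
+
+		pthread_create(&a, NULL, quota_updater, NULL);
+		pthread_create(&b, NULL, checkpoint, NULL);
+		pthread_join(a, NULL);
+		pthread_join(b, NULL);
+		return 0;
+	}
+
+With a plain pthread_mutex_lock() in checkpoint(), the two threads can
+deadlock exactly as in the diagram; with the trylock back-off, both
+always make progress.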
Reported-by: Yi Zhuang
+Signed-off-by: Chao Yu
+Signed-off-by: Jaegeuk Kim
+Signed-off-by: Sasha Levin
+---
+ fs/f2fs/checkpoint.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
+index 83e9bc0f91ffd..7b02827242312 100644
+--- a/fs/f2fs/checkpoint.c
++++ b/fs/f2fs/checkpoint.c
+@@ -1162,7 +1162,8 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi)
+ 	if (!is_journalled_quota(sbi))
+ 		return false;
+ 
+-	down_write(&sbi->quota_sem);
++	if (!down_write_trylock(&sbi->quota_sem))
++		return true;
+ 	if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) {
+ 		ret = false;
+ 	} else if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) {
+-- 
+2.33.0
+
diff --git a/queue-5.15/f2fs-set-sbi_need_fsck-flag-when-inconsistent-node-b.patch b/queue-5.15/f2fs-set-sbi_need_fsck-flag-when-inconsistent-node-b.patch
new file mode 100644
index 00000000000..e089cf5554c
--- /dev/null
+++ b/queue-5.15/f2fs-set-sbi_need_fsck-flag-when-inconsistent-node-b.patch
@@ -0,0 +1,38 @@
+From 06215e5f1d7114ed82e2a1f7156119ecc5b5f1d3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Sat, 18 Sep 2021 20:46:36 +0800
+Subject: f2fs: set SBI_NEED_FSCK flag when inconsistent node block found
+
+From: Weichao Guo
+
+[ Upstream commit 6663b138ded1a59e630c9e605e42aa7fde490cdc ]
+
+An inconsistent node block will cause a file to fail to open or read,
+which could make the user process crash or get stuck. Let's set the
+SBI_NEED_FSCK flag to trigger a fix at the next fsck time. After
+unlinking the corrupted file, the user process could regenerate a new
+one and work correctly.
+
+Signed-off-by: Weichao Guo
+Reviewed-by: Chao Yu
+Signed-off-by: Jaegeuk Kim
+Signed-off-by: Sasha Levin
+---
+ fs/f2fs/node.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
+index e863136081b47..556fcd8457f3f 100644
+--- a/fs/f2fs/node.c
++++ b/fs/f2fs/node.c
+@@ -1443,6 +1443,7 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
+ 			  nid, nid_of_node(page), ino_of_node(page),
+ 			  ofs_of_node(page), cpver_of_node(page),
+ 			  next_blkaddr_of_node(page));
++	set_sbi_flag(sbi, SBI_NEED_FSCK);
+ 	err = -EINVAL;
+ out_err:
+ 	ClearPageUptodate(page);
+-- 
+2.33.0
+
diff --git a/queue-5.15/iommu-rockchip-fix-page_desc_hi_masks-for-rk3568.patch b/queue-5.15/iommu-rockchip-fix-page_desc_hi_masks-for-rk3568.patch
new file mode 100644
index 00000000000..e5359b7d734
--- /dev/null
+++ b/queue-5.15/iommu-rockchip-fix-page_desc_hi_masks-for-rk3568.patch
@@ -0,0 +1,47 @@
+From 5e789ed4bbb3bd860bca4a9a8db249ce67f9c63a Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Wed, 24 Nov 2021 03:13:25 +0100
+Subject: iommu/rockchip: Fix PAGE_DESC_HI_MASKs for RK3568
+
+From: Alex Bee
+
+[ Upstream commit f7ff3cff3527ff1e70cad8d2fe7c0c7b6f83120a ]
+
+With the submission of the iommu driver for RK3568, a subtle bug was
+introduced: PAGE_DESC_HI_MASK1 and PAGE_DESC_HI_MASK2 have to be
+the other way around - that leads to random errors, especially when
+addresses beyond 32 bit are used.
+
+Fix it.
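+
+As a standalone illustration of why the pairing matters (toy code, not
+the driver: the pack/unpack helpers below are simplified stand-ins for
+rk_mk_dte_v2()/rk_dte_pt_address_v2(), and DTE_HI_MASK1 is assumed to
+be GENMASK(11, 8), matching the "shift bit 8" comment in the hunk
+below), any address with bits above 32 round-trips wrong once the two
+masks are exchanged:
+
+	#include <stdio.h>
+	#include <stdint.h>
+
+	#define GENMASK_ULL(h, l) \
+		((~0ULL >> (63 - (h))) & (~0ULL << (l)))
+
+	#define DTE_HI_MASK1		GENMASK_ULL(11, 8)
+	#define DTE_HI_MASK2		GENMASK_ULL(7, 4)
+	#define DTE_HI_SHIFT1		24	/* dte bit 8 <-> pa bit 32 */
+	#define DTE_HI_SHIFT2		32	/* dte bit 4 <-> pa bit 36 */
+	#define PAGE_DESC_HI_MASK1	GENMASK_ULL(35, 32)	/* fixed */
+	#define PAGE_DESC_HI_MASK2	GENMASK_ULL(39, 36)	/* fixed */
+	#define PT_ADDRESS_MASK		GENMASK_ULL(31, 12)
+
+	/* pack a 40-bit table address into a 32-bit descriptor */
+	static uint32_t mk_dte(uint64_t pa, uint64_t m1, uint64_t m2)
+	{
+		return (uint32_t)((pa & PT_ADDRESS_MASK) |
+				  ((pa & m1) >> DTE_HI_SHIFT1) |
+				  ((pa & m2) >> DTE_HI_SHIFT2));
+	}
+
+	/* unpack it again, mirroring rk_dte_pt_address_v2() */
+	static uint64_t dte_pt_address(uint32_t dte)
+	{
+		return (dte & PT_ADDRESS_MASK) |
+		       (((uint64_t)dte & DTE_HI_MASK1) << DTE_HI_SHIFT1) |
+		       (((uint64_t)dte & DTE_HI_MASK2) << DTE_HI_SHIFT2);
+	}
+
+	int main(void)
+	{
+		uint64_t pa = 0x2345678000ULL;	/* needs bits above 32 */
+
+		printf("fixed masks:   %#llx\n", (unsigned long long)
+		       dte_pt_address(mk_dte(pa, PAGE_DESC_HI_MASK1,
+					      PAGE_DESC_HI_MASK2)));
+		printf("swapped masks: %#llx\n", (unsigned long long)
+		       dte_pt_address(mk_dte(pa, PAGE_DESC_HI_MASK2,
+					      PAGE_DESC_HI_MASK1)));
+		return 0;
+	}
+
+With the masks exchanged, pa bits 39:36 are shifted into the low part
+of the descriptor, and the round trip yields 0x4567a000 instead of
+0x2345678000.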
Fixes: c55356c534aa ("iommu: rockchip: Add support for iommu v2")
+Signed-off-by: Alex Bee
+Tested-by: Peter Geis
+Reviewed-by: Heiko Stuebner
+Tested-by: Dan Johansen
+Reviewed-by: Benjamin Gaignard
+Link: https://lore.kernel.org/r/20211124021325.858139-1-knaerzche@gmail.com
+Signed-off-by: Joerg Roedel
+Signed-off-by: Sasha Levin
+---
+ drivers/iommu/rockchip-iommu.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c
+index 5cb260820eda6..7f23ad61c094f 100644
+--- a/drivers/iommu/rockchip-iommu.c
++++ b/drivers/iommu/rockchip-iommu.c
+@@ -200,8 +200,8 @@ static inline phys_addr_t rk_dte_pt_address(u32 dte)
+ #define DTE_HI_MASK2	GENMASK(7, 4)
+ #define DTE_HI_SHIFT1	24 /* shift bit 8 to bit 32 */
+ #define DTE_HI_SHIFT2	32 /* shift bit 4 to bit 36 */
+-#define PAGE_DESC_HI_MASK1	GENMASK_ULL(39, 36)
+-#define PAGE_DESC_HI_MASK2	GENMASK_ULL(35, 32)
++#define PAGE_DESC_HI_MASK1	GENMASK_ULL(35, 32)
++#define PAGE_DESC_HI_MASK2	GENMASK_ULL(39, 36)
+ 
+ static inline phys_addr_t rk_dte_pt_address_v2(u32 dte)
+ {
+-- 
+2.33.0
+
diff --git a/queue-5.15/iommu-vt-d-fix-unmap_pages-support.patch b/queue-5.15/iommu-vt-d-fix-unmap_pages-support.patch
new file mode 100644
index 00000000000..e1fc43b1833
--- /dev/null
+++ b/queue-5.15/iommu-vt-d-fix-unmap_pages-support.patch
@@ -0,0 +1,88 @@
+From 6d2de354d125782cd23228ad93e2bfd28dc0ee24 Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Fri, 26 Nov 2021 21:55:56 +0800
+Subject: iommu/vt-d: Fix unmap_pages support
+
+From: Alex Williamson
+
+[ Upstream commit 86dc40c7ea9c22f64571e0e45f695de73a0e2644 ]
+
+When supporting only the .map and .unmap callbacks of iommu_ops,
+the IOMMU driver can make assumptions about the size and alignment
+used for mappings based on the driver provided pgsize_bitmap. VT-d
+previously used essentially PAGE_MASK for this bitmap as any power
+of two mapping was acceptably filled by native page sizes.
+
+However, with the .map_pages and .unmap_pages interface we're now
+getting page-size and count arguments. If we simply combine these
+as (page-size * count) and make use of the previous map/unmap
+functions internally, any size and alignment assumptions are very
+different.
+
+As an example, a given vfio device assignment VM will often create
+a 4MB mapping at IOVA pfn [0x3fe00 - 0x401ff]. On a system that
+does not support IOMMU super pages, the unmap_pages interface will
+ask to unmap 1024 4KB pages at the base IOVA. dma_pte_clear_level()
+will recurse down to level 2 of the page table, where the first half
+of the pfn range exactly matches the entire pte level. We clear the
+pte, increment the pfn by the level size, but (oops) the next pte is
+on a new page, so we exit the loop and pop back up a level. When we
+then update the pfn based on that higher level, we seem to assume
+that the previous pfn value was at the start of the level. In this
+case the level size is 256K pfns, which we add to the base pfn and
+get a result of 0x7fe00, which is clearly greater than 0x401ff,
+so we're done. Meanwhile we never cleared the ptes for the remainder
+of the range. When the VM remaps this range, we're overwriting valid
+ptes and the VT-d driver complains loudly, as reported in the user
+report linked below.
+
+The fix for this seems relatively simple: if each iteration of the
+loop in dma_pte_clear_level() is assumed to clear to the end of the
+level pte page, then our next pfn should be calculated from level_pfn
+rather than our working pfn.
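+
+To make the arithmetic concrete, here is a standalone sketch that
+reproduces the numbers above (LEVEL_SIZE/LEVEL_MASK are simplified
+stand-ins for the driver's level_size()/level_mask() helpers at the
+256K-pfn level):
+
+	#include <stdio.h>
+
+	#define LEVEL_SIZE	0x40000UL	/* 256K pfns per pte */
+	#define LEVEL_MASK	(~(LEVEL_SIZE - 1))
+
+	int main(void)
+	{
+		unsigned long pfn = 0x3fe00, last_pfn = 0x401ff;
+		unsigned long level_pfn = pfn & LEVEL_MASK;	/* 0x0 */
+
+		/* buggy advance: assumes pfn sat at the start of the range */
+		printf("buggy next pfn: %#lx (> last_pfn %#lx, loop exits)\n",
+		       pfn + LEVEL_SIZE, last_pfn);
+		/* fixed advance: continue from the end of this pte's range */
+		printf("fixed next pfn: %#lx (remainder still gets cleared)\n",
+		       level_pfn + LEVEL_SIZE);
+		return 0;
+	}
+
+The buggy advance lands on 0x7fe00 and the walk stops with pfns
+0x40000-0x401ff still mapped; the fixed one lands on 0x40000.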
Fixes: 3f34f1259776 ("iommu/vt-d: Implement map/unmap_pages() iommu_ops callback")
+Reported-by: Ajay Garg
+Signed-off-by: Alex Williamson
+Tested-by: Giovanni Cabiddu
+Link: https://lore.kernel.org/all/20211002124012.18186-1-ajaygargnsit@gmail.com/
+Link: https://lore.kernel.org/r/163659074748.1617923.12716161410774184024.stgit@omen
+Signed-off-by: Lu Baolu
+Link: https://lore.kernel.org/r/20211126135556.397932-3-baolu.lu@linux.intel.com
+Signed-off-by: Joerg Roedel
+Signed-off-by: Sasha Levin
+---
+ drivers/iommu/intel/iommu.c | 6 ++----
+ 1 file changed, 2 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
+index 9a356075d3450..78f8c8e6803e9 100644
+--- a/drivers/iommu/intel/iommu.c
++++ b/drivers/iommu/intel/iommu.c
+@@ -1226,13 +1226,11 @@ static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
+ 	pte = &pte[pfn_level_offset(pfn, level)];
+ 
+ 	do {
+-		unsigned long level_pfn;
++		unsigned long level_pfn = pfn & level_mask(level);
+ 
+ 		if (!dma_pte_present(pte))
+ 			goto next;
+ 
+-		level_pfn = pfn & level_mask(level);
+-
+ 		/* If range covers entire pagetable, free it */
+ 		if (start_pfn <= level_pfn &&
+ 		    last_pfn >= level_pfn + level_size(level) - 1) {
+@@ -1253,7 +1251,7 @@ static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
+ 				       freelist);
+ 		}
+ next:
+-		pfn += level_size(level);
++		pfn = level_pfn + level_size(level);
+ 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
+ 
+ 	if (first_pte)
+-- 
+2.33.0
+
diff --git a/queue-5.15/locking-rwsem-make-handoff-bit-handling-more-consist.patch b/queue-5.15/locking-rwsem-make-handoff-bit-handling-more-consist.patch
new file mode 100644
index 00000000000..482ccec1831
--- /dev/null
+++ b/queue-5.15/locking-rwsem-make-handoff-bit-handling-more-consist.patch
@@ -0,0 +1,381 @@
+From 4c5eb65ede27e046b344a36354403c098e1cb2a6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Mon, 15 Nov 2021 20:29:12 -0500
+Subject: locking/rwsem: Make handoff bit handling more consistent
+
+From: Waiman Long
+
+[ Upstream commit d257cc8cb8d5355ffc43a96bab94db7b5a324803 ]
+
+There are some inconsistencies in the way that the handoff bit is being
+handled in readers and writers that lead to a race condition.
+
+Firstly, when a queue head writer sets the handoff bit, it will clear
+it when the writer is being killed or interrupted on its way out
+without acquiring the lock. That is not the case for a queue head
+reader. The handoff bit will simply be inherited by the next waiter.
+
+Secondly, in the out_nolock path of rwsem_down_read_slowpath(), both
+the waiter and handoff bits are cleared if the wait queue becomes
+empty. For rwsem_down_write_slowpath(), however, the handoff bit is
+not checked and cleared if the wait queue is empty. This can
+potentially make the handoff bit set with an empty wait queue.
+
+Worse, the situation in rwsem_down_write_slowpath() relies on wstate,
+a variable set outside of the critical section containing the ->count
+manipulation; this leads to a race condition where RWSEM_FLAG_HANDOFF
+can be double subtracted, corrupting ->count.
+
+To make the handoff bit handling more consistent and robust, extract
+out the handoff bit clearing code into the new rwsem_del_waiter()
+helper function. Also, completely eradicate wstate; always evaluate
+everything inside the same critical section.
+
+The common function will only use atomic_long_andnot() to clear bits
+when the wait queue is empty to avoid a possible race condition.
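+
+For reference, these are the relevant lines of the new helper as added
+by the diff below; note the single atomic RMW that clears both flags:
+
+	static inline void
+	rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+	{
+		lockdep_assert_held(&sem->wait_lock);
+		list_del(&waiter->list);
+		if (likely(!list_empty(&sem->wait_list)))
+			return;
+
+		atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
+	}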
If the +first waiter with handoff bit set is killed or interrupted to exit the +slowpath without acquiring the lock, the next waiter will inherit the +handoff bit. + +While at it, simplify the trylock for loop in +rwsem_down_write_slowpath() to make it easier to read. + +Fixes: 4f23dbc1e657 ("locking/rwsem: Implement lock handoff to prevent lock starvation") +Reported-by: Zhenhua Ma +Suggested-by: Peter Zijlstra +Signed-off-by: Waiman Long +Signed-off-by: Peter Zijlstra (Intel) +Link: https://lkml.kernel.org/r/20211116012912.723980-1-longman@redhat.com +Signed-off-by: Sasha Levin +--- + kernel/locking/rwsem.c | 171 ++++++++++++++++++++--------------------- + 1 file changed, 85 insertions(+), 86 deletions(-) + +diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c +index 29eea50a3e678..e63f740c2cc84 100644 +--- a/kernel/locking/rwsem.c ++++ b/kernel/locking/rwsem.c +@@ -106,9 +106,9 @@ + * atomic_long_cmpxchg() will be used to obtain writer lock. + * + * There are three places where the lock handoff bit may be set or cleared. +- * 1) rwsem_mark_wake() for readers. +- * 2) rwsem_try_write_lock() for writers. +- * 3) Error path of rwsem_down_write_slowpath(). ++ * 1) rwsem_mark_wake() for readers -- set, clear ++ * 2) rwsem_try_write_lock() for writers -- set, clear ++ * 3) rwsem_del_waiter() -- clear + * + * For all the above cases, wait_lock will be held. A writer must also + * be the first one in the wait_list to be eligible for setting the handoff +@@ -335,6 +335,9 @@ struct rwsem_waiter { + struct task_struct *task; + enum rwsem_waiter_type type; + unsigned long timeout; ++ ++ /* Writer only, not initialized in reader */ ++ bool handoff_set; + }; + #define rwsem_first_waiter(sem) \ + list_first_entry(&sem->wait_list, struct rwsem_waiter, list) +@@ -345,12 +348,6 @@ enum rwsem_wake_type { + RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ + }; + +-enum writer_wait_state { +- WRITER_NOT_FIRST, /* Writer is not first in wait list */ +- WRITER_FIRST, /* Writer is first in wait list */ +- WRITER_HANDOFF /* Writer is first & handoff needed */ +-}; +- + /* + * The typical HZ value is either 250 or 1000. So set the minimum waiting + * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait +@@ -366,6 +363,31 @@ enum writer_wait_state { + */ + #define MAX_READERS_WAKEUP 0x100 + ++static inline void ++rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) ++{ ++ lockdep_assert_held(&sem->wait_lock); ++ list_add_tail(&waiter->list, &sem->wait_list); ++ /* caller will set RWSEM_FLAG_WAITERS */ ++} ++ ++/* ++ * Remove a waiter from the wait_list and clear flags. ++ * ++ * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of ++ * this function. Modify with care. ++ */ ++static inline void ++rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) ++{ ++ lockdep_assert_held(&sem->wait_lock); ++ list_del(&waiter->list); ++ if (likely(!list_empty(&sem->wait_list))) ++ return; ++ ++ atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count); ++} ++ + /* + * handle the lock release when processes blocked on it that can now run + * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must +@@ -377,6 +399,8 @@ enum writer_wait_state { + * preferably when the wait_lock is released + * - woken process blocks are discarded from the list after having task zeroed + * - writers are only marked woken if downgrading is false ++ * ++ * Implies rwsem_del_waiter() for all woken readers. 
+ */ + static void rwsem_mark_wake(struct rw_semaphore *sem, + enum rwsem_wake_type wake_type, +@@ -491,18 +515,25 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, + + adjustment = woken * RWSEM_READER_BIAS - adjustment; + lockevent_cond_inc(rwsem_wake_reader, woken); ++ ++ oldcount = atomic_long_read(&sem->count); + if (list_empty(&sem->wait_list)) { +- /* hit end of list above */ ++ /* ++ * Combined with list_move_tail() above, this implies ++ * rwsem_del_waiter(). ++ */ + adjustment -= RWSEM_FLAG_WAITERS; ++ if (oldcount & RWSEM_FLAG_HANDOFF) ++ adjustment -= RWSEM_FLAG_HANDOFF; ++ } else if (woken) { ++ /* ++ * When we've woken a reader, we no longer need to force ++ * writers to give up the lock and we can clear HANDOFF. ++ */ ++ if (oldcount & RWSEM_FLAG_HANDOFF) ++ adjustment -= RWSEM_FLAG_HANDOFF; + } + +- /* +- * When we've woken a reader, we no longer need to force writers +- * to give up the lock and we can clear HANDOFF. +- */ +- if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF)) +- adjustment -= RWSEM_FLAG_HANDOFF; +- + if (adjustment) + atomic_long_add(adjustment, &sem->count); + +@@ -533,12 +564,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, + * race conditions between checking the rwsem wait list and setting the + * sem->count accordingly. + * +- * If wstate is WRITER_HANDOFF, it will make sure that either the handoff +- * bit is set or the lock is acquired with handoff bit cleared. ++ * Implies rwsem_del_waiter() on success. + */ + static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, +- enum writer_wait_state wstate) ++ struct rwsem_waiter *waiter) + { ++ bool first = rwsem_first_waiter(sem) == waiter; + long count, new; + + lockdep_assert_held(&sem->wait_lock); +@@ -547,13 +578,19 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, + do { + bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF); + +- if (has_handoff && wstate == WRITER_NOT_FIRST) +- return false; ++ if (has_handoff) { ++ if (!first) ++ return false; ++ ++ /* First waiter inherits a previously set handoff bit */ ++ waiter->handoff_set = true; ++ } + + new = count; + + if (count & RWSEM_LOCK_MASK) { +- if (has_handoff || (wstate != WRITER_HANDOFF)) ++ if (has_handoff || (!rt_task(waiter->task) && ++ !time_after(jiffies, waiter->timeout))) + return false; + + new |= RWSEM_FLAG_HANDOFF; +@@ -570,9 +607,17 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, + * We have either acquired the lock with handoff bit cleared or + * set the handoff bit. + */ +- if (new & RWSEM_FLAG_HANDOFF) ++ if (new & RWSEM_FLAG_HANDOFF) { ++ waiter->handoff_set = true; ++ lockevent_inc(rwsem_wlock_handoff); + return false; ++ } + ++ /* ++ * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on ++ * success. 
++ */ ++ list_del(&waiter->list); + rwsem_set_owner(sem); + return true; + } +@@ -953,7 +998,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat + } + adjustment += RWSEM_FLAG_WAITERS; + } +- list_add_tail(&waiter.list, &sem->wait_list); ++ rwsem_add_waiter(sem, &waiter); + + /* we're now waiting on the lock, but no longer actively locking */ + count = atomic_long_add_return(adjustment, &sem->count); +@@ -999,11 +1044,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat + return sem; + + out_nolock: +- list_del(&waiter.list); +- if (list_empty(&sem->wait_list)) { +- atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF, +- &sem->count); +- } ++ rwsem_del_waiter(sem, &waiter); + raw_spin_unlock_irq(&sem->wait_lock); + __set_current_state(TASK_RUNNING); + lockevent_inc(rwsem_rlock_fail); +@@ -1017,9 +1058,7 @@ static struct rw_semaphore * + rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) + { + long count; +- enum writer_wait_state wstate; + struct rwsem_waiter waiter; +- struct rw_semaphore *ret = sem; + DEFINE_WAKE_Q(wake_q); + + /* do optimistic spinning and steal lock if possible */ +@@ -1035,16 +1074,13 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) + waiter.task = current; + waiter.type = RWSEM_WAITING_FOR_WRITE; + waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; ++ waiter.handoff_set = false; + + raw_spin_lock_irq(&sem->wait_lock); +- +- /* account for this before adding a new element to the list */ +- wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST; +- +- list_add_tail(&waiter.list, &sem->wait_list); ++ rwsem_add_waiter(sem, &waiter); + + /* we're now waiting on the lock */ +- if (wstate == WRITER_NOT_FIRST) { ++ if (rwsem_first_waiter(sem) != &waiter) { + count = atomic_long_read(&sem->count); + + /* +@@ -1080,13 +1116,16 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) + /* wait until we successfully acquire the lock */ + set_current_state(state); + for (;;) { +- if (rwsem_try_write_lock(sem, wstate)) { ++ if (rwsem_try_write_lock(sem, &waiter)) { + /* rwsem_try_write_lock() implies ACQUIRE on success */ + break; + } + + raw_spin_unlock_irq(&sem->wait_lock); + ++ if (signal_pending_state(state, current)) ++ goto out_nolock; ++ + /* + * After setting the handoff bit and failing to acquire + * the lock, attempt to spin on owner to accelerate lock +@@ -1095,7 +1134,7 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) + * In this case, we attempt to acquire the lock again + * without sleeping. + */ +- if (wstate == WRITER_HANDOFF) { ++ if (waiter.handoff_set) { + enum owner_state owner_state; + + preempt_disable(); +@@ -1106,66 +1145,26 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) + goto trylock_again; + } + +- /* Block until there are no active lockers. */ +- for (;;) { +- if (signal_pending_state(state, current)) +- goto out_nolock; +- +- schedule(); +- lockevent_inc(rwsem_sleep_writer); +- set_current_state(state); +- /* +- * If HANDOFF bit is set, unconditionally do +- * a trylock. +- */ +- if (wstate == WRITER_HANDOFF) +- break; +- +- if ((wstate == WRITER_NOT_FIRST) && +- (rwsem_first_waiter(sem) == &waiter)) +- wstate = WRITER_FIRST; +- +- count = atomic_long_read(&sem->count); +- if (!(count & RWSEM_LOCK_MASK)) +- break; +- +- /* +- * The setting of the handoff bit is deferred +- * until rwsem_try_write_lock() is called. 
+- */ +- if ((wstate == WRITER_FIRST) && (rt_task(current) || +- time_after(jiffies, waiter.timeout))) { +- wstate = WRITER_HANDOFF; +- lockevent_inc(rwsem_wlock_handoff); +- break; +- } +- } ++ schedule(); ++ lockevent_inc(rwsem_sleep_writer); ++ set_current_state(state); + trylock_again: + raw_spin_lock_irq(&sem->wait_lock); + } + __set_current_state(TASK_RUNNING); +- list_del(&waiter.list); + raw_spin_unlock_irq(&sem->wait_lock); + lockevent_inc(rwsem_wlock); +- +- return ret; ++ return sem; + + out_nolock: + __set_current_state(TASK_RUNNING); + raw_spin_lock_irq(&sem->wait_lock); +- list_del(&waiter.list); +- +- if (unlikely(wstate == WRITER_HANDOFF)) +- atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count); +- +- if (list_empty(&sem->wait_list)) +- atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); +- else ++ rwsem_del_waiter(sem, &waiter); ++ if (!list_empty(&sem->wait_list)) + rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); + lockevent_inc(rwsem_wlock_fail); +- + return ERR_PTR(-EINTR); + } + +-- +2.33.0 + diff --git a/queue-5.15/perf-ignore-sigtrap-for-tracepoints-destined-for-oth.patch b/queue-5.15/perf-ignore-sigtrap-for-tracepoints-destined-for-oth.patch new file mode 100644 index 00000000000..4d151eb9fbd --- /dev/null +++ b/queue-5.15/perf-ignore-sigtrap-for-tracepoints-destined-for-oth.patch @@ -0,0 +1,86 @@ +From 61f4ecf2f145a940ee7b764cd4584134cf249034 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 9 Nov 2021 13:22:32 +0100 +Subject: perf: Ignore sigtrap for tracepoints destined for other tasks + +From: Marco Elver + +[ Upstream commit 73743c3b092277febbf69b250ce8ebbca0525aa2 ] + +syzbot reported that the warning in perf_sigtrap() fires, saying that +the event's task does not match current: + + | WARNING: CPU: 0 PID: 9090 at kernel/events/core.c:6446 perf_pending_event+0x40d/0x4b0 kernel/events/core.c:6513 + | Modules linked in: + | CPU: 0 PID: 9090 Comm: syz-executor.1 Not tainted 5.15.0-syzkaller #0 + | Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 + | RIP: 0010:perf_sigtrap kernel/events/core.c:6446 [inline] + | RIP: 0010:perf_pending_event_disable kernel/events/core.c:6470 [inline] + | RIP: 0010:perf_pending_event+0x40d/0x4b0 kernel/events/core.c:6513 + | ... + | Call Trace: + | + | irq_work_single+0x106/0x220 kernel/irq_work.c:211 + | irq_work_run_list+0x6a/0x90 kernel/irq_work.c:242 + | irq_work_run+0x4f/0xd0 kernel/irq_work.c:251 + | __sysvec_irq_work+0x95/0x3d0 arch/x86/kernel/irq_work.c:22 + | sysvec_irq_work+0x8e/0xc0 arch/x86/kernel/irq_work.c:17 + | + | + | asm_sysvec_irq_work+0x12/0x20 arch/x86/include/asm/idtentry.h:664 + | RIP: 0010:__raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:152 [inline] + | RIP: 0010:_raw_spin_unlock_irqrestore+0x38/0x70 kernel/locking/spinlock.c:194 + | ... 
| coredump_task_exit kernel/exit.c:371 [inline]
+ | do_exit+0x1865/0x25c0 kernel/exit.c:771
+ | do_group_exit+0xe7/0x290 kernel/exit.c:929
+ | get_signal+0x3b0/0x1ce0 kernel/signal.c:2820
+ | arch_do_signal_or_restart+0x2a9/0x1c40 arch/x86/kernel/signal.c:868
+ | handle_signal_work kernel/entry/common.c:148 [inline]
+ | exit_to_user_mode_loop kernel/entry/common.c:172 [inline]
+ | exit_to_user_mode_prepare+0x17d/0x290 kernel/entry/common.c:207
+ | __syscall_exit_to_user_mode_work kernel/entry/common.c:289 [inline]
+ | syscall_exit_to_user_mode+0x19/0x60 kernel/entry/common.c:300
+ | do_syscall_64+0x42/0xb0 arch/x86/entry/common.c:86
+ | entry_SYSCALL_64_after_hwframe+0x44/0xae
+
+This shouldn't happen on x86, which has arch_irq_work_raise().
+
+The test program sets up a perf event with sigtrap set to fire on the
+'sched_wakeup' tracepoint, which fired in ttwu_do_wakeup().
+
+This happened because the 'sched_wakeup' tracepoint also takes a task
+argument passed on to perf_tp_event(), which is used to deliver the
+event to that other task.
+
+Since we cannot deliver synchronous signals to other tasks, skip an event if
+perf_tp_event() is targeted at another task and perf_event_attr::sigtrap is
+set, which will avoid ever entering perf_sigtrap() for such events.
+
+Fixes: 97ba62b27867 ("perf: Add support for SIGTRAP on perf events")
+Reported-by: syzbot+663359e32ce6f1a305ad@syzkaller.appspotmail.com
+Signed-off-by: Marco Elver
+Signed-off-by: Peter Zijlstra (Intel)
+Link: https://lkml.kernel.org/r/YYpoCOBmC/kJWfmI@elver.google.com
+Signed-off-by: Sasha Levin
+---
+ kernel/events/core.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/kernel/events/core.c b/kernel/events/core.c
+index 7162b600e7eaa..2931faf92a76f 100644
+--- a/kernel/events/core.c
++++ b/kernel/events/core.c
+@@ -9729,6 +9729,9 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
+ 			continue;
+ 		if (event->attr.config != entry->type)
+ 			continue;
++		/* Cannot deliver synchronous signal to other task. */
++		if (event->attr.sigtrap)
++			continue;
+ 		if (perf_tp_event_match(event, &data, regs))
+ 			perf_swevent_event(event, count, &data, regs);
+ 	}
+-- 
+2.33.0
+
diff --git a/queue-5.15/riscv-dts-microchip-drop-duplicated-mmc-sdhc-node.patch b/queue-5.15/riscv-dts-microchip-drop-duplicated-mmc-sdhc-node.patch
new file mode 100644
index 00000000000..626433eefb3
--- /dev/null
+++ b/queue-5.15/riscv-dts-microchip-drop-duplicated-mmc-sdhc-node.patch
@@ -0,0 +1,98 @@
+From 1082fccdaaae28cc12878411b956d4470f7385ff Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Mon, 27 Sep 2021 14:50:42 +0200
+Subject: riscv: dts: microchip: drop duplicated MMC/SDHC node
+
+From: Krzysztof Kozlowski
+
+[ Upstream commit 42a57a47bb0c0f531321a7001972a3ca121409bd ]
+
+Devicetree source is a description of hardware, and the hardware has
+only one block @20008000, which can be configured either as eMMC or
+SDHC. Having two nodes for different modes is an obscure, unusual and
+confusing way to configure it. Instead, the board file is supposed to
+customize the block to its needs, e.g. to SDHC mode.
+ +This fixes dtbs_check warning: + arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dt.yaml: sdhc@20008000: $nodename:0: 'sdhc@20008000' does not match '^mmc(@.*)?$' + +Signed-off-by: Krzysztof Kozlowski +Signed-off-by: Palmer Dabbelt +Signed-off-by: Sasha Levin +--- + .../microchip/microchip-mpfs-icicle-kit.dts | 11 ++++++- + .../boot/dts/microchip/microchip-mpfs.dtsi | 29 ++----------------- + 2 files changed, 12 insertions(+), 28 deletions(-) + +diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts b/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts +index be0d77624cf53..cce5eca31f257 100644 +--- a/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts ++++ b/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts +@@ -56,8 +56,17 @@ &serial3 { + status = "okay"; + }; + +-&sdcard { ++&mmc { + status = "okay"; ++ ++ bus-width = <4>; ++ disable-wp; ++ cap-sd-highspeed; ++ card-detect-delay = <200>; ++ sd-uhs-sdr12; ++ sd-uhs-sdr25; ++ sd-uhs-sdr50; ++ sd-uhs-sdr104; + }; + + &emac0 { +diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi +index 446f41d6a87e9..b12fd594e7172 100644 +--- a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi ++++ b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi +@@ -262,39 +262,14 @@ serial3: serial@20104000 { + status = "disabled"; + }; + +- emmc: mmc@20008000 { ++ /* Common node entry for emmc/sd */ ++ mmc: mmc@20008000 { + compatible = "cdns,sd4hc"; + reg = <0x0 0x20008000 0x0 0x1000>; + interrupt-parent = <&plic>; + interrupts = <88 89>; + pinctrl-names = "default"; + clocks = <&clkcfg 6>; +- bus-width = <4>; +- cap-mmc-highspeed; +- mmc-ddr-3_3v; +- max-frequency = <200000000>; +- non-removable; +- no-sd; +- no-sdio; +- voltage-ranges = <3300 3300>; +- status = "disabled"; +- }; +- +- sdcard: sdhc@20008000 { +- compatible = "cdns,sd4hc"; +- reg = <0x0 0x20008000 0x0 0x1000>; +- interrupt-parent = <&plic>; +- interrupts = <88>; +- pinctrl-names = "default"; +- clocks = <&clkcfg 6>; +- bus-width = <4>; +- disable-wp; +- cap-sd-highspeed; +- card-detect-delay = <200>; +- sd-uhs-sdr12; +- sd-uhs-sdr25; +- sd-uhs-sdr50; +- sd-uhs-sdr104; + max-frequency = <200000000>; + status = "disabled"; + }; +-- +2.33.0 + diff --git a/queue-5.15/riscv-dts-microchip-fix-board-compatible.patch b/queue-5.15/riscv-dts-microchip-fix-board-compatible.patch new file mode 100644 index 00000000000..f13d47845fe --- /dev/null +++ b/queue-5.15/riscv-dts-microchip-fix-board-compatible.patch @@ -0,0 +1,55 @@ +From f8aa3780c08969f271101c4fef63a5cabc19dd9e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 27 Sep 2021 14:50:41 +0200 +Subject: riscv: dts: microchip: fix board compatible + +From: Krzysztof Kozlowski + +[ Upstream commit fd86dd2a5dc5ff1044423c19fef3907862f591c4 ] + +According to bindings, the compatible must include microchip,mpfs. 
This +fixes dtbs_check warning: + + arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dt.yaml: /: compatible: ['microchip,mpfs-icicle-kit'] is too short + +Signed-off-by: Krzysztof Kozlowski +Reviewed-by: Conor Dooley +Reviewed-by: Geert Uytterhoeven +Signed-off-by: Palmer Dabbelt +Signed-off-by: Sasha Levin +--- + arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts | 2 +- + arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi | 4 ++-- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts b/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts +index b254c60589a1c..be0d77624cf53 100644 +--- a/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts ++++ b/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts +@@ -12,7 +12,7 @@ / { + #address-cells = <2>; + #size-cells = <2>; + model = "Microchip PolarFire-SoC Icicle Kit"; +- compatible = "microchip,mpfs-icicle-kit"; ++ compatible = "microchip,mpfs-icicle-kit", "microchip,mpfs"; + + aliases { + ethernet0 = &emac1; +diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi +index 9d2fbbc1f7778..446f41d6a87e9 100644 +--- a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi ++++ b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi +@@ -6,8 +6,8 @@ + / { + #address-cells = <2>; + #size-cells = <2>; +- model = "Microchip MPFS Icicle Kit"; +- compatible = "microchip,mpfs-icicle-kit"; ++ model = "Microchip PolarFire SoC"; ++ compatible = "microchip,mpfs"; + + chosen { + }; +-- +2.33.0 + diff --git a/queue-5.15/sched-scs-reset-task-stack-state-in-bringup_cpu.patch b/queue-5.15/sched-scs-reset-task-stack-state-in-bringup_cpu.patch new file mode 100644 index 00000000000..3b4277f79fb --- /dev/null +++ b/queue-5.15/sched-scs-reset-task-stack-state-in-bringup_cpu.patch @@ -0,0 +1,133 @@ +From 58af3afbaf6ca63292a78772935ae24054fc8065 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 23 Nov 2021 11:40:47 +0000 +Subject: sched/scs: Reset task stack state in bringup_cpu() + +From: Mark Rutland + +[ Upstream commit dce1ca0525bfdc8a69a9343bc714fbc19a2f04b3 ] + +To hot unplug a CPU, the idle task on that CPU calls a few layers of C +code before finally leaving the kernel. When KASAN is in use, poisoned +shadow is left around for each of the active stack frames, and when +shadow call stacks are in use. When shadow call stacks (SCS) are in use +the task's saved SCS SP is left pointing at an arbitrary point within +the task's shadow call stack. + +When a CPU is offlined than onlined back into the kernel, this stale +state can adversely affect execution. Stale KASAN shadow can alias new +stackframes and result in bogus KASAN warnings. A stale SCS SP is +effectively a memory leak, and prevents a portion of the shadow call +stack being used. Across a number of hotplug cycles the idle task's +entire shadow call stack can become unusable. + +We previously fixed the KASAN issue in commit: + + e1b77c92981a5222 ("sched/kasan: remove stale KASAN poison after hotplug") + +... by removing any stale KASAN stack poison immediately prior to +onlining a CPU. + +Subsequently in commit: + + f1a0a376ca0c4ef1 ("sched/core: Initialize the idle task with preemption disabled") + +... the refactoring left the KASAN and SCS cleanup in one-time idle +thread initialization code rather than something invoked prior to each +CPU being onlined, breaking both as above. 
We fixed SCS (but not KASAN) in commit:
+
+  63acd42c0d4942f7 ("sched/scs: Reset the shadow stack when idle_task_exit")
+
+... but as this runs in the context of the idle task being offlined,
+it's potentially fragile.
+
+To fix these consistently and more robustly, reset the SCS SP and KASAN
+shadow of a CPU's idle task immediately before we online that CPU in
+bringup_cpu(). This ensures the idle task always has a consistent state
+when it is running, and removes the need to do so when exiting an idle
+task.
+
+Whenever any thread is created, dup_task_struct() will give the task a
+stack which is free of KASAN shadow, and initialize the task's SCS SP,
+so there's no need to specially initialize either for the idle thread
+within init_idle(), as this was only necessary to handle hotplug cycles.
+
+I've tested this on arm64 with:
+
+* gcc 11.1.0, defconfig +KASAN_INLINE, KASAN_STACK
+* clang 12.0.0, defconfig +KASAN_INLINE, KASAN_STACK, SHADOW_CALL_STACK
+
+... offlining and onlining CPUs with:
+
+| while true; do
+|   for C in /sys/devices/system/cpu/cpu*/online; do
+|     echo 0 > $C;
+|     echo 1 > $C;
+|   done
+| done
+
+Fixes: f1a0a376ca0c4ef1 ("sched/core: Initialize the idle task with preemption disabled")
+Reported-by: Qian Cai
+Signed-off-by: Mark Rutland
+Signed-off-by: Peter Zijlstra (Intel)
+Reviewed-by: Valentin Schneider
+Tested-by: Qian Cai
+Link: https://lore.kernel.org/lkml/20211115113310.35693-1-mark.rutland@arm.com/
+Signed-off-by: Sasha Levin
+---
+ kernel/cpu.c        | 7 +++++++
+ kernel/sched/core.c | 4 ----
+ 2 files changed, 7 insertions(+), 4 deletions(-)
+
+diff --git a/kernel/cpu.c b/kernel/cpu.c
+index 192e43a874076..407a2568f35eb 100644
+--- a/kernel/cpu.c
++++ b/kernel/cpu.c
+@@ -31,6 +31,7 @@
+ #include <linux/smpboot.h>
+ #include <linux/relay.h>
+ #include <linux/slab.h>
++#include <linux/scs.h>
+ #include <linux/percpu-rwsem.h>
+ #include <linux/cpuset.h>
+ 
+@@ -587,6 +588,12 @@ static int bringup_cpu(unsigned int cpu)
+ 	struct task_struct *idle = idle_thread_get(cpu);
+ 	int ret;
+ 
++	/*
++	 * Reset stale stack state from the last time this CPU was online.
++	 */
++	scs_task_reset(idle);
++	kasan_unpoison_task_stack(idle);
++
+ 	/*
+ 	 * Some architectures have to walk the irq descriptors to
+ 	 * setup the vector space for the cpu which comes online.
+diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 779f27a4b46ac..6f4625f8276f1 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -8641,9 +8641,6 @@ void __init init_idle(struct task_struct *idle, int cpu) + idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY; + kthread_set_per_cpu(idle, cpu); + +- scs_task_reset(idle); +- kasan_unpoison_task_stack(idle); +- + #ifdef CONFIG_SMP + /* + * It's possible that init_idle() gets called multiple times on a task, +@@ -8799,7 +8796,6 @@ void idle_task_exit(void) + finish_arch_post_lock_switch(); + } + +- scs_task_reset(current); + /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ + } + +-- +2.33.0 + diff --git a/queue-5.15/series b/queue-5.15/series index b7b86f03c48..f8294bbe0a7 100644 --- a/queue-5.15/series +++ b/queue-5.15/series @@ -152,3 +152,14 @@ net-hns3-fix-vf-rss-failed-problem-after-pf-enable-m.patch net-hns3-fix-incorrect-components-info-of-ethtool-re.patch net-mscc-ocelot-don-t-downgrade-timestamping-rx-filt.patch net-mscc-ocelot-correctly-report-the-timestamping-rx.patch +locking-rwsem-make-handoff-bit-handling-more-consist.patch +perf-ignore-sigtrap-for-tracepoints-destined-for-oth.patch +sched-scs-reset-task-stack-state-in-bringup_cpu.patch +iommu-rockchip-fix-page_desc_hi_masks-for-rk3568.patch +iommu-vt-d-fix-unmap_pages-support.patch +f2fs-quota-fix-potential-deadlock.patch +f2fs-set-sbi_need_fsck-flag-when-inconsistent-node-b.patch +riscv-dts-microchip-fix-board-compatible.patch +riscv-dts-microchip-drop-duplicated-mmc-sdhc-node.patch +cifs-nosharesock-should-not-share-socket-with-future.patch +ceph-properly-handle-statfs-on-multifs-setups.patch