From: Greg Kroah-Hartman Date: Mon, 20 Mar 2023 14:13:31 +0000 (+0100) Subject: 5.10-stable patches X-Git-Tag: v4.14.311~15 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=02ccb4a57f39ed456b81967888afcf0c64cade98;p=thirdparty%2Fkernel%2Fstable-queue.git 5.10-stable patches added patches: attr-add-in_group_or_capable.patch attr-add-setattr_should_drop_sgid.patch attr-use-consistent-sgid-stripping-checks.patch drm-i915-active-fix-misuse-of-non-idle-barriers-as-fence-trackers.patch drm-i915-don-t-use-stolen-memory-for-ring-buffers-with-llc.patch fs-add-mode_strip_sgid-helper.patch fs-move-s_isgid-stripping-into-the-vfs_-helpers.patch fs-move-should_remove_suid.patch fs-use-consistent-setgid-checks-in-is_sxid.patch io_uring-avoid-null-ptr-deref-in-io_arm_poll_handler.patch pci-dpc-await-readiness-of-secondary-bus-after-reset.patch pci-unify-delay-handling-for-reset-and-resume.patch s390-ipl-add-missing-intersection-check-to-ipl_report-handling.patch xfs-don-t-assert-fail-on-perag-references-on-teardown.patch xfs-don-t-leak-btree-cursor-when-insrec-fails-after-a-split.patch xfs-fallocate-should-call-file_modified.patch xfs-purge-dquots-after-inode-walk-fails-during-quotacheck.patch xfs-remove-xfs_prealloc_sync.patch xfs-remove-xfs_setattr_time-declaration.patch xfs-set-prealloc-flag-in-xfs_alloc_file_space.patch xfs-use-setattr_copy-to-set-vfs-inode-attributes.patch --- diff --git a/queue-5.10/attr-add-in_group_or_capable.patch b/queue-5.10/attr-add-in_group_or_capable.patch new file mode 100644 index 00000000000..215bf02f791 --- /dev/null +++ b/queue-5.10/attr-add-in_group_or_capable.patch @@ -0,0 +1,116 @@ +From stable-owner@vger.kernel.org Sat Mar 18 11:16:15 2023 +From: Amir Goldstein +Date: Sat, 18 Mar 2023 12:15:24 +0200 +Subject: attr: add in_group_or_capable() +To: Greg Kroah-Hartman +Cc: Sasha Levin , "Darrick J . 
Wong" , Leah Rumancik , Chandan Babu R , Christian Brauner , linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org +Message-ID: <20230318101529.1361673-11-amir73il@gmail.com> + +From: Amir Goldstein + +commit 11c2a8700cdcabf9b639b7204a1e38e2a0b6798e upstream. + +[backported to 5.10.y, prior to idmapped mounts] + +In setattr_{copy,prepare}() we need to perform the same permission +checks to determine whether we need to drop the setgid bit or not. +Instead of open-coding it twice add a simple helper that encapsulates the +logic. We will reuse this helper to make dropping the setgid bit during +write operations more consistent in a follow-up patch. + +Reviewed-by: Amir Goldstein +Signed-off-by: Christian Brauner (Microsoft) +Signed-off-by: Amir Goldstein +Signed-off-by: Greg Kroah-Hartman +--- + fs/attr.c | 11 +++++------ + fs/inode.c | 25 +++++++++++++++++++++---- + fs/internal.h | 1 + + 3 files changed, 27 insertions(+), 10 deletions(-) + +--- a/fs/attr.c ++++ b/fs/attr.c +@@ -18,6 +18,8 @@ + #include + #include + ++#include "internal.h" ++ + static bool chown_ok(const struct inode *inode, kuid_t uid) + { + if (uid_eq(current_fsuid(), inode->i_uid) && +@@ -90,9 +92,8 @@ int setattr_prepare(struct dentry *dentr + if (!inode_owner_or_capable(inode)) + return -EPERM; + /* Also check the setgid bit! */ +- if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : +- inode->i_gid) && +- !capable_wrt_inode_uidgid(inode, CAP_FSETID)) ++ if (!in_group_or_capable(inode, (ia_valid & ATTR_GID) ? 
++ attr->ia_gid : inode->i_gid)) + attr->ia_mode &= ~S_ISGID; + } + +@@ -193,9 +194,7 @@ void setattr_copy(struct inode *inode, c + inode->i_ctime = attr->ia_ctime; + if (ia_valid & ATTR_MODE) { + umode_t mode = attr->ia_mode; +- +- if (!in_group_p(inode->i_gid) && +- !capable_wrt_inode_uidgid(inode, CAP_FSETID)) ++ if (!in_group_or_capable(inode, inode->i_gid)) + mode &= ~S_ISGID; + inode->i_mode = mode; + } +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -2380,6 +2380,26 @@ int vfs_ioc_fssetxattr_check(struct inod + EXPORT_SYMBOL(vfs_ioc_fssetxattr_check); + + /** ++ * in_group_or_capable - check whether caller is CAP_FSETID privileged ++ * @inode: inode to check ++ * @gid: the new/current gid of @inode ++ * ++ * Check wether @gid is in the caller's group list or if the caller is ++ * privileged with CAP_FSETID over @inode. This can be used to determine ++ * whether the setgid bit can be kept or must be dropped. ++ * ++ * Return: true if the caller is sufficiently privileged, false if not. ++ */ ++bool in_group_or_capable(const struct inode *inode, kgid_t gid) ++{ ++ if (in_group_p(gid)) ++ return true; ++ if (capable_wrt_inode_uidgid(inode, CAP_FSETID)) ++ return true; ++ return false; ++} ++ ++/** + * mode_strip_sgid - handle the sgid bit for non-directories + * @dir: parent directory inode + * @mode: mode of the file to be created in @dir +@@ -2398,11 +2418,8 @@ umode_t mode_strip_sgid(const struct ino + return mode; + if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID)) + return mode; +- if (in_group_p(dir->i_gid)) ++ if (in_group_or_capable(dir, dir->i_gid)) + return mode; +- if (capable_wrt_inode_uidgid(dir, CAP_FSETID)) +- return mode; +- + return mode & ~S_ISGID; + } + EXPORT_SYMBOL(mode_strip_sgid); +--- a/fs/internal.h ++++ b/fs/internal.h +@@ -149,6 +149,7 @@ extern int vfs_open(const struct path *, + extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); + extern void inode_add_lru(struct inode *inode); + extern int 
dentry_needs_remove_privs(struct dentry *dentry); ++bool in_group_or_capable(const struct inode *inode, kgid_t gid); + + /* + * fs-writeback.c diff --git a/queue-5.10/attr-add-setattr_should_drop_sgid.patch b/queue-5.10/attr-add-setattr_should_drop_sgid.patch new file mode 100644 index 00000000000..481f4eb0198 --- /dev/null +++ b/queue-5.10/attr-add-setattr_should_drop_sgid.patch @@ -0,0 +1,79 @@ +From stable-owner@vger.kernel.org Sat Mar 18 11:16:16 2023 +From: Amir Goldstein +Date: Sat, 18 Mar 2023 12:15:26 +0200 +Subject: attr: add setattr_should_drop_sgid() +To: Greg Kroah-Hartman +Cc: Sasha Levin , "Darrick J . Wong" , Leah Rumancik , Chandan Babu R , Christian Brauner , linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org +Message-ID: <20230318101529.1361673-13-amir73il@gmail.com> + +From: Amir Goldstein + +commit 72ae017c5451860443a16fb2a8c243bff3e396b8 upstream. + +[backported to 5.10.y, prior to idmapped mounts] + +The current setgid stripping logic during write and ownership change +operations is inconsistent and strewn over multiple places. In order to +consolidate it and make more consistent we'll add a new helper +setattr_should_drop_sgid(). The function retains the old behavior where +we remove the S_ISGID bit unconditionally when S_IXGRP is set but also +when it isn't set and the caller is neither in the group of the inode +nor privileged over the inode. + +We will use this helper both in write operation permission removal such +as file_remove_privs() as well as in ownership change operations. 
+ +Reviewed-by: Amir Goldstein +Signed-off-by: Christian Brauner (Microsoft) +Signed-off-by: Amir Goldstein +Signed-off-by: Greg Kroah-Hartman +--- + fs/attr.c | 25 +++++++++++++++++++++++++ + fs/internal.h | 5 +++++ + 2 files changed, 30 insertions(+) + +--- a/fs/attr.c ++++ b/fs/attr.c +@@ -20,6 +20,31 @@ + + #include "internal.h" + ++/** ++ * setattr_should_drop_sgid - determine whether the setgid bit needs to be ++ * removed ++ * @inode: inode to check ++ * ++ * This function determines whether the setgid bit needs to be removed. ++ * We retain backwards compatibility and require setgid bit to be removed ++ * unconditionally if S_IXGRP is set. Otherwise we have the exact same ++ * requirements as setattr_prepare() and setattr_copy(). ++ * ++ * Return: ATTR_KILL_SGID if setgid bit needs to be removed, 0 otherwise. ++ */ ++int setattr_should_drop_sgid(const struct inode *inode) ++{ ++ umode_t mode = inode->i_mode; ++ ++ if (!(mode & S_ISGID)) ++ return 0; ++ if (mode & S_IXGRP) ++ return ATTR_KILL_SGID; ++ if (!in_group_or_capable(inode, inode->i_gid)) ++ return ATTR_KILL_SGID; ++ return 0; ++} ++ + /* + * The logic we want is + * +--- a/fs/internal.h ++++ b/fs/internal.h +@@ -197,3 +197,8 @@ int sb_init_dio_done_wq(struct super_blo + */ + int do_statx(int dfd, const char __user *filename, unsigned flags, + unsigned int mask, struct statx __user *buffer); ++ ++/* ++ * fs/attr.c ++ */ ++int setattr_should_drop_sgid(const struct inode *inode); diff --git a/queue-5.10/attr-use-consistent-sgid-stripping-checks.patch b/queue-5.10/attr-use-consistent-sgid-stripping-checks.patch new file mode 100644 index 00000000000..bac24a87dad --- /dev/null +++ b/queue-5.10/attr-use-consistent-sgid-stripping-checks.patch @@ -0,0 +1,278 @@ +From stable-owner@vger.kernel.org Sat Mar 18 11:16:16 2023 +From: Amir Goldstein +Date: Sat, 18 Mar 2023 12:15:27 +0200 +Subject: attr: use consistent sgid stripping checks +To: Greg Kroah-Hartman +Cc: Sasha Levin , "Darrick J . 
Wong" , Leah Rumancik , Chandan Babu R , Christian Brauner , linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org +Message-ID: <20230318101529.1361673-14-amir73il@gmail.com> + +From: Amir Goldstein + +commit ed5a7047d2011cb6b2bf84ceb6680124cc6a7d95 upstream. + +[backported to 5.10.y, prior to idmapped mounts] + +Currently setgid stripping in file_remove_privs()'s should_remove_suid() +helper is inconsistent with other parts of the vfs. Specifically, it only +raises ATTR_KILL_SGID if the inode is S_ISGID and S_IXGRP but not if the +inode isn't in the caller's groups and the caller isn't privileged over the +inode although we require this already in setattr_prepare() and +setattr_copy() and so all filesystem implement this requirement implicitly +because they have to use setattr_{prepare,copy}() anyway. + +But the inconsistency shows up in setgid stripping bugs for overlayfs in +xfstests (e.g., generic/673, generic/683, generic/685, generic/686, +generic/687). For example, we test whether suid and setgid stripping works +correctly when performing various write-like operations as an unprivileged +user (fallocate, reflink, write, etc.): + +echo "Test 1 - qa_user, non-exec file $verb" +setup_testfile +chmod a+rws $junk_file +commit_and_check "$qa_user" "$verb" 64k 64k + +The test basically creates a file with 6666 permissions. While the file has +the S_ISUID and S_ISGID bits set it does not have the S_IXGRP set. On a +regular filesystem like xfs what will happen is: + +sys_fallocate() +-> vfs_fallocate() + -> xfs_file_fallocate() + -> file_modified() + -> __file_remove_privs() + -> dentry_needs_remove_privs() + -> should_remove_suid() + -> __remove_privs() + newattrs.ia_valid = ATTR_FORCE | kill; + -> notify_change() + -> setattr_copy() + +In should_remove_suid() we can see that ATTR_KILL_SUID is raised +unconditionally because the file in the test has S_ISUID set. 
+ +But we also see that ATTR_KILL_SGID won't be set because while the file +is S_ISGID it is not S_IXGRP (see above) which is a condition for +ATTR_KILL_SGID being raised. + +So by the time we call notify_change() we have attr->ia_valid set to +ATTR_KILL_SUID | ATTR_FORCE. Now notify_change() sees that +ATTR_KILL_SUID is set and does: + +ia_valid = attr->ia_valid |= ATTR_MODE +attr->ia_mode = (inode->i_mode & ~S_ISUID); + +which means that when we call setattr_copy() later we will definitely +update inode->i_mode. Note that attr->ia_mode still contains S_ISGID. + +Now we call into the filesystem's ->setattr() inode operation which will +end up calling setattr_copy(). Since ATTR_MODE is set we will hit: + +if (ia_valid & ATTR_MODE) { + umode_t mode = attr->ia_mode; + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + if (!vfsgid_in_group_p(vfsgid) && + !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + mode &= ~S_ISGID; + inode->i_mode = mode; +} + +and since the caller in the test is neither capable nor in the group of the +inode the S_ISGID bit is stripped. + +But assume the file isn't suid then ATTR_KILL_SUID won't be raised which +has the consequence that neither the setgid nor the suid bits are stripped +even though it should be stripped because the inode isn't in the caller's +groups and the caller isn't privileged over the inode. + +If overlayfs is in the mix things become a bit more complicated and the bug +shows up more clearly. 
When e.g., ovl_setattr() is hit from +ovl_fallocate()'s call to file_remove_privs() then ATTR_KILL_SUID and +ATTR_KILL_SGID might be raised but because the check in notify_change() is +questioning the ATTR_KILL_SGID flag again by requiring S_IXGRP for it to be +stripped the S_ISGID bit isn't removed even though it should be stripped: + +sys_fallocate() +-> vfs_fallocate() + -> ovl_fallocate() + -> file_remove_privs() + -> dentry_needs_remove_privs() + -> should_remove_suid() + -> __remove_privs() + newattrs.ia_valid = ATTR_FORCE | kill; + -> notify_change() + -> ovl_setattr() + // TAKE ON MOUNTER'S CREDS + -> ovl_do_notify_change() + -> notify_change() + // GIVE UP MOUNTER'S CREDS + // TAKE ON MOUNTER'S CREDS + -> vfs_fallocate() + -> xfs_file_fallocate() + -> file_modified() + -> __file_remove_privs() + -> dentry_needs_remove_privs() + -> should_remove_suid() + -> __remove_privs() + newattrs.ia_valid = attr_force | kill; + -> notify_change() + +The fix for all of this is to make file_remove_privs()'s +should_remove_suid() helper to perform the same checks as we already +require in setattr_prepare() and setattr_copy() and have notify_change() +not pointlessly requiring S_IXGRP again. It doesn't make any sense in the +first place because the caller must calculate the flags via +should_remove_suid() anyway which would raise ATTR_KILL_SGID. + +While we're at it we move should_remove_suid() from inode.c to attr.c +where it belongs with the rest of the iattr helpers. Especially since it +returns ATTR_KILL_S{G,U}ID flags. We also rename it to +setattr_should_drop_suidgid() to better reflect that it indicates both +setuid and setgid bit removal and also that it returns attr flags. + +Running xfstests with this doesn't report any regressions. We should really +try and use consistent checks. 
+ +Reviewed-by: Amir Goldstein +Signed-off-by: Christian Brauner (Microsoft) +Signed-off-by: Amir Goldstein +Signed-off-by: Greg Kroah-Hartman +--- + Documentation/trace/ftrace.rst | 2 +- + fs/attr.c | 31 +++++++++++++++++-------------- + fs/inode.c | 2 +- + fs/ocfs2/file.c | 4 ++-- + fs/open.c | 6 +++--- + include/linux/fs.h | 2 +- + 6 files changed, 25 insertions(+), 22 deletions(-) + +--- a/Documentation/trace/ftrace.rst ++++ b/Documentation/trace/ftrace.rst +@@ -2923,7 +2923,7 @@ Produces:: + bash-1994 [000] .... 4342.324898: ima_get_action <-process_measurement + bash-1994 [000] .... 4342.324898: ima_match_policy <-ima_get_action + bash-1994 [000] .... 4342.324899: do_truncate <-do_last +- bash-1994 [000] .... 4342.324899: should_remove_suid <-do_truncate ++ bash-1994 [000] .... 4342.324899: setattr_should_drop_suidgid <-do_truncate + bash-1994 [000] .... 4342.324899: notify_change <-do_truncate + bash-1994 [000] .... 4342.324900: current_fs_time <-notify_change + bash-1994 [000] .... 4342.324900: current_kernel_time <-current_fs_time +--- a/fs/attr.c ++++ b/fs/attr.c +@@ -45,34 +45,37 @@ int setattr_should_drop_sgid(const struc + return 0; + } + +-/* +- * The logic we want is ++/** ++ * setattr_should_drop_suidgid - determine whether the set{g,u}id bit needs to ++ * be dropped ++ * @inode: inode to check ++ * ++ * This function determines whether the set{g,u}id bits need to be removed. ++ * If the setuid bit needs to be removed ATTR_KILL_SUID is returned. If the ++ * setgid bit needs to be removed ATTR_KILL_SGID is returned. If both ++ * set{g,u}id bits need to be removed the corresponding mask of both flags is ++ * returned. + * +- * if suid or (sgid and xgrp) +- * remove privs ++ * Return: A mask of ATTR_KILL_S{G,U}ID indicating which - if any - setid bits ++ * to remove, 0 otherwise. 
+ */ +-int should_remove_suid(struct dentry *dentry) ++int setattr_should_drop_suidgid(struct inode *inode) + { +- umode_t mode = d_inode(dentry)->i_mode; ++ umode_t mode = inode->i_mode; + int kill = 0; + + /* suid always must be killed */ + if (unlikely(mode & S_ISUID)) + kill = ATTR_KILL_SUID; + +- /* +- * sgid without any exec bits is just a mandatory locking mark; leave +- * it alone. If some exec bits are set, it's a real sgid; kill it. +- */ +- if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) +- kill |= ATTR_KILL_SGID; ++ kill |= setattr_should_drop_sgid(inode); + + if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) + return kill; + + return 0; + } +-EXPORT_SYMBOL(should_remove_suid); ++EXPORT_SYMBOL(setattr_should_drop_suidgid); + + static bool chown_ok(const struct inode *inode, kuid_t uid) + { +@@ -350,7 +353,7 @@ int notify_change(struct dentry * dentry + } + } + if (ia_valid & ATTR_KILL_SGID) { +- if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { ++ if (mode & S_ISGID) { + if (!(ia_valid & ATTR_MODE)) { + ia_valid = attr->ia_valid |= ATTR_MODE; + attr->ia_mode = inode->i_mode; +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -1868,7 +1868,7 @@ int dentry_needs_remove_privs(struct den + if (IS_NOSEC(inode)) + return 0; + +- mask = should_remove_suid(dentry); ++ mask = setattr_should_drop_suidgid(inode); + ret = security_inode_need_killpriv(dentry); + if (ret < 0) + return ret; +--- a/fs/ocfs2/file.c ++++ b/fs/ocfs2/file.c +@@ -1994,7 +1994,7 @@ static int __ocfs2_change_file_space(str + } + } + +- if (file && should_remove_suid(file->f_path.dentry)) { ++ if (file && setattr_should_drop_suidgid(file_inode(file))) { + ret = __ocfs2_write_remove_suid(inode, di_bh); + if (ret) { + mlog_errno(ret); +@@ -2282,7 +2282,7 @@ static int ocfs2_prepare_inode_for_write + * inode. There's also the dinode i_size state which + * can be lost via setattr during extending writes (we + * set inode->i_size at the end of a write. 
*/ +- if (should_remove_suid(dentry)) { ++ if (setattr_should_drop_suidgid(inode)) { + if (meta_level == 0) { + ocfs2_inode_unlock_for_extent_tree(inode, + &di_bh, +--- a/fs/open.c ++++ b/fs/open.c +@@ -665,10 +665,10 @@ retry_deleg: + newattrs.ia_valid |= ATTR_GID; + newattrs.ia_gid = gid; + } +- if (!S_ISDIR(inode->i_mode)) +- newattrs.ia_valid |= +- ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; + inode_lock(inode); ++ if (!S_ISDIR(inode->i_mode)) ++ newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV | ++ setattr_should_drop_sgid(inode); + error = security_path_chown(path, uid, gid); + if (!error) + error = notify_change(path->dentry, &newattrs, &delegated_inode); +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -2960,7 +2960,7 @@ extern void __destroy_inode(struct inode + extern struct inode *new_inode_pseudo(struct super_block *sb); + extern struct inode *new_inode(struct super_block *sb); + extern void free_inode_nonrcu(struct inode *inode); +-extern int should_remove_suid(struct dentry *); ++extern int setattr_should_drop_suidgid(struct inode *); + extern int file_remove_privs(struct file *); + + extern void __insert_inode_hash(struct inode *, unsigned long hashval); diff --git a/queue-5.10/drm-i915-active-fix-misuse-of-non-idle-barriers-as-fence-trackers.patch b/queue-5.10/drm-i915-active-fix-misuse-of-non-idle-barriers-as-fence-trackers.patch new file mode 100644 index 00000000000..4e8e834c829 --- /dev/null +++ b/queue-5.10/drm-i915-active-fix-misuse-of-non-idle-barriers-as-fence-trackers.patch @@ -0,0 +1,117 @@ +From e0e6b416b25ee14716f3549e0cbec1011b193809 Mon Sep 17 00:00:00 2001 +From: Janusz Krzysztofik +Date: Thu, 2 Mar 2023 13:08:20 +0100 +Subject: drm/i915/active: Fix misuse of non-idle barriers as fence trackers + +From: Janusz Krzysztofik + +commit e0e6b416b25ee14716f3549e0cbec1011b193809 upstream. + +Users reported oopses on list corruptions when using i915 perf with a +number of concurrently running graphics applications. 
Root cause analysis +pointed at an issue in barrier processing code -- a race among perf open / +close replacing active barriers with perf requests on kernel context and +concurrent barrier preallocate / acquire operations performed during user +context first pin / last unpin. + +When adding a request to a composite tracker, we try to reuse an existing +fence tracker, already allocated and registered with that composite. The +tracker we obtain may already track another fence, may be an idle barrier, +or an active barrier. + +If the tracker we get turns out to be a non-idle barrier then we try to delete that +barrier from a list of barrier tasks it belongs to. However, while doing +that we don't respect the return value from a function that performs the +barrier deletion. Should the deletion ever fail, we would end up reusing +the tracker still registered as a barrier task. Since the same structure +field is reused with both fence callback lists and barrier tasks list, +list corruptions would likely occur. + +Barriers are now deleted from a barrier tasks list by temporarily removing +the list content, traversing that content with skip over the node to be +deleted, then populating the list back with the modified content. Should +those intentionally racy concurrent deletion attempts not be serialized, +one or more of those may fail because of the list being temporarily empty. + +Related code that ignores the results of barrier deletion was initially +introduced in v5.4 by commit d8af05ff38ae ("drm/i915: Allow sharing the +idle-barrier from other kernel requests"). However, all users of the +barrier deletion routine were apparently serialized at that time, so the +issue didn't exhibit itself. Results of git bisect with help of a newly +developed igt@gem_barrier_race@remote-request IGT test indicate that list +corruptions might start to appear after commit 311770173fac ("drm/i915/gt: +Schedule request retirement when timeline idles"), introduced in v5.5. 
+ +Respect results of barrier deletion attempts -- mark the barrier as idle +only if successfully deleted from the list. Then, before proceeding with +setting our fence as the one currently tracked, make sure that the tracker +we've got is not a non-idle barrier. If that check fails then don't use +that tracker but go back and try to acquire a new, usable one. + +v3: use unlikely() to document what outcome we expect (Andi), + - fix bad grammar in commit description. +v2: no code changes, + - blame commit 311770173fac ("drm/i915/gt: Schedule request retirement + when timeline idles"), v5.5, not commit d8af05ff38ae ("drm/i915: Allow + sharing the idle-barrier from other kernel requests"), v5.4, + - reword commit description. + +Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/6333 +Fixes: 311770173fac ("drm/i915/gt: Schedule request retirement when timeline idles") +Cc: Chris Wilson +Cc: stable@vger.kernel.org # v5.5 +Cc: Andi Shyti +Signed-off-by: Janusz Krzysztofik +Reviewed-by: Andi Shyti +Signed-off-by: Andi Shyti +Link: https://patchwork.freedesktop.org/patch/msgid/20230302120820.48740-1-janusz.krzysztofik@linux.intel.com +(cherry picked from commit 506006055769b10d1b2b4e22f636f3b45e0e9fc7) +Signed-off-by: Jani Nikula +Signed-off-by: Janusz Krzysztofik +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/i915/i915_active.c | 26 ++++++++++++++------------ + 1 file changed, 14 insertions(+), 12 deletions(-) + +--- a/drivers/gpu/drm/i915/i915_active.c ++++ b/drivers/gpu/drm/i915/i915_active.c +@@ -432,8 +432,7 @@ replace_barrier(struct i915_active *ref, + * we can use it to substitute for the pending idle-barrer + * request that we want to emit on the kernel_context. 
+ */ +- __active_del_barrier(ref, node_from_active(active)); +- return true; ++ return __active_del_barrier(ref, node_from_active(active)); + } + + int i915_active_ref(struct i915_active *ref, u64 idx, struct dma_fence *fence) +@@ -446,16 +445,19 @@ int i915_active_ref(struct i915_active * + if (err) + return err; + +- active = active_instance(ref, idx); +- if (!active) { +- err = -ENOMEM; +- goto out; +- } +- +- if (replace_barrier(ref, active)) { +- RCU_INIT_POINTER(active->fence, NULL); +- atomic_dec(&ref->count); +- } ++ do { ++ active = active_instance(ref, idx); ++ if (!active) { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ if (replace_barrier(ref, active)) { ++ RCU_INIT_POINTER(active->fence, NULL); ++ atomic_dec(&ref->count); ++ } ++ } while (unlikely(is_barrier(active))); ++ + if (!__i915_active_fence_set(active, fence)) + __i915_active_acquire(ref); + diff --git a/queue-5.10/drm-i915-don-t-use-stolen-memory-for-ring-buffers-with-llc.patch b/queue-5.10/drm-i915-don-t-use-stolen-memory-for-ring-buffers-with-llc.patch new file mode 100644 index 00000000000..7525ce24050 --- /dev/null +++ b/queue-5.10/drm-i915-don-t-use-stolen-memory-for-ring-buffers-with-llc.patch @@ -0,0 +1,48 @@ +From 690e0ec8e63da9a29b39fedc6ed5da09c7c82651 Mon Sep 17 00:00:00 2001 +From: John Harrison +Date: Wed, 15 Feb 2023 17:11:00 -0800 +Subject: drm/i915: Don't use stolen memory for ring buffers with LLC +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: John Harrison + +commit 690e0ec8e63da9a29b39fedc6ed5da09c7c82651 upstream. + +Direction from hardware is that stolen memory should never be used for +ring buffer allocations on platforms with LLC. There are too many +caching pitfalls due to the way stolen memory accesses are routed. So +it is safest to just not use it. 
+ +Signed-off-by: John Harrison +Fixes: c58b735fc762 ("drm/i915: Allocate rings from stolen") +Cc: Chris Wilson +Cc: Joonas Lahtinen +Cc: Jani Nikula +Cc: Rodrigo Vivi +Cc: Tvrtko Ursulin +Cc: intel-gfx@lists.freedesktop.org +Cc: # v4.9+ +Tested-by: Jouni Högander +Reviewed-by: Daniele Ceraolo Spurio +Link: https://patchwork.freedesktop.org/patch/msgid/20230216011101.1909009-2-John.C.Harrison@Intel.com +(cherry picked from commit f54c1f6c697c4297f7ed94283c184acc338a5cf8) +Signed-off-by: Jani Nikula +Signed-off-by: John Harrison +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/i915/gt/intel_ring.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/gpu/drm/i915/gt/intel_ring.c ++++ b/drivers/gpu/drm/i915/gt/intel_ring.c +@@ -108,7 +108,7 @@ static struct i915_vma *create_ring_vma( + struct i915_vma *vma; + + obj = ERR_PTR(-ENODEV); +- if (i915_ggtt_has_aperture(ggtt)) ++ if (i915_ggtt_has_aperture(ggtt) && !HAS_LLC(i915)) + obj = i915_gem_object_create_stolen(i915, size); + if (IS_ERR(obj)) + obj = i915_gem_object_create_internal(i915, size); diff --git a/queue-5.10/fs-add-mode_strip_sgid-helper.patch b/queue-5.10/fs-add-mode_strip_sgid-helper.patch new file mode 100644 index 00000000000..8f448a37b0c --- /dev/null +++ b/queue-5.10/fs-add-mode_strip_sgid-helper.patch @@ -0,0 +1,97 @@ +From stable-owner@vger.kernel.org Sat Mar 18 11:16:11 2023 +From: Amir Goldstein +Date: Sat, 18 Mar 2023 12:15:22 +0200 +Subject: fs: add mode_strip_sgid() helper +To: Greg Kroah-Hartman +Cc: Sasha Levin , "Darrick J . Wong" , Leah Rumancik , Chandan Babu R , Christian Brauner , linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Yang Xu , Jeff Layton +Message-ID: <20230318101529.1361673-9-amir73il@gmail.com> + +From: Yang Xu + +commit 2b3416ceff5e6bd4922f6d1c61fb68113dd82302 upstream. 
+ +[remove userns argument of helper for 5.10.y backport] + +Add a dedicated helper to handle the setgid bit when creating a new file +in a setgid directory. This is a preparatory patch for moving setgid +stripping into the vfs. The patch contains no functional changes. + +Currently the setgid stripping logic is open-coded directly in +inode_init_owner() and the individual filesystems are responsible for +handling setgid inheritance. Since this has proven to be brittle as +evidenced by old issues we uncovered over the last months (see [1] to +[3] below) we will try to move this logic into the vfs. + +Link: e014f37db1a2 ("xfs: use setattr_copy to set vfs inode attributes") [1] +Link: 01ea173e103e ("xfs: fix up non-directory creation in SGID directories") [2] +Link: fd84bfdddd16 ("ceph: fix up non-directory creation in SGID directories") [3] +Link: https://lore.kernel.org/r/1657779088-2242-1-git-send-email-xuyang2018.jy@fujitsu.com +Reviewed-by: Darrick J. Wong +Reviewed-by: Christian Brauner (Microsoft) +Reviewed-and-Tested-by: Jeff Layton +Signed-off-by: Yang Xu +Signed-off-by: Christian Brauner (Microsoft) +Signed-off-by: Amir Goldstein +Signed-off-by: Greg Kroah-Hartman +--- + fs/inode.c | 34 ++++++++++++++++++++++++++++++---- + include/linux/fs.h | 1 + + 2 files changed, 31 insertions(+), 4 deletions(-) + +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -2147,10 +2147,8 @@ void inode_init_owner(struct inode *inod + /* Directories are special, and always inherit S_ISGID */ + if (S_ISDIR(mode)) + mode |= S_ISGID; +- else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) && +- !in_group_p(inode->i_gid) && +- !capable_wrt_inode_uidgid(dir, CAP_FSETID)) +- mode &= ~S_ISGID; ++ else ++ mode = mode_strip_sgid(dir, mode); + } else + inode->i_gid = current_fsgid(); + inode->i_mode = mode; +@@ -2382,3 +2380,31 @@ int vfs_ioc_fssetxattr_check(struct inod + return 0; + } + EXPORT_SYMBOL(vfs_ioc_fssetxattr_check); ++ ++/** ++ * mode_strip_sgid - handle the sgid bit for 
non-directories ++ * @dir: parent directory inode ++ * @mode: mode of the file to be created in @dir ++ * ++ * If the @mode of the new file has both the S_ISGID and S_IXGRP bit ++ * raised and @dir has the S_ISGID bit raised ensure that the caller is ++ * either in the group of the parent directory or they have CAP_FSETID ++ * in their user namespace and are privileged over the parent directory. ++ * In all other cases, strip the S_ISGID bit from @mode. ++ * ++ * Return: the new mode to use for the file ++ */ ++umode_t mode_strip_sgid(const struct inode *dir, umode_t mode) ++{ ++ if ((mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP)) ++ return mode; ++ if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID)) ++ return mode; ++ if (in_group_p(dir->i_gid)) ++ return mode; ++ if (capable_wrt_inode_uidgid(dir, CAP_FSETID)) ++ return mode; ++ ++ return mode & ~S_ISGID; ++} ++EXPORT_SYMBOL(mode_strip_sgid); +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -1768,6 +1768,7 @@ extern long compat_ptr_ioctl(struct file + extern void inode_init_owner(struct inode *inode, const struct inode *dir, + umode_t mode); + extern bool may_open_dev(const struct path *path); ++umode_t mode_strip_sgid(const struct inode *dir, umode_t mode); + + /* + * This is the "filldir" function type, used by readdir() to let diff --git a/queue-5.10/fs-move-s_isgid-stripping-into-the-vfs_-helpers.patch b/queue-5.10/fs-move-s_isgid-stripping-into-the-vfs_-helpers.patch new file mode 100644 index 00000000000..fee4a139c09 --- /dev/null +++ b/queue-5.10/fs-move-s_isgid-stripping-into-the-vfs_-helpers.patch @@ -0,0 +1,346 @@ +From stable-owner@vger.kernel.org Sat Mar 18 11:16:10 2023 +From: Amir Goldstein +Date: Sat, 18 Mar 2023 12:15:23 +0200 +Subject: fs: move S_ISGID stripping into the vfs_*() helpers +To: Greg Kroah-Hartman +Cc: Sasha Levin , "Darrick J . 
Wong" , Leah Rumancik , Chandan Babu R , Christian Brauner , linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Yang Xu , Dave Chinner , Jeff Layton +Message-ID: <20230318101529.1361673-10-amir73il@gmail.com> + +From: Yang Xu + +commit 1639a49ccdce58ea248841ed9b23babcce6dbb0b upstream. + +[remove userns argument of helpers for 5.10.y backport] + +Move setgid handling out of individual filesystems and into the VFS +itself to stop the proliferation of setgid inheritance bugs. + +Creating files that have both the S_IXGRP and S_ISGID bit raised in +directories that themselves have the S_ISGID bit set requires additional +privileges to avoid security issues. + +When a filesystem creates a new inode it needs to take care that the +caller is either in the group of the newly created inode or they have +CAP_FSETID in their current user namespace and are privileged over the +parent directory of the new inode. If either of these two conditions is +true then the S_ISGID bit can be raised for an S_IXGRP file and if not +it needs to be stripped. + +However, there are several key issues with the current implementation: + +* S_ISGID stripping logic is entangled with umask stripping. + + If a filesystem doesn't support or enable POSIX ACLs then umask + stripping is done directly in the vfs before calling into the + filesystem. + If the filesystem does support POSIX ACLs then umask stripping may be + done in the filesystem itself when calling posix_acl_create(). + + Since umask stripping has an effect on S_ISGID inheritance, e.g., by + stripping the S_IXGRP bit from the file to be created and all relevant + filesystems have to call posix_acl_create() before inode_init_owner() + where we currently take care of S_ISGID handling, S_ISGID handling is + order dependent. IOW, whether or not you get a setgid bit depends on + POSIX ACLs and umask and in what order they are called. 
+ + Note that technically filesystems are free to impose their own + ordering between posix_acl_create() and inode_init_owner() meaning + that there's additional ordering issues that influence S_ISGID + inheritance. + +* Filesystems that don't rely on inode_init_owner() don't get S_ISGID + stripping logic. + + While that may be intentional (e.g. network filesystems might just + defer setgid stripping to a server) it is often just a security issue. + +This is not just ugly it's unsustainably messy especially since we do +still have bugs in this area years after the initial round of setgid +bugfixes. + +So the current state is quite messy and while we won't be able to make +it completely clean as posix_acl_create() is still a filesystem specific +call we can improve the S_ISGID stripping situation quite a bit by +hoisting it out of inode_init_owner() and into the vfs creation +operations. This means we alleviate the burden for filesystems to handle +S_ISGID stripping correctly and can standardize the ordering between +S_ISGID and umask stripping in the vfs. + +We add a new helper vfs_prepare_mode() so S_ISGID handling is now done +in the VFS before umask handling. This way S_ISGID handling is +unaffected by whether umask stripping is done by the VFS +itself (if no POSIX ACLs are supported or enabled) or in the filesystem +in posix_acl_create() (if POSIX ACLs are supported). + +The vfs_prepare_mode() helper is called directly in vfs_*() helpers that +create new filesystem objects. We need to move them into there to make +sure that filesystems like overlayfs that have callchains like: + +sys_mknod() +-> do_mknodat(mode) + -> .mknod = ovl_mknod(mode) + -> ovl_create(mode) + -> vfs_mknod(mode) + +get S_ISGID stripping done when calling into lower filesystems via +vfs_*() creation helpers. Moving vfs_prepare_mode() into e.g. +vfs_mknod() takes care of that. This is in any case semantically cleaner +because S_ISGID stripping is a VFS security requirement.
+ +Security hooks so far have seen the mode with the umask applied but +without S_ISGID handling done. The relevant hooks are called outside of +vfs_*() creation helpers so by calling vfs_prepare_mode() from vfs_*() +helpers the security hooks would now see the mode without umask +stripping applied. For now we fix this by passing the mode with umask +settings applied to not risk any regressions for LSM hooks. IOW, nothing +changes for LSM hooks. It is worth pointing out that security hooks +never saw the mode that is seen by the filesystem when actually creating +the file. They have always been completely misplaced for that to work. + +The following filesystems use inode_init_owner() and thus relied on +S_ISGID stripping: spufs, 9p, bfs, btrfs, ext2, ext4, f2fs, hfsplus, +hugetlbfs, jfs, minix, nilfs2, ntfs3, ocfs2, omfs, overlayfs, ramfs, +reiserfs, sysv, ubifs, udf, ufs, xfs, zonefs, bpf, tmpfs. + +All of the above filesystems end up calling inode_init_owner() when new +filesystem objects are created through the ->mkdir(), ->mknod(), +->create(), ->tmpfile(), ->rename() inode operations. + +Since directories always inherit the S_ISGID bit with the exception of +xfs when irix_sgid_inherit mode is turned on S_ISGID stripping doesn't +apply. The ->symlink() and ->link() inode operations trivially inherit +the mode from the target and the ->rename() inode operation inherits the +mode from the source inode. All other creation inode operations will get +S_ISGID handling via vfs_prepare_mode() when called from their relevant +vfs_*() helpers. + +In addition to this there are filesystems which allow the creation of +filesystem objects through ioctl()s or - in the case of spufs - +circumventing the vfs in other ways. If filesystem objects are created +through ioctl()s the vfs doesn't know about it and can't apply regular +permission checking including S_ISGID logic. 
Therefore, a filesystem +relying on S_ISGID stripping in inode_init_owner() in their ioctl() +callpath will be affected by moving this logic into the vfs. We audited +those filesystems: + +* btrfs allows the creation of filesystem objects through various + ioctls(). Snapshot creation literally takes a snapshot and so the mode + is fully preserved and S_ISGID stripping doesn't apply. + + Creating a new subvolume relies on inode_init_owner() in + btrfs_new_subvol_inode() but only creates directories and doesn't + raise S_ISGID. + +* ocfs2 has a peculiar implementation of reflinks. In contrast to e.g. + xfs and btrfs FICLONE/FICLONERANGE ioctl() that is only concerned with + the actual extents ocfs2 uses a separate ioctl() that also creates the + target file. + + Iow, ocfs2 circumvents the vfs entirely here and did indeed rely on + inode_init_owner() to strip the S_ISGID bit. This is the only place + where a filesystem needs to call mode_strip_sgid() directly but this + is self-inflicted pain. + +* spufs doesn't go through the vfs at all and doesn't use ioctl()s + either. Instead it has a dedicated system call spufs_create() which + allows the creation of filesystem objects. But spufs only creates + directories and doesn't allow S_ISGID bits, i.e. it specifically only + allows 0777 bits. + +* bpf uses vfs_mkobj() but also doesn't allow S_ISGID bits to be created. + +The patch will have an effect on ext2 when the EXT2_MOUNT_GRPID mount +option is used, on ext4 when the EXT4_MOUNT_GRPID mount option is used, +and on xfs when the XFS_FEAT_GRPID mount option is used. When any of +these filesystems are mounted with their respective GRPID option then +newly created files inherit the parent directory's group +unconditionally. In these cases none of the filesystems call +inode_init_owner() and thus did never strip the S_ISGID bit for newly +created files. Moving this logic into the VFS means that they now get +the S_ISGID bit stripped. This is a user visible change.
If this leads +to regressions we will either need to figure out a better way or we need +to revert. However, given the various setgid bugs that we found just in +the last two years this is a regression risk we should take. + +Associated with this change is a new set of fstests to enforce the +semantics for all new filesystems. + +Link: https://lore.kernel.org/ceph-devel/20220427092201.wvsdjbnc7b4dttaw@wittgenstein [1] +Link: e014f37db1a2 ("xfs: use setattr_copy to set vfs inode attributes") [2] +Link: 01ea173e103e ("xfs: fix up non-directory creation in SGID directories") [3] +Link: fd84bfdddd16 ("ceph: fix up non-directory creation in SGID directories") [4] +Link: https://lore.kernel.org/r/1657779088-2242-3-git-send-email-xuyang2018.jy@fujitsu.com +Suggested-by: Dave Chinner +Suggested-by: Christian Brauner (Microsoft) +Reviewed-by: Darrick J. Wong +Reviewed-and-Tested-by: Jeff Layton +Signed-off-by: Yang Xu +[: rewrote commit message] +Signed-off-by: Christian Brauner (Microsoft) +Signed-off-by: Amir Goldstein +Signed-off-by: Greg Kroah-Hartman +--- + fs/inode.c | 2 - + fs/namei.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++--------- + fs/ocfs2/namei.c | 1 + 3 files changed, 68 insertions(+), 15 deletions(-) + +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -2147,8 +2147,6 @@ void inode_init_owner(struct inode *inod + /* Directories are special, and always inherit S_ISGID */ + if (S_ISDIR(mode)) + mode |= S_ISGID; +- else +- mode = mode_strip_sgid(dir, mode); + } else + inode->i_gid = current_fsgid(); + inode->i_mode = mode; +--- a/fs/namei.c ++++ b/fs/namei.c +@@ -2798,6 +2798,63 @@ void unlock_rename(struct dentry *p1, st + } + EXPORT_SYMBOL(unlock_rename); + ++/** ++ * mode_strip_umask - handle vfs umask stripping ++ * @dir: parent directory of the new inode ++ * @mode: mode of the new inode to be created in @dir ++ * ++ * Umask stripping depends on whether or not the filesystem supports POSIX ++ * ACLs. 
If the filesystem doesn't support it umask stripping is done directly ++ * in here. If the filesystem does support POSIX ACLs umask stripping is ++ * deferred until the filesystem calls posix_acl_create(). ++ * ++ * Returns: mode ++ */ ++static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode) ++{ ++ if (!IS_POSIXACL(dir)) ++ mode &= ~current_umask(); ++ return mode; ++} ++ ++/** ++ * vfs_prepare_mode - prepare the mode to be used for a new inode ++ * @dir: parent directory of the new inode ++ * @mode: mode of the new inode ++ * @mask_perms: allowed permission by the vfs ++ * @type: type of file to be created ++ * ++ * This helper consolidates and enforces vfs restrictions on the @mode of a new ++ * object to be created. ++ * ++ * Umask stripping depends on whether the filesystem supports POSIX ACLs (see ++ * the kernel documentation for mode_strip_umask()). Moving umask stripping ++ * after setgid stripping allows the same ordering for both non-POSIX ACL and ++ * POSIX ACL supporting filesystems. ++ * ++ * Note that it's currently valid for @type to be 0 if a directory is created. ++ * Filesystems raise that flag individually and we need to check whether each ++ * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a ++ * non-zero type. ++ * ++ * Returns: mode to be passed to the filesystem ++ */ ++static inline umode_t vfs_prepare_mode(const struct inode *dir, umode_t mode, ++ umode_t mask_perms, umode_t type) ++{ ++ mode = mode_strip_sgid(dir, mode); ++ mode = mode_strip_umask(dir, mode); ++ ++ /* ++ * Apply the vfs mandated allowed permission mask and set the type of ++ * file to be created before we call into the filesystem. 
++ */ ++ mode &= (mask_perms & ~S_IFMT); ++ mode |= (type & S_IFMT); ++ ++ return mode; ++} ++ + int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool want_excl) + { +@@ -2807,8 +2864,8 @@ int vfs_create(struct inode *dir, struct + + if (!dir->i_op->create) + return -EACCES; /* shouldn't it be ENOSYS? */ +- mode &= S_IALLUGO; +- mode |= S_IFREG; ++ ++ mode = vfs_prepare_mode(dir, mode, S_IALLUGO, S_IFREG); + error = security_inode_create(dir, dentry, mode); + if (error) + return error; +@@ -3072,8 +3129,7 @@ static struct dentry *lookup_open(struct + if (open_flag & O_CREAT) { + if (open_flag & O_EXCL) + open_flag &= ~O_TRUNC; +- if (!IS_POSIXACL(dir->d_inode)) +- mode &= ~current_umask(); ++ mode = vfs_prepare_mode(dir->d_inode, mode, mode, mode); + if (likely(got_write)) + create_error = may_o_create(&nd->path, dentry, mode); + else +@@ -3286,8 +3342,7 @@ struct dentry *vfs_tmpfile(struct dentry + child = d_alloc(dentry, &slash_name); + if (unlikely(!child)) + goto out_err; +- if (!IS_POSIXACL(dir)) +- mode &= ~current_umask(); ++ mode = vfs_prepare_mode(dir, mode, mode, mode); + error = dir->i_op->tmpfile(dir, child, mode); + if (error) + goto out_err; +@@ -3548,6 +3603,7 @@ int vfs_mknod(struct inode *dir, struct + if (!dir->i_op->mknod) + return -EPERM; + ++ mode = vfs_prepare_mode(dir, mode, mode, mode); + error = devcgroup_inode_mknod(mode, dev); + if (error) + return error; +@@ -3596,9 +3652,8 @@ retry: + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + +- if (!IS_POSIXACL(path.dentry->d_inode)) +- mode &= ~current_umask(); +- error = security_path_mknod(&path, dentry, mode, dev); ++ error = security_path_mknod(&path, dentry, ++ mode_strip_umask(path.dentry->d_inode, mode), dev); + if (error) + goto out; + switch (mode & S_IFMT) { +@@ -3646,7 +3701,7 @@ int vfs_mkdir(struct inode *dir, struct + if (!dir->i_op->mkdir) + return -EPERM; + +- mode &= (S_IRWXUGO|S_ISVTX); ++ mode = vfs_prepare_mode(dir, mode, S_IRWXUGO | S_ISVTX, 0); + 
error = security_inode_mkdir(dir, dentry, mode); + if (error) + return error; +@@ -3673,9 +3728,8 @@ retry: + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + +- if (!IS_POSIXACL(path.dentry->d_inode)) +- mode &= ~current_umask(); +- error = security_path_mkdir(&path, dentry, mode); ++ error = security_path_mkdir(&path, dentry, ++ mode_strip_umask(path.dentry->d_inode, mode)); + if (!error) + error = vfs_mkdir(path.dentry->d_inode, dentry, mode); + done_path_create(&path, dentry); +--- a/fs/ocfs2/namei.c ++++ b/fs/ocfs2/namei.c +@@ -198,6 +198,7 @@ static struct inode *ocfs2_get_init_inod + * callers. */ + if (S_ISDIR(mode)) + set_nlink(inode, 2); ++ mode = mode_strip_sgid(dir, mode); + inode_init_owner(inode, dir, mode); + status = dquot_initialize(inode); + if (status) diff --git a/queue-5.10/fs-move-should_remove_suid.patch b/queue-5.10/fs-move-should_remove_suid.patch new file mode 100644 index 00000000000..fbd9766a810 --- /dev/null +++ b/queue-5.10/fs-move-should_remove_suid.patch @@ -0,0 +1,102 @@ +From stable-owner@vger.kernel.org Sat Mar 18 11:16:15 2023 +From: Amir Goldstein +Date: Sat, 18 Mar 2023 12:15:25 +0200 +Subject: fs: move should_remove_suid() +To: Greg Kroah-Hartman +Cc: Sasha Levin , "Darrick J . Wong" , Leah Rumancik , Chandan Babu R , Christian Brauner , linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org +Message-ID: <20230318101529.1361673-12-amir73il@gmail.com> + +From: Amir Goldstein + +commit e243e3f94c804ecca9a8241b5babe28f35258ef4 upstream. + +Move the helper from inode.c to attr.c. This keeps the core of the +set{g,u}id stripping logic in one place when we add follow-up changes. +It is the better place anyway, since should_remove_suid() returns +ATTR_KILL_S{G,U}ID flags.
+ +Reviewed-by: Amir Goldstein +Signed-off-by: Christian Brauner (Microsoft) +Signed-off-by: Amir Goldstein +Signed-off-by: Greg Kroah-Hartman +--- + fs/attr.c | 29 +++++++++++++++++++++++++++++ + fs/inode.c | 29 ----------------------------- + 2 files changed, 29 insertions(+), 29 deletions(-) + +--- a/fs/attr.c ++++ b/fs/attr.c +@@ -20,6 +20,35 @@ + + #include "internal.h" + ++/* ++ * The logic we want is ++ * ++ * if suid or (sgid and xgrp) ++ * remove privs ++ */ ++int should_remove_suid(struct dentry *dentry) ++{ ++ umode_t mode = d_inode(dentry)->i_mode; ++ int kill = 0; ++ ++ /* suid always must be killed */ ++ if (unlikely(mode & S_ISUID)) ++ kill = ATTR_KILL_SUID; ++ ++ /* ++ * sgid without any exec bits is just a mandatory locking mark; leave ++ * it alone. If some exec bits are set, it's a real sgid; kill it. ++ */ ++ if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) ++ kill |= ATTR_KILL_SGID; ++ ++ if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) ++ return kill; ++ ++ return 0; ++} ++EXPORT_SYMBOL(should_remove_suid); ++ + static bool chown_ok(const struct inode *inode, kuid_t uid) + { + if (uid_eq(current_fsuid(), inode->i_uid) && +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -1855,35 +1855,6 @@ skip_update: + EXPORT_SYMBOL(touch_atime); + + /* +- * The logic we want is +- * +- * if suid or (sgid and xgrp) +- * remove privs +- */ +-int should_remove_suid(struct dentry *dentry) +-{ +- umode_t mode = d_inode(dentry)->i_mode; +- int kill = 0; +- +- /* suid always must be killed */ +- if (unlikely(mode & S_ISUID)) +- kill = ATTR_KILL_SUID; +- +- /* +- * sgid without any exec bits is just a mandatory locking mark; leave +- * it alone. If some exec bits are set, it's a real sgid; kill it. 
+- */ +- if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) +- kill |= ATTR_KILL_SGID; +- +- if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) +- return kill; +- +- return 0; +-} +-EXPORT_SYMBOL(should_remove_suid); +- +-/* + * Return mask of changes for notify_change() that need to be done as a + * response to write or truncate. Return 0 if nothing has to be changed. + * Negative value on error (change should be denied). diff --git a/queue-5.10/fs-use-consistent-setgid-checks-in-is_sxid.patch b/queue-5.10/fs-use-consistent-setgid-checks-in-is_sxid.patch new file mode 100644 index 00000000000..45513da74c2 --- /dev/null +++ b/queue-5.10/fs-use-consistent-setgid-checks-in-is_sxid.patch @@ -0,0 +1,42 @@ +From stable-owner@vger.kernel.org Sat Mar 18 11:16:17 2023 +From: Amir Goldstein +Date: Sat, 18 Mar 2023 12:15:28 +0200 +Subject: fs: use consistent setgid checks in is_sxid() +To: Greg Kroah-Hartman +Cc: Sasha Levin , "Darrick J . Wong" , Leah Rumancik , Chandan Babu R , Christian Brauner , linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Miklos Szeredi +Message-ID: <20230318101529.1361673-15-amir73il@gmail.com> + +From: Christian Brauner + +commit 8d84e39d76bd83474b26cb44f4b338635676e7e8 upstream. + +Now that we made the VFS setgid checking consistent an inode can't be +marked security irrelevant even if the setgid bit is still set. Make +this function consistent with all other helpers. + +Note that enforcing consistent setgid stripping checks for file +modification and mode- and ownership changes will cause the setgid bit +to be lost in more cases than used to be the case. If an unprivileged +user wrote to a non-executable setgid file that they don't have +privilege over the setgid bit will be dropped. This will lead to +temporary failures in some xfstests until they have been updated.
+ +Reported-by: Miklos Szeredi +Signed-off-by: Christian Brauner (Microsoft) +Signed-off-by: Amir Goldstein +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/fs.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -3408,7 +3408,7 @@ int __init get_filesystem_list(char *buf + + static inline bool is_sxid(umode_t mode) + { +- return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP)); ++ return mode & (S_ISUID | S_ISGID); + } + + static inline int check_sticky(struct inode *dir, struct inode *inode) diff --git a/queue-5.10/io_uring-avoid-null-ptr-deref-in-io_arm_poll_handler.patch b/queue-5.10/io_uring-avoid-null-ptr-deref-in-io_arm_poll_handler.patch new file mode 100644 index 00000000000..7772ffea7b1 --- /dev/null +++ b/queue-5.10/io_uring-avoid-null-ptr-deref-in-io_arm_poll_handler.patch @@ -0,0 +1,45 @@ +From pchelkin@ispras.ru Mon Mar 20 14:30:30 2023 +From: Fedor Pchelkin +Date: Thu, 16 Mar 2023 21:56:16 +0300 +Subject: io_uring: avoid null-ptr-deref in io_arm_poll_handler +To: Jens Axboe , Greg Kroah-Hartman , stable@vger.kernel.org +Cc: Fedor Pchelkin , linux-kernel@vger.kernel.org, Alexey Khoroshilov , lvc-project@linuxtesting.org +Message-ID: <20230316185616.271024-1-pchelkin@ispras.ru> + +From: Fedor Pchelkin + +No upstream commit exists for this commit. + +The issue was introduced with backporting upstream commit c16bda37594f +("io_uring/poll: allow some retries for poll triggering spuriously"). + +Memory allocation can possibly fail causing invalid pointer be +dereferenced just before comparing it to NULL value. + +Move the pointer check in proper place (upstream has the similar location +of the check). In case the request has REQ_F_POLLED flag up, apoll can't +be NULL so no need to check there. + +Found by Linux Verification Center (linuxtesting.org) with Syzkaller. 
+ +Signed-off-by: Fedor Pchelkin +Signed-off-by: Greg Kroah-Hartman +--- + io_uring/io_uring.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -5792,10 +5792,10 @@ static int io_arm_poll_handler(struct io + } + } else { + apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); ++ if (unlikely(!apoll)) ++ return IO_APOLL_ABORTED; + apoll->poll.retries = APOLL_MAX_RETRY; + } +- if (unlikely(!apoll)) +- return IO_APOLL_ABORTED; + apoll->double_poll = NULL; + req->apoll = apoll; + req->flags |= REQ_F_POLLED; diff --git a/queue-5.10/pci-dpc-await-readiness-of-secondary-bus-after-reset.patch b/queue-5.10/pci-dpc-await-readiness-of-secondary-bus-after-reset.patch new file mode 100644 index 00000000000..890a660c078 --- /dev/null +++ b/queue-5.10/pci-dpc-await-readiness-of-secondary-bus-after-reset.patch @@ -0,0 +1,81 @@ +From 53b54ad074de1896f8b021615f65b27f557ce874 Mon Sep 17 00:00:00 2001 +From: Lukas Wunner +Date: Sun, 15 Jan 2023 09:20:33 +0100 +Subject: PCI/DPC: Await readiness of secondary bus after reset + +From: Lukas Wunner + +commit 53b54ad074de1896f8b021615f65b27f557ce874 upstream. + +pci_bridge_wait_for_secondary_bus() is called after a Secondary Bus +Reset, but not after a DPC-induced Hot Reset. + +As a result, the delays prescribed by PCIe r6.0 sec 6.6.1 are not +observed and devices on the secondary bus may be accessed before +they're ready. + +One affected device is Intel's Ponte Vecchio HPC GPU. It comprises a +PCIe switch whose upstream port is not immediately ready after reset. 
+Because its config space is restored too early, it remains in +D0uninitialized, its subordinate devices remain inaccessible and DPC +recovery fails with messages such as: + + i915 0000:8c:00.0: can't change power state from D3cold to D0 (config space inaccessible) + intel_vsec 0000:8e:00.1: can't change power state from D3cold to D0 (config space inaccessible) + pcieport 0000:89:02.0: AER: device recovery failed + +Fix it. + +Link: https://lore.kernel.org/r/9f5ff00e1593d8d9a4b452398b98aa14d23fca11.1673769517.git.lukas@wunner.de +Tested-by: Ravi Kishore Koppuravuri +Signed-off-by: Lukas Wunner +Signed-off-by: Bjorn Helgaas +Reviewed-by: Mika Westerberg +Cc: stable@vger.kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + drivers/pci/pci.c | 3 --- + drivers/pci/pci.h | 6 ++++++ + drivers/pci/pcie/dpc.c | 4 ++-- + 3 files changed, 8 insertions(+), 5 deletions(-) + +--- a/drivers/pci/pci.c ++++ b/drivers/pci/pci.c +@@ -157,9 +157,6 @@ static int __init pcie_port_pm_setup(cha + } + __setup("pcie_port_pm=", pcie_port_pm_setup); + +-/* Time to wait after a reset for device to become responsive */ +-#define PCIE_RESET_READY_POLL_MS 60000 +- + /** + * pci_bus_max_busnr - returns maximum PCI bus number of given bus' children + * @bus: pointer to PCI bus structure to search +--- a/drivers/pci/pci.h ++++ b/drivers/pci/pci.h +@@ -53,6 +53,12 @@ int pci_bus_error_reset(struct pci_dev * + * Reset (PCIe r6.0 sec 5.8). + */ + #define PCI_RESET_WAIT 1000 /* msec */ ++/* ++ * Devices may extend the 1 sec period through Request Retry Status completions ++ * (PCIe r6.0 sec 2.3.1). The spec does not provide an upper limit, but 60 sec ++ * ought to be enough for any device to become responsive. 
++ */ ++#define PCIE_RESET_READY_POLL_MS 60000 /* msec */ + + /** + * struct pci_platform_pm_ops - Firmware PM callbacks +--- a/drivers/pci/pcie/dpc.c ++++ b/drivers/pci/pcie/dpc.c +@@ -170,8 +170,8 @@ pci_ers_result_t dpc_reset_link(struct p + pci_write_config_word(pdev, cap + PCI_EXP_DPC_STATUS, + PCI_EXP_DPC_STATUS_TRIGGER); + +- if (!pcie_wait_for_link(pdev, true)) { +- pci_info(pdev, "Data Link Layer Link Active not set in 1000 msec\n"); ++ if (pci_bridge_wait_for_secondary_bus(pdev, "DPC", ++ PCIE_RESET_READY_POLL_MS)) { + clear_bit(PCI_DPC_RECOVERED, &pdev->priv_flags); + ret = PCI_ERS_RESULT_DISCONNECT; + } else { diff --git a/queue-5.10/pci-unify-delay-handling-for-reset-and-resume.patch b/queue-5.10/pci-unify-delay-handling-for-reset-and-resume.patch new file mode 100644 index 00000000000..b3cd0c3c4cb --- /dev/null +++ b/queue-5.10/pci-unify-delay-handling-for-reset-and-resume.patch @@ -0,0 +1,257 @@ +From ac91e6980563ed53afadd925fa6585ffd2bc4a2c Mon Sep 17 00:00:00 2001 +From: Lukas Wunner +Date: Sun, 15 Jan 2023 09:20:32 +0100 +Subject: PCI: Unify delay handling for reset and resume + +From: Lukas Wunner + +commit ac91e6980563ed53afadd925fa6585ffd2bc4a2c upstream. + +Sheng Bi reports that pci_bridge_secondary_bus_reset() may fail to wait +for devices on the secondary bus to become accessible after reset: + +Although it does call pci_dev_wait(), it erroneously passes the bridge's +pci_dev rather than that of a child. The bridge of course is always +accessible while its secondary bus is reset, so pci_dev_wait() returns +immediately. + +Sheng Bi proposes introducing a new pci_bridge_secondary_bus_wait() +function which is called from pci_bridge_secondary_bus_reset(): + +https://lore.kernel.org/linux-pci/20220523171517.32407-1-windy.bi.enflame@gmail.com/ + +However we already have pci_bridge_wait_for_secondary_bus() which does +almost exactly what we need. 
So far it's only called on resume from +D3cold (which implies a Fundamental Reset per PCIe r6.0 sec 5.8). +Re-using it for Secondary Bus Resets is a leaner and more rational +approach than introducing a new function. + +That only requires a few minor tweaks: + +- Amend pci_bridge_wait_for_secondary_bus() to await accessibility of + the first device on the secondary bus by calling pci_dev_wait() after + performing the prescribed delays. pci_dev_wait() needs two parameters, + a reset reason and a timeout, which callers must now pass to + pci_bridge_wait_for_secondary_bus(). The timeout is 1 sec for resume + (PCIe r6.0 sec 6.6.1) and 60 sec for reset (commit 821cdad5c46c ("PCI: + Wait up to 60 seconds for device to become ready after FLR")). + Introduce a PCI_RESET_WAIT macro for the 1 sec timeout. + +- Amend pci_bridge_wait_for_secondary_bus() to return 0 on success or + -ENOTTY on error for consumption by pci_bridge_secondary_bus_reset(). + +- Drop an unnecessary 1 sec delay from pci_reset_secondary_bus() which + is now performed by pci_bridge_wait_for_secondary_bus(). A static + delay this long is only necessary for Conventional PCI, so modern + PCIe systems benefit from shorter reset times as a side effect. 
+ +Fixes: 6b2f1351af56 ("PCI: Wait for device to become ready after secondary bus reset") +Link: https://lore.kernel.org/r/da77c92796b99ec568bd070cbe4725074a117038.1673769517.git.lukas@wunner.de +Reported-by: Sheng Bi +Tested-by: Ravi Kishore Koppuravuri +Signed-off-by: Lukas Wunner +Signed-off-by: Bjorn Helgaas +Reviewed-by: Mika Westerberg +Reviewed-by: Kuppuswamy Sathyanarayanan +Cc: stable@vger.kernel.org # v4.17+ +Signed-off-by: Greg Kroah-Hartman +--- + drivers/pci/pci-driver.c | 4 +-- + drivers/pci/pci.c | 54 ++++++++++++++++++++--------------------------- + drivers/pci/pci.h | 10 +++++++- + 3 files changed, 35 insertions(+), 33 deletions(-) + +--- a/drivers/pci/pci-driver.c ++++ b/drivers/pci/pci-driver.c +@@ -911,7 +911,7 @@ static int pci_pm_resume_noirq(struct de + pcie_pme_root_status_cleanup(pci_dev); + + if (!skip_bus_pm && prev_state == PCI_D3cold) +- pci_bridge_wait_for_secondary_bus(pci_dev); ++ pci_bridge_wait_for_secondary_bus(pci_dev, "resume", PCI_RESET_WAIT); + + if (pci_has_legacy_pm_support(pci_dev)) + return 0; +@@ -1298,7 +1298,7 @@ static int pci_pm_runtime_resume(struct + pci_pm_default_resume(pci_dev); + + if (prev_state == PCI_D3cold) +- pci_bridge_wait_for_secondary_bus(pci_dev); ++ pci_bridge_wait_for_secondary_bus(pci_dev, "resume", PCI_RESET_WAIT); + + if (pm && pm->runtime_resume) + error = pm->runtime_resume(dev); +--- a/drivers/pci/pci.c ++++ b/drivers/pci/pci.c +@@ -1221,7 +1221,7 @@ static int pci_dev_wait(struct pci_dev * + return -ENOTTY; + } + +- if (delay > 1000) ++ if (delay > PCI_RESET_WAIT) + pci_info(dev, "not ready %dms after %s; waiting\n", + delay - 1, reset_type); + +@@ -1230,7 +1230,7 @@ static int pci_dev_wait(struct pci_dev * + pci_read_config_dword(dev, PCI_COMMAND, &id); + } + +- if (delay > 1000) ++ if (delay > PCI_RESET_WAIT) + pci_info(dev, "ready %dms after %s\n", delay - 1, + reset_type); + +@@ -4792,24 +4792,31 @@ static int pci_bus_max_d3cold_delay(cons + /** + * pci_bridge_wait_for_secondary_bus - Wait 
for secondary bus to be accessible + * @dev: PCI bridge ++ * @reset_type: reset type in human-readable form ++ * @timeout: maximum time to wait for devices on secondary bus (milliseconds) + * + * Handle necessary delays before access to the devices on the secondary +- * side of the bridge are permitted after D3cold to D0 transition. ++ * side of the bridge are permitted after D3cold to D0 transition ++ * or Conventional Reset. + * + * For PCIe this means the delays in PCIe 5.0 section 6.6.1. For + * conventional PCI it means Tpvrh + Trhfa specified in PCI 3.0 section + * 4.3.2. ++ * ++ * Return 0 on success or -ENOTTY if the first device on the secondary bus ++ * failed to become accessible. + */ +-void pci_bridge_wait_for_secondary_bus(struct pci_dev *dev) ++int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type, ++ int timeout) + { + struct pci_dev *child; + int delay; + + if (pci_dev_is_disconnected(dev)) +- return; ++ return 0; + + if (!pci_is_bridge(dev)) +- return; ++ return 0; + + down_read(&pci_bus_sem); + +@@ -4821,14 +4828,14 @@ void pci_bridge_wait_for_secondary_bus(s + */ + if (!dev->subordinate || list_empty(&dev->subordinate->devices)) { + up_read(&pci_bus_sem); +- return; ++ return 0; + } + + /* Take d3cold_delay requirements into account */ + delay = pci_bus_max_d3cold_delay(dev->subordinate); + if (!delay) { + up_read(&pci_bus_sem); +- return; ++ return 0; + } + + child = list_first_entry(&dev->subordinate->devices, struct pci_dev, +@@ -4837,14 +4844,12 @@ void pci_bridge_wait_for_secondary_bus(s + + /* + * Conventional PCI and PCI-X we need to wait Tpvrh + Trhfa before +- * accessing the device after reset (that is 1000 ms + 100 ms). In +- * practice this should not be needed because we don't do power +- * management for them (see pci_bridge_d3_possible()). ++ * accessing the device after reset (that is 1000 ms + 100 ms). 
+ */ + if (!pci_is_pcie(dev)) { + pci_dbg(dev, "waiting %d ms for secondary bus\n", 1000 + delay); + msleep(1000 + delay); +- return; ++ return 0; + } + + /* +@@ -4861,11 +4866,11 @@ void pci_bridge_wait_for_secondary_bus(s + * configuration requests if we only wait for 100 ms (see + * https://bugzilla.kernel.org/show_bug.cgi?id=203885). + * +- * Therefore we wait for 100 ms and check for the device presence. +- * If it is still not present give it an additional 100 ms. ++ * Therefore we wait for 100 ms and check for the device presence ++ * until the timeout expires. + */ + if (!pcie_downstream_port(dev)) +- return; ++ return 0; + + if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) { + pci_dbg(dev, "waiting %d ms for downstream link\n", delay); +@@ -4876,14 +4881,11 @@ void pci_bridge_wait_for_secondary_bus(s + if (!pcie_wait_for_link_delay(dev, true, delay)) { + /* Did not train, no need to wait any further */ + pci_info(dev, "Data Link Layer Link Active not set in 1000 msec\n"); +- return; ++ return -ENOTTY; + } + } + +- if (!pci_device_is_present(child)) { +- pci_dbg(child, "waiting additional %d ms to become accessible\n", delay); +- msleep(delay); +- } ++ return pci_dev_wait(child, reset_type, timeout - delay); + } + + void pci_reset_secondary_bus(struct pci_dev *dev) +@@ -4902,15 +4904,6 @@ void pci_reset_secondary_bus(struct pci_ + + ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET; + pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl); +- +- /* +- * Trhfa for conventional PCI is 2^25 clock cycles. +- * Assuming a minimum 33MHz clock this results in a 1s +- * delay before we can consider subordinate devices to +- * be re-initialized. PCIe has some ways to shorten this, +- * but we don't make use of them yet. 
+- */ +- ssleep(1); + } + + void __weak pcibios_reset_secondary_bus(struct pci_dev *dev) +@@ -4929,7 +4922,8 @@ int pci_bridge_secondary_bus_reset(struc + { + pcibios_reset_secondary_bus(dev); + +- return pci_dev_wait(dev, "bus reset", PCIE_RESET_READY_POLL_MS); ++ return pci_bridge_wait_for_secondary_bus(dev, "bus reset", ++ PCIE_RESET_READY_POLL_MS); + } + EXPORT_SYMBOL_GPL(pci_bridge_secondary_bus_reset); + +--- a/drivers/pci/pci.h ++++ b/drivers/pci/pci.h +@@ -47,6 +47,13 @@ int pci_bus_error_reset(struct pci_dev * + #define PCI_PM_D3HOT_WAIT 10 /* msec */ + #define PCI_PM_D3COLD_WAIT 100 /* msec */ + ++/* ++ * Following exit from Conventional Reset, devices must be ready within 1 sec ++ * (PCIe r6.0 sec 6.6.1). A D3cold to D0 transition implies a Conventional ++ * Reset (PCIe r6.0 sec 5.8). ++ */ ++#define PCI_RESET_WAIT 1000 /* msec */ ++ + /** + * struct pci_platform_pm_ops - Firmware PM callbacks + * +@@ -108,7 +115,8 @@ void pci_allocate_cap_save_buffers(struc + void pci_free_cap_save_buffers(struct pci_dev *dev); + bool pci_bridge_d3_possible(struct pci_dev *dev); + void pci_bridge_d3_update(struct pci_dev *dev); +-void pci_bridge_wait_for_secondary_bus(struct pci_dev *dev); ++int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type, ++ int timeout); + + static inline void pci_wakeup_event(struct pci_dev *dev) + { diff --git a/queue-5.10/s390-ipl-add-missing-intersection-check-to-ipl_report-handling.patch b/queue-5.10/s390-ipl-add-missing-intersection-check-to-ipl_report-handling.patch new file mode 100644 index 00000000000..f61aa08b6aa --- /dev/null +++ b/queue-5.10/s390-ipl-add-missing-intersection-check-to-ipl_report-handling.patch @@ -0,0 +1,51 @@ +From a52e5cdbe8016d4e3e6322fd93d71afddb9a5af9 Mon Sep 17 00:00:00 2001 +From: Sven Schnelle +Date: Tue, 7 Mar 2023 14:35:23 +0100 +Subject: s390/ipl: add missing intersection check to ipl_report handling + +From: Sven Schnelle + +commit a52e5cdbe8016d4e3e6322fd93d71afddb9a5af9 upstream. 
+ +The code which handles the ipl report is searching for a free location +in memory where it could copy the component and certificate entries to. +It checks for intersection between the sections required for the kernel +and the component/certificate data area, but fails to check whether +the data structures linking these data areas together intersect. + +This might cause the iplreport copy code to overwrite the iplreport +itself. Fix this by adding two additional intersection checks. + +Cc: +Fixes: 9641b8cc733f ("s390/ipl: read IPL report at early boot") +Signed-off-by: Sven Schnelle +Reviewed-by: Vasily Gorbik +Signed-off-by: Vasily Gorbik +Signed-off-by: Sven Schnelle +Signed-off-by: Greg Kroah-Hartman +--- + arch/s390/boot/ipl_report.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/arch/s390/boot/ipl_report.c ++++ b/arch/s390/boot/ipl_report.c +@@ -57,11 +57,19 @@ repeat: + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE && + intersects(INITRD_START, INITRD_SIZE, safe_addr, size)) + safe_addr = INITRD_START + INITRD_SIZE; ++ if (intersects(safe_addr, size, (unsigned long)comps, comps->len)) { ++ safe_addr = (unsigned long)comps + comps->len; ++ goto repeat; ++ } + for_each_rb_entry(comp, comps) + if (intersects(safe_addr, size, comp->addr, comp->len)) { + safe_addr = comp->addr + comp->len; + goto repeat; + } ++ if (intersects(safe_addr, size, (unsigned long)certs, certs->len)) { ++ safe_addr = (unsigned long)certs + certs->len; ++ goto repeat; ++ } + for_each_rb_entry(cert, certs) + if (intersects(safe_addr, size, cert->addr, cert->len)) { + safe_addr = cert->addr + cert->len; diff --git a/queue-5.10/series b/queue-5.10/series index 96440354924..9be4a6d3067 100644 --- a/queue-5.10/series +++ b/queue-5.10/series @@ -74,3 +74,24 @@ fbdev-stifb-provide-valid-pixelclock-and-add-fb_check_var-checks.patch cpuidle-psci-iterate-backwards-over-list-in-psci_pd_remove.patch x86-mce-make-sure-logged-mces-are-processed-after-sysfs-update.patch
x86-mm-fix-use-of-uninitialized-buffer-in-sme_enable.patch +drm-i915-don-t-use-stolen-memory-for-ring-buffers-with-llc.patch +drm-i915-active-fix-misuse-of-non-idle-barriers-as-fence-trackers.patch +io_uring-avoid-null-ptr-deref-in-io_arm_poll_handler.patch +s390-ipl-add-missing-intersection-check-to-ipl_report-handling.patch +pci-unify-delay-handling-for-reset-and-resume.patch +pci-dpc-await-readiness-of-secondary-bus-after-reset.patch +xfs-don-t-assert-fail-on-perag-references-on-teardown.patch +xfs-purge-dquots-after-inode-walk-fails-during-quotacheck.patch +xfs-don-t-leak-btree-cursor-when-insrec-fails-after-a-split.patch +xfs-remove-xfs_prealloc_sync.patch +xfs-fallocate-should-call-file_modified.patch +xfs-set-prealloc-flag-in-xfs_alloc_file_space.patch +xfs-use-setattr_copy-to-set-vfs-inode-attributes.patch +fs-add-mode_strip_sgid-helper.patch +fs-move-s_isgid-stripping-into-the-vfs_-helpers.patch +attr-add-in_group_or_capable.patch +fs-move-should_remove_suid.patch +attr-add-setattr_should_drop_sgid.patch +attr-use-consistent-sgid-stripping-checks.patch +fs-use-consistent-setgid-checks-in-is_sxid.patch +xfs-remove-xfs_setattr_time-declaration.patch diff --git a/queue-5.10/xfs-don-t-assert-fail-on-perag-references-on-teardown.patch b/queue-5.10/xfs-don-t-assert-fail-on-perag-references-on-teardown.patch new file mode 100644 index 00000000000..50a90ef8e63 --- /dev/null +++ b/queue-5.10/xfs-don-t-assert-fail-on-perag-references-on-teardown.patch @@ -0,0 +1,52 @@ +From stable-owner@vger.kernel.org Sat Mar 18 11:15:45 2023 +From: Amir Goldstein +Date: Sat, 18 Mar 2023 12:15:15 +0200 +Subject: xfs: don't assert fail on perag references on teardown +To: Greg Kroah-Hartman +Cc: Sasha Levin , "Darrick J . 
Wong" , Leah Rumancik , Chandan Babu R , Christian Brauner , linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Dave Chinner , Christoph Hellwig , Dave Chinner +Message-ID: <20230318101529.1361673-2-amir73il@gmail.com> + +From: Dave Chinner + +commit 5b55cbc2d72632e874e50d2e36bce608e55aaaea upstream. + +[backport for 5.10.y, prior to perag refactoring in v5.14] + +Not fatal, the assert is there to catch developer attention. I'm +seeing this occasionally during recoveryloop testing after a +shutdown, and I don't want this to stop an overnight recoveryloop +run as it is currently doing. + +Convert the ASSERT to a XFS_IS_CORRUPT() check so it will dump a +corruption report into the log and cause a test failure that way, +but it won't stop the machine dead. + +Signed-off-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Dave Chinner +Signed-off-by: Amir Goldstein +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_mount.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/fs/xfs/xfs_mount.c ++++ b/fs/xfs/xfs_mount.c +@@ -126,7 +126,6 @@ __xfs_free_perag( + { + struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); + +- ASSERT(atomic_read(&pag->pag_ref) == 0); + kmem_free(pag); + } + +@@ -145,7 +144,7 @@ xfs_free_perag( + pag = radix_tree_delete(&mp->m_perag_tree, agno); + spin_unlock(&mp->m_perag_lock); + ASSERT(pag); +- ASSERT(atomic_read(&pag->pag_ref) == 0); ++ XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0); + xfs_iunlink_destroy(pag); + xfs_buf_hash_destroy(pag); + call_rcu(&pag->rcu_head, __xfs_free_perag); diff --git a/queue-5.10/xfs-don-t-leak-btree-cursor-when-insrec-fails-after-a-split.patch b/queue-5.10/xfs-don-t-leak-btree-cursor-when-insrec-fails-after-a-split.patch new file mode 100644 index 00000000000..9169c89ce44 --- /dev/null +++ b/queue-5.10/xfs-don-t-leak-btree-cursor-when-insrec-fails-after-a-split.patch @@ -0,0 +1,93 @@ 
+From stable-owner@vger.kernel.org Sat Mar 18 11:15:49 2023 +From: Amir Goldstein +Date: Sat, 18 Mar 2023 12:15:17 +0200 +Subject: xfs: don't leak btree cursor when insrec fails after a split +To: Greg Kroah-Hartman +Cc: Sasha Levin , "Darrick J . Wong" , Leah Rumancik , Chandan Babu R , Christian Brauner , linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Christoph Hellwig , Dave Chinner , Dave Chinner +Message-ID: <20230318101529.1361673-4-amir73il@gmail.com> + +From: "Darrick J. Wong" + +commit a54f78def73d847cb060b18c4e4a3d1d26c9ca6d upstream. + +The recent patch to improve btree cycle checking caused a regression +when I rebased the in-memory btree branch atop the 5.19 for-next branch, +because in-memory short-pointer btrees do not have AG numbers. This +produced the following complaint from kmemleak: + +unreferenced object 0xffff88803d47dde8 (size 264): + comm "xfs_io", pid 4889, jiffies 4294906764 (age 24.072s) + hex dump (first 32 bytes): + 90 4d 0b 0f 80 88 ff ff 00 a0 bd 05 80 88 ff ff .M.............. + e0 44 3a a0 ff ff ff ff 00 df 08 06 80 88 ff ff .D:............. 
+ backtrace: + [] xfbtree_dup_cursor+0x49/0xc0 [xfs] + [] xfs_btree_dup_cursor+0x3b/0x200 [xfs] + [] __xfs_btree_split+0x6ad/0x820 [xfs] + [] xfs_btree_split+0x60/0x110 [xfs] + [] xfs_btree_make_block_unfull+0x19a/0x1f0 [xfs] + [] xfs_btree_insrec+0x3aa/0x810 [xfs] + [] xfs_btree_insert+0xb3/0x240 [xfs] + [] xfs_rmap_insert+0x99/0x200 [xfs] + [] xfs_rmap_map_shared+0x192/0x5f0 [xfs] + [] xfs_rmap_map_raw+0x6b/0x90 [xfs] + [] xrep_rmap_stash+0xd5/0x1d0 [xfs] + [] xrep_rmap_visit_bmbt+0xa0/0xf0 [xfs] + [] xrep_rmap_scan_iext+0x56/0xa0 [xfs] + [] xrep_rmap_scan_ifork+0xd8/0x160 [xfs] + [] xrep_rmap_scan_inode+0x35/0x80 [xfs] + [] xrep_rmap_find_rmaps+0x10e/0x270 [xfs] + +I noticed that xfs_btree_insrec has a bunch of debug code that return +out of the function immediately, without freeing the "new" btree cursor +that can be returned when _make_block_unfull calls xfs_btree_split. Fix +the error return in this function to free the btree cursor. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Reviewed-by: Dave Chinner +Signed-off-by: Dave Chinner +Signed-off-by: Amir Goldstein +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_btree.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +--- a/fs/xfs/libxfs/xfs_btree.c ++++ b/fs/xfs/libxfs/xfs_btree.c +@@ -3190,7 +3190,7 @@ xfs_btree_insrec( + struct xfs_btree_block *block; /* btree block */ + struct xfs_buf *bp; /* buffer for block */ + union xfs_btree_ptr nptr; /* new block ptr */ +- struct xfs_btree_cur *ncur; /* new btree cursor */ ++ struct xfs_btree_cur *ncur = NULL; /* new btree cursor */ + union xfs_btree_key nkey; /* new block key */ + union xfs_btree_key *lkey; + int optr; /* old key/record index */ +@@ -3270,7 +3270,7 @@ xfs_btree_insrec( + #ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) +- return error; ++ goto error0; + #endif + + /* +@@ -3290,7 +3290,7 @@ xfs_btree_insrec( + for (i = numrecs - ptr; i >= 0; i--) { + error = 
xfs_btree_debug_check_ptr(cur, pp, i, level); + if (error) +- return error; ++ goto error0; + } + + xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); +@@ -3375,6 +3375,8 @@ xfs_btree_insrec( + return 0; + + error0: ++ if (ncur) ++ xfs_btree_del_cursor(ncur, error); + return error; + } + diff --git a/queue-5.10/xfs-fallocate-should-call-file_modified.patch b/queue-5.10/xfs-fallocate-should-call-file_modified.patch new file mode 100644 index 00000000000..319364fd972 --- /dev/null +++ b/queue-5.10/xfs-fallocate-should-call-file_modified.patch @@ -0,0 +1,67 @@ +From stable-owner@vger.kernel.org Sat Mar 18 11:15:51 2023 +From: Amir Goldstein +Date: Sat, 18 Mar 2023 12:15:19 +0200 +Subject: xfs: fallocate() should call file_modified() +To: Greg Kroah-Hartman +Cc: Sasha Levin , "Darrick J . Wong" , Leah Rumancik , Chandan Babu R , Christian Brauner , linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Dave Chinner +Message-ID: <20230318101529.1361673-6-amir73il@gmail.com> + +From: Dave Chinner + +commit fbe7e520036583a783b13ff9744e35c2a329d9a4 upstream. + +In XFS, we always update the inode change and modification time when +any fallocate() operation succeeds. Furthermore, as various +fallocate modes can change the file contents (extending EOF, +punching holes, zeroing things, shifting extents), we should drop +file privileges like suid just like we do for a regular write(). +There's already a VFS helper that figures all this out for us, so +use that. + +The net effect of this is that we no longer drop suid/sgid if the +caller is root, but we also now drop file capabilities. + +We also move the xfs_update_prealloc_flags() function so that it now +is only called by the scope that needs to set the prealloc flag. + +Based on a patch from Darrick Wong. + +Signed-off-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Amir Goldstein +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_file.c | 13 +++++++++---- + 1 file changed, 9 insertions(+), 4 deletions(-) + +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -895,6 +895,10 @@ xfs_file_fallocate( + goto out_unlock; + } + ++ error = file_modified(file); ++ if (error) ++ goto out_unlock; ++ + if (mode & FALLOC_FL_PUNCH_HOLE) { + error = xfs_free_file_space(ip, offset, len); + if (error) +@@ -996,11 +1000,12 @@ xfs_file_fallocate( + if (error) + goto out_unlock; + } +- } + +- error = xfs_update_prealloc_flags(ip, flags); +- if (error) +- goto out_unlock; ++ error = xfs_update_prealloc_flags(ip, XFS_PREALLOC_SET); ++ if (error) ++ goto out_unlock; ++ ++ } + + /* Change file size if needed */ + if (new_size) { diff --git a/queue-5.10/xfs-purge-dquots-after-inode-walk-fails-during-quotacheck.patch b/queue-5.10/xfs-purge-dquots-after-inode-walk-fails-during-quotacheck.patch new file mode 100644 index 00000000000..d344452ded2 --- /dev/null +++ b/queue-5.10/xfs-purge-dquots-after-inode-walk-fails-during-quotacheck.patch @@ -0,0 +1,123 @@ +From stable-owner@vger.kernel.org Sat Mar 18 11:15:45 2023 +From: Amir Goldstein +Date: Sat, 18 Mar 2023 12:15:16 +0200 +Subject: xfs: purge dquots after inode walk fails during quotacheck +To: Greg Kroah-Hartman +Cc: Sasha Levin , "Darrick J . Wong" , Leah Rumancik , Chandan Babu R , Christian Brauner , linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Christoph Hellwig , Dave Chinner , Dave Chinner +Message-ID: <20230318101529.1361673-3-amir73il@gmail.com> + +From: "Darrick J. Wong" + +commit 86d40f1e49e9a909d25c35ba01bea80dbcd758cb upstream. + +[add XFS_QMOPT_QUOTALL flag to xfs_qm_dqpurge_all() for 5.10.y backport] + +xfs/434 and xfs/436 have been reporting occasional memory leaks of +xfs_dquot objects. 
These tests themselves were the messenger, not the +culprit, since they unload the xfs module, which trips the slub +debugging code while tearing down all the xfs slab caches: + +============================================================================= +BUG xfs_dquot (Tainted: G W ): Objects remaining in xfs_dquot on __kmem_cache_shutdown() +----------------------------------------------------------------------------- + +Slab 0xffffea000606de00 objects=30 used=5 fp=0xffff888181b78a78 flags=0x17ff80000010200(slab|head|node=0|zone=2|lastcpupid=0xfff) +CPU: 0 PID: 3953166 Comm: modprobe Tainted: G W 5.18.0-rc6-djwx #rc6 d5824be9e46a2393677bda868f9b154d917ca6a7 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20171121_152543-x86-ol7-builder-01.us.oracle.com-4.el7.1 04/01/2014 + +Since we don't generally rmmod the xfs module between fstests, this +means that xfs/434 is really just the canary in the coal mine -- +something leaked a dquot, but we don't know who. After days of pounding +on fstests with kmemleak enabled, I finally got it to spit this out: + +unreferenced object 0xffff8880465654c0 (size 536): + comm "u10:4", pid 88, jiffies 4294935810 (age 29.512s) + hex dump (first 32 bytes): + 60 4a 56 46 80 88 ff ff 58 ea e4 5c 80 88 ff ff `JVF....X..\.... + 00 e0 52 49 80 88 ff ff 01 00 01 00 00 00 00 00 ..RI............ 
+ backtrace: + [] xfs_dquot_alloc+0x2c/0x530 [xfs] + [] xfs_qm_dqread+0x6f/0x330 [xfs] + [] xfs_qm_dqget+0x132/0x4e0 [xfs] + [] xfs_qm_quotacheck_dqadjust+0xa0/0x3e0 [xfs] + [] xfs_qm_dqusage_adjust+0x35d/0x4f0 [xfs] + [] xfs_iwalk_ag_recs+0x348/0x5d0 [xfs] + [] xfs_iwalk_run_callbacks+0x273/0x540 [xfs] + [] xfs_iwalk_ag+0x5ed/0x890 [xfs] + [] xfs_iwalk_ag_work+0xff/0x170 [xfs] + [] xfs_pwork_work+0x79/0x130 [xfs] + [] process_one_work+0x672/0x1040 + [] worker_thread+0x59b/0xec0 + [] kthread+0x29e/0x340 + [] ret_from_fork+0x1f/0x30 + +Now we know that quotacheck is at fault, but even this report was +canaryish -- it was triggered by xfs/494, which doesn't actually mount +any filesystems. (kmemleak can be a little slow to notice leaks, even +with fstests repeatedly whacking it to look for them.) Looking at the +*previous* fstest, however, showed that the test run before xfs/494 was +xfs/117. The tipoff to the problem is in this excerpt from dmesg: + +XFS (sda4): Quotacheck needed: Please wait. +XFS (sda4): Metadata corruption detected at xfs_dinode_verify.part.0+0xdb/0x7b0 [xfs], inode 0x119 dinode +XFS (sda4): Unmount and run xfs_repair +XFS (sda4): First 128 bytes of corrupted metadata buffer: +00000000: 49 4e 81 a4 03 02 00 00 00 00 00 00 00 00 00 00 IN.............. +00000010: 00 00 00 01 00 00 00 00 00 90 57 54 54 1a 4c 68 ..........WTT.Lh +00000020: 81 f9 7d e1 6d ee 16 00 34 bd 7d e1 6d ee 16 00 ..}.m...4.}.m... +00000030: 34 bd 7d e1 6d ee 16 00 00 00 00 00 00 00 00 00 4.}.m........... +00000040: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +00000050: 00 00 00 02 00 00 00 00 00 00 00 00 96 80 f3 ab ................ +00000060: ff ff ff ff da 57 7b 11 00 00 00 00 00 00 00 03 .....W{......... +00000070: 00 00 00 01 00 00 00 10 00 00 00 00 00 00 00 08 ................ +XFS (sda4): Quotacheck: Unsuccessful (Error -117): Disabling quotas. + +The dinode verifier decided that the inode was corrupt, which causes +iget to return with EFSCORRUPTED. 
Since this happened during +quotacheck, it is obvious that the kernel aborted the inode walk on +account of the corruption error and disabled quotas. Unfortunately, we +neglect to purge the dquot cache before doing that, which is how the +dquots leaked. + +The problems started 10 years ago in commit b84a3a, when the dquot lists +were converted to a radix tree, but the error handling behavior was not +correctly preserved -- in that commit, if the bulkstat failed and +usrquota was enabled, the bulkstat failure code would be overwritten by +the result of flushing all the dquots to disk. As long as that +succeeds, we'd continue the quota mount as if everything were ok, but +instead we're now operating with a corrupt inode and incorrect quota +usage counts. I didn't notice this bug in 2019 when I wrote commit +ebd126a, which changed quotacheck to skip the dqflush when the scan +doesn't complete due to inode walk failures. + +Introduced-by: b84a3a96751f ("xfs: remove the per-filesystem list of dquots") +Fixes: ebd126a651f8 ("xfs: convert quotacheck to use the new iwalk functions") +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Reviewed-by: Dave Chinner +Signed-off-by: Dave Chinner +Signed-off-by: Amir Goldstein +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_qm.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +--- a/fs/xfs/xfs_qm.c ++++ b/fs/xfs/xfs_qm.c +@@ -1318,8 +1318,15 @@ xfs_qm_quotacheck( + + error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true, + NULL); +- if (error) ++ if (error) { ++ /* ++ * The inode walk may have partially populated the dquot ++ * caches. We must purge them before disabling quota and ++ * tearing down the quotainfo, or else the dquots will leak. ++ */ ++ xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); + goto error_return; ++ } + + /* + * We've made all the changes that we need to make incore. 
Flush them diff --git a/queue-5.10/xfs-remove-xfs_prealloc_sync.patch b/queue-5.10/xfs-remove-xfs_prealloc_sync.patch new file mode 100644 index 00000000000..2ea7b9a2a27 --- /dev/null +++ b/queue-5.10/xfs-remove-xfs_prealloc_sync.patch @@ -0,0 +1,83 @@ +From stable-owner@vger.kernel.org Sat Mar 18 11:15:50 2023 +From: Amir Goldstein +Date: Sat, 18 Mar 2023 12:15:18 +0200 +Subject: xfs: remove XFS_PREALLOC_SYNC +To: Greg Kroah-Hartman +Cc: Sasha Levin , "Darrick J . Wong" , Leah Rumancik , Chandan Babu R , Christian Brauner , linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Dave Chinner +Message-ID: <20230318101529.1361673-5-amir73il@gmail.com> + +From: Dave Chinner + +commit 472c6e46f589c26057596dcba160712a5b3e02c5 upstream. + +[partial backport for dependency - + xfs_ioc_space() still uses XFS_PREALLOC_SYNC] + +Callers can achieve the same thing by calling xfs_log_force_inode() +after making their modifications. There is no need for +xfs_update_prealloc_flags() to do this. + +Signed-off-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Amir Goldstein +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_file.c | 13 +++++++------ + fs/xfs/xfs_pnfs.c | 6 ++++-- + 2 files changed, 11 insertions(+), 8 deletions(-) + +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -94,8 +94,6 @@ xfs_update_prealloc_flags( + ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +- if (flags & XFS_PREALLOC_SYNC) +- xfs_trans_set_sync(tp); + return xfs_trans_commit(tp); + } + +@@ -1000,9 +998,6 @@ xfs_file_fallocate( + } + } + +- if (file->f_flags & O_DSYNC) +- flags |= XFS_PREALLOC_SYNC; +- + error = xfs_update_prealloc_flags(ip, flags); + if (error) + goto out_unlock; +@@ -1024,8 +1019,14 @@ xfs_file_fallocate( + * leave shifted extents past EOF and hence losing access to + * the data that is contained within them. 
+ */ +- if (do_file_insert) ++ if (do_file_insert) { + error = xfs_insert_file_space(ip, offset, len); ++ if (error) ++ goto out_unlock; ++ } ++ ++ if (file->f_flags & O_DSYNC) ++ error = xfs_log_force_inode(ip); + + out_unlock: + xfs_iunlock(ip, iolock); +--- a/fs/xfs/xfs_pnfs.c ++++ b/fs/xfs/xfs_pnfs.c +@@ -164,10 +164,12 @@ xfs_fs_map_blocks( + * that the blocks allocated and handed out to the client are + * guaranteed to be present even after a server crash. + */ +- error = xfs_update_prealloc_flags(ip, +- XFS_PREALLOC_SET | XFS_PREALLOC_SYNC); ++ error = xfs_update_prealloc_flags(ip, XFS_PREALLOC_SET); ++ if (!error) ++ error = xfs_log_force_inode(ip); + if (error) + goto out_unlock; ++ + } else { + xfs_iunlock(ip, lock_flags); + } diff --git a/queue-5.10/xfs-remove-xfs_setattr_time-declaration.patch b/queue-5.10/xfs-remove-xfs_setattr_time-declaration.patch new file mode 100644 index 00000000000..fa4c3a8a296 --- /dev/null +++ b/queue-5.10/xfs-remove-xfs_setattr_time-declaration.patch @@ -0,0 +1,35 @@ +From stable-owner@vger.kernel.org Sat Mar 18 11:16:32 2023 +From: Amir Goldstein +Date: Sat, 18 Mar 2023 12:15:29 +0200 +Subject: xfs: remove xfs_setattr_time() declaration +To: Greg Kroah-Hartman +Cc: Sasha Levin , "Darrick J . Wong" , Leah Rumancik , Chandan Babu R , Christian Brauner , linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Gaosheng Cui , Carlos Maiolino , Dave Chinner +Message-ID: <20230318101529.1361673-16-amir73il@gmail.com> + +From: Gaosheng Cui + +commit b0463b9dd7030a766133ad2f1571f97f204d7bdf upstream. + +xfs_setattr_time() has been removed since +commit e014f37db1a2 ("xfs: use setattr_copy to set vfs inode +attributes"), so remove it. 
+ +Signed-off-by: Gaosheng Cui +Reviewed-by: Carlos Maiolino +Signed-off-by: Dave Chinner +Signed-off-by: Amir Goldstein +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_iops.h | 1 - + 1 file changed, 1 deletion(-) + +--- a/fs/xfs/xfs_iops.h ++++ b/fs/xfs/xfs_iops.h +@@ -18,7 +18,6 @@ extern ssize_t xfs_vn_listxattr(struct d + */ + #define XFS_ATTR_NOACL 0x01 /* Don't call posix_acl_chmod */ + +-extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr); + extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, + int flags); + extern int xfs_vn_setattr_nonsize(struct dentry *dentry, struct iattr *vap); diff --git a/queue-5.10/xfs-set-prealloc-flag-in-xfs_alloc_file_space.patch b/queue-5.10/xfs-set-prealloc-flag-in-xfs_alloc_file_space.patch new file mode 100644 index 00000000000..d407871e3ac --- /dev/null +++ b/queue-5.10/xfs-set-prealloc-flag-in-xfs_alloc_file_space.patch @@ -0,0 +1,91 @@ +From stable-owner@vger.kernel.org Sat Mar 18 11:16:01 2023 +From: Amir Goldstein +Date: Sat, 18 Mar 2023 12:15:20 +0200 +Subject: xfs: set prealloc flag in xfs_alloc_file_space() +To: Greg Kroah-Hartman +Cc: Sasha Levin , "Darrick J . Wong" , Leah Rumancik , Chandan Babu R , Christian Brauner , linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Dave Chinner +Message-ID: <20230318101529.1361673-7-amir73il@gmail.com> + +From: Dave Chinner + +commit 0b02c8c0d75a738c98c35f02efb36217c170d78c upstream. + +[backport for 5.10.y] + +Now that we only call xfs_update_prealloc_flags() from +xfs_file_fallocate() in the case where we need to set the +preallocation flag, do this in xfs_alloc_file_space() where we +already have the inode joined into a transaction and get +rid of the call to xfs_update_prealloc_flags() from the fallocate +code. + +This also means that we now correctly avoid setting the +XFS_DIFLAG_PREALLOC flag when xfs_is_always_cow_inode() is true, as +these inodes will never have preallocated extents. 
+ +Signed-off-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Amir Goldstein +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_bmap_util.c | 9 +++------ + fs/xfs/xfs_file.c | 8 -------- + 2 files changed, 3 insertions(+), 14 deletions(-) + +--- a/fs/xfs/xfs_bmap_util.c ++++ b/fs/xfs/xfs_bmap_util.c +@@ -800,9 +800,6 @@ xfs_alloc_file_space( + quota_flag = XFS_QMOPT_RES_REGBLKS; + } + +- /* +- * Allocate and setup the transaction. +- */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, + resrtextents, 0, &tp); + +@@ -830,9 +827,9 @@ xfs_alloc_file_space( + if (error) + goto error0; + +- /* +- * Complete the transaction +- */ ++ ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; ++ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); ++ + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -850,7 +850,6 @@ xfs_file_fallocate( + struct inode *inode = file_inode(file); + struct xfs_inode *ip = XFS_I(inode); + long error; +- enum xfs_prealloc_flags flags = 0; + uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + loff_t new_size = 0; + bool do_file_insert = false; +@@ -948,8 +947,6 @@ xfs_file_fallocate( + } + do_file_insert = true; + } else { +- flags |= XFS_PREALLOC_SET; +- + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; +@@ -1000,11 +997,6 @@ xfs_file_fallocate( + if (error) + goto out_unlock; + } +- +- error = xfs_update_prealloc_flags(ip, XFS_PREALLOC_SET); +- if (error) +- goto out_unlock; +- + } + + /* Change file size if needed */ diff --git a/queue-5.10/xfs-use-setattr_copy-to-set-vfs-inode-attributes.patch b/queue-5.10/xfs-use-setattr_copy-to-set-vfs-inode-attributes.patch new file mode 100644 index 00000000000..ec5dc8abf8a --- /dev/null +++ b/queue-5.10/xfs-use-setattr_copy-to-set-vfs-inode-attributes.patch @@ -0,0 +1,157 @@ +From stable-owner@vger.kernel.org Sat Mar 18 11:16:02 2023 
+From: Amir Goldstein +Date: Sat, 18 Mar 2023 12:15:21 +0200 +Subject: xfs: use setattr_copy to set vfs inode attributes +To: Greg Kroah-Hartman +Cc: Sasha Levin , "Darrick J . Wong" , Leah Rumancik , Chandan Babu R , Christian Brauner , linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Dave Chinner , Christoph Hellwig +Message-ID: <20230318101529.1361673-8-amir73il@gmail.com> + +From: "Darrick J. Wong" + +commit e014f37db1a2d109afa750042ac4d69cf3e3d88e upstream. + +[remove userns argument of setattr_copy() for 5.10.y backport] + +Filipe Manana pointed out that XFS' behavior w.r.t. setuid/setgid +revocation isn't consistent with btrfs[1] or ext4. Those two +filesystems use the VFS function setattr_copy to convey certain +attributes from struct iattr into the VFS inode structure. + +Andrey Zhadchenko reported[2] that XFS uses the wrong user namespace to +decide if it should clear setgid and setuid on a file attribute update. +This is a second symptom of the problem that Filipe noticed. + +XFS, on the other hand, open-codes setattr_copy in xfs_setattr_mode, +xfs_setattr_nonsize, and xfs_setattr_time. Regrettably, setattr_copy is +/not/ a simple copy function; it contains additional logic to clear the +setgid bit when setting the mode, and XFS' version no longer matches. + +The VFS implements its own setuid/setgid stripping logic, which +establishes consistent behavior. It's a tad unfortunate that it's +scattered across notify_change, should_remove_suid, and setattr_copy but +XFS should really follow the Linux VFS. Adapt XFS to use the VFS +functions and get rid of the old functions. + +[1] https://lore.kernel.org/fstests/CAL3q7H47iNQ=Wmk83WcGB-KBJVOEtR9+qGczzCeXJ9Y2KCV25Q@mail.gmail.com/ +[2] https://lore.kernel.org/linux-xfs/20220221182218.748084-1-andrey.zhadchenko@virtuozzo.com/ + +Fixes: 7fa294c8991c ("userns: Allow chown and setgid preservation") +Signed-off-by: Darrick J. 
Wong +Reviewed-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Reviewed-by: Christian Brauner +Signed-off-by: Amir Goldstein +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_iops.c | 56 ++---------------------------------------------------- + fs/xfs/xfs_pnfs.c | 3 +- + 2 files changed, 5 insertions(+), 54 deletions(-) + +--- a/fs/xfs/xfs_iops.c ++++ b/fs/xfs/xfs_iops.c +@@ -595,37 +595,6 @@ xfs_vn_getattr( + return 0; + } + +-static void +-xfs_setattr_mode( +- struct xfs_inode *ip, +- struct iattr *iattr) +-{ +- struct inode *inode = VFS_I(ip); +- umode_t mode = iattr->ia_mode; +- +- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); +- +- inode->i_mode &= S_IFMT; +- inode->i_mode |= mode & ~S_IFMT; +-} +- +-void +-xfs_setattr_time( +- struct xfs_inode *ip, +- struct iattr *iattr) +-{ +- struct inode *inode = VFS_I(ip); +- +- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); +- +- if (iattr->ia_valid & ATTR_ATIME) +- inode->i_atime = iattr->ia_atime; +- if (iattr->ia_valid & ATTR_CTIME) +- inode->i_ctime = iattr->ia_ctime; +- if (iattr->ia_valid & ATTR_MTIME) +- inode->i_mtime = iattr->ia_mtime; +-} +- + static int + xfs_vn_change_ok( + struct dentry *dentry, +@@ -741,16 +710,6 @@ xfs_setattr_nonsize( + } + + /* +- * CAP_FSETID overrides the following restrictions: +- * +- * The set-user-ID and set-group-ID bits of a file will be +- * cleared upon successful return from chown() +- */ +- if ((inode->i_mode & (S_ISUID|S_ISGID)) && +- !capable(CAP_FSETID)) +- inode->i_mode &= ~(S_ISUID|S_ISGID); +- +- /* + * Change the ownerships and register quota modifications + * in the transaction. 
+ */ +@@ -761,7 +720,6 @@ xfs_setattr_nonsize( + olddquot1 = xfs_qm_vop_chown(tp, ip, + &ip->i_udquot, udqp); + } +- inode->i_uid = uid; + } + if (!gid_eq(igid, gid)) { + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { +@@ -772,15 +730,10 @@ xfs_setattr_nonsize( + olddquot2 = xfs_qm_vop_chown(tp, ip, + &ip->i_gdquot, gdqp); + } +- inode->i_gid = gid; + } + } + +- if (mask & ATTR_MODE) +- xfs_setattr_mode(ip, iattr); +- if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) +- xfs_setattr_time(ip, iattr); +- ++ setattr_copy(inode, iattr); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + XFS_STATS_INC(mp, xs_ig_attrchg); +@@ -1025,11 +978,8 @@ xfs_setattr_size( + xfs_inode_clear_eofblocks_tag(ip); + } + +- if (iattr->ia_valid & ATTR_MODE) +- xfs_setattr_mode(ip, iattr); +- if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) +- xfs_setattr_time(ip, iattr); +- ++ ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID))); ++ setattr_copy(inode, iattr); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + XFS_STATS_INC(mp, xs_ig_attrchg); +--- a/fs/xfs/xfs_pnfs.c ++++ b/fs/xfs/xfs_pnfs.c +@@ -285,7 +285,8 @@ xfs_fs_commit_blocks( + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + +- xfs_setattr_time(ip, iattr); ++ ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID))); ++ setattr_copy(inode, iattr); + if (update_isize) { + i_size_write(inode, iattr->ia_size); + ip->i_d.di_size = iattr->ia_size;