--- /dev/null
+From stable-owner@vger.kernel.org Sat Mar 18 11:16:15 2023
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Sat, 18 Mar 2023 12:15:24 +0200
+Subject: attr: add in_group_or_capable()
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, Christian Brauner <brauner@kernel.org>, linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org
+Message-ID: <20230318101529.1361673-11-amir73il@gmail.com>
+
+From: Amir Goldstein <amir73il@gmail.com>
+
+commit 11c2a8700cdcabf9b639b7204a1e38e2a0b6798e upstream.
+
+[backported to 5.10.y, prior to idmapped mounts]
+
+In setattr_{copy,prepare}() we need to perform the same permission
+checks to determine whether we need to drop the setgid bit or not.
+Instead of open-coding it twice add a simple helper that encapsulates the
+logic. We will reuse this helper to make dropping the setgid bit during
+write operations more consistent in a follow up patch.
+
+Reviewed-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/attr.c | 11 +++++------
+ fs/inode.c | 25 +++++++++++++++++++++----
+ fs/internal.h | 1 +
+ 3 files changed, 27 insertions(+), 10 deletions(-)
+
+--- a/fs/attr.c
++++ b/fs/attr.c
+@@ -18,6 +18,8 @@
+ #include <linux/evm.h>
+ #include <linux/ima.h>
+
++#include "internal.h"
++
+ static bool chown_ok(const struct inode *inode, kuid_t uid)
+ {
+ if (uid_eq(current_fsuid(), inode->i_uid) &&
+@@ -90,9 +92,8 @@ int setattr_prepare(struct dentry *dentr
+ if (!inode_owner_or_capable(inode))
+ return -EPERM;
+ /* Also check the setgid bit! */
+- if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
+- inode->i_gid) &&
+- !capable_wrt_inode_uidgid(inode, CAP_FSETID))
++ if (!in_group_or_capable(inode, (ia_valid & ATTR_GID) ?
++ attr->ia_gid : inode->i_gid))
+ attr->ia_mode &= ~S_ISGID;
+ }
+
+@@ -193,9 +194,7 @@ void setattr_copy(struct inode *inode, c
+ inode->i_ctime = attr->ia_ctime;
+ if (ia_valid & ATTR_MODE) {
+ umode_t mode = attr->ia_mode;
+-
+- if (!in_group_p(inode->i_gid) &&
+- !capable_wrt_inode_uidgid(inode, CAP_FSETID))
++ if (!in_group_or_capable(inode, inode->i_gid))
+ mode &= ~S_ISGID;
+ inode->i_mode = mode;
+ }
+--- a/fs/inode.c
++++ b/fs/inode.c
+@@ -2380,6 +2380,26 @@ int vfs_ioc_fssetxattr_check(struct inod
+ EXPORT_SYMBOL(vfs_ioc_fssetxattr_check);
+
+ /**
++ * in_group_or_capable - check whether caller is CAP_FSETID privileged
++ * @inode: inode to check
++ * @gid: the new/current gid of @inode
++ *
++ * Check wether @gid is in the caller's group list or if the caller is
++ * privileged with CAP_FSETID over @inode. This can be used to determine
++ * whether the setgid bit can be kept or must be dropped.
++ *
++ * Return: true if the caller is sufficiently privileged, false if not.
++ */
++bool in_group_or_capable(const struct inode *inode, kgid_t gid)
++{
++ if (in_group_p(gid))
++ return true;
++ if (capable_wrt_inode_uidgid(inode, CAP_FSETID))
++ return true;
++ return false;
++}
++
++/**
+ * mode_strip_sgid - handle the sgid bit for non-directories
+ * @dir: parent directory inode
+ * @mode: mode of the file to be created in @dir
+@@ -2398,11 +2418,8 @@ umode_t mode_strip_sgid(const struct ino
+ return mode;
+ if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID))
+ return mode;
+- if (in_group_p(dir->i_gid))
++ if (in_group_or_capable(dir, dir->i_gid))
+ return mode;
+- if (capable_wrt_inode_uidgid(dir, CAP_FSETID))
+- return mode;
+-
+ return mode & ~S_ISGID;
+ }
+ EXPORT_SYMBOL(mode_strip_sgid);
+--- a/fs/internal.h
++++ b/fs/internal.h
+@@ -149,6 +149,7 @@ extern int vfs_open(const struct path *,
+ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
+ extern void inode_add_lru(struct inode *inode);
+ extern int dentry_needs_remove_privs(struct dentry *dentry);
++bool in_group_or_capable(const struct inode *inode, kgid_t gid);
+
+ /*
+ * fs-writeback.c
--- /dev/null
+From stable-owner@vger.kernel.org Sat Mar 18 11:16:16 2023
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Sat, 18 Mar 2023 12:15:26 +0200
+Subject: attr: add setattr_should_drop_sgid()
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, Christian Brauner <brauner@kernel.org>, linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org
+Message-ID: <20230318101529.1361673-13-amir73il@gmail.com>
+
+From: Amir Goldstein <amir73il@gmail.com>
+
+commit 72ae017c5451860443a16fb2a8c243bff3e396b8 upstream.
+
+[backported to 5.10.y, prior to idmapped mounts]
+
+The current setgid stripping logic during write and ownership change
+operations is inconsistent and strewn over multiple places. In order to
+consolidate it and make it more consistent we'll add a new helper
+setattr_should_drop_sgid(). The function retains the old behavior where
+we remove the S_ISGID bit unconditionally when S_IXGRP is set but also
+when it isn't set and the caller is neither in the group of the inode
+nor privileged over the inode.
+
+We will use this helper both in write operation permission removal such
+as file_remove_privs() as well as in ownership change operations.
+
+Reviewed-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/attr.c | 25 +++++++++++++++++++++++++
+ fs/internal.h | 5 +++++
+ 2 files changed, 30 insertions(+)
+
+--- a/fs/attr.c
++++ b/fs/attr.c
+@@ -20,6 +20,31 @@
+
+ #include "internal.h"
+
++/**
++ * setattr_should_drop_sgid - determine whether the setgid bit needs to be
++ * removed
++ * @inode: inode to check
++ *
++ * This function determines whether the setgid bit needs to be removed.
++ * We retain backwards compatibility and require setgid bit to be removed
++ * unconditionally if S_IXGRP is set. Otherwise we have the exact same
++ * requirements as setattr_prepare() and setattr_copy().
++ *
++ * Return: ATTR_KILL_SGID if setgid bit needs to be removed, 0 otherwise.
++ */
++int setattr_should_drop_sgid(const struct inode *inode)
++{
++ umode_t mode = inode->i_mode;
++
++ if (!(mode & S_ISGID))
++ return 0;
++ if (mode & S_IXGRP)
++ return ATTR_KILL_SGID;
++ if (!in_group_or_capable(inode, inode->i_gid))
++ return ATTR_KILL_SGID;
++ return 0;
++}
++
+ /*
+ * The logic we want is
+ *
+--- a/fs/internal.h
++++ b/fs/internal.h
+@@ -197,3 +197,8 @@ int sb_init_dio_done_wq(struct super_blo
+ */
+ int do_statx(int dfd, const char __user *filename, unsigned flags,
+ unsigned int mask, struct statx __user *buffer);
++
++/*
++ * fs/attr.c
++ */
++int setattr_should_drop_sgid(const struct inode *inode);
--- /dev/null
+From stable-owner@vger.kernel.org Sat Mar 18 11:16:16 2023
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Sat, 18 Mar 2023 12:15:27 +0200
+Subject: attr: use consistent sgid stripping checks
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, Christian Brauner <brauner@kernel.org>, linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org
+Message-ID: <20230318101529.1361673-14-amir73il@gmail.com>
+
+From: Amir Goldstein <amir73il@gmail.com>
+
+commit ed5a7047d2011cb6b2bf84ceb6680124cc6a7d95 upstream.
+
+[backported to 5.10.y, prior to idmapped mounts]
+
+Currently setgid stripping in file_remove_privs()'s should_remove_suid()
+helper is inconsistent with other parts of the vfs. Specifically, it only
+raises ATTR_KILL_SGID if the inode is S_ISGID and S_IXGRP but not if the
+inode isn't in the caller's groups and the caller isn't privileged over the
+inode although we require this already in setattr_prepare() and
+setattr_copy() and so all filesystems implement this requirement implicitly
+because they have to use setattr_{prepare,copy}() anyway.
+
+But the inconsistency shows up in setgid stripping bugs for overlayfs in
+xfstests (e.g., generic/673, generic/683, generic/685, generic/686,
+generic/687). For example, we test whether suid and setgid stripping works
+correctly when performing various write-like operations as an unprivileged
+user (fallocate, reflink, write, etc.):
+
+echo "Test 1 - qa_user, non-exec file $verb"
+setup_testfile
+chmod a+rws $junk_file
+commit_and_check "$qa_user" "$verb" 64k 64k
+
+The test basically creates a file with 6666 permissions. While the file has
+the S_ISUID and S_ISGID bits set it does not have the S_IXGRP set. On a
+regular filesystem like xfs what will happen is:
+
+sys_fallocate()
+-> vfs_fallocate()
+ -> xfs_file_fallocate()
+ -> file_modified()
+ -> __file_remove_privs()
+ -> dentry_needs_remove_privs()
+ -> should_remove_suid()
+ -> __remove_privs()
+ newattrs.ia_valid = ATTR_FORCE | kill;
+ -> notify_change()
+ -> setattr_copy()
+
+In should_remove_suid() we can see that ATTR_KILL_SUID is raised
+unconditionally because the file in the test has S_ISUID set.
+
+But we also see that ATTR_KILL_SGID won't be set because while the file
+is S_ISGID it is not S_IXGRP (see above) which is a condition for
+ATTR_KILL_SGID being raised.
+
+So by the time we call notify_change() we have attr->ia_valid set to
+ATTR_KILL_SUID | ATTR_FORCE. Now notify_change() sees that
+ATTR_KILL_SUID is set and does:
+
+ia_valid = attr->ia_valid |= ATTR_MODE
+attr->ia_mode = (inode->i_mode & ~S_ISUID);
+
+which means that when we call setattr_copy() later we will definitely
+update inode->i_mode. Note that attr->ia_mode still contains S_ISGID.
+
+Now we call into the filesystem's ->setattr() inode operation which will
+end up calling setattr_copy(). Since ATTR_MODE is set we will hit:
+
+if (ia_valid & ATTR_MODE) {
+ umode_t mode = attr->ia_mode;
+ vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
+ if (!vfsgid_in_group_p(vfsgid) &&
+ !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
+ mode &= ~S_ISGID;
+ inode->i_mode = mode;
+}
+
+and since the caller in the test is neither capable nor in the group of the
+inode the S_ISGID bit is stripped.
+
+But assume the file isn't suid: then ATTR_KILL_SUID won't be raised, which
+has the consequence that neither the setgid nor the suid bits are stripped
+even though the setgid bit should be stripped because the inode isn't in
+the caller's groups and the caller isn't privileged over the inode.
+
+If overlayfs is in the mix things become a bit more complicated and the bug
+shows up more clearly. When e.g., ovl_setattr() is hit from
+ovl_fallocate()'s call to file_remove_privs() then ATTR_KILL_SUID and
+ATTR_KILL_SGID might be raised but because the check in notify_change() is
+questioning the ATTR_KILL_SGID flag again by requiring S_IXGRP for it to be
+stripped the S_ISGID bit isn't removed even though it should be stripped:
+
+sys_fallocate()
+-> vfs_fallocate()
+ -> ovl_fallocate()
+ -> file_remove_privs()
+ -> dentry_needs_remove_privs()
+ -> should_remove_suid()
+ -> __remove_privs()
+ newattrs.ia_valid = ATTR_FORCE | kill;
+ -> notify_change()
+ -> ovl_setattr()
+ // TAKE ON MOUNTER'S CREDS
+ -> ovl_do_notify_change()
+ -> notify_change()
+ // GIVE UP MOUNTER'S CREDS
+ // TAKE ON MOUNTER'S CREDS
+ -> vfs_fallocate()
+ -> xfs_file_fallocate()
+ -> file_modified()
+ -> __file_remove_privs()
+ -> dentry_needs_remove_privs()
+ -> should_remove_suid()
+ -> __remove_privs()
+ newattrs.ia_valid = attr_force | kill;
+ -> notify_change()
+
+The fix for all of this is to make file_remove_privs()'s
+should_remove_suid() helper to perform the same checks as we already
+require in setattr_prepare() and setattr_copy() and have notify_change()
+not pointlessly requiring S_IXGRP again. It doesn't make any sense in the
+first place because the caller must calculate the flags via
+should_remove_suid() anyway which would raise ATTR_KILL_SGID.
+
+While we're at it we move should_remove_suid() from inode.c to attr.c
+where it belongs with the rest of the iattr helpers. Especially since it
+returns ATTR_KILL_S{G,U}ID flags. We also rename it to
+setattr_should_drop_suidgid() to better reflect that it indicates both
+setuid and setgid bit removal and also that it returns attr flags.
+
+Running xfstests with this doesn't report any regressions. We should really
+try and use consistent checks.
+
+Reviewed-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Documentation/trace/ftrace.rst | 2 +-
+ fs/attr.c | 31 +++++++++++++++++--------------
+ fs/inode.c | 2 +-
+ fs/ocfs2/file.c | 4 ++--
+ fs/open.c | 6 +++---
+ include/linux/fs.h | 2 +-
+ 6 files changed, 25 insertions(+), 22 deletions(-)
+
+--- a/Documentation/trace/ftrace.rst
++++ b/Documentation/trace/ftrace.rst
+@@ -2923,7 +2923,7 @@ Produces::
+ bash-1994 [000] .... 4342.324898: ima_get_action <-process_measurement
+ bash-1994 [000] .... 4342.324898: ima_match_policy <-ima_get_action
+ bash-1994 [000] .... 4342.324899: do_truncate <-do_last
+- bash-1994 [000] .... 4342.324899: should_remove_suid <-do_truncate
++ bash-1994 [000] .... 4342.324899: setattr_should_drop_suidgid <-do_truncate
+ bash-1994 [000] .... 4342.324899: notify_change <-do_truncate
+ bash-1994 [000] .... 4342.324900: current_fs_time <-notify_change
+ bash-1994 [000] .... 4342.324900: current_kernel_time <-current_fs_time
+--- a/fs/attr.c
++++ b/fs/attr.c
+@@ -45,34 +45,37 @@ int setattr_should_drop_sgid(const struc
+ return 0;
+ }
+
+-/*
+- * The logic we want is
++/**
++ * setattr_should_drop_suidgid - determine whether the set{g,u}id bit needs to
++ * be dropped
++ * @inode: inode to check
++ *
++ * This function determines whether the set{g,u}id bits need to be removed.
++ * If the setuid bit needs to be removed ATTR_KILL_SUID is returned. If the
++ * setgid bit needs to be removed ATTR_KILL_SGID is returned. If both
++ * set{g,u}id bits need to be removed the corresponding mask of both flags is
++ * returned.
+ *
+- * if suid or (sgid and xgrp)
+- * remove privs
++ * Return: A mask of ATTR_KILL_S{G,U}ID indicating which - if any - setid bits
++ * to remove, 0 otherwise.
+ */
+-int should_remove_suid(struct dentry *dentry)
++int setattr_should_drop_suidgid(struct inode *inode)
+ {
+- umode_t mode = d_inode(dentry)->i_mode;
++ umode_t mode = inode->i_mode;
+ int kill = 0;
+
+ /* suid always must be killed */
+ if (unlikely(mode & S_ISUID))
+ kill = ATTR_KILL_SUID;
+
+- /*
+- * sgid without any exec bits is just a mandatory locking mark; leave
+- * it alone. If some exec bits are set, it's a real sgid; kill it.
+- */
+- if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
+- kill |= ATTR_KILL_SGID;
++ kill |= setattr_should_drop_sgid(inode);
+
+ if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
+ return kill;
+
+ return 0;
+ }
+-EXPORT_SYMBOL(should_remove_suid);
++EXPORT_SYMBOL(setattr_should_drop_suidgid);
+
+ static bool chown_ok(const struct inode *inode, kuid_t uid)
+ {
+@@ -350,7 +353,7 @@ int notify_change(struct dentry * dentry
+ }
+ }
+ if (ia_valid & ATTR_KILL_SGID) {
+- if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
++ if (mode & S_ISGID) {
+ if (!(ia_valid & ATTR_MODE)) {
+ ia_valid = attr->ia_valid |= ATTR_MODE;
+ attr->ia_mode = inode->i_mode;
+--- a/fs/inode.c
++++ b/fs/inode.c
+@@ -1868,7 +1868,7 @@ int dentry_needs_remove_privs(struct den
+ if (IS_NOSEC(inode))
+ return 0;
+
+- mask = should_remove_suid(dentry);
++ mask = setattr_should_drop_suidgid(inode);
+ ret = security_inode_need_killpriv(dentry);
+ if (ret < 0)
+ return ret;
+--- a/fs/ocfs2/file.c
++++ b/fs/ocfs2/file.c
+@@ -1994,7 +1994,7 @@ static int __ocfs2_change_file_space(str
+ }
+ }
+
+- if (file && should_remove_suid(file->f_path.dentry)) {
++ if (file && setattr_should_drop_suidgid(file_inode(file))) {
+ ret = __ocfs2_write_remove_suid(inode, di_bh);
+ if (ret) {
+ mlog_errno(ret);
+@@ -2282,7 +2282,7 @@ static int ocfs2_prepare_inode_for_write
+ * inode. There's also the dinode i_size state which
+ * can be lost via setattr during extending writes (we
+ * set inode->i_size at the end of a write. */
+- if (should_remove_suid(dentry)) {
++ if (setattr_should_drop_suidgid(inode)) {
+ if (meta_level == 0) {
+ ocfs2_inode_unlock_for_extent_tree(inode,
+ &di_bh,
+--- a/fs/open.c
++++ b/fs/open.c
+@@ -665,10 +665,10 @@ retry_deleg:
+ newattrs.ia_valid |= ATTR_GID;
+ newattrs.ia_gid = gid;
+ }
+- if (!S_ISDIR(inode->i_mode))
+- newattrs.ia_valid |=
+- ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
+ inode_lock(inode);
++ if (!S_ISDIR(inode->i_mode))
++ newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV |
++ setattr_should_drop_sgid(inode);
+ error = security_path_chown(path, uid, gid);
+ if (!error)
+ error = notify_change(path->dentry, &newattrs, &delegated_inode);
+--- a/include/linux/fs.h
++++ b/include/linux/fs.h
+@@ -2960,7 +2960,7 @@ extern void __destroy_inode(struct inode
+ extern struct inode *new_inode_pseudo(struct super_block *sb);
+ extern struct inode *new_inode(struct super_block *sb);
+ extern void free_inode_nonrcu(struct inode *inode);
+-extern int should_remove_suid(struct dentry *);
++extern int setattr_should_drop_suidgid(struct inode *);
+ extern int file_remove_privs(struct file *);
+
+ extern void __insert_inode_hash(struct inode *, unsigned long hashval);
--- /dev/null
+From e0e6b416b25ee14716f3549e0cbec1011b193809 Mon Sep 17 00:00:00 2001
+From: Janusz Krzysztofik <janusz.krzysztofik@linux.intel.com>
+Date: Thu, 2 Mar 2023 13:08:20 +0100
+Subject: drm/i915/active: Fix misuse of non-idle barriers as fence trackers
+
+From: Janusz Krzysztofik <janusz.krzysztofik@linux.intel.com>
+
+commit e0e6b416b25ee14716f3549e0cbec1011b193809 upstream.
+
+Users reported oopses on list corruptions when using i915 perf with a
+number of concurrently running graphics applications. Root cause analysis
+pointed at an issue in barrier processing code -- a race among perf open /
+close replacing active barriers with perf requests on kernel context and
+concurrent barrier preallocate / acquire operations performed during user
+context first pin / last unpin.
+
+When adding a request to a composite tracker, we try to reuse an existing
+fence tracker, already allocated and registered with that composite. The
+tracker we obtain may already track another fence, may be an idle barrier,
+or an active barrier.
+
+If the tracker we get is a non-idle barrier then we try to delete that
+barrier from a list of barrier tasks it belongs to. However, while doing
+that we don't respect return value from a function that performs the
+barrier deletion. Should the deletion ever fail, we would end up reusing
+the tracker still registered as a barrier task. Since the same structure
+field is reused with both fence callback lists and barrier tasks list,
+list corruptions would likely occur.
+
+Barriers are now deleted from a barrier tasks list by temporarily removing
+the list content, traversing that content with skip over the node to be
+deleted, then populating the list back with the modified content. Should
+Should those intentionally racy concurrent deletion attempts not be
+serialized, one or more may fail because the list is temporarily empty.
+
+Related code that ignores the results of barrier deletion was initially
+introduced in v5.4 by commit d8af05ff38ae ("drm/i915: Allow sharing the
+idle-barrier from other kernel requests"). However, all users of the
+barrier deletion routine were apparently serialized at that time, so the
+issue didn't exhibit itself. Results of git bisect with help of a newly
+developed igt@gem_barrier_race@remote-request IGT test indicate that list
+corruptions might start to appear after commit 311770173fac ("drm/i915/gt:
+Schedule request retirement when timeline idles"), introduced in v5.5.
+
+Respect results of barrier deletion attempts -- mark the barrier as idle
+only if successfully deleted from the list. Then, before proceeding with
+setting our fence as the one currently tracked, make sure that the tracker
+we've got is not a non-idle barrier. If that check fails then don't use
+that tracker but go back and try to acquire a new, usable one.
+
+v3: use unlikely() to document what outcome we expect (Andi),
+ - fix bad grammar in commit description.
+v2: no code changes,
+ - blame commit 311770173fac ("drm/i915/gt: Schedule request retirement
+ when timeline idles"), v5.5, not commit d8af05ff38ae ("drm/i915: Allow
+ sharing the idle-barrier from other kernel requests"), v5.4,
+ - reword commit description.
+
+Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/6333
+Fixes: 311770173fac ("drm/i915/gt: Schedule request retirement when timeline idles")
+Cc: Chris Wilson <chris@chris-wilson.co.uk>
+Cc: stable@vger.kernel.org # v5.5
+Cc: Andi Shyti <andi.shyti@linux.intel.com>
+Signed-off-by: Janusz Krzysztofik <janusz.krzysztofik@linux.intel.com>
+Reviewed-by: Andi Shyti <andi.shyti@linux.intel.com>
+Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/20230302120820.48740-1-janusz.krzysztofik@linux.intel.com
+(cherry picked from commit 506006055769b10d1b2b4e22f636f3b45e0e9fc7)
+Signed-off-by: Jani Nikula <jani.nikula@intel.com>
+Signed-off-by: Janusz Krzysztofik <janusz.krzysztofik@linux.intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/i915/i915_active.c | 26 ++++++++++++++------------
+ 1 file changed, 14 insertions(+), 12 deletions(-)
+
+--- a/drivers/gpu/drm/i915/i915_active.c
++++ b/drivers/gpu/drm/i915/i915_active.c
+@@ -432,8 +432,7 @@ replace_barrier(struct i915_active *ref,
+ * we can use it to substitute for the pending idle-barrer
+ * request that we want to emit on the kernel_context.
+ */
+- __active_del_barrier(ref, node_from_active(active));
+- return true;
++ return __active_del_barrier(ref, node_from_active(active));
+ }
+
+ int i915_active_ref(struct i915_active *ref, u64 idx, struct dma_fence *fence)
+@@ -446,16 +445,19 @@ int i915_active_ref(struct i915_active *
+ if (err)
+ return err;
+
+- active = active_instance(ref, idx);
+- if (!active) {
+- err = -ENOMEM;
+- goto out;
+- }
+-
+- if (replace_barrier(ref, active)) {
+- RCU_INIT_POINTER(active->fence, NULL);
+- atomic_dec(&ref->count);
+- }
++ do {
++ active = active_instance(ref, idx);
++ if (!active) {
++ err = -ENOMEM;
++ goto out;
++ }
++
++ if (replace_barrier(ref, active)) {
++ RCU_INIT_POINTER(active->fence, NULL);
++ atomic_dec(&ref->count);
++ }
++ } while (unlikely(is_barrier(active)));
++
+ if (!__i915_active_fence_set(active, fence))
+ __i915_active_acquire(ref);
+
--- /dev/null
+From 690e0ec8e63da9a29b39fedc6ed5da09c7c82651 Mon Sep 17 00:00:00 2001
+From: John Harrison <John.C.Harrison@Intel.com>
+Date: Wed, 15 Feb 2023 17:11:00 -0800
+Subject: drm/i915: Don't use stolen memory for ring buffers with LLC
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: John Harrison <John.C.Harrison@Intel.com>
+
+commit 690e0ec8e63da9a29b39fedc6ed5da09c7c82651 upstream.
+
+Direction from hardware is that stolen memory should never be used for
+ring buffer allocations on platforms with LLC. There are too many
+caching pitfalls due to the way stolen memory accesses are routed. So
+it is safest to just not use it.
+
+Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
+Fixes: c58b735fc762 ("drm/i915: Allocate rings from stolen")
+Cc: Chris Wilson <chris@chris-wilson.co.uk>
+Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
+Cc: Jani Nikula <jani.nikula@linux.intel.com>
+Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
+Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
+Cc: intel-gfx@lists.freedesktop.org
+Cc: <stable@vger.kernel.org> # v4.9+
+Tested-by: Jouni Högander <jouni.hogander@intel.com>
+Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/20230216011101.1909009-2-John.C.Harrison@Intel.com
+(cherry picked from commit f54c1f6c697c4297f7ed94283c184acc338a5cf8)
+Signed-off-by: Jani Nikula <jani.nikula@intel.com>
+Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/i915/gt/intel_ring.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/gpu/drm/i915/gt/intel_ring.c
++++ b/drivers/gpu/drm/i915/gt/intel_ring.c
+@@ -108,7 +108,7 @@ static struct i915_vma *create_ring_vma(
+ struct i915_vma *vma;
+
+ obj = ERR_PTR(-ENODEV);
+- if (i915_ggtt_has_aperture(ggtt))
++ if (i915_ggtt_has_aperture(ggtt) && !HAS_LLC(i915))
+ obj = i915_gem_object_create_stolen(i915, size);
+ if (IS_ERR(obj))
+ obj = i915_gem_object_create_internal(i915, size);
--- /dev/null
+From stable-owner@vger.kernel.org Sat Mar 18 11:16:11 2023
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Sat, 18 Mar 2023 12:15:22 +0200
+Subject: fs: add mode_strip_sgid() helper
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, Christian Brauner <brauner@kernel.org>, linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Yang Xu <xuyang2018.jy@fujitsu.com>, Jeff Layton <jlayton@kernel.org>
+Message-ID: <20230318101529.1361673-9-amir73il@gmail.com>
+
+From: Yang Xu <xuyang2018.jy@fujitsu.com>
+
+commit 2b3416ceff5e6bd4922f6d1c61fb68113dd82302 upstream.
+
+[remove userns argument of helper for 5.10.y backport]
+
+Add a dedicated helper to handle the setgid bit when creating a new file
+in a setgid directory. This is a preparatory patch for moving setgid
+stripping into the vfs. The patch contains no functional changes.
+
+Currently the setgid stripping logic is open-coded directly in
+inode_init_owner() and the individual filesystems are responsible for
+handling setgid inheritance. Since this has proven to be brittle as
+evidenced by old issues we uncovered over the last months (see [1] to
+[3] below) we will try to move this logic into the vfs.
+
+Link: e014f37db1a2 ("xfs: use setattr_copy to set vfs inode attributes") [1]
+Link: 01ea173e103e ("xfs: fix up non-directory creation in SGID directories") [2]
+Link: fd84bfdddd16 ("ceph: fix up non-directory creation in SGID directories") [3]
+Link: https://lore.kernel.org/r/1657779088-2242-1-git-send-email-xuyang2018.jy@fujitsu.com
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Christian Brauner (Microsoft) <brauner@kernel.org>
+Reviewed-and-Tested-by: Jeff Layton <jlayton@kernel.org>
+Signed-off-by: Yang Xu <xuyang2018.jy@fujitsu.com>
+Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/inode.c | 34 ++++++++++++++++++++++++++++++----
+ include/linux/fs.h | 1 +
+ 2 files changed, 31 insertions(+), 4 deletions(-)
+
+--- a/fs/inode.c
++++ b/fs/inode.c
+@@ -2147,10 +2147,8 @@ void inode_init_owner(struct inode *inod
+ /* Directories are special, and always inherit S_ISGID */
+ if (S_ISDIR(mode))
+ mode |= S_ISGID;
+- else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
+- !in_group_p(inode->i_gid) &&
+- !capable_wrt_inode_uidgid(dir, CAP_FSETID))
+- mode &= ~S_ISGID;
++ else
++ mode = mode_strip_sgid(dir, mode);
+ } else
+ inode->i_gid = current_fsgid();
+ inode->i_mode = mode;
+@@ -2382,3 +2380,31 @@ int vfs_ioc_fssetxattr_check(struct inod
+ return 0;
+ }
+ EXPORT_SYMBOL(vfs_ioc_fssetxattr_check);
++
++/**
++ * mode_strip_sgid - handle the sgid bit for non-directories
++ * @dir: parent directory inode
++ * @mode: mode of the file to be created in @dir
++ *
++ * If the @mode of the new file has both the S_ISGID and S_IXGRP bit
++ * raised and @dir has the S_ISGID bit raised ensure that the caller is
++ * either in the group of the parent directory or they have CAP_FSETID
++ * in their user namespace and are privileged over the parent directory.
++ * In all other cases, strip the S_ISGID bit from @mode.
++ *
++ * Return: the new mode to use for the file
++ */
++umode_t mode_strip_sgid(const struct inode *dir, umode_t mode)
++{
++ if ((mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP))
++ return mode;
++ if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID))
++ return mode;
++ if (in_group_p(dir->i_gid))
++ return mode;
++ if (capable_wrt_inode_uidgid(dir, CAP_FSETID))
++ return mode;
++
++ return mode & ~S_ISGID;
++}
++EXPORT_SYMBOL(mode_strip_sgid);
+--- a/include/linux/fs.h
++++ b/include/linux/fs.h
+@@ -1768,6 +1768,7 @@ extern long compat_ptr_ioctl(struct file
+ extern void inode_init_owner(struct inode *inode, const struct inode *dir,
+ umode_t mode);
+ extern bool may_open_dev(const struct path *path);
++umode_t mode_strip_sgid(const struct inode *dir, umode_t mode);
+
+ /*
+ * This is the "filldir" function type, used by readdir() to let
--- /dev/null
+From stable-owner@vger.kernel.org Sat Mar 18 11:16:10 2023
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Sat, 18 Mar 2023 12:15:23 +0200
+Subject: fs: move S_ISGID stripping into the vfs_*() helpers
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, Christian Brauner <brauner@kernel.org>, linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Yang Xu <xuyang2018.jy@fujitsu.com>, Dave Chinner <david@fromorbit.com>, Jeff Layton <jlayton@kernel.org>
+Message-ID: <20230318101529.1361673-10-amir73il@gmail.com>
+
+From: Yang Xu <xuyang2018.jy@fujitsu.com>
+
+commit 1639a49ccdce58ea248841ed9b23babcce6dbb0b upstream.
+
+[remove userns argument of helpers for 5.10.y backport]
+
+Move setgid handling out of individual filesystems and into the VFS
+itself to stop the proliferation of setgid inheritance bugs.
+
+Creating files that have both the S_IXGRP and S_ISGID bit raised in
+directories that themselves have the S_ISGID bit set requires additional
+privileges to avoid security issues.
+
+When a filesystem creates a new inode it needs to take care that the
+caller is either in the group of the newly created inode or they have
+CAP_FSETID in their current user namespace and are privileged over the
+parent directory of the new inode. If any of these two conditions is
+true then the S_ISGID bit can be raised for an S_IXGRP file and if not
+it needs to be stripped.
+
+However, there are several key issues with the current implementation:
+
+* S_ISGID stripping logic is entangled with umask stripping.
+
+ If a filesystem doesn't support or enable POSIX ACLs then umask
+ stripping is done directly in the vfs before calling into the
+ filesystem.
+ If the filesystem does support POSIX ACLs then umask stripping may be
+ done in the filesystem itself when calling posix_acl_create().
+
+ Since umask stripping has an effect on S_ISGID inheritance, e.g., by
+ stripping the S_IXGRP bit from the file to be created and all relevant
+ filesystems have to call posix_acl_create() before inode_init_owner()
+ where we currently take care of S_ISGID handling, S_ISGID handling is
+ order dependent. IOW, whether or not you get a setgid bit depends on
+ POSIX ACLs and umask and in what order they are called.
+
+ Note that technically filesystems are free to impose their own
+ ordering between posix_acl_create() and inode_init_owner() meaning
+ that there are additional ordering issues that influence S_ISGID
+ inheritance.
+
+* Filesystems that don't rely on inode_init_owner() don't get S_ISGID
+ stripping logic.
+
+ While that may be intentional (e.g. network filesystems might just
+ defer setgid stripping to a server) it is often just a security issue.
+
+This is not just ugly it's unsustainably messy especially since we do
+still have bugs in this area years after the initial round of setgid
+bugfixes.
+
+So the current state is quite messy and while we won't be able to make
+it completely clean as posix_acl_create() is still a filesystem specific
+call we can improve the S_ISGID stripping situation quite a bit by
+hoisting it out of inode_init_owner() and into the vfs creation
+operations. This means we alleviate the burden for filesystems to handle
+S_ISGID stripping correctly and can standardize the ordering between
+S_ISGID and umask stripping in the vfs.
+
+We add a new helper vfs_prepare_mode() so S_ISGID handling is now done
+in the VFS before umask handling. This way S_ISGID handling is
+unaffected by whether umask stripping is done by the VFS
+itself (if no POSIX ACLs are supported or enabled) or in the filesystem
+in posix_acl_create() (if POSIX ACLs are supported).
+
+The vfs_prepare_mode() helper is called directly in vfs_*() helpers that
+create new filesystem objects. We need to move them into there to make
+sure that filesystems like overlayfs that have callchains like:
+
+sys_mknod()
+-> do_mknodat(mode)
+ -> .mknod = ovl_mknod(mode)
+ -> ovl_create(mode)
+ -> vfs_mknod(mode)
+
+get S_ISGID stripping done when calling into lower filesystems via
+vfs_*() creation helpers. Moving vfs_prepare_mode() into e.g.
+vfs_mknod() takes care of that. This is in any case semantically cleaner
+because S_ISGID stripping is a VFS security requirement.
+
+Security hooks so far have seen the mode with the umask applied but
+without S_ISGID handling done. The relevant hooks are called outside of
+vfs_*() creation helpers so by calling vfs_prepare_mode() from vfs_*()
+helpers the security hooks would now see the mode without umask
+stripping applied. For now we fix this by passing the mode with umask
+settings applied to not risk any regressions for LSM hooks. IOW, nothing
+changes for LSM hooks. It is worth pointing out that security hooks
+never saw the mode that is seen by the filesystem when actually creating
+the file. They have always been completely misplaced for that to work.
+
+The following filesystems use inode_init_owner() and thus relied on
+S_ISGID stripping: spufs, 9p, bfs, btrfs, ext2, ext4, f2fs, hfsplus,
+hugetlbfs, jfs, minix, nilfs2, ntfs3, ocfs2, omfs, overlayfs, ramfs,
+reiserfs, sysv, ubifs, udf, ufs, xfs, zonefs, bpf, tmpfs.
+
+All of the above filesystems end up calling inode_init_owner() when new
+filesystem objects are created through the ->mkdir(), ->mknod(),
+->create(), ->tmpfile(), ->rename() inode operations.
+
+Since directories always inherit the S_ISGID bit with the exception of
+xfs when irix_sgid_inherit mode is turned on S_ISGID stripping doesn't
+apply. The ->symlink() and ->link() inode operations trivially inherit
+the mode from the target and the ->rename() inode operation inherits the
+mode from the source inode. All other creation inode operations will get
+S_ISGID handling via vfs_prepare_mode() when called from their relevant
+vfs_*() helpers.
+
+In addition to this there are filesystems which allow the creation of
+filesystem objects through ioctl()s or - in the case of spufs -
+circumventing the vfs in other ways. If filesystem objects are created
+through ioctl()s the vfs doesn't know about it and can't apply regular
+permission checking including S_ISGID logic. Therefore, a filesystem
+relying on S_ISGID stripping in inode_init_owner() in their ioctl()
+callpath will be affected by moving this logic into the vfs. We audited
+those filesystems:
+
+* btrfs allows the creation of filesystem objects through various
+ ioctls(). Snapshot creation literally takes a snapshot and so the mode
+ is fully preserved and S_ISGID stripping doesn't apply.
+
+ Creating a new subvolume relies on inode_init_owner() in
+ btrfs_new_subvol_inode() but only creates directories and doesn't
+ raise S_ISGID.
+
+* ocfs2 has a peculiar implementation of reflinks. In contrast to e.g.
+ xfs and btrfs FICLONE/FICLONERANGE ioctl() that is only concerned with
+ the actual extents ocfs2 uses a separate ioctl() that also creates the
+ target file.
+
+ Iow, ocfs2 circumvents the vfs entirely here and did indeed rely on
+ inode_init_owner() to strip the S_ISGID bit. This is the only place
+ where a filesystem needs to call mode_strip_sgid() directly but this
+ is self-inflicted pain.
+
+* spufs doesn't go through the vfs at all and doesn't use ioctl()s
+ either. Instead it has a dedicated system call spufs_create() which
+ allows the creation of filesystem objects. But spufs only creates
+ directories and doesn't allow S_ISGID bits, i.e. it specifically only
+ allows 0777 bits.
+
+* bpf uses vfs_mkobj() but also doesn't allow S_ISGID bits to be created.
+
+The patch will have an effect on ext2 when the EXT2_MOUNT_GRPID mount
+option is used, on ext4 when the EXT4_MOUNT_GRPID mount option is used,
+and on xfs when the XFS_FEAT_GRPID mount option is used. When any of
+these filesystems are mounted with their respective GRPID option then
+newly created files inherit the parent directories group
+unconditionally. In these cases none of the filesystems call
+inode_init_owner() and thus did never strip the S_ISGID bit for newly
+created files. Moving this logic into the VFS means that they now get
+the S_ISGID bit stripped. This is a user visible change. If this leads
+to regressions we will either need to figure out a better way or we need
+to revert. However, given the various setgid bugs that we found just in
+the last two years this is a regression risk we should take.
+
+Associated with this change is a new set of fstests to enforce the
+semantics for all new filesystems.
+
+Link: https://lore.kernel.org/ceph-devel/20220427092201.wvsdjbnc7b4dttaw@wittgenstein [1]
+Link: e014f37db1a2 ("xfs: use setattr_copy to set vfs inode attributes") [2]
+Link: 01ea173e103e ("xfs: fix up non-directory creation in SGID directories") [3]
+Link: fd84bfdddd16 ("ceph: fix up non-directory creation in SGID directories") [4]
+Link: https://lore.kernel.org/r/1657779088-2242-3-git-send-email-xuyang2018.jy@fujitsu.com
+Suggested-by: Dave Chinner <david@fromorbit.com>
+Suggested-by: Christian Brauner (Microsoft) <brauner@kernel.org>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-and-Tested-by: Jeff Layton <jlayton@kernel.org>
+Signed-off-by: Yang Xu <xuyang2018.jy@fujitsu.com>
+[<brauner@kernel.org>: rewrote commit message]
+Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/inode.c | 2 -
+ fs/namei.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++---------
+ fs/ocfs2/namei.c | 1
+ 3 files changed, 68 insertions(+), 15 deletions(-)
+
+--- a/fs/inode.c
++++ b/fs/inode.c
+@@ -2147,8 +2147,6 @@ void inode_init_owner(struct inode *inod
+ /* Directories are special, and always inherit S_ISGID */
+ if (S_ISDIR(mode))
+ mode |= S_ISGID;
+- else
+- mode = mode_strip_sgid(dir, mode);
+ } else
+ inode->i_gid = current_fsgid();
+ inode->i_mode = mode;
+--- a/fs/namei.c
++++ b/fs/namei.c
+@@ -2798,6 +2798,63 @@ void unlock_rename(struct dentry *p1, st
+ }
+ EXPORT_SYMBOL(unlock_rename);
+
++/**
++ * mode_strip_umask - handle vfs umask stripping
++ * @dir: parent directory of the new inode
++ * @mode: mode of the new inode to be created in @dir
++ *
++ * Umask stripping depends on whether or not the filesystem supports POSIX
++ * ACLs. If the filesystem doesn't support it umask stripping is done directly
++ * in here. If the filesystem does support POSIX ACLs umask stripping is
++ * deferred until the filesystem calls posix_acl_create().
++ *
++ * Returns: mode
++ */
++static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode)
++{
++ if (!IS_POSIXACL(dir))
++ mode &= ~current_umask();
++ return mode;
++}
++
++/**
++ * vfs_prepare_mode - prepare the mode to be used for a new inode
++ * @dir: parent directory of the new inode
++ * @mode: mode of the new inode
++ * @mask_perms: allowed permission by the vfs
++ * @type: type of file to be created
++ *
++ * This helper consolidates and enforces vfs restrictions on the @mode of a new
++ * object to be created.
++ *
++ * Umask stripping depends on whether the filesystem supports POSIX ACLs (see
++ * the kernel documentation for mode_strip_umask()). Moving umask stripping
++ * after setgid stripping allows the same ordering for both non-POSIX ACL and
++ * POSIX ACL supporting filesystems.
++ *
++ * Note that it's currently valid for @type to be 0 if a directory is created.
++ * Filesystems raise that flag individually and we need to check whether each
++ * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a
++ * non-zero type.
++ *
++ * Returns: mode to be passed to the filesystem
++ */
++static inline umode_t vfs_prepare_mode(const struct inode *dir, umode_t mode,
++ umode_t mask_perms, umode_t type)
++{
++ mode = mode_strip_sgid(dir, mode);
++ mode = mode_strip_umask(dir, mode);
++
++ /*
++ * Apply the vfs mandated allowed permission mask and set the type of
++ * file to be created before we call into the filesystem.
++ */
++ mode &= (mask_perms & ~S_IFMT);
++ mode |= (type & S_IFMT);
++
++ return mode;
++}
++
+ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ bool want_excl)
+ {
+@@ -2807,8 +2864,8 @@ int vfs_create(struct inode *dir, struct
+
+ if (!dir->i_op->create)
+ return -EACCES; /* shouldn't it be ENOSYS? */
+- mode &= S_IALLUGO;
+- mode |= S_IFREG;
++
++ mode = vfs_prepare_mode(dir, mode, S_IALLUGO, S_IFREG);
+ error = security_inode_create(dir, dentry, mode);
+ if (error)
+ return error;
+@@ -3072,8 +3129,7 @@ static struct dentry *lookup_open(struct
+ if (open_flag & O_CREAT) {
+ if (open_flag & O_EXCL)
+ open_flag &= ~O_TRUNC;
+- if (!IS_POSIXACL(dir->d_inode))
+- mode &= ~current_umask();
++ mode = vfs_prepare_mode(dir->d_inode, mode, mode, mode);
+ if (likely(got_write))
+ create_error = may_o_create(&nd->path, dentry, mode);
+ else
+@@ -3286,8 +3342,7 @@ struct dentry *vfs_tmpfile(struct dentry
+ child = d_alloc(dentry, &slash_name);
+ if (unlikely(!child))
+ goto out_err;
+- if (!IS_POSIXACL(dir))
+- mode &= ~current_umask();
++ mode = vfs_prepare_mode(dir, mode, mode, mode);
+ error = dir->i_op->tmpfile(dir, child, mode);
+ if (error)
+ goto out_err;
+@@ -3548,6 +3603,7 @@ int vfs_mknod(struct inode *dir, struct
+ if (!dir->i_op->mknod)
+ return -EPERM;
+
++ mode = vfs_prepare_mode(dir, mode, mode, mode);
+ error = devcgroup_inode_mknod(mode, dev);
+ if (error)
+ return error;
+@@ -3596,9 +3652,8 @@ retry:
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+- if (!IS_POSIXACL(path.dentry->d_inode))
+- mode &= ~current_umask();
+- error = security_path_mknod(&path, dentry, mode, dev);
++ error = security_path_mknod(&path, dentry,
++ mode_strip_umask(path.dentry->d_inode, mode), dev);
+ if (error)
+ goto out;
+ switch (mode & S_IFMT) {
+@@ -3646,7 +3701,7 @@ int vfs_mkdir(struct inode *dir, struct
+ if (!dir->i_op->mkdir)
+ return -EPERM;
+
+- mode &= (S_IRWXUGO|S_ISVTX);
++ mode = vfs_prepare_mode(dir, mode, S_IRWXUGO | S_ISVTX, 0);
+ error = security_inode_mkdir(dir, dentry, mode);
+ if (error)
+ return error;
+@@ -3673,9 +3728,8 @@ retry:
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+- if (!IS_POSIXACL(path.dentry->d_inode))
+- mode &= ~current_umask();
+- error = security_path_mkdir(&path, dentry, mode);
++ error = security_path_mkdir(&path, dentry,
++ mode_strip_umask(path.dentry->d_inode, mode));
+ if (!error)
+ error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
+ done_path_create(&path, dentry);
+--- a/fs/ocfs2/namei.c
++++ b/fs/ocfs2/namei.c
+@@ -198,6 +198,7 @@ static struct inode *ocfs2_get_init_inod
+ * callers. */
+ if (S_ISDIR(mode))
+ set_nlink(inode, 2);
++ mode = mode_strip_sgid(dir, mode);
+ inode_init_owner(inode, dir, mode);
+ status = dquot_initialize(inode);
+ if (status)
--- /dev/null
+From stable-owner@vger.kernel.org Sat Mar 18 11:16:15 2023
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Sat, 18 Mar 2023 12:15:25 +0200
+Subject: fs: move should_remove_suid()
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, Christian Brauner <brauner@kernel.org>, linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org
+Message-ID: <20230318101529.1361673-12-amir73il@gmail.com>
+
+From: Amir Goldstein <amir73il@gmail.com>
+
+commit e243e3f94c804ecca9a8241b5babe28f35258ef4 upstream.
+
+Move the helper from inode.c to attr.c. This keeps the core of the
+set{g,u}id stripping logic in one place when we add follow-up changes.
+It is the better place anyway, since should_remove_suid() returns
+ATTR_KILL_S{G,U}ID flags.
+
+Reviewed-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/attr.c | 29 +++++++++++++++++++++++++++++
+ fs/inode.c | 29 -----------------------------
+ 2 files changed, 29 insertions(+), 29 deletions(-)
+
+--- a/fs/attr.c
++++ b/fs/attr.c
+@@ -20,6 +20,35 @@
+
+ #include "internal.h"
+
++/*
++ * The logic we want is
++ *
++ * if suid or (sgid and xgrp)
++ * remove privs
++ */
++int should_remove_suid(struct dentry *dentry)
++{
++ umode_t mode = d_inode(dentry)->i_mode;
++ int kill = 0;
++
++ /* suid always must be killed */
++ if (unlikely(mode & S_ISUID))
++ kill = ATTR_KILL_SUID;
++
++ /*
++ * sgid without any exec bits is just a mandatory locking mark; leave
++ * it alone. If some exec bits are set, it's a real sgid; kill it.
++ */
++ if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
++ kill |= ATTR_KILL_SGID;
++
++ if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
++ return kill;
++
++ return 0;
++}
++EXPORT_SYMBOL(should_remove_suid);
++
+ static bool chown_ok(const struct inode *inode, kuid_t uid)
+ {
+ if (uid_eq(current_fsuid(), inode->i_uid) &&
+--- a/fs/inode.c
++++ b/fs/inode.c
+@@ -1855,35 +1855,6 @@ skip_update:
+ EXPORT_SYMBOL(touch_atime);
+
+ /*
+- * The logic we want is
+- *
+- * if suid or (sgid and xgrp)
+- * remove privs
+- */
+-int should_remove_suid(struct dentry *dentry)
+-{
+- umode_t mode = d_inode(dentry)->i_mode;
+- int kill = 0;
+-
+- /* suid always must be killed */
+- if (unlikely(mode & S_ISUID))
+- kill = ATTR_KILL_SUID;
+-
+- /*
+- * sgid without any exec bits is just a mandatory locking mark; leave
+- * it alone. If some exec bits are set, it's a real sgid; kill it.
+- */
+- if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
+- kill |= ATTR_KILL_SGID;
+-
+- if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
+- return kill;
+-
+- return 0;
+-}
+-EXPORT_SYMBOL(should_remove_suid);
+-
+-/*
+ * Return mask of changes for notify_change() that need to be done as a
+ * response to write or truncate. Return 0 if nothing has to be changed.
+ * Negative value on error (change should be denied).
--- /dev/null
+From stable-owner@vger.kernel.org Sat Mar 18 11:16:17 2023
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Sat, 18 Mar 2023 12:15:28 +0200
+Subject: fs: use consistent setgid checks in is_sxid()
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, Christian Brauner <brauner@kernel.org>, linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Miklos Szeredi <miklos@szeredi.hu>
+Message-ID: <20230318101529.1361673-15-amir73il@gmail.com>
+
+From: Christian Brauner <brauner@kernel.org>
+
+commit 8d84e39d76bd83474b26cb44f4b338635676e7e8 upstream.
+
+Now that we made the VFS setgid checking consistent an inode can't be
+marked security irrelevant even if the setgid bit is still set. Make
+this function consistent with all other helpers.
+
+Note that enforcing consistent setgid stripping checks for file
+modification and mode- and ownership changes will cause the setgid bit
+to be lost in more cases than used to be the case. If an unprivileged
+user wrote to a non-executable setgid file that they don't have
+privilege over the setgid bit will be dropped. This will lead to
+temporary failures in some xfstests until they have been updated.
+
+Reported-by: Miklos Szeredi <miklos@szeredi.hu>
+Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/fs.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/include/linux/fs.h
++++ b/include/linux/fs.h
+@@ -3408,7 +3408,7 @@ int __init get_filesystem_list(char *buf
+
+ static inline bool is_sxid(umode_t mode)
+ {
+- return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP));
++ return mode & (S_ISUID | S_ISGID);
+ }
+
+ static inline int check_sticky(struct inode *dir, struct inode *inode)
--- /dev/null
+From pchelkin@ispras.ru Mon Mar 20 14:30:30 2023
+From: Fedor Pchelkin <pchelkin@ispras.ru>
+Date: Thu, 16 Mar 2023 21:56:16 +0300
+Subject: io_uring: avoid null-ptr-deref in io_arm_poll_handler
+To: Jens Axboe <axboe@kernel.dk>, Greg Kroah-Hartman <gregkh@linuxfoundation.org>, stable@vger.kernel.org
+Cc: Fedor Pchelkin <pchelkin@ispras.ru>, linux-kernel@vger.kernel.org, Alexey Khoroshilov <khoroshilov@ispras.ru>, lvc-project@linuxtesting.org
+Message-ID: <20230316185616.271024-1-pchelkin@ispras.ru>
+
+From: Fedor Pchelkin <pchelkin@ispras.ru>
+
+No upstream commit exists for this commit.
+
+The issue was introduced with backporting upstream commit c16bda37594f
+("io_uring/poll: allow some retries for poll triggering spuriously").
+
+Memory allocation can possibly fail, causing an invalid pointer to be
+dereferenced just before comparing it to NULL value.
+
+Move the pointer check in proper place (upstream has the similar location
+of the check). In case the request has REQ_F_POLLED flag up, apoll can't
+be NULL so no need to check there.
+
+Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
+
+Signed-off-by: Fedor Pchelkin <pchelkin@ispras.ru>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/io_uring.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/io_uring/io_uring.c
++++ b/io_uring/io_uring.c
+@@ -5792,10 +5792,10 @@ static int io_arm_poll_handler(struct io
+ }
+ } else {
+ apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
++ if (unlikely(!apoll))
++ return IO_APOLL_ABORTED;
+ apoll->poll.retries = APOLL_MAX_RETRY;
+ }
+- if (unlikely(!apoll))
+- return IO_APOLL_ABORTED;
+ apoll->double_poll = NULL;
+ req->apoll = apoll;
+ req->flags |= REQ_F_POLLED;
--- /dev/null
+From 53b54ad074de1896f8b021615f65b27f557ce874 Mon Sep 17 00:00:00 2001
+From: Lukas Wunner <lukas@wunner.de>
+Date: Sun, 15 Jan 2023 09:20:33 +0100
+Subject: PCI/DPC: Await readiness of secondary bus after reset
+
+From: Lukas Wunner <lukas@wunner.de>
+
+commit 53b54ad074de1896f8b021615f65b27f557ce874 upstream.
+
+pci_bridge_wait_for_secondary_bus() is called after a Secondary Bus
+Reset, but not after a DPC-induced Hot Reset.
+
+As a result, the delays prescribed by PCIe r6.0 sec 6.6.1 are not
+observed and devices on the secondary bus may be accessed before
+they're ready.
+
+One affected device is Intel's Ponte Vecchio HPC GPU. It comprises a
+PCIe switch whose upstream port is not immediately ready after reset.
+Because its config space is restored too early, it remains in
+D0uninitialized, its subordinate devices remain inaccessible and DPC
+recovery fails with messages such as:
+
+ i915 0000:8c:00.0: can't change power state from D3cold to D0 (config space inaccessible)
+ intel_vsec 0000:8e:00.1: can't change power state from D3cold to D0 (config space inaccessible)
+ pcieport 0000:89:02.0: AER: device recovery failed
+
+Fix it.
+
+Link: https://lore.kernel.org/r/9f5ff00e1593d8d9a4b452398b98aa14d23fca11.1673769517.git.lukas@wunner.de
+Tested-by: Ravi Kishore Koppuravuri <ravi.kishore.koppuravuri@intel.com>
+Signed-off-by: Lukas Wunner <lukas@wunner.de>
+Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
+Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/pci/pci.c | 3 ---
+ drivers/pci/pci.h | 6 ++++++
+ drivers/pci/pcie/dpc.c | 4 ++--
+ 3 files changed, 8 insertions(+), 5 deletions(-)
+
+--- a/drivers/pci/pci.c
++++ b/drivers/pci/pci.c
+@@ -157,9 +157,6 @@ static int __init pcie_port_pm_setup(cha
+ }
+ __setup("pcie_port_pm=", pcie_port_pm_setup);
+
+-/* Time to wait after a reset for device to become responsive */
+-#define PCIE_RESET_READY_POLL_MS 60000
+-
+ /**
+ * pci_bus_max_busnr - returns maximum PCI bus number of given bus' children
+ * @bus: pointer to PCI bus structure to search
+--- a/drivers/pci/pci.h
++++ b/drivers/pci/pci.h
+@@ -53,6 +53,12 @@ int pci_bus_error_reset(struct pci_dev *
+ * Reset (PCIe r6.0 sec 5.8).
+ */
+ #define PCI_RESET_WAIT 1000 /* msec */
++/*
++ * Devices may extend the 1 sec period through Request Retry Status completions
++ * (PCIe r6.0 sec 2.3.1). The spec does not provide an upper limit, but 60 sec
++ * ought to be enough for any device to become responsive.
++ */
++#define PCIE_RESET_READY_POLL_MS 60000 /* msec */
+
+ /**
+ * struct pci_platform_pm_ops - Firmware PM callbacks
+--- a/drivers/pci/pcie/dpc.c
++++ b/drivers/pci/pcie/dpc.c
+@@ -170,8 +170,8 @@ pci_ers_result_t dpc_reset_link(struct p
+ pci_write_config_word(pdev, cap + PCI_EXP_DPC_STATUS,
+ PCI_EXP_DPC_STATUS_TRIGGER);
+
+- if (!pcie_wait_for_link(pdev, true)) {
+- pci_info(pdev, "Data Link Layer Link Active not set in 1000 msec\n");
++ if (pci_bridge_wait_for_secondary_bus(pdev, "DPC",
++ PCIE_RESET_READY_POLL_MS)) {
+ clear_bit(PCI_DPC_RECOVERED, &pdev->priv_flags);
+ ret = PCI_ERS_RESULT_DISCONNECT;
+ } else {
--- /dev/null
+From ac91e6980563ed53afadd925fa6585ffd2bc4a2c Mon Sep 17 00:00:00 2001
+From: Lukas Wunner <lukas@wunner.de>
+Date: Sun, 15 Jan 2023 09:20:32 +0100
+Subject: PCI: Unify delay handling for reset and resume
+
+From: Lukas Wunner <lukas@wunner.de>
+
+commit ac91e6980563ed53afadd925fa6585ffd2bc4a2c upstream.
+
+Sheng Bi reports that pci_bridge_secondary_bus_reset() may fail to wait
+for devices on the secondary bus to become accessible after reset:
+
+Although it does call pci_dev_wait(), it erroneously passes the bridge's
+pci_dev rather than that of a child. The bridge of course is always
+accessible while its secondary bus is reset, so pci_dev_wait() returns
+immediately.
+
+Sheng Bi proposes introducing a new pci_bridge_secondary_bus_wait()
+function which is called from pci_bridge_secondary_bus_reset():
+
+https://lore.kernel.org/linux-pci/20220523171517.32407-1-windy.bi.enflame@gmail.com/
+
+However we already have pci_bridge_wait_for_secondary_bus() which does
+almost exactly what we need. So far it's only called on resume from
+D3cold (which implies a Fundamental Reset per PCIe r6.0 sec 5.8).
+Re-using it for Secondary Bus Resets is a leaner and more rational
+approach than introducing a new function.
+
+That only requires a few minor tweaks:
+
+- Amend pci_bridge_wait_for_secondary_bus() to await accessibility of
+ the first device on the secondary bus by calling pci_dev_wait() after
+ performing the prescribed delays. pci_dev_wait() needs two parameters,
+ a reset reason and a timeout, which callers must now pass to
+ pci_bridge_wait_for_secondary_bus(). The timeout is 1 sec for resume
+ (PCIe r6.0 sec 6.6.1) and 60 sec for reset (commit 821cdad5c46c ("PCI:
+ Wait up to 60 seconds for device to become ready after FLR")).
+ Introduce a PCI_RESET_WAIT macro for the 1 sec timeout.
+
+- Amend pci_bridge_wait_for_secondary_bus() to return 0 on success or
+ -ENOTTY on error for consumption by pci_bridge_secondary_bus_reset().
+
+- Drop an unnecessary 1 sec delay from pci_reset_secondary_bus() which
+ is now performed by pci_bridge_wait_for_secondary_bus(). A static
+ delay this long is only necessary for Conventional PCI, so modern
+ PCIe systems benefit from shorter reset times as a side effect.
+
+Fixes: 6b2f1351af56 ("PCI: Wait for device to become ready after secondary bus reset")
+Link: https://lore.kernel.org/r/da77c92796b99ec568bd070cbe4725074a117038.1673769517.git.lukas@wunner.de
+Reported-by: Sheng Bi <windy.bi.enflame@gmail.com>
+Tested-by: Ravi Kishore Koppuravuri <ravi.kishore.koppuravuri@intel.com>
+Signed-off-by: Lukas Wunner <lukas@wunner.de>
+Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
+Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
+Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
+Cc: stable@vger.kernel.org # v4.17+
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/pci/pci-driver.c | 4 +--
+ drivers/pci/pci.c | 54 ++++++++++++++++++++---------------------------
+ drivers/pci/pci.h | 10 +++++++-
+ 3 files changed, 35 insertions(+), 33 deletions(-)
+
+--- a/drivers/pci/pci-driver.c
++++ b/drivers/pci/pci-driver.c
+@@ -911,7 +911,7 @@ static int pci_pm_resume_noirq(struct de
+ pcie_pme_root_status_cleanup(pci_dev);
+
+ if (!skip_bus_pm && prev_state == PCI_D3cold)
+- pci_bridge_wait_for_secondary_bus(pci_dev);
++ pci_bridge_wait_for_secondary_bus(pci_dev, "resume", PCI_RESET_WAIT);
+
+ if (pci_has_legacy_pm_support(pci_dev))
+ return 0;
+@@ -1298,7 +1298,7 @@ static int pci_pm_runtime_resume(struct
+ pci_pm_default_resume(pci_dev);
+
+ if (prev_state == PCI_D3cold)
+- pci_bridge_wait_for_secondary_bus(pci_dev);
++ pci_bridge_wait_for_secondary_bus(pci_dev, "resume", PCI_RESET_WAIT);
+
+ if (pm && pm->runtime_resume)
+ error = pm->runtime_resume(dev);
+--- a/drivers/pci/pci.c
++++ b/drivers/pci/pci.c
+@@ -1221,7 +1221,7 @@ static int pci_dev_wait(struct pci_dev *
+ return -ENOTTY;
+ }
+
+- if (delay > 1000)
++ if (delay > PCI_RESET_WAIT)
+ pci_info(dev, "not ready %dms after %s; waiting\n",
+ delay - 1, reset_type);
+
+@@ -1230,7 +1230,7 @@ static int pci_dev_wait(struct pci_dev *
+ pci_read_config_dword(dev, PCI_COMMAND, &id);
+ }
+
+- if (delay > 1000)
++ if (delay > PCI_RESET_WAIT)
+ pci_info(dev, "ready %dms after %s\n", delay - 1,
+ reset_type);
+
+@@ -4792,24 +4792,31 @@ static int pci_bus_max_d3cold_delay(cons
+ /**
+ * pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible
+ * @dev: PCI bridge
++ * @reset_type: reset type in human-readable form
++ * @timeout: maximum time to wait for devices on secondary bus (milliseconds)
+ *
+ * Handle necessary delays before access to the devices on the secondary
+- * side of the bridge are permitted after D3cold to D0 transition.
++ * side of the bridge are permitted after D3cold to D0 transition
++ * or Conventional Reset.
+ *
+ * For PCIe this means the delays in PCIe 5.0 section 6.6.1. For
+ * conventional PCI it means Tpvrh + Trhfa specified in PCI 3.0 section
+ * 4.3.2.
++ *
++ * Return 0 on success or -ENOTTY if the first device on the secondary bus
++ * failed to become accessible.
+ */
+-void pci_bridge_wait_for_secondary_bus(struct pci_dev *dev)
++int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type,
++ int timeout)
+ {
+ struct pci_dev *child;
+ int delay;
+
+ if (pci_dev_is_disconnected(dev))
+- return;
++ return 0;
+
+ if (!pci_is_bridge(dev))
+- return;
++ return 0;
+
+ down_read(&pci_bus_sem);
+
+@@ -4821,14 +4828,14 @@ void pci_bridge_wait_for_secondary_bus(s
+ */
+ if (!dev->subordinate || list_empty(&dev->subordinate->devices)) {
+ up_read(&pci_bus_sem);
+- return;
++ return 0;
+ }
+
+ /* Take d3cold_delay requirements into account */
+ delay = pci_bus_max_d3cold_delay(dev->subordinate);
+ if (!delay) {
+ up_read(&pci_bus_sem);
+- return;
++ return 0;
+ }
+
+ child = list_first_entry(&dev->subordinate->devices, struct pci_dev,
+@@ -4837,14 +4844,12 @@ void pci_bridge_wait_for_secondary_bus(s
+
+ /*
+ * Conventional PCI and PCI-X we need to wait Tpvrh + Trhfa before
+- * accessing the device after reset (that is 1000 ms + 100 ms). In
+- * practice this should not be needed because we don't do power
+- * management for them (see pci_bridge_d3_possible()).
++ * accessing the device after reset (that is 1000 ms + 100 ms).
+ */
+ if (!pci_is_pcie(dev)) {
+ pci_dbg(dev, "waiting %d ms for secondary bus\n", 1000 + delay);
+ msleep(1000 + delay);
+- return;
++ return 0;
+ }
+
+ /*
+@@ -4861,11 +4866,11 @@ void pci_bridge_wait_for_secondary_bus(s
+ * configuration requests if we only wait for 100 ms (see
+ * https://bugzilla.kernel.org/show_bug.cgi?id=203885).
+ *
+- * Therefore we wait for 100 ms and check for the device presence.
+- * If it is still not present give it an additional 100 ms.
++ * Therefore we wait for 100 ms and check for the device presence
++ * until the timeout expires.
+ */
+ if (!pcie_downstream_port(dev))
+- return;
++ return 0;
+
+ if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) {
+ pci_dbg(dev, "waiting %d ms for downstream link\n", delay);
+@@ -4876,14 +4881,11 @@ void pci_bridge_wait_for_secondary_bus(s
+ if (!pcie_wait_for_link_delay(dev, true, delay)) {
+ /* Did not train, no need to wait any further */
+ pci_info(dev, "Data Link Layer Link Active not set in 1000 msec\n");
+- return;
++ return -ENOTTY;
+ }
+ }
+
+- if (!pci_device_is_present(child)) {
+- pci_dbg(child, "waiting additional %d ms to become accessible\n", delay);
+- msleep(delay);
+- }
++ return pci_dev_wait(child, reset_type, timeout - delay);
+ }
+
+ void pci_reset_secondary_bus(struct pci_dev *dev)
+@@ -4902,15 +4904,6 @@ void pci_reset_secondary_bus(struct pci_
+
+ ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET;
+ pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl);
+-
+- /*
+- * Trhfa for conventional PCI is 2^25 clock cycles.
+- * Assuming a minimum 33MHz clock this results in a 1s
+- * delay before we can consider subordinate devices to
+- * be re-initialized. PCIe has some ways to shorten this,
+- * but we don't make use of them yet.
+- */
+- ssleep(1);
+ }
+
+ void __weak pcibios_reset_secondary_bus(struct pci_dev *dev)
+@@ -4929,7 +4922,8 @@ int pci_bridge_secondary_bus_reset(struc
+ {
+ pcibios_reset_secondary_bus(dev);
+
+- return pci_dev_wait(dev, "bus reset", PCIE_RESET_READY_POLL_MS);
++ return pci_bridge_wait_for_secondary_bus(dev, "bus reset",
++ PCIE_RESET_READY_POLL_MS);
+ }
+ EXPORT_SYMBOL_GPL(pci_bridge_secondary_bus_reset);
+
+--- a/drivers/pci/pci.h
++++ b/drivers/pci/pci.h
+@@ -47,6 +47,13 @@ int pci_bus_error_reset(struct pci_dev *
+ #define PCI_PM_D3HOT_WAIT 10 /* msec */
+ #define PCI_PM_D3COLD_WAIT 100 /* msec */
+
++/*
++ * Following exit from Conventional Reset, devices must be ready within 1 sec
++ * (PCIe r6.0 sec 6.6.1). A D3cold to D0 transition implies a Conventional
++ * Reset (PCIe r6.0 sec 5.8).
++ */
++#define PCI_RESET_WAIT 1000 /* msec */
++
+ /**
+ * struct pci_platform_pm_ops - Firmware PM callbacks
+ *
+@@ -108,7 +115,8 @@ void pci_allocate_cap_save_buffers(struc
+ void pci_free_cap_save_buffers(struct pci_dev *dev);
+ bool pci_bridge_d3_possible(struct pci_dev *dev);
+ void pci_bridge_d3_update(struct pci_dev *dev);
+-void pci_bridge_wait_for_secondary_bus(struct pci_dev *dev);
++int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type,
++ int timeout);
+
+ static inline void pci_wakeup_event(struct pci_dev *dev)
+ {
--- /dev/null
+From a52e5cdbe8016d4e3e6322fd93d71afddb9a5af9 Mon Sep 17 00:00:00 2001
+From: Sven Schnelle <svens@linux.ibm.com>
+Date: Tue, 7 Mar 2023 14:35:23 +0100
+Subject: s390/ipl: add missing intersection check to ipl_report handling
+
+From: Sven Schnelle <svens@linux.ibm.com>
+
+commit a52e5cdbe8016d4e3e6322fd93d71afddb9a5af9 upstream.
+
+The code which handles the ipl report is searching for a free location
+in memory where it could copy the component and certificate entries to.
+It checks for intersection between the sections required for the kernel
+and the component/certificate data area, but fails to check whether
+the data structures linking these data areas together intersect.
+
+This might cause the iplreport copy code to overwrite the iplreport
+itself. Fix this by adding two additional intersection checks.
+
+Cc: <stable@vger.kernel.org>
+Fixes: 9641b8cc733f ("s390/ipl: read IPL report at early boot")
+Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
+Reviewed-by: Vasily Gorbik <gor@linux.ibm.com>
+Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
+Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/s390/boot/ipl_report.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/arch/s390/boot/ipl_report.c
++++ b/arch/s390/boot/ipl_report.c
+@@ -57,11 +57,19 @@ repeat:
+ if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE &&
+ intersects(INITRD_START, INITRD_SIZE, safe_addr, size))
+ safe_addr = INITRD_START + INITRD_SIZE;
++ if (intersects(safe_addr, size, (unsigned long)comps, comps->len)) {
++ safe_addr = (unsigned long)comps + comps->len;
++ goto repeat;
++ }
+ for_each_rb_entry(comp, comps)
+ if (intersects(safe_addr, size, comp->addr, comp->len)) {
+ safe_addr = comp->addr + comp->len;
+ goto repeat;
+ }
++ if (intersects(safe_addr, size, (unsigned long)certs, certs->len)) {
++ safe_addr = (unsigned long)certs + certs->len;
++ goto repeat;
++ }
+ for_each_rb_entry(cert, certs)
+ if (intersects(safe_addr, size, cert->addr, cert->len)) {
+ safe_addr = cert->addr + cert->len;
cpuidle-psci-iterate-backwards-over-list-in-psci_pd_remove.patch
x86-mce-make-sure-logged-mces-are-processed-after-sysfs-update.patch
x86-mm-fix-use-of-uninitialized-buffer-in-sme_enable.patch
+drm-i915-don-t-use-stolen-memory-for-ring-buffers-with-llc.patch
+drm-i915-active-fix-misuse-of-non-idle-barriers-as-fence-trackers.patch
+io_uring-avoid-null-ptr-deref-in-io_arm_poll_handler.patch
+s390-ipl-add-missing-intersection-check-to-ipl_report-handling.patch
+pci-unify-delay-handling-for-reset-and-resume.patch
+pci-dpc-await-readiness-of-secondary-bus-after-reset.patch
+xfs-don-t-assert-fail-on-perag-references-on-teardown.patch
+xfs-purge-dquots-after-inode-walk-fails-during-quotacheck.patch
+xfs-don-t-leak-btree-cursor-when-insrec-fails-after-a-split.patch
+xfs-remove-xfs_prealloc_sync.patch
+xfs-fallocate-should-call-file_modified.patch
+xfs-set-prealloc-flag-in-xfs_alloc_file_space.patch
+xfs-use-setattr_copy-to-set-vfs-inode-attributes.patch
+fs-add-mode_strip_sgid-helper.patch
+fs-move-s_isgid-stripping-into-the-vfs_-helpers.patch
+attr-add-in_group_or_capable.patch
+fs-move-should_remove_suid.patch
+attr-add-setattr_should_drop_sgid.patch
+attr-use-consistent-sgid-stripping-checks.patch
+fs-use-consistent-setgid-checks-in-is_sxid.patch
+xfs-remove-xfs_setattr_time-declaration.patch
--- /dev/null
+From stable-owner@vger.kernel.org Sat Mar 18 11:15:45 2023
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Sat, 18 Mar 2023 12:15:15 +0200
+Subject: xfs: don't assert fail on perag references on teardown
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, Christian Brauner <brauner@kernel.org>, linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Dave Chinner <dchinner@redhat.com>, Christoph Hellwig <hch@lst.de>, Dave Chinner <david@fromorbit.com>
+Message-ID: <20230318101529.1361673-2-amir73il@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit 5b55cbc2d72632e874e50d2e36bce608e55aaaea upstream.
+
+[backport for 5.10.y, prior to perag refactoring in v5.14]
+
+Not fatal, the assert is there to catch developer attention. I'm
+seeing this occasionally during recoveryloop testing after a
+shutdown, and I don't want this to stop an overnight recoveryloop
+run as it is currently doing.
+
+Convert the ASSERT to an XFS_IS_CORRUPT() check so it will dump a
+corruption report into the log and cause a test failure that way,
+but it won't stop the machine dead.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_mount.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_mount.c
++++ b/fs/xfs/xfs_mount.c
+@@ -126,7 +126,6 @@ __xfs_free_perag(
+ {
+ struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
+
+- ASSERT(atomic_read(&pag->pag_ref) == 0);
+ kmem_free(pag);
+ }
+
+@@ -145,7 +144,7 @@ xfs_free_perag(
+ pag = radix_tree_delete(&mp->m_perag_tree, agno);
+ spin_unlock(&mp->m_perag_lock);
+ ASSERT(pag);
+- ASSERT(atomic_read(&pag->pag_ref) == 0);
++ XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0);
+ xfs_iunlink_destroy(pag);
+ xfs_buf_hash_destroy(pag);
+ call_rcu(&pag->rcu_head, __xfs_free_perag);
--- /dev/null
+From stable-owner@vger.kernel.org Sat Mar 18 11:15:49 2023
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Sat, 18 Mar 2023 12:15:17 +0200
+Subject: xfs: don't leak btree cursor when insrec fails after a split
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, Christian Brauner <brauner@kernel.org>, linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Christoph Hellwig <hch@lst.de>, Dave Chinner <dchinner@redhat.com>, Dave Chinner <david@fromorbit.com>
+Message-ID: <20230318101529.1361673-4-amir73il@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+commit a54f78def73d847cb060b18c4e4a3d1d26c9ca6d upstream.
+
+The recent patch to improve btree cycle checking caused a regression
+when I rebased the in-memory btree branch atop the 5.19 for-next branch,
+because in-memory short-pointer btrees do not have AG numbers. This
+produced the following complaint from kmemleak:
+
+unreferenced object 0xffff88803d47dde8 (size 264):
+ comm "xfs_io", pid 4889, jiffies 4294906764 (age 24.072s)
+ hex dump (first 32 bytes):
+ 90 4d 0b 0f 80 88 ff ff 00 a0 bd 05 80 88 ff ff .M..............
+ e0 44 3a a0 ff ff ff ff 00 df 08 06 80 88 ff ff .D:.............
+ backtrace:
+ [<ffffffffa0388059>] xfbtree_dup_cursor+0x49/0xc0 [xfs]
+ [<ffffffffa029887b>] xfs_btree_dup_cursor+0x3b/0x200 [xfs]
+ [<ffffffffa029af5d>] __xfs_btree_split+0x6ad/0x820 [xfs]
+ [<ffffffffa029b130>] xfs_btree_split+0x60/0x110 [xfs]
+ [<ffffffffa029f6da>] xfs_btree_make_block_unfull+0x19a/0x1f0 [xfs]
+ [<ffffffffa029fada>] xfs_btree_insrec+0x3aa/0x810 [xfs]
+ [<ffffffffa029fff3>] xfs_btree_insert+0xb3/0x240 [xfs]
+ [<ffffffffa02cb729>] xfs_rmap_insert+0x99/0x200 [xfs]
+ [<ffffffffa02cf142>] xfs_rmap_map_shared+0x192/0x5f0 [xfs]
+ [<ffffffffa02cf60b>] xfs_rmap_map_raw+0x6b/0x90 [xfs]
+ [<ffffffffa0384a85>] xrep_rmap_stash+0xd5/0x1d0 [xfs]
+ [<ffffffffa0384dc0>] xrep_rmap_visit_bmbt+0xa0/0xf0 [xfs]
+ [<ffffffffa0384fb6>] xrep_rmap_scan_iext+0x56/0xa0 [xfs]
+ [<ffffffffa03850d8>] xrep_rmap_scan_ifork+0xd8/0x160 [xfs]
+ [<ffffffffa0385195>] xrep_rmap_scan_inode+0x35/0x80 [xfs]
+ [<ffffffffa03852ee>] xrep_rmap_find_rmaps+0x10e/0x270 [xfs]
+
+I noticed that xfs_btree_insrec has a bunch of debug code that returns
+out of the function immediately, without freeing the "new" btree cursor
+that can be returned when _make_block_unfull calls xfs_btree_split. Fix
+the error return in this function to free the btree cursor.
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_btree.c | 8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_btree.c
++++ b/fs/xfs/libxfs/xfs_btree.c
+@@ -3190,7 +3190,7 @@ xfs_btree_insrec(
+ struct xfs_btree_block *block; /* btree block */
+ struct xfs_buf *bp; /* buffer for block */
+ union xfs_btree_ptr nptr; /* new block ptr */
+- struct xfs_btree_cur *ncur; /* new btree cursor */
++ struct xfs_btree_cur *ncur = NULL; /* new btree cursor */
+ union xfs_btree_key nkey; /* new block key */
+ union xfs_btree_key *lkey;
+ int optr; /* old key/record index */
+@@ -3270,7 +3270,7 @@ xfs_btree_insrec(
+ #ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+- return error;
++ goto error0;
+ #endif
+
+ /*
+@@ -3290,7 +3290,7 @@ xfs_btree_insrec(
+ for (i = numrecs - ptr; i >= 0; i--) {
+ error = xfs_btree_debug_check_ptr(cur, pp, i, level);
+ if (error)
+- return error;
++ goto error0;
+ }
+
+ xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
+@@ -3375,6 +3375,8 @@ xfs_btree_insrec(
+ return 0;
+
+ error0:
++ if (ncur)
++ xfs_btree_del_cursor(ncur, error);
+ return error;
+ }
+
--- /dev/null
+From stable-owner@vger.kernel.org Sat Mar 18 11:15:51 2023
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Sat, 18 Mar 2023 12:15:19 +0200
+Subject: xfs: fallocate() should call file_modified()
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, Christian Brauner <brauner@kernel.org>, linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Dave Chinner <dchinner@redhat.com>
+Message-ID: <20230318101529.1361673-6-amir73il@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit fbe7e520036583a783b13ff9744e35c2a329d9a4 upstream.
+
+In XFS, we always update the inode change and modification time when
+any fallocate() operation succeeds. Furthermore, as various
+fallocate modes can change the file contents (extending EOF,
+punching holes, zeroing things, shifting extents), we should drop
+file privileges like suid just like we do for a regular write().
+There's already a VFS helper that figures all this out for us, so
+use that.
+
+The net effect of this is that we no longer drop suid/sgid if the
+caller is root, but we also now drop file capabilities.
+
+We also move the xfs_update_prealloc_flags() function so that it now
+is only called by the scope that needs to set the prealloc flag.
+
+Based on a patch from Darrick Wong.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_file.c | 13 +++++++++----
+ 1 file changed, 9 insertions(+), 4 deletions(-)
+
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -895,6 +895,10 @@ xfs_file_fallocate(
+ goto out_unlock;
+ }
+
++ error = file_modified(file);
++ if (error)
++ goto out_unlock;
++
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ error = xfs_free_file_space(ip, offset, len);
+ if (error)
+@@ -996,11 +1000,12 @@ xfs_file_fallocate(
+ if (error)
+ goto out_unlock;
+ }
+- }
+
+- error = xfs_update_prealloc_flags(ip, flags);
+- if (error)
+- goto out_unlock;
++ error = xfs_update_prealloc_flags(ip, XFS_PREALLOC_SET);
++ if (error)
++ goto out_unlock;
++
++ }
+
+ /* Change file size if needed */
+ if (new_size) {
--- /dev/null
+From stable-owner@vger.kernel.org Sat Mar 18 11:15:45 2023
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Sat, 18 Mar 2023 12:15:16 +0200
+Subject: xfs: purge dquots after inode walk fails during quotacheck
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, Christian Brauner <brauner@kernel.org>, linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Christoph Hellwig <hch@lst.de>, Dave Chinner <dchinner@redhat.com>, Dave Chinner <david@fromorbit.com>
+Message-ID: <20230318101529.1361673-3-amir73il@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+commit 86d40f1e49e9a909d25c35ba01bea80dbcd758cb upstream.
+
+[add XFS_QMOPT_QUOTALL flag to xfs_qm_dqpurge_all() for 5.10.y backport]
+
+xfs/434 and xfs/436 have been reporting occasional memory leaks of
+xfs_dquot objects. These tests themselves were the messenger, not the
+culprit, since they unload the xfs module, which trips the slub
+debugging code while tearing down all the xfs slab caches:
+
+=============================================================================
+BUG xfs_dquot (Tainted: G W ): Objects remaining in xfs_dquot on __kmem_cache_shutdown()
+-----------------------------------------------------------------------------
+
+Slab 0xffffea000606de00 objects=30 used=5 fp=0xffff888181b78a78 flags=0x17ff80000010200(slab|head|node=0|zone=2|lastcpupid=0xfff)
+CPU: 0 PID: 3953166 Comm: modprobe Tainted: G W 5.18.0-rc6-djwx #rc6 d5824be9e46a2393677bda868f9b154d917ca6a7
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20171121_152543-x86-ol7-builder-01.us.oracle.com-4.el7.1 04/01/2014
+
+Since we don't generally rmmod the xfs module between fstests, this
+means that xfs/434 is really just the canary in the coal mine --
+something leaked a dquot, but we don't know who. After days of pounding
+on fstests with kmemleak enabled, I finally got it to spit this out:
+
+unreferenced object 0xffff8880465654c0 (size 536):
+ comm "u10:4", pid 88, jiffies 4294935810 (age 29.512s)
+ hex dump (first 32 bytes):
+ 60 4a 56 46 80 88 ff ff 58 ea e4 5c 80 88 ff ff `JVF....X..\....
+ 00 e0 52 49 80 88 ff ff 01 00 01 00 00 00 00 00 ..RI............
+ backtrace:
+ [<ffffffffa0740f6c>] xfs_dquot_alloc+0x2c/0x530 [xfs]
+ [<ffffffffa07443df>] xfs_qm_dqread+0x6f/0x330 [xfs]
+ [<ffffffffa07462a2>] xfs_qm_dqget+0x132/0x4e0 [xfs]
+ [<ffffffffa0756bb0>] xfs_qm_quotacheck_dqadjust+0xa0/0x3e0 [xfs]
+ [<ffffffffa075724d>] xfs_qm_dqusage_adjust+0x35d/0x4f0 [xfs]
+ [<ffffffffa06c9068>] xfs_iwalk_ag_recs+0x348/0x5d0 [xfs]
+ [<ffffffffa06c95d3>] xfs_iwalk_run_callbacks+0x273/0x540 [xfs]
+ [<ffffffffa06c9e8d>] xfs_iwalk_ag+0x5ed/0x890 [xfs]
+ [<ffffffffa06ca22f>] xfs_iwalk_ag_work+0xff/0x170 [xfs]
+ [<ffffffffa06d22c9>] xfs_pwork_work+0x79/0x130 [xfs]
+ [<ffffffff81170bb2>] process_one_work+0x672/0x1040
+ [<ffffffff81171b1b>] worker_thread+0x59b/0xec0
+ [<ffffffff8118711e>] kthread+0x29e/0x340
+ [<ffffffff810032bf>] ret_from_fork+0x1f/0x30
+
+Now we know that quotacheck is at fault, but even this report was
+canaryish -- it was triggered by xfs/494, which doesn't actually mount
+any filesystems. (kmemleak can be a little slow to notice leaks, even
+with fstests repeatedly whacking it to look for them.) Looking at the
+*previous* fstest, however, showed that the test run before xfs/494 was
+xfs/117. The tipoff to the problem is in this excerpt from dmesg:
+
+XFS (sda4): Quotacheck needed: Please wait.
+XFS (sda4): Metadata corruption detected at xfs_dinode_verify.part.0+0xdb/0x7b0 [xfs], inode 0x119 dinode
+XFS (sda4): Unmount and run xfs_repair
+XFS (sda4): First 128 bytes of corrupted metadata buffer:
+00000000: 49 4e 81 a4 03 02 00 00 00 00 00 00 00 00 00 00 IN..............
+00000010: 00 00 00 01 00 00 00 00 00 90 57 54 54 1a 4c 68 ..........WTT.Lh
+00000020: 81 f9 7d e1 6d ee 16 00 34 bd 7d e1 6d ee 16 00 ..}.m...4.}.m...
+00000030: 34 bd 7d e1 6d ee 16 00 00 00 00 00 00 00 00 00 4.}.m...........
+00000040: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
+00000050: 00 00 00 02 00 00 00 00 00 00 00 00 96 80 f3 ab ................
+00000060: ff ff ff ff da 57 7b 11 00 00 00 00 00 00 00 03 .....W{.........
+00000070: 00 00 00 01 00 00 00 10 00 00 00 00 00 00 00 08 ................
+XFS (sda4): Quotacheck: Unsuccessful (Error -117): Disabling quotas.
+
+The dinode verifier decided that the inode was corrupt, which causes
+iget to return with EFSCORRUPTED. Since this happened during
+quotacheck, it is obvious that the kernel aborted the inode walk on
+account of the corruption error and disabled quotas. Unfortunately, we
+neglect to purge the dquot cache before doing that, which is how the
+dquots leaked.
+
+The problems started 10 years ago in commit b84a3a, when the dquot lists
+were converted to a radix tree, but the error handling behavior was not
+correctly preserved -- in that commit, if the bulkstat failed and
+usrquota was enabled, the bulkstat failure code would be overwritten by
+the result of flushing all the dquots to disk. As long as that
+succeeds, we'd continue the quota mount as if everything were ok, but
+instead we're now operating with a corrupt inode and incorrect quota
+usage counts. I didn't notice this bug in 2019 when I wrote commit
+ebd126a, which changed quotacheck to skip the dqflush when the scan
+doesn't complete due to inode walk failures.
+
+Introduced-by: b84a3a96751f ("xfs: remove the per-filesystem list of dquots")
+Fixes: ebd126a651f8 ("xfs: convert quotacheck to use the new iwalk functions")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_qm.c | 9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_qm.c
++++ b/fs/xfs/xfs_qm.c
+@@ -1318,8 +1318,15 @@ xfs_qm_quotacheck(
+
+ error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true,
+ NULL);
+- if (error)
++ if (error) {
++ /*
++ * The inode walk may have partially populated the dquot
++ * caches. We must purge them before disabling quota and
++ * tearing down the quotainfo, or else the dquots will leak.
++ */
++ xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
+ goto error_return;
++ }
+
+ /*
+ * We've made all the changes that we need to make incore. Flush them
--- /dev/null
+From stable-owner@vger.kernel.org Sat Mar 18 11:15:50 2023
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Sat, 18 Mar 2023 12:15:18 +0200
+Subject: xfs: remove XFS_PREALLOC_SYNC
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, Christian Brauner <brauner@kernel.org>, linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Dave Chinner <dchinner@redhat.com>
+Message-ID: <20230318101529.1361673-5-amir73il@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit 472c6e46f589c26057596dcba160712a5b3e02c5 upstream.
+
+[partial backport for dependency -
+ xfs_ioc_space() still uses XFS_PREALLOC_SYNC]
+
+Callers can achieve the same thing by calling xfs_log_force_inode()
+after making their modifications. There is no need for
+xfs_update_prealloc_flags() to do this.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_file.c | 13 +++++++------
+ fs/xfs/xfs_pnfs.c | 6 ++++--
+ 2 files changed, 11 insertions(+), 8 deletions(-)
+
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -94,8 +94,6 @@ xfs_update_prealloc_flags(
+ ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+- if (flags & XFS_PREALLOC_SYNC)
+- xfs_trans_set_sync(tp);
+ return xfs_trans_commit(tp);
+ }
+
+@@ -1000,9 +998,6 @@ xfs_file_fallocate(
+ }
+ }
+
+- if (file->f_flags & O_DSYNC)
+- flags |= XFS_PREALLOC_SYNC;
+-
+ error = xfs_update_prealloc_flags(ip, flags);
+ if (error)
+ goto out_unlock;
+@@ -1024,8 +1019,14 @@ xfs_file_fallocate(
+ * leave shifted extents past EOF and hence losing access to
+ * the data that is contained within them.
+ */
+- if (do_file_insert)
++ if (do_file_insert) {
+ error = xfs_insert_file_space(ip, offset, len);
++ if (error)
++ goto out_unlock;
++ }
++
++ if (file->f_flags & O_DSYNC)
++ error = xfs_log_force_inode(ip);
+
+ out_unlock:
+ xfs_iunlock(ip, iolock);
+--- a/fs/xfs/xfs_pnfs.c
++++ b/fs/xfs/xfs_pnfs.c
+@@ -164,10 +164,12 @@ xfs_fs_map_blocks(
+ * that the blocks allocated and handed out to the client are
+ * guaranteed to be present even after a server crash.
+ */
+- error = xfs_update_prealloc_flags(ip,
+- XFS_PREALLOC_SET | XFS_PREALLOC_SYNC);
++ error = xfs_update_prealloc_flags(ip, XFS_PREALLOC_SET);
++ if (!error)
++ error = xfs_log_force_inode(ip);
+ if (error)
+ goto out_unlock;
++
+ } else {
+ xfs_iunlock(ip, lock_flags);
+ }
--- /dev/null
+From stable-owner@vger.kernel.org Sat Mar 18 11:16:32 2023
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Sat, 18 Mar 2023 12:15:29 +0200
+Subject: xfs: remove xfs_setattr_time() declaration
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, Christian Brauner <brauner@kernel.org>, linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Gaosheng Cui <cuigaosheng1@huawei.com>, Carlos Maiolino <cmaiolino@redhat.com>, Dave Chinner <david@fromorbit.com>
+Message-ID: <20230318101529.1361673-16-amir73il@gmail.com>
+
+From: Gaosheng Cui <cuigaosheng1@huawei.com>
+
+commit b0463b9dd7030a766133ad2f1571f97f204d7bdf upstream.
+
+xfs_setattr_time() has been removed since
+commit e014f37db1a2 ("xfs: use setattr_copy to set vfs inode
+attributes"), so remove it.
+
+Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com>
+Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_iops.h | 1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/fs/xfs/xfs_iops.h
++++ b/fs/xfs/xfs_iops.h
+@@ -18,7 +18,6 @@ extern ssize_t xfs_vn_listxattr(struct d
+ */
+ #define XFS_ATTR_NOACL 0x01 /* Don't call posix_acl_chmod */
+
+-extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr);
+ extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap,
+ int flags);
+ extern int xfs_vn_setattr_nonsize(struct dentry *dentry, struct iattr *vap);
--- /dev/null
+From stable-owner@vger.kernel.org Sat Mar 18 11:16:01 2023
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Sat, 18 Mar 2023 12:15:20 +0200
+Subject: xfs: set prealloc flag in xfs_alloc_file_space()
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, Christian Brauner <brauner@kernel.org>, linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Dave Chinner <dchinner@redhat.com>
+Message-ID: <20230318101529.1361673-7-amir73il@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit 0b02c8c0d75a738c98c35f02efb36217c170d78c upstream.
+
+[backport for 5.10.y]
+
+Now that we only call xfs_update_prealloc_flags() from
+xfs_file_fallocate() in the case where we need to set the
+preallocation flag, do this in xfs_alloc_file_space() where we
+already have the inode joined into a transaction and get
+rid of the call to xfs_update_prealloc_flags() from the fallocate
+code.
+
+This also means that we now correctly avoid setting the
+XFS_DIFLAG_PREALLOC flag when xfs_is_always_cow_inode() is true, as
+these inodes will never have preallocated extents.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_bmap_util.c | 9 +++------
+ fs/xfs/xfs_file.c | 8 --------
+ 2 files changed, 3 insertions(+), 14 deletions(-)
+
+--- a/fs/xfs/xfs_bmap_util.c
++++ b/fs/xfs/xfs_bmap_util.c
+@@ -800,9 +800,6 @@ xfs_alloc_file_space(
+ quota_flag = XFS_QMOPT_RES_REGBLKS;
+ }
+
+- /*
+- * Allocate and setup the transaction.
+- */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
+ resrtextents, 0, &tp);
+
+@@ -830,9 +827,9 @@ xfs_alloc_file_space(
+ if (error)
+ goto error0;
+
+- /*
+- * Complete the transaction
+- */
++ ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
++ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
++
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -850,7 +850,6 @@ xfs_file_fallocate(
+ struct inode *inode = file_inode(file);
+ struct xfs_inode *ip = XFS_I(inode);
+ long error;
+- enum xfs_prealloc_flags flags = 0;
+ uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ loff_t new_size = 0;
+ bool do_file_insert = false;
+@@ -948,8 +947,6 @@ xfs_file_fallocate(
+ }
+ do_file_insert = true;
+ } else {
+- flags |= XFS_PREALLOC_SET;
+-
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ offset + len > i_size_read(inode)) {
+ new_size = offset + len;
+@@ -1000,11 +997,6 @@ xfs_file_fallocate(
+ if (error)
+ goto out_unlock;
+ }
+-
+- error = xfs_update_prealloc_flags(ip, XFS_PREALLOC_SET);
+- if (error)
+- goto out_unlock;
+-
+ }
+
+ /* Change file size if needed */
--- /dev/null
+From stable-owner@vger.kernel.org Sat Mar 18 11:16:02 2023
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Sat, 18 Mar 2023 12:15:21 +0200
+Subject: xfs: use setattr_copy to set vfs inode attributes
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, Christian Brauner <brauner@kernel.org>, linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Dave Chinner <dchinner@redhat.com>, Christoph Hellwig <hch@lst.de>
+Message-ID: <20230318101529.1361673-8-amir73il@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+commit e014f37db1a2d109afa750042ac4d69cf3e3d88e upstream.
+
+[remove userns argument of setattr_copy() for 5.10.y backport]
+
+Filipe Manana pointed out that XFS' behavior w.r.t. setuid/setgid
+revocation isn't consistent with btrfs[1] or ext4. Those two
+filesystems use the VFS function setattr_copy to convey certain
+attributes from struct iattr into the VFS inode structure.
+
+Andrey Zhadchenko reported[2] that XFS uses the wrong user namespace to
+decide if it should clear setgid and setuid on a file attribute update.
+This is a second symptom of the problem that Filipe noticed.
+
+XFS, on the other hand, open-codes setattr_copy in xfs_setattr_mode,
+xfs_setattr_nonsize, and xfs_setattr_time. Regrettably, setattr_copy is
+/not/ a simple copy function; it contains additional logic to clear the
+setgid bit when setting the mode, and XFS' version no longer matches.
+
+The VFS implements its own setuid/setgid stripping logic, which
+establishes consistent behavior. It's a tad unfortunate that it's
+scattered across notify_change, should_remove_suid, and setattr_copy but
+XFS should really follow the Linux VFS. Adapt XFS to use the VFS
+functions and get rid of the old functions.
+
+[1] https://lore.kernel.org/fstests/CAL3q7H47iNQ=Wmk83WcGB-KBJVOEtR9+qGczzCeXJ9Y2KCV25Q@mail.gmail.com/
+[2] https://lore.kernel.org/linux-xfs/20220221182218.748084-1-andrey.zhadchenko@virtuozzo.com/
+
+Fixes: 7fa294c8991c ("userns: Allow chown and setgid preservation")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_iops.c | 56 ++----------------------------------------------------
+ fs/xfs/xfs_pnfs.c | 3 +-
+ 2 files changed, 5 insertions(+), 54 deletions(-)
+
+--- a/fs/xfs/xfs_iops.c
++++ b/fs/xfs/xfs_iops.c
+@@ -595,37 +595,6 @@ xfs_vn_getattr(
+ return 0;
+ }
+
+-static void
+-xfs_setattr_mode(
+- struct xfs_inode *ip,
+- struct iattr *iattr)
+-{
+- struct inode *inode = VFS_I(ip);
+- umode_t mode = iattr->ia_mode;
+-
+- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+-
+- inode->i_mode &= S_IFMT;
+- inode->i_mode |= mode & ~S_IFMT;
+-}
+-
+-void
+-xfs_setattr_time(
+- struct xfs_inode *ip,
+- struct iattr *iattr)
+-{
+- struct inode *inode = VFS_I(ip);
+-
+- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+-
+- if (iattr->ia_valid & ATTR_ATIME)
+- inode->i_atime = iattr->ia_atime;
+- if (iattr->ia_valid & ATTR_CTIME)
+- inode->i_ctime = iattr->ia_ctime;
+- if (iattr->ia_valid & ATTR_MTIME)
+- inode->i_mtime = iattr->ia_mtime;
+-}
+-
+ static int
+ xfs_vn_change_ok(
+ struct dentry *dentry,
+@@ -741,16 +710,6 @@ xfs_setattr_nonsize(
+ }
+
+ /*
+- * CAP_FSETID overrides the following restrictions:
+- *
+- * The set-user-ID and set-group-ID bits of a file will be
+- * cleared upon successful return from chown()
+- */
+- if ((inode->i_mode & (S_ISUID|S_ISGID)) &&
+- !capable(CAP_FSETID))
+- inode->i_mode &= ~(S_ISUID|S_ISGID);
+-
+- /*
+ * Change the ownerships and register quota modifications
+ * in the transaction.
+ */
+@@ -761,7 +720,6 @@ xfs_setattr_nonsize(
+ olddquot1 = xfs_qm_vop_chown(tp, ip,
+ &ip->i_udquot, udqp);
+ }
+- inode->i_uid = uid;
+ }
+ if (!gid_eq(igid, gid)) {
+ if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
+@@ -772,15 +730,10 @@ xfs_setattr_nonsize(
+ olddquot2 = xfs_qm_vop_chown(tp, ip,
+ &ip->i_gdquot, gdqp);
+ }
+- inode->i_gid = gid;
+ }
+ }
+
+- if (mask & ATTR_MODE)
+- xfs_setattr_mode(ip, iattr);
+- if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
+- xfs_setattr_time(ip, iattr);
+-
++ setattr_copy(inode, iattr);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ XFS_STATS_INC(mp, xs_ig_attrchg);
+@@ -1025,11 +978,8 @@ xfs_setattr_size(
+ xfs_inode_clear_eofblocks_tag(ip);
+ }
+
+- if (iattr->ia_valid & ATTR_MODE)
+- xfs_setattr_mode(ip, iattr);
+- if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
+- xfs_setattr_time(ip, iattr);
+-
++ ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID)));
++ setattr_copy(inode, iattr);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ XFS_STATS_INC(mp, xs_ig_attrchg);
+--- a/fs/xfs/xfs_pnfs.c
++++ b/fs/xfs/xfs_pnfs.c
+@@ -285,7 +285,8 @@ xfs_fs_commit_blocks(
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+- xfs_setattr_time(ip, iattr);
++ ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID)));
++ setattr_copy(inode, iattr);
+ if (update_isize) {
+ i_size_write(inode, iattr->ia_size);
+ ip->i_d.di_size = iattr->ia_size;