--- /dev/null
+From 15f519e9f883b316d86e2bb6b767a023aafd9d83 Mon Sep 17 00:00:00 2001
+From: Alex Markuze <amarkuze@redhat.com>
+Date: Tue, 12 Aug 2025 09:57:38 +0000
+Subject: ceph: fix race condition validating r_parent before applying state
+
+From: Alex Markuze <amarkuze@redhat.com>
+
+commit 15f519e9f883b316d86e2bb6b767a023aafd9d83 upstream.
+
+Add validation to ensure the cached parent directory inode matches the
+directory info in MDS replies. This prevents client-side race conditions
+where concurrent operations (e.g. rename) cause r_parent to become stale
+between request initiation and reply processing, which could lead to
+applying state changes to incorrect directory inodes.
+
+[ idryomov: folded a kerneldoc fixup and a follow-up fix from Alex to
+ move CEPH_CAP_PIN reference when r_parent is updated:
+
+ When the parent directory lock is not held, req->r_parent can become
+ stale and is updated to point to the correct inode. However, the
+ associated CEPH_CAP_PIN reference was not being adjusted. The
+ CEPH_CAP_PIN is a reference on an inode that is tracked for
+ accounting purposes. Moving this pin is important to keep the
+ accounting balanced. When the pin was not moved from the old parent
+ to the new one, it created two problems: The reference on the old,
+ stale parent was never released, causing a reference leak.
+ A reference for the new parent was never acquired, creating the risk
+ of a reference underflow later in ceph_mdsc_release_request(). This
+ patch corrects the logic by releasing the pin from the old parent and
+ acquiring it for the new parent when r_parent is switched. This
+ ensures reference accounting stays balanced. ]
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Alex Markuze <amarkuze@redhat.com>
+Reviewed-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
+Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ceph/debugfs.c | 14 +---
+ fs/ceph/dir.c | 17 ++---
+ fs/ceph/file.c | 24 ++-----
+ fs/ceph/inode.c | 7 --
+ fs/ceph/mds_client.c | 172 +++++++++++++++++++++++++++++++--------------------
+ fs/ceph/mds_client.h | 18 ++++-
+ 6 files changed, 145 insertions(+), 107 deletions(-)
+
+--- a/fs/ceph/debugfs.c
++++ b/fs/ceph/debugfs.c
+@@ -55,8 +55,6 @@ static int mdsc_show(struct seq_file *s,
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_request *req;
+ struct rb_node *rp;
+- int pathlen = 0;
+- u64 pathbase;
+ char *path;
+
+ mutex_lock(&mdsc->mutex);
+@@ -81,8 +79,8 @@ static int mdsc_show(struct seq_file *s,
+ if (req->r_inode) {
+ seq_printf(s, " #%llx", ceph_ino(req->r_inode));
+ } else if (req->r_dentry) {
+- path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
+- &pathbase, 0);
++ struct ceph_path_info path_info = {0}; /* free below is safe on error */
++ path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0);
+ if (IS_ERR(path))
+ path = NULL;
+ spin_lock(&req->r_dentry->d_lock);
+@@ -91,7 +89,7 @@ static int mdsc_show(struct seq_file *s,
+ req->r_dentry,
+ path ? path : "");
+ spin_unlock(&req->r_dentry->d_lock);
+- ceph_mdsc_free_path(path, pathlen);
++ ceph_mdsc_free_path_info(&path_info);
+ } else if (req->r_path1) {
+ seq_printf(s, " #%llx/%s", req->r_ino1.ino,
+ req->r_path1);
+@@ -100,8 +98,8 @@ static int mdsc_show(struct seq_file *s,
+ }
+
+ if (req->r_old_dentry) {
+- path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &pathlen,
+- &pathbase, 0);
++ struct ceph_path_info path_info = {0}; /* free below is safe on error */
++ path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &path_info, 0);
+ if (IS_ERR(path))
+ path = NULL;
+ spin_lock(&req->r_old_dentry->d_lock);
+@@ -111,7 +109,7 @@ static int mdsc_show(struct seq_file *s,
+ req->r_old_dentry,
+ path ? path : "");
+ spin_unlock(&req->r_old_dentry->d_lock);
+- ceph_mdsc_free_path(path, pathlen);
++ ceph_mdsc_free_path_info(&path_info);
+ } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) {
+ if (req->r_ino2.ino)
+ seq_printf(s, " #%llx/%s", req->r_ino2.ino,
+--- a/fs/ceph/dir.c
++++ b/fs/ceph/dir.c
+@@ -1263,10 +1263,8 @@ static void ceph_async_unlink_cb(struct
+
+ /* If op failed, mark everyone involved for errors */
+ if (result) {
+- int pathlen = 0;
+- u64 base = 0;
+- char *path = ceph_mdsc_build_path(mdsc, dentry, &pathlen,
+- &base, 0);
++ struct ceph_path_info path_info = {0};
++ char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
+
+ /* mark error on parent + clear complete */
+ mapping_set_error(req->r_parent->i_mapping, result);
+@@ -1280,8 +1278,8 @@ static void ceph_async_unlink_cb(struct
+ mapping_set_error(req->r_old_inode->i_mapping, result);
+
+ pr_warn_client(cl, "failure path=(%llx)%s result=%d!\n",
+- base, IS_ERR(path) ? "<<bad>>" : path, result);
+- ceph_mdsc_free_path(path, pathlen);
++ path_info.vino.ino, IS_ERR(path) ? "<<bad>>" : path, result);
++ ceph_mdsc_free_path_info(&path_info);
+ }
+ out:
+ iput(req->r_old_inode);
+@@ -1339,8 +1337,6 @@ static int ceph_unlink(struct inode *dir
+ int err = -EROFS;
+ int op;
+ char *path;
+- int pathlen;
+- u64 pathbase;
+
+ if (ceph_snap(dir) == CEPH_SNAPDIR) {
+ /* rmdir .snap/foo is RMSNAP */
+@@ -1359,14 +1355,15 @@ static int ceph_unlink(struct inode *dir
+ if (!dn) {
+ try_async = false;
+ } else {
+- path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0);
++ struct ceph_path_info path_info = {0}; /* free below is safe on error */
++ path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0);
+ if (IS_ERR(path)) {
+ try_async = false;
+ err = 0;
+ } else {
+ err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
+ }
+- ceph_mdsc_free_path(path, pathlen);
++ ceph_mdsc_free_path_info(&path_info);
+ dput(dn);
+
+ /* For none EACCES cases will let the MDS do the mds auth check */
+--- a/fs/ceph/file.c
++++ b/fs/ceph/file.c
+@@ -368,8 +368,6 @@ int ceph_open(struct inode *inode, struc
+ int flags, fmode, wanted;
+ struct dentry *dentry;
+ char *path;
+- int pathlen;
+- u64 pathbase;
+ bool do_sync = false;
+ int mask = MAY_READ;
+
+@@ -399,14 +397,15 @@ int ceph_open(struct inode *inode, struc
+ if (!dentry) {
+ do_sync = true;
+ } else {
+- path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0);
++ struct ceph_path_info path_info = {0}; /* free below is safe on error */
++ path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
+ if (IS_ERR(path)) {
+ do_sync = true;
+ err = 0;
+ } else {
+ err = ceph_mds_check_access(mdsc, path, mask);
+ }
+- ceph_mdsc_free_path(path, pathlen);
++ ceph_mdsc_free_path_info(&path_info);
+ dput(dentry);
+
+ /* For none EACCES cases will let the MDS do the mds auth check */
+@@ -614,15 +613,13 @@ static void ceph_async_create_cb(struct
+ mapping_set_error(req->r_parent->i_mapping, result);
+
+ if (result) {
+- int pathlen = 0;
+- u64 base = 0;
+- char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
+- &base, 0);
++ struct ceph_path_info path_info = {0};
++ char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0);
+
+ pr_warn_client(cl,
+ "async create failure path=(%llx)%s result=%d!\n",
+- base, IS_ERR(path) ? "<<bad>>" : path, result);
+- ceph_mdsc_free_path(path, pathlen);
++ path_info.vino.ino, IS_ERR(path) ? "<<bad>>" : path, result);
++ ceph_mdsc_free_path_info(&path_info);
+
+ ceph_dir_clear_complete(req->r_parent);
+ if (!d_unhashed(dentry))
+@@ -791,8 +788,6 @@ int ceph_atomic_open(struct inode *dir,
+ int mask;
+ int err;
+ char *path;
+- int pathlen;
+- u64 pathbase;
+
+ doutc(cl, "%p %llx.%llx dentry %p '%pd' %s flags %d mode 0%o\n",
+ dir, ceph_vinop(dir), dentry, dentry,
+@@ -814,7 +809,8 @@ int ceph_atomic_open(struct inode *dir,
+ if (!dn) {
+ try_async = false;
+ } else {
+- path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0);
++ struct ceph_path_info path_info = {0}; /* later free is safe on error */
++ path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0);
+ if (IS_ERR(path)) {
+ try_async = false;
+ err = 0;
+@@ -826,7 +822,7 @@ int ceph_atomic_open(struct inode *dir,
+ mask |= MAY_WRITE;
+ err = ceph_mds_check_access(mdsc, path, mask);
+ }
+- ceph_mdsc_free_path(path, pathlen);
++ ceph_mdsc_free_path_info(&path_info);
+ dput(dn);
+
+ /* For none EACCES cases will let the MDS do the mds auth check */
+--- a/fs/ceph/inode.c
++++ b/fs/ceph/inode.c
+@@ -2483,22 +2483,21 @@ int __ceph_setattr(struct mnt_idmap *idm
+ int truncate_retry = 20; /* The RMW will take around 50ms */
+ struct dentry *dentry;
+ char *path;
+- int pathlen;
+- u64 pathbase;
+ bool do_sync = false;
+
+ dentry = d_find_alias(inode);
+ if (!dentry) {
+ do_sync = true;
+ } else {
+- path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0);
++ struct ceph_path_info path_info = {0}; /* free below is safe on error */
++ path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
+ if (IS_ERR(path)) {
+ do_sync = true;
+ err = 0;
+ } else {
+ err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
+ }
+- ceph_mdsc_free_path(path, pathlen);
++ ceph_mdsc_free_path_info(&path_info);
+ dput(dentry);
+
+ /* For none EACCES cases will let the MDS do the mds auth check */
+--- a/fs/ceph/mds_client.c
++++ b/fs/ceph/mds_client.c
+@@ -2686,8 +2686,7 @@ static u8 *get_fscrypt_altname(const str
+ * ceph_mdsc_build_path - build a path string to a given dentry
+ * @mdsc: mds client
+ * @dentry: dentry to which path should be built
+- * @plen: returned length of string
+- * @pbase: returned base inode number
++ * @path_info: output path, length, base ino+snap, and freepath ownership flag
+ * @for_wire: is this path going to be sent to the MDS?
+ *
+ * Build a string that represents the path to the dentry. This is mostly called
+@@ -2705,7 +2704,7 @@ static u8 *get_fscrypt_altname(const str
+ * foo/.snap/bar -> foo//bar
+ */
+ char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
+- int *plen, u64 *pbase, int for_wire)
++ struct ceph_path_info *path_info, int for_wire)
+ {
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct dentry *cur;
+@@ -2815,16 +2814,28 @@ retry:
+ return ERR_PTR(-ENAMETOOLONG);
+ }
+
+- *pbase = base;
+- *plen = PATH_MAX - 1 - pos;
++ /* Initialize the output structure */
++ memset(path_info, 0, sizeof(*path_info));
++
++ path_info->vino.ino = base;
++ path_info->pathlen = PATH_MAX - 1 - pos;
++ path_info->path = path + pos;
++ path_info->freepath = true;
++
++ /* Set snap from dentry if available */
++ if (d_inode(dentry))
++ path_info->vino.snap = ceph_snap(d_inode(dentry));
++ else
++ path_info->vino.snap = CEPH_NOSNAP;
++
+ doutc(cl, "on %p %d built %llx '%.*s'\n", dentry, d_count(dentry),
+- base, *plen, path + pos);
++ base, PATH_MAX - 1 - pos, path + pos);
+ return path + pos;
+ }
+
+ static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
+- struct inode *dir, const char **ppath, int *ppathlen,
+- u64 *pino, bool *pfreepath, bool parent_locked)
++ struct inode *dir, struct ceph_path_info *path_info,
++ bool parent_locked)
+ {
+ char *path;
+
+@@ -2833,41 +2844,47 @@ static int build_dentry_path(struct ceph
+ dir = d_inode_rcu(dentry->d_parent);
+ if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP &&
+ !IS_ENCRYPTED(dir)) {
+- *pino = ceph_ino(dir);
++ path_info->vino.ino = ceph_ino(dir);
++ path_info->vino.snap = ceph_snap(dir);
+ rcu_read_unlock();
+- *ppath = dentry->d_name.name;
+- *ppathlen = dentry->d_name.len;
++ path_info->path = dentry->d_name.name;
++ path_info->pathlen = dentry->d_name.len;
++ path_info->freepath = false;
+ return 0;
+ }
+ rcu_read_unlock();
+- path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
++ path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+- *ppath = path;
+- *pfreepath = true;
++ /*
++ * ceph_mdsc_build_path already fills path_info, including snap handling.
++ */
+ return 0;
+ }
+
+-static int build_inode_path(struct inode *inode,
+- const char **ppath, int *ppathlen, u64 *pino,
+- bool *pfreepath)
++static int build_inode_path(struct inode *inode, struct ceph_path_info *path_info)
+ {
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct dentry *dentry;
+ char *path;
+
+ if (ceph_snap(inode) == CEPH_NOSNAP) {
+- *pino = ceph_ino(inode);
+- *ppathlen = 0;
++ path_info->vino.ino = ceph_ino(inode);
++ path_info->vino.snap = ceph_snap(inode);
++ path_info->pathlen = 0;
++ path_info->freepath = false;
+ return 0;
+ }
+ dentry = d_find_alias(inode);
+- path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
++ path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1);
+ dput(dentry);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+- *ppath = path;
+- *pfreepath = true;
++ /*
++ * ceph_mdsc_build_path already fills path_info, including snap from dentry.
++ * Override with inode's snap since that's what this function is for.
++ */
++ path_info->vino.snap = ceph_snap(inode);
+ return 0;
+ }
+
+@@ -2877,26 +2894,32 @@ static int build_inode_path(struct inode
+ */
+ static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode,
+ struct dentry *rdentry, struct inode *rdiri,
+- const char *rpath, u64 rino, const char **ppath,
+- int *pathlen, u64 *ino, bool *freepath,
++ const char *rpath, u64 rino,
++ struct ceph_path_info *path_info,
+ bool parent_locked)
+ {
+ struct ceph_client *cl = mdsc->fsc->client;
+ int r = 0;
+
++ /* Initialize the output structure */
++ memset(path_info, 0, sizeof(*path_info));
++
+ if (rinode) {
+- r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
++ r = build_inode_path(rinode, path_info);
+ doutc(cl, " inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
+ ceph_snap(rinode));
+ } else if (rdentry) {
+- r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, ino,
+- freepath, parent_locked);
+- doutc(cl, " dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath);
++ r = build_dentry_path(mdsc, rdentry, rdiri, path_info, parent_locked);
++ doutc(cl, " dentry %p %llx/%.*s\n", rdentry, path_info->vino.ino,
++ path_info->pathlen, path_info->path);
+ } else if (rpath || rino) {
+- *ino = rino;
+- *ppath = rpath;
+- *pathlen = rpath ? strlen(rpath) : 0;
+- doutc(cl, " path %.*s\n", *pathlen, rpath);
++ path_info->vino.ino = rino;
++ path_info->vino.snap = CEPH_NOSNAP;
++ path_info->path = rpath;
++ path_info->pathlen = rpath ? strlen(rpath) : 0;
++ path_info->freepath = false;
++
++ doutc(cl, " path %.*s\n", path_info->pathlen, rpath);
+ }
+
+ return r;
+@@ -2973,11 +2996,8 @@ static struct ceph_msg *create_request_m
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_msg *msg;
+ struct ceph_mds_request_head_legacy *lhead;
+- const char *path1 = NULL;
+- const char *path2 = NULL;
+- u64 ino1 = 0, ino2 = 0;
+- int pathlen1 = 0, pathlen2 = 0;
+- bool freepath1 = false, freepath2 = false;
++ struct ceph_path_info path_info1 = {0};
++ struct ceph_path_info path_info2 = {0};
+ struct dentry *old_dentry = NULL;
+ int len;
+ u16 releases;
+@@ -2987,25 +3007,49 @@ static struct ceph_msg *create_request_m
+ u16 request_head_version = mds_supported_head_version(session);
+ kuid_t caller_fsuid = req->r_cred->fsuid;
+ kgid_t caller_fsgid = req->r_cred->fsgid;
++ bool parent_locked = test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+
+ ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
+- req->r_parent, req->r_path1, req->r_ino1.ino,
+- &path1, &pathlen1, &ino1, &freepath1,
+- test_bit(CEPH_MDS_R_PARENT_LOCKED,
+- &req->r_req_flags));
++ req->r_parent, req->r_path1, req->r_ino1.ino,
++ &path_info1, parent_locked);
+ if (ret < 0) {
+ msg = ERR_PTR(ret);
+ goto out;
+ }
+
++ /*
++ * When the parent directory's i_rwsem is *not* locked, req->r_parent may
++ * have become stale (e.g. after a concurrent rename) between the time the
++ * dentry was looked up and now. If we detect that the stored r_parent
++ * does not match the inode number we just encoded for the request, switch
++ * to the correct inode so that the MDS receives a valid parent reference.
++ */
++ if (!parent_locked && req->r_parent && path_info1.vino.ino &&
++ ceph_ino(req->r_parent) != path_info1.vino.ino) {
++ struct inode *old_parent = req->r_parent;
++ struct inode *correct_dir = ceph_get_inode(mdsc->fsc->sb, path_info1.vino, NULL);
++ if (!IS_ERR(correct_dir)) {
++ WARN_ONCE(1, "ceph: r_parent mismatch (had %llx wanted %llx) - updating\n",
++ ceph_ino(old_parent), path_info1.vino.ino);
++ /*
++ * Transfer CEPH_CAP_PIN from the old parent to the new one.
++ * The pin was taken earlier in ceph_mdsc_submit_request().
++ */
++ ceph_put_cap_refs(ceph_inode(old_parent), CEPH_CAP_PIN);
++ iput(old_parent);
++ req->r_parent = correct_dir;
++ ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
++ }
++ }
++
+ /* If r_old_dentry is set, then assume that its parent is locked */
+ if (req->r_old_dentry &&
+ !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED))
+ old_dentry = req->r_old_dentry;
+ ret = set_request_path_attr(mdsc, NULL, old_dentry,
+- req->r_old_dentry_dir,
+- req->r_path2, req->r_ino2.ino,
+- &path2, &pathlen2, &ino2, &freepath2, true);
++ req->r_old_dentry_dir,
++ req->r_path2, req->r_ino2.ino,
++ &path_info2, true);
+ if (ret < 0) {
+ msg = ERR_PTR(ret);
+ goto out_free1;
+@@ -3036,7 +3080,7 @@ static struct ceph_msg *create_request_m
+
+ /* filepaths */
+ len += 2 * (1 + sizeof(u32) + sizeof(u64));
+- len += pathlen1 + pathlen2;
++ len += path_info1.pathlen + path_info2.pathlen;
+
+ /* cap releases */
+ len += sizeof(struct ceph_mds_request_release) *
+@@ -3044,9 +3088,9 @@ static struct ceph_msg *create_request_m
+ !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
+
+ if (req->r_dentry_drop)
+- len += pathlen1;
++ len += path_info1.pathlen;
+ if (req->r_old_dentry_drop)
+- len += pathlen2;
++ len += path_info2.pathlen;
+
+ /* MClientRequest tail */
+
+@@ -3159,8 +3203,8 @@ static struct ceph_msg *create_request_m
+ lhead->ino = cpu_to_le64(req->r_deleg_ino);
+ lhead->args = req->r_args;
+
+- ceph_encode_filepath(&p, end, ino1, path1);
+- ceph_encode_filepath(&p, end, ino2, path2);
++ ceph_encode_filepath(&p, end, path_info1.vino.ino, path_info1.path);
++ ceph_encode_filepath(&p, end, path_info2.vino.ino, path_info2.path);
+
+ /* make note of release offset, in case we need to replay */
+ req->r_request_release_offset = p - msg->front.iov_base;
+@@ -3223,11 +3267,9 @@ static struct ceph_msg *create_request_m
+ msg->hdr.data_off = cpu_to_le16(0);
+
+ out_free2:
+- if (freepath2)
+- ceph_mdsc_free_path((char *)path2, pathlen2);
++ ceph_mdsc_free_path_info(&path_info2);
+ out_free1:
+- if (freepath1)
+- ceph_mdsc_free_path((char *)path1, pathlen1);
++ ceph_mdsc_free_path_info(&path_info1);
+ out:
+ return msg;
+ out_err:
+@@ -4584,24 +4626,20 @@ static int reconnect_caps_cb(struct inod
+ struct ceph_pagelist *pagelist = recon_state->pagelist;
+ struct dentry *dentry;
+ struct ceph_cap *cap;
+- char *path;
+- int pathlen = 0, err;
+- u64 pathbase;
++ struct ceph_path_info path_info = {0};
++ int err;
+ u64 snap_follows;
+
+ dentry = d_find_primary(inode);
+ if (dentry) {
+ /* set pathbase to parent dir when msg_version >= 2 */
+- path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase,
++ char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info,
+ recon_state->msg_version >= 2);
+ dput(dentry);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ goto out_err;
+ }
+- } else {
+- path = NULL;
+- pathbase = 0;
+ }
+
+ spin_lock(&ci->i_ceph_lock);
+@@ -4634,7 +4672,7 @@ static int reconnect_caps_cb(struct inod
+ rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
+ rec.v2.issued = cpu_to_le32(cap->issued);
+ rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
+- rec.v2.pathbase = cpu_to_le64(pathbase);
++ rec.v2.pathbase = cpu_to_le64(path_info.vino.ino);
+ rec.v2.flock_len = (__force __le32)
+ ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
+ } else {
+@@ -4649,7 +4687,7 @@ static int reconnect_caps_cb(struct inod
+ ts = inode_get_atime(inode);
+ ceph_encode_timespec64(&rec.v1.atime, &ts);
+ rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
+- rec.v1.pathbase = cpu_to_le64(pathbase);
++ rec.v1.pathbase = cpu_to_le64(path_info.vino.ino);
+ }
+
+ if (list_empty(&ci->i_cap_snaps)) {
+@@ -4711,7 +4749,7 @@ encode_again:
+ sizeof(struct ceph_filelock);
+ rec.v2.flock_len = cpu_to_le32(struct_len);
+
+- struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
++ struct_len += sizeof(u32) + path_info.pathlen + sizeof(rec.v2);
+
+ if (struct_v >= 2)
+ struct_len += sizeof(u64); /* snap_follows */
+@@ -4735,7 +4773,7 @@ encode_again:
+ ceph_pagelist_encode_8(pagelist, 1);
+ ceph_pagelist_encode_32(pagelist, struct_len);
+ }
+- ceph_pagelist_encode_string(pagelist, path, pathlen);
++ ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen);
+ ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
+ ceph_locks_to_pagelist(flocks, pagelist,
+ num_fcntl_locks, num_flock_locks);
+@@ -4746,17 +4784,17 @@ out_freeflocks:
+ } else {
+ err = ceph_pagelist_reserve(pagelist,
+ sizeof(u64) + sizeof(u32) +
+- pathlen + sizeof(rec.v1));
++ path_info.pathlen + sizeof(rec.v1));
+ if (err)
+ goto out_err;
+
+ ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
+- ceph_pagelist_encode_string(pagelist, path, pathlen);
++ ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen);
+ ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
+ }
+
+ out_err:
+- ceph_mdsc_free_path(path, pathlen);
++ ceph_mdsc_free_path_info(&path_info);
+ if (!err)
+ recon_state->nr_caps++;
+ return err;
+--- a/fs/ceph/mds_client.h
++++ b/fs/ceph/mds_client.h
+@@ -612,14 +612,24 @@ extern int ceph_mds_check_access(struct
+
+ extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
+
+-static inline void ceph_mdsc_free_path(char *path, int len)
++/*
++ * Structure to group path-related output parameters for build_*_path functions
++ */
++struct ceph_path_info {
++ const char *path;
++ int pathlen;
++ struct ceph_vino vino;
++ bool freepath;
++};
++
++static inline void ceph_mdsc_free_path_info(const struct ceph_path_info *path_info)
+ {
+- if (!IS_ERR_OR_NULL(path))
+- __putname(path - (PATH_MAX - 1 - len));
++ if (path_info && path_info->freepath && !IS_ERR_OR_NULL(path_info->path))
++ __putname((char *)path_info->path - (PATH_MAX - 1 - path_info->pathlen));
+ }
+
+ extern char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc,
+- struct dentry *dentry, int *plen, u64 *base,
++ struct dentry *dentry, struct ceph_path_info *path_info,
+ int for_wire);
+
+ extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
--- /dev/null
+From bec324f33d1ed346394b2eee25bf6dbf3511f727 Mon Sep 17 00:00:00 2001
+From: Alex Markuze <amarkuze@redhat.com>
+Date: Tue, 12 Aug 2025 09:57:39 +0000
+Subject: ceph: fix race condition where r_parent becomes stale before sending message
+
+From: Alex Markuze <amarkuze@redhat.com>
+
+commit bec324f33d1ed346394b2eee25bf6dbf3511f727 upstream.
+
+When the parent directory's i_rwsem is not locked, req->r_parent may become
+stale due to concurrent operations (e.g. rename) between dentry lookup and
+message creation. Validate that r_parent matches the encoded parent inode
+and update to the correct inode if a mismatch is detected.
+
+[ idryomov: folded a follow-up fix from Alex to drop extra reference
+ from ceph_get_reply_dir() in ceph_fill_trace():
+
+ ceph_get_reply_dir() may return a different, referenced inode when
+ r_parent is stale and the parent directory lock is not held.
+ ceph_fill_trace() used that inode but failed to drop the reference
+ when it differed from req->r_parent, leaking an inode reference.
+
+ Keep the directory inode in a local variable and iput() it at
+ function end if it does not match req->r_parent. ]
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Alex Markuze <amarkuze@redhat.com>
+Reviewed-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
+Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ceph/inode.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++---------
+ 1 file changed, 69 insertions(+), 12 deletions(-)
+
+--- a/fs/ceph/inode.c
++++ b/fs/ceph/inode.c
+@@ -55,6 +55,52 @@ static int ceph_set_ino_cb(struct inode
+ return 0;
+ }
+
++/*
++ * Check if the parent inode matches the vino from directory reply info
++ */
++static inline bool ceph_vino_matches_parent(struct inode *parent,
++ struct ceph_vino vino)
++{
++ return ceph_ino(parent) == vino.ino && ceph_snap(parent) == vino.snap;
++}
++
++/*
++ * Validate that the directory inode referenced by @req->r_parent matches the
++ * inode number and snapshot id contained in the reply's directory record. If
++ * they do not match – which can theoretically happen if the parent dentry was
++ * moved between the time the request was issued and the reply arrived – fall
++ * back to looking up the correct inode in the inode cache.
++ *
++ * No extra reference is taken when @parent itself is returned. Callers that
++ * receive a different inode than the original @parent hold a new reference
++ * and are responsible for dropping it once the reply has been processed.
++ */
++static struct inode *ceph_get_reply_dir(struct super_block *sb,
++ struct inode *parent,
++ struct ceph_mds_reply_info_parsed *rinfo)
++{
++ struct ceph_vino vino;
++
++ if (unlikely(!rinfo->diri.in))
++ return parent; /* nothing to compare against */
++
++ /* If we didn't have a cached parent inode to begin with, just bail out. */
++ if (!parent)
++ return NULL;
++
++ vino.ino = le64_to_cpu(rinfo->diri.in->ino);
++ vino.snap = le64_to_cpu(rinfo->diri.in->snapid);
++
++ if (likely(ceph_vino_matches_parent(parent, vino)))
++ return parent; /* matches – use the original reference */
++
++ /* Mismatch – this should be rare. Emit a WARN and obtain the correct inode. */
++ WARN_ONCE(1, "ceph: reply dir mismatch (parent valid %llx.%llx reply %llx.%llx)\n",
++ ceph_ino(parent), ceph_snap(parent), vino.ino, vino.snap);
++
++ return ceph_get_inode(sb, vino, NULL);
++}
++
+ /**
+ * ceph_new_inode - allocate a new inode in advance of an expected create
+ * @dir: parent directory for new inode
+@@ -1523,6 +1569,7 @@ int ceph_fill_trace(struct super_block *
+ struct ceph_vino tvino, dvino;
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
+ struct ceph_client *cl = fsc->client;
++ struct inode *parent_dir = NULL;
+ int err = 0;
+
+ doutc(cl, "%p is_dentry %d is_target %d\n", req,
+@@ -1536,10 +1583,17 @@ int ceph_fill_trace(struct super_block *
+ }
+
+ if (rinfo->head->is_dentry) {
+- struct inode *dir = req->r_parent;
+-
+- if (dir) {
+- err = ceph_fill_inode(dir, NULL, &rinfo->diri,
++ /*
++ * r_parent may be stale, in cases when R_PARENT_LOCKED is not set,
++ * so we need to get the correct inode
++ */
++ parent_dir = ceph_get_reply_dir(sb, req->r_parent, rinfo);
++ if (unlikely(IS_ERR(parent_dir))) {
++ err = PTR_ERR(parent_dir);
++ goto done;
++ }
++ if (parent_dir) {
++ err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri,
+ rinfo->dirfrag, session, -1,
+ &req->r_caps_reservation);
+ if (err < 0)
+@@ -1548,14 +1602,14 @@ int ceph_fill_trace(struct super_block *
+ WARN_ON_ONCE(1);
+ }
+
+- if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
++ if (parent_dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
+ test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
+ !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
+ bool is_nokey = false;
+ struct qstr dname;
+ struct dentry *dn, *parent;
+ struct fscrypt_str oname = FSTR_INIT(NULL, 0);
+- struct ceph_fname fname = { .dir = dir,
++ struct ceph_fname fname = { .dir = parent_dir,
+ .name = rinfo->dname,
+ .ctext = rinfo->altname,
+ .name_len = rinfo->dname_len,
+@@ -1564,10 +1618,10 @@ int ceph_fill_trace(struct super_block *
+ BUG_ON(!rinfo->head->is_target);
+ BUG_ON(req->r_dentry);
+
+- parent = d_find_any_alias(dir);
++ parent = d_find_any_alias(parent_dir);
+ BUG_ON(!parent);
+
+- err = ceph_fname_alloc_buffer(dir, &oname);
++ err = ceph_fname_alloc_buffer(parent_dir, &oname);
+ if (err < 0) {
+ dput(parent);
+ goto done;
+@@ -1576,7 +1630,7 @@ int ceph_fill_trace(struct super_block *
+ err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey);
+ if (err < 0) {
+ dput(parent);
+- ceph_fname_free_buffer(dir, &oname);
++ ceph_fname_free_buffer(parent_dir, &oname);
+ goto done;
+ }
+ dname.name = oname.name;
+@@ -1595,7 +1649,7 @@ retry_lookup:
+ dname.len, dname.name, dn);
+ if (!dn) {
+ dput(parent);
+- ceph_fname_free_buffer(dir, &oname);
++ ceph_fname_free_buffer(parent_dir, &oname);
+ err = -ENOMEM;
+ goto done;
+ }
+@@ -1610,12 +1664,12 @@ retry_lookup:
+ ceph_snap(d_inode(dn)) != tvino.snap)) {
+ doutc(cl, " dn %p points to wrong inode %p\n",
+ dn, d_inode(dn));
+- ceph_dir_clear_ordered(dir);
++ ceph_dir_clear_ordered(parent_dir);
+ d_delete(dn);
+ dput(dn);
+ goto retry_lookup;
+ }
+- ceph_fname_free_buffer(dir, &oname);
++ ceph_fname_free_buffer(parent_dir, &oname);
+
+ req->r_dentry = dn;
+ dput(parent);
+@@ -1794,6 +1848,9 @@ retry_lookup:
+ &dvino, ptvino);
+ }
+ done:
++ /* Drop extra ref from ceph_get_reply_dir() if it returned a new inode */
++ if (unlikely(!IS_ERR_OR_NULL(parent_dir) && parent_dir != req->r_parent))
++ iput(parent_dir);
+ doutc(cl, "done err=%d\n", err);
+ return err;
+ }
--- /dev/null
+From 3c9ba2777d6c86025e1ba4186dc5cd930e40ec5f Mon Sep 17 00:00:00 2001
+From: Chen Ridong <chenridong@huawei.com>
+Date: Fri, 22 Aug 2025 07:07:14 +0000
+Subject: kernfs: Fix UAF in polling when open file is released
+
+From: Chen Ridong <chenridong@huawei.com>
+
+commit 3c9ba2777d6c86025e1ba4186dc5cd930e40ec5f upstream.
+
+A use-after-free (UAF) vulnerability was identified in the PSI (Pressure
+Stall Information) monitoring mechanism:
+
+BUG: KASAN: slab-use-after-free in psi_trigger_poll+0x3c/0x140
+Read of size 8 at addr ffff3de3d50bd308 by task systemd/1
+
+psi_trigger_poll+0x3c/0x140
+cgroup_pressure_poll+0x70/0xa0
+cgroup_file_poll+0x8c/0x100
+kernfs_fop_poll+0x11c/0x1c0
+ep_item_poll.isra.0+0x188/0x2c0
+
+Allocated by task 1:
+cgroup_file_open+0x88/0x388
+kernfs_fop_open+0x73c/0xaf0
+do_dentry_open+0x5fc/0x1200
+vfs_open+0xa0/0x3f0
+do_open+0x7e8/0xd08
+path_openat+0x2fc/0x6b0
+do_filp_open+0x174/0x368
+
+Freed by task 8462:
+cgroup_file_release+0x130/0x1f8
+kernfs_drain_open_files+0x17c/0x440
+kernfs_drain+0x2dc/0x360
+kernfs_show+0x1b8/0x288
+cgroup_file_show+0x150/0x268
+cgroup_pressure_write+0x1dc/0x340
+cgroup_file_write+0x274/0x548
+
+Reproduction Steps:
+1. Open test/cpu.pressure and establish epoll monitoring
+2. Disable monitoring: echo 0 > test/cgroup.pressure
+3. Re-enable monitoring: echo 1 > test/cgroup.pressure
+
+The race condition occurs because:
+1. When cgroup.pressure is disabled (echo 0 > cgroup.pressure), it:
+ - Releases PSI triggers via cgroup_file_release()
+ - Frees of->priv through kernfs_drain_open_files()
+2. While epoll still holds reference to the file and continues polling
+3. Re-enabling (echo 1 > cgroup.pressure) accesses freed of->priv
+
+epolling disable/enable cgroup.pressure
+fd=open(cpu.pressure)
+while(1)
+...
+epoll_wait
+kernfs_fop_poll
+kernfs_get_active = true echo 0 > cgroup.pressure
+... cgroup_file_show
+ kernfs_show
+ // inactive kn
+ kernfs_drain_open_files
+ cft->release(of);
+ kfree(ctx);
+ ...
+kernfs_get_active = false
+ echo 1 > cgroup.pressure
+ kernfs_show
+ kernfs_activate_one(kn);
+kernfs_fop_poll
+kernfs_get_active = true
+cgroup_file_poll
+psi_trigger_poll
+// UAF
+...
+end: close(fd)
+
+To address this issue, introduce kernfs_get_active_of() for kernfs open
+files to obtain active references. This function will fail if the open file
+has been released. Replace kernfs_get_active() with kernfs_get_active_of()
+to prevent further operations on released file descriptors.
+
+Fixes: 34f26a15611a ("sched/psi: Per-cgroup PSI accounting disable/re-enable interface")
+Cc: stable <stable@kernel.org>
+Reported-by: Zhang Zhaotian <zhangzhaotian@huawei.com>
+Signed-off-by: Chen Ridong <chenridong@huawei.com>
+Acked-by: Tejun Heo <tj@kernel.org>
+Link: https://lore.kernel.org/r/20250822070715.1565236-2-chenridong@huaweicloud.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/kernfs/file.c | 58 ++++++++++++++++++++++++++++++++++++-------------------
+ 1 file changed, 38 insertions(+), 20 deletions(-)
+
+--- a/fs/kernfs/file.c
++++ b/fs/kernfs/file.c
+@@ -70,6 +70,24 @@ static struct kernfs_open_node *of_on(st
+ !list_empty(&of->list));
+ }
+
++/* Get active reference to kernfs node for an open file */
++static struct kernfs_open_file *kernfs_get_active_of(struct kernfs_open_file *of)
++{
++ /* Skip if file was already released */
++ if (unlikely(of->released))
++ return NULL;
++
++ if (!kernfs_get_active(of->kn))
++ return NULL;
++
++ return of;
++}
++
++static void kernfs_put_active_of(struct kernfs_open_file *of)
++{
++ return kernfs_put_active(of->kn);
++}
++
+ /**
+ * kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn
+ *
+@@ -139,7 +157,7 @@ static void kernfs_seq_stop_active(struc
+
+ if (ops->seq_stop)
+ ops->seq_stop(sf, v);
+- kernfs_put_active(of->kn);
++ kernfs_put_active_of(of);
+ }
+
+ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
+@@ -152,7 +170,7 @@ static void *kernfs_seq_start(struct seq
+ * the ops aren't called concurrently for the same open file.
+ */
+ mutex_lock(&of->mutex);
+- if (!kernfs_get_active(of->kn))
++ if (!kernfs_get_active_of(of))
+ return ERR_PTR(-ENODEV);
+
+ ops = kernfs_ops(of->kn);
+@@ -238,7 +256,7 @@ static ssize_t kernfs_file_read_iter(str
+ * the ops aren't called concurrently for the same open file.
+ */
+ mutex_lock(&of->mutex);
+- if (!kernfs_get_active(of->kn)) {
++ if (!kernfs_get_active_of(of)) {
+ len = -ENODEV;
+ mutex_unlock(&of->mutex);
+ goto out_free;
+@@ -252,7 +270,7 @@ static ssize_t kernfs_file_read_iter(str
+ else
+ len = -EINVAL;
+
+- kernfs_put_active(of->kn);
++ kernfs_put_active_of(of);
+ mutex_unlock(&of->mutex);
+
+ if (len < 0)
+@@ -323,7 +341,7 @@ static ssize_t kernfs_fop_write_iter(str
+ * the ops aren't called concurrently for the same open file.
+ */
+ mutex_lock(&of->mutex);
+- if (!kernfs_get_active(of->kn)) {
++ if (!kernfs_get_active_of(of)) {
+ mutex_unlock(&of->mutex);
+ len = -ENODEV;
+ goto out_free;
+@@ -335,7 +353,7 @@ static ssize_t kernfs_fop_write_iter(str
+ else
+ len = -EINVAL;
+
+- kernfs_put_active(of->kn);
++ kernfs_put_active_of(of);
+ mutex_unlock(&of->mutex);
+
+ if (len > 0)
+@@ -357,13 +375,13 @@ static void kernfs_vma_open(struct vm_ar
+ if (!of->vm_ops)
+ return;
+
+- if (!kernfs_get_active(of->kn))
++ if (!kernfs_get_active_of(of))
+ return;
+
+ if (of->vm_ops->open)
+ of->vm_ops->open(vma);
+
+- kernfs_put_active(of->kn);
++ kernfs_put_active_of(of);
+ }
+
+ static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf)
+@@ -375,14 +393,14 @@ static vm_fault_t kernfs_vma_fault(struc
+ if (!of->vm_ops)
+ return VM_FAULT_SIGBUS;
+
+- if (!kernfs_get_active(of->kn))
++ if (!kernfs_get_active_of(of))
+ return VM_FAULT_SIGBUS;
+
+ ret = VM_FAULT_SIGBUS;
+ if (of->vm_ops->fault)
+ ret = of->vm_ops->fault(vmf);
+
+- kernfs_put_active(of->kn);
++ kernfs_put_active_of(of);
+ return ret;
+ }
+
+@@ -395,7 +413,7 @@ static vm_fault_t kernfs_vma_page_mkwrit
+ if (!of->vm_ops)
+ return VM_FAULT_SIGBUS;
+
+- if (!kernfs_get_active(of->kn))
++ if (!kernfs_get_active_of(of))
+ return VM_FAULT_SIGBUS;
+
+ ret = 0;
+@@ -404,7 +422,7 @@ static vm_fault_t kernfs_vma_page_mkwrit
+ else
+ file_update_time(file);
+
+- kernfs_put_active(of->kn);
++ kernfs_put_active_of(of);
+ return ret;
+ }
+
+@@ -418,14 +436,14 @@ static int kernfs_vma_access(struct vm_a
+ if (!of->vm_ops)
+ return -EINVAL;
+
+- if (!kernfs_get_active(of->kn))
++ if (!kernfs_get_active_of(of))
+ return -EINVAL;
+
+ ret = -EINVAL;
+ if (of->vm_ops->access)
+ ret = of->vm_ops->access(vma, addr, buf, len, write);
+
+- kernfs_put_active(of->kn);
++ kernfs_put_active_of(of);
+ return ret;
+ }
+
+@@ -455,7 +473,7 @@ static int kernfs_fop_mmap(struct file *
+ mutex_lock(&of->mutex);
+
+ rc = -ENODEV;
+- if (!kernfs_get_active(of->kn))
++ if (!kernfs_get_active_of(of))
+ goto out_unlock;
+
+ ops = kernfs_ops(of->kn);
+@@ -490,7 +508,7 @@ static int kernfs_fop_mmap(struct file *
+ }
+ vma->vm_ops = &kernfs_vm_ops;
+ out_put:
+- kernfs_put_active(of->kn);
++ kernfs_put_active_of(of);
+ out_unlock:
+ mutex_unlock(&of->mutex);
+
+@@ -852,7 +870,7 @@ static __poll_t kernfs_fop_poll(struct f
+ struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry);
+ __poll_t ret;
+
+- if (!kernfs_get_active(kn))
++ if (!kernfs_get_active_of(of))
+ return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
+
+ if (kn->attr.ops->poll)
+@@ -860,7 +878,7 @@ static __poll_t kernfs_fop_poll(struct f
+ else
+ ret = kernfs_generic_poll(of, wait);
+
+- kernfs_put_active(kn);
++ kernfs_put_active_of(of);
+ return ret;
+ }
+
+@@ -875,7 +893,7 @@ static loff_t kernfs_fop_llseek(struct f
+ * the ops aren't called concurrently for the same open file.
+ */
+ mutex_lock(&of->mutex);
+- if (!kernfs_get_active(of->kn)) {
++ if (!kernfs_get_active_of(of)) {
+ mutex_unlock(&of->mutex);
+ return -ENODEV;
+ }
+@@ -886,7 +904,7 @@ static loff_t kernfs_fop_llseek(struct f
+ else
+ ret = generic_file_llseek(file, offset, whence);
+
+- kernfs_put_active(of->kn);
++ kernfs_put_active_of(of);
+ mutex_unlock(&of->mutex);
+ return ret;
+ }