--- /dev/null
+From stable-owner@vger.kernel.org Sun Nov 5 17:01:46 2023
+From: Steven Rostedt <rostedt@goodmis.org>
+Date: Sun, 05 Nov 2023 10:56:34 -0500
+Subject:[PATCH 4/5] eventfs: Delete eventfs_inode when the last dentry is freed
+To: linux-kernel@vger.kernel.org, stable@vger.kernel.org, <gregkh@linuxfoundation.org>
+Cc: Masami Hiramatsu <mhiramat@kernel.org>, Mark Rutland <mark.rutland@arm.com>, Andrew Morton <akpm@linux-foundation.org>, Ajay Kaher <akaher@vmware.com>
+Message-ID: <20231105160139.821908367@goodmis.org>
+
+From: Steven Rostedt <rostedt@goodmis.org>
+
+From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
+
+commit 020010fbfa202aa528a52743eba4ab0da3400a4e upstream
+
+There exists a race between holding a reference of an eventfs_inode dentry
+and the freeing of the eventfs_inode. If user space has a dentry held long
+enough, it may still be able to access the dentry's eventfs_inode after it
+has been freed.
+
+To prevent this, have he eventfs_inode freed via the last dput() (or via
+RCU if the eventfs_inode does not have a dentry).
+
+This means reintroducing the eventfs_inode del_list field at a temporary
+place to put the eventfs_inode. It needs to mark it as freed (via the
+list) but also must invalidate the dentry immediately as the return from
+eventfs_remove_dir() expects that they are. But the dentry invalidation
+must not be called under the eventfs_mutex, so it must be done after the
+eventfs_inode is marked as free (put on a deletion list).
+
+Link: https://lkml.kernel.org/r/20231101172650.123479767@goodmis.org
+
+Cc: stable@vger.kernel.org
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Ajay Kaher <akaher@vmware.com>
+Fixes: 5bdcd5f5331a2 ("eventfs: Implement removal of meta data from eventfs")
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/tracefs/event_inode.c | 150 +++++++++++++++++++++++------------------------
+ 1 file changed, 74 insertions(+), 76 deletions(-)
+
+--- a/fs/tracefs/event_inode.c
++++ b/fs/tracefs/event_inode.c
+@@ -53,10 +53,12 @@ struct eventfs_file {
+ const struct inode_operations *iop;
+ /*
+ * Union - used for deletion
++ * @llist: for calling dput() if needed after RCU
+ * @del_list: list of eventfs_file to delete
+ * @rcu: eventfs_file to delete in RCU
+ */
+ union {
++ struct llist_node llist;
+ struct list_head del_list;
+ struct rcu_head rcu;
+ };
+@@ -113,8 +115,7 @@ static int eventfs_set_attr(struct mnt_i
+
+ mutex_lock(&eventfs_mutex);
+ ef = dentry->d_fsdata;
+- /* The LSB is set when the eventfs_inode is being freed */
+- if (((unsigned long)ef & 1UL) || ef->is_freed) {
++ if (ef->is_freed) {
+ /* Do not allow changes if the event is about to be removed. */
+ mutex_unlock(&eventfs_mutex);
+ return -ENODEV;
+@@ -258,6 +259,13 @@ static struct dentry *create_dir(struct
+ return eventfs_end_creating(dentry);
+ }
+
++static void free_ef(struct eventfs_file *ef)
++{
++ kfree(ef->name);
++ kfree(ef->ei);
++ kfree(ef);
++}
++
+ /**
+ * eventfs_set_ef_status_free - set the ef->status to free
+ * @ti: the tracefs_inode of the dentry
+@@ -270,34 +278,20 @@ void eventfs_set_ef_status_free(struct t
+ {
+ struct tracefs_inode *ti_parent;
+ struct eventfs_inode *ei;
+- struct eventfs_file *ef, *tmp;
++ struct eventfs_file *ef;
+
+ /* The top level events directory may be freed by this */
+ if (unlikely(ti->flags & TRACEFS_EVENT_TOP_INODE)) {
+- LIST_HEAD(ef_del_list);
+-
+ mutex_lock(&eventfs_mutex);
+-
+ ei = ti->private;
+
+- /* Record all the top level files */
+- list_for_each_entry_srcu(ef, &ei->e_top_files, list,
+- lockdep_is_held(&eventfs_mutex)) {
+- list_add_tail(&ef->del_list, &ef_del_list);
+- }
+-
+ /* Nothing should access this, but just in case! */
+ ti->private = NULL;
+-
+ mutex_unlock(&eventfs_mutex);
+
+- /* Now safely free the top level files and their children */
+- list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) {
+- list_del(&ef->del_list);
+- eventfs_remove(ef);
+- }
+-
+- kfree(ei);
++ ef = dentry->d_fsdata;
++ if (ef)
++ free_ef(ef);
+ return;
+ }
+
+@@ -311,16 +305,13 @@ void eventfs_set_ef_status_free(struct t
+ if (!ef)
+ goto out;
+
+- /*
+- * If ef was freed, then the LSB bit is set for d_fsdata.
+- * But this should not happen, as it should still have a
+- * ref count that prevents it. Warn in case it does.
+- */
+- if (WARN_ON_ONCE((unsigned long)ef & 1))
+- goto out;
++ if (ef->is_freed) {
++ free_ef(ef);
++ } else {
++ ef->dentry = NULL;
++ }
+
+ dentry->d_fsdata = NULL;
+- ef->dentry = NULL;
+ out:
+ mutex_unlock(&eventfs_mutex);
+ }
+@@ -847,13 +838,53 @@ int eventfs_add_file(const char *name, u
+ return 0;
+ }
+
+-static void free_ef(struct rcu_head *head)
++static LLIST_HEAD(free_list);
++
++static void eventfs_workfn(struct work_struct *work)
++{
++ struct eventfs_file *ef, *tmp;
++ struct llist_node *llnode;
++
++ llnode = llist_del_all(&free_list);
++ llist_for_each_entry_safe(ef, tmp, llnode, llist) {
++ /* This should only get here if it had a dentry */
++ if (!WARN_ON_ONCE(!ef->dentry))
++ dput(ef->dentry);
++ }
++}
++
++static DECLARE_WORK(eventfs_work, eventfs_workfn);
++
++static void free_rcu_ef(struct rcu_head *head)
+ {
+ struct eventfs_file *ef = container_of(head, struct eventfs_file, rcu);
+
+- kfree(ef->name);
+- kfree(ef->ei);
+- kfree(ef);
++ if (ef->dentry) {
++ /* Do not free the ef until all references of dentry are gone */
++ if (llist_add(&ef->llist, &free_list))
++ queue_work(system_unbound_wq, &eventfs_work);
++ return;
++ }
++
++ free_ef(ef);
++}
++
++static void unhook_dentry(struct dentry *dentry)
++{
++ if (!dentry)
++ return;
++
++ /* Keep the dentry from being freed yet (see eventfs_workfn()) */
++ dget(dentry);
++
++ dentry->d_fsdata = NULL;
++ d_invalidate(dentry);
++ mutex_lock(&eventfs_mutex);
++ /* dentry should now have at least a single reference */
++ WARN_ONCE((int)d_count(dentry) < 1,
++ "dentry %px (%s) less than one reference (%d) after invalidate\n",
++ dentry, dentry->d_name.name, d_count(dentry));
++ mutex_unlock(&eventfs_mutex);
+ }
+
+ /**
+@@ -905,58 +936,25 @@ void eventfs_remove(struct eventfs_file
+ {
+ struct eventfs_file *tmp;
+ LIST_HEAD(ef_del_list);
+- struct dentry *dentry_list = NULL;
+- struct dentry *dentry;
+
+ if (!ef)
+ return;
+
++ /*
++ * Move the deleted eventfs_inodes onto the ei_del_list
++ * which will also set the is_freed value. Note, this has to be
++ * done under the eventfs_mutex, but the deletions of
++ * the dentries must be done outside the eventfs_mutex.
++ * Hence moving them to this temporary list.
++ */
+ mutex_lock(&eventfs_mutex);
+ eventfs_remove_rec(ef, &ef_del_list, 0);
+- list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) {
+- if (ef->dentry) {
+- unsigned long ptr = (unsigned long)dentry_list;
+-
+- /* Keep the dentry from being freed yet */
+- dget(ef->dentry);
+-
+- /*
+- * Paranoid: The dget() above should prevent the dentry
+- * from being freed and calling eventfs_set_ef_status_free().
+- * But just in case, set the link list LSB pointer to 1
+- * and have eventfs_set_ef_status_free() check that to
+- * make sure that if it does happen, it will not think
+- * the d_fsdata is an event_file.
+- *
+- * For this to work, no event_file should be allocated
+- * on a odd space, as the ef should always be allocated
+- * to be at least word aligned. Check for that too.
+- */
+- WARN_ON_ONCE(ptr & 1);
+-
+- ef->dentry->d_fsdata = (void *)(ptr | 1);
+- dentry_list = ef->dentry;
+- ef->dentry = NULL;
+- }
+- call_srcu(&eventfs_srcu, &ef->rcu, free_ef);
+- }
+ mutex_unlock(&eventfs_mutex);
+
+- while (dentry_list) {
+- unsigned long ptr;
+-
+- dentry = dentry_list;
+- ptr = (unsigned long)dentry->d_fsdata & ~1UL;
+- dentry_list = (struct dentry *)ptr;
+- dentry->d_fsdata = NULL;
+- d_invalidate(dentry);
+- mutex_lock(&eventfs_mutex);
+- /* dentry should now have at least a single reference */
+- WARN_ONCE((int)d_count(dentry) < 1,
+- "dentry %p less than one reference (%d) after invalidate\n",
+- dentry, d_count(dentry));
+- mutex_unlock(&eventfs_mutex);
+- dput(dentry);
++ list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) {
++ unhook_dentry(ef->dentry);
++ list_del(&ef->del_list);
++ call_srcu(&eventfs_srcu, &ef->rcu, free_rcu_ef);
+ }
+ }
+
--- /dev/null
+From stable-owner@vger.kernel.org Sun Nov 5 17:01:44 2023
+From: Steven Rostedt <rostedt@goodmis.org>
+Date: Sun, 05 Nov 2023 10:56:32 -0500
+Subject:[PATCH 2/5] eventfs: Remove "is_freed" union with rcu head
+To: linux-kernel@vger.kernel.org, stable@vger.kernel.org, <gregkh@linuxfoundation.org>
+Cc: Masami Hiramatsu <mhiramat@kernel.org>, Mark Rutland <mark.rutland@arm.com>, Andrew Morton <akpm@linux-foundation.org>, Ajay Kaher <akaher@vmware.com>
+Message-ID: <20231105160139.498444992@goodmis.org>
+
+From: Steven Rostedt <rostedt@goodmis.org>
+
+From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
+
+commit f2f496370afcbc5227d7002da28c74b91fed12ff upstream
+
+The eventfs_inode->is_freed was a union with the rcu_head with the
+assumption that when it was on the srcu list the head would contain a
+pointer which would make "is_freed" true. But that was a wrong assumption
+as the rcu head is a single link list where the last element is NULL.
+
+Instead, split the nr_entries integer so that "is_freed" is one bit and
+the nr_entries is the next 31 bits. As there shouldn't be more than 10
+(currently there's at most 5 to 7 depending on the config), this should
+not be a problem.
+
+Link: https://lkml.kernel.org/r/20231101172649.049758712@goodmis.org
+
+Cc: stable@vger.kernel.org
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Ajay Kaher <akaher@vmware.com>
+Fixes: 63940449555e7 ("eventfs: Implement eventfs lookup, read, open functions")
+Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/tracefs/event_inode.c | 8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+--- a/fs/tracefs/event_inode.c
++++ b/fs/tracefs/event_inode.c
+@@ -38,6 +38,7 @@ struct eventfs_inode {
+ * @fop: file_operations for file or directory
+ * @iop: inode_operations for file or directory
+ * @data: something that the caller will want to get to later on
++ * @is_freed: Flag set if the eventfs is on its way to be freed
+ * @mode: the permission that the file or directory should have
+ */
+ struct eventfs_file {
+@@ -52,15 +53,14 @@ struct eventfs_file {
+ * Union - used for deletion
+ * @del_list: list of eventfs_file to delete
+ * @rcu: eventfs_file to delete in RCU
+- * @is_freed: node is freed if one of the above is set
+ */
+ union {
+ struct list_head del_list;
+ struct rcu_head rcu;
+- unsigned long is_freed;
+ };
+ void *data;
+- umode_t mode;
++ unsigned int is_freed:1;
++ unsigned int mode:31;
+ };
+
+ static DEFINE_MUTEX(eventfs_mutex);
+@@ -814,6 +814,8 @@ static void eventfs_remove_rec(struct ev
+ }
+ }
+
++ ef->is_freed = 1;
++
+ list_del_rcu(&ef->list);
+ list_add_tail(&ef->del_list, head);
+ }
--- /dev/null
+From stable-owner@vger.kernel.org Sun Nov 5 17:01:46 2023
+From: Steven Rostedt <rostedt@goodmis.org>
+Date: Sun, 05 Nov 2023 10:56:33 -0500
+Subject:[PATCH 3/5] eventfs: Save ownership and mode
+To: linux-kernel@vger.kernel.org, stable@vger.kernel.org, <gregkh@linuxfoundation.org>
+Cc: Masami Hiramatsu <mhiramat@kernel.org>, Mark Rutland <mark.rutland@arm.com>, Andrew Morton <akpm@linux-foundation.org>, Ajay Kaher <akaher@vmware.com>
+Message-ID: <20231105160139.660634360@goodmis.org>
+
+From: Steven Rostedt <rostedt@goodmis.org>
+
+From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
+
+commit 28e12c09f5aa081b2d13d1340e3610070b6c624d upstream
+
+Now that inodes and dentries are created on the fly, they are also
+reclaimed on memory pressure. Since the ownership and file mode are saved
+in the inode, if they are freed, any changes to the ownership and mode
+will be lost.
+
+To counter this, if the user changes the permissions or ownership, save
+them, and when creating the inodes again, restore those changes.
+
+Link: https://lkml.kernel.org/r/20231101172649.691841445@goodmis.org
+
+Cc: stable@vger.kernel.org
+Cc: Ajay Kaher <akaher@vmware.com>
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Fixes: 63940449555e7 ("eventfs: Implement eventfs lookup, read, open functions")
+Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/tracefs/event_inode.c | 107 +++++++++++++++++++++++++++++++++++++++--------
+ 1 file changed, 91 insertions(+), 16 deletions(-)
+
+--- a/fs/tracefs/event_inode.c
++++ b/fs/tracefs/event_inode.c
+@@ -40,6 +40,8 @@ struct eventfs_inode {
+ * @data: something that the caller will want to get to later on
+ * @is_freed: Flag set if the eventfs is on its way to be freed
+ * @mode: the permission that the file or directory should have
++ * @uid: saved uid if changed
++ * @gid: saved gid if changed
+ */
+ struct eventfs_file {
+ const char *name;
+@@ -61,11 +63,22 @@ struct eventfs_file {
+ void *data;
+ unsigned int is_freed:1;
+ unsigned int mode:31;
++ kuid_t uid;
++ kgid_t gid;
+ };
+
+ static DEFINE_MUTEX(eventfs_mutex);
+ DEFINE_STATIC_SRCU(eventfs_srcu);
+
++/* Mode is unsigned short, use the upper bits for flags */
++enum {
++ EVENTFS_SAVE_MODE = BIT(16),
++ EVENTFS_SAVE_UID = BIT(17),
++ EVENTFS_SAVE_GID = BIT(18),
++};
++
++#define EVENTFS_MODE_MASK (EVENTFS_SAVE_MODE - 1)
++
+ static struct dentry *eventfs_root_lookup(struct inode *dir,
+ struct dentry *dentry,
+ unsigned int flags);
+@@ -73,8 +86,54 @@ static int dcache_dir_open_wrapper(struc
+ static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx);
+ static int eventfs_release(struct inode *inode, struct file *file);
+
++static void update_attr(struct eventfs_file *ef, struct iattr *iattr)
++{
++ unsigned int ia_valid = iattr->ia_valid;
++
++ if (ia_valid & ATTR_MODE) {
++ ef->mode = (ef->mode & ~EVENTFS_MODE_MASK) |
++ (iattr->ia_mode & EVENTFS_MODE_MASK) |
++ EVENTFS_SAVE_MODE;
++ }
++ if (ia_valid & ATTR_UID) {
++ ef->mode |= EVENTFS_SAVE_UID;
++ ef->uid = iattr->ia_uid;
++ }
++ if (ia_valid & ATTR_GID) {
++ ef->mode |= EVENTFS_SAVE_GID;
++ ef->gid = iattr->ia_gid;
++ }
++}
++
++static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry,
++ struct iattr *iattr)
++{
++ struct eventfs_file *ef;
++ int ret;
++
++ mutex_lock(&eventfs_mutex);
++ ef = dentry->d_fsdata;
++ /* The LSB is set when the eventfs_inode is being freed */
++ if (((unsigned long)ef & 1UL) || ef->is_freed) {
++ /* Do not allow changes if the event is about to be removed. */
++ mutex_unlock(&eventfs_mutex);
++ return -ENODEV;
++ }
++
++ ret = simple_setattr(idmap, dentry, iattr);
++ if (!ret)
++ update_attr(ef, iattr);
++ mutex_unlock(&eventfs_mutex);
++ return ret;
++}
++
+ static const struct inode_operations eventfs_root_dir_inode_operations = {
+ .lookup = eventfs_root_lookup,
++ .setattr = eventfs_set_attr,
++};
++
++static const struct inode_operations eventfs_file_inode_operations = {
++ .setattr = eventfs_set_attr,
+ };
+
+ static const struct file_operations eventfs_file_operations = {
+@@ -85,10 +144,20 @@ static const struct file_operations even
+ .release = eventfs_release,
+ };
+
++static void update_inode_attr(struct inode *inode, struct eventfs_file *ef)
++{
++ inode->i_mode = ef->mode & EVENTFS_MODE_MASK;
++
++ if (ef->mode & EVENTFS_SAVE_UID)
++ inode->i_uid = ef->uid;
++
++ if (ef->mode & EVENTFS_SAVE_GID)
++ inode->i_gid = ef->gid;
++}
++
+ /**
+ * create_file - create a file in the tracefs filesystem
+- * @name: the name of the file to create.
+- * @mode: the permission that the file should have.
++ * @ef: the eventfs_file
+ * @parent: parent dentry for this file.
+ * @data: something that the caller will want to get to later on.
+ * @fop: struct file_operations that should be used for this file.
+@@ -104,7 +173,7 @@ static const struct file_operations even
+ * If tracefs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+-static struct dentry *create_file(const char *name, umode_t mode,
++static struct dentry *create_file(struct eventfs_file *ef,
+ struct dentry *parent, void *data,
+ const struct file_operations *fop)
+ {
+@@ -112,13 +181,13 @@ static struct dentry *create_file(const
+ struct dentry *dentry;
+ struct inode *inode;
+
+- if (!(mode & S_IFMT))
+- mode |= S_IFREG;
++ if (!(ef->mode & S_IFMT))
++ ef->mode |= S_IFREG;
+
+- if (WARN_ON_ONCE(!S_ISREG(mode)))
++ if (WARN_ON_ONCE(!S_ISREG(ef->mode)))
+ return NULL;
+
+- dentry = eventfs_start_creating(name, parent);
++ dentry = eventfs_start_creating(ef->name, parent);
+
+ if (IS_ERR(dentry))
+ return dentry;
+@@ -127,7 +196,10 @@ static struct dentry *create_file(const
+ if (unlikely(!inode))
+ return eventfs_failed_creating(dentry);
+
+- inode->i_mode = mode;
++ /* If the user updated the directory's attributes, use them */
++ update_inode_attr(inode, ef);
++
++ inode->i_op = &eventfs_file_inode_operations;
+ inode->i_fop = fop;
+ inode->i_private = data;
+
+@@ -140,7 +212,7 @@ static struct dentry *create_file(const
+
+ /**
+ * create_dir - create a dir in the tracefs filesystem
+- * @name: the name of the file to create.
++ * @ei: the eventfs_inode that represents the directory to create
+ * @parent: parent dentry for this file.
+ * @data: something that the caller will want to get to later on.
+ *
+@@ -155,13 +227,14 @@ static struct dentry *create_file(const
+ * If tracefs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+-static struct dentry *create_dir(const char *name, struct dentry *parent, void *data)
++static struct dentry *create_dir(struct eventfs_file *ef,
++ struct dentry *parent, void *data)
+ {
+ struct tracefs_inode *ti;
+ struct dentry *dentry;
+ struct inode *inode;
+
+- dentry = eventfs_start_creating(name, parent);
++ dentry = eventfs_start_creating(ef->name, parent);
+ if (IS_ERR(dentry))
+ return dentry;
+
+@@ -169,7 +242,8 @@ static struct dentry *create_dir(const c
+ if (unlikely(!inode))
+ return eventfs_failed_creating(dentry);
+
+- inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
++ update_inode_attr(inode, ef);
++
+ inode->i_op = &eventfs_root_dir_inode_operations;
+ inode->i_fop = &eventfs_file_operations;
+ inode->i_private = data;
+@@ -306,10 +380,9 @@ create_dentry(struct eventfs_file *ef, s
+ inode_lock(parent->d_inode);
+
+ if (ef->ei)
+- dentry = create_dir(ef->name, parent, ef->data);
++ dentry = create_dir(ef, parent, ef->data);
+ else
+- dentry = create_file(ef->name, ef->mode, parent,
+- ef->data, ef->fop);
++ dentry = create_file(ef, parent, ef->data, ef->fop);
+
+ if (!lookup)
+ inode_unlock(parent->d_inode);
+@@ -475,6 +548,7 @@ static int dcache_dir_open_wrapper(struc
+ if (d) {
+ struct dentry **tmp;
+
++
+ tmp = krealloc(dentries, sizeof(d) * (cnt + 2), GFP_KERNEL);
+ if (!tmp)
+ break;
+@@ -549,13 +623,14 @@ static struct eventfs_file *eventfs_prep
+ return ERR_PTR(-ENOMEM);
+ }
+ INIT_LIST_HEAD(&ef->ei->e_top_files);
++ ef->mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+ } else {
+ ef->ei = NULL;
++ ef->mode = mode;
+ }
+
+ ef->iop = iop;
+ ef->fop = fop;
+- ef->mode = mode;
+ ef->data = data;
+ return ef;
+ }
--- /dev/null
+From stable-owner@vger.kernel.org Sun Nov 5 17:01:45 2023
+From: Steven Rostedt <rostedt@goodmis.org>
+Date: Sun, 05 Nov 2023 10:56:35 -0500
+Subject:[PATCH 5/5] eventfs: Use simple_recursive_removal() to clean up dentries
+To: linux-kernel@vger.kernel.org, stable@vger.kernel.org, <gregkh@linuxfoundation.org>
+Cc: Masami Hiramatsu <mhiramat@kernel.org>, Mark Rutland <mark.rutland@arm.com>, Andrew Morton <akpm@linux-foundation.org>, Al Viro <viro@zeniv.linux.org.uk>
+Message-ID: <20231105160139.983291500@goodmis.org>
+
+From: Steven Rostedt <rostedt@goodmis.org>
+
+From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
+
+commit 407c6726ca71b33330d2d6345d9ea7ebc02575e9 upstream
+
+Looking at how dentry is removed via the tracefs system, I found that
+eventfs does not do everything that it did under tracefs. The tracefs
+removal of a dentry calls simple_recursive_removal() that does a lot more
+than a simple d_invalidate().
+
+As it should be a requirement that any eventfs_inode that has a dentry, so
+does its parent. When removing a eventfs_inode, if it has a dentry, a call
+to simple_recursive_removal() on that dentry should clean up all the
+dentries underneath it.
+
+Add WARN_ON_ONCE() to check for the parent having a dentry if any children
+do.
+
+Link: https://lore.kernel.org/all/20231101022553.GE1957730@ZenIV/
+Link: https://lkml.kernel.org/r/20231101172650.552471568@goodmis.org
+
+Cc: stable@vger.kernel.org
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Al Viro <viro@zeniv.linux.org.uk>
+Fixes: 5bdcd5f5331a2 ("eventfs: Implement removal of meta data from eventfs")
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/tracefs/event_inode.c | 71 +++++++++++++++++++++--------------------------
+ 1 file changed, 33 insertions(+), 38 deletions(-)
+
+--- a/fs/tracefs/event_inode.c
++++ b/fs/tracefs/event_inode.c
+@@ -54,12 +54,10 @@ struct eventfs_file {
+ /*
+ * Union - used for deletion
+ * @llist: for calling dput() if needed after RCU
+- * @del_list: list of eventfs_file to delete
+ * @rcu: eventfs_file to delete in RCU
+ */
+ union {
+ struct llist_node llist;
+- struct list_head del_list;
+ struct rcu_head rcu;
+ };
+ void *data;
+@@ -276,7 +274,6 @@ static void free_ef(struct eventfs_file
+ */
+ void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry)
+ {
+- struct tracefs_inode *ti_parent;
+ struct eventfs_inode *ei;
+ struct eventfs_file *ef;
+
+@@ -297,10 +294,6 @@ void eventfs_set_ef_status_free(struct t
+
+ mutex_lock(&eventfs_mutex);
+
+- ti_parent = get_tracefs(dentry->d_parent->d_inode);
+- if (!ti_parent || !(ti_parent->flags & TRACEFS_EVENT_INODE))
+- goto out;
+-
+ ef = dentry->d_fsdata;
+ if (!ef)
+ goto out;
+@@ -873,30 +866,29 @@ static void unhook_dentry(struct dentry
+ {
+ if (!dentry)
+ return;
+-
+- /* Keep the dentry from being freed yet (see eventfs_workfn()) */
++ /*
++ * Need to add a reference to the dentry that is expected by
++ * simple_recursive_removal(), which will include a dput().
++ */
+ dget(dentry);
+
+- dentry->d_fsdata = NULL;
+- d_invalidate(dentry);
+- mutex_lock(&eventfs_mutex);
+- /* dentry should now have at least a single reference */
+- WARN_ONCE((int)d_count(dentry) < 1,
+- "dentry %px (%s) less than one reference (%d) after invalidate\n",
+- dentry, dentry->d_name.name, d_count(dentry));
+- mutex_unlock(&eventfs_mutex);
++ /*
++ * Also add a reference for the dput() in eventfs_workfn().
++ * That is required as that dput() will free the ei after
++ * the SRCU grace period is over.
++ */
++ dget(dentry);
+ }
+
+ /**
+ * eventfs_remove_rec - remove eventfs dir or file from list
+ * @ef: eventfs_file to be removed.
+- * @head: to create list of eventfs_file to be deleted
+ * @level: to check recursion depth
+ *
+ * The helper function eventfs_remove_rec() is used to clean up and free the
+ * associated data from eventfs for both of the added functions.
+ */
+-static void eventfs_remove_rec(struct eventfs_file *ef, struct list_head *head, int level)
++static void eventfs_remove_rec(struct eventfs_file *ef, int level)
+ {
+ struct eventfs_file *ef_child;
+
+@@ -916,14 +908,16 @@ static void eventfs_remove_rec(struct ev
+ /* search for nested folders or files */
+ list_for_each_entry_srcu(ef_child, &ef->ei->e_top_files, list,
+ lockdep_is_held(&eventfs_mutex)) {
+- eventfs_remove_rec(ef_child, head, level + 1);
++ eventfs_remove_rec(ef_child, level + 1);
+ }
+ }
+
+ ef->is_freed = 1;
+
++ unhook_dentry(ef->dentry);
++
+ list_del_rcu(&ef->list);
+- list_add_tail(&ef->del_list, head);
++ call_srcu(&eventfs_srcu, &ef->rcu, free_rcu_ef);
+ }
+
+ /**
+@@ -934,28 +928,22 @@ static void eventfs_remove_rec(struct ev
+ */
+ void eventfs_remove(struct eventfs_file *ef)
+ {
+- struct eventfs_file *tmp;
+- LIST_HEAD(ef_del_list);
++ struct dentry *dentry;
+
+ if (!ef)
+ return;
+
+- /*
+- * Move the deleted eventfs_inodes onto the ei_del_list
+- * which will also set the is_freed value. Note, this has to be
+- * done under the eventfs_mutex, but the deletions of
+- * the dentries must be done outside the eventfs_mutex.
+- * Hence moving them to this temporary list.
+- */
+ mutex_lock(&eventfs_mutex);
+- eventfs_remove_rec(ef, &ef_del_list, 0);
++ dentry = ef->dentry;
++ eventfs_remove_rec(ef, 0);
+ mutex_unlock(&eventfs_mutex);
+
+- list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) {
+- unhook_dentry(ef->dentry);
+- list_del(&ef->del_list);
+- call_srcu(&eventfs_srcu, &ef->rcu, free_rcu_ef);
+- }
++ /*
++ * If any of the ei children has a dentry, then the ei itself
++ * must have a dentry.
++ */
++ if (dentry)
++ simple_recursive_removal(dentry, NULL);
+ }
+
+ /**
+@@ -966,6 +954,8 @@ void eventfs_remove(struct eventfs_file
+ */
+ void eventfs_remove_events_dir(struct dentry *dentry)
+ {
++ struct eventfs_file *ef_child;
++ struct eventfs_inode *ei;
+ struct tracefs_inode *ti;
+
+ if (!dentry || !dentry->d_inode)
+@@ -975,6 +965,11 @@ void eventfs_remove_events_dir(struct de
+ if (!ti || !(ti->flags & TRACEFS_EVENT_INODE))
+ return;
+
+- d_invalidate(dentry);
+- dput(dentry);
++ mutex_lock(&eventfs_mutex);
++ ei = ti->private;
++ list_for_each_entry_srcu(ef_child, &ei->e_top_files, list,
++ lockdep_is_held(&eventfs_mutex)) {
++ eventfs_remove_rec(ef_child, 0);
++ }
++ mutex_unlock(&eventfs_mutex);
+ }
drm-amd-display-don-t-use-fsleep-for-psr-exit-waits.patch
power-supply-core-use-blocking_notifier_call_chain-t.patch
perf-evlist-avoid-frequency-mode-for-the-dummy-event.patch
+tracing-have-trace_event_file-have-ref-counters.patch
+eventfs-remove-is_freed-union-with-rcu-head.patch
+eventfs-save-ownership-and-mode.patch
+eventfs-delete-eventfs_inode-when-the-last-dentry-is-freed.patch
+eventfs-use-simple_recursive_removal-to-clean-up-dentries.patch
--- /dev/null
+From stable-owner@vger.kernel.org Sun Nov 5 17:01:45 2023
+From: Steven Rostedt <rostedt@goodmis.org>
+Date: Sun, 05 Nov 2023 10:56:31 -0500
+Subject:[PATCH 1/5] tracing: Have trace_event_file have ref counters
+To: linux-kernel@vger.kernel.org, stable@vger.kernel.org, <gregkh@linuxfoundation.org>
+Cc: Masami Hiramatsu <mhiramat@kernel.org>, Mark Rutland <mark.rutland@arm.com>, Andrew Morton <akpm@linux-foundation.org>, Beau Belgrave <beaub@linux.microsoft.com>
+Message-ID: <20231105160139.346174799@goodmis.org>
+
+From: Steven Rostedt <rostedt@goodmis.org>
+
+From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
+
+commit bb32500fb9b78215e4ef6ee8b4345c5f5d7eafb4 upstream
+
+The following can crash the kernel:
+
+ # cd /sys/kernel/tracing
+ # echo 'p:sched schedule' > kprobe_events
+ # exec 5>>events/kprobes/sched/enable
+ # > kprobe_events
+ # exec 5>&-
+
+The above commands:
+
+ 1. Change directory to the tracefs directory
+ 2. Create a kprobe event (doesn't matter what one)
+ 3. Open bash file descriptor 5 on the enable file of the kprobe event
+ 4. Delete the kprobe event (removes the files too)
+ 5. Close the bash file descriptor 5
+
+The above causes a crash!
+
+ BUG: kernel NULL pointer dereference, address: 0000000000000028
+ #PF: supervisor read access in kernel mode
+ #PF: error_code(0x0000) - not-present page
+ PGD 0 P4D 0
+ Oops: 0000 [#1] PREEMPT SMP PTI
+ CPU: 6 PID: 877 Comm: bash Not tainted 6.5.0-rc4-test-00008-g2c6b6b1029d4-dirty #186
+ Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
+ RIP: 0010:tracing_release_file_tr+0xc/0x50
+
+What happens here is that the kprobe event creates a trace_event_file
+"file" descriptor that represents the file in tracefs to the event. It
+maintains state of the event (is it enabled for the given instance?).
+Opening the "enable" file gets a reference to the event "file" descriptor
+via the open file descriptor. When the kprobe event is deleted, the file is
+also deleted from the tracefs system which also frees the event "file"
+descriptor.
+
+But as the tracefs file is still opened by user space, it will not be
+totally removed until the final dput() is called on it. But this is not
+true with the event "file" descriptor that is already freed. If the user
+does a write to or simply closes the file descriptor it will reference the
+event "file" descriptor that was just freed, causing a use-after-free bug.
+
+To solve this, add a ref count to the event "file" descriptor as well as a
+new flag called "FREED". The "file" will not be freed until the last
+reference is released. But the FREE flag will be set when the event is
+removed to prevent any more modifications to that event from happening,
+even if there's still a reference to the event "file" descriptor.
+
+Link: https://lore.kernel.org/linux-trace-kernel/20231031000031.1e705592@gandalf.local.home/
+Link: https://lore.kernel.org/linux-trace-kernel/20231031122453.7a48b923@gandalf.local.home
+
+Cc: stable@vger.kernel.org
+Cc: Mark Rutland <mark.rutland@arm.com>
+Fixes: f5ca233e2e66d ("tracing: Increase trace array ref count on enable and filter files")
+Reported-by: Beau Belgrave <beaub@linux.microsoft.com>
+Tested-by: Beau Belgrave <beaub@linux.microsoft.com>
+Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/trace_events.h | 4 ++++
+ kernel/trace/trace.c | 15 +++++++++++++++
+ kernel/trace/trace.h | 3 +++
+ kernel/trace/trace_events.c | 31 +++++++++++++++++++++++++++----
+ kernel/trace/trace_events_filter.c | 3 +++
+ 5 files changed, 52 insertions(+), 4 deletions(-)
+
+--- a/include/linux/trace_events.h
++++ b/include/linux/trace_events.h
+@@ -492,6 +492,7 @@ enum {
+ EVENT_FILE_FL_TRIGGER_COND_BIT,
+ EVENT_FILE_FL_PID_FILTER_BIT,
+ EVENT_FILE_FL_WAS_ENABLED_BIT,
++ EVENT_FILE_FL_FREED_BIT,
+ };
+
+ extern struct trace_event_file *trace_get_event_file(const char *instance,
+@@ -630,6 +631,7 @@ extern int __kprobe_event_add_fields(str
+ * TRIGGER_COND - When set, one or more triggers has an associated filter
+ * PID_FILTER - When set, the event is filtered based on pid
+ * WAS_ENABLED - Set when enabled to know to clear trace on module removal
++ * FREED - File descriptor is freed, all fields should be considered invalid
+ */
+ enum {
+ EVENT_FILE_FL_ENABLED = (1 << EVENT_FILE_FL_ENABLED_BIT),
+@@ -643,6 +645,7 @@ enum {
+ EVENT_FILE_FL_TRIGGER_COND = (1 << EVENT_FILE_FL_TRIGGER_COND_BIT),
+ EVENT_FILE_FL_PID_FILTER = (1 << EVENT_FILE_FL_PID_FILTER_BIT),
+ EVENT_FILE_FL_WAS_ENABLED = (1 << EVENT_FILE_FL_WAS_ENABLED_BIT),
++ EVENT_FILE_FL_FREED = (1 << EVENT_FILE_FL_FREED_BIT),
+ };
+
+ struct trace_event_file {
+@@ -671,6 +674,7 @@ struct trace_event_file {
+ * caching and such. Which is mostly OK ;-)
+ */
+ unsigned long flags;
++ atomic_t ref; /* ref count for opened files */
+ atomic_t sm_ref; /* soft-mode reference counter */
+ atomic_t tm_ref; /* trigger-mode reference counter */
+ };
+--- a/kernel/trace/trace.c
++++ b/kernel/trace/trace.c
+@@ -4986,6 +4986,20 @@ int tracing_open_file_tr(struct inode *i
+ if (ret)
+ return ret;
+
++ mutex_lock(&event_mutex);
++
++ /* Fail if the file is marked for removal */
++ if (file->flags & EVENT_FILE_FL_FREED) {
++ trace_array_put(file->tr);
++ ret = -ENODEV;
++ } else {
++ event_file_get(file);
++ }
++
++ mutex_unlock(&event_mutex);
++ if (ret)
++ return ret;
++
+ filp->private_data = inode->i_private;
+
+ return 0;
+@@ -4996,6 +5010,7 @@ int tracing_release_file_tr(struct inode
+ struct trace_event_file *file = inode->i_private;
+
+ trace_array_put(file->tr);
++ event_file_put(file);
+
+ return 0;
+ }
+--- a/kernel/trace/trace.h
++++ b/kernel/trace/trace.h
+@@ -1664,6 +1664,9 @@ extern void event_trigger_unregister(str
+ char *glob,
+ struct event_trigger_data *trigger_data);
+
++extern void event_file_get(struct trace_event_file *file);
++extern void event_file_put(struct trace_event_file *file);
++
+ /**
+ * struct event_trigger_ops - callbacks for trace event triggers
+ *
+--- a/kernel/trace/trace_events.c
++++ b/kernel/trace/trace_events.c
+@@ -990,13 +990,35 @@ static void remove_subsystem(struct trac
+ }
+ }
+
++void event_file_get(struct trace_event_file *file)
++{
++ atomic_inc(&file->ref);
++}
++
++void event_file_put(struct trace_event_file *file)
++{
++ if (WARN_ON_ONCE(!atomic_read(&file->ref))) {
++ if (file->flags & EVENT_FILE_FL_FREED)
++ kmem_cache_free(file_cachep, file);
++ return;
++ }
++
++ if (atomic_dec_and_test(&file->ref)) {
++ /* Count should only go to zero when it is freed */
++ if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED)))
++ return;
++ kmem_cache_free(file_cachep, file);
++ }
++}
++
+ static void remove_event_file_dir(struct trace_event_file *file)
+ {
+ eventfs_remove(file->ef);
+ list_del(&file->list);
+ remove_subsystem(file->system);
+ free_event_filter(file->filter);
+- kmem_cache_free(file_cachep, file);
++ file->flags |= EVENT_FILE_FL_FREED;
++ event_file_put(file);
+ }
+
+ /*
+@@ -1369,7 +1391,7 @@ event_enable_read(struct file *filp, cha
+ flags = file->flags;
+ mutex_unlock(&event_mutex);
+
+- if (!file)
++ if (!file || flags & EVENT_FILE_FL_FREED)
+ return -ENODEV;
+
+ if (flags & EVENT_FILE_FL_ENABLED &&
+@@ -1407,7 +1429,7 @@ event_enable_write(struct file *filp, co
+ ret = -ENODEV;
+ mutex_lock(&event_mutex);
+ file = event_file_data(filp);
+- if (likely(file))
++ if (likely(file && !(file->flags & EVENT_FILE_FL_FREED)))
+ ret = ftrace_event_enable_disable(file, val);
+ mutex_unlock(&event_mutex);
+ break;
+@@ -1681,7 +1703,7 @@ event_filter_read(struct file *filp, cha
+
+ mutex_lock(&event_mutex);
+ file = event_file_data(filp);
+- if (file)
++ if (file && !(file->flags & EVENT_FILE_FL_FREED))
+ print_event_filter(file, s);
+ mutex_unlock(&event_mutex);
+
+@@ -2803,6 +2825,7 @@ trace_create_new_event(struct trace_even
+ atomic_set(&file->tm_ref, 0);
+ INIT_LIST_HEAD(&file->triggers);
+ list_add(&file->list, &tr->events);
++ event_file_get(file);
+
+ return file;
+ }
+--- a/kernel/trace/trace_events_filter.c
++++ b/kernel/trace/trace_events_filter.c
+@@ -2349,6 +2349,9 @@ int apply_event_filter(struct trace_even
+ struct event_filter *filter = NULL;
+ int err;
+
++ if (file->flags & EVENT_FILE_FL_FREED)
++ return -ENODEV;
++
+ if (!strcmp(strstrip(filter_string), "0")) {
+ filter_disable(file);
+ filter = event_filter(file);