Merge branch 'for-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
author		Linus Torvalds <torvalds@linux-foundation.org>
		Fri, 3 Apr 2020 18:30:20 +0000 (11:30 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
		Fri, 3 Apr 2020 18:30:20 +0000 (11:30 -0700)
Pull cgroup updates from Tejun Heo:

 - Christian extended clone3 so that processes can be spawned into
   cgroups directly.

   This is not only neat in terms of semantics but also avoids grabbing
   the global cgroup_threadgroup_rwsem for migration (a usage sketch
   follows after this list).

 - Daniel added !root xattr support to cgroupfs.

   Userland already uses xattrs on cgroupfs for bookkeeping. This will
   allow delegated cgroups to support such usages (see the second
   sketch after this list).

 - Prateek tried to make cpuset hotplug handling synchronous but that
   led to possible deadlock scenarios. Reverted.

 - Other minor changes including release_agent_path handling cleanup.
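
  A minimal userspace sketch of what the clone3() extension enables
  (not part of this pull; the cgroup path is made up, error handling
  is trimmed, and the raw syscall is used since glibc has no clone3()
  wrapper):

  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <signal.h>
  #include <linux/sched.h>      /* struct clone_args, CLONE_INTO_CGROUP */
  #include <sys/syscall.h>
  #include <sys/wait.h>
  #include <unistd.h>

  int main(void)
  {
  	/* O_PATH fd of the target cgroup directory (example path) */
  	int cgroup_fd = open("/sys/fs/cgroup/mygroup",
  			     O_PATH | O_DIRECTORY);

  	struct clone_args args = {
  		.flags       = CLONE_INTO_CGROUP,
  		.exit_signal = SIGCHLD,
  		.cgroup      = (__u64)cgroup_fd,
  	};

  	/* the child starts life in the target cgroup; there is no
  	   migration step, so cgroup_threadgroup_rwsem is never taken
  	   for a move */
  	pid_t pid = syscall(SYS_clone3, &args, sizeof(args));
  	if (pid == 0) {
  		execlp("cat", "cat", "/proc/self/cgroup", (char *)NULL);
  		_exit(1);
  	}
  	waitpid(pid, NULL, 0);
  	return 0;
  }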
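
  And a sketch of the user xattr bookkeeping that a delegated cgroup
  can now carry (attribute name and path are illustrative; the point
  is that user.* xattrs on cgroupfs no longer require root):

  #include <stdio.h>
  #include <sys/types.h>
  #include <sys/xattr.h>

  int main(void)
  {
  	const char *cg = "/sys/fs/cgroup/user.slice/mygroup";
  	char buf[64];
  	ssize_t n;

  	/* stash manager-private bookkeeping data on the cgroup dir */
  	if (setxattr(cg, "user.manager_id", "agent-42", 8, 0) < 0) {
  		perror("setxattr");
  		return 1;
  	}

  	n = getxattr(cg, "user.manager_id", buf, sizeof(buf));
  	if (n > 0)
  		printf("user.manager_id = %.*s\n", (int)n, buf);
  	return 0;
  }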

* 'for-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  docs: cgroup-v1: Document the cpuset_v2_mode mount option
  Revert "cpuset: Make cpuset hotplug synchronous"
  cgroupfs: Support user xattrs
  kernfs: Add option to enable user xattrs
  kernfs: Add removed_size out param for simple_xattr_set
  kernfs: kvmalloc xattr value instead of kmalloc
  cgroup: Restructure release_agent_path handling
  selftests/cgroup: add tests for cloning into cgroups
  clone3: allow spawning processes into cgroups
  cgroup: add cgroup_may_write() helper
  cgroup: refactor fork helpers
  cgroup: add cgroup_get_from_file() helper
  cgroup: unify attach permission checking
  cpuset: Make cpuset hotplug synchronous
  cgroup.c: Use built-in RCU list checking
  kselftest/cgroup: add cgroup destruction test
  cgroup: Clean up css_set task traversal

include/linux/cgroup-defs.h
kernel/cgroup/cgroup.c
kernel/fork.c
mm/shmem.c

diff --combined include/linux/cgroup-defs.h
index e1fafed22db12f23b0e12e9b368d181b85c74627,68c391f451d156c1add8ef5d9e0abe0b6d37c101..52661155f85fd935f930c9fe1c3990257c8302a1
@@@ -94,11 -94,6 +94,11 @@@ enum 
         * Enable legacy local memory.events.
         */
        CGRP_ROOT_MEMORY_LOCAL_EVENTS = (1 << 5),
 +
 +      /*
 +       * Enable recursive subtree protection
 +       */
 +      CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 6),
  };
  
  /* cftype->flags */
@@@ -633,8 -628,9 +633,9 @@@ struct cgroup_subsys 
        void (*cancel_attach)(struct cgroup_taskset *tset);
        void (*attach)(struct cgroup_taskset *tset);
        void (*post_attach)(void);
-       int (*can_fork)(struct task_struct *task);
-       void (*cancel_fork)(struct task_struct *task);
+       int (*can_fork)(struct task_struct *task,
+                       struct css_set *cset);
+       void (*cancel_fork)(struct task_struct *task, struct css_set *cset);
        void (*fork)(struct task_struct *task);
        void (*exit)(struct task_struct *task);
        void (*release)(struct task_struct *task);
diff --combined kernel/cgroup/cgroup.c
index 755c07d845ce184676ea35e2f133e5ba0eeb6599,33ff9ec4a5232f6976e6ed18b5bbd6bf7e20763c..06b5ea9d899d81d020fbd51ce8d6fdf1a75e169f
@@@ -1813,14 -1813,12 +1813,14 @@@ int cgroup_show_path(struct seq_file *s
  enum cgroup2_param {
        Opt_nsdelegate,
        Opt_memory_localevents,
 +      Opt_memory_recursiveprot,
        nr__cgroup2_params
  };
  
  static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
        fsparam_flag("nsdelegate",              Opt_nsdelegate),
        fsparam_flag("memory_localevents",      Opt_memory_localevents),
 +      fsparam_flag("memory_recursiveprot",    Opt_memory_recursiveprot),
        {}
  };
  
@@@ -1841,9 -1839,6 +1841,9 @@@ static int cgroup2_parse_param(struct f
        case Opt_memory_localevents:
                ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
                return 0;
 +      case Opt_memory_recursiveprot:
 +              ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
 +              return 0;
        }
        return -EINVAL;
  }
@@@ -1860,11 -1855,6 +1860,11 @@@ static void apply_cgroup_root_flags(uns
                        cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
                else
                        cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
 +
 +              if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
 +                      cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
 +              else
 +                      cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
        }
  }
  
@@@ -1874,8 -1864,6 +1874,8 @@@ static int cgroup_show_options(struct s
                seq_puts(seq, ",nsdelegate");
        if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
                seq_puts(seq, ",memory_localevents");
 +      if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
 +              seq_puts(seq, ",memory_recursiveprot");
        return 0;
  }
  
@@@ -1966,7 -1954,8 +1966,8 @@@ int cgroup_setup_root(struct cgroup_roo
  
        root->kf_root = kernfs_create_root(kf_sops,
                                           KERNFS_ROOT_CREATE_DEACTIVATED |
-                                          KERNFS_ROOT_SUPPORT_EXPORTOP,
+                                          KERNFS_ROOT_SUPPORT_EXPORTOP |
+                                          KERNFS_ROOT_SUPPORT_USER_XATTR,
                                           root_cgrp);
        if (IS_ERR(root->kf_root)) {
                ret = PTR_ERR(root->kf_root);
@@@ -2726,11 -2715,7 +2727,7 @@@ int cgroup_attach_task(struct cgroup *d
  {
        DEFINE_CGROUP_MGCTX(mgctx);
        struct task_struct *task;
-       int ret;
-       ret = cgroup_migrate_vet_dst(dst_cgrp);
-       if (ret)
-               return ret;
+       int ret = 0;
  
        /* look up all src csets */
        spin_lock_irq(&css_set_lock);
@@@ -4160,7 -4145,8 +4157,8 @@@ struct cgroup_subsys_state *css_next_ch
        } else if (likely(!(pos->flags & CSS_RELEASED))) {
                next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
        } else {
-               list_for_each_entry_rcu(next, &parent->children, sibling)
+               list_for_each_entry_rcu(next, &parent->children, sibling,
+                                       lockdep_is_held(&cgroup_mutex))
                        if (next->serial_nr > pos->serial_nr)
                                break;
        }
@@@ -4403,29 -4389,24 +4401,24 @@@ static void css_task_iter_advance_css_s
  
        lockdep_assert_held(&css_set_lock);
  
-       /* Advance to the next non-empty css_set */
-       do {
-               cset = css_task_iter_next_css_set(it);
-               if (!cset) {
-                       it->task_pos = NULL;
-                       return;
 +      /* Advance to the next non-empty css_set and find first non-empty tasks list */
+       while ((cset = css_task_iter_next_css_set(it))) {
+               if (!list_empty(&cset->tasks)) {
+                       it->cur_tasks_head = &cset->tasks;
+                       break;
+               } else if (!list_empty(&cset->mg_tasks)) {
+                       it->cur_tasks_head = &cset->mg_tasks;
+                       break;
+               } else if (!list_empty(&cset->dying_tasks)) {
+                       it->cur_tasks_head = &cset->dying_tasks;
+                       break;
                }
-       } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
-       if (!list_empty(&cset->tasks)) {
-               it->task_pos = cset->tasks.next;
-               it->cur_tasks_head = &cset->tasks;
-       } else if (!list_empty(&cset->mg_tasks)) {
-               it->task_pos = cset->mg_tasks.next;
-               it->cur_tasks_head = &cset->mg_tasks;
-       } else {
-               it->task_pos = cset->dying_tasks.next;
-               it->cur_tasks_head = &cset->dying_tasks;
        }
-       it->tasks_head = &cset->tasks;
-       it->mg_tasks_head = &cset->mg_tasks;
-       it->dying_tasks_head = &cset->dying_tasks;
+       if (!cset) {
+               it->task_pos = NULL;
+               return;
+       }
+       it->task_pos = it->cur_tasks_head->next;
  
        /*
         * We don't keep css_sets locked across iteration steps and thus
@@@ -4470,24 -4451,24 +4463,24 @@@ static void css_task_iter_advance(struc
  repeat:
        if (it->task_pos) {
                /*
-                * Advance iterator to find next entry.  cset->tasks is
-                * consumed first and then ->mg_tasks.  After ->mg_tasks,
-                * we move onto the next cset.
+                * Advance iterator to find next entry. We go through cset
+                * tasks, mg_tasks and dying_tasks; when consumed we move onto
+                * the next cset.
                 */
                if (it->flags & CSS_TASK_ITER_SKIPPED)
                        it->flags &= ~CSS_TASK_ITER_SKIPPED;
                else
                        it->task_pos = it->task_pos->next;
  
-               if (it->task_pos == it->tasks_head) {
-                       it->task_pos = it->mg_tasks_head->next;
-                       it->cur_tasks_head = it->mg_tasks_head;
+               if (it->task_pos == &it->cur_cset->tasks) {
+                       it->cur_tasks_head = &it->cur_cset->mg_tasks;
+                       it->task_pos = it->cur_tasks_head->next;
                }
-               if (it->task_pos == it->mg_tasks_head) {
-                       it->task_pos = it->dying_tasks_head->next;
-                       it->cur_tasks_head = it->dying_tasks_head;
+               if (it->task_pos == &it->cur_cset->mg_tasks) {
+                       it->cur_tasks_head = &it->cur_cset->dying_tasks;
+                       it->task_pos = it->cur_tasks_head->next;
                }
-               if (it->task_pos == it->dying_tasks_head)
+               if (it->task_pos == &it->cur_cset->dying_tasks)
                        css_task_iter_advance_css_set(it);
        } else {
                /* called from start, proceed to the first cset */
                        goto repeat;
  
                /* and dying leaders w/o live member threads */
-               if (it->cur_tasks_head == it->dying_tasks_head &&
+               if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
                    !atomic_read(&task->signal->live))
                        goto repeat;
        } else {
                /* skip all dying ones */
-               if (it->cur_tasks_head == it->dying_tasks_head)
+               if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
                        goto repeat;
        }
  }
@@@ -4674,13 -4655,28 +4667,28 @@@ static int cgroup_procs_show(struct seq
        return 0;
  }
  
+ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
+ {
+       int ret;
+       struct inode *inode;
+       lockdep_assert_held(&cgroup_mutex);
+       inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
+       if (!inode)
+               return -ENOMEM;
+       ret = inode_permission(inode, MAY_WRITE);
+       iput(inode);
+       return ret;
+ }
+ 
  static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
                                         struct cgroup *dst_cgrp,
                                         struct super_block *sb)
  {
        struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
        struct cgroup *com_cgrp = src_cgrp;
-       struct inode *inode;
        int ret;
  
        lockdep_assert_held(&cgroup_mutex);
                com_cgrp = cgroup_parent(com_cgrp);
  
        /* %current should be authorized to migrate to the common ancestor */
-       inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
-       if (!inode)
-               return -ENOMEM;
-       ret = inode_permission(inode, MAY_WRITE);
-       iput(inode);
+       ret = cgroup_may_write(com_cgrp, sb);
        if (ret)
                return ret;
  
        return 0;
  }
  
+ static int cgroup_attach_permissions(struct cgroup *src_cgrp,
+                                    struct cgroup *dst_cgrp,
+                                    struct super_block *sb, bool threadgroup)
+ {
+       int ret = 0;
+       ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb);
+       if (ret)
+               return ret;
+       ret = cgroup_migrate_vet_dst(dst_cgrp);
+       if (ret)
+               return ret;
+       if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
+               ret = -EOPNOTSUPP;
+       return ret;
+ }
+ 
  static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
                                  char *buf, size_t nbytes, loff_t off)
  {
        src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
        spin_unlock_irq(&css_set_lock);
  
-       ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
-                                           of->file->f_path.dentry->d_sb);
+       ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
+                                       of->file->f_path.dentry->d_sb, true);
        if (ret)
                goto out_finish;
  
@@@ -4778,16 -4789,11 +4801,11 @@@ static ssize_t cgroup_threads_write(str
        spin_unlock_irq(&css_set_lock);
  
        /* thread migrations follow the cgroup.procs delegation rule */
-       ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
-                                           of->file->f_path.dentry->d_sb);
+       ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
+                                       of->file->f_path.dentry->d_sb, false);
        if (ret)
                goto out_finish;
  
-       /* and must be contained in the same domain */
-       ret = -EOPNOTSUPP;
-       if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
-               goto out_finish;
        ret = cgroup_attach_task(dst_cgrp, task, false);
  
  out_finish:
@@@ -5876,8 -5882,7 +5894,7 @@@ out
   * @child: pointer to task_struct of forking parent process.
   *
   * A task is associated with the init_css_set until cgroup_post_fork()
-  * attaches it to the parent's css_set.  Empty cg_list indicates that
-  * @child isn't holding reference to its css_set.
+  * attaches it to the target css_set.
   */
  void cgroup_fork(struct task_struct *child)
  {
        INIT_LIST_HEAD(&child->cg_list);
  }
  
+ static struct cgroup *cgroup_get_from_file(struct file *f)
+ {
+       struct cgroup_subsys_state *css;
+       struct cgroup *cgrp;
+       css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
+       if (IS_ERR(css))
+               return ERR_CAST(css);
+       cgrp = css->cgroup;
+       if (!cgroup_on_dfl(cgrp)) {
+               cgroup_put(cgrp);
+               return ERR_PTR(-EBADF);
+       }
+       return cgrp;
+ }
+ 
+ /**
+  * cgroup_css_set_fork - find or create a css_set for a child process
+  * @kargs: the arguments passed to create the child process
+  *
+  * This function finds or creates a new css_set which the child
+  * process will be attached to in cgroup_post_fork(). By default,
+  * the child process will be given the same css_set as its parent.
+  *
+  * If CLONE_INTO_CGROUP is specified this function will try to find an
+  * existing css_set which includes the requested cgroup and if not create
+  * a new css_set that the child will be attached to later. If this function
+  * succeeds it will hold cgroup_threadgroup_rwsem on return. If
+  * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
+  * before grabbing cgroup_threadgroup_rwsem and will hold a reference
+  * to the target cgroup.
+  */
+ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
+       __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
+ {
+       int ret;
+       struct cgroup *dst_cgrp = NULL;
+       struct css_set *cset;
+       struct super_block *sb;
+       struct file *f;
+       if (kargs->flags & CLONE_INTO_CGROUP)
+               mutex_lock(&cgroup_mutex);
+       cgroup_threadgroup_change_begin(current);
+       spin_lock_irq(&css_set_lock);
+       cset = task_css_set(current);
+       get_css_set(cset);
+       spin_unlock_irq(&css_set_lock);
+       if (!(kargs->flags & CLONE_INTO_CGROUP)) {
+               kargs->cset = cset;
+               return 0;
+       }
+       f = fget_raw(kargs->cgroup);
+       if (!f) {
+               ret = -EBADF;
+               goto err;
+       }
+       sb = f->f_path.dentry->d_sb;
+       dst_cgrp = cgroup_get_from_file(f);
+       if (IS_ERR(dst_cgrp)) {
+               ret = PTR_ERR(dst_cgrp);
+               dst_cgrp = NULL;
+               goto err;
+       }
+       if (cgroup_is_dead(dst_cgrp)) {
+               ret = -ENODEV;
+               goto err;
+       }
+       /*
+        * Verify that the target cgroup is writable for us. This is
+        * usually done by the vfs layer but since we're not going through
+        * the vfs layer here we need to do it "manually".
+        */
+       ret = cgroup_may_write(dst_cgrp, sb);
+       if (ret)
+               goto err;
+       ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
+                                       !(kargs->flags & CLONE_THREAD));
+       if (ret)
+               goto err;
+       kargs->cset = find_css_set(cset, dst_cgrp);
+       if (!kargs->cset) {
+               ret = -ENOMEM;
+               goto err;
+       }
+       put_css_set(cset);
+       fput(f);
+       kargs->cgrp = dst_cgrp;
+       return ret;
+ err:
+       cgroup_threadgroup_change_end(current);
+       mutex_unlock(&cgroup_mutex);
+       if (f)
+               fput(f);
+       if (dst_cgrp)
+               cgroup_put(dst_cgrp);
+       put_css_set(cset);
+       if (kargs->cset)
+               put_css_set(kargs->cset);
+       return ret;
+ }
+ 
+ /**
+  * cgroup_css_set_put_fork - drop references we took during fork
+  * @kargs: the arguments passed to create the child process
+  *
+  * Drop references to the prepared css_set and target cgroup if
+  * CLONE_INTO_CGROUP was requested.
+  */
+ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
+       __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
+ {
+       cgroup_threadgroup_change_end(current);
+       if (kargs->flags & CLONE_INTO_CGROUP) {
+               struct cgroup *cgrp = kargs->cgrp;
+               struct css_set *cset = kargs->cset;
+               mutex_unlock(&cgroup_mutex);
+               if (cset) {
+                       put_css_set(cset);
+                       kargs->cset = NULL;
+               }
+               if (cgrp) {
+                       cgroup_put(cgrp);
+                       kargs->cgrp = NULL;
+               }
+       }
+ }
+ 
  /**
   * cgroup_can_fork - called on a new task before the process is exposed
-  * @child: the task in question.
+  * @child: the child process
   *
-  * This calls the subsystem can_fork() callbacks. If the can_fork() callback
-  * returns an error, the fork aborts with that error code. This allows for
-  * a cgroup subsystem to conditionally allow or deny new forks.
+  * This prepares a new css_set for the child process which the child will
+  * be attached to in cgroup_post_fork().
+  * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork()
+  * callback returns an error, the fork aborts with that error code. This
+  * allows for a cgroup subsystem to conditionally allow or deny new forks.
   */
- int cgroup_can_fork(struct task_struct *child)
+ int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
  {
        struct cgroup_subsys *ss;
        int i, j, ret;
  
+       ret = cgroup_css_set_fork(kargs);
+       if (ret)
+               return ret;
        do_each_subsys_mask(ss, i, have_canfork_callback) {
-               ret = ss->can_fork(child);
+               ret = ss->can_fork(child, kargs->cset);
                if (ret)
                        goto out_revert;
        } while_each_subsys_mask();
@@@ -5911,54 -6067,64 +6079,64 @@@ out_revert
                if (j >= i)
                        break;
                if (ss->cancel_fork)
-                       ss->cancel_fork(child);
+                       ss->cancel_fork(child, kargs->cset);
        }
  
+       cgroup_css_set_put_fork(kargs);
        return ret;
  }
  
  /**
   * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
-  * @child: the task in question
+  * @child: the child process
+  * @kargs: the arguments passed to create the child process
   *
   * This calls the cancel_fork() callbacks if a fork failed *after*
-  * cgroup_can_fork() succeded.
+  * cgroup_can_fork() succeeded and cleans up references we took to
+  * prepare a new css_set for the child process in cgroup_can_fork().
   */
- void cgroup_cancel_fork(struct task_struct *child)
+ void cgroup_cancel_fork(struct task_struct *child,
+                       struct kernel_clone_args *kargs)
  {
        struct cgroup_subsys *ss;
        int i;
  
        for_each_subsys(ss, i)
                if (ss->cancel_fork)
-                       ss->cancel_fork(child);
+                       ss->cancel_fork(child, kargs->cset);
+       cgroup_css_set_put_fork(kargs);
  }
  
  /**
-  * cgroup_post_fork - called on a new task after adding it to the task list
-  * @child: the task in question
-  *
-  * Adds the task to the list running through its css_set if necessary and
-  * call the subsystem fork() callbacks.  Has to be after the task is
-  * visible on the task list in case we race with the first call to
-  * cgroup_task_iter_start() - to guarantee that the new task ends up on its
-  * list.
+  * cgroup_post_fork - finalize cgroup setup for the child process
+  * @child: the child process
+  *
+  * Attach the child process to its css_set calling the subsystem fork()
+  * callbacks.
   */
- void cgroup_post_fork(struct task_struct *child)
+ void cgroup_post_fork(struct task_struct *child,
+                     struct kernel_clone_args *kargs)
+       __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
  {
        struct cgroup_subsys *ss;
        struct css_set *cset;
        int i;
  
+       cset = kargs->cset;
+       kargs->cset = NULL;
        spin_lock_irq(&css_set_lock);
  
        /* init tasks are special, only link regular threads */
        if (likely(child->pid)) {
                WARN_ON_ONCE(!list_empty(&child->cg_list));
-               cset = task_css_set(current); /* current is @child's parent */
-               get_css_set(cset);
                cset->nr_tasks++;
                css_set_move_task(child, NULL, cset, false);
+       } else {
+               put_css_set(cset);
+               cset = NULL;
        }
  
        /*
        do_each_subsys_mask(ss, i, have_fork_callback) {
                ss->fork(child);
        } while_each_subsys_mask();
+       /* Make the new cset the root_cset of the new cgroup namespace. */
+       if (kargs->flags & CLONE_NEWCGROUP) {
+               struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
+               get_css_set(cset);
+               child->nsproxy->cgroup_ns->root_cset = cset;
+               put_css_set(rcset);
+       }
+       cgroup_css_set_put_fork(kargs);
  }
  
  /**
@@@ -6176,7 -6353,6 +6365,6 @@@ EXPORT_SYMBOL_GPL(cgroup_get_from_path)
   */
  struct cgroup *cgroup_get_from_fd(int fd)
  {
-       struct cgroup_subsys_state *css;
        struct cgroup *cgrp;
        struct file *f;
  
        f = fget_raw(fd);
        if (!f)
                return ERR_PTR(-EBADF);
  
-       css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
+       cgrp = cgroup_get_from_file(f);
        fput(f);
-       if (IS_ERR(css))
-               return ERR_CAST(css);
-       cgrp = css->cgroup;
-       if (!cgroup_on_dfl(cgrp)) {
-               cgroup_put(cgrp);
-               return ERR_PTR(-EBADF);
-       }
        return cgrp;
  }
  EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
@@@ -6283,10 -6450,6 +6462,10 @@@ void cgroup_sk_alloc(struct sock_cgroup
                return;
        }
  
 +      /* Don't associate the sock with unrelated interrupted task's cgroup. */
 +      if (in_interrupt())
 +              return;
 +
        rcu_read_lock();
  
        while (true) {
@@@ -6315,58 -6478,27 +6494,58 @@@ void cgroup_sk_free(struct sock_cgroup_
  #endif        /* CONFIG_SOCK_CGROUP_DATA */
  
  #ifdef CONFIG_CGROUP_BPF
 -int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 -                    struct bpf_prog *replace_prog, enum bpf_attach_type type,
 +int cgroup_bpf_attach(struct cgroup *cgrp,
 +                    struct bpf_prog *prog, struct bpf_prog *replace_prog,
 +                    struct bpf_cgroup_link *link,
 +                    enum bpf_attach_type type,
                      u32 flags)
  {
        int ret;
  
        mutex_lock(&cgroup_mutex);
 -      ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, type, flags);
 +      ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
 +      mutex_unlock(&cgroup_mutex);
 +      return ret;
 +}
 +
 +int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *old_prog,
 +                     struct bpf_prog *new_prog)
 +{
 +      struct bpf_cgroup_link *cg_link;
 +      int ret;
 +
 +      if (link->ops != &bpf_cgroup_link_lops)
 +              return -EINVAL;
 +
 +      cg_link = container_of(link, struct bpf_cgroup_link, link);
 +
 +      mutex_lock(&cgroup_mutex);
 +      /* link might have been auto-released by dying cgroup, so fail */
 +      if (!cg_link->cgroup) {
 +              ret = -EINVAL;
 +              goto out_unlock;
 +      }
 +      if (old_prog && link->prog != old_prog) {
 +              ret = -EPERM;
 +              goto out_unlock;
 +      }
 +      ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
 +out_unlock:
        mutex_unlock(&cgroup_mutex);
        return ret;
  }
 +
  int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 -                    enum bpf_attach_type type, u32 flags)
 +                    enum bpf_attach_type type)
  {
        int ret;
  
        mutex_lock(&cgroup_mutex);
 -      ret = __cgroup_bpf_detach(cgrp, prog, type);
 +      ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
        mutex_unlock(&cgroup_mutex);
        return ret;
  }
 +
  int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
                     union bpf_attr __user *uattr)
  {
@@@ -6424,10 -6556,7 +6603,10 @@@ static struct kobj_attribute cgroup_del
  static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
                             char *buf)
  {
 -      return snprintf(buf, PAGE_SIZE, "nsdelegate\nmemory_localevents\n");
 +      return snprintf(buf, PAGE_SIZE,
 +                      "nsdelegate\n"
 +                      "memory_localevents\n"
 +                      "memory_recursiveprot\n");
  }
  static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
  
diff --combined kernel/fork.c
index ba122d6f5127386a2c532b5679dde19a30669af1,635d6369dfb904dd7e0498c2ff8e29ea3a539548..d2a967bf85d5eed4ccad59e51dbc89da332bffeb
@@@ -281,7 -281,7 +281,7 @@@ static inline void free_thread_stack(st
                                             MEMCG_KERNEL_STACK_KB,
                                             -(int)(PAGE_SIZE / 1024));
  
 -                      memcg_kmem_uncharge(vm->pages[i], 0);
 +                      memcg_kmem_uncharge_page(vm->pages[i], 0);
                }
  
                for (i = 0; i < NR_CACHED_STACKS; i++) {
@@@ -397,8 -397,8 +397,8 @@@ static void account_kernel_stack(struc
                mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
                                    THREAD_SIZE / 1024 * account);
  
 -              mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB,
 -                                   account * (THREAD_SIZE / 1024));
 +              mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB,
 +                                  account * (THREAD_SIZE / 1024));
        }
  }
  
@@@ -413,13 -413,12 +413,13 @@@ static int memcg_charge_kernel_stack(st
  
                for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
                        /*
 -                       * If memcg_kmem_charge() fails, page->mem_cgroup
 -                       * pointer is NULL, and both memcg_kmem_uncharge()
 +                       * If memcg_kmem_charge_page() fails, page->mem_cgroup
 +                       * pointer is NULL, and both memcg_kmem_uncharge_page()
                         * and mod_memcg_page_state() in free_thread_stack()
                         * will ignore this page. So it's safe.
                         */
 -                      ret = memcg_kmem_charge(vm->pages[i], GFP_KERNEL, 0);
 +                      ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
 +                                                   0);
                        if (ret)
                                return ret;
  
@@@ -1225,7 -1224,7 +1225,7 @@@ struct mm_struct *mm_access(struct task
        struct mm_struct *mm;
        int err;
  
 -      err =  mutex_lock_killable(&task->signal->cred_guard_mutex);
 +      err =  mutex_lock_killable(&task->signal->exec_update_mutex);
        if (err)
                return ERR_PTR(err);
  
                mmput(mm);
                mm = ERR_PTR(-EACCES);
        }
 -      mutex_unlock(&task->signal->cred_guard_mutex);
 +      mutex_unlock(&task->signal->exec_update_mutex);
  
        return mm;
  }
@@@ -1509,7 -1508,7 +1509,7 @@@ static int copy_sighand(unsigned long c
                return 0;
        }
        sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
 -      rcu_assign_pointer(tsk->sighand, sig);
 +      RCU_INIT_POINTER(tsk->sighand, sig);
        if (!sig)
                return -ENOMEM;
  
@@@ -1595,7 -1594,6 +1595,7 @@@ static int copy_signal(unsigned long cl
        sig->oom_score_adj_min = current->signal->oom_score_adj_min;
  
        mutex_init(&sig->cred_guard_mutex);
 +      mutex_init(&sig->exec_update_mutex);
  
        return 0;
  }
@@@ -2176,16 -2174,15 +2176,15 @@@ static __latent_entropy struct task_str
        INIT_LIST_HEAD(&p->thread_group);
        p->task_works = NULL;
  
-       cgroup_threadgroup_change_begin(current);
        /*
         * Ensure that the cgroup subsystem policies allow the new process to be
         * forked. It should be noted that the new process's css_set can be changed
         * between here and cgroup_post_fork() if an organisation operation is in
         * progress.
         */
-       retval = cgroup_can_fork(p);
+       retval = cgroup_can_fork(p, args);
        if (retval)
-               goto bad_fork_cgroup_threadgroup_change_end;
+               goto bad_fork_put_pidfd;
  
        /*
         * From this point on we must avoid any synchronous user-space
        write_unlock_irq(&tasklist_lock);
  
        proc_fork_connector(p);
-       cgroup_post_fork(p);
-       cgroup_threadgroup_change_end(current);
+       cgroup_post_fork(p, args);
        perf_event_fork(p);
  
        trace_task_newtask(p, clone_flags);
  bad_fork_cancel_cgroup:
        spin_unlock(&current->sighand->siglock);
        write_unlock_irq(&tasklist_lock);
-       cgroup_cancel_fork(p);
- bad_fork_cgroup_threadgroup_change_end:
-       cgroup_threadgroup_change_end(current);
+       cgroup_cancel_fork(p, args);
  bad_fork_put_pidfd:
        if (clone_flags & CLONE_PIDFD) {
                fput(pidfile);
@@@ -2633,6 -2627,9 +2629,9 @@@ noinline static int copy_clone_args_fro
                     !valid_signal(args.exit_signal)))
                return -EINVAL;
  
+       if ((args.flags & CLONE_INTO_CGROUP) && args.cgroup < 0)
+               return -EINVAL;
        *kargs = (struct kernel_clone_args){
                .flags          = args.flags,
                .pidfd          = u64_to_user_ptr(args.pidfd),
                .stack_size     = args.stack_size,
                .tls            = args.tls,
                .set_tid_size   = args.set_tid_size,
+               .cgroup         = args.cgroup,
        };
  
        if (args.set_tid &&
@@@ -2686,7 -2684,8 +2686,8 @@@ static inline bool clone3_stack_valid(s
  static bool clone3_args_valid(struct kernel_clone_args *kargs)
  {
        /* Verify that no unknown flags are passed along. */
-       if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND))
+       if (kargs->flags &
+           ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
                return false;
  
        /*
diff --combined mm/shmem.c
index aad3ba74b0e9d121bd774e795460d5f2f73ac119,e6a7549faf20594c1a1c31b0c0fa84c20a675979..f47347cb30f6b0b41684a549508b9d4972aec378
@@@ -3243,7 -3243,7 +3243,7 @@@ static int shmem_xattr_handler_set(cons
        struct shmem_inode_info *info = SHMEM_I(inode);
  
        name = xattr_full_name(handler, name);
-       return simple_xattr_set(&info->xattrs, name, value, size, flags);
+       return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
  }
  
  static const struct xattr_handler shmem_security_xattr_handler = {
@@@ -3386,6 -3386,8 +3386,6 @@@ static const struct constant_table shme
        {"always",      SHMEM_HUGE_ALWAYS },
        {"within_size", SHMEM_HUGE_WITHIN_SIZE },
        {"advise",      SHMEM_HUGE_ADVISE },
 -      {"deny",        SHMEM_HUGE_DENY },
 -      {"force",       SHMEM_HUGE_FORCE },
        {}
  };