Merge branch 'for-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
author		Linus Torvalds <torvalds@linux-foundation.org>
		Fri, 3 Apr 2020 18:30:20 +0000 (11:30 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
		Fri, 3 Apr 2020 18:30:20 +0000 (11:30 -0700)
Pull cgroup updates from Tejun Heo:

 - Christian extended clone3 so that processes can be spawned into
   cgroups directly.

   This is not only neat in terms of semantics but also avoids grabbing
   the global cgroup_threadgroup_rwsem for migration (a usage sketch
   follows after this list).

 - Daniel added !root xattr support to cgroupfs.

   Userland already uses xattrs on cgroupfs for bookkeeping. This will
   allow delegated cgroups to support such usages (see the second
   sketch after this list).

 - Prateek tried to make cpuset hotplug handling synchronous but that
   led to possible deadlock scenarios. Reverted.

 - Other minor changes including release_agent_path handling cleanup.
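
  A minimal userspace sketch of what the clone3() extension enables
  (not part of this pull; the cgroup path is made up, error handling
  is trimmed, and the raw syscall is used since glibc has no clone3()
  wrapper):

  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <signal.h>
  #include <linux/sched.h>      /* struct clone_args, CLONE_INTO_CGROUP */
  #include <sys/syscall.h>
  #include <sys/wait.h>
  #include <unistd.h>

  int main(void)
  {
  	/* O_PATH fd of the target cgroup directory (example path) */
  	int cgroup_fd = open("/sys/fs/cgroup/mygroup",
  			     O_PATH | O_DIRECTORY);

  	struct clone_args args = {
  		.flags       = CLONE_INTO_CGROUP,
  		.exit_signal = SIGCHLD,
  		.cgroup      = (__u64)cgroup_fd,
  	};

  	/* the child starts life in the target cgroup; there is no
  	   migration step, so cgroup_threadgroup_rwsem is never taken
  	   for a move */
  	pid_t pid = syscall(SYS_clone3, &args, sizeof(args));
  	if (pid == 0) {
  		execlp("cat", "cat", "/proc/self/cgroup", (char *)NULL);
  		_exit(1);
  	}
  	waitpid(pid, NULL, 0);
  	return 0;
  }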
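
  And a sketch of the user xattr bookkeeping that a delegated cgroup
  can now carry (attribute name and path are illustrative; the point
  is that user.* xattrs on cgroupfs no longer require root):

  #include <stdio.h>
  #include <sys/types.h>
  #include <sys/xattr.h>

  int main(void)
  {
  	const char *cg = "/sys/fs/cgroup/user.slice/mygroup";
  	char buf[64];
  	ssize_t n;

  	/* stash manager-private bookkeeping data on the cgroup dir */
  	if (setxattr(cg, "user.manager_id", "agent-42", 8, 0) < 0) {
  		perror("setxattr");
  		return 1;
  	}

  	n = getxattr(cg, "user.manager_id", buf, sizeof(buf));
  	if (n > 0)
  		printf("user.manager_id = %.*s\n", (int)n, buf);
  	return 0;
  }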

* 'for-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  docs: cgroup-v1: Document the cpuset_v2_mode mount option
  Revert "cpuset: Make cpuset hotplug synchronous"
  cgroupfs: Support user xattrs
  kernfs: Add option to enable user xattrs
  kernfs: Add removed_size out param for simple_xattr_set
  kernfs: kvmalloc xattr value instead of kmalloc
  cgroup: Restructure release_agent_path handling
  selftests/cgroup: add tests for cloning into cgroups
  clone3: allow spawning processes into cgroups
  cgroup: add cgroup_may_write() helper
  cgroup: refactor fork helpers
  cgroup: add cgroup_get_from_file() helper
  cgroup: unify attach permission checking
  cpuset: Make cpuset hotplug synchronous
  cgroup.c: Use built-in RCU list checking
  kselftest/cgroup: add cgroup destruction test
  cgroup: Clean up css_set task traversal

include/linux/cgroup-defs.h
kernel/cgroup/cgroup.c
kernel/fork.c
mm/shmem.c

diff --combined include/linux/cgroup-defs.h
index e1fafed22db12f23b0e12e9b368d181b85c74627,68c391f451d156c1add8ef5d9e0abe0b6d37c101..52661155f85fd935f930c9fe1c3990257c8302a1
@@@ -94,11 -94,6 +94,11 @@@ enum 
         * Enable legacy local memory.events.
         */
        CGRP_ROOT_MEMORY_LOCAL_EVENTS = (1 << 5),
 +
 +      /*
 +       * Enable recursive subtree protection
 +       */
 +      CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 6),
  };
  
  /* cftype->flags */
@@@ -633,8 -628,9 +633,9 @@@ struct cgroup_subsys 
        void (*cancel_attach)(struct cgroup_taskset *tset);
        void (*attach)(struct cgroup_taskset *tset);
        void (*post_attach)(void);
-       int (*can_fork)(struct task_struct *task);
-       void (*cancel_fork)(struct task_struct *task);
+       int (*can_fork)(struct task_struct *task,
+                       struct css_set *cset);
+       void (*cancel_fork)(struct task_struct *task, struct css_set *cset);
        void (*fork)(struct task_struct *task);
        void (*exit)(struct task_struct *task);
        void (*release)(struct task_struct *task);
diff --combined kernel/cgroup/cgroup.c
index 755c07d845ce184676ea35e2f133e5ba0eeb6599,33ff9ec4a5232f6976e6ed18b5bbd6bf7e20763c..06b5ea9d899d81d020fbd51ce8d6fdf1a75e169f
@@@ -1813,14 -1813,12 +1813,14 @@@ int cgroup_show_path(struct seq_file *s
  enum cgroup2_param {
        Opt_nsdelegate,
        Opt_memory_localevents,
 +      Opt_memory_recursiveprot,
        nr__cgroup2_params
  };
  
  static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
        fsparam_flag("nsdelegate",              Opt_nsdelegate),
        fsparam_flag("memory_localevents",      Opt_memory_localevents),
 +      fsparam_flag("memory_recursiveprot",    Opt_memory_recursiveprot),
        {}
  };
  
@@@ -1841,9 -1839,6 +1841,9 @@@ static int cgroup2_parse_param(struct f
        case Opt_memory_localevents:
                ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
                return 0;
 +      case Opt_memory_recursiveprot:
 +              ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
 +              return 0;
        }
        return -EINVAL;
  }
@@@ -1860,11 -1855,6 +1860,11 @@@ static void apply_cgroup_root_flags(uns
                        cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
                else
                        cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
 +
 +              if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
 +                      cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
 +              else
 +                      cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
        }
  }
  
@@@ -1874,8 -1864,6 +1874,8 @@@ static int cgroup_show_options(struct s
                seq_puts(seq, ",nsdelegate");
        if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
                seq_puts(seq, ",memory_localevents");
 +      if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
 +              seq_puts(seq, ",memory_recursiveprot");
        return 0;
  }
  
@@@ -1966,7 -1954,8 +1966,8 @@@ int cgroup_setup_root(struct cgroup_roo
  
        root->kf_root = kernfs_create_root(kf_sops,
                                           KERNFS_ROOT_CREATE_DEACTIVATED |
-                                          KERNFS_ROOT_SUPPORT_EXPORTOP,
+                                          KERNFS_ROOT_SUPPORT_EXPORTOP |
+                                          KERNFS_ROOT_SUPPORT_USER_XATTR,
                                           root_cgrp);
        if (IS_ERR(root->kf_root)) {
                ret = PTR_ERR(root->kf_root);
@@@ -2726,11 -2715,7 +2727,7 @@@ int cgroup_attach_task(struct cgroup *d
  {
        DEFINE_CGROUP_MGCTX(mgctx);
        struct task_struct *task;
-       int ret;
-       ret = cgroup_migrate_vet_dst(dst_cgrp);
-       if (ret)
-               return ret;
+       int ret = 0;
  
        /* look up all src csets */
        spin_lock_irq(&css_set_lock);
@@@ -4160,7 -4145,8 +4157,8 @@@ struct cgroup_subsys_state *css_next_ch
        } else if (likely(!(pos->flags & CSS_RELEASED))) {
                next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
        } else {
-               list_for_each_entry_rcu(next, &parent->children, sibling)
+               list_for_each_entry_rcu(next, &parent->children, sibling,
+                                       lockdep_is_held(&cgroup_mutex))
                        if (next->serial_nr > pos->serial_nr)
                                break;
        }
@@@ -4403,29 -4389,24 +4401,24 @@@ static void css_task_iter_advance_css_s
  
        lockdep_assert_held(&css_set_lock);
  
-       /* Advance to the next non-empty css_set */
-       do {
-               cset = css_task_iter_next_css_set(it);
-               if (!cset) {
-                       it->task_pos = NULL;
-                       return;
 +      /* Advance to the next non-empty css_set and find first non-empty tasks list */
+       while ((cset = css_task_iter_next_css_set(it))) {
+               if (!list_empty(&cset->tasks)) {
+                       it->cur_tasks_head = &cset->tasks;
+                       break;
+               } else if (!list_empty(&cset->mg_tasks)) {
+                       it->cur_tasks_head = &cset->mg_tasks;
+                       break;
+               } else if (!list_empty(&cset->dying_tasks)) {
+                       it->cur_tasks_head = &cset->dying_tasks;
+                       break;
                }
-       } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
-       if (!list_empty(&cset->tasks)) {
-               it->task_pos = cset->tasks.next;
-               it->cur_tasks_head = &cset->tasks;
-       } else if (!list_empty(&cset->mg_tasks)) {
-               it->task_pos = cset->mg_tasks.next;
-               it->cur_tasks_head = &cset->mg_tasks;
-       } else {
-               it->task_pos = cset->dying_tasks.next;
-               it->cur_tasks_head = &cset->dying_tasks;
        }
-       it->tasks_head = &cset->tasks;
-       it->mg_tasks_head = &cset->mg_tasks;
-       it->dying_tasks_head = &cset->dying_tasks;
+       if (!cset) {
+               it->task_pos = NULL;
+               return;
+       }
+       it->task_pos = it->cur_tasks_head->next;
  
        /*
         * We don't keep css_sets locked across iteration steps and thus
@@@ -4470,24 -4451,24 +4463,24 @@@ static void css_task_iter_advance(struc
  repeat:
        if (it->task_pos) {
                /*
-                * Advance iterator to find next entry.  cset->tasks is
-                * consumed first and then ->mg_tasks.  After ->mg_tasks,
-                * we move onto the next cset.
+                * Advance iterator to find next entry. We go through cset
+                * tasks, mg_tasks and dying_tasks; when consumed we move onto
+                * the next cset.
                 */
                if (it->flags & CSS_TASK_ITER_SKIPPED)
                        it->flags &= ~CSS_TASK_ITER_SKIPPED;
                else
                        it->task_pos = it->task_pos->next;
  
-               if (it->task_pos == it->tasks_head) {
-                       it->task_pos = it->mg_tasks_head->next;
-                       it->cur_tasks_head = it->mg_tasks_head;
+               if (it->task_pos == &it->cur_cset->tasks) {
+                       it->cur_tasks_head = &it->cur_cset->mg_tasks;
+                       it->task_pos = it->cur_tasks_head->next;
                }
-               if (it->task_pos == it->mg_tasks_head) {
-                       it->task_pos = it->dying_tasks_head->next;
-                       it->cur_tasks_head = it->dying_tasks_head;
+               if (it->task_pos == &it->cur_cset->mg_tasks) {
+                       it->cur_tasks_head = &it->cur_cset->dying_tasks;
+                       it->task_pos = it->cur_tasks_head->next;
                }
-               if (it->task_pos == it->dying_tasks_head)
+               if (it->task_pos == &it->cur_cset->dying_tasks)
                        css_task_iter_advance_css_set(it);
        } else {
                /* called from start, proceed to the first cset */
                        goto repeat;
  
                /* and dying leaders w/o live member threads */
-               if (it->cur_tasks_head == it->dying_tasks_head &&
+               if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
                    !atomic_read(&task->signal->live))
                        goto repeat;
        } else {
                /* skip all dying ones */
-               if (it->cur_tasks_head == it->dying_tasks_head)
+               if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
                        goto repeat;
        }
  }
@@@ -4674,13 -4655,28 +4667,28 @@@ static int cgroup_procs_show(struct seq
        return 0;
  }
  
+ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
+ {
+       int ret;
+       struct inode *inode;
+       lockdep_assert_held(&cgroup_mutex);
+       inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
+       if (!inode)
+               return -ENOMEM;
+       ret = inode_permission(inode, MAY_WRITE);
+       iput(inode);
+       return ret;
+ }
+ 
  static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
                                         struct cgroup *dst_cgrp,
                                         struct super_block *sb)
  {
        struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
        struct cgroup *com_cgrp = src_cgrp;
-       struct inode *inode;
        int ret;
  
        lockdep_assert_held(&cgroup_mutex);
                com_cgrp = cgroup_parent(com_cgrp);
  
        /* %current should be authorized to migrate to the common ancestor */
-       inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
-       if (!inode)
-               return -ENOMEM;
-       ret = inode_permission(inode, MAY_WRITE);
-       iput(inode);
+       ret = cgroup_may_write(com_cgrp, sb);
        if (ret)
                return ret;
  
        return 0;
  }
  
+ static int cgroup_attach_permissions(struct cgroup *src_cgrp,
+                                    struct cgroup *dst_cgrp,
+                                    struct super_block *sb, bool threadgroup)
+ {
+       int ret = 0;
+       ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb);
+       if (ret)
+               return ret;
+       ret = cgroup_migrate_vet_dst(dst_cgrp);
+       if (ret)
+               return ret;
+       if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
+               ret = -EOPNOTSUPP;
+       return ret;
+ }
+ 
  static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
                                  char *buf, size_t nbytes, loff_t off)
  {
        src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
        spin_unlock_irq(&css_set_lock);
  
-       ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
-                                           of->file->f_path.dentry->d_sb);
+       ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
+                                       of->file->f_path.dentry->d_sb, true);
        if (ret)
                goto out_finish;
  
@@@ -4778,16 -4789,11 +4801,11 @@@ static ssize_t cgroup_threads_write(str
        spin_unlock_irq(&css_set_lock);
  
        /* thread migrations follow the cgroup.procs delegation rule */
-       ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
-                                           of->file->f_path.dentry->d_sb);
+       ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
+                                       of->file->f_path.dentry->d_sb, false);
        if (ret)
                goto out_finish;
  
-       /* and must be contained in the same domain */
-       ret = -EOPNOTSUPP;
-       if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
-               goto out_finish;
        ret = cgroup_attach_task(dst_cgrp, task, false);
  
  out_finish:
@@@ -5876,8 -5882,7 +5894,7 @@@ out
   * @child: pointer to task_struct of forking parent process.
   *
   * A task is associated with the init_css_set until cgroup_post_fork()
-  * attaches it to the parent's css_set.  Empty cg_list indicates that
-  * @child isn't holding reference to its css_set.
+  * attaches it to the target css_set.
   */
  void cgroup_fork(struct task_struct *child)
  {
        INIT_LIST_HEAD(&child->cg_list);
  }
  
+ static struct cgroup *cgroup_get_from_file(struct file *f)
+ {
+       struct cgroup_subsys_state *css;
+       struct cgroup *cgrp;
+       css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
+       if (IS_ERR(css))
+               return ERR_CAST(css);
+       cgrp = css->cgroup;
+       if (!cgroup_on_dfl(cgrp)) {
+               cgroup_put(cgrp);
+               return ERR_PTR(-EBADF);
+       }
+       return cgrp;
+ }
+ 
+ /**
+  * cgroup_css_set_fork - find or create a css_set for a child process
+  * @kargs: the arguments passed to create the child process
+  *
+  * This function finds or creates a new css_set which the child
+  * process will be attached to in cgroup_post_fork(). By default,
+  * the child process will be given the same css_set as its parent.
+  *
+  * If CLONE_INTO_CGROUP is specified this function will try to find an
+  * existing css_set which includes the requested cgroup and if not create
+  * a new css_set that the child will be attached to later. If this function
+  * succeeds it will hold cgroup_threadgroup_rwsem on return. If
+  * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
+  * before grabbing cgroup_threadgroup_rwsem and will hold a reference
+  * to the target cgroup.
+  */
+ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
+       __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
+ {
+       int ret;
+       struct cgroup *dst_cgrp = NULL;
+       struct css_set *cset;
+       struct super_block *sb;
+       struct file *f;
+       if (kargs->flags & CLONE_INTO_CGROUP)
+               mutex_lock(&cgroup_mutex);
+       cgroup_threadgroup_change_begin(current);
+       spin_lock_irq(&css_set_lock);
+       cset = task_css_set(current);
+       get_css_set(cset);
+       spin_unlock_irq(&css_set_lock);
+       if (!(kargs->flags & CLONE_INTO_CGROUP)) {
+               kargs->cset = cset;
+               return 0;
+       }
+       f = fget_raw(kargs->cgroup);
+       if (!f) {
+               ret = -EBADF;
+               goto err;
+       }
+       sb = f->f_path.dentry->d_sb;
+       dst_cgrp = cgroup_get_from_file(f);
+       if (IS_ERR(dst_cgrp)) {
+               ret = PTR_ERR(dst_cgrp);
+               dst_cgrp = NULL;
+               goto err;
+       }
+       if (cgroup_is_dead(dst_cgrp)) {
+               ret = -ENODEV;
+               goto err;
+       }
+       /*
+        * Verify that the target cgroup is writable for us. This is
+        * usually done by the vfs layer but since we're not going through
+        * the vfs layer here we need to do it "manually".
+        */
+       ret = cgroup_may_write(dst_cgrp, sb);
+       if (ret)
+               goto err;
+       ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
+                                       !(kargs->flags & CLONE_THREAD));
+       if (ret)
+               goto err;
+       kargs->cset = find_css_set(cset, dst_cgrp);
+       if (!kargs->cset) {
+               ret = -ENOMEM;
+               goto err;
+       }
+       put_css_set(cset);
+       fput(f);
+       kargs->cgrp = dst_cgrp;
+       return ret;
+ err:
+       cgroup_threadgroup_change_end(current);
+       mutex_unlock(&cgroup_mutex);
+       if (f)
+               fput(f);
+       if (dst_cgrp)
+               cgroup_put(dst_cgrp);
+       put_css_set(cset);
+       if (kargs->cset)
+               put_css_set(kargs->cset);
+       return ret;
+ }
+ 
+ /**
+  * cgroup_css_set_put_fork - drop references we took during fork
+  * @kargs: the arguments passed to create the child process
+  *
+  * Drop references to the prepared css_set and target cgroup if
+  * CLONE_INTO_CGROUP was requested.
+  */
+ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
+       __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
+ {
+       cgroup_threadgroup_change_end(current);
+       if (kargs->flags & CLONE_INTO_CGROUP) {
+               struct cgroup *cgrp = kargs->cgrp;
+               struct css_set *cset = kargs->cset;
+               mutex_unlock(&cgroup_mutex);
+               if (cset) {
+                       put_css_set(cset);
+                       kargs->cset = NULL;
+               }
+               if (cgrp) {
+                       cgroup_put(cgrp);
+                       kargs->cgrp = NULL;
+               }
+       }
+ }
+ 
  /**
   * cgroup_can_fork - called on a new task before the process is exposed
-  * @child: the task in question.
+  * @child: the child process
   *
-  * This calls the subsystem can_fork() callbacks. If the can_fork() callback
-  * returns an error, the fork aborts with that error code. This allows for
-  * a cgroup subsystem to conditionally allow or deny new forks.
+  * This prepares a new css_set for the child process which the child will
+  * be attached to in cgroup_post_fork().
+  * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork()
+  * callback returns an error, the fork aborts with that error code. This
+  * allows for a cgroup subsystem to conditionally allow or deny new forks.
   */
- int cgroup_can_fork(struct task_struct *child)
+ int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
  {
        struct cgroup_subsys *ss;
        int i, j, ret;
  
+       ret = cgroup_css_set_fork(kargs);
+       if (ret)
+               return ret;
        do_each_subsys_mask(ss, i, have_canfork_callback) {
-               ret = ss->can_fork(child);
+               ret = ss->can_fork(child, kargs->cset);
                if (ret)
                        goto out_revert;
        } while_each_subsys_mask();
@@@ -5911,54 -6067,64 +6079,64 @@@ out_revert
                if (j >= i)
                        break;
                if (ss->cancel_fork)
-                       ss->cancel_fork(child);
+                       ss->cancel_fork(child, kargs->cset);
        }
  
+       cgroup_css_set_put_fork(kargs);
        return ret;
  }
  
  /**
   * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
-  * @child: the task in question
+  * @child: the child process
+  * @kargs: the arguments passed to create the child process
   *
   * This calls the cancel_fork() callbacks if a fork failed *after*
-  * cgroup_can_fork() succeded.
+  * cgroup_can_fork() succeeded and cleans up references we took to
+  * prepare a new css_set for the child process in cgroup_can_fork().
   */
- void cgroup_cancel_fork(struct task_struct *child)
+ void cgroup_cancel_fork(struct task_struct *child,
+                       struct kernel_clone_args *kargs)
  {
        struct cgroup_subsys *ss;
        int i;
  
        for_each_subsys(ss, i)
                if (ss->cancel_fork)
-                       ss->cancel_fork(child);
+                       ss->cancel_fork(child, kargs->cset);
+       cgroup_css_set_put_fork(kargs);
  }
  
  /**
-  * cgroup_post_fork - called on a new task after adding it to the task list
-  * @child: the task in question
-  *
-  * Adds the task to the list running through its css_set if necessary and
-  * call the subsystem fork() callbacks.  Has to be after the task is
-  * visible on the task list in case we race with the first call to
-  * cgroup_task_iter_start() - to guarantee that the new task ends up on its
-  * list.
+  * cgroup_post_fork - finalize cgroup setup for the child process
+  * @child: the child process
+  *
+  * Attach the child process to its css_set calling the subsystem fork()
+  * callbacks.
   */
- void cgroup_post_fork(struct task_struct *child)
+ void cgroup_post_fork(struct task_struct *child,
+                     struct kernel_clone_args *kargs)
+       __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
  {
        struct cgroup_subsys *ss;
        struct css_set *cset;
        int i;
  
+       cset = kargs->cset;
+       kargs->cset = NULL;
        spin_lock_irq(&css_set_lock);
  
        /* init tasks are special, only link regular threads */
        if (likely(child->pid)) {
                WARN_ON_ONCE(!list_empty(&child->cg_list));
-               cset = task_css_set(current); /* current is @child's parent */
-               get_css_set(cset);
                cset->nr_tasks++;
                css_set_move_task(child, NULL, cset, false);
+       } else {
+               put_css_set(cset);
+               cset = NULL;
        }
  
        /*
        do_each_subsys_mask(ss, i, have_fork_callback) {
                ss->fork(child);
        } while_each_subsys_mask();
+       /* Make the new cset the root_cset of the new cgroup namespace. */
+       if (kargs->flags & CLONE_NEWCGROUP) {
+               struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
+               get_css_set(cset);
+               child->nsproxy->cgroup_ns->root_cset = cset;
+               put_css_set(rcset);
+       }
+       cgroup_css_set_put_fork(kargs);
  }
  
  /**
@@@ -6176,7 -6353,6 +6365,6 @@@ EXPORT_SYMBOL_GPL(cgroup_get_from_path)
   */
  struct cgroup *cgroup_get_from_fd(int fd)
  {
-       struct cgroup_subsys_state *css;
        struct cgroup *cgrp;
        struct file *f;
  
        f = fget_raw(fd);
        if (!f)
                return ERR_PTR(-EBADF);
  
-       css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
+       cgrp = cgroup_get_from_file(f);
        fput(f);
-       if (IS_ERR(css))
-               return ERR_CAST(css);
-       cgrp = css->cgroup;
-       if (!cgroup_on_dfl(cgrp)) {
-               cgroup_put(cgrp);
-               return ERR_PTR(-EBADF);
-       }
        return cgrp;
  }
  EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
@@@ -6283,10 -6450,6 +6462,10 @@@ void cgroup_sk_alloc(struct sock_cgroup
                return;
        }
  
 +      /* Don't associate the sock with unrelated interrupted task's cgroup. */
 +      if (in_interrupt())
 +              return;
 +
        rcu_read_lock();
  
        while (true) {
@@@ -6315,58 -6478,27 +6494,58 @@@ void cgroup_sk_free(struct sock_cgroup_
  #endif        /* CONFIG_SOCK_CGROUP_DATA */
  
  #ifdef CONFIG_CGROUP_BPF
 -int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 -                    struct bpf_prog *replace_prog, enum bpf_attach_type type,
 +int cgroup_bpf_attach(struct cgroup *cgrp,
 +                    struct bpf_prog *prog, struct bpf_prog *replace_prog,
 +                    struct bpf_cgroup_link *link,
 +                    enum bpf_attach_type type,
                      u32 flags)
  {
        int ret;
  
        mutex_lock(&cgroup_mutex);
 -      ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, type, flags);
 +      ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
 +      mutex_unlock(&cgroup_mutex);
 +      return ret;
 +}
 +
 +int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *old_prog,
 +                     struct bpf_prog *new_prog)
 +{
 +      struct bpf_cgroup_link *cg_link;
 +      int ret;
 +
 +      if (link->ops != &bpf_cgroup_link_lops)
 +              return -EINVAL;
 +
 +      cg_link = container_of(link, struct bpf_cgroup_link, link);
 +
 +      mutex_lock(&cgroup_mutex);
 +      /* link might have been auto-released by dying cgroup, so fail */
 +      if (!cg_link->cgroup) {
 +              ret = -EINVAL;
 +              goto out_unlock;
 +      }
 +      if (old_prog && link->prog != old_prog) {
 +              ret = -EPERM;
 +              goto out_unlock;
 +      }
 +      ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
 +out_unlock:
        mutex_unlock(&cgroup_mutex);
        return ret;
  }
 +
  int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 -                    enum bpf_attach_type type, u32 flags)
 +                    enum bpf_attach_type type)
  {
        int ret;
  
        mutex_lock(&cgroup_mutex);
 -      ret = __cgroup_bpf_detach(cgrp, prog, type);
 +      ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
        mutex_unlock(&cgroup_mutex);
        return ret;
  }
 +
  int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
                     union bpf_attr __user *uattr)
  {
@@@ -6424,10 -6556,7 +6603,10 @@@ static struct kobj_attribute cgroup_del
  static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
                             char *buf)
  {
 -      return snprintf(buf, PAGE_SIZE, "nsdelegate\nmemory_localevents\n");
 +      return snprintf(buf, PAGE_SIZE,
 +                      "nsdelegate\n"
 +                      "memory_localevents\n"
 +                      "memory_recursiveprot\n");
  }
  static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
  
diff --combined kernel/fork.c
index ba122d6f5127386a2c532b5679dde19a30669af1,635d6369dfb904dd7e0498c2ff8e29ea3a539548..d2a967bf85d5eed4ccad59e51dbc89da332bffeb
@@@ -281,7 -281,7 +281,7 @@@ static inline void free_thread_stack(st
                                             MEMCG_KERNEL_STACK_KB,
                                             -(int)(PAGE_SIZE / 1024));
  
 -                      memcg_kmem_uncharge(vm->pages[i], 0);
 +                      memcg_kmem_uncharge_page(vm->pages[i], 0);
                }
  
                for (i = 0; i < NR_CACHED_STACKS; i++) {
@@@ -397,8 -397,8 +397,8 @@@ static void account_kernel_stack(struc
                mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
                                    THREAD_SIZE / 1024 * account);
  
 -              mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB,
 -                                   account * (THREAD_SIZE / 1024));
 +              mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB,
 +                                  account * (THREAD_SIZE / 1024));
        }
  }
  
@@@ -413,13 -413,12 +413,13 @@@ static int memcg_charge_kernel_stack(st
  
                for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
                        /*
 -                       * If memcg_kmem_charge() fails, page->mem_cgroup
 -                       * pointer is NULL, and both memcg_kmem_uncharge()
 +                       * If memcg_kmem_charge_page() fails, page->mem_cgroup
 +                       * pointer is NULL, and both memcg_kmem_uncharge_page()
                         * and mod_memcg_page_state() in free_thread_stack()
                         * will ignore this page. So it's safe.
                         */
 -                      ret = memcg_kmem_charge(vm->pages[i], GFP_KERNEL, 0);
 +                      ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
 +                                                   0);
                        if (ret)
                                return ret;
  
@@@ -1225,7 -1224,7 +1225,7 @@@ struct mm_struct *mm_access(struct task
        struct mm_struct *mm;
        int err;
  
 -      err =  mutex_lock_killable(&task->signal->cred_guard_mutex);
 +      err =  mutex_lock_killable(&task->signal->exec_update_mutex);
        if (err)
                return ERR_PTR(err);
  
                mmput(mm);
                mm = ERR_PTR(-EACCES);
        }
 -      mutex_unlock(&task->signal->cred_guard_mutex);
 +      mutex_unlock(&task->signal->exec_update_mutex);
  
        return mm;
  }
@@@ -1509,7 -1508,7 +1509,7 @@@ static int copy_sighand(unsigned long c
                return 0;
        }
        sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
 -      rcu_assign_pointer(tsk->sighand, sig);
 +      RCU_INIT_POINTER(tsk->sighand, sig);
        if (!sig)
                return -ENOMEM;
  
@@@ -1595,7 -1594,6 +1595,7 @@@ static int copy_signal(unsigned long cl
        sig->oom_score_adj_min = current->signal->oom_score_adj_min;
  
        mutex_init(&sig->cred_guard_mutex);
 +      mutex_init(&sig->exec_update_mutex);
  
        return 0;
  }
@@@ -2176,16 -2174,15 +2176,15 @@@ static __latent_entropy struct task_str
        INIT_LIST_HEAD(&p->thread_group);
        p->task_works = NULL;
  
-       cgroup_threadgroup_change_begin(current);
        /*
         * Ensure that the cgroup subsystem policies allow the new process to be
         * forked. It should be noted that the new process's css_set can be changed
         * between here and cgroup_post_fork() if an organisation operation is in
         * progress.
         */
-       retval = cgroup_can_fork(p);
+       retval = cgroup_can_fork(p, args);
        if (retval)
-               goto bad_fork_cgroup_threadgroup_change_end;
+               goto bad_fork_put_pidfd;
  
        /*
         * From this point on we must avoid any synchronous user-space
        write_unlock_irq(&tasklist_lock);
  
        proc_fork_connector(p);
-       cgroup_post_fork(p);
-       cgroup_threadgroup_change_end(current);
+       cgroup_post_fork(p, args);
        perf_event_fork(p);
  
        trace_task_newtask(p, clone_flags);
  bad_fork_cancel_cgroup:
        spin_unlock(&current->sighand->siglock);
        write_unlock_irq(&tasklist_lock);
-       cgroup_cancel_fork(p);
- bad_fork_cgroup_threadgroup_change_end:
-       cgroup_threadgroup_change_end(current);
+       cgroup_cancel_fork(p, args);
  bad_fork_put_pidfd:
        if (clone_flags & CLONE_PIDFD) {
                fput(pidfile);
@@@ -2633,6 -2627,9 +2629,9 @@@ noinline static int copy_clone_args_fro
                     !valid_signal(args.exit_signal)))
                return -EINVAL;
  
+       if ((args.flags & CLONE_INTO_CGROUP) && args.cgroup < 0)
+               return -EINVAL;
        *kargs = (struct kernel_clone_args){
                .flags          = args.flags,
                .pidfd          = u64_to_user_ptr(args.pidfd),
                .stack_size     = args.stack_size,
                .tls            = args.tls,
                .set_tid_size   = args.set_tid_size,
+               .cgroup         = args.cgroup,
        };
  
        if (args.set_tid &&
@@@ -2686,7 -2684,8 +2686,8 @@@ static inline bool clone3_stack_valid(s
  static bool clone3_args_valid(struct kernel_clone_args *kargs)
  {
        /* Verify that no unknown flags are passed along. */
-       if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND))
+       if (kargs->flags &
+           ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
                return false;
  
        /*
diff --combined mm/shmem.c
index aad3ba74b0e9d121bd774e795460d5f2f73ac119,e6a7549faf20594c1a1c31b0c0fa84c20a675979..f47347cb30f6b0b41684a549508b9d4972aec378
@@@ -3243,7 -3243,7 +3243,7 @@@ static int shmem_xattr_handler_set(cons
        struct shmem_inode_info *info = SHMEM_I(inode);
  
        name = xattr_full_name(handler, name);
-       return simple_xattr_set(&info->xattrs, name, value, size, flags);
+       return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
  }
  
  static const struct xattr_handler shmem_security_xattr_handler = {
@@@ -3386,6 -3386,8 +3386,6 @@@ static const struct constant_table shme
        {"always",      SHMEM_HUGE_ALWAYS },
        {"within_size", SHMEM_HUGE_WITHIN_SIZE },
        {"advise",      SHMEM_HUGE_ADVISE },
 -      {"deny",        SHMEM_HUGE_DENY },
 -      {"force",       SHMEM_HUGE_FORCE },
        {}
  };