cgroup/pids: Separate semantics of pids.events related to pids.max

author Michal Koutný <mkoutny@suse.com>

Tue, 21 May 2024 09:21:26 +0000 (11:21 +0200)

committer Tejun Heo <tj@kernel.org>

Sun, 26 May 2024 18:45:09 +0000 (08:45 -1000)
author Michal Koutný <mkoutny@suse.com>
Tue, 21 May 2024 09:21:26 +0000 (11:21 +0200)
committer Tejun Heo <tj@kernel.org>
Sun, 26 May 2024 18:45:09 +0000 (08:45 -1000)
diff --git a/Documentation/admin-guide/cgroup-v1/pids.rst b/Documentation/admin-guide/cgroup-v1/pids.rst

index 6acebd9e72c819eca5fe9f5a1fa2ef3f54929173..0f9f9a7b1f6c32de4a696018514aba96f34c9bd3 100644 (file)
--- a/Documentation/admin-guide/cgroup-v1/pids.rst
+++ b/Documentation/admin-guide/cgroup-v1/pids.rst
@@ -36,7 +36,8 @@ superset of parent/child/pids.current.
  
  The pids.events file contains event counters:
  
-  - max: Number of times fork failed because limit was hit.
+  - max: Number of times fork failed in the cgroup because limit was hit in
+    self or ancestors.
  
  Example
  -------
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst

index 8fbb0519d5569086e63cb7decdcf56972baf6591..dfeb51c994e642584af0aad7ae8a2a0405764839 100644 (file)
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -239,6 +239,10 @@ cgroup v2 currently supports the following mount options.
            will not be tracked by the memory controller (even if cgroup
            v2 is remounted later on).
  
+  pids_localevents
+        Make pids.events:max count fork failures inside the cgroup (v1
+        behavior) rather than the times its limit was hit (v2 behavior).
+
  
  Organizing Processes and Threads
  --------------------------------
@@ -2205,12 +2209,13 @@ PID Interface Files
         descendants has ever reached.
  
    pids.events
-       A read-only flat-keyed file which exists on non-root cgroups. The
-       following entries are defined. Unless specified otherwise, a value
-       change in this file generates a file modified event.
+       A read-only flat-keyed file which exists on non-root cgroups. Unless
+       specified otherwise, a value change in this file generates a file
+       modified event. The following entries are defined.
  
           max
-               Number of times fork failed because limit was hit.
+               The number of times the cgroup's number of processes hit the
+               limit (see also pids_localevents).
  
  Organisational operations are not blocked by cgroup policies, so it is
  possible to have pids.current > pids.max.  This can be done by either
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h

index ea48c861cd3695296b5de1548bbb03cf61c62e4a..b36690ca0d3f11bcc87ac9f4cfcb7923ea89ccb1 100644 (file)
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -119,7 +119,12 @@ enum {
         /*
          * Enable hugetlb accounting for the memory controller.
          */
-        CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19),
+       CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19),
+
+       /*
+        * Enable legacy local pids.events.
+        */
+       CGRP_ROOT_PIDS_LOCAL_EVENTS = (1 << 20),
  };
  
  /* cftype->flags */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c

index e32b6972c47840a8f6880964100b3356ab849f2a..9c9943ea5f89d2b0c6384ee7d878b34a2b5ea493 100644 (file)
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1922,6 +1922,7 @@ enum cgroup2_param {
         Opt_memory_localevents,
         Opt_memory_recursiveprot,
         Opt_memory_hugetlb_accounting,
+       Opt_pids_localevents,
         nr__cgroup2_params
  };
  
@@ -1931,6 +1932,7 @@ static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
         fsparam_flag("memory_localevents",      Opt_memory_localevents),
         fsparam_flag("memory_recursiveprot",    Opt_memory_recursiveprot),
         fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting),
+       fsparam_flag("pids_localevents",        Opt_pids_localevents),
         {}
  };
  
@@ -1960,6 +1962,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
         case Opt_memory_hugetlb_accounting:
                 ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
                 return 0;
+       case Opt_pids_localevents:
+               ctx->flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS;
+               return 0;
         }
         return -EINVAL;
  }
@@ -1989,6 +1994,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
                         cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
                 else
                         cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+
+               if (root_flags & CGRP_ROOT_PIDS_LOCAL_EVENTS)
+                       cgrp_dfl_root.flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS;
+               else
+                       cgrp_dfl_root.flags &= ~CGRP_ROOT_PIDS_LOCAL_EVENTS;
         }
  }
  
@@ -2004,6 +2014,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
                 seq_puts(seq, ",memory_recursiveprot");
         if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
                 seq_puts(seq, ",memory_hugetlb_accounting");
+       if (cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS)
+               seq_puts(seq, ",pids_localevents");
         return 0;
  }
  
@@ -7062,7 +7074,8 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
                         "favordynmods\n"
                         "memory_localevents\n"
                         "memory_recursiveprot\n"
-                       "memory_hugetlb_accounting\n");
+                       "memory_hugetlb_accounting\n"
+                       "pids_localevents\n");
  }
  static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
  
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c

index 0e5ec7d59b4d21ab79342a02d9ba1a0ec864d71f..a557f5c8300b83eb205a6cb2484624a6ea6c987c 100644 (file)
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -38,6 +38,14 @@
  #define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
  #define PIDS_MAX_STR "max"
  
+enum pidcg_event {
+       /* Fork failed in subtree because this pids_cgroup limit was hit. */
+       PIDCG_MAX,
+       /* Fork failed in this pids_cgroup: own or ancestor limit was hit. */
+       PIDCG_FORKFAIL,
+       NR_PIDCG_EVENTS,
+};
+
  struct pids_cgroup {
         struct cgroup_subsys_state      css;
  
@@ -52,8 +60,7 @@ struct pids_cgroup {
         /* Handle for "pids.events" */
         struct cgroup_file              events_file;
  
-       /* Number of times fork failed because limit was hit. */
-       atomic64_t                      events_limit;
+       atomic64_t                      events[NR_PIDCG_EVENTS];
  };
  
  static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
@@ -148,12 +155,13 @@ static void pids_charge(struct pids_cgroup *pids, int num)
   * pids_try_charge - hierarchically try to charge the pid count
   * @pids: the pid cgroup state
   * @num: the number of pids to charge
+ * @fail: on failure, set to the pids cgroup whose limit was exceeded
   *
   * This function follows the set limit. It will fail if the charge would cause
   * the new value to exceed the hierarchical limit. Returns 0 if the charge
   * succeeded, otherwise -EAGAIN.
   */
-static int pids_try_charge(struct pids_cgroup *pids, int num)
+static int pids_try_charge(struct pids_cgroup *pids, int num, struct pids_cgroup **fail)
  {
         struct pids_cgroup *p, *q;
  
@@ -166,9 +174,10 @@ static int pids_try_charge(struct pids_cgroup *pids, int num)
                  * p->limit is %PIDS_MAX then we know that this test will never
                  * fail.
                  */
-               if (new > limit)
+               if (new > limit) {
+                       *fail = p;
                         goto revert;
-
+               }
                 /*
                  * Not technically accurate if we go over limit somewhere up
                  * the hierarchy, but that's tolerable for the watermark.
@@ -236,7 +245,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
  static int pids_can_fork(struct task_struct *task, struct css_set *cset)
  {
         struct cgroup_subsys_state *css;
-       struct pids_cgroup *pids;
+       struct pids_cgroup *pids, *pids_over_limit;
         int err;
  
         if (cset)
@@ -244,15 +253,23 @@ static int pids_can_fork(struct task_struct *task, struct css_set *cset)
         else
                 css = task_css_check(current, pids_cgrp_id, true);
         pids = css_pids(css);
-       err = pids_try_charge(pids, 1);
+       err = pids_try_charge(pids, 1, &pids_over_limit);
         if (err) {
-               /* Only log the first time events_limit is incremented. */
-               if (atomic64_inc_return(&pids->events_limit) == 1) {
+               /* v1 compat: attribute the event to the cgroup where fork failed. */
+               if (!cgroup_subsys_on_dfl(pids_cgrp_subsys))
+                       pids_over_limit = pids;
+
+               /* Only log the first time limit is hit. */
+               if (atomic64_inc_return(&pids->events[PIDCG_FORKFAIL]) == 1) {
                         pr_info("cgroup: fork rejected by pids controller in ");
-                       pr_cont_cgroup_path(css->cgroup);
+                       pr_cont_cgroup_path(pids->css.cgroup);
                         pr_cont("\n");
                 }
+               atomic64_inc(&pids_over_limit->events[PIDCG_MAX]);
+
                 cgroup_file_notify(&pids->events_file);
+               if (pids_over_limit != pids)
+                       cgroup_file_notify(&pids_over_limit->events_file);
         }
         return err;
  }
@@ -340,8 +357,13 @@ static s64 pids_peak_read(struct cgroup_subsys_state *css,
  static int pids_events_show(struct seq_file *sf, void *v)
  {
         struct pids_cgroup *pids = css_pids(seq_css(sf));
+       enum pidcg_event pe = PIDCG_MAX;
+
+       if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) ||
+           cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS)
+               pe = PIDCG_FORKFAIL;
  
-       seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit));
+       seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events[pe]));
         return 0;
  }
author	Michal Koutný <mkoutny@suse.com>
	Tue, 21 May 2024 09:21:26 +0000 (11:21 +0200)
committer	Tejun Heo <tj@kernel.org>
	Sun, 26 May 2024 18:45:09 +0000 (08:45 -1000)
Documentation/admin-guide/cgroup-v1/pids.rst		patch | blob | blame | history
Documentation/admin-guide/cgroup-v2.rst		patch | blob | blame | history
include/linux/cgroup-defs.h		patch | blob | blame | history
kernel/cgroup/cgroup.c		patch | blob | blame | history
kernel/cgroup/pids.c		patch | blob | blame | history