Updated kernel (2.6.27.41).

[ipfire-2.x.git] / src / patches / suse-2.6.27.39 / patches.suse / cgroup-freezer.patch
diff --git a/src/patches/suse-2.6.27.39/patches.suse/cgroup-freezer.patch b/src/patches/suse-2.6.27.39/patches.suse/cgroup-freezer.patch

deleted file mode 100644 (file)

index b9b4f73..0000000
--- a/src/patches/suse-2.6.27.39/patches.suse/cgroup-freezer.patch
+++ /dev/null
@@ -1,2505 +0,0 @@
-From: Serge E. Hallyn <serue@us.ibm.com>
-Subject: cgroup freezer
-References: bnc#417294, fate#304191, fate#201036
-Patch-upstream: yes
-Git: 68d1a06b440a5df55fb253e1d1113d2e4a7209fc Mon Sep 17 00:00:00 2001
-
-Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
-Acked-by: Nick Piggin <npiggin@suse.de>
----
- Documentation/cgroups.txt                   |  548 ----------------------------
- Documentation/cgroups/cgroups.txt           |  548 ++++++++++++++++++++++++++++
- Documentation/cgroups/freezer-subsystem.txt |  102 +++++
- Documentation/cpusets.txt                   |    2 
- arch/alpha/Kconfig                          |    1 
- arch/alpha/include/asm/thread_info.h        |    2 
- arch/arm/Kconfig                            |    2 
- arch/avr32/Kconfig                          |    2 
- arch/avr32/include/asm/thread_info.h        |    1 
- arch/blackfin/Kconfig                       |    3 
- arch/cris/Kconfig                           |    2 
- arch/frv/Kconfig                            |    2 
- arch/h8300/Kconfig                          |    2 
- arch/h8300/include/asm/thread_info.h        |    2 
- arch/ia64/Kconfig                           |    2 
- arch/m32r/Kconfig                           |    2 
- arch/m68k/Kconfig                           |    2 
- arch/m68knommu/Kconfig                      |    2 
- arch/m68knommu/include/asm/thread_info.h    |    2 
- arch/mips/Kconfig                           |    2 
- arch/mn10300/Kconfig                        |    2 
- arch/parisc/Kconfig                         |    2 
- arch/powerpc/Kconfig                        |    2 
- arch/s390/Kconfig                           |    2 
- arch/s390/include/asm/thread_info.h         |    2 
- arch/sh/Kconfig                             |    2 
- arch/sparc/Kconfig                          |    2 
- arch/sparc/include/asm/thread_info_32.h     |    2 
- arch/sparc64/Kconfig                        |    1 
- arch/um/Kconfig                             |    2 
- arch/x86/Kconfig                            |    1 
- arch/xtensa/Kconfig                         |    1 
- include/asm-cris/thread_info.h              |    2 
- include/asm-m68k/thread_info.h              |    1 
- include/asm-parisc/thread_info.h            |    2 
- include/asm-um/thread_info.h                |    2 
- include/asm-xtensa/thread_info.h            |    2 
- include/linux/cgroup_subsys.h               |    6 
- include/linux/freezer.h                     |   42 --
- init/Kconfig                                |    7 
- kernel/Kconfig.freezer                      |    2 
- kernel/Makefile                             |    2 
- kernel/cgroup_freezer.c                     |  379 +++++++++++++++++++
- kernel/freezer.c                            |  154 +++++++
- kernel/power/process.c                      |  119 ------
- 45 files changed, 1283 insertions(+), 689 deletions(-)
- create mode 100644 include/linux/cgroup_freezer.h
- create mode 100644 kernel/cgroup_freezer.c
- create mode 100644 kernel/freezer.c
-
---- a/Documentation/cgroups.txt
-+++ /dev/null
-@@ -1,548 +0,0 @@
--                              CGROUPS
--                              -------
--
--Written by Paul Menage <menage@google.com> based on Documentation/cpusets.txt
--
--Original copyright statements from cpusets.txt:
--Portions Copyright (C) 2004 BULL SA.
--Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
--Modified by Paul Jackson <pj@sgi.com>
--Modified by Christoph Lameter <clameter@sgi.com>
--
--CONTENTS:
--=========
--
--1. Control Groups
--  1.1 What are cgroups ?
--  1.2 Why are cgroups needed ?
--  1.3 How are cgroups implemented ?
--  1.4 What does notify_on_release do ?
--  1.5 How do I use cgroups ?
--2. Usage Examples and Syntax
--  2.1 Basic Usage
--  2.2 Attaching processes
--3. Kernel API
--  3.1 Overview
--  3.2 Synchronization
--  3.3 Subsystem API
--4. Questions
--
--1. Control Groups
--=================
--
--1.1 What are cgroups ?
------------------------
--
--Control Groups provide a mechanism for aggregating/partitioning sets of
--tasks, and all their future children, into hierarchical groups with
--specialized behaviour.
--
--Definitions:
--
--A *cgroup* associates a set of tasks with a set of parameters for one
--or more subsystems.
--
--A *subsystem* is a module that makes use of the task grouping
--facilities provided by cgroups to treat groups of tasks in
--particular ways. A subsystem is typically a "resource controller" that
--schedules a resource or applies per-cgroup limits, but it may be
--anything that wants to act on a group of processes, e.g. a
--virtualization subsystem.
--
--A *hierarchy* is a set of cgroups arranged in a tree, such that
--every task in the system is in exactly one of the cgroups in the
--hierarchy, and a set of subsystems; each subsystem has system-specific
--state attached to each cgroup in the hierarchy.  Each hierarchy has
--an instance of the cgroup virtual filesystem associated with it.
--
--At any one time there may be multiple active hierarchies of task
--cgroups. Each hierarchy is a partition of all tasks in the system.
--
--User level code may create and destroy cgroups by name in an
--instance of the cgroup virtual file system, specify and query to
--which cgroup a task is assigned, and list the task pids assigned to
--a cgroup. Those creations and assignments only affect the hierarchy
--associated with that instance of the cgroup file system.
--
--On their own, the only use for cgroups is for simple job
--tracking. The intention is that other subsystems hook into the generic
--cgroup support to provide new attributes for cgroups, such as
--accounting/limiting the resources which processes in a cgroup can
--access. For example, cpusets (see Documentation/cpusets.txt) allows
--you to associate a set of CPUs and a set of memory nodes with the
--tasks in each cgroup.
--
--1.2 Why are cgroups needed ?
------------------------------
--
--There are multiple efforts to provide process aggregations in the
--Linux kernel, mainly for resource tracking purposes. Such efforts
--include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
--namespaces. These all require the basic notion of a
--grouping/partitioning of processes, with newly forked processes ending
--in the same group (cgroup) as their parent process.
--
--The kernel cgroup patch provides the minimum essential kernel
--mechanisms required to efficiently implement such groups. It has
--minimal impact on the system fast paths, and provides hooks for
--specific subsystems such as cpusets to provide additional behaviour as
--desired.
--
--Multiple hierarchy support is provided to allow for situations where
--the division of tasks into cgroups is distinctly different for
--different subsystems - having parallel hierarchies allows each
--hierarchy to be a natural division of tasks, without having to handle
--complex combinations of tasks that would be present if several
--unrelated subsystems needed to be forced into the same tree of
--cgroups.
--
--At one extreme, each resource controller or subsystem could be in a
--separate hierarchy; at the other extreme, all subsystems
--would be attached to the same hierarchy.
--
--As an example of a scenario (originally proposed by vatsa@in.ibm.com)
--that can benefit from multiple hierarchies, consider a large
--university server with various users - students, professors, system
--tasks etc. The resource planning for this server could be along the
--following lines:
--
--       CPU :           Top cpuset
--                       /       \
--               CPUSet1         CPUSet2
--                  |              |
--               (Profs)         (Students)
--
--               In addition (system tasks) are attached to topcpuset (so
--               that they can run anywhere) with a limit of 20%
--
--       Memory : Professors (50%), students (30%), system (20%)
--
--       Disk : Prof (50%), students (30%), system (20%)
--
--       Network : WWW browsing (20%), Network File System (60%), others (20%)
--                               / \
--                       Prof (15%) students (5%)
--
--Browsers like firefox/lynx go into the WWW network class, while (k)nfsd go
--into NFS network class.
--
--At the same time firefox/lynx will share an appropriate CPU/Memory class
--depending on who launched it (prof/student).
--
--With the ability to classify tasks differently for different resources
--(by putting those resource subsystems in different hierarchies) then
--the admin can easily set up a script which receives exec notifications
--and depending on who is launching the browser he can
--
--       # echo browser_pid > /mnt/<restype>/<userclass>/tasks
--
--With only a single hierarchy, he now would potentially have to create
--a separate cgroup for every browser launched and associate it with
--the appropriate network and other resource classes.  This may lead to
--proliferation of such cgroups.
--
--Also let's say that the administrator would like to give enhanced network
--access temporarily to a student's browser (since it is night and the user
--wants to do online gaming :))  OR give one of the students simulation
--apps enhanced CPU power,
--
--With ability to write pids directly to resource classes, it's just a
--matter of :
--
--       # echo pid > /mnt/network/<new_class>/tasks
--       (after some time)
--       # echo pid > /mnt/network/<orig_class>/tasks
--
--Without this ability, he would have to split the cgroup into
--multiple separate ones and then associate the new cgroups with the
--new resource classes.
--
--
--
--1.3 How are cgroups implemented ?
-----------------------------------
--
--Control Groups extends the kernel as follows:
--
-- - Each task in the system has a reference-counted pointer to a
--   css_set.
--
-- - A css_set contains a set of reference-counted pointers to
--   cgroup_subsys_state objects, one for each cgroup subsystem
--   registered in the system. There is no direct link from a task to
--   the cgroup of which it's a member in each hierarchy, but this
--   can be determined by following pointers through the
--   cgroup_subsys_state objects. This is because accessing the
--   subsystem state is something that's expected to happen frequently
--   and in performance-critical code, whereas operations that require a
--   task's actual cgroup assignments (in particular, moving between
--   cgroups) are less common. A linked list runs through the cg_list
--   field of each task_struct using the css_set, anchored at
--   css_set->tasks.
--
-- - A cgroup hierarchy filesystem can be mounted  for browsing and
--   manipulation from user space.
--
-- - You can list all the tasks (by pid) attached to any cgroup.
--
--The implementation of cgroups requires a few, simple hooks
--into the rest of the kernel, none in performance critical paths:
--
-- - in init/main.c, to initialize the root cgroups and initial
--   css_set at system boot.
--
-- - in fork and exit, to attach and detach a task from its css_set.
--
--In addition a new file system, of type "cgroup" may be mounted, to
--enable browsing and modifying the cgroups presently known to the
--kernel.  When mounting a cgroup hierarchy, you may specify a
--comma-separated list of subsystems to mount as the filesystem mount
--options.  By default, mounting the cgroup filesystem attempts to
--mount a hierarchy containing all registered subsystems.
--
--If an active hierarchy with exactly the same set of subsystems already
--exists, it will be reused for the new mount. If no existing hierarchy
--matches, and any of the requested subsystems are in use in an existing
--hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
--is activated, associated with the requested subsystems.
--
--It's not currently possible to bind a new subsystem to an active
--cgroup hierarchy, or to unbind a subsystem from an active cgroup
--hierarchy. This may be possible in future, but is fraught with nasty
--error-recovery issues.
--
--When a cgroup filesystem is unmounted, if there are any
--child cgroups created below the top-level cgroup, that hierarchy
--will remain active even though unmounted; if there are no
--child cgroups then the hierarchy will be deactivated.
--
--No new system calls are added for cgroups - all support for
--querying and modifying cgroups is via this cgroup file system.
--
--Each task under /proc has an added file named 'cgroup' displaying,
--for each active hierarchy, the subsystem names and the cgroup name
--as the path relative to the root of the cgroup file system.
--
--Each cgroup is represented by a directory in the cgroup file system
--containing the following files describing that cgroup:
--
-- - tasks: list of tasks (by pid) attached to that cgroup
-- - releasable flag: cgroup currently removable?
-- - notify_on_release flag: run the release agent on exit?
-- - release_agent: the path to use for release notifications (this file
--   exists in the top cgroup only)
--
--Other subsystems such as cpusets may add additional files in each
--cgroup dir.
--
--New cgroups are created using the mkdir system call or shell
--command.  The properties of a cgroup, such as its flags, are
--modified by writing to the appropriate file in that cgroup's
--directory, as listed above.
--
--The named hierarchical structure of nested cgroups allows partitioning
--a large system into nested, dynamically changeable, "soft-partitions".
--
--The attachment of each task, automatically inherited at fork by any
--children of that task, to a cgroup allows organizing the work load
--on a system into related sets of tasks.  A task may be re-attached to
--any other cgroup, if allowed by the permissions on the necessary
--cgroup file system directories.
--
--When a task is moved from one cgroup to another, it gets a new
--css_set pointer - if there's an already existing css_set with the
--desired collection of cgroups then that group is reused, else a new
--css_set is allocated. Note that the current implementation uses a
--linear search to locate an appropriate existing css_set, so isn't
--very efficient. A future version will use a hash table for better
--performance.
--
--To allow access from a cgroup to the css_sets (and hence tasks)
--that comprise it, a set of cg_cgroup_link objects form a lattice;
--each cg_cgroup_link is linked into a list of cg_cgroup_links for
--a single cgroup on its cgrp_link_list field, and a list of
--cg_cgroup_links for a single css_set on its cg_link_list.
--
--Thus the set of tasks in a cgroup can be listed by iterating over
--each css_set that references the cgroup, and sub-iterating over
--each css_set's task set.
--
--The use of a Linux virtual file system (vfs) to represent the
--cgroup hierarchy provides for a familiar permission and name space
--for cgroups, with a minimum of additional kernel code.
--
--1.4 What does notify_on_release do ?
--------------------------------------
--
--If the notify_on_release flag is enabled (1) in a cgroup, then
--whenever the last task in the cgroup leaves (exits or attaches to
--some other cgroup) and the last child cgroup of that cgroup
--is removed, then the kernel runs the command specified by the contents
--of the "release_agent" file in that hierarchy's root directory,
--supplying the pathname (relative to the mount point of the cgroup
--file system) of the abandoned cgroup.  This enables automatic
--removal of abandoned cgroups.  The default value of
--notify_on_release in the root cgroup at system boot is disabled
--(0).  The default value of other cgroups at creation is the current
--value of their parent's notify_on_release setting. The default value of
--a cgroup hierarchy's release_agent path is empty.
--
--1.5 How do I use cgroups ?
----------------------------
--
--To start a new job that is to be contained within a cgroup, using
--the "cpuset" cgroup subsystem, the steps are something like:
--
-- 1) mkdir /dev/cgroup
-- 2) mount -t cgroup -ocpuset cpuset /dev/cgroup
-- 3) Create the new cgroup by doing mkdir's and write's (or echo's) in
--    the /dev/cgroup virtual file system.
-- 4) Start a task that will be the "founding father" of the new job.
-- 5) Attach that task to the new cgroup by writing its pid to the
--    /dev/cgroup tasks file for that cgroup.
-- 6) fork, exec or clone the job tasks from this founding father task.
--
--For example, the following sequence of commands will setup a cgroup
--named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
--and then start a subshell 'sh' in that cgroup:
--
--  mount -t cgroup cpuset -ocpuset /dev/cgroup
--  cd /dev/cgroup
--  mkdir Charlie
--  cd Charlie
--  /bin/echo 2-3 > cpuset.cpus
--  /bin/echo 1 > cpuset.mems
--  /bin/echo $$ > tasks
--  sh
--  # The subshell 'sh' is now running in cgroup Charlie
--  # The next line should display '/Charlie'
--  cat /proc/self/cgroup
--
--2. Usage Examples and Syntax
--============================
--
--2.1 Basic Usage
-----------------
--
--Creating, modifying, using the cgroups can be done through the cgroup
--virtual filesystem.
--
--To mount a cgroup hierarchy with all available subsystems, type:
--# mount -t cgroup xxx /dev/cgroup
--
--The "xxx" is not interpreted by the cgroup code, but will appear in
--/proc/mounts so may be any useful identifying string that you like.
--
--To mount a cgroup hierarchy with just the cpuset and numtasks
--subsystems, type:
--# mount -t cgroup -o cpuset,numtasks hier1 /dev/cgroup
--
--To change the set of subsystems bound to a mounted hierarchy, just
--remount with different options:
--
--# mount -o remount,cpuset,ns  /dev/cgroup
--
--Note that changing the set of subsystems is currently only supported
--when the hierarchy consists of a single (root) cgroup. Supporting
--the ability to arbitrarily bind/unbind subsystems from an existing
--cgroup hierarchy is intended to be implemented in the future.
--
--Then under /dev/cgroup you can find a tree that corresponds to the
--tree of the cgroups in the system. For instance, /dev/cgroup
--is the cgroup that holds the whole system.
--
--If you want to create a new cgroup under /dev/cgroup:
--# cd /dev/cgroup
--# mkdir my_cgroup
--
--Now you want to do something with this cgroup.
--# cd my_cgroup
--
--In this directory you can find several files:
--# ls
--notify_on_release releasable tasks
--(plus whatever files added by the attached subsystems)
--
--Now attach your shell to this cgroup:
--# /bin/echo $$ > tasks
--
--You can also create cgroups inside your cgroup by using mkdir in this
--directory.
--# mkdir my_sub_cs
--
--To remove a cgroup, just use rmdir:
--# rmdir my_sub_cs
--
--This will fail if the cgroup is in use (has cgroups inside, or
--has processes attached, or is held alive by other subsystem-specific
--reference).
--
--2.2 Attaching processes
-------------------------
--
--# /bin/echo PID > tasks
--
--Note that it is PID, not PIDs. You can only attach ONE task at a time.
--If you have several tasks to attach, you have to do it one after another:
--
--# /bin/echo PID1 > tasks
--# /bin/echo PID2 > tasks
--      ...
--# /bin/echo PIDn > tasks
--
--You can attach the current shell task by echoing 0:
--
--# echo 0 > tasks
--
--3. Kernel API
--=============
--
--3.1 Overview
--------------
--
--Each kernel subsystem that wants to hook into the generic cgroup
--system needs to create a cgroup_subsys object. This contains
--various methods, which are callbacks from the cgroup system, along
--with a subsystem id which will be assigned by the cgroup system.
--
--Other fields in the cgroup_subsys object include:
--
--- subsys_id: a unique array index for the subsystem, indicating which
--  entry in cgroup->subsys[] this subsystem should be managing.
--
--- name: should be initialized to a unique subsystem name. Should be
--  no longer than MAX_CGROUP_TYPE_NAMELEN.
--
--- early_init: indicate if the subsystem needs early initialization
--  at system boot.
--
--Each cgroup object created by the system has an array of pointers,
--indexed by subsystem id; this pointer is entirely managed by the
--subsystem; the generic cgroup code will never touch this pointer.
--
--3.2 Synchronization
---------------------
--
--There is a global mutex, cgroup_mutex, used by the cgroup
--system. This should be taken by anything that wants to modify a
--cgroup. It may also be taken to prevent cgroups from being
--modified, but more specific locks may be more appropriate in that
--situation.
--
--See kernel/cgroup.c for more details.
--
--Subsystems can take/release the cgroup_mutex via the functions
--cgroup_lock()/cgroup_unlock().
--
--Accessing a task's cgroup pointer may be done in the following ways:
--- while holding cgroup_mutex
--- while holding the task's alloc_lock (via task_lock())
--- inside an rcu_read_lock() section via rcu_dereference()
--
--3.3 Subsystem API
-------------------
--
--Each subsystem should:
--
--- add an entry in linux/cgroup_subsys.h
--- define a cgroup_subsys object called <name>_subsys
--
--Each subsystem may export the following methods. The only mandatory
--methods are create/destroy. Any others that are null are presumed to
--be successful no-ops.
--
--struct cgroup_subsys_state *create(struct cgroup_subsys *ss,
--                                 struct cgroup *cgrp)
--(cgroup_mutex held by caller)
--
--Called to create a subsystem state object for a cgroup. The
--subsystem should allocate its subsystem state object for the passed
--cgroup, returning a pointer to the new object on success or a
--negative error code. On success, the subsystem pointer should point to
--a structure of type cgroup_subsys_state (typically embedded in a
--larger subsystem-specific object), which will be initialized by the
--cgroup system. Note that this will be called at initialization to
--create the root subsystem state for this subsystem; this case can be
--identified by the passed cgroup object having a NULL parent (since
--it's the root of the hierarchy) and may be an appropriate place for
--initialization code.
--
--void destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
--(cgroup_mutex held by caller)
--
--The cgroup system is about to destroy the passed cgroup; the subsystem
--should do any necessary cleanup and free its subsystem state
--object. By the time this method is called, the cgroup has already been
--unlinked from the file system and from the child list of its parent;
--cgroup->parent is still valid. (Note - can also be called for a
--newly-created cgroup if an error occurs after this subsystem's
--create() method has been called for the new cgroup).
--
--void pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
--(cgroup_mutex held by caller)
--
--Called before checking the reference count on each subsystem. This may
--be useful for subsystems which have some extra references even if
--there are no tasks in the cgroup.
--
--int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
--             struct task_struct *task)
--(cgroup_mutex held by caller)
--
--Called prior to moving a task into a cgroup; if the subsystem
--returns an error, this will abort the attach operation.  If a NULL
--task is passed, then a successful result indicates that *any*
--unspecified task can be moved into the cgroup. Note that this isn't
--called on a fork. If this method returns 0 (success) then this should
--remain valid while the caller holds cgroup_mutex.
--
--void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
--          struct cgroup *old_cgrp, struct task_struct *task)
--
--Called after the task has been attached to the cgroup, to allow any
--post-attachment activity that requires memory allocations or blocking.
--
--void fork(struct cgroup_subsys *ss, struct task_struct *task)
--
--Called when a task is forked into a cgroup.
--
--void exit(struct cgroup_subsys *ss, struct task_struct *task)
--
--Called during task exit.
--
--int populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
--
--Called after creation of a cgroup to allow a subsystem to populate
--the cgroup directory with file entries.  The subsystem should make
--calls to cgroup_add_file() with objects of type cftype (see
--include/linux/cgroup.h for details).  Note that although this
--method can return an error code, the error code is currently not
--always handled well.
--
--void post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp)
--
--Called at the end of cgroup_clone() to do any parameter
--initialization which might be required before a task could attach.  For
--example in cpusets, no task may attach before 'cpus' and 'mems' are set
--up.
--
--void bind(struct cgroup_subsys *ss, struct cgroup *root)
--(cgroup_mutex held by caller)
--
--Called when a cgroup subsystem is rebound to a different hierarchy
--and root cgroup. Currently this will only involve movement between
--the default hierarchy (which never has sub-cgroups) and a hierarchy
--that is being created/destroyed (and hence has no sub-cgroups).
--
--4. Questions
--============
--
--Q: what's up with this '/bin/echo' ?
--A: bash's builtin 'echo' command does not check calls to write() against
--   errors. If you use it in the cgroup file system, you won't be
--   able to tell whether a command succeeded or failed.
--
--Q: When I attach processes, only the first one on the line really gets attached !
--A: We can only return one error code per call to write(). So you should also
--   put only ONE pid.
--
---- /dev/null
-+++ b/Documentation/cgroups/cgroups.txt
-@@ -0,0 +1,548 @@
-+                              CGROUPS
-+                              -------
-+
-+Written by Paul Menage <menage@google.com> based on Documentation/cpusets.txt
-+
-+Original copyright statements from cpusets.txt:
-+Portions Copyright (C) 2004 BULL SA.
-+Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
-+Modified by Paul Jackson <pj@sgi.com>
-+Modified by Christoph Lameter <clameter@sgi.com>
-+
-+CONTENTS:
-+=========
-+
-+1. Control Groups
-+  1.1 What are cgroups ?
-+  1.2 Why are cgroups needed ?
-+  1.3 How are cgroups implemented ?
-+  1.4 What does notify_on_release do ?
-+  1.5 How do I use cgroups ?
-+2. Usage Examples and Syntax
-+  2.1 Basic Usage
-+  2.2 Attaching processes
-+3. Kernel API
-+  3.1 Overview
-+  3.2 Synchronization
-+  3.3 Subsystem API
-+4. Questions
-+
-+1. Control Groups
-+=================
-+
-+1.1 What are cgroups ?
-+----------------------
-+
-+Control Groups provide a mechanism for aggregating/partitioning sets of
-+tasks, and all their future children, into hierarchical groups with
-+specialized behaviour.
-+
-+Definitions:
-+
-+A *cgroup* associates a set of tasks with a set of parameters for one
-+or more subsystems.
-+
-+A *subsystem* is a module that makes use of the task grouping
-+facilities provided by cgroups to treat groups of tasks in
-+particular ways. A subsystem is typically a "resource controller" that
-+schedules a resource or applies per-cgroup limits, but it may be
-+anything that wants to act on a group of processes, e.g. a
-+virtualization subsystem.
-+
-+A *hierarchy* is a set of cgroups arranged in a tree, such that
-+every task in the system is in exactly one of the cgroups in the
-+hierarchy, and a set of subsystems; each subsystem has system-specific
-+state attached to each cgroup in the hierarchy.  Each hierarchy has
-+an instance of the cgroup virtual filesystem associated with it.
-+
-+At any one time there may be multiple active hierarchies of task
-+cgroups. Each hierarchy is a partition of all tasks in the system.
-+
-+User level code may create and destroy cgroups by name in an
-+instance of the cgroup virtual file system, specify and query to
-+which cgroup a task is assigned, and list the task pids assigned to
-+a cgroup. Those creations and assignments only affect the hierarchy
-+associated with that instance of the cgroup file system.
-+
-+On their own, the only use for cgroups is for simple job
-+tracking. The intention is that other subsystems hook into the generic
-+cgroup support to provide new attributes for cgroups, such as
-+accounting/limiting the resources which processes in a cgroup can
-+access. For example, cpusets (see Documentation/cpusets.txt) allows
-+you to associate a set of CPUs and a set of memory nodes with the
-+tasks in each cgroup.
-+
-+1.2 Why are cgroups needed ?
-+----------------------------
-+
-+There are multiple efforts to provide process aggregations in the
-+Linux kernel, mainly for resource tracking purposes. Such efforts
-+include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
-+namespaces. These all require the basic notion of a
-+grouping/partitioning of processes, with newly forked processes ending
-+in the same group (cgroup) as their parent process.
-+
-+The kernel cgroup patch provides the minimum essential kernel
-+mechanisms required to efficiently implement such groups. It has
-+minimal impact on the system fast paths, and provides hooks for
-+specific subsystems such as cpusets to provide additional behaviour as
-+desired.
-+
-+Multiple hierarchy support is provided to allow for situations where
-+the division of tasks into cgroups is distinctly different for
-+different subsystems - having parallel hierarchies allows each
-+hierarchy to be a natural division of tasks, without having to handle
-+complex combinations of tasks that would be present if several
-+unrelated subsystems needed to be forced into the same tree of
-+cgroups.
-+
-+At one extreme, each resource controller or subsystem could be in a
-+separate hierarchy; at the other extreme, all subsystems
-+would be attached to the same hierarchy.
-+
-+As an example of a scenario (originally proposed by vatsa@in.ibm.com)
-+that can benefit from multiple hierarchies, consider a large
-+university server with various users - students, professors, system
-+tasks etc. The resource planning for this server could be along the
-+following lines:
-+
-+       CPU :           Top cpuset
-+                       /       \
-+               CPUSet1         CPUSet2
-+                  |              |
-+               (Profs)         (Students)
-+
-+               In addition (system tasks) are attached to topcpuset (so
-+               that they can run anywhere) with a limit of 20%
-+
-+       Memory : Professors (50%), students (30%), system (20%)
-+
-+       Disk : Prof (50%), students (30%), system (20%)
-+
-+       Network : WWW browsing (20%), Network File System (60%), others (20%)
-+                               / \
-+                       Prof (15%) students (5%)
-+
-+Browsers like firefox/lynx go into the WWW network class, while (k)nfsd go
-+into NFS network class.
-+
-+At the same time firefox/lynx will share an appropriate CPU/Memory class
-+depending on who launched it (prof/student).
-+
-+With the ability to classify tasks differently for different resources
-+(by putting those resource subsystems in different hierarchies) then
-+the admin can easily set up a script which receives exec notifications
-+and depending on who is launching the browser he can
-+
-+       # echo browser_pid > /mnt/<restype>/<userclass>/tasks
-+
-+With only a single hierarchy, he now would potentially have to create
-+a separate cgroup for every browser launched and associate it with
-+the appropriate network and other resource classes.  This may lead to
-+proliferation of such cgroups.
-+
-+Also let's say that the administrator would like to give enhanced network
-+access temporarily to a student's browser (since it is night and the user
-+wants to do online gaming :))  OR give one of the students simulation
-+apps enhanced CPU power,
-+
-+With ability to write pids directly to resource classes, it's just a
-+matter of :
-+
-+       # echo pid > /mnt/network/<new_class>/tasks
-+       (after some time)
-+       # echo pid > /mnt/network/<orig_class>/tasks
-+
-+Without this ability, he would have to split the cgroup into
-+multiple separate ones and then associate the new cgroups with the
-+new resource classes.
-+
-+
-+
-+1.3 How are cgroups implemented ?
-+---------------------------------
-+
-+Control Groups extends the kernel as follows:
-+
-+ - Each task in the system has a reference-counted pointer to a
-+   css_set.
-+
-+ - A css_set contains a set of reference-counted pointers to
-+   cgroup_subsys_state objects, one for each cgroup subsystem
-+   registered in the system. There is no direct link from a task to
-+   the cgroup of which it's a member in each hierarchy, but this
-+   can be determined by following pointers through the
-+   cgroup_subsys_state objects. This is because accessing the
-+   subsystem state is something that's expected to happen frequently
-+   and in performance-critical code, whereas operations that require a
-+   task's actual cgroup assignments (in particular, moving between
-+   cgroups) are less common. A linked list runs through the cg_list
-+   field of each task_struct using the css_set, anchored at
-+   css_set->tasks.
-+
-+ - A cgroup hierarchy filesystem can be mounted  for browsing and
-+   manipulation from user space.
-+
-+ - You can list all the tasks (by pid) attached to any cgroup.
-+
-+The implementation of cgroups requires a few, simple hooks
-+into the rest of the kernel, none in performance critical paths:
-+
-+ - in init/main.c, to initialize the root cgroups and initial
-+   css_set at system boot.
-+
-+ - in fork and exit, to attach and detach a task from its css_set.
-+
-+In addition a new file system, of type "cgroup" may be mounted, to
-+enable browsing and modifying the cgroups presently known to the
-+kernel.  When mounting a cgroup hierarchy, you may specify a
-+comma-separated list of subsystems to mount as the filesystem mount
-+options.  By default, mounting the cgroup filesystem attempts to
-+mount a hierarchy containing all registered subsystems.
-+
-+If an active hierarchy with exactly the same set of subsystems already
-+exists, it will be reused for the new mount. If no existing hierarchy
-+matches, and any of the requested subsystems are in use in an existing
-+hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
-+is activated, associated with the requested subsystems.
-+
-+It's not currently possible to bind a new subsystem to an active
-+cgroup hierarchy, or to unbind a subsystem from an active cgroup
-+hierarchy. This may be possible in future, but is fraught with nasty
-+error-recovery issues.
-+
-+When a cgroup filesystem is unmounted, if there are any
-+child cgroups created below the top-level cgroup, that hierarchy
-+will remain active even though unmounted; if there are no
-+child cgroups then the hierarchy will be deactivated.
-+
-+No new system calls are added for cgroups - all support for
-+querying and modifying cgroups is via this cgroup file system.
-+
-+Each task under /proc has an added file named 'cgroup' displaying,
-+for each active hierarchy, the subsystem names and the cgroup name
-+as the path relative to the root of the cgroup file system.
-+
-+Each cgroup is represented by a directory in the cgroup file system
-+containing the following files describing that cgroup:
-+
-+ - tasks: list of tasks (by pid) attached to that cgroup
-+ - releasable flag: cgroup currently removable?
-+ - notify_on_release flag: run the release agent on exit?
-+ - release_agent: the path to use for release notifications (this file
-+   exists in the top cgroup only)
-+
-+Other subsystems such as cpusets may add additional files in each
-+cgroup dir.
-+
-+New cgroups are created using the mkdir system call or shell
-+command.  The properties of a cgroup, such as its flags, are
-+modified by writing to the appropriate file in that cgroup's
-+directory, as listed above.
-+
-+The named hierarchical structure of nested cgroups allows partitioning
-+a large system into nested, dynamically changeable, "soft-partitions".
-+
-+The attachment of each task, automatically inherited at fork by any
-+children of that task, to a cgroup allows organizing the work load
-+on a system into related sets of tasks.  A task may be re-attached to
-+any other cgroup, if allowed by the permissions on the necessary
-+cgroup file system directories.
-+
-+When a task is moved from one cgroup to another, it gets a new
-+css_set pointer - if there's an already existing css_set with the
-+desired collection of cgroups then that group is reused, else a new
-+css_set is allocated. Note that the current implementation uses a
-+linear search to locate an appropriate existing css_set, so isn't
-+very efficient. A future version will use a hash table for better
-+performance.
-+
-+To allow access from a cgroup to the css_sets (and hence tasks)
-+that comprise it, a set of cg_cgroup_link objects form a lattice;
-+each cg_cgroup_link is linked into a list of cg_cgroup_links for
-+a single cgroup on its cgrp_link_list field, and a list of
-+cg_cgroup_links for a single css_set on its cg_link_list.
-+
-+Thus the set of tasks in a cgroup can be listed by iterating over
-+each css_set that references the cgroup, and sub-iterating over
-+each css_set's task set.
-+
-+The use of a Linux virtual file system (vfs) to represent the
-+cgroup hierarchy provides for a familiar permission and name space
-+for cgroups, with a minimum of additional kernel code.
-+
-+1.4 What does notify_on_release do ?
-+------------------------------------
-+
-+If the notify_on_release flag is enabled (1) in a cgroup, then
-+whenever the last task in the cgroup leaves (exits or attaches to
-+some other cgroup) and the last child cgroup of that cgroup
-+is removed, then the kernel runs the command specified by the contents
-+of the "release_agent" file in that hierarchy's root directory,
-+supplying the pathname (relative to the mount point of the cgroup
-+file system) of the abandoned cgroup.  This enables automatic
-+removal of abandoned cgroups.  The default value of
-+notify_on_release in the root cgroup at system boot is disabled
-+(0).  The default value of other cgroups at creation is the current
-+value of their parent's notify_on_release setting. The default value of
-+a cgroup hierarchy's release_agent path is empty.
-+
-+1.5 How do I use cgroups ?
-+--------------------------
-+
-+To start a new job that is to be contained within a cgroup, using
-+the "cpuset" cgroup subsystem, the steps are something like:
-+
-+ 1) mkdir /dev/cgroup
-+ 2) mount -t cgroup -ocpuset cpuset /dev/cgroup
-+ 3) Create the new cgroup by doing mkdir's and write's (or echo's) in
-+    the /dev/cgroup virtual file system.
-+ 4) Start a task that will be the "founding father" of the new job.
-+ 5) Attach that task to the new cgroup by writing its pid to the
-+    /dev/cgroup tasks file for that cgroup.
-+ 6) fork, exec or clone the job tasks from this founding father task.
-+
-+For example, the following sequence of commands will setup a cgroup
-+named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
-+and then start a subshell 'sh' in that cgroup:
-+
-+  mount -t cgroup cpuset -ocpuset /dev/cgroup
-+  cd /dev/cgroup
-+  mkdir Charlie
-+  cd Charlie
-+  /bin/echo 2-3 > cpuset.cpus
-+  /bin/echo 1 > cpuset.mems
-+  /bin/echo $$ > tasks
-+  sh
-+  # The subshell 'sh' is now running in cgroup Charlie
-+  # The next line should display '/Charlie'
-+  cat /proc/self/cgroup
-+
-+2. Usage Examples and Syntax
-+============================
-+
-+2.1 Basic Usage
-+---------------
-+
-+Creating, modifying, using the cgroups can be done through the cgroup
-+virtual filesystem.
-+
-+To mount a cgroup hierarchy with all available subsystems, type:
-+# mount -t cgroup xxx /dev/cgroup
-+
-+The "xxx" is not interpreted by the cgroup code, but will appear in
-+/proc/mounts so may be any useful identifying string that you like.
-+
-+To mount a cgroup hierarchy with just the cpuset and numtasks
-+subsystems, type:
-+# mount -t cgroup -o cpuset,numtasks hier1 /dev/cgroup
-+
-+To change the set of subsystems bound to a mounted hierarchy, just
-+remount with different options:
-+
-+# mount -o remount,cpuset,ns  /dev/cgroup
-+
-+Note that changing the set of subsystems is currently only supported
-+when the hierarchy consists of a single (root) cgroup. Supporting
-+the ability to arbitrarily bind/unbind subsystems from an existing
-+cgroup hierarchy is intended to be implemented in the future.
-+
-+Then under /dev/cgroup you can find a tree that corresponds to the
-+tree of the cgroups in the system. For instance, /dev/cgroup
-+is the cgroup that holds the whole system.
-+
-+If you want to create a new cgroup under /dev/cgroup:
-+# cd /dev/cgroup
-+# mkdir my_cgroup
-+
-+Now you want to do something with this cgroup.
-+# cd my_cgroup
-+
-+In this directory you can find several files:
-+# ls
-+notify_on_release releasable tasks
-+(plus whatever files added by the attached subsystems)
-+
-+Now attach your shell to this cgroup:
-+# /bin/echo $$ > tasks
-+
-+You can also create cgroups inside your cgroup by using mkdir in this
-+directory.
-+# mkdir my_sub_cs
-+
-+To remove a cgroup, just use rmdir:
-+# rmdir my_sub_cs
-+
-+This will fail if the cgroup is in use (has cgroups inside, or
-+has processes attached, or is held alive by other subsystem-specific
-+reference).
-+
-+2.2 Attaching processes
-+-----------------------
-+
-+# /bin/echo PID > tasks
-+
-+Note that it is PID, not PIDs. You can only attach ONE task at a time.
-+If you have several tasks to attach, you have to do it one after another:
-+
-+# /bin/echo PID1 > tasks
-+# /bin/echo PID2 > tasks
-+      ...
-+# /bin/echo PIDn > tasks
-+
-+You can attach the current shell task by echoing 0:
-+
-+# echo 0 > tasks
-+
-+3. Kernel API
-+=============
-+
-+3.1 Overview
-+------------
-+
-+Each kernel subsystem that wants to hook into the generic cgroup
-+system needs to create a cgroup_subsys object. This contains
-+various methods, which are callbacks from the cgroup system, along
-+with a subsystem id which will be assigned by the cgroup system.
-+
-+Other fields in the cgroup_subsys object include:
-+
-+- subsys_id: a unique array index for the subsystem, indicating which
-+  entry in cgroup->subsys[] this subsystem should be managing.
-+
-+- name: should be initialized to a unique subsystem name. Should be
-+  no longer than MAX_CGROUP_TYPE_NAMELEN.
-+
-+- early_init: indicate if the subsystem needs early initialization
-+  at system boot.
-+
-+Each cgroup object created by the system has an array of pointers,
-+indexed by subsystem id; this pointer is entirely managed by the
-+subsystem; the generic cgroup code will never touch this pointer.
-+
-+3.2 Synchronization
-+-------------------
-+
-+There is a global mutex, cgroup_mutex, used by the cgroup
-+system. This should be taken by anything that wants to modify a
-+cgroup. It may also be taken to prevent cgroups from being
-+modified, but more specific locks may be more appropriate in that
-+situation.
-+
-+See kernel/cgroup.c for more details.
-+
-+Subsystems can take/release the cgroup_mutex via the functions
-+cgroup_lock()/cgroup_unlock().
-+
-+Accessing a task's cgroup pointer may be done in the following ways:
-+- while holding cgroup_mutex
-+- while holding the task's alloc_lock (via task_lock())
-+- inside an rcu_read_lock() section via rcu_dereference()
-+
-+3.3 Subsystem API
-+-----------------
-+
-+Each subsystem should:
-+
-+- add an entry in linux/cgroup_subsys.h
-+- define a cgroup_subsys object called <name>_subsys
-+
-+Each subsystem may export the following methods. The only mandatory
-+methods are create/destroy. Any others that are null are presumed to
-+be successful no-ops.
-+
-+struct cgroup_subsys_state *create(struct cgroup_subsys *ss,
-+                                 struct cgroup *cgrp)
-+(cgroup_mutex held by caller)
-+
-+Called to create a subsystem state object for a cgroup. The
-+subsystem should allocate its subsystem state object for the passed
-+cgroup, returning a pointer to the new object on success or a
-+negative error code. On success, the subsystem pointer should point to
-+a structure of type cgroup_subsys_state (typically embedded in a
-+larger subsystem-specific object), which will be initialized by the
-+cgroup system. Note that this will be called at initialization to
-+create the root subsystem state for this subsystem; this case can be
-+identified by the passed cgroup object having a NULL parent (since
-+it's the root of the hierarchy) and may be an appropriate place for
-+initialization code.
-+
-+void destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
-+(cgroup_mutex held by caller)
-+
-+The cgroup system is about to destroy the passed cgroup; the subsystem
-+should do any necessary cleanup and free its subsystem state
-+object. By the time this method is called, the cgroup has already been
-+unlinked from the file system and from the child list of its parent;
-+cgroup->parent is still valid. (Note - can also be called for a
-+newly-created cgroup if an error occurs after this subsystem's
-+create() method has been called for the new cgroup).
-+
-+void pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
-+(cgroup_mutex held by caller)
-+
-+Called before checking the reference count on each subsystem. This may
-+be useful for subsystems which have some extra references even if
-+there are no tasks in the cgroup.
-+
-+int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-+             struct task_struct *task)
-+(cgroup_mutex held by caller)
-+
-+Called prior to moving a task into a cgroup; if the subsystem
-+returns an error, this will abort the attach operation.  If a NULL
-+task is passed, then a successful result indicates that *any*
-+unspecified task can be moved into the cgroup. Note that this isn't
-+called on a fork. If this method returns 0 (success) then this should
-+remain valid while the caller holds cgroup_mutex.
-+
-+void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-+          struct cgroup *old_cgrp, struct task_struct *task)
-+
-+Called after the task has been attached to the cgroup, to allow any
-+post-attachment activity that requires memory allocations or blocking.
-+
-+void fork(struct cgroup_subsys *ss, struct task_struct *task)
-+
-+Called when a task is forked into a cgroup.
-+
-+void exit(struct cgroup_subsys *ss, struct task_struct *task)
-+
-+Called during task exit.
-+
-+int populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
-+
-+Called after creation of a cgroup to allow a subsystem to populate
-+the cgroup directory with file entries.  The subsystem should make
-+calls to cgroup_add_file() with objects of type cftype (see
-+include/linux/cgroup.h for details).  Note that although this
-+method can return an error code, the error code is currently not
-+always handled well.
-+
-+void post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp)
-+
-+Called at the end of cgroup_clone() to do any parameter
-+initialization which might be required before a task could attach.  For
-+example in cpusets, no task may attach before 'cpus' and 'mems' are set
-+up.
-+
-+void bind(struct cgroup_subsys *ss, struct cgroup *root)
-+(cgroup_mutex held by caller)
-+
-+Called when a cgroup subsystem is rebound to a different hierarchy
-+and root cgroup. Currently this will only involve movement between
-+the default hierarchy (which never has sub-cgroups) and a hierarchy
-+that is being created/destroyed (and hence has no sub-cgroups).
-+
-+4. Questions
-+============
-+
-+Q: what's up with this '/bin/echo' ?
-+A: bash's builtin 'echo' command does not check calls to write() against
-+   errors. If you use it in the cgroup file system, you won't be
-+   able to tell whether a command succeeded or failed.
-+
-+Q: When I attach processes, only the first of the line gets really attached !
-+A: We can only return one error code per call to write(). So you should also
-+   put only ONE pid.
-+
---- /dev/null
-+++ b/Documentation/cgroups/freezer-subsystem.txt
-@@ -0,0 +1,102 @@
-+The cgroup freezer is useful to batch job management systems which start
-+and stop sets of tasks in order to schedule the resources of a machine
-+according to the desires of a system administrator. This sort of program
-+is often used on HPC clusters to schedule access to the cluster as a
-+whole. The cgroup freezer uses cgroups to describe the set of tasks to
-+be started/stopped by the batch job management system. It also provides
-+a means to start and stop the tasks composing the job.
-+
-+The cgroup freezer will also be useful for checkpointing running groups
-+of tasks. The freezer allows the checkpoint code to obtain a consistent
-+image of the tasks by attempting to force the tasks in a cgroup into a
-+quiescent state. Once the tasks are quiescent another task can
-+walk /proc or invoke a kernel interface to gather information about the
-+quiesced tasks. Checkpointed tasks can be restarted later should a
-+recoverable error occur. This also allows the checkpointed tasks to be
-+migrated between nodes in a cluster by copying the gathered information
-+to another node and restarting the tasks there.
-+
-+Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping
-+and resuming tasks in userspace. Both of these signals are observable
-+from within the tasks we wish to freeze. While SIGSTOP cannot be caught,
-+blocked, or ignored it can be seen by waiting or ptracing parent tasks.
-+SIGCONT is especially unsuitable since it can be caught by the task. Any
-+programs designed to watch for SIGSTOP and SIGCONT could be broken by
-+attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can
-+demonstrate this problem using nested bash shells:
-+
-+      $ echo $$
-+      16644
-+      $ bash
-+      $ echo $$
-+      16690
-+
-+      From a second, unrelated bash shell:
-+      $ kill -SIGSTOP 16690
-+      $ kill -SIGCONT 16690
-+
-+      <at this point 16690 exits and causes 16644 to exit too>
-+
-+This happens because bash can observe both signals and choose how it
-+responds to them.
-+
-+Another example of a program which catches and responds to these
-+signals is gdb. In fact any program designed to use ptrace is likely to
-+have a problem with this method of stopping and resuming tasks.
-+
-+In contrast, the cgroup freezer uses the kernel freezer code to
-+prevent the freeze/unfreeze cycle from becoming visible to the tasks
-+being frozen. This allows the bash example above and gdb to run as
-+expected.
-+
-+The freezer subsystem in the container filesystem defines a file named
-+freezer.state. Writing "FROZEN" to the state file will freeze all tasks in the
-+cgroup. Subsequently writing "THAWED" will unfreeze the tasks in the cgroup.
-+Reading will return the current state.
-+
-+Note freezer.state doesn't exist in root cgroup, which means root cgroup
-+is non-freezable.
-+
-+* Examples of usage :
-+
-+   # mkdir /containers
-+   # mount -t cgroup -ofreezer freezer  /containers
-+   # mkdir /containers/0
-+   # echo $some_pid > /containers/0/tasks
-+
-+to get status of the freezer subsystem :
-+
-+   # cat /containers/0/freezer.state
-+   THAWED
-+
-+to freeze all tasks in the container :
-+
-+   # echo FROZEN > /containers/0/freezer.state
-+   # cat /containers/0/freezer.state
-+   FREEZING
-+   # cat /containers/0/freezer.state
-+   FROZEN
-+
-+to unfreeze all tasks in the container :
-+
-+   # echo THAWED > /containers/0/freezer.state
-+   # cat /containers/0/freezer.state
-+   THAWED
-+
-+This is the basic mechanism which should do the right thing for user space tasks
-+in a simple scenario.
-+
-+It's important to note that freezing can be incomplete. In that case we return
-+EBUSY. This means that some tasks in the cgroup are busy doing something that
-+prevents us from completely freezing the cgroup at this time. After EBUSY,
-+the cgroup will remain partially frozen -- reflected by freezer.state reporting
-+"FREEZING" when read. The state will remain "FREEZING" until one of these
-+things happens:
-+
-+      1) Userspace cancels the freezing operation by writing "THAWED" to
-+              the freezer.state file
-+      2) Userspace retries the freezing operation by writing "FROZEN" to
-+              the freezer.state file (writing "FREEZING" is not legal
-+              and returns EINVAL)
-+      3) The tasks that blocked the cgroup from entering the "FROZEN"
-+              state disappear from the cgroup's set of tasks.
---- a/Documentation/cpusets.txt
-+++ b/Documentation/cpusets.txt
-@@ -48,7 +48,7 @@ hooks, beyond what is already present, r
- job placement on large systems.
- 
- Cpusets use the generic cgroup subsystem described in
--Documentation/cgroup.txt.
-+Documentation/cgroups/cgroups.txt.
- 
- Requests by a task, using the sched_setaffinity(2) system call to
- include CPUs in its CPU affinity mask, and using the mbind(2) and
---- a/arch/alpha/Kconfig
-+++ b/arch/alpha/Kconfig
-@@ -72,6 +72,7 @@ config ARCH_SUPPORTS_AOUT
-       def_bool y
- 
- source "init/Kconfig"
-+source "kernel/Kconfig.freezer"
- 
- 
- menu "System setup"
---- a/arch/alpha/include/asm/thread_info.h
-+++ b/arch/alpha/include/asm/thread_info.h
-@@ -74,12 +74,14 @@ register struct thread_info *__current_t
- #define TIF_UAC_SIGBUS                7
- #define TIF_MEMDIE            8
- #define TIF_RESTORE_SIGMASK   9       /* restore signal mask in do_signal */
-+#define TIF_FREEZE            16      /* is freezing for suspend */
- 
- #define _TIF_SYSCALL_TRACE    (1<<TIF_SYSCALL_TRACE)
- #define _TIF_SIGPENDING               (1<<TIF_SIGPENDING)
- #define _TIF_NEED_RESCHED     (1<<TIF_NEED_RESCHED)
- #define _TIF_POLLING_NRFLAG   (1<<TIF_POLLING_NRFLAG)
- #define _TIF_RESTORE_SIGMASK  (1<<TIF_RESTORE_SIGMASK)
-+#define _TIF_FREEZE           (1<<TIF_FREEZE)
- 
- /* Work to do on interrupt/exception return.  */
- #define _TIF_WORK_MASK                (_TIF_SIGPENDING | _TIF_NEED_RESCHED)
---- a/arch/arm/Kconfig
-+++ b/arch/arm/Kconfig
-@@ -190,6 +190,8 @@ config VECTORS_BASE
- 
- source "init/Kconfig"
- 
-+source "kernel/Kconfig.freezer"
-+
- menu "System Type"
- 
- choice
---- a/arch/avr32/Kconfig
-+++ b/arch/avr32/Kconfig
-@@ -72,6 +72,8 @@ config GENERIC_BUG
- 
- source "init/Kconfig"
- 
-+source "kernel/Kconfig.freezer"
-+
- menu "System Type and features"
- 
- source "kernel/time/Kconfig"
---- a/arch/avr32/include/asm/thread_info.h
-+++ b/arch/avr32/include/asm/thread_info.h
-@@ -96,6 +96,7 @@ static inline struct thread_info *curren
- #define _TIF_MEMDIE           (1 << TIF_MEMDIE)
- #define _TIF_RESTORE_SIGMASK  (1 << TIF_RESTORE_SIGMASK)
- #define _TIF_CPU_GOING_TO_SLEEP (1 << TIF_CPU_GOING_TO_SLEEP)
-+#define _TIF_FREEZE           (1 << TIF_FREEZE)
- 
- /* Note: The masks below must never span more than 16 bits! */
- 
---- a/arch/blackfin/Kconfig
-+++ b/arch/blackfin/Kconfig
-@@ -64,8 +64,11 @@ config HARDWARE_PM
-       depends on OPROFILE
- 
- source "init/Kconfig"
-+
- source "kernel/Kconfig.preempt"
- 
-+source "kernel/Kconfig.freezer"
-+
- menu "Blackfin Processor Options"
- 
- comment "Processor and Board Settings"
---- a/arch/cris/Kconfig
-+++ b/arch/cris/Kconfig
-@@ -62,6 +62,8 @@ config HZ
- 
- source "init/Kconfig"
- 
-+source "kernel/Kconfig.freezer"
-+
- menu "General setup"
- 
- source "fs/Kconfig.binfmt"
---- a/arch/frv/Kconfig
-+++ b/arch/frv/Kconfig
-@@ -66,6 +66,8 @@ mainmenu "Fujitsu FR-V Kernel Configurat
- 
- source "init/Kconfig"
- 
-+source "kernel/Kconfig.freezer"
-+
- 
- menu "Fujitsu FR-V system setup"
- 
---- a/arch/h8300/Kconfig
-+++ b/arch/h8300/Kconfig
-@@ -89,6 +89,8 @@ config HZ
- 
- source "init/Kconfig"
- 
-+source "kernel/Kconfig.freezer"
-+
- source "arch/h8300/Kconfig.cpu"
- 
- menu "Executable file formats"
---- a/arch/h8300/include/asm/thread_info.h
-+++ b/arch/h8300/include/asm/thread_info.h
-@@ -89,6 +89,7 @@ static inline struct thread_info *curren
-                                          TIF_NEED_RESCHED */
- #define TIF_MEMDIE            4
- #define TIF_RESTORE_SIGMASK   5       /* restore signal mask in do_signal() */
-+#define TIF_FREEZE            16      /* is freezing for suspend */
- 
- /* as above, but as bit values */
- #define _TIF_SYSCALL_TRACE    (1<<TIF_SYSCALL_TRACE)
-@@ -96,6 +97,7 @@ static inline struct thread_info *curren
- #define _TIF_NEED_RESCHED     (1<<TIF_NEED_RESCHED)
- #define _TIF_POLLING_NRFLAG   (1<<TIF_POLLING_NRFLAG)
- #define _TIF_RESTORE_SIGMASK  (1<<TIF_RESTORE_SIGMASK)
-+#define _TIF_FREEZE           (1<<TIF_FREEZE)
- 
- #define _TIF_WORK_MASK                0x0000FFFE      /* work to do on interrupt/exception return */
- 
---- a/arch/ia64/Kconfig
-+++ b/arch/ia64/Kconfig
-@@ -7,6 +7,8 @@ mainmenu "IA-64 Linux Kernel Configurati
- 
- source "init/Kconfig"
- 
-+source "kernel/Kconfig.freezer"
-+
- menu "Processor type and features"
- 
- config IA64
---- a/arch/m32r/Kconfig
-+++ b/arch/m32r/Kconfig
-@@ -45,6 +45,8 @@ config HZ
- 
- source "init/Kconfig"
- 
-+source "kernel/Kconfig.freezer"
-+
- 
- menu "Processor type and features"
- 
---- a/arch/m68k/Kconfig
-+++ b/arch/m68k/Kconfig
-@@ -64,6 +64,8 @@ mainmenu "Linux/68k Kernel Configuration
- 
- source "init/Kconfig"
- 
-+source "kernel/Kconfig.freezer"
-+
- menu "Platform dependent setup"
- 
- config EISA
---- a/arch/m68knommu/Kconfig
-+++ b/arch/m68knommu/Kconfig
-@@ -82,6 +82,8 @@ config ARCH_SUPPORTS_AOUT
- 
- source "init/Kconfig"
- 
-+source "kernel/Kconfig.freezer"
-+
- menu "Processor type and features"
- 
- choice
---- a/arch/m68knommu/include/asm/thread_info.h
-+++ b/arch/m68knommu/include/asm/thread_info.h
-@@ -84,12 +84,14 @@ static inline struct thread_info *curren
- #define TIF_POLLING_NRFLAG    3       /* true if poll_idle() is polling
-                                          TIF_NEED_RESCHED */
- #define TIF_MEMDIE            4
-+#define TIF_FREEZE            16      /* is freezing for suspend */
- 
- /* as above, but as bit values */
- #define _TIF_SYSCALL_TRACE    (1<<TIF_SYSCALL_TRACE)
- #define _TIF_SIGPENDING               (1<<TIF_SIGPENDING)
- #define _TIF_NEED_RESCHED     (1<<TIF_NEED_RESCHED)
- #define _TIF_POLLING_NRFLAG   (1<<TIF_POLLING_NRFLAG)
-+#define _TIF_FREEZE           (1<<TIF_FREEZE)
- 
- #define _TIF_WORK_MASK                0x0000FFFE      /* work to do on interrupt/exception return */
- 
---- a/arch/mips/Kconfig
-+++ b/arch/mips/Kconfig
-@@ -1885,6 +1885,8 @@ config PROBE_INITRD_HEADER
-         add initrd or initramfs image to the kernel image.
-         Otherwise, say N.
- 
-+source "kernel/Kconfig.freezer"
-+
- menu "Bus options (PCI, PCMCIA, EISA, ISA, TC)"
- 
- config HW_HAS_EISA
---- a/arch/mn10300/Kconfig
-+++ b/arch/mn10300/Kconfig
-@@ -71,6 +71,8 @@ mainmenu "Matsushita MN10300/AM33 Kernel
- 
- source "init/Kconfig"
- 
-+source "kernel/Kconfig.freezer"
-+
- 
- menu "Matsushita MN10300 system setup"
- 
---- a/arch/parisc/Kconfig
-+++ b/arch/parisc/Kconfig
-@@ -93,6 +93,8 @@ config ARCH_MAY_HAVE_PC_FDC
- 
- source "init/Kconfig"
- 
-+source "kernel/Kconfig.freezer"
-+
- 
- menu "Processor type and features"
- 
---- a/arch/powerpc/Kconfig
-+++ b/arch/powerpc/Kconfig
-@@ -228,6 +228,8 @@ config PPC_OF_PLATFORM_PCI
- 
- source "init/Kconfig"
- 
-+source "kernel/Kconfig.freezer"
-+
- source "arch/powerpc/sysdev/Kconfig"
- source "arch/powerpc/platforms/Kconfig"
- 
---- a/arch/s390/Kconfig
-+++ b/arch/s390/Kconfig
-@@ -79,6 +79,8 @@ config S390
- 
- source "init/Kconfig"
- 
-+source "kernel/Kconfig.freezer"
-+
- menu "Base setup"
- 
- comment "Processor type and features"
---- a/arch/s390/include/asm/thread_info.h
-+++ b/arch/s390/include/asm/thread_info.h
-@@ -98,6 +98,7 @@ static inline struct thread_info *curren
- #define TIF_31BIT             18      /* 32bit process */ 
- #define TIF_MEMDIE            19
- #define TIF_RESTORE_SIGMASK   20      /* restore signal mask in do_signal() */
-+#define TIF_FREEZE            21
- 
- #define _TIF_SYSCALL_TRACE    (1<<TIF_SYSCALL_TRACE)
- #define _TIF_RESTORE_SIGMASK  (1<<TIF_RESTORE_SIGMASK)
-@@ -110,6 +111,7 @@ static inline struct thread_info *curren
- #define _TIF_USEDFPU          (1<<TIF_USEDFPU)
- #define _TIF_POLLING_NRFLAG   (1<<TIF_POLLING_NRFLAG)
- #define _TIF_31BIT            (1<<TIF_31BIT)
-+#define _TIF_FREEZE           (1<<TIF_FREEZE)
- 
- #endif /* __KERNEL__ */
- 
---- a/arch/sh/Kconfig
-+++ b/arch/sh/Kconfig
-@@ -106,6 +106,8 @@ config IO_TRAPPED
- 
- source "init/Kconfig"
- 
-+source "kernel/Kconfig.freezer"
-+
- menu "System type"
- 
- #
---- a/arch/sparc/Kconfig
-+++ b/arch/sparc/Kconfig
-@@ -32,6 +32,8 @@ config HZ
- 
- source "init/Kconfig"
- 
-+source "kernel/Kconfig.freezer"
-+
- menu "General machine setup"
- 
- config SMP
---- a/arch/sparc/include/asm/thread_info_32.h
-+++ b/arch/sparc/include/asm/thread_info_32.h
-@@ -139,6 +139,7 @@ BTFIXUPDEF_CALL(void, free_thread_info,
- #define TIF_POLLING_NRFLAG    9       /* true if poll_idle() is polling
-                                        * TIF_NEED_RESCHED */
- #define TIF_MEMDIE            10
-+#define TIF_FREEZE            11      /* is freezing for suspend */
- 
- /* as above, but as bit values */
- #define _TIF_SYSCALL_TRACE    (1<<TIF_SYSCALL_TRACE)
-@@ -152,6 +153,7 @@ BTFIXUPDEF_CALL(void, free_thread_info,
- #define _TIF_DO_NOTIFY_RESUME_MASK    (_TIF_NOTIFY_RESUME | \
-                                        _TIF_SIGPENDING | \
-                                        _TIF_RESTORE_SIGMASK)
-+#define _TIF_FREEZE           (1<<TIF_FREEZE)
- 
- #endif /* __KERNEL__ */
- 
---- a/arch/sparc64/Kconfig
-+++ b/arch/sparc64/Kconfig
-@@ -85,6 +85,7 @@ config GENERIC_HARDIRQS_NO__DO_IRQ
-       def_bool y
- 
- source "init/Kconfig"
-+source "kernel/Kconfig.freezer"
- 
- menu "Processor type and features"
- 
---- a/arch/um/Kconfig
-+++ b/arch/um/Kconfig
-@@ -229,6 +229,8 @@ endmenu
- 
- source "init/Kconfig"
- 
-+source "kernel/Kconfig.freezer"
-+
- source "drivers/block/Kconfig"
- 
- source "arch/um/Kconfig.char"
---- a/arch/x86/Kconfig
-+++ b/arch/x86/Kconfig
-@@ -208,6 +208,7 @@ config X86_TRAMPOLINE
- config KTIME_SCALAR
-       def_bool X86_32
- source "init/Kconfig"
-+source "kernel/Kconfig.freezer"
- 
- menu "Processor type and features"
- 
---- a/arch/xtensa/Kconfig
-+++ b/arch/xtensa/Kconfig
-@@ -55,6 +55,7 @@ config HZ
-       default 100
- 
- source "init/Kconfig"
-+source "kernel/Kconfig.freezer"
- 
- menu "Processor type and features"
- 
---- a/include/asm-cris/thread_info.h
-+++ b/include/asm-cris/thread_info.h
-@@ -88,6 +88,7 @@ struct thread_info {
- #define TIF_RESTORE_SIGMASK   9       /* restore signal mask in do_signal() */
- #define TIF_POLLING_NRFLAG    16      /* true if poll_idle() is polling TIF_NEED_RESCHED */
- #define TIF_MEMDIE            17
-+#define TIF_FREEZE            18      /* is freezing for suspend */
- 
- #define _TIF_SYSCALL_TRACE    (1<<TIF_SYSCALL_TRACE)
- #define _TIF_NOTIFY_RESUME    (1<<TIF_NOTIFY_RESUME)
-@@ -95,6 +96,7 @@ struct thread_info {
- #define _TIF_NEED_RESCHED     (1<<TIF_NEED_RESCHED)
- #define _TIF_RESTORE_SIGMASK  (1<<TIF_RESTORE_SIGMASK)
- #define _TIF_POLLING_NRFLAG   (1<<TIF_POLLING_NRFLAG)
-+#define _TIF_FREEZE           (1<<TIF_FREEZE)
- 
- #define _TIF_WORK_MASK                0x0000FFFE      /* work to do on interrupt/exception return */
- #define _TIF_ALLWORK_MASK     0x0000FFFF      /* work to do on any return to u-space */
---- a/include/asm-m68k/thread_info.h
-+++ b/include/asm-m68k/thread_info.h
-@@ -52,5 +52,6 @@ struct thread_info {
- #define TIF_DELAYED_TRACE     14      /* single step a syscall */
- #define TIF_SYSCALL_TRACE     15      /* syscall trace active */
- #define TIF_MEMDIE            16
-+#define TIF_FREEZE            17      /* thread is freezing for suspend */
- 
- #endif        /* _ASM_M68K_THREAD_INFO_H */
---- a/include/asm-parisc/thread_info.h
-+++ b/include/asm-parisc/thread_info.h
-@@ -58,6 +58,7 @@ struct thread_info {
- #define TIF_32BIT               4       /* 32 bit binary */
- #define TIF_MEMDIE            5
- #define TIF_RESTORE_SIGMASK   6       /* restore saved signal mask */
-+#define TIF_FREEZE            7       /* is freezing for suspend */
- 
- #define _TIF_SYSCALL_TRACE    (1 << TIF_SYSCALL_TRACE)
- #define _TIF_SIGPENDING               (1 << TIF_SIGPENDING)
-@@ -65,6 +66,7 @@ struct thread_info {
- #define _TIF_POLLING_NRFLAG   (1 << TIF_POLLING_NRFLAG)
- #define _TIF_32BIT            (1 << TIF_32BIT)
- #define _TIF_RESTORE_SIGMASK  (1 << TIF_RESTORE_SIGMASK)
-+#define _TIF_FREEZE           (1 << TIF_FREEZE)
- 
- #define _TIF_USER_WORK_MASK     (_TIF_SIGPENDING | \
-                                  _TIF_NEED_RESCHED | _TIF_RESTORE_SIGMASK)
---- a/include/asm-um/thread_info.h
-+++ b/include/asm-um/thread_info.h
-@@ -69,6 +69,7 @@ static inline struct thread_info *curren
- #define TIF_MEMDIE            5
- #define TIF_SYSCALL_AUDIT     6
- #define TIF_RESTORE_SIGMASK   7
-+#define TIF_FREEZE            16      /* is freezing for suspend */
- 
- #define _TIF_SYSCALL_TRACE    (1 << TIF_SYSCALL_TRACE)
- #define _TIF_SIGPENDING               (1 << TIF_SIGPENDING)
-@@ -77,5 +78,6 @@ static inline struct thread_info *curren
- #define _TIF_MEMDIE           (1 << TIF_MEMDIE)
- #define _TIF_SYSCALL_AUDIT    (1 << TIF_SYSCALL_AUDIT)
- #define _TIF_RESTORE_SIGMASK  (1 << TIF_RESTORE_SIGMASK)
-+#define _TIF_FREEZE           (1 << TIF_FREEZE)
- 
- #endif
---- a/include/asm-xtensa/thread_info.h
-+++ b/include/asm-xtensa/thread_info.h
-@@ -134,6 +134,7 @@ static inline struct thread_info *curren
- #define TIF_MEMDIE            5
- #define TIF_RESTORE_SIGMASK   6       /* restore signal mask in do_signal() */
- #define TIF_POLLING_NRFLAG    16      /* true if poll_idle() is polling TIF_NEED_RESCHED */
-+#define TIF_FREEZE            17      /* is freezing for suspend */
- 
- #define _TIF_SYSCALL_TRACE    (1<<TIF_SYSCALL_TRACE)
- #define _TIF_SIGPENDING               (1<<TIF_SIGPENDING)
-@@ -142,6 +143,7 @@ static inline struct thread_info *curren
- #define _TIF_IRET             (1<<TIF_IRET)
- #define _TIF_POLLING_NRFLAG   (1<<TIF_POLLING_NRFLAG)
- #define _TIF_RESTORE_SIGMASK  (1<<TIF_RESTORE_SIGMASK)
-+#define _TIF_FREEZE           (1<<TIF_FREEZE)
- 
- #define _TIF_WORK_MASK                0x0000FFFE      /* work to do on interrupt/exception return */
- #define _TIF_ALLWORK_MASK     0x0000FFFF      /* work to do on any return to u-space */
---- a/include/linux/cgroup_subsys.h
-+++ b/include/linux/cgroup_subsys.h
-@@ -48,3 +48,9 @@ SUBSYS(devices)
- #endif
- 
- /* */
-+
-+#ifdef CONFIG_CGROUP_FREEZER
-+SUBSYS(freezer)
-+#endif
-+
-+/* */
---- a/include/linux/freezer.h
-+++ b/include/linux/freezer.h
-@@ -6,7 +6,7 @@
- #include <linux/sched.h>
- #include <linux/wait.h>
- 
--#ifdef CONFIG_PM_SLEEP
-+#ifdef CONFIG_FREEZER
- /*
-  * Check if a process has been frozen
-  */
-@@ -39,29 +39,14 @@ static inline void clear_freeze_flag(str
-       clear_tsk_thread_flag(p, TIF_FREEZE);
- }
- 
--/*
-- * Wake up a frozen process
-- *
-- * task_lock() is taken to prevent the race with refrigerator() which may
-- * occur if the freezing of tasks fails.  Namely, without the lock, if the
-- * freezing of tasks failed, thaw_tasks() might have run before a task in
-- * refrigerator() could call frozen_process(), in which case the task would be
-- * frozen and no one would thaw it.
-- */
--static inline int thaw_process(struct task_struct *p)
--{
--      task_lock(p);
--      if (frozen(p)) {
--              p->flags &= ~PF_FROZEN;
--              task_unlock(p);
--              wake_up_process(p);
--              return 1;
--      }
--      clear_freeze_flag(p);
--      task_unlock(p);
--      return 0;
-+static inline bool should_send_signal(struct task_struct *p)
-+{
-+      return !(p->flags & PF_FREEZER_NOSIG);
- }
- 
-+/* Takes and releases task alloc lock using task_lock() */
-+extern int thaw_process(struct task_struct *p);
-+
- extern void refrigerator(void);
- extern int freeze_processes(void);
- extern void thaw_processes(void);
-@@ -75,6 +60,15 @@ static inline int try_to_freeze(void)
-               return 0;
- }
- 
-+extern bool freeze_task(struct task_struct *p, bool sig_only);
-+extern void cancel_freezing(struct task_struct *p);
-+
-+#ifdef CONFIG_CGROUP_FREEZER
-+extern int cgroup_frozen(struct task_struct *task);
-+#else /* !CONFIG_CGROUP_FREEZER */
-+static inline int cgroup_frozen(struct task_struct *task) { return 0; }
-+#endif /* !CONFIG_CGROUP_FREEZER */
-+
- /*
-  * The PF_FREEZER_SKIP flag should be set by a vfork parent right before it
-  * calls wait_for_completion(&vfork) and reset right after it returns from this
-@@ -166,7 +160,7 @@ static inline void set_freezable_with_si
-       } while (try_to_freeze());                                      \
-       __retval;                                                       \
- })
--#else /* !CONFIG_PM_SLEEP */
-+#else /* !CONFIG_FREEZER */
- static inline int frozen(struct task_struct *p) { return 0; }
- static inline int freezing(struct task_struct *p) { return 0; }
- static inline void set_freeze_flag(struct task_struct *p) {}
-@@ -191,6 +185,6 @@ static inline void set_freezable_with_si
- #define wait_event_freezable_timeout(wq, condition, timeout)          \
-               wait_event_interruptible_timeout(wq, condition, timeout)
- 
--#endif /* !CONFIG_PM_SLEEP */
-+#endif /* !CONFIG_FREEZER */
- 
- #endif        /* FREEZER_H_INCLUDED */
---- a/init/Kconfig
-+++ b/init/Kconfig
-@@ -303,6 +303,13 @@ config CGROUP_NS
-           for instance virtual servers and checkpoint/restart
-           jobs.
- 
-+config CGROUP_FREEZER
-+        bool "control group freezer subsystem"
-+        depends on CGROUPS
-+        help
-+          Provides a way to freeze and unfreeze all tasks in a
-+        cgroup.
-+
- config CGROUP_DEVICE
-       bool "Device controller for cgroups"
-       depends on CGROUPS && EXPERIMENTAL
---- /dev/null
-+++ b/kernel/Kconfig.freezer
-@@ -0,0 +1,2 @@
-+config FREEZER
-+      def_bool PM_SLEEP || CGROUP_FREEZER
---- a/kernel/Makefile
-+++ b/kernel/Makefile
-@@ -22,6 +22,7 @@ CFLAGS_REMOVE_sched_clock.o = -pg
- CFLAGS_REMOVE_sched.o = -pg
- endif
- 
-+obj-$(CONFIG_FREEZER) += freezer.o
- obj-$(CONFIG_PROFILING) += profile.o
- obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
- obj-$(CONFIG_STACKTRACE) += stacktrace.o
-@@ -54,6 +55,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += bac
- obj-$(CONFIG_COMPAT) += compat.o
- obj-$(CONFIG_CGROUPS) += cgroup.o
- obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
-+obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
- obj-$(CONFIG_CPUSETS) += cpuset.o
- obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
- obj-$(CONFIG_UTS_NS) += utsname.o
---- /dev/null
-+++ b/kernel/cgroup_freezer.c
-@@ -0,0 +1,379 @@
-+/*
-+ * cgroup_freezer.c -  control group freezer subsystem
-+ *
-+ * Copyright IBM Corporation, 2007
-+ *
-+ * Author : Cedric Le Goater <clg@fr.ibm.com>
-+ *
-+ * This program is free software; you can redistribute it and/or modify it
-+ * under the terms of version 2.1 of the GNU Lesser General Public License
-+ * as published by the Free Software Foundation.
-+ *
-+ * This program is distributed in the hope that it would be useful, but
-+ * WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-+ */
-+
-+#include <linux/module.h>
-+#include <linux/cgroup.h>
-+#include <linux/fs.h>
-+#include <linux/uaccess.h>
-+#include <linux/freezer.h>
-+#include <linux/seq_file.h>
-+
-+enum freezer_state {
-+      CGROUP_THAWED = 0,
-+      CGROUP_FREEZING,
-+      CGROUP_FROZEN,
-+};
-+
-+struct freezer {
-+      struct cgroup_subsys_state css;
-+      enum freezer_state state;
-+      spinlock_t lock; /* protects _writes_ to state */
-+};
-+
-+static inline struct freezer *cgroup_freezer(
-+              struct cgroup *cgroup)
-+{
-+      return container_of(
-+              cgroup_subsys_state(cgroup, freezer_subsys_id),
-+              struct freezer, css);
-+}
-+
-+static inline struct freezer *task_freezer(struct task_struct *task)
-+{
-+      return container_of(task_subsys_state(task, freezer_subsys_id),
-+                          struct freezer, css);
-+}
-+
-+int cgroup_frozen(struct task_struct *task)
-+{
-+      struct freezer *freezer;
-+      enum freezer_state state;
-+
-+      task_lock(task);
-+      freezer = task_freezer(task);
-+      state = freezer->state;
-+      task_unlock(task);
-+
-+      return state == CGROUP_FROZEN;
-+}
-+
-+/*
-+ * cgroups_write_string() limits the size of freezer state strings to
-+ * CGROUP_LOCAL_BUFFER_SIZE
-+ */
-+static const char *freezer_state_strs[] = {
-+      "THAWED",
-+      "FREEZING",
-+      "FROZEN",
-+};
-+
-+/*
-+ * State diagram
-+ * Transitions are caused by userspace writes to the freezer.state file.
-+ * The values in parentheses are state labels. The rest are edge labels.
-+ *
-+ * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
-+ *    ^ ^                    |                     |
-+ *    | \_______THAWED_______/                     |
-+ *    \__________________________THAWED____________/
-+ */
-+
-+struct cgroup_subsys freezer_subsys;
-+
-+/* Locks taken and their ordering
-+ * ------------------------------
-+ * css_set_lock
-+ * cgroup_mutex (AKA cgroup_lock)
-+ * task->alloc_lock (AKA task_lock)
-+ * freezer->lock
-+ * task->sighand->siglock
-+ *
-+ * cgroup code forces css_set_lock to be taken before task->alloc_lock
-+ *
-+ * freezer_create(), freezer_destroy():
-+ * cgroup_mutex [ by cgroup core ]
-+ *
-+ * can_attach():
-+ * cgroup_mutex
-+ *
-+ * cgroup_frozen():
-+ * task->alloc_lock (to get task's cgroup)
-+ *
-+ * freezer_fork() (preserving fork() performance means can't take cgroup_mutex;
-+ * no task_lock() either, since the new task isn't on the tasklist yet):
-+ * freezer->lock
-+ *  sighand->siglock (if the cgroup is freezing)
-+ *
-+ * freezer_read():
-+ * cgroup_mutex
-+ *  freezer->lock
-+ *   read_lock css_set_lock (cgroup iterator start)
-+ *
-+ * freezer_write() (freeze):
-+ * cgroup_mutex
-+ *  freezer->lock
-+ *   read_lock css_set_lock (cgroup iterator start)
-+ *    sighand->siglock
-+ *
-+ * freezer_write() (unfreeze):
-+ * cgroup_mutex
-+ *  freezer->lock
-+ *   read_lock css_set_lock (cgroup iterator start)
-+ *    task->alloc_lock (to prevent races with freeze_task())
-+ *     sighand->siglock
-+ */
-+static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
-+                                                struct cgroup *cgroup)
-+{
-+      struct freezer *freezer;
-+
-+      freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL);
-+      if (!freezer)
-+              return ERR_PTR(-ENOMEM);
-+
-+      spin_lock_init(&freezer->lock);
-+      freezer->state = CGROUP_THAWED;
-+      return &freezer->css;
-+}
-+
-+static void freezer_destroy(struct cgroup_subsys *ss,
-+                          struct cgroup *cgroup)
-+{
-+      kfree(cgroup_freezer(cgroup));
-+}
-+
-+/* Task is frozen or will freeze immediately when next it gets woken */
-+static bool is_task_frozen_enough(struct task_struct *task)
-+{
-+      return frozen(task) ||
-+              (task_is_stopped_or_traced(task) && freezing(task));
-+}
-+
-+/*
-+ * The call to cgroup_lock() in the freezer.state write method prevents
-+ * a write to that file racing against an attach, and hence the
-+ * can_attach() result will remain valid until the attach completes.
-+ */
-+static int freezer_can_attach(struct cgroup_subsys *ss,
-+                            struct cgroup *new_cgroup,
-+                            struct task_struct *task)
-+{
-+      struct freezer *freezer;
-+
-+      /*
-+       * Anything frozen can't move or be moved to/from.
-+       *
-+       * orig_freezer->state == FROZEN means that @task has been
-+       * frozen, so it's sufficient to check the latter condition.
-+       */
-+
-+      if (is_task_frozen_enough(task))
-+              return -EBUSY;
-+
-+      freezer = cgroup_freezer(new_cgroup);
-+      if (freezer->state == CGROUP_FROZEN)
-+              return -EBUSY;
-+
-+      return 0;
-+}
-+
-+static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
-+{
-+      struct freezer *freezer;
-+
-+      /*
-+       * No lock is needed, since the task isn't on tasklist yet,
-+       * so it can't be moved to another cgroup, which means the
-+       * freezer won't be removed and will be valid during this
-+       * function call.
-+       */
-+      freezer = task_freezer(task);
-+
-+      /*
-+       * The root cgroup is non-freezable, so we can skip the
-+       * following check.
-+       */
-+      if (!freezer->css.cgroup->parent)
-+              return;
-+
-+      spin_lock_irq(&freezer->lock);
-+      BUG_ON(freezer->state == CGROUP_FROZEN);
-+
-+      /* Locking avoids race with FREEZING -> THAWED transitions. */
-+      if (freezer->state == CGROUP_FREEZING)
-+              freeze_task(task, true);
-+      spin_unlock_irq(&freezer->lock);
-+}
-+
-+/*
-+ * caller must hold freezer->lock
-+ */
-+static void update_freezer_state(struct cgroup *cgroup,
-+                               struct freezer *freezer)
-+{
-+      struct cgroup_iter it;
-+      struct task_struct *task;
-+      unsigned int nfrozen = 0, ntotal = 0;
-+
-+      cgroup_iter_start(cgroup, &it);
-+      while ((task = cgroup_iter_next(cgroup, &it))) {
-+              ntotal++;
-+              if (is_task_frozen_enough(task))
-+                      nfrozen++;
-+      }
-+
-+      /*
-+       * Transition to FROZEN when no new tasks can be added ensures
-+       * that we never exist in the FROZEN state while there are unfrozen
-+       * tasks.
-+       */
-+      if (nfrozen == ntotal)
-+              freezer->state = CGROUP_FROZEN;
-+      else if (nfrozen > 0)
-+              freezer->state = CGROUP_FREEZING;
-+      else
-+              freezer->state = CGROUP_THAWED;
-+      cgroup_iter_end(cgroup, &it);
-+}
-+
-+static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
-+                      struct seq_file *m)
-+{
-+      struct freezer *freezer;
-+      enum freezer_state state;
-+
-+      if (!cgroup_lock_live_group(cgroup))
-+              return -ENODEV;
-+
-+      freezer = cgroup_freezer(cgroup);
-+      spin_lock_irq(&freezer->lock);
-+      state = freezer->state;
-+      if (state == CGROUP_FREEZING) {
-+              /* We change from FREEZING to FROZEN lazily if the cgroup was
-+               * only partially frozen when we exited write. */
-+              update_freezer_state(cgroup, freezer);
-+              state = freezer->state;
-+      }
-+      spin_unlock_irq(&freezer->lock);
-+      cgroup_unlock();
-+
-+      seq_puts(m, freezer_state_strs[state]);
-+      seq_putc(m, '\n');
-+      return 0;
-+}
-+
-+static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
-+{
-+      struct cgroup_iter it;
-+      struct task_struct *task;
-+      unsigned int num_cant_freeze_now = 0;
-+
-+      freezer->state = CGROUP_FREEZING;
-+      cgroup_iter_start(cgroup, &it);
-+      while ((task = cgroup_iter_next(cgroup, &it))) {
-+              if (!freeze_task(task, true))
-+                      continue;
-+              if (is_task_frozen_enough(task))
-+                      continue;
-+              if (!freezing(task) && !freezer_should_skip(task))
-+                      num_cant_freeze_now++;
-+      }
-+      cgroup_iter_end(cgroup, &it);
-+
-+      return num_cant_freeze_now ? -EBUSY : 0;
-+}
-+
-+static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
-+{
-+      struct cgroup_iter it;
-+      struct task_struct *task;
-+
-+      cgroup_iter_start(cgroup, &it);
-+      while ((task = cgroup_iter_next(cgroup, &it))) {
-+              thaw_process(task);
-+      }
-+      cgroup_iter_end(cgroup, &it);
-+
-+      freezer->state = CGROUP_THAWED;
-+}
-+
-+static int freezer_change_state(struct cgroup *cgroup,
-+                              enum freezer_state goal_state)
-+{
-+      struct freezer *freezer;
-+      int retval = 0;
-+
-+      freezer = cgroup_freezer(cgroup);
-+
-+      spin_lock_irq(&freezer->lock);
-+
-+      update_freezer_state(cgroup, freezer);
-+      if (goal_state == freezer->state)
-+              goto out;
-+
-+      switch (goal_state) {
-+      case CGROUP_THAWED:
-+              unfreeze_cgroup(cgroup, freezer);
-+              break;
-+      case CGROUP_FROZEN:
-+              retval = try_to_freeze_cgroup(cgroup, freezer);
-+              break;
-+      default:
-+              BUG();
-+      }
-+out:
-+      spin_unlock_irq(&freezer->lock);
-+
-+      return retval;
-+}
-+
-+static int freezer_write(struct cgroup *cgroup,
-+                       struct cftype *cft,
-+                       const char *buffer)
-+{
-+      int retval;
-+      enum freezer_state goal_state;
-+
-+      if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0)
-+              goal_state = CGROUP_THAWED;
-+      else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
-+              goal_state = CGROUP_FROZEN;
-+      else
-+              return -EINVAL;
-+
-+      if (!cgroup_lock_live_group(cgroup))
-+              return -ENODEV;
-+      retval = freezer_change_state(cgroup, goal_state);
-+      cgroup_unlock();
-+      return retval;
-+}
-+
-+static struct cftype files[] = {
-+      {
-+              .name = "state",
-+              .read_seq_string = freezer_read,
-+              .write_string = freezer_write,
-+      },
-+};
-+
-+static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
-+{
-+      if (!cgroup->parent)
-+              return 0;
-+      return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
-+}
-+
-+struct cgroup_subsys freezer_subsys = {
-+      .name           = "freezer",
-+      .create         = freezer_create,
-+      .destroy        = freezer_destroy,
-+      .populate       = freezer_populate,
-+      .subsys_id      = freezer_subsys_id,
-+      .can_attach     = freezer_can_attach,
-+      .attach         = NULL,
-+      .fork           = freezer_fork,
-+      .exit           = NULL,
-+};
---- /dev/null
-+++ b/kernel/freezer.c
-@@ -0,0 +1,154 @@
-+/*
-+ * kernel/freezer.c - Function to freeze a process
-+ *
-+ * Originally from kernel/power/process.c
-+ */
-+
-+#include <linux/interrupt.h>
-+#include <linux/suspend.h>
-+#include <linux/module.h>
-+#include <linux/syscalls.h>
-+#include <linux/freezer.h>
-+
-+/*
-+ * freezing is complete, mark current process as frozen
-+ */
-+static inline void frozen_process(void)
-+{
-+      if (!unlikely(current->flags & PF_NOFREEZE)) {
-+              current->flags |= PF_FROZEN;
-+              wmb();
-+      }
-+      clear_freeze_flag(current);
-+}
-+
-+/* Refrigerator is place where frozen processes are stored :-). */
-+void refrigerator(void)
-+{
-+      /* Hmm, should we be allowed to suspend when there are realtime
-+         processes around? */
-+      long save;
-+
-+      task_lock(current);
-+      if (freezing(current)) {
-+              frozen_process();
-+              task_unlock(current);
-+      } else {
-+              task_unlock(current);
-+              return;
-+      }
-+      save = current->state;
-+      pr_debug("%s entered refrigerator\n", current->comm);
-+
-+      spin_lock_irq(&current->sighand->siglock);
-+      recalc_sigpending(); /* We sent fake signal, clean it up */
-+      spin_unlock_irq(&current->sighand->siglock);
-+
-+      for (;;) {
-+              set_current_state(TASK_UNINTERRUPTIBLE);
-+              if (!frozen(current))
-+                      break;
-+              schedule();
-+      }
-+      pr_debug("%s left refrigerator\n", current->comm);
-+      __set_current_state(save);
-+}
-+EXPORT_SYMBOL(refrigerator);
-+
-+static void fake_signal_wake_up(struct task_struct *p)
-+{
-+      unsigned long flags;
-+
-+      spin_lock_irqsave(&p->sighand->siglock, flags);
-+      signal_wake_up(p, 0);
-+      spin_unlock_irqrestore(&p->sighand->siglock, flags);
-+}
-+
-+/**
-+ *    freeze_task - send a freeze request to given task
-+ *    @p: task to send the request to
-+ *    @sig_only: if set, the request will only be sent if the task has the
-+ *            PF_FREEZER_NOSIG flag unset
-+ *    Return value: 'false', if @sig_only is set and the task has
-+ *            PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
-+ *
-+ *    The freeze request is sent by setting the task's TIF_FREEZE flag and
-+ *    either sending a fake signal to it or waking it up, depending on whether
-+ *    or not it has PF_FREEZER_NOSIG set.  If @sig_only is set and the task
-+ *    has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
-+ *    TIF_FREEZE flag will not be set.
-+ */
-+bool freeze_task(struct task_struct *p, bool sig_only)
-+{
-+      /*
-+       * We first check if the task is freezing and next if it has already
-+       * been frozen to avoid the race with frozen_process() which first marks
-+       * the task as frozen and next clears its TIF_FREEZE.
-+       */
-+      if (!freezing(p)) {
-+              rmb();
-+              if (frozen(p))
-+                      return false;
-+
-+              if (!sig_only || should_send_signal(p))
-+                      set_freeze_flag(p);
-+              else
-+                      return false;
-+      }
-+
-+      if (should_send_signal(p)) {
-+              if (!signal_pending(p))
-+                      fake_signal_wake_up(p);
-+      } else if (sig_only) {
-+              return false;
-+      } else {
-+              wake_up_state(p, TASK_INTERRUPTIBLE);
-+      }
-+
-+      return true;
-+}
-+
-+void cancel_freezing(struct task_struct *p)
-+{
-+      unsigned long flags;
-+
-+      if (freezing(p)) {
-+              pr_debug("  clean up: %s\n", p->comm);
-+              clear_freeze_flag(p);
-+              spin_lock_irqsave(&p->sighand->siglock, flags);
-+              recalc_sigpending_and_wake(p);
-+              spin_unlock_irqrestore(&p->sighand->siglock, flags);
-+      }
-+}
-+
-+static int __thaw_process(struct task_struct *p)
-+{
-+      if (frozen(p)) {
-+              p->flags &= ~PF_FROZEN;
-+              return 1;
-+      }
-+      clear_freeze_flag(p);
-+      return 0;
-+}
-+
-+/*
-+ * Wake up a frozen process
-+ *
-+ * task_lock() is needed to prevent the race with refrigerator() which may
-+ * occur if the freezing of tasks fails.  Namely, without the lock, if the
-+ * freezing of tasks failed, thaw_tasks() might have run before a task in
-+ * refrigerator() could call frozen_process(), in which case the task would be
-+ * frozen and no one would thaw it.
-+ */
-+int thaw_process(struct task_struct *p)
-+{
-+      task_lock(p);
-+      if (__thaw_process(p) == 1) {
-+              task_unlock(p);
-+              wake_up_process(p);
-+              return 1;
-+      }
-+      task_unlock(p);
-+      return 0;
-+}
-+EXPORT_SYMBOL(thaw_process);
---- a/kernel/power/process.c
-+++ b/kernel/power/process.c
-@@ -28,121 +28,6 @@ static inline int freezeable(struct task
-       return 1;
- }
- 
--/*
-- * freezing is complete, mark current process as frozen
-- */
--static inline void frozen_process(void)
--{
--      if (!unlikely(current->flags & PF_NOFREEZE)) {
--              current->flags |= PF_FROZEN;
--              wmb();
--      }
--      clear_freeze_flag(current);
--}
--
--/* Refrigerator is place where frozen processes are stored :-). */
--void refrigerator(void)
--{
--      /* Hmm, should we be allowed to suspend when there are realtime
--         processes around? */
--      long save;
--
--      task_lock(current);
--      if (freezing(current)) {
--              frozen_process();
--              task_unlock(current);
--      } else {
--              task_unlock(current);
--              return;
--      }
--      save = current->state;
--      pr_debug("%s entered refrigerator\n", current->comm);
--
--      spin_lock_irq(&current->sighand->siglock);
--      recalc_sigpending(); /* We sent fake signal, clean it up */
--      spin_unlock_irq(&current->sighand->siglock);
--
--      for (;;) {
--              set_current_state(TASK_UNINTERRUPTIBLE);
--              if (!frozen(current))
--                      break;
--              schedule();
--      }
--      pr_debug("%s left refrigerator\n", current->comm);
--      __set_current_state(save);
--}
--
--static void fake_signal_wake_up(struct task_struct *p)
--{
--      unsigned long flags;
--
--      spin_lock_irqsave(&p->sighand->siglock, flags);
--      signal_wake_up(p, 0);
--      spin_unlock_irqrestore(&p->sighand->siglock, flags);
--}
--
--static inline bool should_send_signal(struct task_struct *p)
--{
--      return !(p->flags & PF_FREEZER_NOSIG);
--}
--
--/**
-- *    freeze_task - send a freeze request to given task
-- *    @p: task to send the request to
-- *    @sig_only: if set, the request will only be sent if the task has the
-- *            PF_FREEZER_NOSIG flag unset
-- *    Return value: 'false', if @sig_only is set and the task has
-- *            PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
-- *
-- *    The freeze request is sent by setting the tasks's TIF_FREEZE flag and
-- *    either sending a fake signal to it or waking it up, depending on whether
-- *    or not it has PF_FREEZER_NOSIG set.  If @sig_only is set and the task
-- *    has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
-- *    TIF_FREEZE flag will not be set.
-- */
--static bool freeze_task(struct task_struct *p, bool sig_only)
--{
--      /*
--       * We first check if the task is freezing and next if it has already
--       * been frozen to avoid the race with frozen_process() which first marks
--       * the task as frozen and next clears its TIF_FREEZE.
--       */
--      if (!freezing(p)) {
--              rmb();
--              if (frozen(p))
--                      return false;
--
--              if (!sig_only || should_send_signal(p))
--                      set_freeze_flag(p);
--              else
--                      return false;
--      }
--
--      if (should_send_signal(p)) {
--              if (!signal_pending(p))
--                      fake_signal_wake_up(p);
--      } else if (sig_only) {
--              return false;
--      } else {
--              wake_up_state(p, TASK_INTERRUPTIBLE);
--      }
--
--      return true;
--}
--
--static void cancel_freezing(struct task_struct *p)
--{
--      unsigned long flags;
--
--      if (freezing(p)) {
--              pr_debug("  clean up: %s\n", p->comm);
--              clear_freeze_flag(p);
--              spin_lock_irqsave(&p->sighand->siglock, flags);
--              recalc_sigpending_and_wake(p);
--              spin_unlock_irqrestore(&p->sighand->siglock, flags);
--      }
--}
--
- static int try_to_freeze_tasks(bool sig_only)
- {
-       struct task_struct *g, *p;
-@@ -250,6 +135,9 @@ static void thaw_tasks(bool nosig_only)
-               if (nosig_only && should_send_signal(p))
-                       continue;
- 
-+              if (cgroup_frozen(p))
-+                      continue;
-+
-               thaw_process(p);
-       } while_each_thread(g, p);
-       read_unlock(&tasklist_lock);
-@@ -264,4 +152,3 @@ void thaw_processes(void)
-       printk("done.\n");
- }
- 
--EXPORT_SYMBOL(refrigerator);