+++ /dev/null
-From: Serge E. Hallyn <serue@us.ibm.com>
-Subject: cgroup freezer
-References: bnc#417294, fate#304191, fate#201036
-Patch-upstream: yes
-Git: 68d1a06b440a5df55fb253e1d1113d2e4a7209fc Mon Sep 17 00:00:00 2001
-
-Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
-Acked-by: Nick Piggin <npiggin@suse.de>
----
- Documentation/cgroups.txt | 548 ----------------------------
- Documentation/cgroups/cgroups.txt | 548 ++++++++++++++++++++++++++++
- Documentation/cgroups/freezer-subsystem.txt | 102 +++++
- Documentation/cpusets.txt | 2
- arch/alpha/Kconfig | 1
- arch/alpha/include/asm/thread_info.h | 2
- arch/arm/Kconfig | 2
- arch/avr32/Kconfig | 2
- arch/avr32/include/asm/thread_info.h | 1
- arch/blackfin/Kconfig | 3
- arch/cris/Kconfig | 2
- arch/frv/Kconfig | 2
- arch/h8300/Kconfig | 2
- arch/h8300/include/asm/thread_info.h | 2
- arch/ia64/Kconfig | 2
- arch/m32r/Kconfig | 2
- arch/m68k/Kconfig | 2
- arch/m68knommu/Kconfig | 2
- arch/m68knommu/include/asm/thread_info.h | 2
- arch/mips/Kconfig | 2
- arch/mn10300/Kconfig | 2
- arch/parisc/Kconfig | 2
- arch/powerpc/Kconfig | 2
- arch/s390/Kconfig | 2
- arch/s390/include/asm/thread_info.h | 2
- arch/sh/Kconfig | 2
- arch/sparc/Kconfig | 2
- arch/sparc/include/asm/thread_info_32.h | 2
- arch/sparc64/Kconfig | 1
- arch/um/Kconfig | 2
- arch/x86/Kconfig | 1
- arch/xtensa/Kconfig | 1
- include/asm-cris/thread_info.h | 2
- include/asm-m68k/thread_info.h | 1
- include/asm-parisc/thread_info.h | 2
- include/asm-um/thread_info.h | 2
- include/asm-xtensa/thread_info.h | 2
- include/linux/cgroup_subsys.h | 6
- include/linux/freezer.h | 42 --
- init/Kconfig | 7
- kernel/Kconfig.freezer | 2
- kernel/Makefile | 2
- kernel/cgroup_freezer.c | 379 +++++++++++++++++++
- kernel/freezer.c | 154 +++++++
- kernel/power/process.c | 119 ------
- 45 files changed, 1283 insertions(+), 689 deletions(-)
- create mode 100644 include/linux/cgroup_freezer.h
- create mode 100644 kernel/cgroup_freezer.c
- create mode 100644 kernel/freezer.c
-
---- a/Documentation/cgroups.txt
-+++ /dev/null
-@@ -1,548 +0,0 @@
-- CGROUPS
-- -------
--
--Written by Paul Menage <menage@google.com> based on Documentation/cpusets.txt
--
--Original copyright statements from cpusets.txt:
--Portions Copyright (C) 2004 BULL SA.
--Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
--Modified by Paul Jackson <pj@sgi.com>
--Modified by Christoph Lameter <clameter@sgi.com>
--
--CONTENTS:
--=========
--
--1. Control Groups
-- 1.1 What are cgroups ?
-- 1.2 Why are cgroups needed ?
-- 1.3 How are cgroups implemented ?
-- 1.4 What does notify_on_release do ?
-- 1.5 How do I use cgroups ?
--2. Usage Examples and Syntax
-- 2.1 Basic Usage
-- 2.2 Attaching processes
--3. Kernel API
-- 3.1 Overview
-- 3.2 Synchronization
-- 3.3 Subsystem API
--4. Questions
--
--1. Control Groups
--=================
--
--1.1 What are cgroups ?
------------------------
--
--Control Groups provide a mechanism for aggregating/partitioning sets of
--tasks, and all their future children, into hierarchical groups with
--specialized behaviour.
--
--Definitions:
--
--A *cgroup* associates a set of tasks with a set of parameters for one
--or more subsystems.
--
--A *subsystem* is a module that makes use of the task grouping
--facilities provided by cgroups to treat groups of tasks in
--particular ways. A subsystem is typically a "resource controller" that
--schedules a resource or applies per-cgroup limits, but it may be
--anything that wants to act on a group of processes, e.g. a
--virtualization subsystem.
--
--A *hierarchy* is a set of cgroups arranged in a tree, such that
--every task in the system is in exactly one of the cgroups in the
--hierarchy, and a set of subsystems; each subsystem has system-specific
--state attached to each cgroup in the hierarchy. Each hierarchy has
--an instance of the cgroup virtual filesystem associated with it.
--
--At any one time there may be multiple active hierachies of task
--cgroups. Each hierarchy is a partition of all tasks in the system.
--
--User level code may create and destroy cgroups by name in an
--instance of the cgroup virtual file system, specify and query to
--which cgroup a task is assigned, and list the task pids assigned to
--a cgroup. Those creations and assignments only affect the hierarchy
--associated with that instance of the cgroup file system.
--
--On their own, the only use for cgroups is for simple job
--tracking. The intention is that other subsystems hook into the generic
--cgroup support to provide new attributes for cgroups, such as
--accounting/limiting the resources which processes in a cgroup can
--access. For example, cpusets (see Documentation/cpusets.txt) allows
--you to associate a set of CPUs and a set of memory nodes with the
--tasks in each cgroup.
--
--1.2 Why are cgroups needed ?
------------------------------
--
--There are multiple efforts to provide process aggregations in the
--Linux kernel, mainly for resource tracking purposes. Such efforts
--include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
--namespaces. These all require the basic notion of a
--grouping/partitioning of processes, with newly forked processes ending
--in the same group (cgroup) as their parent process.
--
--The kernel cgroup patch provides the minimum essential kernel
--mechanisms required to efficiently implement such groups. It has
--minimal impact on the system fast paths, and provides hooks for
--specific subsystems such as cpusets to provide additional behaviour as
--desired.
--
--Multiple hierarchy support is provided to allow for situations where
--the division of tasks into cgroups is distinctly different for
--different subsystems - having parallel hierarchies allows each
--hierarchy to be a natural division of tasks, without having to handle
--complex combinations of tasks that would be present if several
--unrelated subsystems needed to be forced into the same tree of
--cgroups.
--
--At one extreme, each resource controller or subsystem could be in a
--separate hierarchy; at the other extreme, all subsystems
--would be attached to the same hierarchy.
--
--As an example of a scenario (originally proposed by vatsa@in.ibm.com)
--that can benefit from multiple hierarchies, consider a large
--university server with various users - students, professors, system
--tasks etc. The resource planning for this server could be along the
--following lines:
--
-- CPU : Top cpuset
-- / \
-- CPUSet1 CPUSet2
-- | |
-- (Profs) (Students)
--
-- In addition (system tasks) are attached to topcpuset (so
-- that they can run anywhere) with a limit of 20%
--
-- Memory : Professors (50%), students (30%), system (20%)
--
-- Disk : Prof (50%), students (30%), system (20%)
--
-- Network : WWW browsing (20%), Network File System (60%), others (20%)
-- / \
-- Prof (15%) students (5%)
--
--Browsers like firefox/lynx go into the WWW network class, while (k)nfsd go
--into NFS network class.
--
--At the same time firefox/lynx will share an appropriate CPU/Memory class
--depending on who launched it (prof/student).
--
--With the ability to classify tasks differently for different resources
--(by putting those resource subsystems in different hierarchies) then
--the admin can easily set up a script which receives exec notifications
--and depending on who is launching the browser he can
--
-- # echo browser_pid > /mnt/<restype>/<userclass>/tasks
--
--With only a single hierarchy, he now would potentially have to create
--a separate cgroup for every browser launched and associate it with
--approp network and other resource class. This may lead to
--proliferation of such cgroups.
--
--Also lets say that the administrator would like to give enhanced network
--access temporarily to a student's browser (since it is night and the user
--wants to do online gaming :)) OR give one of the students simulation
--apps enhanced CPU power,
--
--With ability to write pids directly to resource classes, it's just a
--matter of :
--
-- # echo pid > /mnt/network/<new_class>/tasks
-- (after some time)
-- # echo pid > /mnt/network/<orig_class>/tasks
--
--Without this ability, he would have to split the cgroup into
--multiple separate ones and then associate the new cgroups with the
--new resource classes.
--
--
--
--1.3 How are cgroups implemented ?
-----------------------------------
--
--Control Groups extends the kernel as follows:
--
-- - Each task in the system has a reference-counted pointer to a
-- css_set.
--
-- - A css_set contains a set of reference-counted pointers to
-- cgroup_subsys_state objects, one for each cgroup subsystem
-- registered in the system. There is no direct link from a task to
-- the cgroup of which it's a member in each hierarchy, but this
-- can be determined by following pointers through the
-- cgroup_subsys_state objects. This is because accessing the
-- subsystem state is something that's expected to happen frequently
-- and in performance-critical code, whereas operations that require a
-- task's actual cgroup assignments (in particular, moving between
-- cgroups) are less common. A linked list runs through the cg_list
-- field of each task_struct using the css_set, anchored at
-- css_set->tasks.
--
-- - A cgroup hierarchy filesystem can be mounted for browsing and
-- manipulation from user space.
--
-- - You can list all the tasks (by pid) attached to any cgroup.
--
--The implementation of cgroups requires a few, simple hooks
--into the rest of the kernel, none in performance critical paths:
--
-- - in init/main.c, to initialize the root cgroups and initial
-- css_set at system boot.
--
-- - in fork and exit, to attach and detach a task from its css_set.
--
--In addition a new file system, of type "cgroup" may be mounted, to
--enable browsing and modifying the cgroups presently known to the
--kernel. When mounting a cgroup hierarchy, you may specify a
--comma-separated list of subsystems to mount as the filesystem mount
--options. By default, mounting the cgroup filesystem attempts to
--mount a hierarchy containing all registered subsystems.
--
--If an active hierarchy with exactly the same set of subsystems already
--exists, it will be reused for the new mount. If no existing hierarchy
--matches, and any of the requested subsystems are in use in an existing
--hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
--is activated, associated with the requested subsystems.
--
--It's not currently possible to bind a new subsystem to an active
--cgroup hierarchy, or to unbind a subsystem from an active cgroup
--hierarchy. This may be possible in future, but is fraught with nasty
--error-recovery issues.
--
--When a cgroup filesystem is unmounted, if there are any
--child cgroups created below the top-level cgroup, that hierarchy
--will remain active even though unmounted; if there are no
--child cgroups then the hierarchy will be deactivated.
--
--No new system calls are added for cgroups - all support for
--querying and modifying cgroups is via this cgroup file system.
--
--Each task under /proc has an added file named 'cgroup' displaying,
--for each active hierarchy, the subsystem names and the cgroup name
--as the path relative to the root of the cgroup file system.
--
--Each cgroup is represented by a directory in the cgroup file system
--containing the following files describing that cgroup:
--
-- - tasks: list of tasks (by pid) attached to that cgroup
-- - releasable flag: cgroup currently removeable?
-- - notify_on_release flag: run the release agent on exit?
-- - release_agent: the path to use for release notifications (this file
-- exists in the top cgroup only)
--
--Other subsystems such as cpusets may add additional files in each
--cgroup dir.
--
--New cgroups are created using the mkdir system call or shell
--command. The properties of a cgroup, such as its flags, are
--modified by writing to the appropriate file in that cgroups
--directory, as listed above.
--
--The named hierarchical structure of nested cgroups allows partitioning
--a large system into nested, dynamically changeable, "soft-partitions".
--
--The attachment of each task, automatically inherited at fork by any
--children of that task, to a cgroup allows organizing the work load
--on a system into related sets of tasks. A task may be re-attached to
--any other cgroup, if allowed by the permissions on the necessary
--cgroup file system directories.
--
--When a task is moved from one cgroup to another, it gets a new
--css_set pointer - if there's an already existing css_set with the
--desired collection of cgroups then that group is reused, else a new
--css_set is allocated. Note that the current implementation uses a
--linear search to locate an appropriate existing css_set, so isn't
--very efficient. A future version will use a hash table for better
--performance.
--
--To allow access from a cgroup to the css_sets (and hence tasks)
--that comprise it, a set of cg_cgroup_link objects form a lattice;
--each cg_cgroup_link is linked into a list of cg_cgroup_links for
--a single cgroup on its cgrp_link_list field, and a list of
--cg_cgroup_links for a single css_set on its cg_link_list.
--
--Thus the set of tasks in a cgroup can be listed by iterating over
--each css_set that references the cgroup, and sub-iterating over
--each css_set's task set.
--
--The use of a Linux virtual file system (vfs) to represent the
--cgroup hierarchy provides for a familiar permission and name space
--for cgroups, with a minimum of additional kernel code.
--
--1.4 What does notify_on_release do ?
--------------------------------------
--
--If the notify_on_release flag is enabled (1) in a cgroup, then
--whenever the last task in the cgroup leaves (exits or attaches to
--some other cgroup) and the last child cgroup of that cgroup
--is removed, then the kernel runs the command specified by the contents
--of the "release_agent" file in that hierarchy's root directory,
--supplying the pathname (relative to the mount point of the cgroup
--file system) of the abandoned cgroup. This enables automatic
--removal of abandoned cgroups. The default value of
--notify_on_release in the root cgroup at system boot is disabled
--(0). The default value of other cgroups at creation is the current
--value of their parents notify_on_release setting. The default value of
--a cgroup hierarchy's release_agent path is empty.
--
--1.5 How do I use cgroups ?
----------------------------
--
--To start a new job that is to be contained within a cgroup, using
--the "cpuset" cgroup subsystem, the steps are something like:
--
-- 1) mkdir /dev/cgroup
-- 2) mount -t cgroup -ocpuset cpuset /dev/cgroup
-- 3) Create the new cgroup by doing mkdir's and write's (or echo's) in
-- the /dev/cgroup virtual file system.
-- 4) Start a task that will be the "founding father" of the new job.
-- 5) Attach that task to the new cgroup by writing its pid to the
-- /dev/cgroup tasks file for that cgroup.
-- 6) fork, exec or clone the job tasks from this founding father task.
--
--For example, the following sequence of commands will setup a cgroup
--named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
--and then start a subshell 'sh' in that cgroup:
--
-- mount -t cgroup cpuset -ocpuset /dev/cgroup
-- cd /dev/cgroup
-- mkdir Charlie
-- cd Charlie
-- /bin/echo 2-3 > cpuset.cpus
-- /bin/echo 1 > cpuset.mems
-- /bin/echo $$ > tasks
-- sh
-- # The subshell 'sh' is now running in cgroup Charlie
-- # The next line should display '/Charlie'
-- cat /proc/self/cgroup
--
--2. Usage Examples and Syntax
--============================
--
--2.1 Basic Usage
-----------------
--
--Creating, modifying, using the cgroups can be done through the cgroup
--virtual filesystem.
--
--To mount a cgroup hierarchy will all available subsystems, type:
--# mount -t cgroup xxx /dev/cgroup
--
--The "xxx" is not interpreted by the cgroup code, but will appear in
--/proc/mounts so may be any useful identifying string that you like.
--
--To mount a cgroup hierarchy with just the cpuset and numtasks
--subsystems, type:
--# mount -t cgroup -o cpuset,numtasks hier1 /dev/cgroup
--
--To change the set of subsystems bound to a mounted hierarchy, just
--remount with different options:
--
--# mount -o remount,cpuset,ns /dev/cgroup
--
--Note that changing the set of subsystems is currently only supported
--when the hierarchy consists of a single (root) cgroup. Supporting
--the ability to arbitrarily bind/unbind subsystems from an existing
--cgroup hierarchy is intended to be implemented in the future.
--
--Then under /dev/cgroup you can find a tree that corresponds to the
--tree of the cgroups in the system. For instance, /dev/cgroup
--is the cgroup that holds the whole system.
--
--If you want to create a new cgroup under /dev/cgroup:
--# cd /dev/cgroup
--# mkdir my_cgroup
--
--Now you want to do something with this cgroup.
--# cd my_cgroup
--
--In this directory you can find several files:
--# ls
--notify_on_release releasable tasks
--(plus whatever files added by the attached subsystems)
--
--Now attach your shell to this cgroup:
--# /bin/echo $$ > tasks
--
--You can also create cgroups inside your cgroup by using mkdir in this
--directory.
--# mkdir my_sub_cs
--
--To remove a cgroup, just use rmdir:
--# rmdir my_sub_cs
--
--This will fail if the cgroup is in use (has cgroups inside, or
--has processes attached, or is held alive by other subsystem-specific
--reference).
--
--2.2 Attaching processes
-------------------------
--
--# /bin/echo PID > tasks
--
--Note that it is PID, not PIDs. You can only attach ONE task at a time.
--If you have several tasks to attach, you have to do it one after another:
--
--# /bin/echo PID1 > tasks
--# /bin/echo PID2 > tasks
-- ...
--# /bin/echo PIDn > tasks
--
--You can attach the current shell task by echoing 0:
--
--# echo 0 > tasks
--
--3. Kernel API
--=============
--
--3.1 Overview
--------------
--
--Each kernel subsystem that wants to hook into the generic cgroup
--system needs to create a cgroup_subsys object. This contains
--various methods, which are callbacks from the cgroup system, along
--with a subsystem id which will be assigned by the cgroup system.
--
--Other fields in the cgroup_subsys object include:
--
--- subsys_id: a unique array index for the subsystem, indicating which
-- entry in cgroup->subsys[] this subsystem should be managing.
--
--- name: should be initialized to a unique subsystem name. Should be
-- no longer than MAX_CGROUP_TYPE_NAMELEN.
--
--- early_init: indicate if the subsystem needs early initialization
-- at system boot.
--
--Each cgroup object created by the system has an array of pointers,
--indexed by subsystem id; this pointer is entirely managed by the
--subsystem; the generic cgroup code will never touch this pointer.
--
--3.2 Synchronization
---------------------
--
--There is a global mutex, cgroup_mutex, used by the cgroup
--system. This should be taken by anything that wants to modify a
--cgroup. It may also be taken to prevent cgroups from being
--modified, but more specific locks may be more appropriate in that
--situation.
--
--See kernel/cgroup.c for more details.
--
--Subsystems can take/release the cgroup_mutex via the functions
--cgroup_lock()/cgroup_unlock().
--
--Accessing a task's cgroup pointer may be done in the following ways:
--- while holding cgroup_mutex
--- while holding the task's alloc_lock (via task_lock())
--- inside an rcu_read_lock() section via rcu_dereference()
--
--3.3 Subsystem API
-------------------
--
--Each subsystem should:
--
--- add an entry in linux/cgroup_subsys.h
--- define a cgroup_subsys object called <name>_subsys
--
--Each subsystem may export the following methods. The only mandatory
--methods are create/destroy. Any others that are null are presumed to
--be successful no-ops.
--
--struct cgroup_subsys_state *create(struct cgroup_subsys *ss,
-- struct cgroup *cgrp)
--(cgroup_mutex held by caller)
--
--Called to create a subsystem state object for a cgroup. The
--subsystem should allocate its subsystem state object for the passed
--cgroup, returning a pointer to the new object on success or a
--negative error code. On success, the subsystem pointer should point to
--a structure of type cgroup_subsys_state (typically embedded in a
--larger subsystem-specific object), which will be initialized by the
--cgroup system. Note that this will be called at initialization to
--create the root subsystem state for this subsystem; this case can be
--identified by the passed cgroup object having a NULL parent (since
--it's the root of the hierarchy) and may be an appropriate place for
--initialization code.
--
--void destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
--(cgroup_mutex held by caller)
--
--The cgroup system is about to destroy the passed cgroup; the subsystem
--should do any necessary cleanup and free its subsystem state
--object. By the time this method is called, the cgroup has already been
--unlinked from the file system and from the child list of its parent;
--cgroup->parent is still valid. (Note - can also be called for a
--newly-created cgroup if an error occurs after this subsystem's
--create() method has been called for the new cgroup).
--
--void pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
--(cgroup_mutex held by caller)
--
--Called before checking the reference count on each subsystem. This may
--be useful for subsystems which have some extra references even if
--there are not tasks in the cgroup.
--
--int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-- struct task_struct *task)
--(cgroup_mutex held by caller)
--
--Called prior to moving a task into a cgroup; if the subsystem
--returns an error, this will abort the attach operation. If a NULL
--task is passed, then a successful result indicates that *any*
--unspecified task can be moved into the cgroup. Note that this isn't
--called on a fork. If this method returns 0 (success) then this should
--remain valid while the caller holds cgroup_mutex.
--
--void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-- struct cgroup *old_cgrp, struct task_struct *task)
--
--Called after the task has been attached to the cgroup, to allow any
--post-attachment activity that requires memory allocations or blocking.
--
--void fork(struct cgroup_subsy *ss, struct task_struct *task)
--
--Called when a task is forked into a cgroup.
--
--void exit(struct cgroup_subsys *ss, struct task_struct *task)
--
--Called during task exit.
--
--int populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
--
--Called after creation of a cgroup to allow a subsystem to populate
--the cgroup directory with file entries. The subsystem should make
--calls to cgroup_add_file() with objects of type cftype (see
--include/linux/cgroup.h for details). Note that although this
--method can return an error code, the error code is currently not
--always handled well.
--
--void post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp)
--
--Called at the end of cgroup_clone() to do any paramater
--initialization which might be required before a task could attach. For
--example in cpusets, no task may attach before 'cpus' and 'mems' are set
--up.
--
--void bind(struct cgroup_subsys *ss, struct cgroup *root)
--(cgroup_mutex held by caller)
--
--Called when a cgroup subsystem is rebound to a different hierarchy
--and root cgroup. Currently this will only involve movement between
--the default hierarchy (which never has sub-cgroups) and a hierarchy
--that is being created/destroyed (and hence has no sub-cgroups).
--
--4. Questions
--============
--
--Q: what's up with this '/bin/echo' ?
--A: bash's builtin 'echo' command does not check calls to write() against
-- errors. If you use it in the cgroup file system, you won't be
-- able to tell whether a command succeeded or failed.
--
--Q: When I attach processes, only the first of the line gets really attached !
--A: We can only return one error code per call to write(). So you should also
-- put only ONE pid.
--
---- /dev/null
-+++ b/Documentation/cgroups/cgroups.txt
-@@ -0,0 +1,548 @@
-+ CGROUPS
-+ -------
-+
-+Written by Paul Menage <menage@google.com> based on Documentation/cpusets.txt
-+
-+Original copyright statements from cpusets.txt:
-+Portions Copyright (C) 2004 BULL SA.
-+Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
-+Modified by Paul Jackson <pj@sgi.com>
-+Modified by Christoph Lameter <clameter@sgi.com>
-+
-+CONTENTS:
-+=========
-+
-+1. Control Groups
-+ 1.1 What are cgroups ?
-+ 1.2 Why are cgroups needed ?
-+ 1.3 How are cgroups implemented ?
-+ 1.4 What does notify_on_release do ?
-+ 1.5 How do I use cgroups ?
-+2. Usage Examples and Syntax
-+ 2.1 Basic Usage
-+ 2.2 Attaching processes
-+3. Kernel API
-+ 3.1 Overview
-+ 3.2 Synchronization
-+ 3.3 Subsystem API
-+4. Questions
-+
-+1. Control Groups
-+=================
-+
-+1.1 What are cgroups ?
-+----------------------
-+
-+Control Groups provide a mechanism for aggregating/partitioning sets of
-+tasks, and all their future children, into hierarchical groups with
-+specialized behaviour.
-+
-+Definitions:
-+
-+A *cgroup* associates a set of tasks with a set of parameters for one
-+or more subsystems.
-+
-+A *subsystem* is a module that makes use of the task grouping
-+facilities provided by cgroups to treat groups of tasks in
-+particular ways. A subsystem is typically a "resource controller" that
-+schedules a resource or applies per-cgroup limits, but it may be
-+anything that wants to act on a group of processes, e.g. a
-+virtualization subsystem.
-+
-+A *hierarchy* is a set of cgroups arranged in a tree, such that
-+every task in the system is in exactly one of the cgroups in the
-+hierarchy, and a set of subsystems; each subsystem has system-specific
-+state attached to each cgroup in the hierarchy. Each hierarchy has
-+an instance of the cgroup virtual filesystem associated with it.
-+
-+At any one time there may be multiple active hierarchies of task
-+cgroups. Each hierarchy is a partition of all tasks in the system.
-+
-+User level code may create and destroy cgroups by name in an
-+instance of the cgroup virtual file system, specify and query to
-+which cgroup a task is assigned, and list the task pids assigned to
-+a cgroup. Those creations and assignments only affect the hierarchy
-+associated with that instance of the cgroup file system.
-+
-+On their own, the only use for cgroups is for simple job
-+tracking. The intention is that other subsystems hook into the generic
-+cgroup support to provide new attributes for cgroups, such as
-+accounting/limiting the resources which processes in a cgroup can
-+access. For example, cpusets (see Documentation/cpusets.txt) allows
-+you to associate a set of CPUs and a set of memory nodes with the
-+tasks in each cgroup.
-+
-+1.2 Why are cgroups needed ?
-+----------------------------
-+
-+There are multiple efforts to provide process aggregations in the
-+Linux kernel, mainly for resource tracking purposes. Such efforts
-+include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
-+namespaces. These all require the basic notion of a
-+grouping/partitioning of processes, with newly forked processes ending
-+in the same group (cgroup) as their parent process.
-+
-+The kernel cgroup patch provides the minimum essential kernel
-+mechanisms required to efficiently implement such groups. It has
-+minimal impact on the system fast paths, and provides hooks for
-+specific subsystems such as cpusets to provide additional behaviour as
-+desired.
-+
-+Multiple hierarchy support is provided to allow for situations where
-+the division of tasks into cgroups is distinctly different for
-+different subsystems - having parallel hierarchies allows each
-+hierarchy to be a natural division of tasks, without having to handle
-+complex combinations of tasks that would be present if several
-+unrelated subsystems needed to be forced into the same tree of
-+cgroups.
-+
-+At one extreme, each resource controller or subsystem could be in a
-+separate hierarchy; at the other extreme, all subsystems
-+would be attached to the same hierarchy.
-+
-+As an example of a scenario (originally proposed by vatsa@in.ibm.com)
-+that can benefit from multiple hierarchies, consider a large
-+university server with various users - students, professors, system
-+tasks etc. The resource planning for this server could be along the
-+following lines:
-+
-+ CPU : Top cpuset
-+ / \
-+ CPUSet1 CPUSet2
-+ | |
-+ (Profs) (Students)
-+
-+ In addition (system tasks) are attached to topcpuset (so
-+ that they can run anywhere) with a limit of 20%
-+
-+ Memory : Professors (50%), students (30%), system (20%)
-+
-+ Disk : Prof (50%), students (30%), system (20%)
-+
-+ Network : WWW browsing (20%), Network File System (60%), others (20%)
-+ / \
-+ Prof (15%) students (5%)
-+
-+Browsers like firefox/lynx go into the WWW network class, while (k)nfsd go
-+into NFS network class.
-+
-+At the same time firefox/lynx will share an appropriate CPU/Memory class
-+depending on who launched it (prof/student).
-+
-+With the ability to classify tasks differently for different resources
-+(by putting those resource subsystems in different hierarchies) then
-+the admin can easily set up a script which receives exec notifications
-+and depending on who is launching the browser he can
-+
-+ # echo browser_pid > /mnt/<restype>/<userclass>/tasks
-+
-+With only a single hierarchy, he now would potentially have to create
-+a separate cgroup for every browser launched and associate it with
-+the appropriate network and other resource classes. This may lead to
-+a proliferation of such cgroups.
-+
-+Also, let's say that the administrator would like to give enhanced network
-+access temporarily to a student's browser (since it is night and the user
-+wants to do online gaming :)) OR give one of the students' simulation
-+apps enhanced CPU power.
-+
-+With ability to write pids directly to resource classes, it's just a
-+matter of :
-+
-+ # echo pid > /mnt/network/<new_class>/tasks
-+ (after some time)
-+ # echo pid > /mnt/network/<orig_class>/tasks
-+
-+Without this ability, he would have to split the cgroup into
-+multiple separate ones and then associate the new cgroups with the
-+new resource classes.
-+
-+
-+
-+1.3 How are cgroups implemented ?
-+---------------------------------
-+
-+Control Groups extend the kernel as follows:
-+
-+ - Each task in the system has a reference-counted pointer to a
-+ css_set.
-+
-+ - A css_set contains a set of reference-counted pointers to
-+ cgroup_subsys_state objects, one for each cgroup subsystem
-+ registered in the system. There is no direct link from a task to
-+ the cgroup of which it's a member in each hierarchy, but this
-+ can be determined by following pointers through the
-+ cgroup_subsys_state objects. This is because accessing the
-+ subsystem state is something that's expected to happen frequently
-+ and in performance-critical code, whereas operations that require a
-+ task's actual cgroup assignments (in particular, moving between
-+ cgroups) are less common. A linked list runs through the cg_list
-+ field of each task_struct using the css_set, anchored at
-+ css_set->tasks.
-+
-+ - A cgroup hierarchy filesystem can be mounted for browsing and
-+ manipulation from user space.
-+
-+ - You can list all the tasks (by pid) attached to any cgroup.
-+
-+The implementation of cgroups requires a few, simple hooks
-+into the rest of the kernel, none in performance critical paths:
-+
-+ - in init/main.c, to initialize the root cgroups and initial
-+ css_set at system boot.
-+
-+ - in fork and exit, to attach and detach a task from its css_set.
-+
-+In addition a new file system, of type "cgroup" may be mounted, to
-+enable browsing and modifying the cgroups presently known to the
-+kernel. When mounting a cgroup hierarchy, you may specify a
-+comma-separated list of subsystems to mount as the filesystem mount
-+options. By default, mounting the cgroup filesystem attempts to
-+mount a hierarchy containing all registered subsystems.
-+
-+If an active hierarchy with exactly the same set of subsystems already
-+exists, it will be reused for the new mount. If no existing hierarchy
-+matches, and any of the requested subsystems are in use in an existing
-+hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
-+is activated, associated with the requested subsystems.
-+
-+It's not currently possible to bind a new subsystem to an active
-+cgroup hierarchy, or to unbind a subsystem from an active cgroup
-+hierarchy. This may be possible in future, but is fraught with nasty
-+error-recovery issues.
-+
-+When a cgroup filesystem is unmounted, if there are any
-+child cgroups created below the top-level cgroup, that hierarchy
-+will remain active even though unmounted; if there are no
-+child cgroups then the hierarchy will be deactivated.
-+
-+No new system calls are added for cgroups - all support for
-+querying and modifying cgroups is via this cgroup file system.
-+
-+Each task under /proc has an added file named 'cgroup' displaying,
-+for each active hierarchy, the subsystem names and the cgroup name
-+as the path relative to the root of the cgroup file system.
-+
-+Each cgroup is represented by a directory in the cgroup file system
-+containing the following files describing that cgroup:
-+
-+ - tasks: list of tasks (by pid) attached to that cgroup
-+ - releasable flag: cgroup currently removable?
-+ - notify_on_release flag: run the release agent on exit?
-+ - release_agent: the path to use for release notifications (this file
-+ exists in the top cgroup only)
-+
-+Other subsystems such as cpusets may add additional files in each
-+cgroup dir.
-+
-+New cgroups are created using the mkdir system call or shell
-+command. The properties of a cgroup, such as its flags, are
-+modified by writing to the appropriate file in that cgroup's
-+directory, as listed above.
-+
-+The named hierarchical structure of nested cgroups allows partitioning
-+a large system into nested, dynamically changeable, "soft-partitions".
-+
-+The attachment of each task, automatically inherited at fork by any
-+children of that task, to a cgroup allows organizing the work load
-+on a system into related sets of tasks. A task may be re-attached to
-+any other cgroup, if allowed by the permissions on the necessary
-+cgroup file system directories.
-+
-+When a task is moved from one cgroup to another, it gets a new
-+css_set pointer - if there's an already existing css_set with the
-+desired collection of cgroups then that group is reused, else a new
-+css_set is allocated. Note that the current implementation uses a
-+linear search to locate an appropriate existing css_set, so isn't
-+very efficient. A future version will use a hash table for better
-+performance.
-+
-+To allow access from a cgroup to the css_sets (and hence tasks)
-+that comprise it, a set of cg_cgroup_link objects form a lattice;
-+each cg_cgroup_link is linked into a list of cg_cgroup_links for
-+a single cgroup on its cgrp_link_list field, and a list of
-+cg_cgroup_links for a single css_set on its cg_link_list.
-+
-+Thus the set of tasks in a cgroup can be listed by iterating over
-+each css_set that references the cgroup, and sub-iterating over
-+each css_set's task set.
-+
-+The use of a Linux virtual file system (vfs) to represent the
-+cgroup hierarchy provides for a familiar permission and name space
-+for cgroups, with a minimum of additional kernel code.
-+
-+1.4 What does notify_on_release do ?
-+------------------------------------
-+
-+If the notify_on_release flag is enabled (1) in a cgroup, then
-+whenever the last task in the cgroup leaves (exits or attaches to
-+some other cgroup) and the last child cgroup of that cgroup
-+is removed, then the kernel runs the command specified by the contents
-+of the "release_agent" file in that hierarchy's root directory,
-+supplying the pathname (relative to the mount point of the cgroup
-+file system) of the abandoned cgroup. This enables automatic
-+removal of abandoned cgroups. The default value of
-+notify_on_release in the root cgroup at system boot is disabled
-+(0). The default value of other cgroups at creation is the current
-+value of their parent's notify_on_release setting. The default value of
-+a cgroup hierarchy's release_agent path is empty.
-+
-+1.5 How do I use cgroups ?
-+--------------------------
-+
-+To start a new job that is to be contained within a cgroup, using
-+the "cpuset" cgroup subsystem, the steps are something like:
-+
-+ 1) mkdir /dev/cgroup
-+ 2) mount -t cgroup -ocpuset cpuset /dev/cgroup
-+ 3) Create the new cgroup by doing mkdir's and write's (or echo's) in
-+ the /dev/cgroup virtual file system.
-+ 4) Start a task that will be the "founding father" of the new job.
-+ 5) Attach that task to the new cgroup by writing its pid to the
-+ /dev/cgroup tasks file for that cgroup.
-+ 6) fork, exec or clone the job tasks from this founding father task.
-+
-+For example, the following sequence of commands will setup a cgroup
-+named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
-+and then start a subshell 'sh' in that cgroup:
-+
-+ mount -t cgroup cpuset -ocpuset /dev/cgroup
-+ cd /dev/cgroup
-+ mkdir Charlie
-+ cd Charlie
-+ /bin/echo 2-3 > cpuset.cpus
-+ /bin/echo 1 > cpuset.mems
-+ /bin/echo $$ > tasks
-+ sh
-+ # The subshell 'sh' is now running in cgroup Charlie
-+ # The next line should display '/Charlie'
-+ cat /proc/self/cgroup
-+
-+2. Usage Examples and Syntax
-+============================
-+
-+2.1 Basic Usage
-+---------------
-+
-+Creating, modifying, using the cgroups can be done through the cgroup
-+virtual filesystem.
-+
-+To mount a cgroup hierarchy with all available subsystems, type:
-+# mount -t cgroup xxx /dev/cgroup
-+
-+The "xxx" is not interpreted by the cgroup code, but will appear in
-+/proc/mounts so may be any useful identifying string that you like.
-+
-+To mount a cgroup hierarchy with just the cpuset and numtasks
-+subsystems, type:
-+# mount -t cgroup -o cpuset,numtasks hier1 /dev/cgroup
-+
-+To change the set of subsystems bound to a mounted hierarchy, just
-+remount with different options:
-+
-+# mount -o remount,cpuset,ns /dev/cgroup
-+
-+Note that changing the set of subsystems is currently only supported
-+when the hierarchy consists of a single (root) cgroup. Supporting
-+the ability to arbitrarily bind/unbind subsystems from an existing
-+cgroup hierarchy is intended to be implemented in the future.
-+
-+Then under /dev/cgroup you can find a tree that corresponds to the
-+tree of the cgroups in the system. For instance, /dev/cgroup
-+is the cgroup that holds the whole system.
-+
-+If you want to create a new cgroup under /dev/cgroup:
-+# cd /dev/cgroup
-+# mkdir my_cgroup
-+
-+Now you want to do something with this cgroup.
-+# cd my_cgroup
-+
-+In this directory you can find several files:
-+# ls
-+notify_on_release releasable tasks
-+(plus whatever files added by the attached subsystems)
-+
-+Now attach your shell to this cgroup:
-+# /bin/echo $$ > tasks
-+
-+You can also create cgroups inside your cgroup by using mkdir in this
-+directory.
-+# mkdir my_sub_cs
-+
-+To remove a cgroup, just use rmdir:
-+# rmdir my_sub_cs
-+
-+This will fail if the cgroup is in use (has cgroups inside, or
-+has processes attached, or is held alive by other subsystem-specific
-+reference).
-+
-+2.2 Attaching processes
-+-----------------------
-+
-+# /bin/echo PID > tasks
-+
-+Note that it is PID, not PIDs. You can only attach ONE task at a time.
-+If you have several tasks to attach, you have to do it one after another:
-+
-+# /bin/echo PID1 > tasks
-+# /bin/echo PID2 > tasks
-+ ...
-+# /bin/echo PIDn > tasks
-+
-+You can attach the current shell task by echoing 0:
-+
-+# echo 0 > tasks
-+
-+3. Kernel API
-+=============
-+
-+3.1 Overview
-+------------
-+
-+Each kernel subsystem that wants to hook into the generic cgroup
-+system needs to create a cgroup_subsys object. This contains
-+various methods, which are callbacks from the cgroup system, along
-+with a subsystem id which will be assigned by the cgroup system.
-+
-+Other fields in the cgroup_subsys object include:
-+
-+- subsys_id: a unique array index for the subsystem, indicating which
-+ entry in cgroup->subsys[] this subsystem should be managing.
-+
-+- name: should be initialized to a unique subsystem name. Should be
-+ no longer than MAX_CGROUP_TYPE_NAMELEN.
-+
-+- early_init: indicate if the subsystem needs early initialization
-+ at system boot.
-+
-+Each cgroup object created by the system has an array of pointers,
-+indexed by subsystem id; this pointer is entirely managed by the
-+subsystem; the generic cgroup code will never touch this pointer.
-+
-+3.2 Synchronization
-+-------------------
-+
-+There is a global mutex, cgroup_mutex, used by the cgroup
-+system. This should be taken by anything that wants to modify a
-+cgroup. It may also be taken to prevent cgroups from being
-+modified, but more specific locks may be more appropriate in that
-+situation.
-+
-+See kernel/cgroup.c for more details.
-+
-+Subsystems can take/release the cgroup_mutex via the functions
-+cgroup_lock()/cgroup_unlock().
-+
-+Accessing a task's cgroup pointer may be done in the following ways:
-+- while holding cgroup_mutex
-+- while holding the task's alloc_lock (via task_lock())
-+- inside an rcu_read_lock() section via rcu_dereference()
-+
-+3.3 Subsystem API
-+-----------------
-+
-+Each subsystem should:
-+
-+- add an entry in linux/cgroup_subsys.h
-+- define a cgroup_subsys object called <name>_subsys
-+
-+Each subsystem may export the following methods. The only mandatory
-+methods are create/destroy. Any others that are null are presumed to
-+be successful no-ops.
-+
-+struct cgroup_subsys_state *create(struct cgroup_subsys *ss,
-+ struct cgroup *cgrp)
-+(cgroup_mutex held by caller)
-+
-+Called to create a subsystem state object for a cgroup. The
-+subsystem should allocate its subsystem state object for the passed
-+cgroup, returning a pointer to the new object on success or a
-+negative error code. On success, the subsystem pointer should point to
-+a structure of type cgroup_subsys_state (typically embedded in a
-+larger subsystem-specific object), which will be initialized by the
-+cgroup system. Note that this will be called at initialization to
-+create the root subsystem state for this subsystem; this case can be
-+identified by the passed cgroup object having a NULL parent (since
-+it's the root of the hierarchy) and may be an appropriate place for
-+initialization code.
-+
-+void destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
-+(cgroup_mutex held by caller)
-+
-+The cgroup system is about to destroy the passed cgroup; the subsystem
-+should do any necessary cleanup and free its subsystem state
-+object. By the time this method is called, the cgroup has already been
-+unlinked from the file system and from the child list of its parent;
-+cgroup->parent is still valid. (Note - can also be called for a
-+newly-created cgroup if an error occurs after this subsystem's
-+create() method has been called for the new cgroup).
-+
-+void pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
-+(cgroup_mutex held by caller)
-+
-+Called before checking the reference count on each subsystem. This may
-+be useful for subsystems which have some extra references even if
-+there are no tasks in the cgroup.
-+
-+int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-+ struct task_struct *task)
-+(cgroup_mutex held by caller)
-+
-+Called prior to moving a task into a cgroup; if the subsystem
-+returns an error, this will abort the attach operation. If a NULL
-+task is passed, then a successful result indicates that *any*
-+unspecified task can be moved into the cgroup. Note that this isn't
-+called on a fork. If this method returns 0 (success) then this should
-+remain valid while the caller holds cgroup_mutex.
-+
-+void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-+ struct cgroup *old_cgrp, struct task_struct *task)
-+
-+Called after the task has been attached to the cgroup, to allow any
-+post-attachment activity that requires memory allocations or blocking.
-+
-+void fork(struct cgroup_subsys *ss, struct task_struct *task)
-+
-+Called when a task is forked into a cgroup.
-+
-+void exit(struct cgroup_subsys *ss, struct task_struct *task)
-+
-+Called during task exit.
-+
-+int populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
-+
-+Called after creation of a cgroup to allow a subsystem to populate
-+the cgroup directory with file entries. The subsystem should make
-+calls to cgroup_add_file() with objects of type cftype (see
-+include/linux/cgroup.h for details). Note that although this
-+method can return an error code, the error code is currently not
-+always handled well.
-+
-+void post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp)
-+
-+Called at the end of cgroup_clone() to do any parameter
-+initialization which might be required before a task could attach. For
-+example in cpusets, no task may attach before 'cpus' and 'mems' are set
-+up.
-+
-+void bind(struct cgroup_subsys *ss, struct cgroup *root)
-+(cgroup_mutex held by caller)
-+
-+Called when a cgroup subsystem is rebound to a different hierarchy
-+and root cgroup. Currently this will only involve movement between
-+the default hierarchy (which never has sub-cgroups) and a hierarchy
-+that is being created/destroyed (and hence has no sub-cgroups).
-+
-+4. Questions
-+============
-+
-+Q: what's up with this '/bin/echo' ?
-+A: bash's builtin 'echo' command does not check calls to write() against
-+ errors. If you use it in the cgroup file system, you won't be
-+ able to tell whether a command succeeded or failed.
-+
-+Q: When I attach processes, only the first of the line gets really attached !
-+A: We can only return one error code per call to write(). So you should also
-+ put only ONE pid.
-+
---- /dev/null
-+++ b/Documentation/cgroups/freezer-subsystem.txt
-@@ -0,0 +1,102 @@
-+The cgroup freezer is useful to batch job management systems which start
-+and stop sets of tasks in order to schedule the resources of a machine
-+according to the desires of a system administrator. This sort of program
-+is often used on HPC clusters to schedule access to the cluster as a
-+whole. The cgroup freezer uses cgroups to describe the set of tasks to
-+be started/stopped by the batch job management system. It also provides
-+a means to start and stop the tasks composing the job.
-+
-+The cgroup freezer will also be useful for checkpointing running groups
-+of tasks. The freezer allows the checkpoint code to obtain a consistent
-+image of the tasks by attempting to force the tasks in a cgroup into a
-+quiescent state. Once the tasks are quiescent another task can
-+walk /proc or invoke a kernel interface to gather information about the
-+quiesced tasks. Checkpointed tasks can be restarted later should a
-+recoverable error occur. This also allows the checkpointed tasks to be
-+migrated between nodes in a cluster by copying the gathered information
-+to another node and restarting the tasks there.
-+
-+Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping
-+and resuming tasks in userspace. Both of these signals are observable
-+from within the tasks we wish to freeze. While SIGSTOP cannot be caught,
-+blocked, or ignored it can be seen by waiting or ptracing parent tasks.
-+SIGCONT is especially unsuitable since it can be caught by the task. Any
-+programs designed to watch for SIGSTOP and SIGCONT could be broken by
-+attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can
-+demonstrate this problem using nested bash shells:
-+
-+ $ echo $$
-+ 16644
-+ $ bash
-+ $ echo $$
-+ 16690
-+
-+ From a second, unrelated bash shell:
-+ $ kill -SIGSTOP 16690
-+ $ kill -SIGCONT 16690
-+
-+ <at this point 16690 exits and causes 16644 to exit too>
-+
-+This happens because bash can observe both signals and choose how it
-+responds to them.
-+
-+Another example of a program which catches and responds to these
-+signals is gdb. In fact any program designed to use ptrace is likely to
-+have a problem with this method of stopping and resuming tasks.
-+
-+In contrast, the cgroup freezer uses the kernel freezer code to
-+prevent the freeze/unfreeze cycle from becoming visible to the tasks
-+being frozen. This allows the bash example above and gdb to run as
-+expected.
-+
-+The freezer subsystem in the container filesystem defines a file named
-+freezer.state. Writing "FROZEN" to the state file will freeze all tasks in the
-+cgroup. Subsequently writing "THAWED" will unfreeze the tasks in the cgroup.
-+Reading will return the current state.
-+
-+Note freezer.state doesn't exist in root cgroup, which means root cgroup
-+is non-freezable.
-+
-+* Examples of usage :
-+
-+ # mkdir /containers
-+ # mount -t cgroup -ofreezer freezer /containers
-+ # mkdir /containers/0
-+ # echo $some_pid > /containers/0/tasks
-+
-+to get status of the freezer subsystem :
-+
-+ # cat /containers/0/freezer.state
-+ THAWED
-+
-+to freeze all tasks in the container :
-+
-+ # echo FROZEN > /containers/0/freezer.state
-+ # cat /containers/0/freezer.state
-+ FREEZING
-+ # cat /containers/0/freezer.state
-+ FROZEN
-+
-+to unfreeze all tasks in the container :
-+
-+ # echo THAWED > /containers/0/freezer.state
-+ # cat /containers/0/freezer.state
-+ THAWED
-+
-+This is the basic mechanism which should do the right thing for user space tasks
-+in a simple scenario.
-+
-+It's important to note that freezing can be incomplete. In that case we return
-+EBUSY. This means that some tasks in the cgroup are busy doing something that
-+prevents us from completely freezing the cgroup at this time. After EBUSY,
-+the cgroup will remain partially frozen -- reflected by freezer.state reporting
-+"FREEZING" when read. The state will remain "FREEZING" until one of these
-+things happens:
-+
-+ 1) Userspace cancels the freezing operation by writing "THAWED" to
-+ the freezer.state file
-+ 2) Userspace retries the freezing operation by writing "FROZEN" to
-+ the freezer.state file (writing "FREEZING" is not legal
-+ and returns EINVAL)
-+ 3) The tasks that blocked the cgroup from entering the "FROZEN"
-+ state disappear from the cgroup's set of tasks.
---- a/Documentation/cpusets.txt
-+++ b/Documentation/cpusets.txt
-@@ -48,7 +48,7 @@ hooks, beyond what is already present, r
- job placement on large systems.
-
- Cpusets use the generic cgroup subsystem described in
--Documentation/cgroup.txt.
-+Documentation/cgroups/cgroups.txt.
-
- Requests by a task, using the sched_setaffinity(2) system call to
- include CPUs in its CPU affinity mask, and using the mbind(2) and
---- a/arch/alpha/Kconfig
-+++ b/arch/alpha/Kconfig
-@@ -72,6 +72,7 @@ config ARCH_SUPPORTS_AOUT
- def_bool y
-
- source "init/Kconfig"
-+source "kernel/Kconfig.freezer"
-
-
- menu "System setup"
---- a/arch/alpha/include/asm/thread_info.h
-+++ b/arch/alpha/include/asm/thread_info.h
-@@ -74,12 +74,14 @@ register struct thread_info *__current_t
- #define TIF_UAC_SIGBUS 7
- #define TIF_MEMDIE 8
- #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal */
-+#define TIF_FREEZE 16 /* is freezing for suspend */
-
- #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
- #define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
- #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED)
- #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG)
- #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK)
-+#define _TIF_FREEZE (1<<TIF_FREEZE)
-
- /* Work to do on interrupt/exception return. */
- #define _TIF_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED)
---- a/arch/arm/Kconfig
-+++ b/arch/arm/Kconfig
-@@ -190,6 +190,8 @@ config VECTORS_BASE
-
- source "init/Kconfig"
-
-+source "kernel/Kconfig.freezer"
-+
- menu "System Type"
-
- choice
---- a/arch/avr32/Kconfig
-+++ b/arch/avr32/Kconfig
-@@ -72,6 +72,8 @@ config GENERIC_BUG
-
- source "init/Kconfig"
-
-+source "kernel/Kconfig.freezer"
-+
- menu "System Type and features"
-
- source "kernel/time/Kconfig"
---- a/arch/avr32/include/asm/thread_info.h
-+++ b/arch/avr32/include/asm/thread_info.h
-@@ -96,6 +96,7 @@ static inline struct thread_info *curren
- #define _TIF_MEMDIE (1 << TIF_MEMDIE)
- #define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK)
- #define _TIF_CPU_GOING_TO_SLEEP (1 << TIF_CPU_GOING_TO_SLEEP)
-+#define _TIF_FREEZE (1 << TIF_FREEZE)
-
- /* Note: The masks below must never span more than 16 bits! */
-
---- a/arch/blackfin/Kconfig
-+++ b/arch/blackfin/Kconfig
-@@ -64,8 +64,11 @@ config HARDWARE_PM
- depends on OPROFILE
-
- source "init/Kconfig"
-+
- source "kernel/Kconfig.preempt"
-
-+source "kernel/Kconfig.freezer"
-+
- menu "Blackfin Processor Options"
-
- comment "Processor and Board Settings"
---- a/arch/cris/Kconfig
-+++ b/arch/cris/Kconfig
-@@ -62,6 +62,8 @@ config HZ
-
- source "init/Kconfig"
-
-+source "kernel/Kconfig.freezer"
-+
- menu "General setup"
-
- source "fs/Kconfig.binfmt"
---- a/arch/frv/Kconfig
-+++ b/arch/frv/Kconfig
-@@ -66,6 +66,8 @@ mainmenu "Fujitsu FR-V Kernel Configurat
-
- source "init/Kconfig"
-
-+source "kernel/Kconfig.freezer"
-+
-
- menu "Fujitsu FR-V system setup"
-
---- a/arch/h8300/Kconfig
-+++ b/arch/h8300/Kconfig
-@@ -89,6 +89,8 @@ config HZ
-
- source "init/Kconfig"
-
-+source "kernel/Kconfig.freezer"
-+
- source "arch/h8300/Kconfig.cpu"
-
- menu "Executable file formats"
---- a/arch/h8300/include/asm/thread_info.h
-+++ b/arch/h8300/include/asm/thread_info.h
-@@ -89,6 +89,7 @@ static inline struct thread_info *curren
- TIF_NEED_RESCHED */
- #define TIF_MEMDIE 4
- #define TIF_RESTORE_SIGMASK 5 /* restore signal mask in do_signal() */
-+#define TIF_FREEZE 16 /* is freezing for suspend */
-
- /* as above, but as bit values */
- #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
-@@ -96,6 +97,7 @@ static inline struct thread_info *curren
- #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED)
- #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG)
- #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK)
-+#define _TIF_FREEZE (1<<TIF_FREEZE)
-
- #define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */
-
---- a/arch/ia64/Kconfig
-+++ b/arch/ia64/Kconfig
-@@ -7,6 +7,8 @@ mainmenu "IA-64 Linux Kernel Configurati
-
- source "init/Kconfig"
-
-+source "kernel/Kconfig.freezer"
-+
- menu "Processor type and features"
-
- config IA64
---- a/arch/m32r/Kconfig
-+++ b/arch/m32r/Kconfig
-@@ -45,6 +45,8 @@ config HZ
-
- source "init/Kconfig"
-
-+source "kernel/Kconfig.freezer"
-+
-
- menu "Processor type and features"
-
---- a/arch/m68k/Kconfig
-+++ b/arch/m68k/Kconfig
-@@ -64,6 +64,8 @@ mainmenu "Linux/68k Kernel Configuration
-
- source "init/Kconfig"
-
-+source "kernel/Kconfig.freezer"
-+
- menu "Platform dependent setup"
-
- config EISA
---- a/arch/m68knommu/Kconfig
-+++ b/arch/m68knommu/Kconfig
-@@ -82,6 +82,8 @@ config ARCH_SUPPORTS_AOUT
-
- source "init/Kconfig"
-
-+source "kernel/Kconfig.freezer"
-+
- menu "Processor type and features"
-
- choice
---- a/arch/m68knommu/include/asm/thread_info.h
-+++ b/arch/m68knommu/include/asm/thread_info.h
-@@ -84,12 +84,14 @@ static inline struct thread_info *curren
- #define TIF_POLLING_NRFLAG 3 /* true if poll_idle() is polling
- TIF_NEED_RESCHED */
- #define TIF_MEMDIE 4
-+#define TIF_FREEZE 16 /* is freezing for suspend */
-
- /* as above, but as bit values */
- #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
- #define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
- #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED)
- #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG)
-+#define _TIF_FREEZE (1<<TIF_FREEZE)
-
- #define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */
-
---- a/arch/mips/Kconfig
-+++ b/arch/mips/Kconfig
-@@ -1885,6 +1885,8 @@ config PROBE_INITRD_HEADER
- add initrd or initramfs image to the kernel image.
- Otherwise, say N.
-
-+source "kernel/Kconfig.freezer"
-+
- menu "Bus options (PCI, PCMCIA, EISA, ISA, TC)"
-
- config HW_HAS_EISA
---- a/arch/mn10300/Kconfig
-+++ b/arch/mn10300/Kconfig
-@@ -71,6 +71,8 @@ mainmenu "Matsushita MN10300/AM33 Kernel
-
- source "init/Kconfig"
-
-+source "kernel/Kconfig.freezer"
-+
-
- menu "Matsushita MN10300 system setup"
-
---- a/arch/parisc/Kconfig
-+++ b/arch/parisc/Kconfig
-@@ -93,6 +93,8 @@ config ARCH_MAY_HAVE_PC_FDC
-
- source "init/Kconfig"
-
-+source "kernel/Kconfig.freezer"
-+
-
- menu "Processor type and features"
-
---- a/arch/powerpc/Kconfig
-+++ b/arch/powerpc/Kconfig
-@@ -228,6 +228,8 @@ config PPC_OF_PLATFORM_PCI
-
- source "init/Kconfig"
-
-+source "kernel/Kconfig.freezer"
-+
- source "arch/powerpc/sysdev/Kconfig"
- source "arch/powerpc/platforms/Kconfig"
-
---- a/arch/s390/Kconfig
-+++ b/arch/s390/Kconfig
-@@ -79,6 +79,8 @@ config S390
-
- source "init/Kconfig"
-
-+source "kernel/Kconfig.freezer"
-+
- menu "Base setup"
-
- comment "Processor type and features"
---- a/arch/s390/include/asm/thread_info.h
-+++ b/arch/s390/include/asm/thread_info.h
-@@ -98,6 +98,7 @@ static inline struct thread_info *curren
- #define TIF_31BIT 18 /* 32bit process */
- #define TIF_MEMDIE 19
- #define TIF_RESTORE_SIGMASK 20 /* restore signal mask in do_signal() */
-+#define TIF_FREEZE 21
-
- #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
- #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK)
-@@ -110,6 +111,7 @@ static inline struct thread_info *curren
- #define _TIF_USEDFPU (1<<TIF_USEDFPU)
- #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG)
- #define _TIF_31BIT (1<<TIF_31BIT)
-+#define _TIF_FREEZE (1<<TIF_FREEZE)
-
- #endif /* __KERNEL__ */
-
---- a/arch/sh/Kconfig
-+++ b/arch/sh/Kconfig
-@@ -106,6 +106,8 @@ config IO_TRAPPED
-
- source "init/Kconfig"
-
-+source "kernel/Kconfig.freezer"
-+
- menu "System type"
-
- #
---- a/arch/sparc/Kconfig
-+++ b/arch/sparc/Kconfig
-@@ -32,6 +32,8 @@ config HZ
-
- source "init/Kconfig"
-
-+source "kernel/Kconfig.freezer"
-+
- menu "General machine setup"
-
- config SMP
---- a/arch/sparc/include/asm/thread_info_32.h
-+++ b/arch/sparc/include/asm/thread_info_32.h
-@@ -139,6 +139,7 @@ BTFIXUPDEF_CALL(void, free_thread_info,
- #define TIF_POLLING_NRFLAG 9 /* true if poll_idle() is polling
- * TIF_NEED_RESCHED */
- #define TIF_MEMDIE 10
-+#define TIF_FREEZE 11 /* is freezing for suspend */
-
- /* as above, but as bit values */
- #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
-@@ -152,6 +153,7 @@ BTFIXUPDEF_CALL(void, free_thread_info,
- #define _TIF_DO_NOTIFY_RESUME_MASK (_TIF_NOTIFY_RESUME | \
- _TIF_SIGPENDING | \
- _TIF_RESTORE_SIGMASK)
-+#define _TIF_FREEZE (1<<TIF_FREEZE)
-
- #endif /* __KERNEL__ */
-
---- a/arch/sparc64/Kconfig
-+++ b/arch/sparc64/Kconfig
-@@ -85,6 +85,7 @@ config GENERIC_HARDIRQS_NO__DO_IRQ
- def_bool y
-
- source "init/Kconfig"
-+source "kernel/Kconfig.freezer"
-
- menu "Processor type and features"
-
---- a/arch/um/Kconfig
-+++ b/arch/um/Kconfig
-@@ -229,6 +229,8 @@ endmenu
-
- source "init/Kconfig"
-
-+source "kernel/Kconfig.freezer"
-+
- source "drivers/block/Kconfig"
-
- source "arch/um/Kconfig.char"
---- a/arch/x86/Kconfig
-+++ b/arch/x86/Kconfig
-@@ -208,6 +208,7 @@ config X86_TRAMPOLINE
- config KTIME_SCALAR
- def_bool X86_32
- source "init/Kconfig"
-+source "kernel/Kconfig.freezer"
-
- menu "Processor type and features"
-
---- a/arch/xtensa/Kconfig
-+++ b/arch/xtensa/Kconfig
-@@ -55,6 +55,7 @@ config HZ
- default 100
-
- source "init/Kconfig"
-+source "kernel/Kconfig.freezer"
-
- menu "Processor type and features"
-
---- a/include/asm-cris/thread_info.h
-+++ b/include/asm-cris/thread_info.h
-@@ -88,6 +88,7 @@ struct thread_info {
- #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */
- #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */
- #define TIF_MEMDIE 17
-+#define TIF_FREEZE 18 /* is freezing for suspend */
-
- #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
- #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME)
-@@ -95,6 +96,7 @@ struct thread_info {
- #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED)
- #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK)
- #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG)
-+#define _TIF_FREEZE (1<<TIF_FREEZE)
-
- #define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */
- #define _TIF_ALLWORK_MASK 0x0000FFFF /* work to do on any return to u-space */
---- a/include/asm-m68k/thread_info.h
-+++ b/include/asm-m68k/thread_info.h
-@@ -52,5 +52,6 @@ struct thread_info {
- #define TIF_DELAYED_TRACE 14 /* single step a syscall */
- #define TIF_SYSCALL_TRACE 15 /* syscall trace active */
- #define TIF_MEMDIE 16
-+#define TIF_FREEZE 17 /* thread is freezing for suspend */
-
- #endif /* _ASM_M68K_THREAD_INFO_H */
---- a/include/asm-parisc/thread_info.h
-+++ b/include/asm-parisc/thread_info.h
-@@ -58,6 +58,7 @@ struct thread_info {
- #define TIF_32BIT 4 /* 32 bit binary */
- #define TIF_MEMDIE 5
- #define TIF_RESTORE_SIGMASK 6 /* restore saved signal mask */
-+#define TIF_FREEZE 7 /* is freezing for suspend */
-
- #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
- #define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
-@@ -65,6 +66,7 @@ struct thread_info {
- #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
- #define _TIF_32BIT (1 << TIF_32BIT)
- #define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK)
-+#define _TIF_FREEZE (1 << TIF_FREEZE)
-
- #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | \
- _TIF_NEED_RESCHED | _TIF_RESTORE_SIGMASK)
---- a/include/asm-um/thread_info.h
-+++ b/include/asm-um/thread_info.h
-@@ -69,6 +69,7 @@ static inline struct thread_info *curren
- #define TIF_MEMDIE 5
- #define TIF_SYSCALL_AUDIT 6
- #define TIF_RESTORE_SIGMASK 7
-+#define TIF_FREEZE 16 /* is freezing for suspend */
-
- #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
- #define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
-@@ -77,5 +78,6 @@ static inline struct thread_info *curren
- #define _TIF_MEMDIE (1 << TIF_MEMDIE)
- #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
- #define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK)
-+#define _TIF_FREEZE (1 << TIF_FREEZE)
-
- #endif
---- a/include/asm-xtensa/thread_info.h
-+++ b/include/asm-xtensa/thread_info.h
-@@ -134,6 +134,7 @@ static inline struct thread_info *curren
- #define TIF_MEMDIE 5
- #define TIF_RESTORE_SIGMASK 6 /* restore signal mask in do_signal() */
- #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */
-+#define TIF_FREEZE 17 /* is freezing for suspend */
-
- #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
- #define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
-@@ -142,6 +143,7 @@ static inline struct thread_info *curren
- #define _TIF_IRET (1<<TIF_IRET)
- #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG)
- #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK)
-+#define _TIF_FREEZE (1<<TIF_FREEZE)
-
- #define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */
- #define _TIF_ALLWORK_MASK 0x0000FFFF /* work to do on any return to u-space */
---- a/include/linux/cgroup_subsys.h
-+++ b/include/linux/cgroup_subsys.h
-@@ -48,3 +48,9 @@ SUBSYS(devices)
- #endif
-
- /* */
-+
-+#ifdef CONFIG_CGROUP_FREEZER
-+SUBSYS(freezer)
-+#endif
-+
-+/* */
---- a/include/linux/freezer.h
-+++ b/include/linux/freezer.h
-@@ -6,7 +6,7 @@
- #include <linux/sched.h>
- #include <linux/wait.h>
-
--#ifdef CONFIG_PM_SLEEP
-+#ifdef CONFIG_FREEZER
- /*
- * Check if a process has been frozen
- */
-@@ -39,29 +39,14 @@ static inline void clear_freeze_flag(str
- clear_tsk_thread_flag(p, TIF_FREEZE);
- }
-
--/*
-- * Wake up a frozen process
-- *
-- * task_lock() is taken to prevent the race with refrigerator() which may
-- * occur if the freezing of tasks fails. Namely, without the lock, if the
-- * freezing of tasks failed, thaw_tasks() might have run before a task in
-- * refrigerator() could call frozen_process(), in which case the task would be
-- * frozen and no one would thaw it.
-- */
--static inline int thaw_process(struct task_struct *p)
--{
-- task_lock(p);
-- if (frozen(p)) {
-- p->flags &= ~PF_FROZEN;
-- task_unlock(p);
-- wake_up_process(p);
-- return 1;
-- }
-- clear_freeze_flag(p);
-- task_unlock(p);
-- return 0;
-+static inline bool should_send_signal(struct task_struct *p)
-+{
-+ return !(p->flags & PF_FREEZER_NOSIG);
- }
-
-+/* Takes and releases task alloc lock using task_lock() */
-+extern int thaw_process(struct task_struct *p);
-+
- extern void refrigerator(void);
- extern int freeze_processes(void);
- extern void thaw_processes(void);
-@@ -75,6 +60,15 @@ static inline int try_to_freeze(void)
- return 0;
- }
-
-+extern bool freeze_task(struct task_struct *p, bool sig_only);
-+extern void cancel_freezing(struct task_struct *p);
-+
-+#ifdef CONFIG_CGROUP_FREEZER
-+extern int cgroup_frozen(struct task_struct *task);
-+#else /* !CONFIG_CGROUP_FREEZER */
-+static inline int cgroup_frozen(struct task_struct *task) { return 0; }
-+#endif /* !CONFIG_CGROUP_FREEZER */
-+
- /*
- * The PF_FREEZER_SKIP flag should be set by a vfork parent right before it
- * calls wait_for_completion(&vfork) and reset right after it returns from this
-@@ -166,7 +160,7 @@ static inline void set_freezable_with_si
- } while (try_to_freeze()); \
- __retval; \
- })
--#else /* !CONFIG_PM_SLEEP */
-+#else /* !CONFIG_FREEZER */
- static inline int frozen(struct task_struct *p) { return 0; }
- static inline int freezing(struct task_struct *p) { return 0; }
- static inline void set_freeze_flag(struct task_struct *p) {}
-@@ -191,6 +185,6 @@ static inline void set_freezable_with_si
- #define wait_event_freezable_timeout(wq, condition, timeout) \
- wait_event_interruptible_timeout(wq, condition, timeout)
-
--#endif /* !CONFIG_PM_SLEEP */
-+#endif /* !CONFIG_FREEZER */
-
- #endif /* FREEZER_H_INCLUDED */
---- a/init/Kconfig
-+++ b/init/Kconfig
-@@ -303,6 +303,13 @@ config CGROUP_NS
- for instance virtual servers and checkpoint/restart
- jobs.
-
-+config CGROUP_FREEZER
-+ bool "control group freezer subsystem"
-+ depends on CGROUPS
-+ help
-+ Provides a way to freeze and unfreeze all tasks in a
-+ cgroup.
-+
- config CGROUP_DEVICE
- bool "Device controller for cgroups"
- depends on CGROUPS && EXPERIMENTAL
---- /dev/null
-+++ b/kernel/Kconfig.freezer
-@@ -0,0 +1,2 @@
-+config FREEZER
-+ def_bool PM_SLEEP || CGROUP_FREEZER
---- a/kernel/Makefile
-+++ b/kernel/Makefile
-@@ -22,6 +22,7 @@ CFLAGS_REMOVE_sched_clock.o = -pg
- CFLAGS_REMOVE_sched.o = -pg
- endif
-
-+obj-$(CONFIG_FREEZER) += freezer.o
- obj-$(CONFIG_PROFILING) += profile.o
- obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
- obj-$(CONFIG_STACKTRACE) += stacktrace.o
-@@ -54,6 +55,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += bac
- obj-$(CONFIG_COMPAT) += compat.o
- obj-$(CONFIG_CGROUPS) += cgroup.o
- obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
-+obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
- obj-$(CONFIG_CPUSETS) += cpuset.o
- obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
- obj-$(CONFIG_UTS_NS) += utsname.o
---- /dev/null
-+++ b/kernel/cgroup_freezer.c
-@@ -0,0 +1,379 @@
-+/*
-+ * cgroup_freezer.c - control group freezer subsystem
-+ *
-+ * Copyright IBM Corporation, 2007
-+ *
-+ * Author : Cedric Le Goater <clg@fr.ibm.com>
-+ *
-+ * This program is free software; you can redistribute it and/or modify it
-+ * under the terms of version 2.1 of the GNU Lesser General Public License
-+ * as published by the Free Software Foundation.
-+ *
-+ * This program is distributed in the hope that it would be useful, but
-+ * WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-+ */
-+
-+#include <linux/module.h>
-+#include <linux/cgroup.h>
-+#include <linux/fs.h>
-+#include <linux/uaccess.h>
-+#include <linux/freezer.h>
-+#include <linux/seq_file.h>
-+
-+enum freezer_state {
-+ CGROUP_THAWED = 0,
-+ CGROUP_FREEZING,
-+ CGROUP_FROZEN,
-+};
-+
-+struct freezer {
-+ struct cgroup_subsys_state css;
-+ enum freezer_state state;
-+ spinlock_t lock; /* protects _writes_ to state */
-+};
-+
-+static inline struct freezer *cgroup_freezer(
-+ struct cgroup *cgroup)
-+{
-+ return container_of(
-+ cgroup_subsys_state(cgroup, freezer_subsys_id),
-+ struct freezer, css);
-+}
-+
-+static inline struct freezer *task_freezer(struct task_struct *task)
-+{
-+ return container_of(task_subsys_state(task, freezer_subsys_id),
-+ struct freezer, css);
-+}
-+
-+int cgroup_frozen(struct task_struct *task)
-+{
-+ struct freezer *freezer;
-+ enum freezer_state state;
-+
-+ task_lock(task);
-+ freezer = task_freezer(task);
-+ state = freezer->state;
-+ task_unlock(task);
-+
-+ return state == CGROUP_FROZEN;
-+}
-+
-+/*
-+ * cgroups_write_string() limits the size of freezer state strings to
-+ * CGROUP_LOCAL_BUFFER_SIZE
-+ */
-+static const char *freezer_state_strs[] = {
-+ "THAWED",
-+ "FREEZING",
-+ "FROZEN",
-+};
-+
-+/*
-+ * State diagram
-+ * Transitions are caused by userspace writes to the freezer.state file.
-+ * The values in parentheses are state labels. The rest are edge labels.
-+ *
-+ * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
-+ * ^ ^ | |
-+ * | \_______THAWED_______/ |
-+ * \__________________________THAWED____________/
-+ */
-+
-+struct cgroup_subsys freezer_subsys;
-+
-+/* Locks taken and their ordering
-+ * ------------------------------
-+ * css_set_lock
-+ * cgroup_mutex (AKA cgroup_lock)
-+ * task->alloc_lock (AKA task_lock)
-+ * freezer->lock
-+ * task->sighand->siglock
-+ *
-+ * cgroup code forces css_set_lock to be taken before task->alloc_lock
-+ *
-+ * freezer_create(), freezer_destroy():
-+ * cgroup_mutex [ by cgroup core ]
-+ *
-+ * can_attach():
-+ * cgroup_mutex
-+ *
-+ * cgroup_frozen():
-+ * task->alloc_lock (to get task's cgroup)
-+ *
-+ * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
-+ * task->alloc_lock (to get task's cgroup)
-+ * freezer->lock
-+ * sighand->siglock (if the cgroup is freezing)
-+ *
-+ * freezer_read():
-+ * cgroup_mutex
-+ * freezer->lock
-+ * read_lock css_set_lock (cgroup iterator start)
-+ *
-+ * freezer_write() (freeze):
-+ * cgroup_mutex
-+ * freezer->lock
-+ * read_lock css_set_lock (cgroup iterator start)
-+ * sighand->siglock
-+ *
-+ * freezer_write() (unfreeze):
-+ * cgroup_mutex
-+ * freezer->lock
-+ * read_lock css_set_lock (cgroup iterator start)
-+ * task->alloc_lock (to prevent races with freeze_task())
-+ * sighand->siglock
-+ */
-+static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
-+ struct cgroup *cgroup)
-+{
-+ struct freezer *freezer;
-+
-+ freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL);
-+ if (!freezer)
-+ return ERR_PTR(-ENOMEM);
-+
-+ spin_lock_init(&freezer->lock);
-+ freezer->state = CGROUP_THAWED;
-+ return &freezer->css;
-+}
-+
-+static void freezer_destroy(struct cgroup_subsys *ss,
-+ struct cgroup *cgroup)
-+{
-+ kfree(cgroup_freezer(cgroup));
-+}
-+
-+/* Task is frozen or will freeze immediately when next it gets woken */
-+static bool is_task_frozen_enough(struct task_struct *task)
-+{
-+ return frozen(task) ||
-+ (task_is_stopped_or_traced(task) && freezing(task));
-+}
-+
-+/*
-+ * The call to cgroup_lock() in the freezer.state write method prevents
-+ * a write to that file racing against an attach, and hence the
-+ * can_attach() result will remain valid until the attach completes.
-+ */
-+static int freezer_can_attach(struct cgroup_subsys *ss,
-+ struct cgroup *new_cgroup,
-+ struct task_struct *task)
-+{
-+ struct freezer *freezer;
-+
-+ /*
-+ * Anything frozen can't move or be moved to/from.
-+ *
-+ * Since orig_freezer->state == FROZEN implies that @task has been
-+ * frozen, it's sufficient to check the latter condition.
-+ */
-+
-+ if (is_task_frozen_enough(task))
-+ return -EBUSY;
-+
-+ freezer = cgroup_freezer(new_cgroup);
-+ if (freezer->state == CGROUP_FROZEN)
-+ return -EBUSY;
-+
-+ return 0;
-+}
-+
-+static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
-+{
-+ struct freezer *freezer;
-+
-+ /*
-+ * No lock is needed, since the task isn't on tasklist yet,
-+ * so it can't be moved to another cgroup, which means the
-+ * freezer won't be removed and will be valid during this
-+ * function call.
-+ */
-+ freezer = task_freezer(task);
-+
-+ /*
-+ * The root cgroup is non-freezable, so we can skip the
-+ * following check.
-+ */
-+ if (!freezer->css.cgroup->parent)
-+ return;
-+
-+ spin_lock_irq(&freezer->lock);
-+ BUG_ON(freezer->state == CGROUP_FROZEN);
-+
-+ /* Locking avoids race with FREEZING -> THAWED transitions. */
-+ if (freezer->state == CGROUP_FREEZING)
-+ freeze_task(task, true);
-+ spin_unlock_irq(&freezer->lock);
-+}
-+
-+/*
-+ * caller must hold freezer->lock
-+ */
-+static void update_freezer_state(struct cgroup *cgroup,
-+ struct freezer *freezer)
-+{
-+ struct cgroup_iter it;
-+ struct task_struct *task;
-+ unsigned int nfrozen = 0, ntotal = 0;
-+
-+ cgroup_iter_start(cgroup, &it);
-+ while ((task = cgroup_iter_next(cgroup, &it))) {
-+ ntotal++;
-+ if (is_task_frozen_enough(task))
-+ nfrozen++;
-+ }
-+
-+ /*
-+ * Transition to FROZEN when no new tasks can be added ensures
-+ * that we never exist in the FROZEN state while there are unfrozen
-+ * tasks.
-+ */
-+ if (nfrozen == ntotal)
-+ freezer->state = CGROUP_FROZEN;
-+ else if (nfrozen > 0)
-+ freezer->state = CGROUP_FREEZING;
-+ else
-+ freezer->state = CGROUP_THAWED;
-+ cgroup_iter_end(cgroup, &it);
-+}
-+
-+static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
-+ struct seq_file *m)
-+{
-+ struct freezer *freezer;
-+ enum freezer_state state;
-+
-+ if (!cgroup_lock_live_group(cgroup))
-+ return -ENODEV;
-+
-+ freezer = cgroup_freezer(cgroup);
-+ spin_lock_irq(&freezer->lock);
-+ state = freezer->state;
-+ if (state == CGROUP_FREEZING) {
-+ /* We change from FREEZING to FROZEN lazily if the cgroup was
-+ * only partially frozen when we exited write. */
-+ update_freezer_state(cgroup, freezer);
-+ state = freezer->state;
-+ }
-+ spin_unlock_irq(&freezer->lock);
-+ cgroup_unlock();
-+
-+ seq_puts(m, freezer_state_strs[state]);
-+ seq_putc(m, '\n');
-+ return 0;
-+}
-+
-+static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
-+{
-+ struct cgroup_iter it;
-+ struct task_struct *task;
-+ unsigned int num_cant_freeze_now = 0;
-+
-+ freezer->state = CGROUP_FREEZING;
-+ cgroup_iter_start(cgroup, &it);
-+ while ((task = cgroup_iter_next(cgroup, &it))) {
-+ if (!freeze_task(task, true))
-+ continue;
-+ if (is_task_frozen_enough(task))
-+ continue;
-+ if (!freezing(task) && !freezer_should_skip(task))
-+ num_cant_freeze_now++;
-+ }
-+ cgroup_iter_end(cgroup, &it);
-+
-+ return num_cant_freeze_now ? -EBUSY : 0;
-+}
-+
-+static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
-+{
-+ struct cgroup_iter it;
-+ struct task_struct *task;
-+
-+ cgroup_iter_start(cgroup, &it);
-+ while ((task = cgroup_iter_next(cgroup, &it))) {
-+ thaw_process(task);
-+ }
-+ cgroup_iter_end(cgroup, &it);
-+
-+ freezer->state = CGROUP_THAWED;
-+}
-+
-+static int freezer_change_state(struct cgroup *cgroup,
-+ enum freezer_state goal_state)
-+{
-+ struct freezer *freezer;
-+ int retval = 0;
-+
-+ freezer = cgroup_freezer(cgroup);
-+
-+ spin_lock_irq(&freezer->lock);
-+
-+ update_freezer_state(cgroup, freezer);
-+ if (goal_state == freezer->state)
-+ goto out;
-+
-+ switch (goal_state) {
-+ case CGROUP_THAWED:
-+ unfreeze_cgroup(cgroup, freezer);
-+ break;
-+ case CGROUP_FROZEN:
-+ retval = try_to_freeze_cgroup(cgroup, freezer);
-+ break;
-+ default:
-+ BUG();
-+ }
-+out:
-+ spin_unlock_irq(&freezer->lock);
-+
-+ return retval;
-+}
-+
-+static int freezer_write(struct cgroup *cgroup,
-+ struct cftype *cft,
-+ const char *buffer)
-+{
-+ int retval;
-+ enum freezer_state goal_state;
-+
-+ if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0)
-+ goal_state = CGROUP_THAWED;
-+ else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
-+ goal_state = CGROUP_FROZEN;
-+ else
-+ return -EINVAL;
-+
-+ if (!cgroup_lock_live_group(cgroup))
-+ return -ENODEV;
-+ retval = freezer_change_state(cgroup, goal_state);
-+ cgroup_unlock();
-+ return retval;
-+}
-+
-+static struct cftype files[] = {
-+ {
-+ .name = "state",
-+ .read_seq_string = freezer_read,
-+ .write_string = freezer_write,
-+ },
-+};
-+
-+static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
-+{
-+ if (!cgroup->parent)
-+ return 0;
-+ return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
-+}
-+
-+struct cgroup_subsys freezer_subsys = {
-+ .name = "freezer",
-+ .create = freezer_create,
-+ .destroy = freezer_destroy,
-+ .populate = freezer_populate,
-+ .subsys_id = freezer_subsys_id,
-+ .can_attach = freezer_can_attach,
-+ .attach = NULL,
-+ .fork = freezer_fork,
-+ .exit = NULL,
-+};
---- /dev/null
-+++ b/kernel/freezer.c
-@@ -0,0 +1,154 @@
-+/*
-+ * kernel/freezer.c - Function to freeze a process
-+ *
-+ * Originally from kernel/power/process.c
-+ */
-+
-+#include <linux/interrupt.h>
-+#include <linux/suspend.h>
-+#include <linux/module.h>
-+#include <linux/syscalls.h>
-+#include <linux/freezer.h>
-+
-+/*
-+ * freezing is complete, mark current process as frozen
-+ */
-+static inline void frozen_process(void)
-+{
-+ if (!unlikely(current->flags & PF_NOFREEZE)) {
-+ current->flags |= PF_FROZEN;
-+ wmb();
-+ }
-+ clear_freeze_flag(current);
-+}
-+
-+/* Refrigerator is place where frozen processes are stored :-). */
-+void refrigerator(void)
-+{
-+ /* Hmm, should we be allowed to suspend when there are realtime
-+ processes around? */
-+ long save;
-+
-+ task_lock(current);
-+ if (freezing(current)) {
-+ frozen_process();
-+ task_unlock(current);
-+ } else {
-+ task_unlock(current);
-+ return;
-+ }
-+ save = current->state;
-+ pr_debug("%s entered refrigerator\n", current->comm);
-+
-+ spin_lock_irq(&current->sighand->siglock);
-+ recalc_sigpending(); /* We sent fake signal, clean it up */
-+ spin_unlock_irq(&current->sighand->siglock);
-+
-+ for (;;) {
-+ set_current_state(TASK_UNINTERRUPTIBLE);
-+ if (!frozen(current))
-+ break;
-+ schedule();
-+ }
-+ pr_debug("%s left refrigerator\n", current->comm);
-+ __set_current_state(save);
-+}
-+EXPORT_SYMBOL(refrigerator);
-+
-+static void fake_signal_wake_up(struct task_struct *p)
-+{
-+ unsigned long flags;
-+
-+ spin_lock_irqsave(&p->sighand->siglock, flags);
-+ signal_wake_up(p, 0);
-+ spin_unlock_irqrestore(&p->sighand->siglock, flags);
-+}
-+
-+/**
-+ * freeze_task - send a freeze request to given task
-+ * @p: task to send the request to
-+ * @sig_only: if set, the request will only be sent if the task has the
-+ * PF_FREEZER_NOSIG flag unset
-+ * Return value: 'false', if @sig_only is set and the task has
-+ * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
-+ *
-+ * The freeze request is sent by setting the task's TIF_FREEZE flag and
-+ * either sending a fake signal to it or waking it up, depending on whether
-+ * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task
-+ * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
-+ * TIF_FREEZE flag will not be set.
-+ */
-+bool freeze_task(struct task_struct *p, bool sig_only)
-+{
-+ /*
-+ * We first check if the task is freezing and next if it has already
-+ * been frozen to avoid the race with frozen_process() which first marks
-+ * the task as frozen and next clears its TIF_FREEZE.
-+ */
-+ if (!freezing(p)) {
-+ rmb();
-+ if (frozen(p))
-+ return false;
-+
-+ if (!sig_only || should_send_signal(p))
-+ set_freeze_flag(p);
-+ else
-+ return false;
-+ }
-+
-+ if (should_send_signal(p)) {
-+ if (!signal_pending(p))
-+ fake_signal_wake_up(p);
-+ } else if (sig_only) {
-+ return false;
-+ } else {
-+ wake_up_state(p, TASK_INTERRUPTIBLE);
-+ }
-+
-+ return true;
-+}
-+
-+void cancel_freezing(struct task_struct *p)
-+{
-+ unsigned long flags;
-+
-+ if (freezing(p)) {
-+ pr_debug(" clean up: %s\n", p->comm);
-+ clear_freeze_flag(p);
-+ spin_lock_irqsave(&p->sighand->siglock, flags);
-+ recalc_sigpending_and_wake(p);
-+ spin_unlock_irqrestore(&p->sighand->siglock, flags);
-+ }
-+}
-+
-+static int __thaw_process(struct task_struct *p)
-+{
-+ if (frozen(p)) {
-+ p->flags &= ~PF_FROZEN;
-+ return 1;
-+ }
-+ clear_freeze_flag(p);
-+ return 0;
-+}
-+
-+/*
-+ * Wake up a frozen process
-+ *
-+ * task_lock() is needed to prevent the race with refrigerator() which may
-+ * occur if the freezing of tasks fails. Namely, without the lock, if the
-+ * freezing of tasks failed, thaw_tasks() might have run before a task in
-+ * refrigerator() could call frozen_process(), in which case the task would be
-+ * frozen and no one would thaw it.
-+ */
-+int thaw_process(struct task_struct *p)
-+{
-+ task_lock(p);
-+ if (__thaw_process(p) == 1) {
-+ task_unlock(p);
-+ wake_up_process(p);
-+ return 1;
-+ }
-+ task_unlock(p);
-+ return 0;
-+}
-+EXPORT_SYMBOL(thaw_process);
---- a/kernel/power/process.c
-+++ b/kernel/power/process.c
-@@ -28,121 +28,6 @@ static inline int freezeable(struct task
- return 1;
- }
-
--/*
-- * freezing is complete, mark current process as frozen
-- */
--static inline void frozen_process(void)
--{
-- if (!unlikely(current->flags & PF_NOFREEZE)) {
-- current->flags |= PF_FROZEN;
-- wmb();
-- }
-- clear_freeze_flag(current);
--}
--
--/* Refrigerator is place where frozen processes are stored :-). */
--void refrigerator(void)
--{
-- /* Hmm, should we be allowed to suspend when there are realtime
-- processes around? */
-- long save;
--
-- task_lock(current);
-- if (freezing(current)) {
-- frozen_process();
-- task_unlock(current);
-- } else {
-- task_unlock(current);
-- return;
-- }
-- save = current->state;
-- pr_debug("%s entered refrigerator\n", current->comm);
--
-- spin_lock_irq(&current->sighand->siglock);
-- recalc_sigpending(); /* We sent fake signal, clean it up */
-- spin_unlock_irq(&current->sighand->siglock);
--
-- for (;;) {
-- set_current_state(TASK_UNINTERRUPTIBLE);
-- if (!frozen(current))
-- break;
-- schedule();
-- }
-- pr_debug("%s left refrigerator\n", current->comm);
-- __set_current_state(save);
--}
--
--static void fake_signal_wake_up(struct task_struct *p)
--{
-- unsigned long flags;
--
-- spin_lock_irqsave(&p->sighand->siglock, flags);
-- signal_wake_up(p, 0);
-- spin_unlock_irqrestore(&p->sighand->siglock, flags);
--}
--
--static inline bool should_send_signal(struct task_struct *p)
--{
-- return !(p->flags & PF_FREEZER_NOSIG);
--}
--
--/**
-- * freeze_task - send a freeze request to given task
-- * @p: task to send the request to
-- * @sig_only: if set, the request will only be sent if the task has the
-- * PF_FREEZER_NOSIG flag unset
-- * Return value: 'false', if @sig_only is set and the task has
-- * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
-- *
-- * The freeze request is sent by setting the tasks's TIF_FREEZE flag and
-- * either sending a fake signal to it or waking it up, depending on whether
-- * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task
-- * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
-- * TIF_FREEZE flag will not be set.
-- */
--static bool freeze_task(struct task_struct *p, bool sig_only)
--{
-- /*
-- * We first check if the task is freezing and next if it has already
-- * been frozen to avoid the race with frozen_process() which first marks
-- * the task as frozen and next clears its TIF_FREEZE.
-- */
-- if (!freezing(p)) {
-- rmb();
-- if (frozen(p))
-- return false;
--
-- if (!sig_only || should_send_signal(p))
-- set_freeze_flag(p);
-- else
-- return false;
-- }
--
-- if (should_send_signal(p)) {
-- if (!signal_pending(p))
-- fake_signal_wake_up(p);
-- } else if (sig_only) {
-- return false;
-- } else {
-- wake_up_state(p, TASK_INTERRUPTIBLE);
-- }
--
-- return true;
--}
--
--static void cancel_freezing(struct task_struct *p)
--{
-- unsigned long flags;
--
-- if (freezing(p)) {
-- pr_debug(" clean up: %s\n", p->comm);
-- clear_freeze_flag(p);
-- spin_lock_irqsave(&p->sighand->siglock, flags);
-- recalc_sigpending_and_wake(p);
-- spin_unlock_irqrestore(&p->sighand->siglock, flags);
-- }
--}
--
- static int try_to_freeze_tasks(bool sig_only)
- {
- struct task_struct *g, *p;
-@@ -250,6 +135,9 @@ static void thaw_tasks(bool nosig_only)
- if (nosig_only && should_send_signal(p))
- continue;
-
-+ if (cgroup_frozen(p))
-+ continue;
-+
- thaw_process(p);
- } while_each_thread(g, p);
- read_unlock(&tasklist_lock);
-@@ -264,4 +152,3 @@ void thaw_processes(void)
- printk("done.\n");
- }
-
--EXPORT_SYMBOL(refrigerator);