src/patches/suse-2.6.27.25/patches.suse/cgroup-freezer.patch

   1 From: Serge E. Hallyn <serue@us.ibm.com>
   2 Subject: cgroup freezer
   3 References: bnc#417294, fate#304191, fate#201036
   4 Patch-upstream: yes
   5 Git: 68d1a06b440a5df55fb253e1d1113d2e4a7209fc Mon Sep 17 00:00:00 2001
   6
   7 Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
   8 Acked-by: Nick Piggin <npiggin@suse.de>
   9 ---
  10  Documentation/cgroups.txt                   |  548 ----------------------------
  11  Documentation/cgroups/cgroups.txt           |  548 ++++++++++++++++++++++++++++
  12  Documentation/cgroups/freezer-subsystem.txt |  102 +++++
  13  Documentation/cpusets.txt                   |    2
  14  arch/alpha/Kconfig                          |    1
  15  arch/alpha/include/asm/thread_info.h        |    2
  16  arch/arm/Kconfig                            |    2
  17  arch/avr32/Kconfig                          |    2
  18  arch/avr32/include/asm/thread_info.h        |    1
  19  arch/blackfin/Kconfig                       |    3
  20  arch/cris/Kconfig                           |    2
  21  arch/frv/Kconfig                            |    2
  22  arch/h8300/Kconfig                          |    2
  23  arch/h8300/include/asm/thread_info.h        |    2
  24  arch/ia64/Kconfig                           |    2
  25  arch/m32r/Kconfig                           |    2
  26  arch/m68k/Kconfig                           |    2
  27  arch/m68knommu/Kconfig                      |    2
  28  arch/m68knommu/include/asm/thread_info.h    |    2
  29  arch/mips/Kconfig                           |    2
  30  arch/mn10300/Kconfig                        |    2
  31  arch/parisc/Kconfig                         |    2
  32  arch/powerpc/Kconfig                        |    2
  33  arch/s390/Kconfig                           |    2
  34  arch/s390/include/asm/thread_info.h         |    2
  35  arch/sh/Kconfig                             |    2
  36  arch/sparc/Kconfig                          |    2
  37  arch/sparc/include/asm/thread_info_32.h     |    2
  38  arch/sparc64/Kconfig                        |    1
  39  arch/um/Kconfig                             |    2
  40  arch/x86/Kconfig                            |    1
  41  arch/xtensa/Kconfig                         |    1
  42  include/asm-cris/thread_info.h              |    2
  43  include/asm-m68k/thread_info.h              |    1
  44  include/asm-parisc/thread_info.h            |    2
  45  include/asm-um/thread_info.h                |    2
  46  include/asm-xtensa/thread_info.h            |    2
  47  include/linux/cgroup_subsys.h               |    6
  48  include/linux/freezer.h                     |   42 --
  49  init/Kconfig                                |    7
  50  kernel/Kconfig.freezer                      |    2
  51  kernel/Makefile                             |    2
  52  kernel/cgroup_freezer.c                     |  379 +++++++++++++++++++
  53  kernel/freezer.c                            |  154 +++++++
  54  kernel/power/process.c                      |  119 ------
  55  45 files changed, 1283 insertions(+), 689 deletions(-)
  56  create mode 100644 include/linux/cgroup_freezer.h
  57  create mode 100644 kernel/cgroup_freezer.c
  58  create mode 100644 kernel/freezer.c
  59
  60 --- a/Documentation/cgroups.txt
  61 +++ /dev/null
  62 @@ -1,548 +0,0 @@
  63 -                               CGROUPS
  64 -                               -------
  65 -
  66 -Written by Paul Menage <menage@google.com> based on Documentation/cpusets.txt
  67 -
  68 -Original copyright statements from cpusets.txt:
  69 -Portions Copyright (C) 2004 BULL SA.
  70 -Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
  71 -Modified by Paul Jackson <pj@sgi.com>
  72 -Modified by Christoph Lameter <clameter@sgi.com>
  73 -
  74 -CONTENTS:
  75 -=========
  76 -
  77 -1. Control Groups
  78 -  1.1 What are cgroups ?
  79 -  1.2 Why are cgroups needed ?
  80 -  1.3 How are cgroups implemented ?
  81 -  1.4 What does notify_on_release do ?
  82 -  1.5 How do I use cgroups ?
  83 -2. Usage Examples and Syntax
  84 -  2.1 Basic Usage
  85 -  2.2 Attaching processes
  86 -3. Kernel API
  87 -  3.1 Overview
  88 -  3.2 Synchronization
  89 -  3.3 Subsystem API
  90 -4. Questions
  91 -
  92 -1. Control Groups
  93 -=================
  94 -
  95 -1.1 What are cgroups ?
  96 -----------------------
  97 -
  98 -Control Groups provide a mechanism for aggregating/partitioning sets of
  99 -tasks, and all their future children, into hierarchical groups with
 100 -specialized behaviour.
 101 -
 102 -Definitions:
 103 -
 104 -A *cgroup* associates a set of tasks with a set of parameters for one
 105 -or more subsystems.
 106 -
 107 -A *subsystem* is a module that makes use of the task grouping
 108 -facilities provided by cgroups to treat groups of tasks in
 109 -particular ways. A subsystem is typically a "resource controller" that
 110 -schedules a resource or applies per-cgroup limits, but it may be
 111 -anything that wants to act on a group of processes, e.g. a
 112 -virtualization subsystem.
 113 -
 114 -A *hierarchy* is a set of cgroups arranged in a tree, such that
 115 -every task in the system is in exactly one of the cgroups in the
 116 -hierarchy, and a set of subsystems; each subsystem has system-specific
 117 -state attached to each cgroup in the hierarchy.  Each hierarchy has
 118 -an instance of the cgroup virtual filesystem associated with it.
 119 -
 120 -At any one time there may be multiple active hierachies of task
 121 -cgroups. Each hierarchy is a partition of all tasks in the system.
 122 -
 123 -User level code may create and destroy cgroups by name in an
 124 -instance of the cgroup virtual file system, specify and query to
 125 -which cgroup a task is assigned, and list the task pids assigned to
 126 -a cgroup. Those creations and assignments only affect the hierarchy
 127 -associated with that instance of the cgroup file system.
 128 -
 129 -On their own, the only use for cgroups is for simple job
 130 -tracking. The intention is that other subsystems hook into the generic
 131 -cgroup support to provide new attributes for cgroups, such as
 132 -accounting/limiting the resources which processes in a cgroup can
 133 -access. For example, cpusets (see Documentation/cpusets.txt) allows
 134 -you to associate a set of CPUs and a set of memory nodes with the
 135 -tasks in each cgroup.
 136 -
 137 -1.2 Why are cgroups needed ?
 138 -----------------------------
 139 -
 140 -There are multiple efforts to provide process aggregations in the
 141 -Linux kernel, mainly for resource tracking purposes. Such efforts
 142 -include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
 143 -namespaces. These all require the basic notion of a
 144 -grouping/partitioning of processes, with newly forked processes ending
 145 -in the same group (cgroup) as their parent process.
 146 -
 147 -The kernel cgroup patch provides the minimum essential kernel
 148 -mechanisms required to efficiently implement such groups. It has
 149 -minimal impact on the system fast paths, and provides hooks for
 150 -specific subsystems such as cpusets to provide additional behaviour as
 151 -desired.
 152 -
 153 -Multiple hierarchy support is provided to allow for situations where
 154 -the division of tasks into cgroups is distinctly different for
 155 -different subsystems - having parallel hierarchies allows each
 156 -hierarchy to be a natural division of tasks, without having to handle
 157 -complex combinations of tasks that would be present if several
 158 -unrelated subsystems needed to be forced into the same tree of
 159 -cgroups.
 160 -
 161 -At one extreme, each resource controller or subsystem could be in a
 162 -separate hierarchy; at the other extreme, all subsystems
 163 -would be attached to the same hierarchy.
 164 -
 165 -As an example of a scenario (originally proposed by vatsa@in.ibm.com)
 166 -that can benefit from multiple hierarchies, consider a large
 167 -university server with various users - students, professors, system
 168 -tasks etc. The resource planning for this server could be along the
 169 -following lines:
 170 -
 171 -       CPU :           Top cpuset
 172 -                       /       \
 173 -               CPUSet1         CPUSet2
 174 -                  |              |
 175 -               (Profs)         (Students)
 176 -
 177 -               In addition (system tasks) are attached to topcpuset (so
 178 -               that they can run anywhere) with a limit of 20%
 179 -
 180 -       Memory : Professors (50%), students (30%), system (20%)
 181 -
 182 -       Disk : Prof (50%), students (30%), system (20%)
 183 -
 184 -       Network : WWW browsing (20%), Network File System (60%), others (20%)
 185 -                               / \
 186 -                       Prof (15%) students (5%)
 187 -
 188 -Browsers like firefox/lynx go into the WWW network class, while (k)nfsd go
 189 -into NFS network class.
 190 -
 191 -At the same time firefox/lynx will share an appropriate CPU/Memory class
 192 -depending on who launched it (prof/student).
 193 -
 194 -With the ability to classify tasks differently for different resources
 195 -(by putting those resource subsystems in different hierarchies) then
 196 -the admin can easily set up a script which receives exec notifications
 197 -and depending on who is launching the browser he can
 198 -
 199 -       # echo browser_pid > /mnt/<restype>/<userclass>/tasks
 200 -
 201 -With only a single hierarchy, he now would potentially have to create
 202 -a separate cgroup for every browser launched and associate it with
 203 -approp network and other resource class.  This may lead to
 204 -proliferation of such cgroups.
 205 -
 206 -Also lets say that the administrator would like to give enhanced network
 207 -access temporarily to a student's browser (since it is night and the user
 208 -wants to do online gaming :))  OR give one of the students simulation
 209 -apps enhanced CPU power,
 210 -
 211 -With ability to write pids directly to resource classes, it's just a
 212 -matter of :
 213 -
 214 -       # echo pid > /mnt/network/<new_class>/tasks
 215 -       (after some time)
 216 -       # echo pid > /mnt/network/<orig_class>/tasks
 217 -
 218 -Without this ability, he would have to split the cgroup into
 219 -multiple separate ones and then associate the new cgroups with the
 220 -new resource classes.
 221 -
 222 -
 223 -
 224 -1.3 How are cgroups implemented ?
 225 ----------------------------------
 226 -
 227 -Control Groups extends the kernel as follows:
 228 -
 229 - - Each task in the system has a reference-counted pointer to a
 230 -   css_set.
 231 -
 232 - - A css_set contains a set of reference-counted pointers to
 233 -   cgroup_subsys_state objects, one for each cgroup subsystem
 234 -   registered in the system. There is no direct link from a task to
 235 -   the cgroup of which it's a member in each hierarchy, but this
 236 -   can be determined by following pointers through the
 237 -   cgroup_subsys_state objects. This is because accessing the
 238 -   subsystem state is something that's expected to happen frequently
 239 -   and in performance-critical code, whereas operations that require a
 240 -   task's actual cgroup assignments (in particular, moving between
 241 -   cgroups) are less common. A linked list runs through the cg_list
 242 -   field of each task_struct using the css_set, anchored at
 243 -   css_set->tasks.
 244 -
 245 - - A cgroup hierarchy filesystem can be mounted  for browsing and
 246 -   manipulation from user space.
 247 -
 248 - - You can list all the tasks (by pid) attached to any cgroup.
 249 -
 250 -The implementation of cgroups requires a few, simple hooks
 251 -into the rest of the kernel, none in performance critical paths:
 252 -
 253 - - in init/main.c, to initialize the root cgroups and initial
 254 -   css_set at system boot.
 255 -
 256 - - in fork and exit, to attach and detach a task from its css_set.
 257 -
 258 -In addition a new file system, of type "cgroup" may be mounted, to
 259 -enable browsing and modifying the cgroups presently known to the
 260 -kernel.  When mounting a cgroup hierarchy, you may specify a
 261 -comma-separated list of subsystems to mount as the filesystem mount
 262 -options.  By default, mounting the cgroup filesystem attempts to
 263 -mount a hierarchy containing all registered subsystems.
 264 -
 265 -If an active hierarchy with exactly the same set of subsystems already
 266 -exists, it will be reused for the new mount. If no existing hierarchy
 267 -matches, and any of the requested subsystems are in use in an existing
 268 -hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
 269 -is activated, associated with the requested subsystems.
 270 -
 271 -It's not currently possible to bind a new subsystem to an active
 272 -cgroup hierarchy, or to unbind a subsystem from an active cgroup
 273 -hierarchy. This may be possible in future, but is fraught with nasty
 274 -error-recovery issues.
 275 -
 276 -When a cgroup filesystem is unmounted, if there are any
 277 -child cgroups created below the top-level cgroup, that hierarchy
 278 -will remain active even though unmounted; if there are no
 279 -child cgroups then the hierarchy will be deactivated.
 280 -
 281 -No new system calls are added for cgroups - all support for
 282 -querying and modifying cgroups is via this cgroup file system.
 283 -
 284 -Each task under /proc has an added file named 'cgroup' displaying,
 285 -for each active hierarchy, the subsystem names and the cgroup name
 286 -as the path relative to the root of the cgroup file system.
 287 -
 288 -Each cgroup is represented by a directory in the cgroup file system
 289 -containing the following files describing that cgroup:
 290 -
 291 - - tasks: list of tasks (by pid) attached to that cgroup
 292 - - releasable flag: cgroup currently removeable?
 293 - - notify_on_release flag: run the release agent on exit?
 294 - - release_agent: the path to use for release notifications (this file
 295 -   exists in the top cgroup only)
 296 -
 297 -Other subsystems such as cpusets may add additional files in each
 298 -cgroup dir.
 299 -
 300 -New cgroups are created using the mkdir system call or shell
 301 -command.  The properties of a cgroup, such as its flags, are
 302 -modified by writing to the appropriate file in that cgroups
 303 -directory, as listed above.
 304 -
 305 -The named hierarchical structure of nested cgroups allows partitioning
 306 -a large system into nested, dynamically changeable, "soft-partitions".
 307 -
 308 -The attachment of each task, automatically inherited at fork by any
 309 -children of that task, to a cgroup allows organizing the work load
 310 -on a system into related sets of tasks.  A task may be re-attached to
 311 -any other cgroup, if allowed by the permissions on the necessary
 312 -cgroup file system directories.
 313 -
 314 -When a task is moved from one cgroup to another, it gets a new
 315 -css_set pointer - if there's an already existing css_set with the
 316 -desired collection of cgroups then that group is reused, else a new
 317 -css_set is allocated. Note that the current implementation uses a
 318 -linear search to locate an appropriate existing css_set, so isn't
 319 -very efficient. A future version will use a hash table for better
 320 -performance.
 321 -
 322 -To allow access from a cgroup to the css_sets (and hence tasks)
 323 -that comprise it, a set of cg_cgroup_link objects form a lattice;
 324 -each cg_cgroup_link is linked into a list of cg_cgroup_links for
 325 -a single cgroup on its cgrp_link_list field, and a list of
 326 -cg_cgroup_links for a single css_set on its cg_link_list.
 327 -
 328 -Thus the set of tasks in a cgroup can be listed by iterating over
 329 -each css_set that references the cgroup, and sub-iterating over
 330 -each css_set's task set.
 331 -
 332 -The use of a Linux virtual file system (vfs) to represent the
 333 -cgroup hierarchy provides for a familiar permission and name space
 334 -for cgroups, with a minimum of additional kernel code.
 335 -
 336 -1.4 What does notify_on_release do ?
 337 -------------------------------------
 338 -
 339 -If the notify_on_release flag is enabled (1) in a cgroup, then
 340 -whenever the last task in the cgroup leaves (exits or attaches to
 341 -some other cgroup) and the last child cgroup of that cgroup
 342 -is removed, then the kernel runs the command specified by the contents
 343 -of the "release_agent" file in that hierarchy's root directory,
 344 -supplying the pathname (relative to the mount point of the cgroup
 345 -file system) of the abandoned cgroup.  This enables automatic
 346 -removal of abandoned cgroups.  The default value of
 347 -notify_on_release in the root cgroup at system boot is disabled
 348 -(0).  The default value of other cgroups at creation is the current
 349 -value of their parents notify_on_release setting. The default value of
 350 -a cgroup hierarchy's release_agent path is empty.
 351 -
 352 -1.5 How do I use cgroups ?
 353 ---------------------------
 354 -
 355 -To start a new job that is to be contained within a cgroup, using
 356 -the "cpuset" cgroup subsystem, the steps are something like:
 357 -
 358 - 1) mkdir /dev/cgroup
 359 - 2) mount -t cgroup -ocpuset cpuset /dev/cgroup
 360 - 3) Create the new cgroup by doing mkdir's and write's (or echo's) in
 361 -    the /dev/cgroup virtual file system.
 362 - 4) Start a task that will be the "founding father" of the new job.
 363 - 5) Attach that task to the new cgroup by writing its pid to the
 364 -    /dev/cgroup tasks file for that cgroup.
 365 - 6) fork, exec or clone the job tasks from this founding father task.
 366 -
 367 -For example, the following sequence of commands will setup a cgroup
 368 -named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
 369 -and then start a subshell 'sh' in that cgroup:
 370 -
 371 -  mount -t cgroup cpuset -ocpuset /dev/cgroup
 372 -  cd /dev/cgroup
 373 -  mkdir Charlie
 374 -  cd Charlie
 375 -  /bin/echo 2-3 > cpuset.cpus
 376 -  /bin/echo 1 > cpuset.mems
 377 -  /bin/echo $$ > tasks
 378 -  sh
 379 -  # The subshell 'sh' is now running in cgroup Charlie
 380 -  # The next line should display '/Charlie'
 381 -  cat /proc/self/cgroup
 382 -
 383 -2. Usage Examples and Syntax
 384 -============================
 385 -
 386 -2.1 Basic Usage
 387 ----------------
 388 -
 389 -Creating, modifying, using the cgroups can be done through the cgroup
 390 -virtual filesystem.
 391 -
 392 -To mount a cgroup hierarchy will all available subsystems, type:
 393 -# mount -t cgroup xxx /dev/cgroup
 394 -
 395 -The "xxx" is not interpreted by the cgroup code, but will appear in
 396 -/proc/mounts so may be any useful identifying string that you like.
 397 -
 398 -To mount a cgroup hierarchy with just the cpuset and numtasks
 399 -subsystems, type:
 400 -# mount -t cgroup -o cpuset,numtasks hier1 /dev/cgroup
 401 -
 402 -To change the set of subsystems bound to a mounted hierarchy, just
 403 -remount with different options:
 404 -
 405 -# mount -o remount,cpuset,ns  /dev/cgroup
 406 -
 407 -Note that changing the set of subsystems is currently only supported
 408 -when the hierarchy consists of a single (root) cgroup. Supporting
 409 -the ability to arbitrarily bind/unbind subsystems from an existing
 410 -cgroup hierarchy is intended to be implemented in the future.
 411 -
 412 -Then under /dev/cgroup you can find a tree that corresponds to the
 413 -tree of the cgroups in the system. For instance, /dev/cgroup
 414 -is the cgroup that holds the whole system.
 415 -
 416 -If you want to create a new cgroup under /dev/cgroup:
 417 -# cd /dev/cgroup
 418 -# mkdir my_cgroup
 419 -
 420 -Now you want to do something with this cgroup.
 421 -# cd my_cgroup
 422 -
 423 -In this directory you can find several files:
 424 -# ls
 425 -notify_on_release releasable tasks
 426 -(plus whatever files added by the attached subsystems)
 427 -
 428 -Now attach your shell to this cgroup:
 429 -# /bin/echo $$ > tasks
 430 -
 431 -You can also create cgroups inside your cgroup by using mkdir in this
 432 -directory.
 433 -# mkdir my_sub_cs
 434 -
 435 -To remove a cgroup, just use rmdir:
 436 -# rmdir my_sub_cs
 437 -
 438 -This will fail if the cgroup is in use (has cgroups inside, or
 439 -has processes attached, or is held alive by other subsystem-specific
 440 -reference).
 441 -
 442 -2.2 Attaching processes
 443 ------------------------
 444 -
 445 -# /bin/echo PID > tasks
 446 -
 447 -Note that it is PID, not PIDs. You can only attach ONE task at a time.
 448 -If you have several tasks to attach, you have to do it one after another:
 449 -
 450 -# /bin/echo PID1 > tasks
 451 -# /bin/echo PID2 > tasks
 452 -       ...
 453 -# /bin/echo PIDn > tasks
 454 -
 455 -You can attach the current shell task by echoing 0:
 456 -
 457 -# echo 0 > tasks
 458 -
 459 -3. Kernel API
 460 -=============
 461 -
 462 -3.1 Overview
 463 -------------
 464 -
 465 -Each kernel subsystem that wants to hook into the generic cgroup
 466 -system needs to create a cgroup_subsys object. This contains
 467 -various methods, which are callbacks from the cgroup system, along
 468 -with a subsystem id which will be assigned by the cgroup system.
 469 -
 470 -Other fields in the cgroup_subsys object include:
 471 -
 472 -- subsys_id: a unique array index for the subsystem, indicating which
 473 -  entry in cgroup->subsys[] this subsystem should be managing.
 474 -
 475 -- name: should be initialized to a unique subsystem name. Should be
 476 -  no longer than MAX_CGROUP_TYPE_NAMELEN.
 477 -
 478 -- early_init: indicate if the subsystem needs early initialization
 479 -  at system boot.
 480 -
 481 -Each cgroup object created by the system has an array of pointers,
 482 -indexed by subsystem id; this pointer is entirely managed by the
 483 -subsystem; the generic cgroup code will never touch this pointer.
 484 -
 485 -3.2 Synchronization
 486 --------------------
 487 -
 488 -There is a global mutex, cgroup_mutex, used by the cgroup
 489 -system. This should be taken by anything that wants to modify a
 490 -cgroup. It may also be taken to prevent cgroups from being
 491 -modified, but more specific locks may be more appropriate in that
 492 -situation.
 493 -
 494 -See kernel/cgroup.c for more details.
 495 -
 496 -Subsystems can take/release the cgroup_mutex via the functions
 497 -cgroup_lock()/cgroup_unlock().
 498 -
 499 -Accessing a task's cgroup pointer may be done in the following ways:
 500 -- while holding cgroup_mutex
 501 -- while holding the task's alloc_lock (via task_lock())
 502 -- inside an rcu_read_lock() section via rcu_dereference()
 503 -
 504 -3.3 Subsystem API
 505 ------------------
 506 -
 507 -Each subsystem should:
 508 -
 509 -- add an entry in linux/cgroup_subsys.h
 510 -- define a cgroup_subsys object called <name>_subsys
 511 -
 512 -Each subsystem may export the following methods. The only mandatory
 513 -methods are create/destroy. Any others that are null are presumed to
 514 -be successful no-ops.
 515 -
 516 -struct cgroup_subsys_state *create(struct cgroup_subsys *ss,
 517 -                                  struct cgroup *cgrp)
 518 -(cgroup_mutex held by caller)
 519 -
 520 -Called to create a subsystem state object for a cgroup. The
 521 -subsystem should allocate its subsystem state object for the passed
 522 -cgroup, returning a pointer to the new object on success or a
 523 -negative error code. On success, the subsystem pointer should point to
 524 -a structure of type cgroup_subsys_state (typically embedded in a
 525 -larger subsystem-specific object), which will be initialized by the
 526 -cgroup system. Note that this will be called at initialization to
 527 -create the root subsystem state for this subsystem; this case can be
 528 -identified by the passed cgroup object having a NULL parent (since
 529 -it's the root of the hierarchy) and may be an appropriate place for
 530 -initialization code.
 531 -
 532 -void destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 533 -(cgroup_mutex held by caller)
 534 -
 535 -The cgroup system is about to destroy the passed cgroup; the subsystem
 536 -should do any necessary cleanup and free its subsystem state
 537 -object. By the time this method is called, the cgroup has already been
 538 -unlinked from the file system and from the child list of its parent;
 539 -cgroup->parent is still valid. (Note - can also be called for a
 540 -newly-created cgroup if an error occurs after this subsystem's
 541 -create() method has been called for the new cgroup).
 542 -
 543 -void pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
 544 -(cgroup_mutex held by caller)
 545 -
 546 -Called before checking the reference count on each subsystem. This may
 547 -be useful for subsystems which have some extra references even if
 548 -there are not tasks in the cgroup.
 549 -
 550 -int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 551 -              struct task_struct *task)
 552 -(cgroup_mutex held by caller)
 553 -
 554 -Called prior to moving a task into a cgroup; if the subsystem
 555 -returns an error, this will abort the attach operation.  If a NULL
 556 -task is passed, then a successful result indicates that *any*
 557 -unspecified task can be moved into the cgroup. Note that this isn't
 558 -called on a fork. If this method returns 0 (success) then this should
 559 -remain valid while the caller holds cgroup_mutex.
 560 -
 561 -void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 562 -           struct cgroup *old_cgrp, struct task_struct *task)
 563 -
 564 -Called after the task has been attached to the cgroup, to allow any
 565 -post-attachment activity that requires memory allocations or blocking.
 566 -
 567 -void fork(struct cgroup_subsy *ss, struct task_struct *task)
 568 -
 569 -Called when a task is forked into a cgroup.
 570 -
 571 -void exit(struct cgroup_subsys *ss, struct task_struct *task)
 572 -
 573 -Called during task exit.
 574 -
 575 -int populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
 576 -
 577 -Called after creation of a cgroup to allow a subsystem to populate
 578 -the cgroup directory with file entries.  The subsystem should make
 579 -calls to cgroup_add_file() with objects of type cftype (see
 580 -include/linux/cgroup.h for details).  Note that although this
 581 -method can return an error code, the error code is currently not
 582 -always handled well.
 583 -
 584 -void post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp)
 585 -
 586 -Called at the end of cgroup_clone() to do any paramater
 587 -initialization which might be required before a task could attach.  For
 588 -example in cpusets, no task may attach before 'cpus' and 'mems' are set
 589 -up.
 590 -
 591 -void bind(struct cgroup_subsys *ss, struct cgroup *root)
 592 -(cgroup_mutex held by caller)
 593 -
 594 -Called when a cgroup subsystem is rebound to a different hierarchy
 595 -and root cgroup. Currently this will only involve movement between
 596 -the default hierarchy (which never has sub-cgroups) and a hierarchy
 597 -that is being created/destroyed (and hence has no sub-cgroups).
 598 -
 599 -4. Questions
 600 -============
 601 -
 602 -Q: what's up with this '/bin/echo' ?
 603 -A: bash's builtin 'echo' command does not check calls to write() against
 604 -   errors. If you use it in the cgroup file system, you won't be
 605 -   able to tell whether a command succeeded or failed.
 606 -
 607 -Q: When I attach processes, only the first of the line gets really attached !
 608 -A: We can only return one error code per call to write(). So you should also
 609 -   put only ONE pid.
 610 -
 611 --- /dev/null
 612 +++ b/Documentation/cgroups/cgroups.txt
 613 @@ -0,0 +1,548 @@
 614 +                               CGROUPS
 615 +                               -------
 616 +
 617 +Written by Paul Menage <menage@google.com> based on Documentation/cpusets.txt
 618 +
 619 +Original copyright statements from cpusets.txt:
 620 +Portions Copyright (C) 2004 BULL SA.
 621 +Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
 622 +Modified by Paul Jackson <pj@sgi.com>
 623 +Modified by Christoph Lameter <clameter@sgi.com>
 624 +
 625 +CONTENTS:
 626 +=========
 627 +
 628 +1. Control Groups
 629 +  1.1 What are cgroups ?
 630 +  1.2 Why are cgroups needed ?
 631 +  1.3 How are cgroups implemented ?
 632 +  1.4 What does notify_on_release do ?
 633 +  1.5 How do I use cgroups ?
 634 +2. Usage Examples and Syntax
 635 +  2.1 Basic Usage
 636 +  2.2 Attaching processes
 637 +3. Kernel API
 638 +  3.1 Overview
 639 +  3.2 Synchronization
 640 +  3.3 Subsystem API
 641 +4. Questions
 642 +
 643 +1. Control Groups
 644 +=================
 645 +
 646 +1.1 What are cgroups ?
 647 +----------------------
 648 +
 649 +Control Groups provide a mechanism for aggregating/partitioning sets of
 650 +tasks, and all their future children, into hierarchical groups with
 651 +specialized behaviour.
 652 +
 653 +Definitions:
 654 +
 655 +A *cgroup* associates a set of tasks with a set of parameters for one
 656 +or more subsystems.
 657 +
 658 +A *subsystem* is a module that makes use of the task grouping
 659 +facilities provided by cgroups to treat groups of tasks in
 660 +particular ways. A subsystem is typically a "resource controller" that
 661 +schedules a resource or applies per-cgroup limits, but it may be
 662 +anything that wants to act on a group of processes, e.g. a
 663 +virtualization subsystem.
 664 +
 665 +A *hierarchy* is a set of cgroups arranged in a tree, such that
 666 +every task in the system is in exactly one of the cgroups in the
 667 +hierarchy, and a set of subsystems; each subsystem has system-specific
 668 +state attached to each cgroup in the hierarchy.  Each hierarchy has
 669 +an instance of the cgroup virtual filesystem associated with it.
 670 +
 671 +At any one time there may be multiple active hierarchies of task
 672 +cgroups. Each hierarchy is a partition of all tasks in the system.
 673 +
 674 +User level code may create and destroy cgroups by name in an
 675 +instance of the cgroup virtual file system, specify and query to
 676 +which cgroup a task is assigned, and list the task pids assigned to
 677 +a cgroup. Those creations and assignments only affect the hierarchy
 678 +associated with that instance of the cgroup file system.
 679 +
 680 +On their own, the only use for cgroups is for simple job
 681 +tracking. The intention is that other subsystems hook into the generic
 682 +cgroup support to provide new attributes for cgroups, such as
 683 +accounting/limiting the resources which processes in a cgroup can
 684 +access. For example, cpusets (see Documentation/cpusets.txt) allows
 685 +you to associate a set of CPUs and a set of memory nodes with the
 686 +tasks in each cgroup.
 687 +
 688 +1.2 Why are cgroups needed ?
 689 +----------------------------
 690 +
 691 +There are multiple efforts to provide process aggregations in the
 692 +Linux kernel, mainly for resource tracking purposes. Such efforts
 693 +include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
 694 +namespaces. These all require the basic notion of a
 695 +grouping/partitioning of processes, with newly forked processes ending
 696 +in the same group (cgroup) as their parent process.
 697 +
 698 +The kernel cgroup patch provides the minimum essential kernel
 699 +mechanisms required to efficiently implement such groups. It has
 700 +minimal impact on the system fast paths, and provides hooks for
 701 +specific subsystems such as cpusets to provide additional behaviour as
 702 +desired.
 703 +
 704 +Multiple hierarchy support is provided to allow for situations where
 705 +the division of tasks into cgroups is distinctly different for
 706 +different subsystems - having parallel hierarchies allows each
 707 +hierarchy to be a natural division of tasks, without having to handle
 708 +complex combinations of tasks that would be present if several
 709 +unrelated subsystems needed to be forced into the same tree of
 710 +cgroups.
 711 +
 712 +At one extreme, each resource controller or subsystem could be in a
 713 +separate hierarchy; at the other extreme, all subsystems
 714 +would be attached to the same hierarchy.
 715 +
 716 +As an example of a scenario (originally proposed by vatsa@in.ibm.com)
 717 +that can benefit from multiple hierarchies, consider a large
 718 +university server with various users - students, professors, system
 719 +tasks etc. The resource planning for this server could be along the
 720 +following lines:
 721 +
 722 +       CPU :           Top cpuset
 723 +                       /       \
 724 +               CPUSet1         CPUSet2
 725 +                  |              |
 726 +               (Profs)         (Students)
 727 +
 728 +               In addition (system tasks) are attached to topcpuset (so
 729 +               that they can run anywhere) with a limit of 20%
 730 +
 731 +       Memory : Professors (50%), students (30%), system (20%)
 732 +
 733 +       Disk : Prof (50%), students (30%), system (20%)
 734 +
 735 +       Network : WWW browsing (20%), Network File System (60%), others (20%)
 736 +                               / \
 737 +                       Prof (15%) students (5%)
 738 +
 739 +Browsers like firefox/lynx go into the WWW network class, while (k)nfsd go
 740 +into NFS network class.
 741 +
 742 +At the same time firefox/lynx will share an appropriate CPU/Memory class
 743 +depending on who launched it (prof/student).
 744 +
 745 +With the ability to classify tasks differently for different resources
 746 +(by putting those resource subsystems in different hierarchies) then
 747 +the admin can easily set up a script which receives exec notifications
 748 +and depending on who is launching the browser he can
 749 +
 750 +       # echo browser_pid > /mnt/<restype>/<userclass>/tasks
 751 +
 752 +With only a single hierarchy, he now would potentially have to create
 753 +a separate cgroup for every browser launched and associate it with
 754 +appropriate network and other resource classes.  This may lead to
 755 +proliferation of such cgroups.
 756 +
 757 +Also let's say that the administrator would like to give enhanced network
 758 +access temporarily to a student's browser (since it is night and the user
 759 +wants to do online gaming :))  OR give one of the students simulation
 760 +apps enhanced CPU power,
 761 +
 762 +With ability to write pids directly to resource classes, it's just a
 763 +matter of :
 764 +
 765 +       # echo pid > /mnt/network/<new_class>/tasks
 766 +       (after some time)
 767 +       # echo pid > /mnt/network/<orig_class>/tasks
 768 +
 769 +Without this ability, he would have to split the cgroup into
 770 +multiple separate ones and then associate the new cgroups with the
 771 +new resource classes.
 772 +
 773 +
 774 +
 775 +1.3 How are cgroups implemented ?
 776 +---------------------------------
 777 +
 778 +Control Groups extends the kernel as follows:
 779 +
 780 + - Each task in the system has a reference-counted pointer to a
 781 +   css_set.
 782 +
 783 + - A css_set contains a set of reference-counted pointers to
 784 +   cgroup_subsys_state objects, one for each cgroup subsystem
 785 +   registered in the system. There is no direct link from a task to
 786 +   the cgroup of which it's a member in each hierarchy, but this
 787 +   can be determined by following pointers through the
 788 +   cgroup_subsys_state objects. This is because accessing the
 789 +   subsystem state is something that's expected to happen frequently
 790 +   and in performance-critical code, whereas operations that require a
 791 +   task's actual cgroup assignments (in particular, moving between
 792 +   cgroups) are less common. A linked list runs through the cg_list
 793 +   field of each task_struct using the css_set, anchored at
 794 +   css_set->tasks.
 795 +
 796 + - A cgroup hierarchy filesystem can be mounted  for browsing and
 797 +   manipulation from user space.
 798 +
 799 + - You can list all the tasks (by pid) attached to any cgroup.
 800 +
 801 +The implementation of cgroups requires a few, simple hooks
 802 +into the rest of the kernel, none in performance critical paths:
 803 +
 804 + - in init/main.c, to initialize the root cgroups and initial
 805 +   css_set at system boot.
 806 +
 807 + - in fork and exit, to attach and detach a task from its css_set.
 808 +
 809 +In addition a new file system, of type "cgroup" may be mounted, to
 810 +enable browsing and modifying the cgroups presently known to the
 811 +kernel.  When mounting a cgroup hierarchy, you may specify a
 812 +comma-separated list of subsystems to mount as the filesystem mount
 813 +options.  By default, mounting the cgroup filesystem attempts to
 814 +mount a hierarchy containing all registered subsystems.
 815 +
 816 +If an active hierarchy with exactly the same set of subsystems already
 817 +exists, it will be reused for the new mount. If no existing hierarchy
 818 +matches, and any of the requested subsystems are in use in an existing
 819 +hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
 820 +is activated, associated with the requested subsystems.
 821 +
 822 +It's not currently possible to bind a new subsystem to an active
 823 +cgroup hierarchy, or to unbind a subsystem from an active cgroup
 824 +hierarchy. This may be possible in future, but is fraught with nasty
 825 +error-recovery issues.
 826 +
 827 +When a cgroup filesystem is unmounted, if there are any
 828 +child cgroups created below the top-level cgroup, that hierarchy
 829 +will remain active even though unmounted; if there are no
 830 +child cgroups then the hierarchy will be deactivated.
 831 +
 832 +No new system calls are added for cgroups - all support for
 833 +querying and modifying cgroups is via this cgroup file system.
 834 +
 835 +Each task under /proc has an added file named 'cgroup' displaying,
 836 +for each active hierarchy, the subsystem names and the cgroup name
 837 +as the path relative to the root of the cgroup file system.
 838 +
 839 +Each cgroup is represented by a directory in the cgroup file system
 840 +containing the following files describing that cgroup:
 841 +
 842 + - tasks: list of tasks (by pid) attached to that cgroup
 843 + - releasable flag: cgroup currently removable?
 844 + - notify_on_release flag: run the release agent on exit?
 845 + - release_agent: the path to use for release notifications (this file
 846 +   exists in the top cgroup only)
 847 +
 848 +Other subsystems such as cpusets may add additional files in each
 849 +cgroup dir.
 850 +
 851 +New cgroups are created using the mkdir system call or shell
 852 +command.  The properties of a cgroup, such as its flags, are
 853 +modified by writing to the appropriate file in that cgroup's
 854 +directory, as listed above.
 855 +
 856 +The named hierarchical structure of nested cgroups allows partitioning
 857 +a large system into nested, dynamically changeable, "soft-partitions".
 858 +
 859 +The attachment of each task, automatically inherited at fork by any
 860 +children of that task, to a cgroup allows organizing the work load
 861 +on a system into related sets of tasks.  A task may be re-attached to
 862 +any other cgroup, if allowed by the permissions on the necessary
 863 +cgroup file system directories.
 864 +
 865 +When a task is moved from one cgroup to another, it gets a new
 866 +css_set pointer - if there's an already existing css_set with the
 867 +desired collection of cgroups then that group is reused, else a new
 868 +css_set is allocated. Note that the current implementation uses a
 869 +linear search to locate an appropriate existing css_set, so isn't
 870 +very efficient. A future version will use a hash table for better
 871 +performance.
 872 +
 873 +To allow access from a cgroup to the css_sets (and hence tasks)
 874 +that comprise it, a set of cg_cgroup_link objects form a lattice;
 875 +each cg_cgroup_link is linked into a list of cg_cgroup_links for
 876 +a single cgroup on its cgrp_link_list field, and a list of
 877 +cg_cgroup_links for a single css_set on its cg_link_list.
 878 +
 879 +Thus the set of tasks in a cgroup can be listed by iterating over
 880 +each css_set that references the cgroup, and sub-iterating over
 881 +each css_set's task set.
 882 +
 883 +The use of a Linux virtual file system (vfs) to represent the
 884 +cgroup hierarchy provides for a familiar permission and name space
 885 +for cgroups, with a minimum of additional kernel code.
 886 +
 887 +1.4 What does notify_on_release do ?
 888 +------------------------------------
 889 +
 890 +If the notify_on_release flag is enabled (1) in a cgroup, then
 891 +whenever the last task in the cgroup leaves (exits or attaches to
 892 +some other cgroup) and the last child cgroup of that cgroup
 893 +is removed, then the kernel runs the command specified by the contents
 894 +of the "release_agent" file in that hierarchy's root directory,
 895 +supplying the pathname (relative to the mount point of the cgroup
 896 +file system) of the abandoned cgroup.  This enables automatic
 897 +removal of abandoned cgroups.  The default value of
 898 +notify_on_release in the root cgroup at system boot is disabled
 899 +(0).  The default value of other cgroups at creation is the current
 900 +value of their parent's notify_on_release setting. The default value of
 901 +a cgroup hierarchy's release_agent path is empty.
 902 +
 903 +1.5 How do I use cgroups ?
 904 +--------------------------
 905 +
 906 +To start a new job that is to be contained within a cgroup, using
 907 +the "cpuset" cgroup subsystem, the steps are something like:
 908 +
 909 + 1) mkdir /dev/cgroup
 910 + 2) mount -t cgroup -ocpuset cpuset /dev/cgroup
 911 + 3) Create the new cgroup by doing mkdir's and write's (or echo's) in
 912 +    the /dev/cgroup virtual file system.
 913 + 4) Start a task that will be the "founding father" of the new job.
 914 + 5) Attach that task to the new cgroup by writing its pid to the
 915 +    /dev/cgroup tasks file for that cgroup.
 916 + 6) fork, exec or clone the job tasks from this founding father task.
 917 +
 918 +For example, the following sequence of commands will setup a cgroup
 919 +named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
 920 +and then start a subshell 'sh' in that cgroup:
 921 +
 922 +  mount -t cgroup cpuset -ocpuset /dev/cgroup
 923 +  cd /dev/cgroup
 924 +  mkdir Charlie
 925 +  cd Charlie
 926 +  /bin/echo 2-3 > cpuset.cpus
 927 +  /bin/echo 1 > cpuset.mems
 928 +  /bin/echo $$ > tasks
 929 +  sh
 930 +  # The subshell 'sh' is now running in cgroup Charlie
 931 +  # The next line should display '/Charlie'
 932 +  cat /proc/self/cgroup
 933 +
 934 +2. Usage Examples and Syntax
 935 +============================
 936 +
 937 +2.1 Basic Usage
 938 +---------------
 939 +
 940 +Creating, modifying, using the cgroups can be done through the cgroup
 941 +virtual filesystem.
 942 +
 943 +To mount a cgroup hierarchy with all available subsystems, type:
 944 +# mount -t cgroup xxx /dev/cgroup
 945 +
 946 +The "xxx" is not interpreted by the cgroup code, but will appear in
 947 +/proc/mounts so may be any useful identifying string that you like.
 948 +
 949 +To mount a cgroup hierarchy with just the cpuset and numtasks
 950 +subsystems, type:
 951 +# mount -t cgroup -o cpuset,numtasks hier1 /dev/cgroup
 952 +
 953 +To change the set of subsystems bound to a mounted hierarchy, just
 954 +remount with different options:
 955 +
 956 +# mount -o remount,cpuset,ns  /dev/cgroup
 957 +
 958 +Note that changing the set of subsystems is currently only supported
 959 +when the hierarchy consists of a single (root) cgroup. Supporting
 960 +the ability to arbitrarily bind/unbind subsystems from an existing
 961 +cgroup hierarchy is intended to be implemented in the future.
 962 +
 963 +Then under /dev/cgroup you can find a tree that corresponds to the
 964 +tree of the cgroups in the system. For instance, /dev/cgroup
 965 +is the cgroup that holds the whole system.
 966 +
 967 +If you want to create a new cgroup under /dev/cgroup:
 968 +# cd /dev/cgroup
 969 +# mkdir my_cgroup
 970 +
 971 +Now you want to do something with this cgroup.
 972 +# cd my_cgroup
 973 +
 974 +In this directory you can find several files:
 975 +# ls
 976 +notify_on_release releasable tasks
 977 +(plus whatever files added by the attached subsystems)
 978 +
 979 +Now attach your shell to this cgroup:
 980 +# /bin/echo $$ > tasks
 981 +
 982 +You can also create cgroups inside your cgroup by using mkdir in this
 983 +directory.
 984 +# mkdir my_sub_cs
 985 +
 986 +To remove a cgroup, just use rmdir:
 987 +# rmdir my_sub_cs
 988 +
 989 +This will fail if the cgroup is in use (has cgroups inside, or
 990 +has processes attached, or is held alive by other subsystem-specific
 991 +reference).
 992 +
 993 +2.2 Attaching processes
 994 +-----------------------
 995 +
 996 +# /bin/echo PID > tasks
 997 +
 998 +Note that it is PID, not PIDs. You can only attach ONE task at a time.
 999 +If you have several tasks to attach, you have to do it one after another:
1000 +
1001 +# /bin/echo PID1 > tasks
1002 +# /bin/echo PID2 > tasks
1003 +       ...
1004 +# /bin/echo PIDn > tasks
1005 +
1006 +You can attach the current shell task by echoing 0:
1007 +
1008 +# echo 0 > tasks
1009 +
1010 +3. Kernel API
1011 +=============
1012 +
1013 +3.1 Overview
1014 +------------
1015 +
1016 +Each kernel subsystem that wants to hook into the generic cgroup
1017 +system needs to create a cgroup_subsys object. This contains
1018 +various methods, which are callbacks from the cgroup system, along
1019 +with a subsystem id which will be assigned by the cgroup system.
1020 +
1021 +Other fields in the cgroup_subsys object include:
1022 +
1023 +- subsys_id: a unique array index for the subsystem, indicating which
1024 +  entry in cgroup->subsys[] this subsystem should be managing.
1025 +
1026 +- name: should be initialized to a unique subsystem name. Should be
1027 +  no longer than MAX_CGROUP_TYPE_NAMELEN.
1028 +
1029 +- early_init: indicate if the subsystem needs early initialization
1030 +  at system boot.
1031 +
1032 +Each cgroup object created by the system has an array of pointers,
1033 +indexed by subsystem id; this pointer is entirely managed by the
1034 +subsystem; the generic cgroup code will never touch this pointer.
1035 +
1036 +3.2 Synchronization
1037 +-------------------
1038 +
1039 +There is a global mutex, cgroup_mutex, used by the cgroup
1040 +system. This should be taken by anything that wants to modify a
1041 +cgroup. It may also be taken to prevent cgroups from being
1042 +modified, but more specific locks may be more appropriate in that
1043 +situation.
1044 +
1045 +See kernel/cgroup.c for more details.
1046 +
1047 +Subsystems can take/release the cgroup_mutex via the functions
1048 +cgroup_lock()/cgroup_unlock().
1049 +
1050 +Accessing a task's cgroup pointer may be done in the following ways:
1051 +- while holding cgroup_mutex
1052 +- while holding the task's alloc_lock (via task_lock())
1053 +- inside an rcu_read_lock() section via rcu_dereference()
1054 +
1055 +3.3 Subsystem API
1056 +-----------------
1057 +
1058 +Each subsystem should:
1059 +
1060 +- add an entry in linux/cgroup_subsys.h
1061 +- define a cgroup_subsys object called <name>_subsys
1062 +
1063 +Each subsystem may export the following methods. The only mandatory
1064 +methods are create/destroy. Any others that are null are presumed to
1065 +be successful no-ops.
1066 +
1067 +struct cgroup_subsys_state *create(struct cgroup_subsys *ss,
1068 +                                  struct cgroup *cgrp)
1069 +(cgroup_mutex held by caller)
1070 +
1071 +Called to create a subsystem state object for a cgroup. The
1072 +subsystem should allocate its subsystem state object for the passed
1073 +cgroup, returning a pointer to the new object on success or a
1074 +negative error code. On success, the subsystem pointer should point to
1075 +a structure of type cgroup_subsys_state (typically embedded in a
1076 +larger subsystem-specific object), which will be initialized by the
1077 +cgroup system. Note that this will be called at initialization to
1078 +create the root subsystem state for this subsystem; this case can be
1079 +identified by the passed cgroup object having a NULL parent (since
1080 +it's the root of the hierarchy) and may be an appropriate place for
1081 +initialization code.
1082 +
1083 +void destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
1084 +(cgroup_mutex held by caller)
1085 +
1086 +The cgroup system is about to destroy the passed cgroup; the subsystem
1087 +should do any necessary cleanup and free its subsystem state
1088 +object. By the time this method is called, the cgroup has already been
1089 +unlinked from the file system and from the child list of its parent;
1090 +cgroup->parent is still valid. (Note - can also be called for a
1091 +newly-created cgroup if an error occurs after this subsystem's
1092 +create() method has been called for the new cgroup).
1093 +
1094 +void pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
1095 +(cgroup_mutex held by caller)
1096 +
1097 +Called before checking the reference count on each subsystem. This may
1098 +be useful for subsystems which have some extra references even if
1099 +there are no tasks in the cgroup.
1100 +
1101 +int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1102 +              struct task_struct *task)
1103 +(cgroup_mutex held by caller)
1104 +
1105 +Called prior to moving a task into a cgroup; if the subsystem
1106 +returns an error, this will abort the attach operation.  If a NULL
1107 +task is passed, then a successful result indicates that *any*
1108 +unspecified task can be moved into the cgroup. Note that this isn't
1109 +called on a fork. If this method returns 0 (success) then this should
1110 +remain valid while the caller holds cgroup_mutex.
1111 +
1112 +void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1113 +           struct cgroup *old_cgrp, struct task_struct *task)
1114 +
1115 +Called after the task has been attached to the cgroup, to allow any
1116 +post-attachment activity that requires memory allocations or blocking.
1117 +
1118 +void fork(struct cgroup_subsys *ss, struct task_struct *task)
1119 +
1120 +Called when a task is forked into a cgroup.
1121 +
1122 +void exit(struct cgroup_subsys *ss, struct task_struct *task)
1123 +
1124 +Called during task exit.
1125 +
1126 +int populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
1127 +
1128 +Called after creation of a cgroup to allow a subsystem to populate
1129 +the cgroup directory with file entries.  The subsystem should make
1130 +calls to cgroup_add_file() with objects of type cftype (see
1131 +include/linux/cgroup.h for details).  Note that although this
1132 +method can return an error code, the error code is currently not
1133 +always handled well.
1134 +
1135 +void post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp)
1136 +
1137 +Called at the end of cgroup_clone() to do any parameter
1138 +initialization which might be required before a task could attach.  For
1139 +example in cpusets, no task may attach before 'cpus' and 'mems' are set
1140 +up.
1141 +
1142 +void bind(struct cgroup_subsys *ss, struct cgroup *root)
1143 +(cgroup_mutex held by caller)
1144 +
1145 +Called when a cgroup subsystem is rebound to a different hierarchy
1146 +and root cgroup. Currently this will only involve movement between
1147 +the default hierarchy (which never has sub-cgroups) and a hierarchy
1148 +that is being created/destroyed (and hence has no sub-cgroups).
1149 +
1150 +4. Questions
1151 +============
1152 +
1153 +Q: what's up with this '/bin/echo' ?
1154 +A: bash's builtin 'echo' command does not check calls to write() against
1155 +   errors. If you use it in the cgroup file system, you won't be
1156 +   able to tell whether a command succeeded or failed.
1157 +
1158 +Q: When I attach processes, only the first of the line gets really attached !
1159 +A: We can only return one error code per call to write(). So you should also
1160 +   put only ONE pid.
1161 +
1162 --- /dev/null
1163 +++ b/Documentation/cgroups/freezer-subsystem.txt
1164 @@ -0,0 +1,102 @@
1165 +The cgroup freezer is useful to batch job management systems which start
1166 +and stop sets of tasks in order to schedule the resources of a machine
1167 +according to the desires of a system administrator. This sort of program
1168 +is often used on HPC clusters to schedule access to the cluster as a
1169 +whole. The cgroup freezer uses cgroups to describe the set of tasks to
1170 +be started/stopped by the batch job management system. It also provides
1171 +a means to start and stop the tasks composing the job.
1172 +
1173 +The cgroup freezer will also be useful for checkpointing running groups
1174 +of tasks. The freezer allows the checkpoint code to obtain a consistent
1175 +image of the tasks by attempting to force the tasks in a cgroup into a
1176 +quiescent state. Once the tasks are quiescent another task can
1177 +walk /proc or invoke a kernel interface to gather information about the
1178 +quiesced tasks. Checkpointed tasks can be restarted later should a
1179 +recoverable error occur. This also allows the checkpointed tasks to be
1180 +migrated between nodes in a cluster by copying the gathered information
1181 +to another node and restarting the tasks there.
1182 +
1183 +Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping
1184 +and resuming tasks in userspace. Both of these signals are observable
1185 +from within the tasks we wish to freeze. While SIGSTOP cannot be caught,
1186 +blocked, or ignored it can be seen by waiting or ptracing parent tasks.
1187 +SIGCONT is especially unsuitable since it can be caught by the task. Any
1188 +programs designed to watch for SIGSTOP and SIGCONT could be broken by
1189 +attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can
1190 +demonstrate this problem using nested bash shells:
1191 +
1192 +       $ echo $$
1193 +       16644
1194 +       $ bash
1195 +       $ echo $$
1196 +       16690
1197 +
1198 +       From a second, unrelated bash shell:
1199 +       $ kill -SIGSTOP 16690
1200 +       $ kill -SIGCONT 16690
1201 +
1202 +       <at this point 16690 exits and causes 16644 to exit too>
1203 +
1204 +This happens because bash can observe both signals and choose how it
1205 +responds to them.
1206 +
1207 +Another example of a program which catches and responds to these
1208 +signals is gdb. In fact any program designed to use ptrace is likely to
1209 +have a problem with this method of stopping and resuming tasks.
1210 +
1211 +In contrast, the cgroup freezer uses the kernel freezer code to
1212 +prevent the freeze/unfreeze cycle from becoming visible to the tasks
1213 +being frozen. This allows the bash example above and gdb to run as
1214 +expected.
1215 +
1216 +The freezer subsystem in the container filesystem defines a file named
1217 +freezer.state. Writing "FROZEN" to the state file will freeze all tasks in the
1218 +cgroup. Subsequently writing "THAWED" will unfreeze the tasks in the cgroup.
1219 +Reading will return the current state.
1220 +
1221 +Note freezer.state doesn't exist in root cgroup, which means root cgroup
1222 +is non-freezable.
1223 +
1224 +* Examples of usage :
1225 +
1226 +   # mkdir /containers
1227 +   # mount -t cgroup -ofreezer freezer  /containers
1228 +   # mkdir /containers/0
1229 +   # echo $some_pid > /containers/0/tasks
1230 +
1231 +to get status of the freezer subsystem :
1232 +
1233 +   # cat /containers/0/freezer.state
1234 +   THAWED
1235 +
1236 +to freeze all tasks in the container :
1237 +
1238 +   # echo FROZEN > /containers/0/freezer.state
1239 +   # cat /containers/0/freezer.state
1240 +   FREEZING
1241 +   # cat /containers/0/freezer.state
1242 +   FROZEN
1243 +
1244 +to unfreeze all tasks in the container :
1245 +
1246 +   # echo THAWED > /containers/0/freezer.state
1247 +   # cat /containers/0/freezer.state
1248 +   THAWED
1249 +
1250 +This is the basic mechanism which should do the right thing for user space tasks
1251 +in a simple scenario.
1252 +
1253 +It's important to note that freezing can be incomplete. In that case we return
1254 +EBUSY. This means that some tasks in the cgroup are busy doing something that
1255 +prevents us from completely freezing the cgroup at this time. After EBUSY,
1256 +the cgroup will remain partially frozen -- reflected by freezer.state reporting
1257 +"FREEZING" when read. The state will remain "FREEZING" until one of these
1258 +things happens:
1259 +
1260 +       1) Userspace cancels the freezing operation by writing "THAWED" to
1261 +               the freezer.state file
1262 +       2) Userspace retries the freezing operation by writing "FROZEN" to
1263 +               the freezer.state file (writing "FREEZING" is not legal
1264 +               and returns EINVAL)
1265 +       3) The tasks that blocked the cgroup from entering the "FROZEN"
1266 +               state disappear from the cgroup's set of tasks.
1267 --- a/Documentation/cpusets.txt
1268 +++ b/Documentation/cpusets.txt
1269 @@ -48,7 +48,7 @@ hooks, beyond what is already present, r
1270  job placement on large systems.
1271
1272  Cpusets use the generic cgroup subsystem described in
1273 -Documentation/cgroup.txt.
1274 +Documentation/cgroups/cgroups.txt.
1275
1276  Requests by a task, using the sched_setaffinity(2) system call to
1277  include CPUs in its CPU affinity mask, and using the mbind(2) and
1278 --- a/arch/alpha/Kconfig
1279 +++ b/arch/alpha/Kconfig
1280 @@ -72,6 +72,7 @@ config ARCH_SUPPORTS_AOUT
1281         def_bool y
1282
1283  source "init/Kconfig"
1284 +source "kernel/Kconfig.freezer"
1285
1286
1287  menu "System setup"
1288 --- a/arch/alpha/include/asm/thread_info.h
1289 +++ b/arch/alpha/include/asm/thread_info.h
1290 @@ -74,12 +74,14 @@ register struct thread_info *__current_t
1291  #define TIF_UAC_SIGBUS         7
1292  #define TIF_MEMDIE             8
1293  #define TIF_RESTORE_SIGMASK    9       /* restore signal mask in do_signal */
1294 +#define TIF_FREEZE             16      /* is freezing for suspend */
1295
1296  #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
1297  #define _TIF_SIGPENDING                (1<<TIF_SIGPENDING)
1298  #define _TIF_NEED_RESCHED      (1<<TIF_NEED_RESCHED)
1299  #define _TIF_POLLING_NRFLAG    (1<<TIF_POLLING_NRFLAG)
1300  #define _TIF_RESTORE_SIGMASK   (1<<TIF_RESTORE_SIGMASK)
1301 +#define _TIF_FREEZE            (1<<TIF_FREEZE)
1302
1303  /* Work to do on interrupt/exception return.  */
1304  #define _TIF_WORK_MASK         (_TIF_SIGPENDING | _TIF_NEED_RESCHED)
1305 --- a/arch/arm/Kconfig
1306 +++ b/arch/arm/Kconfig
1307 @@ -190,6 +190,8 @@ config VECTORS_BASE
1308
1309  source "init/Kconfig"
1310
1311 +source "kernel/Kconfig.freezer"
1312 +
1313  menu "System Type"
1314
1315  choice
1316 --- a/arch/avr32/Kconfig
1317 +++ b/arch/avr32/Kconfig
1318 @@ -72,6 +72,8 @@ config GENERIC_BUG
1319
1320  source "init/Kconfig"
1321
1322 +source "kernel/Kconfig.freezer"
1323 +
1324  menu "System Type and features"
1325
1326  source "kernel/time/Kconfig"
1327 --- a/arch/avr32/include/asm/thread_info.h
1328 +++ b/arch/avr32/include/asm/thread_info.h
1329 @@ -96,6 +96,7 @@ static inline struct thread_info *curren
1330  #define _TIF_MEMDIE            (1 << TIF_MEMDIE)
1331  #define _TIF_RESTORE_SIGMASK   (1 << TIF_RESTORE_SIGMASK)
1332  #define _TIF_CPU_GOING_TO_SLEEP (1 << TIF_CPU_GOING_TO_SLEEP)
1333 +#define _TIF_FREEZE            (1 << TIF_FREEZE)
1334
1335  /* Note: The masks below must never span more than 16 bits! */
1336
1337 --- a/arch/blackfin/Kconfig
1338 +++ b/arch/blackfin/Kconfig
1339 @@ -64,8 +64,11 @@ config HARDWARE_PM
1340         depends on OPROFILE
1341
1342  source "init/Kconfig"
1343 +
1344  source "kernel/Kconfig.preempt"
1345
1346 +source "kernel/Kconfig.freezer"
1347 +
1348  menu "Blackfin Processor Options"
1349
1350  comment "Processor and Board Settings"
1351 --- a/arch/cris/Kconfig
1352 +++ b/arch/cris/Kconfig
1353 @@ -62,6 +62,8 @@ config HZ
1354
1355  source "init/Kconfig"
1356
1357 +source "kernel/Kconfig.freezer"
1358 +
1359  menu "General setup"
1360
1361  source "fs/Kconfig.binfmt"
1362 --- a/arch/frv/Kconfig
1363 +++ b/arch/frv/Kconfig
1364 @@ -66,6 +66,8 @@ mainmenu "Fujitsu FR-V Kernel Configurat
1365
1366  source "init/Kconfig"
1367
1368 +source "kernel/Kconfig.freezer"
1369 +
1370
1371  menu "Fujitsu FR-V system setup"
1372
1373 --- a/arch/h8300/Kconfig
1374 +++ b/arch/h8300/Kconfig
1375 @@ -89,6 +89,8 @@ config HZ
1376
1377  source "init/Kconfig"
1378
1379 +source "kernel/Kconfig.freezer"
1380 +
1381  source "arch/h8300/Kconfig.cpu"
1382
1383  menu "Executable file formats"
1384 --- a/arch/h8300/include/asm/thread_info.h
1385 +++ b/arch/h8300/include/asm/thread_info.h
1386 @@ -89,6 +89,7 @@ static inline struct thread_info *curren
1387                                            TIF_NEED_RESCHED */
1388  #define TIF_MEMDIE             4
1389  #define TIF_RESTORE_SIGMASK    5       /* restore signal mask in do_signal() */
1390 +#define TIF_FREEZE             16      /* is freezing for suspend */
1391
1392  /* as above, but as bit values */
1393  #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
1394 @@ -96,6 +97,7 @@ static inline struct thread_info *curren
1395  #define _TIF_NEED_RESCHED      (1<<TIF_NEED_RESCHED)
1396  #define _TIF_POLLING_NRFLAG    (1<<TIF_POLLING_NRFLAG)
1397  #define _TIF_RESTORE_SIGMASK   (1<<TIF_RESTORE_SIGMASK)
1398 +#define _TIF_FREEZE            (1<<TIF_FREEZE)
1399
1400  #define _TIF_WORK_MASK         0x0000FFFE      /* work to do on interrupt/exception return */
1401
1402 --- a/arch/ia64/Kconfig
1403 +++ b/arch/ia64/Kconfig
1404 @@ -7,6 +7,8 @@ mainmenu "IA-64 Linux Kernel Configurati
1405
1406  source "init/Kconfig"
1407
1408 +source "kernel/Kconfig.freezer"
1409 +
1410  menu "Processor type and features"
1411
1412  config IA64
1413 --- a/arch/m32r/Kconfig
1414 +++ b/arch/m32r/Kconfig
1415 @@ -45,6 +45,8 @@ config HZ
1416
1417  source "init/Kconfig"
1418
1419 +source "kernel/Kconfig.freezer"
1420 +
1421
1422  menu "Processor type and features"
1423
1424 --- a/arch/m68k/Kconfig
1425 +++ b/arch/m68k/Kconfig
1426 @@ -64,6 +64,8 @@ mainmenu "Linux/68k Kernel Configuration
1427
1428  source "init/Kconfig"
1429
1430 +source "kernel/Kconfig.freezer"
1431 +
1432  menu "Platform dependent setup"
1433
1434  config EISA
1435 --- a/arch/m68knommu/Kconfig
1436 +++ b/arch/m68knommu/Kconfig
1437 @@ -82,6 +82,8 @@ config ARCH_SUPPORTS_AOUT
1438
1439  source "init/Kconfig"
1440
1441 +source "kernel/Kconfig.freezer"
1442 +
1443  menu "Processor type and features"
1444
1445  choice
1446 --- a/arch/m68knommu/include/asm/thread_info.h
1447 +++ b/arch/m68knommu/include/asm/thread_info.h
1448 @@ -84,12 +84,14 @@ static inline struct thread_info *curren
1449  #define TIF_POLLING_NRFLAG     3       /* true if poll_idle() is polling
1450                                            TIF_NEED_RESCHED */
1451  #define TIF_MEMDIE             4
1452 +#define TIF_FREEZE             16      /* is freezing for suspend */
1453
1454  /* as above, but as bit values */
1455  #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
1456  #define _TIF_SIGPENDING                (1<<TIF_SIGPENDING)
1457  #define _TIF_NEED_RESCHED      (1<<TIF_NEED_RESCHED)
1458  #define _TIF_POLLING_NRFLAG    (1<<TIF_POLLING_NRFLAG)
1459 +#define _TIF_FREEZE            (1<<TIF_FREEZE)
1460
1461  #define _TIF_WORK_MASK         0x0000FFFE      /* work to do on interrupt/exception return */
1462
1463 --- a/arch/mips/Kconfig
1464 +++ b/arch/mips/Kconfig
1465 @@ -1885,6 +1885,8 @@ config PROBE_INITRD_HEADER
1466           add initrd or initramfs image to the kernel image.
1467           Otherwise, say N.
1468
1469 +source "kernel/Kconfig.freezer"
1470 +
1471  menu "Bus options (PCI, PCMCIA, EISA, ISA, TC)"
1472
1473  config HW_HAS_EISA
1474 --- a/arch/mn10300/Kconfig
1475 +++ b/arch/mn10300/Kconfig
1476 @@ -71,6 +71,8 @@ mainmenu "Matsushita MN10300/AM33 Kernel
1477
1478  source "init/Kconfig"
1479
1480 +source "kernel/Kconfig.freezer"
1481 +
1482
1483  menu "Matsushita MN10300 system setup"
1484
1485 --- a/arch/parisc/Kconfig
1486 +++ b/arch/parisc/Kconfig
1487 @@ -93,6 +93,8 @@ config ARCH_MAY_HAVE_PC_FDC
1488
1489  source "init/Kconfig"
1490
1491 +source "kernel/Kconfig.freezer"
1492 +
1493
1494  menu "Processor type and features"
1495
1496 --- a/arch/powerpc/Kconfig
1497 +++ b/arch/powerpc/Kconfig
1498 @@ -228,6 +228,8 @@ config PPC_OF_PLATFORM_PCI
1499
1500  source "init/Kconfig"
1501
1502 +source "kernel/Kconfig.freezer"
1503 +
1504  source "arch/powerpc/sysdev/Kconfig"
1505  source "arch/powerpc/platforms/Kconfig"
1506
1507 --- a/arch/s390/Kconfig
1508 +++ b/arch/s390/Kconfig
1509 @@ -79,6 +79,8 @@ config S390
1510
1511  source "init/Kconfig"
1512
1513 +source "kernel/Kconfig.freezer"
1514 +
1515  menu "Base setup"
1516
1517  comment "Processor type and features"
1518 --- a/arch/s390/include/asm/thread_info.h
1519 +++ b/arch/s390/include/asm/thread_info.h
1520 @@ -98,6 +98,7 @@ static inline struct thread_info *curren
1521  #define TIF_31BIT              18      /* 32bit process */
1522  #define TIF_MEMDIE             19
1523  #define TIF_RESTORE_SIGMASK    20      /* restore signal mask in do_signal() */
1524 +#define TIF_FREEZE             21
1525
1526  #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
1527  #define _TIF_RESTORE_SIGMASK   (1<<TIF_RESTORE_SIGMASK)
1528 @@ -110,6 +111,7 @@ static inline struct thread_info *curren
1529  #define _TIF_USEDFPU           (1<<TIF_USEDFPU)
1530  #define _TIF_POLLING_NRFLAG    (1<<TIF_POLLING_NRFLAG)
1531  #define _TIF_31BIT             (1<<TIF_31BIT)
1532 +#define _TIF_FREEZE            (1<<TIF_FREEZE)
1533
1534  #endif /* __KERNEL__ */
1535
1536 --- a/arch/sh/Kconfig
1537 +++ b/arch/sh/Kconfig
1538 @@ -106,6 +106,8 @@ config IO_TRAPPED
1539
1540  source "init/Kconfig"
1541
1542 +source "kernel/Kconfig.freezer"
1543 +
1544  menu "System type"
1545
1546  #
1547 --- a/arch/sparc/Kconfig
1548 +++ b/arch/sparc/Kconfig
1549 @@ -32,6 +32,8 @@ config HZ
1550
1551  source "init/Kconfig"
1552
1553 +source "kernel/Kconfig.freezer"
1554 +
1555  menu "General machine setup"
1556
1557  config SMP
1558 --- a/arch/sparc/include/asm/thread_info_32.h
1559 +++ b/arch/sparc/include/asm/thread_info_32.h
1560 @@ -139,6 +139,7 @@ BTFIXUPDEF_CALL(void, free_thread_info,
1561  #define TIF_POLLING_NRFLAG     9       /* true if poll_idle() is polling
1562                                          * TIF_NEED_RESCHED */
1563  #define TIF_MEMDIE             10
1564 +#define TIF_FREEZE             11      /* is freezing for suspend */
1565
1566  /* as above, but as bit values */
1567  #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
1568 @@ -152,6 +153,7 @@ BTFIXUPDEF_CALL(void, free_thread_info,
1569  #define _TIF_DO_NOTIFY_RESUME_MASK     (_TIF_NOTIFY_RESUME | \
1570                                          _TIF_SIGPENDING | \
1571                                          _TIF_RESTORE_SIGMASK)
1572 +#define _TIF_FREEZE            (1<<TIF_FREEZE)
1573
1574  #endif /* __KERNEL__ */
1575
1576 --- a/arch/sparc64/Kconfig
1577 +++ b/arch/sparc64/Kconfig
1578 @@ -85,6 +85,7 @@ config GENERIC_HARDIRQS_NO__DO_IRQ
1579         def_bool y
1580
1581  source "init/Kconfig"
1582 +source "kernel/Kconfig.freezer"
1583
1584  menu "Processor type and features"
1585
1586 --- a/arch/um/Kconfig
1587 +++ b/arch/um/Kconfig
1588 @@ -229,6 +229,8 @@ endmenu
1589
1590  source "init/Kconfig"
1591
1592 +source "kernel/Kconfig.freezer"
1593 +
1594  source "drivers/block/Kconfig"
1595
1596  source "arch/um/Kconfig.char"
1597 --- a/arch/x86/Kconfig
1598 +++ b/arch/x86/Kconfig
1599 @@ -208,6 +208,7 @@ config X86_TRAMPOLINE
1600  config KTIME_SCALAR
1601         def_bool X86_32
1602  source "init/Kconfig"
1603 +source "kernel/Kconfig.freezer"
1604
1605  menu "Processor type and features"
1606
1607 --- a/arch/xtensa/Kconfig
1608 +++ b/arch/xtensa/Kconfig
1609 @@ -55,6 +55,7 @@ config HZ
1610         default 100
1611
1612  source "init/Kconfig"
1613 +source "kernel/Kconfig.freezer"
1614
1615  menu "Processor type and features"
1616
1617 --- a/include/asm-cris/thread_info.h
1618 +++ b/include/asm-cris/thread_info.h
1619 @@ -88,6 +88,7 @@ struct thread_info {
1620  #define TIF_RESTORE_SIGMASK    9       /* restore signal mask in do_signal() */
1621  #define TIF_POLLING_NRFLAG     16      /* true if poll_idle() is polling TIF_NEED_RESCHED */
1622  #define TIF_MEMDIE             17
1623 +#define TIF_FREEZE             18      /* is freezing for suspend */
1624
1625  #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
1626  #define _TIF_NOTIFY_RESUME     (1<<TIF_NOTIFY_RESUME)
1627 @@ -95,6 +96,7 @@ struct thread_info {
1628  #define _TIF_NEED_RESCHED      (1<<TIF_NEED_RESCHED)
1629  #define _TIF_RESTORE_SIGMASK   (1<<TIF_RESTORE_SIGMASK)
1630  #define _TIF_POLLING_NRFLAG    (1<<TIF_POLLING_NRFLAG)
1631 +#define _TIF_FREEZE            (1<<TIF_FREEZE)
1632
1633  #define _TIF_WORK_MASK         0x0000FFFE      /* work to do on interrupt/exception return */
1634  #define _TIF_ALLWORK_MASK      0x0000FFFF      /* work to do on any return to u-space */
1635 --- a/include/asm-m68k/thread_info.h
1636 +++ b/include/asm-m68k/thread_info.h
1637 @@ -52,5 +52,6 @@ struct thread_info {
1638  #define TIF_DELAYED_TRACE      14      /* single step a syscall */
1639  #define TIF_SYSCALL_TRACE      15      /* syscall trace active */
1640  #define TIF_MEMDIE             16
1641 +#define TIF_FREEZE             17      /* thread is freezing for suspend */
1642
1643  #endif /* _ASM_M68K_THREAD_INFO_H */
1644 --- a/include/asm-parisc/thread_info.h
1645 +++ b/include/asm-parisc/thread_info.h
1646 @@ -58,6 +58,7 @@ struct thread_info {
1647  #define TIF_32BIT               4       /* 32 bit binary */
1648  #define TIF_MEMDIE             5
1649  #define TIF_RESTORE_SIGMASK    6       /* restore saved signal mask */
1650 +#define TIF_FREEZE             7       /* is freezing for suspend */
1651
1652  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
1653  #define _TIF_SIGPENDING                (1 << TIF_SIGPENDING)
1654 @@ -65,6 +66,7 @@ struct thread_info {
1655  #define _TIF_POLLING_NRFLAG    (1 << TIF_POLLING_NRFLAG)
1656  #define _TIF_32BIT             (1 << TIF_32BIT)
1657  #define _TIF_RESTORE_SIGMASK   (1 << TIF_RESTORE_SIGMASK)
1658 +#define _TIF_FREEZE            (1 << TIF_FREEZE)
1659
1660  #define _TIF_USER_WORK_MASK     (_TIF_SIGPENDING | \
1661                                   _TIF_NEED_RESCHED | _TIF_RESTORE_SIGMASK)
1662 --- a/include/asm-um/thread_info.h
1663 +++ b/include/asm-um/thread_info.h
1664 @@ -69,6 +69,7 @@ static inline struct thread_info *curren
1665  #define TIF_MEMDIE             5
1666  #define TIF_SYSCALL_AUDIT      6
1667  #define TIF_RESTORE_SIGMASK    7
1668 +#define TIF_FREEZE             16      /* is freezing for suspend */
1669
1670  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
1671  #define _TIF_SIGPENDING                (1 << TIF_SIGPENDING)
1672 @@ -77,5 +78,6 @@ static inline struct thread_info *curren
1673  #define _TIF_MEMDIE            (1 << TIF_MEMDIE)
1674  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
1675  #define _TIF_RESTORE_SIGMASK   (1 << TIF_RESTORE_SIGMASK)
1676 +#define _TIF_FREEZE            (1 << TIF_FREEZE)
1677
1678  #endif
1679 --- a/include/asm-xtensa/thread_info.h
1680 +++ b/include/asm-xtensa/thread_info.h
1681 @@ -134,6 +134,7 @@ static inline struct thread_info *curren
1682  #define TIF_MEMDIE             5
1683  #define TIF_RESTORE_SIGMASK    6       /* restore signal mask in do_signal() */
1684  #define TIF_POLLING_NRFLAG     16      /* true if poll_idle() is polling TIF_NEED_RESCHED */
1685 +#define TIF_FREEZE             17      /* is freezing for suspend */
1686
1687  #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
1688  #define _TIF_SIGPENDING                (1<<TIF_SIGPENDING)
1689 @@ -142,6 +143,7 @@ static inline struct thread_info *curren
1690  #define _TIF_IRET              (1<<TIF_IRET)
1691  #define _TIF_POLLING_NRFLAG    (1<<TIF_POLLING_NRFLAG)
1692  #define _TIF_RESTORE_SIGMASK   (1<<TIF_RESTORE_SIGMASK)
1693 +#define _TIF_FREEZE            (1<<TIF_FREEZE)
1694
1695  #define _TIF_WORK_MASK         0x0000FFFE      /* work to do on interrupt/exception return */
1696  #define _TIF_ALLWORK_MASK      0x0000FFFF      /* work to do on any return to u-space */
1697 --- a/include/linux/cgroup_subsys.h
1698 +++ b/include/linux/cgroup_subsys.h
1699 @@ -48,3 +48,9 @@ SUBSYS(devices)
1700  #endif
1701
1702  /* */
1703 +
1704 +#ifdef CONFIG_CGROUP_FREEZER
1705 +SUBSYS(freezer)
1706 +#endif
1707 +
1708 +/* */
1709 --- a/include/linux/freezer.h
1710 +++ b/include/linux/freezer.h
1711 @@ -6,7 +6,7 @@
1712  #include <linux/sched.h>
1713  #include <linux/wait.h>
1714
1715 -#ifdef CONFIG_PM_SLEEP
1716 +#ifdef CONFIG_FREEZER
1717  /*
1718   * Check if a process has been frozen
1719   */
1720 @@ -39,29 +39,14 @@ static inline void clear_freeze_flag(str
1721         clear_tsk_thread_flag(p, TIF_FREEZE);
1722  }
1723
1724 -/*
1725 - * Wake up a frozen process
1726 - *
1727 - * task_lock() is taken to prevent the race with refrigerator() which may
1728 - * occur if the freezing of tasks fails.  Namely, without the lock, if the
1729 - * freezing of tasks failed, thaw_tasks() might have run before a task in
1730 - * refrigerator() could call frozen_process(), in which case the task would be
1731 - * frozen and no one would thaw it.
1732 - */
1733 -static inline int thaw_process(struct task_struct *p)
1734 -{
1735 -       task_lock(p);
1736 -       if (frozen(p)) {
1737 -               p->flags &= ~PF_FROZEN;
1738 -               task_unlock(p);
1739 -               wake_up_process(p);
1740 -               return 1;
1741 -       }
1742 -       clear_freeze_flag(p);
1743 -       task_unlock(p);
1744 -       return 0;
1745 +static inline bool should_send_signal(struct task_struct *p)
1746 +{
1747 +       return !(p->flags & PF_FREEZER_NOSIG);
1748  }
1749
1750 +/* Takes and releases task alloc lock using task_lock() */
1751 +extern int thaw_process(struct task_struct *p);
1752 +
1753  extern void refrigerator(void);
1754  extern int freeze_processes(void);
1755  extern void thaw_processes(void);
1756 @@ -75,6 +60,15 @@ static inline int try_to_freeze(void)
1757                 return 0;
1758  }
1759
1760 +extern bool freeze_task(struct task_struct *p, bool sig_only);
1761 +extern void cancel_freezing(struct task_struct *p);
1762 +
1763 +#ifdef CONFIG_CGROUP_FREEZER
1764 +extern int cgroup_frozen(struct task_struct *task);
1765 +#else /* !CONFIG_CGROUP_FREEZER */
1766 +static inline int cgroup_frozen(struct task_struct *task) { return 0; }
1767 +#endif /* !CONFIG_CGROUP_FREEZER */
1768 +
1769  /*
1770   * The PF_FREEZER_SKIP flag should be set by a vfork parent right before it
1771   * calls wait_for_completion(&vfork) and reset right after it returns from this
1772 @@ -166,7 +160,7 @@ static inline void set_freezable_with_si
1773         } while (try_to_freeze());                                      \
1774         __retval;                                                       \
1775  })
1776 -#else /* !CONFIG_PM_SLEEP */
1777 +#else /* !CONFIG_FREEZER */
1778  static inline int frozen(struct task_struct *p) { return 0; }
1779  static inline int freezing(struct task_struct *p) { return 0; }
1780  static inline void set_freeze_flag(struct task_struct *p) {}
1781 @@ -191,6 +185,6 @@ static inline void set_freezable_with_si
1782  #define wait_event_freezable_timeout(wq, condition, timeout)           \
1783                 wait_event_interruptible_timeout(wq, condition, timeout)
1784
1785 -#endif /* !CONFIG_PM_SLEEP */
1786 +#endif /* !CONFIG_FREEZER */
1787
1788  #endif /* FREEZER_H_INCLUDED */
1789 --- a/init/Kconfig
1790 +++ b/init/Kconfig
1791 @@ -303,6 +303,13 @@ config CGROUP_NS
1792            for instance virtual servers and checkpoint/restart
1793            jobs.
1794
1795 +config CGROUP_FREEZER
1796 +        bool "control group freezer subsystem"
1797 +        depends on CGROUPS
1798 +        help
1799 +          Provides a way to freeze and unfreeze all tasks in a
1800 +         cgroup.
1801 +
1802  config CGROUP_DEVICE
1803         bool "Device controller for cgroups"
1804         depends on CGROUPS && EXPERIMENTAL
1805 --- /dev/null
1806 +++ b/kernel/Kconfig.freezer
1807 @@ -0,0 +1,2 @@
1808 +config FREEZER
1809 +       def_bool PM_SLEEP || CGROUP_FREEZER
1810 --- a/kernel/Makefile
1811 +++ b/kernel/Makefile
1812 @@ -22,6 +22,7 @@ CFLAGS_REMOVE_sched_clock.o = -pg
1813  CFLAGS_REMOVE_sched.o = -pg
1814  endif
1815
1816 +obj-$(CONFIG_FREEZER) += freezer.o
1817  obj-$(CONFIG_PROFILING) += profile.o
1818  obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
1819  obj-$(CONFIG_STACKTRACE) += stacktrace.o
1820 @@ -54,6 +55,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += bac
1821  obj-$(CONFIG_COMPAT) += compat.o
1822  obj-$(CONFIG_CGROUPS) += cgroup.o
1823  obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
1824 +obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
1825  obj-$(CONFIG_CPUSETS) += cpuset.o
1826  obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
1827  obj-$(CONFIG_UTS_NS) += utsname.o
1828 --- /dev/null
1829 +++ b/kernel/cgroup_freezer.c
1830 @@ -0,0 +1,379 @@
1831 +/*
1832 + * cgroup_freezer.c -  control group freezer subsystem
1833 + *
1834 + * Copyright IBM Corporation, 2007
1835 + *
1836 + * Author : Cedric Le Goater <clg@fr.ibm.com>
1837 + *
1838 + * This program is free software; you can redistribute it and/or modify it
1839 + * under the terms of version 2.1 of the GNU Lesser General Public License
1840 + * as published by the Free Software Foundation.
1841 + *
1842 + * This program is distributed in the hope that it would be useful, but
1843 + * WITHOUT ANY WARRANTY; without even the implied warranty of
1844 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
1845 + */
1846 +
1847 +#include <linux/module.h>
1848 +#include <linux/cgroup.h>
1849 +#include <linux/fs.h>
1850 +#include <linux/uaccess.h>
1851 +#include <linux/freezer.h>
1852 +#include <linux/seq_file.h>
1853 +
1854 +enum freezer_state {
1855 +       CGROUP_THAWED = 0,
1856 +       CGROUP_FREEZING,
1857 +       CGROUP_FROZEN,
1858 +};
1859 +
1860 +struct freezer {
1861 +       struct cgroup_subsys_state css;
1862 +       enum freezer_state state;
1863 +       spinlock_t lock; /* protects _writes_ to state */
1864 +};
1865 +
1866 +static inline struct freezer *cgroup_freezer(
1867 +               struct cgroup *cgroup)
1868 +{
1869 +       return container_of(
1870 +               cgroup_subsys_state(cgroup, freezer_subsys_id),
1871 +               struct freezer, css);
1872 +}
1873 +
1874 +static inline struct freezer *task_freezer(struct task_struct *task)
1875 +{
1876 +       return container_of(task_subsys_state(task, freezer_subsys_id),
1877 +                           struct freezer, css);
1878 +}
1879 +
1880 +int cgroup_frozen(struct task_struct *task)
1881 +{
1882 +       struct freezer *freezer;
1883 +       enum freezer_state state;
1884 +
1885 +       task_lock(task);
1886 +       freezer = task_freezer(task);
1887 +       state = freezer->state;
1888 +       task_unlock(task);
1889 +
1890 +       return state == CGROUP_FROZEN;
1891 +}
1892 +
1893 +/*
1894 + * cgroups_write_string() limits the size of freezer state strings to
1895 + * CGROUP_LOCAL_BUFFER_SIZE
1896 + */
1897 +static const char *freezer_state_strs[] = {
1898 +       "THAWED",
1899 +       "FREEZING",
1900 +       "FROZEN",
1901 +};
1902 +
1903 +/*
1904 + * State diagram
1905 + * Transitions are caused by userspace writes to the freezer.state file.
1906 + * The values in parentheses are state labels. The rest are edge labels.
1907 + *
1908 + * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
1909 + *    ^ ^                    |                     |
1910 + *    | \_______THAWED_______/                     |
1911 + *    \__________________________THAWED____________/
1912 + */
1913 +
1914 +struct cgroup_subsys freezer_subsys;
1915 +
1916 +/* Locks taken and their ordering
1917 + * ------------------------------
1918 + * css_set_lock
1919 + * cgroup_mutex (AKA cgroup_lock)
1920 + * task->alloc_lock (AKA task_lock)
1921 + * freezer->lock
1922 + * task->sighand->siglock
1923 + *
1924 + * cgroup code forces css_set_lock to be taken before task->alloc_lock
1925 + *
1926 + * freezer_create(), freezer_destroy():
1927 + * cgroup_mutex [ by cgroup core ]
1928 + *
1929 + * can_attach():
1930 + * cgroup_mutex
1931 + *
1932 + * cgroup_frozen():
1933 + * task->alloc_lock (to get task's cgroup)
1934 + *
1935 + * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
1936 + * task->alloc_lock (to get task's cgroup)
1937 + * freezer->lock
1938 + *  sighand->siglock (if the cgroup is freezing)
1939 + *
1940 + * freezer_read():
1941 + * cgroup_mutex
1942 + *  freezer->lock
1943 + *   read_lock css_set_lock (cgroup iterator start)
1944 + *
1945 + * freezer_write() (freeze):
1946 + * cgroup_mutex
1947 + *  freezer->lock
1948 + *   read_lock css_set_lock (cgroup iterator start)
1949 + *    sighand->siglock
1950 + *
1951 + * freezer_write() (unfreeze):
1952 + * cgroup_mutex
1953 + *  freezer->lock
1954 + *   read_lock css_set_lock (cgroup iterator start)
1955 + *    task->alloc_lock (to prevent races with freeze_task())
1956 + *     sighand->siglock
1957 + */
1958 +static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
1959 +                                                 struct cgroup *cgroup)
1960 +{
1961 +       struct freezer *freezer;
1962 +
1963 +       freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL);
1964 +       if (!freezer)
1965 +               return ERR_PTR(-ENOMEM);
1966 +
1967 +       spin_lock_init(&freezer->lock);
1968 +       freezer->state = CGROUP_THAWED;
1969 +       return &freezer->css;
1970 +}
1971 +
1972 +static void freezer_destroy(struct cgroup_subsys *ss,
1973 +                           struct cgroup *cgroup)
1974 +{
1975 +       kfree(cgroup_freezer(cgroup));
1976 +}
1977 +
1978 +/* Task is frozen or will freeze immediately when next it gets woken */
1979 +static bool is_task_frozen_enough(struct task_struct *task)
1980 +{
1981 +       return frozen(task) ||
1982 +               (task_is_stopped_or_traced(task) && freezing(task));
1983 +}
1984 +
1985 +/*
1986 + * The call to cgroup_lock() in the freezer.state write method prevents
1987 + * a write to that file racing against an attach, and hence the
1988 + * can_attach() result will remain valid until the attach completes.
1989 + */
1990 +static int freezer_can_attach(struct cgroup_subsys *ss,
1991 +                             struct cgroup *new_cgroup,
1992 +                             struct task_struct *task)
1993 +{
1994 +       struct freezer *freezer;
1995 +
1996 +       /*
1997 +        * Anything frozen can't move or be moved to/from.
1998 +        *
1999 +        * orig_freezer->state == FROZEN implies that @task has been
2000 +        * frozen, so it's sufficient to check the latter condition.
2001 +        */
2002 +
2003 +       if (is_task_frozen_enough(task))
2004 +               return -EBUSY;
2005 +
2006 +       freezer = cgroup_freezer(new_cgroup);
2007 +       if (freezer->state == CGROUP_FROZEN)
2008 +               return -EBUSY;
2009 +
2010 +       return 0;
2011 +}
2012 +
2013 +static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
2014 +{
2015 +       struct freezer *freezer;
2016 +
2017 +       /*
2018 +        * No lock is needed, since the task isn't on tasklist yet,
2019 +        * so it can't be moved to another cgroup, which means the
2020 +        * freezer won't be removed and will be valid during this
2021 +        * function call.
2022 +        */
2023 +       freezer = task_freezer(task);
2024 +
2025 +       /*
2026 +        * The root cgroup is non-freezable, so we can skip the
2027 +        * following check.
2028 +        */
2029 +       if (!freezer->css.cgroup->parent)
2030 +               return;
2031 +
2032 +       spin_lock_irq(&freezer->lock);
2033 +       BUG_ON(freezer->state == CGROUP_FROZEN);
2034 +
2035 +       /* Locking avoids race with FREEZING -> THAWED transitions. */
2036 +       if (freezer->state == CGROUP_FREEZING)
2037 +               freeze_task(task, true);
2038 +       spin_unlock_irq(&freezer->lock);
2039 +}
2040 +
2041 +/*
2042 + * caller must hold freezer->lock
2043 + */
2044 +static void update_freezer_state(struct cgroup *cgroup,
2045 +                                struct freezer *freezer)
2046 +{
2047 +       struct cgroup_iter it;
2048 +       struct task_struct *task;
2049 +       unsigned int nfrozen = 0, ntotal = 0;
2050 +
2051 +       cgroup_iter_start(cgroup, &it);
2052 +       while ((task = cgroup_iter_next(cgroup, &it))) {
2053 +               ntotal++;
2054 +               if (is_task_frozen_enough(task))
2055 +                       nfrozen++;
2056 +       }
2057 +
2058 +       /*
2059 +        * Transition to FROZEN when no new tasks can be added ensures
2060 +        * that we never exist in the FROZEN state while there are unfrozen
2061 +        * tasks.
2062 +        */
2063 +       if (nfrozen == ntotal)
2064 +               freezer->state = CGROUP_FROZEN;
2065 +       else if (nfrozen > 0)
2066 +               freezer->state = CGROUP_FREEZING;
2067 +       else
2068 +               freezer->state = CGROUP_THAWED;
2069 +       cgroup_iter_end(cgroup, &it);
2070 +}
2071 +
2072 +static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
2073 +                       struct seq_file *m)
2074 +{
2075 +       struct freezer *freezer;
2076 +       enum freezer_state state;
2077 +
2078 +       if (!cgroup_lock_live_group(cgroup))
2079 +               return -ENODEV;
2080 +
2081 +       freezer = cgroup_freezer(cgroup);
2082 +       spin_lock_irq(&freezer->lock);
2083 +       state = freezer->state;
2084 +       if (state == CGROUP_FREEZING) {
2085 +               /* We change from FREEZING to FROZEN lazily if the cgroup was
2086 +                * only partially frozen when we exited write. */
2087 +               update_freezer_state(cgroup, freezer);
2088 +               state = freezer->state;
2089 +       }
2090 +       spin_unlock_irq(&freezer->lock);
2091 +       cgroup_unlock();
2092 +
2093 +       seq_puts(m, freezer_state_strs[state]);
2094 +       seq_putc(m, '\n');
2095 +       return 0;
2096 +}
2097 +
2098 +static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
2099 +{
2100 +       struct cgroup_iter it;
2101 +       struct task_struct *task;
2102 +       unsigned int num_cant_freeze_now = 0;
2103 +
2104 +       freezer->state = CGROUP_FREEZING;
2105 +       cgroup_iter_start(cgroup, &it);
2106 +       while ((task = cgroup_iter_next(cgroup, &it))) {
2107 +               if (!freeze_task(task, true))
2108 +                       continue;
2109 +               if (is_task_frozen_enough(task))
2110 +                       continue;
2111 +               if (!freezing(task) && !freezer_should_skip(task))
2112 +                       num_cant_freeze_now++;
2113 +       }
2114 +       cgroup_iter_end(cgroup, &it);
2115 +
2116 +       return num_cant_freeze_now ? -EBUSY : 0;
2117 +}
2118 +
2119 +static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
2120 +{
2121 +       struct cgroup_iter it;
2122 +       struct task_struct *task;
2123 +
2124 +       cgroup_iter_start(cgroup, &it);
2125 +       while ((task = cgroup_iter_next(cgroup, &it))) {
2126 +               thaw_process(task);
2127 +       }
2128 +       cgroup_iter_end(cgroup, &it);
2129 +
2130 +       freezer->state = CGROUP_THAWED;
2131 +}
2132 +
2133 +static int freezer_change_state(struct cgroup *cgroup,
2134 +                               enum freezer_state goal_state)
2135 +{
2136 +       struct freezer *freezer;
2137 +       int retval = 0;
2138 +
2139 +       freezer = cgroup_freezer(cgroup);
2140 +
2141 +       spin_lock_irq(&freezer->lock);
2142 +
2143 +       update_freezer_state(cgroup, freezer);
2144 +       if (goal_state == freezer->state)
2145 +               goto out;
2146 +
2147 +       switch (goal_state) {
2148 +       case CGROUP_THAWED:
2149 +               unfreeze_cgroup(cgroup, freezer);
2150 +               break;
2151 +       case CGROUP_FROZEN:
2152 +               retval = try_to_freeze_cgroup(cgroup, freezer);
2153 +               break;
2154 +       default:
2155 +               BUG();
2156 +       }
2157 +out:
2158 +       spin_unlock_irq(&freezer->lock);
2159 +
2160 +       return retval;
2161 +}
2162 +
2163 +static int freezer_write(struct cgroup *cgroup,
2164 +                        struct cftype *cft,
2165 +                        const char *buffer)
2166 +{
2167 +       int retval;
2168 +       enum freezer_state goal_state;
2169 +
2170 +       if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0)
2171 +               goal_state = CGROUP_THAWED;
2172 +       else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
2173 +               goal_state = CGROUP_FROZEN;
2174 +       else
2175 +               return -EINVAL;
2176 +
2177 +       if (!cgroup_lock_live_group(cgroup))
2178 +               return -ENODEV;
2179 +       retval = freezer_change_state(cgroup, goal_state);
2180 +       cgroup_unlock();
2181 +       return retval;
2182 +}
2183 +
2184 +static struct cftype files[] = {
2185 +       {
2186 +               .name = "state",
2187 +               .read_seq_string = freezer_read,
2188 +               .write_string = freezer_write,
2189 +       },
2190 +};
2191 +
2192 +static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
2193 +{
2194 +       if (!cgroup->parent)
2195 +               return 0;
2196 +       return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
2197 +}
2198 +
2199 +struct cgroup_subsys freezer_subsys = {
2200 +       .name           = "freezer",
2201 +       .create         = freezer_create,
2202 +       .destroy        = freezer_destroy,
2203 +       .populate       = freezer_populate,
2204 +       .subsys_id      = freezer_subsys_id,
2205 +       .can_attach     = freezer_can_attach,
2206 +       .attach         = NULL,
2207 +       .fork           = freezer_fork,
2208 +       .exit           = NULL,
2209 +};
2210 --- /dev/null
2211 +++ b/kernel/freezer.c
2212 @@ -0,0 +1,154 @@
2213 +/*
2214 + * kernel/freezer.c - Function to freeze a process
2215 + *
2216 + * Originally from kernel/power/process.c
2217 + */
2218 +
2219 +#include <linux/interrupt.h>
2220 +#include <linux/suspend.h>
2221 +#include <linux/module.h>
2222 +#include <linux/syscalls.h>
2223 +#include <linux/freezer.h>
2224 +
2225 +/*
2226 + * freezing is complete, mark current process as frozen
2227 + */
2228 +static inline void frozen_process(void)
2229 +{
2230 +       if (!unlikely(current->flags & PF_NOFREEZE)) {
2231 +               current->flags |= PF_FROZEN;
2232 +               wmb();
2233 +       }
2234 +       clear_freeze_flag(current);
2235 +}
2236 +
2237 +/* Refrigerator is a place where frozen processes are stored :-). */
2238 +void refrigerator(void)
2239 +{
2240 +       /* Hmm, should we be allowed to suspend when there are realtime
2241 +          processes around? */
2242 +       long save;
2243 +
2244 +       task_lock(current);
2245 +       if (freezing(current)) {
2246 +               frozen_process();
2247 +               task_unlock(current);
2248 +       } else {
2249 +               task_unlock(current);
2250 +               return;
2251 +       }
2252 +       save = current->state;
2253 +       pr_debug("%s entered refrigerator\n", current->comm);
2254 +
2255 +       spin_lock_irq(&current->sighand->siglock);
2256 +       recalc_sigpending(); /* We sent fake signal, clean it up */
2257 +       spin_unlock_irq(&current->sighand->siglock);
2258 +
2259 +       for (;;) {
2260 +               set_current_state(TASK_UNINTERRUPTIBLE);
2261 +               if (!frozen(current))
2262 +                       break;
2263 +               schedule();
2264 +       }
2265 +       pr_debug("%s left refrigerator\n", current->comm);
2266 +       __set_current_state(save);
2267 +}
2268 +EXPORT_SYMBOL(refrigerator);
2269 +
2270 +static void fake_signal_wake_up(struct task_struct *p)
2271 +{
2272 +       unsigned long flags;
2273 +
2274 +       spin_lock_irqsave(&p->sighand->siglock, flags);
2275 +       signal_wake_up(p, 0);
2276 +       spin_unlock_irqrestore(&p->sighand->siglock, flags);
2277 +}
2278 +
2279 +/**
2280 + *     freeze_task - send a freeze request to given task
2281 + *     @p: task to send the request to
2282 + *     @sig_only: if set, the request will only be sent if the task has the
2283 + *             PF_FREEZER_NOSIG flag unset
2284 + *     Return value: 'false', if @sig_only is set and the task has
2285 + *             PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
2286 + *
2287 + *     The freeze request is sent by setting the task's TIF_FREEZE flag and
2288 + *     either sending a fake signal to it or waking it up, depending on whether
2289 + *     or not it has PF_FREEZER_NOSIG set.  If @sig_only is set and the task
2290 + *     has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
2291 + *     TIF_FREEZE flag will not be set.
2292 + */
2293 +bool freeze_task(struct task_struct *p, bool sig_only)
2294 +{
2295 +       /*
2296 +        * We first check if the task is freezing and next if it has already
2297 +        * been frozen to avoid the race with frozen_process() which first marks
2298 +        * the task as frozen and next clears its TIF_FREEZE.
2299 +        */
2300 +       if (!freezing(p)) {
2301 +               rmb();
2302 +               if (frozen(p))
2303 +                       return false;
2304 +
2305 +               if (!sig_only || should_send_signal(p))
2306 +                       set_freeze_flag(p);
2307 +               else
2308 +                       return false;
2309 +       }
2310 +
2311 +       if (should_send_signal(p)) {
2312 +               if (!signal_pending(p))
2313 +                       fake_signal_wake_up(p);
2314 +       } else if (sig_only) {
2315 +               return false;
2316 +       } else {
2317 +               wake_up_state(p, TASK_INTERRUPTIBLE);
2318 +       }
2319 +
2320 +       return true;
2321 +}
2322 +
2323 +void cancel_freezing(struct task_struct *p)
2324 +{
2325 +       unsigned long flags;
2326 +
2327 +       if (freezing(p)) {
2328 +               pr_debug("  clean up: %s\n", p->comm);
2329 +               clear_freeze_flag(p);
2330 +               spin_lock_irqsave(&p->sighand->siglock, flags);
2331 +               recalc_sigpending_and_wake(p);
2332 +               spin_unlock_irqrestore(&p->sighand->siglock, flags);
2333 +       }
2334 +}
2335 +
2336 +static int __thaw_process(struct task_struct *p)
2337 +{
2338 +       if (frozen(p)) {
2339 +               p->flags &= ~PF_FROZEN;
2340 +               return 1;
2341 +       }
2342 +       clear_freeze_flag(p);
2343 +       return 0;
2344 +}
2345 +
2346 +/*
2347 + * Wake up a frozen process; returns 1 if @p was frozen and woken, else 0
2348 + *
2349 + * task_lock() is needed to prevent the race with refrigerator() which may
2350 + * occur if the freezing of tasks fails.  Namely, without the lock, if the
2351 + * freezing of tasks failed, thaw_tasks() might have run before a task in
2352 + * refrigerator() could call frozen_process(), in which case the task would be
2353 + * frozen and no one would thaw it.
2354 + */
2355 +int thaw_process(struct task_struct *p)
2356 +{
2357 +       task_lock(p);
2358 +       if (__thaw_process(p) == 1) { /* @p was frozen; PF_FROZEN is now clear */
2359 +               task_unlock(p); /* unlock before waking the task */
2360 +               wake_up_process(p);
2361 +               return 1;
2362 +       }
2363 +       task_unlock(p);
2364 +       return 0;
2365 +}
2366 +EXPORT_SYMBOL(thaw_process);
2367 --- a/kernel/power/process.c
2368 +++ b/kernel/power/process.c
2369 @@ -28,121 +28,6 @@ static inline int freezeable(struct task
2370         return 1;
2371  }
2372
2373 -/*
2374 - * freezing is complete, mark current process as frozen
2375 - */
2376 -static inline void frozen_process(void)
2377 -{
2378 -       if (!unlikely(current->flags & PF_NOFREEZE)) {
2379 -               current->flags |= PF_FROZEN;
2380 -               wmb();
2381 -       }
2382 -       clear_freeze_flag(current);
2383 -}
2384 -
2385 -/* Refrigerator is place where frozen processes are stored :-). */
2386 -void refrigerator(void)
2387 -{
2388 -       /* Hmm, should we be allowed to suspend when there are realtime
2389 -          processes around? */
2390 -       long save;
2391 -
2392 -       task_lock(current);
2393 -       if (freezing(current)) {
2394 -               frozen_process();
2395 -               task_unlock(current);
2396 -       } else {
2397 -               task_unlock(current);
2398 -               return;
2399 -       }
2400 -       save = current->state;
2401 -       pr_debug("%s entered refrigerator\n", current->comm);
2402 -
2403 -       spin_lock_irq(&current->sighand->siglock);
2404 -       recalc_sigpending(); /* We sent fake signal, clean it up */
2405 -       spin_unlock_irq(&current->sighand->siglock);
2406 -
2407 -       for (;;) {
2408 -               set_current_state(TASK_UNINTERRUPTIBLE);
2409 -               if (!frozen(current))
2410 -                       break;
2411 -               schedule();
2412 -       }
2413 -       pr_debug("%s left refrigerator\n", current->comm);
2414 -       __set_current_state(save);
2415 -}
2416 -
2417 -static void fake_signal_wake_up(struct task_struct *p)
2418 -{
2419 -       unsigned long flags;
2420 -
2421 -       spin_lock_irqsave(&p->sighand->siglock, flags);
2422 -       signal_wake_up(p, 0);
2423 -       spin_unlock_irqrestore(&p->sighand->siglock, flags);
2424 -}
2425 -
2426 -static inline bool should_send_signal(struct task_struct *p)
2427 -{
2428 -       return !(p->flags & PF_FREEZER_NOSIG);
2429 -}
2430 -
2431 -/**
2432 - *     freeze_task - send a freeze request to given task
2433 - *     @p: task to send the request to
2434 - *     @sig_only: if set, the request will only be sent if the task has the
2435 - *             PF_FREEZER_NOSIG flag unset
2436 - *     Return value: 'false', if @sig_only is set and the task has
2437 - *             PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
2438 - *
2439 - *     The freeze request is sent by setting the tasks's TIF_FREEZE flag and
2440 - *     either sending a fake signal to it or waking it up, depending on whether
2441 - *     or not it has PF_FREEZER_NOSIG set.  If @sig_only is set and the task
2442 - *     has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
2443 - *     TIF_FREEZE flag will not be set.
2444 - */
2445 -static bool freeze_task(struct task_struct *p, bool sig_only)
2446 -{
2447 -       /*
2448 -        * We first check if the task is freezing and next if it has already
2449 -        * been frozen to avoid the race with frozen_process() which first marks
2450 -        * the task as frozen and next clears its TIF_FREEZE.
2451 -        */
2452 -       if (!freezing(p)) {
2453 -               rmb();
2454 -               if (frozen(p))
2455 -                       return false;
2456 -
2457 -               if (!sig_only || should_send_signal(p))
2458 -                       set_freeze_flag(p);
2459 -               else
2460 -                       return false;
2461 -       }
2462 -
2463 -       if (should_send_signal(p)) {
2464 -               if (!signal_pending(p))
2465 -                       fake_signal_wake_up(p);
2466 -       } else if (sig_only) {
2467 -               return false;
2468 -       } else {
2469 -               wake_up_state(p, TASK_INTERRUPTIBLE);
2470 -       }
2471 -
2472 -       return true;
2473 -}
2474 -
2475 -static void cancel_freezing(struct task_struct *p)
2476 -{
2477 -       unsigned long flags;
2478 -
2479 -       if (freezing(p)) {
2480 -               pr_debug("  clean up: %s\n", p->comm);
2481 -               clear_freeze_flag(p);
2482 -               spin_lock_irqsave(&p->sighand->siglock, flags);
2483 -               recalc_sigpending_and_wake(p);
2484 -               spin_unlock_irqrestore(&p->sighand->siglock, flags);
2485 -       }
2486 -}
2487 -
2488  static int try_to_freeze_tasks(bool sig_only)
2489  {
2490         struct task_struct *g, *p;
2491 @@ -250,6 +135,9 @@ static void thaw_tasks(bool nosig_only)
2492                 if (nosig_only && should_send_signal(p))
2493                         continue;
2494
2495 +               if (cgroup_frozen(p))
2496 +                       continue;
2497 +
2498                 thaw_process(p);
2499         } while_each_thread(g, p);
2500         read_unlock(&tasklist_lock);
2501 @@ -264,4 +152,3 @@ void thaw_processes(void)
2502         printk("done.\n");
2503  }
2504
2505 -EXPORT_SYMBOL(refrigerator);