]> git.ipfire.org Git - people/teissler/ipfire-2.x.git/blob - src/patches/suse-2.6.27.25/patches.suse/cgroup-freezer.patch
Updated xen patches taken from suse.
[people/teissler/ipfire-2.x.git] / src / patches / suse-2.6.27.25 / patches.suse / cgroup-freezer.patch
1 From: Serge E. Hallyn <serue@us.ibm.com>
2 Subject: cgroup freezer
3 References: bnc#417294, fate#304191, fate#201036
4 Patch-upstream: yes
5 Git: 68d1a06b440a5df55fb253e1d1113d2e4a7209fc Mon Sep 17 00:00:00 2001
6
7 Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
8 Acked-by: Nick Piggin <npiggin@suse.de>
9 ---
10 Documentation/cgroups.txt | 548 ----------------------------
11 Documentation/cgroups/cgroups.txt | 548 ++++++++++++++++++++++++++++
12 Documentation/cgroups/freezer-subsystem.txt | 102 +++++
13 Documentation/cpusets.txt | 2
14 arch/alpha/Kconfig | 1
15 arch/alpha/include/asm/thread_info.h | 2
16 arch/arm/Kconfig | 2
17 arch/avr32/Kconfig | 2
18 arch/avr32/include/asm/thread_info.h | 1
19 arch/blackfin/Kconfig | 3
20 arch/cris/Kconfig | 2
21 arch/frv/Kconfig | 2
22 arch/h8300/Kconfig | 2
23 arch/h8300/include/asm/thread_info.h | 2
24 arch/ia64/Kconfig | 2
25 arch/m32r/Kconfig | 2
26 arch/m68k/Kconfig | 2
27 arch/m68knommu/Kconfig | 2
28 arch/m68knommu/include/asm/thread_info.h | 2
29 arch/mips/Kconfig | 2
30 arch/mn10300/Kconfig | 2
31 arch/parisc/Kconfig | 2
32 arch/powerpc/Kconfig | 2
33 arch/s390/Kconfig | 2
34 arch/s390/include/asm/thread_info.h | 2
35 arch/sh/Kconfig | 2
36 arch/sparc/Kconfig | 2
37 arch/sparc/include/asm/thread_info_32.h | 2
38 arch/sparc64/Kconfig | 1
39 arch/um/Kconfig | 2
40 arch/x86/Kconfig | 1
41 arch/xtensa/Kconfig | 1
42 include/asm-cris/thread_info.h | 2
43 include/asm-m68k/thread_info.h | 1
44 include/asm-parisc/thread_info.h | 2
45 include/asm-um/thread_info.h | 2
46 include/asm-xtensa/thread_info.h | 2
47 include/linux/cgroup_subsys.h | 6
48 include/linux/freezer.h | 42 --
49 init/Kconfig | 7
50 kernel/Kconfig.freezer | 2
51 kernel/Makefile | 2
52 kernel/cgroup_freezer.c | 379 +++++++++++++++++++
53 kernel/freezer.c | 154 +++++++
54 kernel/power/process.c | 119 ------
55 45 files changed, 1283 insertions(+), 689 deletions(-)
56 create mode 100644 include/linux/cgroup_freezer.h
57 create mode 100644 kernel/cgroup_freezer.c
58 create mode 100644 kernel/freezer.c
59
60 --- a/Documentation/cgroups.txt
61 +++ /dev/null
62 @@ -1,548 +0,0 @@
63 - CGROUPS
64 - -------
65 -
66 -Written by Paul Menage <menage@google.com> based on Documentation/cpusets.txt
67 -
68 -Original copyright statements from cpusets.txt:
69 -Portions Copyright (C) 2004 BULL SA.
70 -Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
71 -Modified by Paul Jackson <pj@sgi.com>
72 -Modified by Christoph Lameter <clameter@sgi.com>
73 -
74 -CONTENTS:
75 -=========
76 -
77 -1. Control Groups
78 - 1.1 What are cgroups ?
79 - 1.2 Why are cgroups needed ?
80 - 1.3 How are cgroups implemented ?
81 - 1.4 What does notify_on_release do ?
82 - 1.5 How do I use cgroups ?
83 -2. Usage Examples and Syntax
84 - 2.1 Basic Usage
85 - 2.2 Attaching processes
86 -3. Kernel API
87 - 3.1 Overview
88 - 3.2 Synchronization
89 - 3.3 Subsystem API
90 -4. Questions
91 -
92 -1. Control Groups
93 -=================
94 -
95 -1.1 What are cgroups ?
96 -----------------------
97 -
98 -Control Groups provide a mechanism for aggregating/partitioning sets of
99 -tasks, and all their future children, into hierarchical groups with
100 -specialized behaviour.
101 -
102 -Definitions:
103 -
104 -A *cgroup* associates a set of tasks with a set of parameters for one
105 -or more subsystems.
106 -
107 -A *subsystem* is a module that makes use of the task grouping
108 -facilities provided by cgroups to treat groups of tasks in
109 -particular ways. A subsystem is typically a "resource controller" that
110 -schedules a resource or applies per-cgroup limits, but it may be
111 -anything that wants to act on a group of processes, e.g. a
112 -virtualization subsystem.
113 -
114 -A *hierarchy* is a set of cgroups arranged in a tree, such that
115 -every task in the system is in exactly one of the cgroups in the
116 -hierarchy, and a set of subsystems; each subsystem has system-specific
117 -state attached to each cgroup in the hierarchy. Each hierarchy has
118 -an instance of the cgroup virtual filesystem associated with it.
119 -
120 -At any one time there may be multiple active hierachies of task
121 -cgroups. Each hierarchy is a partition of all tasks in the system.
122 -
123 -User level code may create and destroy cgroups by name in an
124 -instance of the cgroup virtual file system, specify and query to
125 -which cgroup a task is assigned, and list the task pids assigned to
126 -a cgroup. Those creations and assignments only affect the hierarchy
127 -associated with that instance of the cgroup file system.
128 -
129 -On their own, the only use for cgroups is for simple job
130 -tracking. The intention is that other subsystems hook into the generic
131 -cgroup support to provide new attributes for cgroups, such as
132 -accounting/limiting the resources which processes in a cgroup can
133 -access. For example, cpusets (see Documentation/cpusets.txt) allows
134 -you to associate a set of CPUs and a set of memory nodes with the
135 -tasks in each cgroup.
136 -
137 -1.2 Why are cgroups needed ?
138 -----------------------------
139 -
140 -There are multiple efforts to provide process aggregations in the
141 -Linux kernel, mainly for resource tracking purposes. Such efforts
142 -include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
143 -namespaces. These all require the basic notion of a
144 -grouping/partitioning of processes, with newly forked processes ending
145 -in the same group (cgroup) as their parent process.
146 -
147 -The kernel cgroup patch provides the minimum essential kernel
148 -mechanisms required to efficiently implement such groups. It has
149 -minimal impact on the system fast paths, and provides hooks for
150 -specific subsystems such as cpusets to provide additional behaviour as
151 -desired.
152 -
153 -Multiple hierarchy support is provided to allow for situations where
154 -the division of tasks into cgroups is distinctly different for
155 -different subsystems - having parallel hierarchies allows each
156 -hierarchy to be a natural division of tasks, without having to handle
157 -complex combinations of tasks that would be present if several
158 -unrelated subsystems needed to be forced into the same tree of
159 -cgroups.
160 -
161 -At one extreme, each resource controller or subsystem could be in a
162 -separate hierarchy; at the other extreme, all subsystems
163 -would be attached to the same hierarchy.
164 -
165 -As an example of a scenario (originally proposed by vatsa@in.ibm.com)
166 -that can benefit from multiple hierarchies, consider a large
167 -university server with various users - students, professors, system
168 -tasks etc. The resource planning for this server could be along the
169 -following lines:
170 -
171 - CPU : Top cpuset
172 - / \
173 - CPUSet1 CPUSet2
174 - | |
175 - (Profs) (Students)
176 -
177 - In addition (system tasks) are attached to topcpuset (so
178 - that they can run anywhere) with a limit of 20%
179 -
180 - Memory : Professors (50%), students (30%), system (20%)
181 -
182 - Disk : Prof (50%), students (30%), system (20%)
183 -
184 - Network : WWW browsing (20%), Network File System (60%), others (20%)
185 - / \
186 - Prof (15%) students (5%)
187 -
188 -Browsers like firefox/lynx go into the WWW network class, while (k)nfsd go
189 -into NFS network class.
190 -
191 -At the same time firefox/lynx will share an appropriate CPU/Memory class
192 -depending on who launched it (prof/student).
193 -
194 -With the ability to classify tasks differently for different resources
195 -(by putting those resource subsystems in different hierarchies) then
196 -the admin can easily set up a script which receives exec notifications
197 -and depending on who is launching the browser he can
198 -
199 - # echo browser_pid > /mnt/<restype>/<userclass>/tasks
200 -
201 -With only a single hierarchy, he now would potentially have to create
202 -a separate cgroup for every browser launched and associate it with
203 -approp network and other resource class. This may lead to
204 -proliferation of such cgroups.
205 -
206 -Also lets say that the administrator would like to give enhanced network
207 -access temporarily to a student's browser (since it is night and the user
208 -wants to do online gaming :)) OR give one of the students simulation
209 -apps enhanced CPU power,
210 -
211 -With ability to write pids directly to resource classes, it's just a
212 -matter of :
213 -
214 - # echo pid > /mnt/network/<new_class>/tasks
215 - (after some time)
216 - # echo pid > /mnt/network/<orig_class>/tasks
217 -
218 -Without this ability, he would have to split the cgroup into
219 -multiple separate ones and then associate the new cgroups with the
220 -new resource classes.
221 -
222 -
223 -
224 -1.3 How are cgroups implemented ?
225 ----------------------------------
226 -
227 -Control Groups extends the kernel as follows:
228 -
229 - - Each task in the system has a reference-counted pointer to a
230 - css_set.
231 -
232 - - A css_set contains a set of reference-counted pointers to
233 - cgroup_subsys_state objects, one for each cgroup subsystem
234 - registered in the system. There is no direct link from a task to
235 - the cgroup of which it's a member in each hierarchy, but this
236 - can be determined by following pointers through the
237 - cgroup_subsys_state objects. This is because accessing the
238 - subsystem state is something that's expected to happen frequently
239 - and in performance-critical code, whereas operations that require a
240 - task's actual cgroup assignments (in particular, moving between
241 - cgroups) are less common. A linked list runs through the cg_list
242 - field of each task_struct using the css_set, anchored at
243 - css_set->tasks.
244 -
245 - - A cgroup hierarchy filesystem can be mounted for browsing and
246 - manipulation from user space.
247 -
248 - - You can list all the tasks (by pid) attached to any cgroup.
249 -
250 -The implementation of cgroups requires a few, simple hooks
251 -into the rest of the kernel, none in performance critical paths:
252 -
253 - - in init/main.c, to initialize the root cgroups and initial
254 - css_set at system boot.
255 -
256 - - in fork and exit, to attach and detach a task from its css_set.
257 -
258 -In addition a new file system, of type "cgroup" may be mounted, to
259 -enable browsing and modifying the cgroups presently known to the
260 -kernel. When mounting a cgroup hierarchy, you may specify a
261 -comma-separated list of subsystems to mount as the filesystem mount
262 -options. By default, mounting the cgroup filesystem attempts to
263 -mount a hierarchy containing all registered subsystems.
264 -
265 -If an active hierarchy with exactly the same set of subsystems already
266 -exists, it will be reused for the new mount. If no existing hierarchy
267 -matches, and any of the requested subsystems are in use in an existing
268 -hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
269 -is activated, associated with the requested subsystems.
270 -
271 -It's not currently possible to bind a new subsystem to an active
272 -cgroup hierarchy, or to unbind a subsystem from an active cgroup
273 -hierarchy. This may be possible in future, but is fraught with nasty
274 -error-recovery issues.
275 -
276 -When a cgroup filesystem is unmounted, if there are any
277 -child cgroups created below the top-level cgroup, that hierarchy
278 -will remain active even though unmounted; if there are no
279 -child cgroups then the hierarchy will be deactivated.
280 -
281 -No new system calls are added for cgroups - all support for
282 -querying and modifying cgroups is via this cgroup file system.
283 -
284 -Each task under /proc has an added file named 'cgroup' displaying,
285 -for each active hierarchy, the subsystem names and the cgroup name
286 -as the path relative to the root of the cgroup file system.
287 -
288 -Each cgroup is represented by a directory in the cgroup file system
289 -containing the following files describing that cgroup:
290 -
291 - - tasks: list of tasks (by pid) attached to that cgroup
292 - - releasable flag: cgroup currently removeable?
293 - - notify_on_release flag: run the release agent on exit?
294 - - release_agent: the path to use for release notifications (this file
295 - exists in the top cgroup only)
296 -
297 -Other subsystems such as cpusets may add additional files in each
298 -cgroup dir.
299 -
300 -New cgroups are created using the mkdir system call or shell
301 -command. The properties of a cgroup, such as its flags, are
302 -modified by writing to the appropriate file in that cgroups
303 -directory, as listed above.
304 -
305 -The named hierarchical structure of nested cgroups allows partitioning
306 -a large system into nested, dynamically changeable, "soft-partitions".
307 -
308 -The attachment of each task, automatically inherited at fork by any
309 -children of that task, to a cgroup allows organizing the work load
310 -on a system into related sets of tasks. A task may be re-attached to
311 -any other cgroup, if allowed by the permissions on the necessary
312 -cgroup file system directories.
313 -
314 -When a task is moved from one cgroup to another, it gets a new
315 -css_set pointer - if there's an already existing css_set with the
316 -desired collection of cgroups then that group is reused, else a new
317 -css_set is allocated. Note that the current implementation uses a
318 -linear search to locate an appropriate existing css_set, so isn't
319 -very efficient. A future version will use a hash table for better
320 -performance.
321 -
322 -To allow access from a cgroup to the css_sets (and hence tasks)
323 -that comprise it, a set of cg_cgroup_link objects form a lattice;
324 -each cg_cgroup_link is linked into a list of cg_cgroup_links for
325 -a single cgroup on its cgrp_link_list field, and a list of
326 -cg_cgroup_links for a single css_set on its cg_link_list.
327 -
328 -Thus the set of tasks in a cgroup can be listed by iterating over
329 -each css_set that references the cgroup, and sub-iterating over
330 -each css_set's task set.
331 -
332 -The use of a Linux virtual file system (vfs) to represent the
333 -cgroup hierarchy provides for a familiar permission and name space
334 -for cgroups, with a minimum of additional kernel code.
335 -
336 -1.4 What does notify_on_release do ?
337 -------------------------------------
338 -
339 -If the notify_on_release flag is enabled (1) in a cgroup, then
340 -whenever the last task in the cgroup leaves (exits or attaches to
341 -some other cgroup) and the last child cgroup of that cgroup
342 -is removed, then the kernel runs the command specified by the contents
343 -of the "release_agent" file in that hierarchy's root directory,
344 -supplying the pathname (relative to the mount point of the cgroup
345 -file system) of the abandoned cgroup. This enables automatic
346 -removal of abandoned cgroups. The default value of
347 -notify_on_release in the root cgroup at system boot is disabled
348 -(0). The default value of other cgroups at creation is the current
349 -value of their parents notify_on_release setting. The default value of
350 -a cgroup hierarchy's release_agent path is empty.
351 -
352 -1.5 How do I use cgroups ?
353 ---------------------------
354 -
355 -To start a new job that is to be contained within a cgroup, using
356 -the "cpuset" cgroup subsystem, the steps are something like:
357 -
358 - 1) mkdir /dev/cgroup
359 - 2) mount -t cgroup -ocpuset cpuset /dev/cgroup
360 - 3) Create the new cgroup by doing mkdir's and write's (or echo's) in
361 - the /dev/cgroup virtual file system.
362 - 4) Start a task that will be the "founding father" of the new job.
363 - 5) Attach that task to the new cgroup by writing its pid to the
364 - /dev/cgroup tasks file for that cgroup.
365 - 6) fork, exec or clone the job tasks from this founding father task.
366 -
367 -For example, the following sequence of commands will setup a cgroup
368 -named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
369 -and then start a subshell 'sh' in that cgroup:
370 -
371 - mount -t cgroup cpuset -ocpuset /dev/cgroup
372 - cd /dev/cgroup
373 - mkdir Charlie
374 - cd Charlie
375 - /bin/echo 2-3 > cpuset.cpus
376 - /bin/echo 1 > cpuset.mems
377 - /bin/echo $$ > tasks
378 - sh
379 - # The subshell 'sh' is now running in cgroup Charlie
380 - # The next line should display '/Charlie'
381 - cat /proc/self/cgroup
382 -
383 -2. Usage Examples and Syntax
384 -============================
385 -
386 -2.1 Basic Usage
387 ----------------
388 -
389 -Creating, modifying, using the cgroups can be done through the cgroup
390 -virtual filesystem.
391 -
392 -To mount a cgroup hierarchy will all available subsystems, type:
393 -# mount -t cgroup xxx /dev/cgroup
394 -
395 -The "xxx" is not interpreted by the cgroup code, but will appear in
396 -/proc/mounts so may be any useful identifying string that you like.
397 -
398 -To mount a cgroup hierarchy with just the cpuset and numtasks
399 -subsystems, type:
400 -# mount -t cgroup -o cpuset,numtasks hier1 /dev/cgroup
401 -
402 -To change the set of subsystems bound to a mounted hierarchy, just
403 -remount with different options:
404 -
405 -# mount -o remount,cpuset,ns /dev/cgroup
406 -
407 -Note that changing the set of subsystems is currently only supported
408 -when the hierarchy consists of a single (root) cgroup. Supporting
409 -the ability to arbitrarily bind/unbind subsystems from an existing
410 -cgroup hierarchy is intended to be implemented in the future.
411 -
412 -Then under /dev/cgroup you can find a tree that corresponds to the
413 -tree of the cgroups in the system. For instance, /dev/cgroup
414 -is the cgroup that holds the whole system.
415 -
416 -If you want to create a new cgroup under /dev/cgroup:
417 -# cd /dev/cgroup
418 -# mkdir my_cgroup
419 -
420 -Now you want to do something with this cgroup.
421 -# cd my_cgroup
422 -
423 -In this directory you can find several files:
424 -# ls
425 -notify_on_release releasable tasks
426 -(plus whatever files added by the attached subsystems)
427 -
428 -Now attach your shell to this cgroup:
429 -# /bin/echo $$ > tasks
430 -
431 -You can also create cgroups inside your cgroup by using mkdir in this
432 -directory.
433 -# mkdir my_sub_cs
434 -
435 -To remove a cgroup, just use rmdir:
436 -# rmdir my_sub_cs
437 -
438 -This will fail if the cgroup is in use (has cgroups inside, or
439 -has processes attached, or is held alive by other subsystem-specific
440 -reference).
441 -
442 -2.2 Attaching processes
443 ------------------------
444 -
445 -# /bin/echo PID > tasks
446 -
447 -Note that it is PID, not PIDs. You can only attach ONE task at a time.
448 -If you have several tasks to attach, you have to do it one after another:
449 -
450 -# /bin/echo PID1 > tasks
451 -# /bin/echo PID2 > tasks
452 - ...
453 -# /bin/echo PIDn > tasks
454 -
455 -You can attach the current shell task by echoing 0:
456 -
457 -# echo 0 > tasks
458 -
459 -3. Kernel API
460 -=============
461 -
462 -3.1 Overview
463 -------------
464 -
465 -Each kernel subsystem that wants to hook into the generic cgroup
466 -system needs to create a cgroup_subsys object. This contains
467 -various methods, which are callbacks from the cgroup system, along
468 -with a subsystem id which will be assigned by the cgroup system.
469 -
470 -Other fields in the cgroup_subsys object include:
471 -
472 -- subsys_id: a unique array index for the subsystem, indicating which
473 - entry in cgroup->subsys[] this subsystem should be managing.
474 -
475 -- name: should be initialized to a unique subsystem name. Should be
476 - no longer than MAX_CGROUP_TYPE_NAMELEN.
477 -
478 -- early_init: indicate if the subsystem needs early initialization
479 - at system boot.
480 -
481 -Each cgroup object created by the system has an array of pointers,
482 -indexed by subsystem id; this pointer is entirely managed by the
483 -subsystem; the generic cgroup code will never touch this pointer.
484 -
485 -3.2 Synchronization
486 --------------------
487 -
488 -There is a global mutex, cgroup_mutex, used by the cgroup
489 -system. This should be taken by anything that wants to modify a
490 -cgroup. It may also be taken to prevent cgroups from being
491 -modified, but more specific locks may be more appropriate in that
492 -situation.
493 -
494 -See kernel/cgroup.c for more details.
495 -
496 -Subsystems can take/release the cgroup_mutex via the functions
497 -cgroup_lock()/cgroup_unlock().
498 -
499 -Accessing a task's cgroup pointer may be done in the following ways:
500 -- while holding cgroup_mutex
501 -- while holding the task's alloc_lock (via task_lock())
502 -- inside an rcu_read_lock() section via rcu_dereference()
503 -
504 -3.3 Subsystem API
505 ------------------
506 -
507 -Each subsystem should:
508 -
509 -- add an entry in linux/cgroup_subsys.h
510 -- define a cgroup_subsys object called <name>_subsys
511 -
512 -Each subsystem may export the following methods. The only mandatory
513 -methods are create/destroy. Any others that are null are presumed to
514 -be successful no-ops.
515 -
516 -struct cgroup_subsys_state *create(struct cgroup_subsys *ss,
517 - struct cgroup *cgrp)
518 -(cgroup_mutex held by caller)
519 -
520 -Called to create a subsystem state object for a cgroup. The
521 -subsystem should allocate its subsystem state object for the passed
522 -cgroup, returning a pointer to the new object on success or a
523 -negative error code. On success, the subsystem pointer should point to
524 -a structure of type cgroup_subsys_state (typically embedded in a
525 -larger subsystem-specific object), which will be initialized by the
526 -cgroup system. Note that this will be called at initialization to
527 -create the root subsystem state for this subsystem; this case can be
528 -identified by the passed cgroup object having a NULL parent (since
529 -it's the root of the hierarchy) and may be an appropriate place for
530 -initialization code.
531 -
532 -void destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
533 -(cgroup_mutex held by caller)
534 -
535 -The cgroup system is about to destroy the passed cgroup; the subsystem
536 -should do any necessary cleanup and free its subsystem state
537 -object. By the time this method is called, the cgroup has already been
538 -unlinked from the file system and from the child list of its parent;
539 -cgroup->parent is still valid. (Note - can also be called for a
540 -newly-created cgroup if an error occurs after this subsystem's
541 -create() method has been called for the new cgroup).
542 -
543 -void pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
544 -(cgroup_mutex held by caller)
545 -
546 -Called before checking the reference count on each subsystem. This may
547 -be useful for subsystems which have some extra references even if
548 -there are not tasks in the cgroup.
549 -
550 -int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
551 - struct task_struct *task)
552 -(cgroup_mutex held by caller)
553 -
554 -Called prior to moving a task into a cgroup; if the subsystem
555 -returns an error, this will abort the attach operation. If a NULL
556 -task is passed, then a successful result indicates that *any*
557 -unspecified task can be moved into the cgroup. Note that this isn't
558 -called on a fork. If this method returns 0 (success) then this should
559 -remain valid while the caller holds cgroup_mutex.
560 -
561 -void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
562 - struct cgroup *old_cgrp, struct task_struct *task)
563 -
564 -Called after the task has been attached to the cgroup, to allow any
565 -post-attachment activity that requires memory allocations or blocking.
566 -
567 -void fork(struct cgroup_subsy *ss, struct task_struct *task)
568 -
569 -Called when a task is forked into a cgroup.
570 -
571 -void exit(struct cgroup_subsys *ss, struct task_struct *task)
572 -
573 -Called during task exit.
574 -
575 -int populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
576 -
577 -Called after creation of a cgroup to allow a subsystem to populate
578 -the cgroup directory with file entries. The subsystem should make
579 -calls to cgroup_add_file() with objects of type cftype (see
580 -include/linux/cgroup.h for details). Note that although this
581 -method can return an error code, the error code is currently not
582 -always handled well.
583 -
584 -void post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp)
585 -
586 -Called at the end of cgroup_clone() to do any paramater
587 -initialization which might be required before a task could attach. For
588 -example in cpusets, no task may attach before 'cpus' and 'mems' are set
589 -up.
590 -
591 -void bind(struct cgroup_subsys *ss, struct cgroup *root)
592 -(cgroup_mutex held by caller)
593 -
594 -Called when a cgroup subsystem is rebound to a different hierarchy
595 -and root cgroup. Currently this will only involve movement between
596 -the default hierarchy (which never has sub-cgroups) and a hierarchy
597 -that is being created/destroyed (and hence has no sub-cgroups).
598 -
599 -4. Questions
600 -============
601 -
602 -Q: what's up with this '/bin/echo' ?
603 -A: bash's builtin 'echo' command does not check calls to write() against
604 - errors. If you use it in the cgroup file system, you won't be
605 - able to tell whether a command succeeded or failed.
606 -
607 -Q: When I attach processes, only the first of the line gets really attached !
608 -A: We can only return one error code per call to write(). So you should also
609 - put only ONE pid.
610 -
611 --- /dev/null
612 +++ b/Documentation/cgroups/cgroups.txt
613 @@ -0,0 +1,548 @@
614 + CGROUPS
615 + -------
616 +
617 +Written by Paul Menage <menage@google.com> based on Documentation/cpusets.txt
618 +
619 +Original copyright statements from cpusets.txt:
620 +Portions Copyright (C) 2004 BULL SA.
621 +Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
622 +Modified by Paul Jackson <pj@sgi.com>
623 +Modified by Christoph Lameter <clameter@sgi.com>
624 +
625 +CONTENTS:
626 +=========
627 +
628 +1. Control Groups
629 + 1.1 What are cgroups ?
630 + 1.2 Why are cgroups needed ?
631 + 1.3 How are cgroups implemented ?
632 + 1.4 What does notify_on_release do ?
633 + 1.5 How do I use cgroups ?
634 +2. Usage Examples and Syntax
635 + 2.1 Basic Usage
636 + 2.2 Attaching processes
637 +3. Kernel API
638 + 3.1 Overview
639 + 3.2 Synchronization
640 + 3.3 Subsystem API
641 +4. Questions
642 +
643 +1. Control Groups
644 +=================
645 +
646 +1.1 What are cgroups ?
647 +----------------------
648 +
649 +Control Groups provide a mechanism for aggregating/partitioning sets of
650 +tasks, and all their future children, into hierarchical groups with
651 +specialized behaviour.
652 +
653 +Definitions:
654 +
655 +A *cgroup* associates a set of tasks with a set of parameters for one
656 +or more subsystems.
657 +
658 +A *subsystem* is a module that makes use of the task grouping
659 +facilities provided by cgroups to treat groups of tasks in
660 +particular ways. A subsystem is typically a "resource controller" that
661 +schedules a resource or applies per-cgroup limits, but it may be
662 +anything that wants to act on a group of processes, e.g. a
663 +virtualization subsystem.
664 +
665 +A *hierarchy* is a set of cgroups arranged in a tree, such that
666 +every task in the system is in exactly one of the cgroups in the
667 +hierarchy, and a set of subsystems; each subsystem has system-specific
668 +state attached to each cgroup in the hierarchy. Each hierarchy has
669 +an instance of the cgroup virtual filesystem associated with it.
670 +
671 +At any one time there may be multiple active hierarchies of task
672 +cgroups. Each hierarchy is a partition of all tasks in the system.
673 +
674 +User level code may create and destroy cgroups by name in an
675 +instance of the cgroup virtual file system, specify and query to
676 +which cgroup a task is assigned, and list the task pids assigned to
677 +a cgroup. Those creations and assignments only affect the hierarchy
678 +associated with that instance of the cgroup file system.
679 +
680 +On their own, the only use for cgroups is for simple job
681 +tracking. The intention is that other subsystems hook into the generic
682 +cgroup support to provide new attributes for cgroups, such as
683 +accounting/limiting the resources which processes in a cgroup can
684 +access. For example, cpusets (see Documentation/cpusets.txt) allows
685 +you to associate a set of CPUs and a set of memory nodes with the
686 +tasks in each cgroup.
687 +
688 +1.2 Why are cgroups needed ?
689 +----------------------------
690 +
691 +There are multiple efforts to provide process aggregations in the
692 +Linux kernel, mainly for resource tracking purposes. Such efforts
693 +include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
694 +namespaces. These all require the basic notion of a
695 +grouping/partitioning of processes, with newly forked processes ending
696 +in the same group (cgroup) as their parent process.
697 +
698 +The kernel cgroup patch provides the minimum essential kernel
699 +mechanisms required to efficiently implement such groups. It has
700 +minimal impact on the system fast paths, and provides hooks for
701 +specific subsystems such as cpusets to provide additional behaviour as
702 +desired.
703 +
704 +Multiple hierarchy support is provided to allow for situations where
705 +the division of tasks into cgroups is distinctly different for
706 +different subsystems - having parallel hierarchies allows each
707 +hierarchy to be a natural division of tasks, without having to handle
708 +complex combinations of tasks that would be present if several
709 +unrelated subsystems needed to be forced into the same tree of
710 +cgroups.
711 +
712 +At one extreme, each resource controller or subsystem could be in a
713 +separate hierarchy; at the other extreme, all subsystems
714 +would be attached to the same hierarchy.
715 +
716 +As an example of a scenario (originally proposed by vatsa@in.ibm.com)
717 +that can benefit from multiple hierarchies, consider a large
718 +university server with various users - students, professors, system
719 +tasks etc. The resource planning for this server could be along the
720 +following lines:
721 +
722 + CPU : Top cpuset
723 + / \
724 + CPUSet1 CPUSet2
725 + | |
726 + (Profs) (Students)
727 +
728 + In addition (system tasks) are attached to topcpuset (so
729 + that they can run anywhere) with a limit of 20%
730 +
731 + Memory : Professors (50%), students (30%), system (20%)
732 +
733 + Disk : Prof (50%), students (30%), system (20%)
734 +
735 + Network : WWW browsing (20%), Network File System (60%), others (20%)
736 + / \
737 + Prof (15%) students (5%)
738 +
739 +Browsers like firefox/lynx go into the WWW network class, while (k)nfsd go
740 +into NFS network class.
741 +
742 +At the same time firefox/lynx will share an appropriate CPU/Memory class
743 +depending on who launched it (prof/student).
744 +
745 +With the ability to classify tasks differently for different resources
746 +(by putting those resource subsystems in different hierarchies) then
747 +the admin can easily set up a script which receives exec notifications
748 +and depending on who is launching the browser he can
749 +
750 + # echo browser_pid > /mnt/<restype>/<userclass>/tasks
751 +
752 +With only a single hierarchy, he now would potentially have to create
753 +a separate cgroup for every browser launched and associate it with
754 +appropriate network and other resource class. This may lead to
755 +proliferation of such cgroups.
756 +
757 +Also let's say that the administrator would like to give enhanced network
758 +access temporarily to a student's browser (since it is night and the user
759 +wants to do online gaming :)) OR give one of the student's simulation
760 +apps enhanced CPU power.
761 +
762 +With ability to write pids directly to resource classes, it's just a
763 +matter of :
764 +
765 + # echo pid > /mnt/network/<new_class>/tasks
766 + (after some time)
767 + # echo pid > /mnt/network/<orig_class>/tasks
768 +
769 +Without this ability, he would have to split the cgroup into
770 +multiple separate ones and then associate the new cgroups with the
771 +new resource classes.
772 +
773 +
774 +
775 +1.3 How are cgroups implemented ?
776 +---------------------------------
777 +
778 +Control Groups extends the kernel as follows:
779 +
780 + - Each task in the system has a reference-counted pointer to a
781 + css_set.
782 +
783 + - A css_set contains a set of reference-counted pointers to
784 + cgroup_subsys_state objects, one for each cgroup subsystem
785 + registered in the system. There is no direct link from a task to
786 + the cgroup of which it's a member in each hierarchy, but this
787 + can be determined by following pointers through the
788 + cgroup_subsys_state objects. This is because accessing the
789 + subsystem state is something that's expected to happen frequently
790 + and in performance-critical code, whereas operations that require a
791 + task's actual cgroup assignments (in particular, moving between
792 + cgroups) are less common. A linked list runs through the cg_list
793 + field of each task_struct using the css_set, anchored at
794 + css_set->tasks.
795 +
796 + - A cgroup hierarchy filesystem can be mounted for browsing and
797 + manipulation from user space.
798 +
799 + - You can list all the tasks (by pid) attached to any cgroup.
800 +
801 +The implementation of cgroups requires a few, simple hooks
802 +into the rest of the kernel, none in performance critical paths:
803 +
804 + - in init/main.c, to initialize the root cgroups and initial
805 + css_set at system boot.
806 +
807 + - in fork and exit, to attach and detach a task from its css_set.
808 +
809 +In addition a new file system, of type "cgroup" may be mounted, to
810 +enable browsing and modifying the cgroups presently known to the
811 +kernel. When mounting a cgroup hierarchy, you may specify a
812 +comma-separated list of subsystems to mount as the filesystem mount
813 +options. By default, mounting the cgroup filesystem attempts to
814 +mount a hierarchy containing all registered subsystems.
815 +
816 +If an active hierarchy with exactly the same set of subsystems already
817 +exists, it will be reused for the new mount. If no existing hierarchy
818 +matches, and any of the requested subsystems are in use in an existing
819 +hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
820 +is activated, associated with the requested subsystems.
821 +
822 +It's not currently possible to bind a new subsystem to an active
823 +cgroup hierarchy, or to unbind a subsystem from an active cgroup
824 +hierarchy. This may be possible in future, but is fraught with nasty
825 +error-recovery issues.
826 +
827 +When a cgroup filesystem is unmounted, if there are any
828 +child cgroups created below the top-level cgroup, that hierarchy
829 +will remain active even though unmounted; if there are no
830 +child cgroups then the hierarchy will be deactivated.
831 +
832 +No new system calls are added for cgroups - all support for
833 +querying and modifying cgroups is via this cgroup file system.
834 +
835 +Each task under /proc has an added file named 'cgroup' displaying,
836 +for each active hierarchy, the subsystem names and the cgroup name
837 +as the path relative to the root of the cgroup file system.
838 +
839 +Each cgroup is represented by a directory in the cgroup file system
840 +containing the following files describing that cgroup:
841 +
842 + - tasks: list of tasks (by pid) attached to that cgroup
843 + - releasable flag: cgroup currently removable?
844 + - notify_on_release flag: run the release agent on exit?
845 + - release_agent: the path to use for release notifications (this file
846 + exists in the top cgroup only)
847 +
848 +Other subsystems such as cpusets may add additional files in each
849 +cgroup dir.
850 +
851 +New cgroups are created using the mkdir system call or shell
852 +command. The properties of a cgroup, such as its flags, are
853 +modified by writing to the appropriate file in that cgroup's
854 +directory, as listed above.
855 +
856 +The named hierarchical structure of nested cgroups allows partitioning
857 +a large system into nested, dynamically changeable, "soft-partitions".
858 +
859 +The attachment of each task, automatically inherited at fork by any
860 +children of that task, to a cgroup allows organizing the work load
861 +on a system into related sets of tasks. A task may be re-attached to
862 +any other cgroup, if allowed by the permissions on the necessary
863 +cgroup file system directories.
864 +
865 +When a task is moved from one cgroup to another, it gets a new
866 +css_set pointer - if there's an already existing css_set with the
867 +desired collection of cgroups then that group is reused, else a new
868 +css_set is allocated. Note that the current implementation uses a
869 +linear search to locate an appropriate existing css_set, so isn't
870 +very efficient. A future version will use a hash table for better
871 +performance.
872 +
873 +To allow access from a cgroup to the css_sets (and hence tasks)
874 +that comprise it, a set of cg_cgroup_link objects form a lattice;
875 +each cg_cgroup_link is linked into a list of cg_cgroup_links for
876 +a single cgroup on its cgrp_link_list field, and a list of
877 +cg_cgroup_links for a single css_set on its cg_link_list.
878 +
879 +Thus the set of tasks in a cgroup can be listed by iterating over
880 +each css_set that references the cgroup, and sub-iterating over
881 +each css_set's task set.
882 +
883 +The use of a Linux virtual file system (vfs) to represent the
884 +cgroup hierarchy provides for a familiar permission and name space
885 +for cgroups, with a minimum of additional kernel code.
886 +
887 +1.4 What does notify_on_release do ?
888 +------------------------------------
889 +
890 +If the notify_on_release flag is enabled (1) in a cgroup, then
891 +whenever the last task in the cgroup leaves (exits or attaches to
892 +some other cgroup) and the last child cgroup of that cgroup
893 +is removed, then the kernel runs the command specified by the contents
894 +of the "release_agent" file in that hierarchy's root directory,
895 +supplying the pathname (relative to the mount point of the cgroup
896 +file system) of the abandoned cgroup. This enables automatic
897 +removal of abandoned cgroups. The default value of
898 +notify_on_release in the root cgroup at system boot is disabled
899 +(0). The default value of other cgroups at creation is the current
900 +value of their parent's notify_on_release setting. The default value of
901 +a cgroup hierarchy's release_agent path is empty.
902 +
903 +1.5 How do I use cgroups ?
904 +--------------------------
905 +
906 +To start a new job that is to be contained within a cgroup, using
907 +the "cpuset" cgroup subsystem, the steps are something like:
908 +
909 + 1) mkdir /dev/cgroup
910 + 2) mount -t cgroup -ocpuset cpuset /dev/cgroup
911 + 3) Create the new cgroup by doing mkdir's and write's (or echo's) in
912 + the /dev/cgroup virtual file system.
913 + 4) Start a task that will be the "founding father" of the new job.
914 + 5) Attach that task to the new cgroup by writing its pid to the
915 + /dev/cgroup tasks file for that cgroup.
916 + 6) fork, exec or clone the job tasks from this founding father task.
917 +
918 +For example, the following sequence of commands will set up a cgroup
919 +named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
920 +and then start a subshell 'sh' in that cgroup:
921 +
922 + mount -t cgroup cpuset -ocpuset /dev/cgroup
923 + cd /dev/cgroup
924 + mkdir Charlie
925 + cd Charlie
926 + /bin/echo 2-3 > cpuset.cpus
927 + /bin/echo 1 > cpuset.mems
928 + /bin/echo $$ > tasks
929 + sh
930 + # The subshell 'sh' is now running in cgroup Charlie
931 + # The next line should display '/Charlie'
932 + cat /proc/self/cgroup
933 +
934 +2. Usage Examples and Syntax
935 +============================
936 +
937 +2.1 Basic Usage
938 +---------------
939 +
940 +Creating, modifying, using the cgroups can be done through the cgroup
941 +virtual filesystem.
942 +
943 +To mount a cgroup hierarchy with all available subsystems, type:
944 +# mount -t cgroup xxx /dev/cgroup
945 +
946 +The "xxx" is not interpreted by the cgroup code, but will appear in
947 +/proc/mounts so may be any useful identifying string that you like.
948 +
949 +To mount a cgroup hierarchy with just the cpuset and numtasks
950 +subsystems, type:
951 +# mount -t cgroup -o cpuset,numtasks hier1 /dev/cgroup
952 +
953 +To change the set of subsystems bound to a mounted hierarchy, just
954 +remount with different options:
955 +
956 +# mount -o remount,cpuset,ns /dev/cgroup
957 +
958 +Note that changing the set of subsystems is currently only supported
959 +when the hierarchy consists of a single (root) cgroup. Supporting
960 +the ability to arbitrarily bind/unbind subsystems from an existing
961 +cgroup hierarchy is intended to be implemented in the future.
962 +
963 +Then under /dev/cgroup you can find a tree that corresponds to the
964 +tree of the cgroups in the system. For instance, /dev/cgroup
965 +is the cgroup that holds the whole system.
966 +
967 +If you want to create a new cgroup under /dev/cgroup:
968 +# cd /dev/cgroup
969 +# mkdir my_cgroup
970 +
971 +Now you want to do something with this cgroup.
972 +# cd my_cgroup
973 +
974 +In this directory you can find several files:
975 +# ls
976 +notify_on_release releasable tasks
977 +(plus whatever files are added by the attached subsystems)
978 +
979 +Now attach your shell to this cgroup:
980 +# /bin/echo $$ > tasks
981 +
982 +You can also create cgroups inside your cgroup by using mkdir in this
983 +directory.
984 +# mkdir my_sub_cs
985 +
986 +To remove a cgroup, just use rmdir:
987 +# rmdir my_sub_cs
988 +
989 +This will fail if the cgroup is in use (has cgroups inside, or
990 +has processes attached, or is held alive by other subsystem-specific
991 +reference).
992 +
993 +2.2 Attaching processes
994 +-----------------------
995 +
996 +# /bin/echo PID > tasks
997 +
998 +Note that it is PID, not PIDs. You can only attach ONE task at a time.
999 +If you have several tasks to attach, you have to do it one after another:
1000 +
1001 +# /bin/echo PID1 > tasks
1002 +# /bin/echo PID2 > tasks
1003 + ...
1004 +# /bin/echo PIDn > tasks
1005 +
1006 +You can attach the current shell task by echoing 0:
1007 +
1008 +# echo 0 > tasks
1009 +
1010 +3. Kernel API
1011 +=============
1012 +
1013 +3.1 Overview
1014 +------------
1015 +
1016 +Each kernel subsystem that wants to hook into the generic cgroup
1017 +system needs to create a cgroup_subsys object. This contains
1018 +various methods, which are callbacks from the cgroup system, along
1019 +with a subsystem id which will be assigned by the cgroup system.
1020 +
1021 +Other fields in the cgroup_subsys object include:
1022 +
1023 +- subsys_id: a unique array index for the subsystem, indicating which
1024 + entry in cgroup->subsys[] this subsystem should be managing.
1025 +
1026 +- name: should be initialized to a unique subsystem name. Should be
1027 + no longer than MAX_CGROUP_TYPE_NAMELEN.
1028 +
1029 +- early_init: indicate if the subsystem needs early initialization
1030 + at system boot.
1031 +
1032 +Each cgroup object created by the system has an array of pointers,
1033 +indexed by subsystem id; this pointer is entirely managed by the
1034 +subsystem; the generic cgroup code will never touch this pointer.
1035 +
1036 +3.2 Synchronization
1037 +-------------------
1038 +
1039 +There is a global mutex, cgroup_mutex, used by the cgroup
1040 +system. This should be taken by anything that wants to modify a
1041 +cgroup. It may also be taken to prevent cgroups from being
1042 +modified, but more specific locks may be more appropriate in that
1043 +situation.
1044 +
1045 +See kernel/cgroup.c for more details.
1046 +
1047 +Subsystems can take/release the cgroup_mutex via the functions
1048 +cgroup_lock()/cgroup_unlock().
1049 +
1050 +Accessing a task's cgroup pointer may be done in the following ways:
1051 +- while holding cgroup_mutex
1052 +- while holding the task's alloc_lock (via task_lock())
1053 +- inside an rcu_read_lock() section via rcu_dereference()
1054 +
1055 +3.3 Subsystem API
1056 +-----------------
1057 +
1058 +Each subsystem should:
1059 +
1060 +- add an entry in linux/cgroup_subsys.h
1061 +- define a cgroup_subsys object called <name>_subsys
1062 +
1063 +Each subsystem may export the following methods. The only mandatory
1064 +methods are create/destroy. Any others that are null are presumed to
1065 +be successful no-ops.
1066 +
1067 +struct cgroup_subsys_state *create(struct cgroup_subsys *ss,
1068 + struct cgroup *cgrp)
1069 +(cgroup_mutex held by caller)
1070 +
1071 +Called to create a subsystem state object for a cgroup. The
1072 +subsystem should allocate its subsystem state object for the passed
1073 +cgroup, returning a pointer to the new object on success or a
1074 +negative error code. On success, the subsystem pointer should point to
1075 +a structure of type cgroup_subsys_state (typically embedded in a
1076 +larger subsystem-specific object), which will be initialized by the
1077 +cgroup system. Note that this will be called at initialization to
1078 +create the root subsystem state for this subsystem; this case can be
1079 +identified by the passed cgroup object having a NULL parent (since
1080 +it's the root of the hierarchy) and may be an appropriate place for
1081 +initialization code.
1082 +
1083 +void destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
1084 +(cgroup_mutex held by caller)
1085 +
1086 +The cgroup system is about to destroy the passed cgroup; the subsystem
1087 +should do any necessary cleanup and free its subsystem state
1088 +object. By the time this method is called, the cgroup has already been
1089 +unlinked from the file system and from the child list of its parent;
1090 +cgroup->parent is still valid. (Note - can also be called for a
1091 +newly-created cgroup if an error occurs after this subsystem's
1092 +create() method has been called for the new cgroup).
1093 +
1094 +void pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
1095 +(cgroup_mutex held by caller)
1096 +
1097 +Called before checking the reference count on each subsystem. This may
1098 +be useful for subsystems which have some extra references even if
1099 +there are no tasks in the cgroup.
1100 +
1101 +int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1102 + struct task_struct *task)
1103 +(cgroup_mutex held by caller)
1104 +
1105 +Called prior to moving a task into a cgroup; if the subsystem
1106 +returns an error, this will abort the attach operation. If a NULL
1107 +task is passed, then a successful result indicates that *any*
1108 +unspecified task can be moved into the cgroup. Note that this isn't
1109 +called on a fork. If this method returns 0 (success) then this should
1110 +remain valid while the caller holds cgroup_mutex.
1111 +
1112 +void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1113 + struct cgroup *old_cgrp, struct task_struct *task)
1114 +
1115 +Called after the task has been attached to the cgroup, to allow any
1116 +post-attachment activity that requires memory allocations or blocking.
1117 +
1118 +void fork(struct cgroup_subsys *ss, struct task_struct *task)
1119 +
1120 +Called when a task is forked into a cgroup.
1121 +
1122 +void exit(struct cgroup_subsys *ss, struct task_struct *task)
1123 +
1124 +Called during task exit.
1125 +
1126 +int populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
1127 +
1128 +Called after creation of a cgroup to allow a subsystem to populate
1129 +the cgroup directory with file entries. The subsystem should make
1130 +calls to cgroup_add_file() with objects of type cftype (see
1131 +include/linux/cgroup.h for details). Note that although this
1132 +method can return an error code, the error code is currently not
1133 +always handled well.
1134 +
1135 +void post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp)
1136 +
1137 +Called at the end of cgroup_clone() to do any parameter
1138 +initialization which might be required before a task could attach. For
1139 +example in cpusets, no task may attach before 'cpus' and 'mems' are set
1140 +up.
1141 +
1142 +void bind(struct cgroup_subsys *ss, struct cgroup *root)
1143 +(cgroup_mutex held by caller)
1144 +
1145 +Called when a cgroup subsystem is rebound to a different hierarchy
1146 +and root cgroup. Currently this will only involve movement between
1147 +the default hierarchy (which never has sub-cgroups) and a hierarchy
1148 +that is being created/destroyed (and hence has no sub-cgroups).
1149 +
1150 +4. Questions
1151 +============
1152 +
1153 +Q: what's up with this '/bin/echo' ?
1154 +A: bash's builtin 'echo' command does not check calls to write() against
1155 + errors. If you use it in the cgroup file system, you won't be
1156 + able to tell whether a command succeeded or failed.
1157 +
1158 +Q: When I attach processes, only the first of the line gets really attached !
1159 +A: We can only return one error code per call to write(). So you should also
1160 + put only ONE pid.
1161 +
1162 --- /dev/null
1163 +++ b/Documentation/cgroups/freezer-subsystem.txt
1164 @@ -0,0 +1,102 @@
1165 +The cgroup freezer is useful to batch job management systems which start
1166 +and stop sets of tasks in order to schedule the resources of a machine
1167 +according to the desires of a system administrator. This sort of program
1168 +is often used on HPC clusters to schedule access to the cluster as a
1169 +whole. The cgroup freezer uses cgroups to describe the set of tasks to
1170 +be started/stopped by the batch job management system. It also provides
1171 +a means to start and stop the tasks composing the job.
1172 +
1173 +The cgroup freezer will also be useful for checkpointing running groups
1174 +of tasks. The freezer allows the checkpoint code to obtain a consistent
1175 +image of the tasks by attempting to force the tasks in a cgroup into a
1176 +quiescent state. Once the tasks are quiescent another task can
1177 +walk /proc or invoke a kernel interface to gather information about the
1178 +quiesced tasks. Checkpointed tasks can be restarted later should a
1179 +recoverable error occur. This also allows the checkpointed tasks to be
1180 +migrated between nodes in a cluster by copying the gathered information
1181 +to another node and restarting the tasks there.
1182 +
1183 +Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping
1184 +and resuming tasks in userspace. Both of these signals are observable
1185 +from within the tasks we wish to freeze. While SIGSTOP cannot be caught,
1186 +blocked, or ignored it can be seen by waiting or ptracing parent tasks.
1187 +SIGCONT is especially unsuitable since it can be caught by the task. Any
1188 +programs designed to watch for SIGSTOP and SIGCONT could be broken by
1189 +attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can
1190 +demonstrate this problem using nested bash shells:
1191 +
1192 + $ echo $$
1193 + 16644
1194 + $ bash
1195 + $ echo $$
1196 + 16690
1197 +
1198 + From a second, unrelated bash shell:
1199 + $ kill -SIGSTOP 16690
1200 + $ kill -SIGCONT 16690
1201 +
1202 + <at this point 16690 exits and causes 16644 to exit too>
1203 +
1204 +This happens because bash can observe both signals and choose how it
1205 +responds to them.
1206 +
1207 +Another example of a program which catches and responds to these
1208 +signals is gdb. In fact any program designed to use ptrace is likely to
1209 +have a problem with this method of stopping and resuming tasks.
1210 +
1211 +In contrast, the cgroup freezer uses the kernel freezer code to
1212 +prevent the freeze/unfreeze cycle from becoming visible to the tasks
1213 +being frozen. This allows the bash example above and gdb to run as
1214 +expected.
1215 +
1216 +The freezer subsystem in the container filesystem defines a file named
1217 +freezer.state. Writing "FROZEN" to the state file will freeze all tasks in the
1218 +cgroup. Subsequently writing "THAWED" will unfreeze the tasks in the cgroup.
1219 +Reading will return the current state.
1220 +
1221 +Note freezer.state doesn't exist in root cgroup, which means root cgroup
1222 +is non-freezable.
1223 +
1224 +* Examples of usage :
1225 +
1226 + # mkdir /containers
1227 + # mount -t cgroup -ofreezer freezer /containers
1228 + # mkdir /containers/0
1229 + # echo $some_pid > /containers/0/tasks
1230 +
1231 +to get status of the freezer subsystem :
1232 +
1233 + # cat /containers/0/freezer.state
1234 + THAWED
1235 +
1236 +to freeze all tasks in the container :
1237 +
1238 + # echo FROZEN > /containers/0/freezer.state
1239 + # cat /containers/0/freezer.state
1240 + FREEZING
1241 + # cat /containers/0/freezer.state
1242 + FROZEN
1243 +
1244 +to unfreeze all tasks in the container :
1245 +
1246 + # echo THAWED > /containers/0/freezer.state
1247 + # cat /containers/0/freezer.state
1248 + THAWED
1249 +
1250 +This is the basic mechanism which should do the right thing for user space tasks
1251 +in a simple scenario.
1252 +
1253 +It's important to note that freezing can be incomplete. In that case we return
1254 +EBUSY. This means that some tasks in the cgroup are busy doing something that
1255 +prevents us from completely freezing the cgroup at this time. After EBUSY,
1256 +the cgroup will remain partially frozen -- reflected by freezer.state reporting
1257 +"FREEZING" when read. The state will remain "FREEZING" until one of these
1258 +things happens:
1259 +
1260 + 1) Userspace cancels the freezing operation by writing "THAWED" to
1261 + the freezer.state file
1262 + 2) Userspace retries the freezing operation by writing "FROZEN" to
1263 + the freezer.state file (writing "FREEZING" is not legal
1264 + and returns EINVAL)
1265 + 3) The tasks that blocked the cgroup from entering the "FROZEN"
1266 + state disappear from the cgroup's set of tasks.
1267 --- a/Documentation/cpusets.txt
1268 +++ b/Documentation/cpusets.txt
1269 @@ -48,7 +48,7 @@ hooks, beyond what is already present, r
1270 job placement on large systems.
1271
1272 Cpusets use the generic cgroup subsystem described in
1273 -Documentation/cgroup.txt.
1274 +Documentation/cgroups/cgroups.txt.
1275
1276 Requests by a task, using the sched_setaffinity(2) system call to
1277 include CPUs in its CPU affinity mask, and using the mbind(2) and
1278 --- a/arch/alpha/Kconfig
1279 +++ b/arch/alpha/Kconfig
1280 @@ -72,6 +72,7 @@ config ARCH_SUPPORTS_AOUT
1281 def_bool y
1282
1283 source "init/Kconfig"
1284 +source "kernel/Kconfig.freezer"
1285
1286
1287 menu "System setup"
1288 --- a/arch/alpha/include/asm/thread_info.h
1289 +++ b/arch/alpha/include/asm/thread_info.h
1290 @@ -74,12 +74,14 @@ register struct thread_info *__current_t
1291 #define TIF_UAC_SIGBUS 7
1292 #define TIF_MEMDIE 8
1293 #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal */
1294 +#define TIF_FREEZE 16 /* is freezing for suspend */
1295
1296 #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
1297 #define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
1298 #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED)
1299 #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG)
1300 #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK)
1301 +#define _TIF_FREEZE (1<<TIF_FREEZE)
1302
1303 /* Work to do on interrupt/exception return. */
1304 #define _TIF_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED)
1305 --- a/arch/arm/Kconfig
1306 +++ b/arch/arm/Kconfig
1307 @@ -190,6 +190,8 @@ config VECTORS_BASE
1308
1309 source "init/Kconfig"
1310
1311 +source "kernel/Kconfig.freezer"
1312 +
1313 menu "System Type"
1314
1315 choice
1316 --- a/arch/avr32/Kconfig
1317 +++ b/arch/avr32/Kconfig
1318 @@ -72,6 +72,8 @@ config GENERIC_BUG
1319
1320 source "init/Kconfig"
1321
1322 +source "kernel/Kconfig.freezer"
1323 +
1324 menu "System Type and features"
1325
1326 source "kernel/time/Kconfig"
1327 --- a/arch/avr32/include/asm/thread_info.h
1328 +++ b/arch/avr32/include/asm/thread_info.h
1329 @@ -96,6 +96,7 @@ static inline struct thread_info *curren
1330 #define _TIF_MEMDIE (1 << TIF_MEMDIE)
1331 #define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK)
1332 #define _TIF_CPU_GOING_TO_SLEEP (1 << TIF_CPU_GOING_TO_SLEEP)
1333 +#define _TIF_FREEZE (1 << TIF_FREEZE)
1334
1335 /* Note: The masks below must never span more than 16 bits! */
1336
1337 --- a/arch/blackfin/Kconfig
1338 +++ b/arch/blackfin/Kconfig
1339 @@ -64,8 +64,11 @@ config HARDWARE_PM
1340 depends on OPROFILE
1341
1342 source "init/Kconfig"
1343 +
1344 source "kernel/Kconfig.preempt"
1345
1346 +source "kernel/Kconfig.freezer"
1347 +
1348 menu "Blackfin Processor Options"
1349
1350 comment "Processor and Board Settings"
1351 --- a/arch/cris/Kconfig
1352 +++ b/arch/cris/Kconfig
1353 @@ -62,6 +62,8 @@ config HZ
1354
1355 source "init/Kconfig"
1356
1357 +source "kernel/Kconfig.freezer"
1358 +
1359 menu "General setup"
1360
1361 source "fs/Kconfig.binfmt"
1362 --- a/arch/frv/Kconfig
1363 +++ b/arch/frv/Kconfig
1364 @@ -66,6 +66,8 @@ mainmenu "Fujitsu FR-V Kernel Configurat
1365
1366 source "init/Kconfig"
1367
1368 +source "kernel/Kconfig.freezer"
1369 +
1370
1371 menu "Fujitsu FR-V system setup"
1372
1373 --- a/arch/h8300/Kconfig
1374 +++ b/arch/h8300/Kconfig
1375 @@ -89,6 +89,8 @@ config HZ
1376
1377 source "init/Kconfig"
1378
1379 +source "kernel/Kconfig.freezer"
1380 +
1381 source "arch/h8300/Kconfig.cpu"
1382
1383 menu "Executable file formats"
1384 --- a/arch/h8300/include/asm/thread_info.h
1385 +++ b/arch/h8300/include/asm/thread_info.h
1386 @@ -89,6 +89,7 @@ static inline struct thread_info *curren
1387 TIF_NEED_RESCHED */
1388 #define TIF_MEMDIE 4
1389 #define TIF_RESTORE_SIGMASK 5 /* restore signal mask in do_signal() */
1390 +#define TIF_FREEZE 16 /* is freezing for suspend */
1391
1392 /* as above, but as bit values */
1393 #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
1394 @@ -96,6 +97,7 @@ static inline struct thread_info *curren
1395 #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED)
1396 #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG)
1397 #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK)
1398 +#define _TIF_FREEZE (1<<TIF_FREEZE)
1399
1400 #define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */
1401
1402 --- a/arch/ia64/Kconfig
1403 +++ b/arch/ia64/Kconfig
1404 @@ -7,6 +7,8 @@ mainmenu "IA-64 Linux Kernel Configurati
1405
1406 source "init/Kconfig"
1407
1408 +source "kernel/Kconfig.freezer"
1409 +
1410 menu "Processor type and features"
1411
1412 config IA64
1413 --- a/arch/m32r/Kconfig
1414 +++ b/arch/m32r/Kconfig
1415 @@ -45,6 +45,8 @@ config HZ
1416
1417 source "init/Kconfig"
1418
1419 +source "kernel/Kconfig.freezer"
1420 +
1421
1422 menu "Processor type and features"
1423
1424 --- a/arch/m68k/Kconfig
1425 +++ b/arch/m68k/Kconfig
1426 @@ -64,6 +64,8 @@ mainmenu "Linux/68k Kernel Configuration
1427
1428 source "init/Kconfig"
1429
1430 +source "kernel/Kconfig.freezer"
1431 +
1432 menu "Platform dependent setup"
1433
1434 config EISA
1435 --- a/arch/m68knommu/Kconfig
1436 +++ b/arch/m68knommu/Kconfig
1437 @@ -82,6 +82,8 @@ config ARCH_SUPPORTS_AOUT
1438
1439 source "init/Kconfig"
1440
1441 +source "kernel/Kconfig.freezer"
1442 +
1443 menu "Processor type and features"
1444
1445 choice
1446 --- a/arch/m68knommu/include/asm/thread_info.h
1447 +++ b/arch/m68knommu/include/asm/thread_info.h
1448 @@ -84,12 +84,14 @@ static inline struct thread_info *curren
1449 #define TIF_POLLING_NRFLAG 3 /* true if poll_idle() is polling
1450 TIF_NEED_RESCHED */
1451 #define TIF_MEMDIE 4
1452 +#define TIF_FREEZE 16 /* is freezing for suspend */
1453
1454 /* as above, but as bit values */
1455 #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
1456 #define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
1457 #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED)
1458 #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG)
1459 +#define _TIF_FREEZE (1<<TIF_FREEZE)
1460
1461 #define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */
1462
1463 --- a/arch/mips/Kconfig
1464 +++ b/arch/mips/Kconfig
1465 @@ -1885,6 +1885,8 @@ config PROBE_INITRD_HEADER
1466 add initrd or initramfs image to the kernel image.
1467 Otherwise, say N.
1468
1469 +source "kernel/Kconfig.freezer"
1470 +
1471 menu "Bus options (PCI, PCMCIA, EISA, ISA, TC)"
1472
1473 config HW_HAS_EISA
1474 --- a/arch/mn10300/Kconfig
1475 +++ b/arch/mn10300/Kconfig
1476 @@ -71,6 +71,8 @@ mainmenu "Matsushita MN10300/AM33 Kernel
1477
1478 source "init/Kconfig"
1479
1480 +source "kernel/Kconfig.freezer"
1481 +
1482
1483 menu "Matsushita MN10300 system setup"
1484
1485 --- a/arch/parisc/Kconfig
1486 +++ b/arch/parisc/Kconfig
1487 @@ -93,6 +93,8 @@ config ARCH_MAY_HAVE_PC_FDC
1488
1489 source "init/Kconfig"
1490
1491 +source "kernel/Kconfig.freezer"
1492 +
1493
1494 menu "Processor type and features"
1495
1496 --- a/arch/powerpc/Kconfig
1497 +++ b/arch/powerpc/Kconfig
1498 @@ -228,6 +228,8 @@ config PPC_OF_PLATFORM_PCI
1499
1500 source "init/Kconfig"
1501
1502 +source "kernel/Kconfig.freezer"
1503 +
1504 source "arch/powerpc/sysdev/Kconfig"
1505 source "arch/powerpc/platforms/Kconfig"
1506
1507 --- a/arch/s390/Kconfig
1508 +++ b/arch/s390/Kconfig
1509 @@ -79,6 +79,8 @@ config S390
1510
1511 source "init/Kconfig"
1512
1513 +source "kernel/Kconfig.freezer"
1514 +
1515 menu "Base setup"
1516
1517 comment "Processor type and features"
1518 --- a/arch/s390/include/asm/thread_info.h
1519 +++ b/arch/s390/include/asm/thread_info.h
1520 @@ -98,6 +98,7 @@ static inline struct thread_info *curren
1521 #define TIF_31BIT 18 /* 32bit process */
1522 #define TIF_MEMDIE 19
1523 #define TIF_RESTORE_SIGMASK 20 /* restore signal mask in do_signal() */
1524 +#define TIF_FREEZE 21
1525
1526 #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
1527 #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK)
1528 @@ -110,6 +111,7 @@ static inline struct thread_info *curren
1529 #define _TIF_USEDFPU (1<<TIF_USEDFPU)
1530 #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG)
1531 #define _TIF_31BIT (1<<TIF_31BIT)
1532 +#define _TIF_FREEZE (1<<TIF_FREEZE)
1533
1534 #endif /* __KERNEL__ */
1535
1536 --- a/arch/sh/Kconfig
1537 +++ b/arch/sh/Kconfig
1538 @@ -106,6 +106,8 @@ config IO_TRAPPED
1539
1540 source "init/Kconfig"
1541
1542 +source "kernel/Kconfig.freezer"
1543 +
1544 menu "System type"
1545
1546 #
1547 --- a/arch/sparc/Kconfig
1548 +++ b/arch/sparc/Kconfig
1549 @@ -32,6 +32,8 @@ config HZ
1550
1551 source "init/Kconfig"
1552
1553 +source "kernel/Kconfig.freezer"
1554 +
1555 menu "General machine setup"
1556
1557 config SMP
1558 --- a/arch/sparc/include/asm/thread_info_32.h
1559 +++ b/arch/sparc/include/asm/thread_info_32.h
1560 @@ -139,6 +139,7 @@ BTFIXUPDEF_CALL(void, free_thread_info,
1561 #define TIF_POLLING_NRFLAG 9 /* true if poll_idle() is polling
1562 * TIF_NEED_RESCHED */
1563 #define TIF_MEMDIE 10
1564 +#define TIF_FREEZE 11 /* is freezing for suspend */
1565
1566 /* as above, but as bit values */
1567 #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
1568 @@ -152,6 +153,7 @@ BTFIXUPDEF_CALL(void, free_thread_info,
1569 #define _TIF_DO_NOTIFY_RESUME_MASK (_TIF_NOTIFY_RESUME | \
1570 _TIF_SIGPENDING | \
1571 _TIF_RESTORE_SIGMASK)
1572 +#define _TIF_FREEZE (1<<TIF_FREEZE)
1573
1574 #endif /* __KERNEL__ */
1575
1576 --- a/arch/sparc64/Kconfig
1577 +++ b/arch/sparc64/Kconfig
1578 @@ -85,6 +85,7 @@ config GENERIC_HARDIRQS_NO__DO_IRQ
1579 def_bool y
1580
1581 source "init/Kconfig"
1582 +source "kernel/Kconfig.freezer"
1583
1584 menu "Processor type and features"
1585
1586 --- a/arch/um/Kconfig
1587 +++ b/arch/um/Kconfig
1588 @@ -229,6 +229,8 @@ endmenu
1589
1590 source "init/Kconfig"
1591
1592 +source "kernel/Kconfig.freezer"
1593 +
1594 source "drivers/block/Kconfig"
1595
1596 source "arch/um/Kconfig.char"
1597 --- a/arch/x86/Kconfig
1598 +++ b/arch/x86/Kconfig
1599 @@ -208,6 +208,7 @@ config X86_TRAMPOLINE
1600 config KTIME_SCALAR
1601 def_bool X86_32
1602 source "init/Kconfig"
1603 +source "kernel/Kconfig.freezer"
1604
1605 menu "Processor type and features"
1606
1607 --- a/arch/xtensa/Kconfig
1608 +++ b/arch/xtensa/Kconfig
1609 @@ -55,6 +55,7 @@ config HZ
1610 default 100
1611
1612 source "init/Kconfig"
1613 +source "kernel/Kconfig.freezer"
1614
1615 menu "Processor type and features"
1616
1617 --- a/include/asm-cris/thread_info.h
1618 +++ b/include/asm-cris/thread_info.h
1619 @@ -88,6 +88,7 @@ struct thread_info {
1620 #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */
1621 #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */
1622 #define TIF_MEMDIE 17
1623 +#define TIF_FREEZE 18 /* is freezing for suspend */
1624
1625 #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
1626 #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME)
1627 @@ -95,6 +96,7 @@ struct thread_info {
1628 #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED)
1629 #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK)
1630 #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG)
1631 +#define _TIF_FREEZE (1<<TIF_FREEZE)
1632
1633 #define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */
1634 #define _TIF_ALLWORK_MASK 0x0000FFFF /* work to do on any return to u-space */
1635 --- a/include/asm-m68k/thread_info.h
1636 +++ b/include/asm-m68k/thread_info.h
1637 @@ -52,5 +52,6 @@ struct thread_info {
1638 #define TIF_DELAYED_TRACE 14 /* single step a syscall */
1639 #define TIF_SYSCALL_TRACE 15 /* syscall trace active */
1640 #define TIF_MEMDIE 16
1641 +#define TIF_FREEZE 17 /* thread is freezing for suspend */
1642
1643 #endif /* _ASM_M68K_THREAD_INFO_H */
1644 --- a/include/asm-parisc/thread_info.h
1645 +++ b/include/asm-parisc/thread_info.h
1646 @@ -58,6 +58,7 @@ struct thread_info {
1647 #define TIF_32BIT 4 /* 32 bit binary */
1648 #define TIF_MEMDIE 5
1649 #define TIF_RESTORE_SIGMASK 6 /* restore saved signal mask */
1650 +#define TIF_FREEZE 7 /* is freezing for suspend */
1651
1652 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
1653 #define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
1654 @@ -65,6 +66,7 @@ struct thread_info {
1655 #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
1656 #define _TIF_32BIT (1 << TIF_32BIT)
1657 #define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK)
1658 +#define _TIF_FREEZE (1 << TIF_FREEZE)
1659
1660 #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | \
1661 _TIF_NEED_RESCHED | _TIF_RESTORE_SIGMASK)
1662 --- a/include/asm-um/thread_info.h
1663 +++ b/include/asm-um/thread_info.h
1664 @@ -69,6 +69,7 @@ static inline struct thread_info *curren
1665 #define TIF_MEMDIE 5
1666 #define TIF_SYSCALL_AUDIT 6
1667 #define TIF_RESTORE_SIGMASK 7
1668 +#define TIF_FREEZE 16 /* is freezing for suspend */
1669
1670 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
1671 #define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
1672 @@ -77,5 +78,6 @@ static inline struct thread_info *curren
1673 #define _TIF_MEMDIE (1 << TIF_MEMDIE)
1674 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
1675 #define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK)
1676 +#define _TIF_FREEZE (1 << TIF_FREEZE)
1677
1678 #endif
1679 --- a/include/asm-xtensa/thread_info.h
1680 +++ b/include/asm-xtensa/thread_info.h
1681 @@ -134,6 +134,7 @@ static inline struct thread_info *curren
1682 #define TIF_MEMDIE 5
1683 #define TIF_RESTORE_SIGMASK 6 /* restore signal mask in do_signal() */
1684 #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */
1685 +#define TIF_FREEZE 17 /* is freezing for suspend */
1686
1687 #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
1688 #define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
1689 @@ -142,6 +143,7 @@ static inline struct thread_info *curren
1690 #define _TIF_IRET (1<<TIF_IRET)
1691 #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG)
1692 #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK)
1693 +#define _TIF_FREEZE (1<<TIF_FREEZE)
1694
1695 #define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */
1696 #define _TIF_ALLWORK_MASK 0x0000FFFF /* work to do on any return to u-space */
1697 --- a/include/linux/cgroup_subsys.h
1698 +++ b/include/linux/cgroup_subsys.h
1699 @@ -48,3 +48,9 @@ SUBSYS(devices)
1700 #endif
1701
1702 /* */
1703 +
1704 +#ifdef CONFIG_CGROUP_FREEZER
1705 +SUBSYS(freezer)
1706 +#endif
1707 +
1708 +/* */
1709 --- a/include/linux/freezer.h
1710 +++ b/include/linux/freezer.h
1711 @@ -6,7 +6,7 @@
1712 #include <linux/sched.h>
1713 #include <linux/wait.h>
1714
1715 -#ifdef CONFIG_PM_SLEEP
1716 +#ifdef CONFIG_FREEZER
1717 /*
1718 * Check if a process has been frozen
1719 */
1720 @@ -39,29 +39,14 @@ static inline void clear_freeze_flag(str
1721 clear_tsk_thread_flag(p, TIF_FREEZE);
1722 }
1723
1724 -/*
1725 - * Wake up a frozen process
1726 - *
1727 - * task_lock() is taken to prevent the race with refrigerator() which may
1728 - * occur if the freezing of tasks fails. Namely, without the lock, if the
1729 - * freezing of tasks failed, thaw_tasks() might have run before a task in
1730 - * refrigerator() could call frozen_process(), in which case the task would be
1731 - * frozen and no one would thaw it.
1732 - */
1733 -static inline int thaw_process(struct task_struct *p)
1734 -{
1735 - task_lock(p);
1736 - if (frozen(p)) {
1737 - p->flags &= ~PF_FROZEN;
1738 - task_unlock(p);
1739 - wake_up_process(p);
1740 - return 1;
1741 - }
1742 - clear_freeze_flag(p);
1743 - task_unlock(p);
1744 - return 0;
1745 +static inline bool should_send_signal(struct task_struct *p)
1746 +{
1747 + return !(p->flags & PF_FREEZER_NOSIG);
1748 }
1749
1750 +/* Takes and releases task alloc lock using task_lock() */
1751 +extern int thaw_process(struct task_struct *p);
1752 +
1753 extern void refrigerator(void);
1754 extern int freeze_processes(void);
1755 extern void thaw_processes(void);
1756 @@ -75,6 +60,15 @@ static inline int try_to_freeze(void)
1757 return 0;
1758 }
1759
1760 +extern bool freeze_task(struct task_struct *p, bool sig_only);
1761 +extern void cancel_freezing(struct task_struct *p);
1762 +
1763 +#ifdef CONFIG_CGROUP_FREEZER
1764 +extern int cgroup_frozen(struct task_struct *task);
1765 +#else /* !CONFIG_CGROUP_FREEZER */
1766 +static inline int cgroup_frozen(struct task_struct *task) { return 0; }
1767 +#endif /* !CONFIG_CGROUP_FREEZER */
1768 +
1769 /*
1770 * The PF_FREEZER_SKIP flag should be set by a vfork parent right before it
1771 * calls wait_for_completion(&vfork) and reset right after it returns from this
1772 @@ -166,7 +160,7 @@ static inline void set_freezable_with_si
1773 } while (try_to_freeze()); \
1774 __retval; \
1775 })
1776 -#else /* !CONFIG_PM_SLEEP */
1777 +#else /* !CONFIG_FREEZER */
1778 static inline int frozen(struct task_struct *p) { return 0; }
1779 static inline int freezing(struct task_struct *p) { return 0; }
1780 static inline void set_freeze_flag(struct task_struct *p) {}
1781 @@ -191,6 +185,6 @@ static inline void set_freezable_with_si
1782 #define wait_event_freezable_timeout(wq, condition, timeout) \
1783 wait_event_interruptible_timeout(wq, condition, timeout)
1784
1785 -#endif /* !CONFIG_PM_SLEEP */
1786 +#endif /* !CONFIG_FREEZER */
1787
1788 #endif /* FREEZER_H_INCLUDED */
1789 --- a/init/Kconfig
1790 +++ b/init/Kconfig
1791 @@ -303,6 +303,13 @@ config CGROUP_NS
1792 for instance virtual servers and checkpoint/restart
1793 jobs.
1794
1795 +config CGROUP_FREEZER
1796 + bool "control group freezer subsystem"
1797 + depends on CGROUPS
1798 + help
1799 + Provides a way to freeze and unfreeze all tasks in a
1800 + cgroup.
1801 +
1802 config CGROUP_DEVICE
1803 bool "Device controller for cgroups"
1804 depends on CGROUPS && EXPERIMENTAL
1805 --- /dev/null
1806 +++ b/kernel/Kconfig.freezer
1807 @@ -0,0 +1,2 @@
1808 +config FREEZER
1809 + def_bool PM_SLEEP || CGROUP_FREEZER
1810 --- a/kernel/Makefile
1811 +++ b/kernel/Makefile
1812 @@ -22,6 +22,7 @@ CFLAGS_REMOVE_sched_clock.o = -pg
1813 CFLAGS_REMOVE_sched.o = -pg
1814 endif
1815
1816 +obj-$(CONFIG_FREEZER) += freezer.o
1817 obj-$(CONFIG_PROFILING) += profile.o
1818 obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
1819 obj-$(CONFIG_STACKTRACE) += stacktrace.o
1820 @@ -54,6 +55,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += bac
1821 obj-$(CONFIG_COMPAT) += compat.o
1822 obj-$(CONFIG_CGROUPS) += cgroup.o
1823 obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
1824 +obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
1825 obj-$(CONFIG_CPUSETS) += cpuset.o
1826 obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
1827 obj-$(CONFIG_UTS_NS) += utsname.o
1828 --- /dev/null
1829 +++ b/kernel/cgroup_freezer.c
1830 @@ -0,0 +1,379 @@
1831 +/*
1832 + * cgroup_freezer.c - control group freezer subsystem
1833 + *
1834 + * Copyright IBM Corporation, 2007
1835 + *
1836 + * Author : Cedric Le Goater <clg@fr.ibm.com>
1837 + *
1838 + * This program is free software; you can redistribute it and/or modify it
1839 + * under the terms of version 2.1 of the GNU Lesser General Public License
1840 + * as published by the Free Software Foundation.
1841 + *
1842 + * This program is distributed in the hope that it would be useful, but
1843 + * WITHOUT ANY WARRANTY; without even the implied warranty of
1844 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
1845 + */
1846 +
1847 +#include <linux/module.h>
1848 +#include <linux/cgroup.h>
1849 +#include <linux/fs.h>
1850 +#include <linux/uaccess.h>
1851 +#include <linux/freezer.h>
1852 +#include <linux/seq_file.h>
1853 +
1854 +enum freezer_state {
1855 + CGROUP_THAWED = 0,
1856 + CGROUP_FREEZING,
1857 + CGROUP_FROZEN,
1858 +};
1859 +
1860 +struct freezer {
1861 + struct cgroup_subsys_state css;
1862 + enum freezer_state state;
1863 + spinlock_t lock; /* protects _writes_ to state */
1864 +};
1865 +
1866 +static inline struct freezer *cgroup_freezer(
1867 + struct cgroup *cgroup)
1868 +{
1869 + return container_of(
1870 + cgroup_subsys_state(cgroup, freezer_subsys_id),
1871 + struct freezer, css);
1872 +}
1873 +
1874 +static inline struct freezer *task_freezer(struct task_struct *task)
1875 +{
1876 + return container_of(task_subsys_state(task, freezer_subsys_id),
1877 + struct freezer, css);
1878 +}
1879 +
1880 +int cgroup_frozen(struct task_struct *task)
1881 +{
1882 + struct freezer *freezer;
1883 + enum freezer_state state;
1884 +
1885 + task_lock(task);
1886 + freezer = task_freezer(task);
1887 + state = freezer->state;
1888 + task_unlock(task);
1889 +
1890 + return state == CGROUP_FROZEN;
1891 +}
1892 +
1893 +/*
1894 + * cgroups_write_string() limits the size of freezer state strings to
1895 + * CGROUP_LOCAL_BUFFER_SIZE
1896 + */
1897 +static const char *freezer_state_strs[] = {
1898 + "THAWED",
1899 + "FREEZING",
1900 + "FROZEN",
1901 +};
1902 +
1903 +/*
1904 + * State diagram
1905 + * Transitions are caused by userspace writes to the freezer.state file.
1906 + * The values in parentheses are state labels. The rest are edge labels.
1907 + *
1908 + * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
1909 + * ^ ^ | |
1910 + * | \_______THAWED_______/ |
1911 + * \__________________________THAWED____________/
1912 + */
1913 +
1914 +struct cgroup_subsys freezer_subsys;
1915 +
1916 +/* Locks taken and their ordering
1917 + * ------------------------------
1918 + * css_set_lock
1919 + * cgroup_mutex (AKA cgroup_lock)
1920 + * task->alloc_lock (AKA task_lock)
1921 + * freezer->lock
1922 + * task->sighand->siglock
1923 + *
1924 + * cgroup code forces css_set_lock to be taken before task->alloc_lock
1925 + *
1926 + * freezer_create(), freezer_destroy():
1927 + * cgroup_mutex [ by cgroup core ]
1928 + *
1929 + * can_attach():
1930 + * cgroup_mutex
1931 + *
1932 + * cgroup_frozen():
1933 + * task->alloc_lock (to get task's cgroup)
1934 + *
1935 + * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
1936 + * task->alloc_lock (to get task's cgroup)
1937 + * freezer->lock
1938 + * sighand->siglock (if the cgroup is freezing)
1939 + *
1940 + * freezer_read():
1941 + * cgroup_mutex
1942 + * freezer->lock
1943 + * read_lock css_set_lock (cgroup iterator start)
1944 + *
1945 + * freezer_write() (freeze):
1946 + * cgroup_mutex
1947 + * freezer->lock
1948 + * read_lock css_set_lock (cgroup iterator start)
1949 + * sighand->siglock
1950 + *
1951 + * freezer_write() (unfreeze):
1952 + * cgroup_mutex
1953 + * freezer->lock
1954 + * read_lock css_set_lock (cgroup iterator start)
1955 + * task->alloc_lock (to prevent races with freeze_task())
1956 + * sighand->siglock
1957 + */
1958 +static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
1959 + struct cgroup *cgroup)
1960 +{
1961 + struct freezer *freezer;
1962 +
1963 + freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL);
1964 + if (!freezer)
1965 + return ERR_PTR(-ENOMEM);
1966 +
1967 + spin_lock_init(&freezer->lock);
1968 + freezer->state = CGROUP_THAWED;
1969 + return &freezer->css;
1970 +}
1971 +
1972 +static void freezer_destroy(struct cgroup_subsys *ss,
1973 + struct cgroup *cgroup)
1974 +{
1975 + kfree(cgroup_freezer(cgroup));
1976 +}
1977 +
1978 +/* Task is frozen or will freeze immediately when next it gets woken */
1979 +static bool is_task_frozen_enough(struct task_struct *task)
1980 +{
1981 + return frozen(task) ||
1982 + (task_is_stopped_or_traced(task) && freezing(task));
1983 +}
1984 +
1985 +/*
1986 + * The call to cgroup_lock() in the freezer.state write method prevents
1987 + * a write to that file racing against an attach, and hence the
1988 + * can_attach() result will remain valid until the attach completes.
1989 + */
1990 +static int freezer_can_attach(struct cgroup_subsys *ss,
1991 + struct cgroup *new_cgroup,
1992 + struct task_struct *task)
1993 +{
1994 + struct freezer *freezer;
1995 +
1996 + /*
1997 + * Anything frozen can't move or be moved to/from.
1998 + *
1999 + * If @task's current cgroup is FROZEN then @task itself has been
2000 + * frozen, so it's sufficient to check the latter condition.
2001 + */
2002 +
2003 + if (is_task_frozen_enough(task))
2004 + return -EBUSY;
2005 +
2006 + freezer = cgroup_freezer(new_cgroup);
2007 + if (freezer->state == CGROUP_FROZEN)
2008 + return -EBUSY;
2009 +
2010 + return 0;
2011 +}
2012 +
2013 +static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
2014 +{
2015 + struct freezer *freezer;
2016 +
2017 + /*
2018 + * No lock is needed, since the task isn't on tasklist yet,
2019 + * so it can't be moved to another cgroup, which means the
2020 + * freezer won't be removed and will be valid during this
2021 + * function call.
2022 + */
2023 + freezer = task_freezer(task);
2024 +
2025 + /*
2026 + * The root cgroup is non-freezable, so we can skip the
2027 + * following check.
2028 + */
2029 + if (!freezer->css.cgroup->parent)
2030 + return;
2031 +
2032 + spin_lock_irq(&freezer->lock);
2033 + BUG_ON(freezer->state == CGROUP_FROZEN);
2034 +
2035 + /* Locking avoids race with FREEZING -> THAWED transitions. */
2036 + if (freezer->state == CGROUP_FREEZING)
2037 + freeze_task(task, true);
2038 + spin_unlock_irq(&freezer->lock);
2039 +}
2040 +
2041 +/*
2042 + * Recompute freezer->state from tasks; caller must hold freezer->lock
2043 + */
2044 +static void update_freezer_state(struct cgroup *cgroup,
2045 + struct freezer *freezer)
2046 +{
2047 + struct cgroup_iter it;
2048 + struct task_struct *task;
2049 + unsigned int nfrozen = 0, ntotal = 0;
2050 +
2051 + cgroup_iter_start(cgroup, &it);
2052 + while ((task = cgroup_iter_next(cgroup, &it))) {
2053 + ntotal++;
2054 + if (is_task_frozen_enough(task))
2055 + nfrozen++;
2056 + }
2057 +
2058 + /*
2059 + * Transition to FROZEN when no new tasks can be added ensures
2060 + * that we never exist in the FROZEN state while there are unfrozen
2061 + * tasks.
2062 + */
2063 + if (nfrozen == ntotal)
2064 + freezer->state = CGROUP_FROZEN;
2065 + else if (nfrozen > 0)
2066 + freezer->state = CGROUP_FREEZING;
2067 + else
2068 + freezer->state = CGROUP_THAWED;
2069 + cgroup_iter_end(cgroup, &it);
2070 +}
2071 +
2072 +static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
2073 + struct seq_file *m)
2074 +{
2075 + struct freezer *freezer;
2076 + enum freezer_state state;
2077 +
2078 + if (!cgroup_lock_live_group(cgroup))
2079 + return -ENODEV;
2080 +
2081 + freezer = cgroup_freezer(cgroup);
2082 + spin_lock_irq(&freezer->lock);
2083 + state = freezer->state;
2084 + if (state == CGROUP_FREEZING) {
2085 + /* We change from FREEZING to FROZEN lazily if the cgroup was
2086 + * only partially frozen when we exited write. */
2087 + update_freezer_state(cgroup, freezer);
2088 + state = freezer->state;
2089 + }
2090 + spin_unlock_irq(&freezer->lock);
2091 + cgroup_unlock();
2092 +
2093 + seq_puts(m, freezer_state_strs[state]);
2094 + seq_putc(m, '\n');
2095 + return 0;
2096 +}
2097 +
2098 +static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
2099 +{
2100 + struct cgroup_iter it;
2101 + struct task_struct *task;
2102 + unsigned int num_cant_freeze_now = 0;
2103 +
2104 + freezer->state = CGROUP_FREEZING;
2105 + cgroup_iter_start(cgroup, &it);
2106 + while ((task = cgroup_iter_next(cgroup, &it))) {
2107 + if (!freeze_task(task, true))
2108 + continue;
2109 + if (is_task_frozen_enough(task))
2110 + continue;
2111 + if (!freezing(task) && !freezer_should_skip(task))
2112 + num_cant_freeze_now++;
2113 + }
2114 + cgroup_iter_end(cgroup, &it);
2115 +
2116 + return num_cant_freeze_now ? -EBUSY : 0;
2117 +}
2118 +
2119 +static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
2120 +{
2121 + struct cgroup_iter it;
2122 + struct task_struct *task;
2123 +
2124 + cgroup_iter_start(cgroup, &it);
2125 + while ((task = cgroup_iter_next(cgroup, &it))) {
2126 + thaw_process(task);
2127 + }
2128 + cgroup_iter_end(cgroup, &it);
2129 +
2130 + freezer->state = CGROUP_THAWED;
2131 +}
2132 +
2133 +static int freezer_change_state(struct cgroup *cgroup,
2134 + enum freezer_state goal_state)
2135 +{
2136 + struct freezer *freezer;
2137 + int retval = 0;
2138 +
2139 + freezer = cgroup_freezer(cgroup);
2140 +
2141 + spin_lock_irq(&freezer->lock);
2142 +
2143 + update_freezer_state(cgroup, freezer);
2144 + if (goal_state == freezer->state)
2145 + goto out;
2146 +
2147 + switch (goal_state) {
2148 + case CGROUP_THAWED:
2149 + unfreeze_cgroup(cgroup, freezer);
2150 + break;
2151 + case CGROUP_FROZEN:
2152 + retval = try_to_freeze_cgroup(cgroup, freezer);
2153 + break;
2154 + default:
2155 + BUG();
2156 + }
2157 +out:
2158 + spin_unlock_irq(&freezer->lock);
2159 +
2160 + return retval;
2161 +}
2162 +
2163 +static int freezer_write(struct cgroup *cgroup,
2164 + struct cftype *cft,
2165 + const char *buffer)
2166 +{
2167 + int retval;
2168 + enum freezer_state goal_state;
2169 +
2170 + if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0)
2171 + goal_state = CGROUP_THAWED;
2172 + else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
2173 + goal_state = CGROUP_FROZEN;
2174 + else
2175 + return -EINVAL;
2176 +
2177 + if (!cgroup_lock_live_group(cgroup))
2178 + return -ENODEV;
2179 + retval = freezer_change_state(cgroup, goal_state);
2180 + cgroup_unlock();
2181 + return retval;
2182 +}
2183 +
2184 +static struct cftype files[] = {
2185 + {
2186 + .name = "state",
2187 + .read_seq_string = freezer_read,
2188 + .write_string = freezer_write,
2189 + },
2190 +};
2191 +
2192 +static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
2193 +{
2194 + if (!cgroup->parent)
2195 + return 0;
2196 + return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
2197 +}
2198 +
2199 +struct cgroup_subsys freezer_subsys = {
2200 + .name = "freezer",
2201 + .create = freezer_create,
2202 + .destroy = freezer_destroy,
2203 + .populate = freezer_populate,
2204 + .subsys_id = freezer_subsys_id,
2205 + .can_attach = freezer_can_attach,
2206 + .attach = NULL,
2207 + .fork = freezer_fork,
2208 + .exit = NULL,
2209 +};
2210 --- /dev/null
2211 +++ b/kernel/freezer.c
2212 @@ -0,0 +1,154 @@
2213 +/*
2214 + * kernel/freezer.c - Function to freeze a process
2215 + *
2216 + * Originally from kernel/power/process.c
2217 + */
2218 +
2219 +#include <linux/interrupt.h>
2220 +#include <linux/suspend.h>
2221 +#include <linux/module.h>
2222 +#include <linux/syscalls.h>
2223 +#include <linux/freezer.h>
2224 +
2225 +/*
2226 + * freezing is complete, mark current process as frozen
2227 + */
2228 +static inline void frozen_process(void)
2229 +{
2230 + if (!unlikely(current->flags & PF_NOFREEZE)) {
2231 + current->flags |= PF_FROZEN;
2232 + wmb();
2233 + }
2234 + clear_freeze_flag(current);
2235 +}
2236 +
2237 +/* The refrigerator is the place where frozen processes are stored :-). */
2238 +void refrigerator(void)
2239 +{
2240 + /* Hmm, should we be allowed to suspend when there are realtime
2241 + processes around? */
2242 + long save;
2243 +
2244 + task_lock(current);
2245 + if (freezing(current)) {
2246 + frozen_process();
2247 + task_unlock(current);
2248 + } else {
2249 + task_unlock(current);
2250 + return;
2251 + }
2252 + save = current->state;
2253 + pr_debug("%s entered refrigerator\n", current->comm);
2254 +
2255 + spin_lock_irq(&current->sighand->siglock);
2256 + recalc_sigpending(); /* We sent fake signal, clean it up */
2257 + spin_unlock_irq(&current->sighand->siglock);
2258 +
2259 + for (;;) {
2260 + set_current_state(TASK_UNINTERRUPTIBLE);
2261 + if (!frozen(current))
2262 + break;
2263 + schedule();
2264 + }
2265 + pr_debug("%s left refrigerator\n", current->comm);
2266 + __set_current_state(save);
2267 +}
2268 +EXPORT_SYMBOL(refrigerator);
2269 +
2270 +static void fake_signal_wake_up(struct task_struct *p)
2271 +{
2272 + unsigned long flags;
2273 +
2274 + spin_lock_irqsave(&p->sighand->siglock, flags);
2275 + signal_wake_up(p, 0);
2276 + spin_unlock_irqrestore(&p->sighand->siglock, flags);
2277 +}
2278 +
2279 +/**
2280 + * freeze_task - send a freeze request to given task
2281 + * @p: task to send the request to
2282 + * @sig_only: if set, the request will only be sent if the task has the
2283 + * PF_FREEZER_NOSIG flag unset
2284 + * Return value: 'false', if @sig_only is set and the task has
2285 + * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
2286 + *
2287 + * The freeze request is sent by setting the task's TIF_FREEZE flag and
2288 + * either sending a fake signal to it or waking it up, depending on whether
2289 + * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task
2290 + * has PF_FREEZER_NOSIG set (i.e. it is a typical kernel thread), its
2291 + * TIF_FREEZE flag will not be set.
2292 + */
2293 +bool freeze_task(struct task_struct *p, bool sig_only)
2294 +{
2295 + /*
2296 + * We first check if the task is freezing and next if it has already
2297 + * been frozen to avoid the race with frozen_process() which first marks
2298 + * the task as frozen and next clears its TIF_FREEZE.
2299 + */
2300 + if (!freezing(p)) {
2301 + rmb();
2302 + if (frozen(p))
2303 + return false;
2304 +
2305 + if (!sig_only || should_send_signal(p))
2306 + set_freeze_flag(p);
2307 + else
2308 + return false;
2309 + }
2310 +
2311 + if (should_send_signal(p)) {
2312 + if (!signal_pending(p))
2313 + fake_signal_wake_up(p);
2314 + } else if (sig_only) {
2315 + return false;
2316 + } else {
2317 + wake_up_state(p, TASK_INTERRUPTIBLE);
2318 + }
2319 +
2320 + return true;
2321 +}
2322 +
2323 +void cancel_freezing(struct task_struct *p)
2324 +{
2325 + unsigned long flags;
2326 +
2327 + if (freezing(p)) {
2328 + pr_debug(" clean up: %s\n", p->comm);
2329 + clear_freeze_flag(p);
2330 + spin_lock_irqsave(&p->sighand->siglock, flags);
2331 + recalc_sigpending_and_wake(p);
2332 + spin_unlock_irqrestore(&p->sighand->siglock, flags);
2333 + }
2334 +}
2335 +
2336 +static int __thaw_process(struct task_struct *p)
2337 +{
2338 + if (frozen(p)) {
2339 + p->flags &= ~PF_FROZEN;
2340 + return 1;
2341 + }
2342 + clear_freeze_flag(p);
2343 + return 0;
2344 +}
2345 +
2346 +/*
2347 + * Wake up a frozen process
2348 + *
2349 + * task_lock() is needed to prevent the race with refrigerator() which may
2350 + * occur if the freezing of tasks fails. Namely, without the lock, if the
2351 + * freezing of tasks failed, thaw_tasks() might have run before a task in
2352 + * refrigerator() could call frozen_process(), in which case the task would be
2353 + * frozen and no one would thaw it.
2354 + */
2355 +int thaw_process(struct task_struct *p)
2356 +{
2357 + task_lock(p);
2358 + if (__thaw_process(p) == 1) {
2359 + task_unlock(p);
2360 + wake_up_process(p);
2361 + return 1;
2362 + }
2363 + task_unlock(p);
2364 + return 0;
2365 +}
2366 +EXPORT_SYMBOL(thaw_process);
2367 --- a/kernel/power/process.c
2368 +++ b/kernel/power/process.c
2369 @@ -28,121 +28,6 @@ static inline int freezeable(struct task
2370 return 1;
2371 }
2372
2373 -/*
2374 - * freezing is complete, mark current process as frozen
2375 - */
2376 -static inline void frozen_process(void)
2377 -{
2378 - if (!unlikely(current->flags & PF_NOFREEZE)) {
2379 - current->flags |= PF_FROZEN;
2380 - wmb();
2381 - }
2382 - clear_freeze_flag(current);
2383 -}
2384 -
2385 -/* Refrigerator is place where frozen processes are stored :-). */
2386 -void refrigerator(void)
2387 -{
2388 - /* Hmm, should we be allowed to suspend when there are realtime
2389 - processes around? */
2390 - long save;
2391 -
2392 - task_lock(current);
2393 - if (freezing(current)) {
2394 - frozen_process();
2395 - task_unlock(current);
2396 - } else {
2397 - task_unlock(current);
2398 - return;
2399 - }
2400 - save = current->state;
2401 - pr_debug("%s entered refrigerator\n", current->comm);
2402 -
2403 - spin_lock_irq(&current->sighand->siglock);
2404 - recalc_sigpending(); /* We sent fake signal, clean it up */
2405 - spin_unlock_irq(&current->sighand->siglock);
2406 -
2407 - for (;;) {
2408 - set_current_state(TASK_UNINTERRUPTIBLE);
2409 - if (!frozen(current))
2410 - break;
2411 - schedule();
2412 - }
2413 - pr_debug("%s left refrigerator\n", current->comm);
2414 - __set_current_state(save);
2415 -}
2416 -
2417 -static void fake_signal_wake_up(struct task_struct *p)
2418 -{
2419 - unsigned long flags;
2420 -
2421 - spin_lock_irqsave(&p->sighand->siglock, flags);
2422 - signal_wake_up(p, 0);
2423 - spin_unlock_irqrestore(&p->sighand->siglock, flags);
2424 -}
2425 -
2426 -static inline bool should_send_signal(struct task_struct *p)
2427 -{
2428 - return !(p->flags & PF_FREEZER_NOSIG);
2429 -}
2430 -
2431 -/**
2432 - * freeze_task - send a freeze request to given task
2433 - * @p: task to send the request to
2434 - * @sig_only: if set, the request will only be sent if the task has the
2435 - * PF_FREEZER_NOSIG flag unset
2436 - * Return value: 'false', if @sig_only is set and the task has
2437 - * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
2438 - *
2439 - * The freeze request is sent by setting the tasks's TIF_FREEZE flag and
2440 - * either sending a fake signal to it or waking it up, depending on whether
2441 - * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task
2442 - * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
2443 - * TIF_FREEZE flag will not be set.
2444 - */
2445 -static bool freeze_task(struct task_struct *p, bool sig_only)
2446 -{
2447 - /*
2448 - * We first check if the task is freezing and next if it has already
2449 - * been frozen to avoid the race with frozen_process() which first marks
2450 - * the task as frozen and next clears its TIF_FREEZE.
2451 - */
2452 - if (!freezing(p)) {
2453 - rmb();
2454 - if (frozen(p))
2455 - return false;
2456 -
2457 - if (!sig_only || should_send_signal(p))
2458 - set_freeze_flag(p);
2459 - else
2460 - return false;
2461 - }
2462 -
2463 - if (should_send_signal(p)) {
2464 - if (!signal_pending(p))
2465 - fake_signal_wake_up(p);
2466 - } else if (sig_only) {
2467 - return false;
2468 - } else {
2469 - wake_up_state(p, TASK_INTERRUPTIBLE);
2470 - }
2471 -
2472 - return true;
2473 -}
2474 -
2475 -static void cancel_freezing(struct task_struct *p)
2476 -{
2477 - unsigned long flags;
2478 -
2479 - if (freezing(p)) {
2480 - pr_debug(" clean up: %s\n", p->comm);
2481 - clear_freeze_flag(p);
2482 - spin_lock_irqsave(&p->sighand->siglock, flags);
2483 - recalc_sigpending_and_wake(p);
2484 - spin_unlock_irqrestore(&p->sighand->siglock, flags);
2485 - }
2486 -}
2487 -
2488 static int try_to_freeze_tasks(bool sig_only)
2489 {
2490 struct task_struct *g, *p;
2491 @@ -250,6 +135,9 @@ static void thaw_tasks(bool nosig_only)
2492 if (nosig_only && should_send_signal(p))
2493 continue;
2494
2495 + if (cgroup_frozen(p))
2496 + continue;
2497 +
2498 thaw_process(p);
2499 } while_each_thread(g, p);
2500 read_unlock(&tasklist_lock);
2501 @@ -264,4 +152,3 @@ void thaw_processes(void)
2502 printk("done.\n");
2503 }
2504
2505 -EXPORT_SYMBOL(refrigerator);