]>
Commit | Line | Data |
---|---|---|
00e5a55c BS |
1 | From: Serge E. Hallyn <serue@us.ibm.com> |
2 | Subject: cgroup freezer | |
3 | References: bnc#417294, fate#304191, fate#201036 | |
4 | Patch-upstream: yes | |
5 | Git: 68d1a06b440a5df55fb253e1d1113d2e4a7209fc Mon Sep 17 00:00:00 2001 | |
6 | ||
7 | Signed-off-by: Serge E. Hallyn <serue@us.ibm.com> | |
8 | Acked-by: Nick Piggin <npiggin@suse.de> | |
9 | --- | |
10 | Documentation/cgroups.txt | 548 ---------------------------- | |
11 | Documentation/cgroups/cgroups.txt | 548 ++++++++++++++++++++++++++++ | |
12 | Documentation/cgroups/freezer-subsystem.txt | 102 +++++ | |
13 | Documentation/cpusets.txt | 2 | |
14 | arch/alpha/Kconfig | 1 | |
15 | arch/alpha/include/asm/thread_info.h | 2 | |
16 | arch/arm/Kconfig | 2 | |
17 | arch/avr32/Kconfig | 2 | |
18 | arch/avr32/include/asm/thread_info.h | 1 | |
19 | arch/blackfin/Kconfig | 3 | |
20 | arch/cris/Kconfig | 2 | |
21 | arch/frv/Kconfig | 2 | |
22 | arch/h8300/Kconfig | 2 | |
23 | arch/h8300/include/asm/thread_info.h | 2 | |
24 | arch/ia64/Kconfig | 2 | |
25 | arch/m32r/Kconfig | 2 | |
26 | arch/m68k/Kconfig | 2 | |
27 | arch/m68knommu/Kconfig | 2 | |
28 | arch/m68knommu/include/asm/thread_info.h | 2 | |
29 | arch/mips/Kconfig | 2 | |
30 | arch/mn10300/Kconfig | 2 | |
31 | arch/parisc/Kconfig | 2 | |
32 | arch/powerpc/Kconfig | 2 | |
33 | arch/s390/Kconfig | 2 | |
34 | arch/s390/include/asm/thread_info.h | 2 | |
35 | arch/sh/Kconfig | 2 | |
36 | arch/sparc/Kconfig | 2 | |
37 | arch/sparc/include/asm/thread_info_32.h | 2 | |
38 | arch/sparc64/Kconfig | 1 | |
39 | arch/um/Kconfig | 2 | |
40 | arch/x86/Kconfig | 1 | |
41 | arch/xtensa/Kconfig | 1 | |
42 | include/asm-cris/thread_info.h | 2 | |
43 | include/asm-m68k/thread_info.h | 1 | |
44 | include/asm-parisc/thread_info.h | 2 | |
45 | include/asm-um/thread_info.h | 2 | |
46 | include/asm-xtensa/thread_info.h | 2 | |
47 | include/linux/cgroup_subsys.h | 6 | |
48 | include/linux/freezer.h | 42 -- | |
49 | init/Kconfig | 7 | |
50 | kernel/Kconfig.freezer | 2 | |
51 | kernel/Makefile | 2 | |
52 | kernel/cgroup_freezer.c | 379 +++++++++++++++++++ | |
53 | kernel/freezer.c | 154 +++++++ | |
54 | kernel/power/process.c | 119 ------ | |
55 | 45 files changed, 1283 insertions(+), 689 deletions(-) | |
56 | create mode 100644 include/linux/cgroup_freezer.h | |
57 | create mode 100644 kernel/cgroup_freezer.c | |
58 | create mode 100644 kernel/freezer.c | |
59 | ||
60 | --- a/Documentation/cgroups.txt | |
61 | +++ /dev/null | |
62 | @@ -1,548 +0,0 @@ | |
63 | - CGROUPS | |
64 | - ------- | |
65 | - | |
66 | -Written by Paul Menage <menage@google.com> based on Documentation/cpusets.txt | |
67 | - | |
68 | -Original copyright statements from cpusets.txt: | |
69 | -Portions Copyright (C) 2004 BULL SA. | |
70 | -Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. | |
71 | -Modified by Paul Jackson <pj@sgi.com> | |
72 | -Modified by Christoph Lameter <clameter@sgi.com> | |
73 | - | |
74 | -CONTENTS: | |
75 | -========= | |
76 | - | |
77 | -1. Control Groups | |
78 | - 1.1 What are cgroups ? | |
79 | - 1.2 Why are cgroups needed ? | |
80 | - 1.3 How are cgroups implemented ? | |
81 | - 1.4 What does notify_on_release do ? | |
82 | - 1.5 How do I use cgroups ? | |
83 | -2. Usage Examples and Syntax | |
84 | - 2.1 Basic Usage | |
85 | - 2.2 Attaching processes | |
86 | -3. Kernel API | |
87 | - 3.1 Overview | |
88 | - 3.2 Synchronization | |
89 | - 3.3 Subsystem API | |
90 | -4. Questions | |
91 | - | |
92 | -1. Control Groups | |
93 | -================= | |
94 | - | |
95 | -1.1 What are cgroups ? | |
96 | ----------------------- | |
97 | - | |
98 | -Control Groups provide a mechanism for aggregating/partitioning sets of | |
99 | -tasks, and all their future children, into hierarchical groups with | |
100 | -specialized behaviour. | |
101 | - | |
102 | -Definitions: | |
103 | - | |
104 | -A *cgroup* associates a set of tasks with a set of parameters for one | |
105 | -or more subsystems. | |
106 | - | |
107 | -A *subsystem* is a module that makes use of the task grouping | |
108 | -facilities provided by cgroups to treat groups of tasks in | |
109 | -particular ways. A subsystem is typically a "resource controller" that | |
110 | -schedules a resource or applies per-cgroup limits, but it may be | |
111 | -anything that wants to act on a group of processes, e.g. a | |
112 | -virtualization subsystem. | |
113 | - | |
114 | -A *hierarchy* is a set of cgroups arranged in a tree, such that | |
115 | -every task in the system is in exactly one of the cgroups in the | |
116 | -hierarchy, and a set of subsystems; each subsystem has system-specific | |
117 | -state attached to each cgroup in the hierarchy. Each hierarchy has | |
118 | -an instance of the cgroup virtual filesystem associated with it. | |
119 | - | |
120 | -At any one time there may be multiple active hierachies of task | |
121 | -cgroups. Each hierarchy is a partition of all tasks in the system. | |
122 | - | |
123 | -User level code may create and destroy cgroups by name in an | |
124 | -instance of the cgroup virtual file system, specify and query to | |
125 | -which cgroup a task is assigned, and list the task pids assigned to | |
126 | -a cgroup. Those creations and assignments only affect the hierarchy | |
127 | -associated with that instance of the cgroup file system. | |
128 | - | |
129 | -On their own, the only use for cgroups is for simple job | |
130 | -tracking. The intention is that other subsystems hook into the generic | |
131 | -cgroup support to provide new attributes for cgroups, such as | |
132 | -accounting/limiting the resources which processes in a cgroup can | |
133 | -access. For example, cpusets (see Documentation/cpusets.txt) allows | |
134 | -you to associate a set of CPUs and a set of memory nodes with the | |
135 | -tasks in each cgroup. | |
136 | - | |
137 | -1.2 Why are cgroups needed ? | |
138 | ----------------------------- | |
139 | - | |
140 | -There are multiple efforts to provide process aggregations in the | |
141 | -Linux kernel, mainly for resource tracking purposes. Such efforts | |
142 | -include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server | |
143 | -namespaces. These all require the basic notion of a | |
144 | -grouping/partitioning of processes, with newly forked processes ending | |
145 | -in the same group (cgroup) as their parent process. | |
146 | - | |
147 | -The kernel cgroup patch provides the minimum essential kernel | |
148 | -mechanisms required to efficiently implement such groups. It has | |
149 | -minimal impact on the system fast paths, and provides hooks for | |
150 | -specific subsystems such as cpusets to provide additional behaviour as | |
151 | -desired. | |
152 | - | |
153 | -Multiple hierarchy support is provided to allow for situations where | |
154 | -the division of tasks into cgroups is distinctly different for | |
155 | -different subsystems - having parallel hierarchies allows each | |
156 | -hierarchy to be a natural division of tasks, without having to handle | |
157 | -complex combinations of tasks that would be present if several | |
158 | -unrelated subsystems needed to be forced into the same tree of | |
159 | -cgroups. | |
160 | - | |
161 | -At one extreme, each resource controller or subsystem could be in a | |
162 | -separate hierarchy; at the other extreme, all subsystems | |
163 | -would be attached to the same hierarchy. | |
164 | - | |
165 | -As an example of a scenario (originally proposed by vatsa@in.ibm.com) | |
166 | -that can benefit from multiple hierarchies, consider a large | |
167 | -university server with various users - students, professors, system | |
168 | -tasks etc. The resource planning for this server could be along the | |
169 | -following lines: | |
170 | - | |
171 | - CPU : Top cpuset | |
172 | - / \ | |
173 | - CPUSet1 CPUSet2 | |
174 | - | | | |
175 | - (Profs) (Students) | |
176 | - | |
177 | - In addition (system tasks) are attached to topcpuset (so | |
178 | - that they can run anywhere) with a limit of 20% | |
179 | - | |
180 | - Memory : Professors (50%), students (30%), system (20%) | |
181 | - | |
182 | - Disk : Prof (50%), students (30%), system (20%) | |
183 | - | |
184 | - Network : WWW browsing (20%), Network File System (60%), others (20%) | |
185 | - / \ | |
186 | - Prof (15%) students (5%) | |
187 | - | |
188 | -Browsers like firefox/lynx go into the WWW network class, while (k)nfsd go | |
189 | -into NFS network class. | |
190 | - | |
191 | -At the same time firefox/lynx will share an appropriate CPU/Memory class | |
192 | -depending on who launched it (prof/student). | |
193 | - | |
194 | -With the ability to classify tasks differently for different resources | |
195 | -(by putting those resource subsystems in different hierarchies) then | |
196 | -the admin can easily set up a script which receives exec notifications | |
197 | -and depending on who is launching the browser he can | |
198 | - | |
199 | - # echo browser_pid > /mnt/<restype>/<userclass>/tasks | |
200 | - | |
201 | -With only a single hierarchy, he now would potentially have to create | |
202 | -a separate cgroup for every browser launched and associate it with | |
203 | -approp network and other resource class. This may lead to | |
204 | -proliferation of such cgroups. | |
205 | - | |
206 | -Also lets say that the administrator would like to give enhanced network | |
207 | -access temporarily to a student's browser (since it is night and the user | |
208 | -wants to do online gaming :)) OR give one of the students simulation | |
209 | -apps enhanced CPU power, | |
210 | - | |
211 | -With ability to write pids directly to resource classes, it's just a | |
212 | -matter of : | |
213 | - | |
214 | - # echo pid > /mnt/network/<new_class>/tasks | |
215 | - (after some time) | |
216 | - # echo pid > /mnt/network/<orig_class>/tasks | |
217 | - | |
218 | -Without this ability, he would have to split the cgroup into | |
219 | -multiple separate ones and then associate the new cgroups with the | |
220 | -new resource classes. | |
221 | - | |
222 | - | |
223 | - | |
224 | -1.3 How are cgroups implemented ? | |
225 | ---------------------------------- | |
226 | - | |
227 | -Control Groups extends the kernel as follows: | |
228 | - | |
229 | - - Each task in the system has a reference-counted pointer to a | |
230 | - css_set. | |
231 | - | |
232 | - - A css_set contains a set of reference-counted pointers to | |
233 | - cgroup_subsys_state objects, one for each cgroup subsystem | |
234 | - registered in the system. There is no direct link from a task to | |
235 | - the cgroup of which it's a member in each hierarchy, but this | |
236 | - can be determined by following pointers through the | |
237 | - cgroup_subsys_state objects. This is because accessing the | |
238 | - subsystem state is something that's expected to happen frequently | |
239 | - and in performance-critical code, whereas operations that require a | |
240 | - task's actual cgroup assignments (in particular, moving between | |
241 | - cgroups) are less common. A linked list runs through the cg_list | |
242 | - field of each task_struct using the css_set, anchored at | |
243 | - css_set->tasks. | |
244 | - | |
245 | - - A cgroup hierarchy filesystem can be mounted for browsing and | |
246 | - manipulation from user space. | |
247 | - | |
248 | - - You can list all the tasks (by pid) attached to any cgroup. | |
249 | - | |
250 | -The implementation of cgroups requires a few, simple hooks | |
251 | -into the rest of the kernel, none in performance critical paths: | |
252 | - | |
253 | - - in init/main.c, to initialize the root cgroups and initial | |
254 | - css_set at system boot. | |
255 | - | |
256 | - - in fork and exit, to attach and detach a task from its css_set. | |
257 | - | |
258 | -In addition a new file system, of type "cgroup" may be mounted, to | |
259 | -enable browsing and modifying the cgroups presently known to the | |
260 | -kernel. When mounting a cgroup hierarchy, you may specify a | |
261 | -comma-separated list of subsystems to mount as the filesystem mount | |
262 | -options. By default, mounting the cgroup filesystem attempts to | |
263 | -mount a hierarchy containing all registered subsystems. | |
264 | - | |
265 | -If an active hierarchy with exactly the same set of subsystems already | |
266 | -exists, it will be reused for the new mount. If no existing hierarchy | |
267 | -matches, and any of the requested subsystems are in use in an existing | |
268 | -hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy | |
269 | -is activated, associated with the requested subsystems. | |
270 | - | |
271 | -It's not currently possible to bind a new subsystem to an active | |
272 | -cgroup hierarchy, or to unbind a subsystem from an active cgroup | |
273 | -hierarchy. This may be possible in future, but is fraught with nasty | |
274 | -error-recovery issues. | |
275 | - | |
276 | -When a cgroup filesystem is unmounted, if there are any | |
277 | -child cgroups created below the top-level cgroup, that hierarchy | |
278 | -will remain active even though unmounted; if there are no | |
279 | -child cgroups then the hierarchy will be deactivated. | |
280 | - | |
281 | -No new system calls are added for cgroups - all support for | |
282 | -querying and modifying cgroups is via this cgroup file system. | |
283 | - | |
284 | -Each task under /proc has an added file named 'cgroup' displaying, | |
285 | -for each active hierarchy, the subsystem names and the cgroup name | |
286 | -as the path relative to the root of the cgroup file system. | |
287 | - | |
288 | -Each cgroup is represented by a directory in the cgroup file system | |
289 | -containing the following files describing that cgroup: | |
290 | - | |
291 | - - tasks: list of tasks (by pid) attached to that cgroup | |
292 | - - releasable flag: cgroup currently removeable? | |
293 | - - notify_on_release flag: run the release agent on exit? | |
294 | - - release_agent: the path to use for release notifications (this file | |
295 | - exists in the top cgroup only) | |
296 | - | |
297 | -Other subsystems such as cpusets may add additional files in each | |
298 | -cgroup dir. | |
299 | - | |
300 | -New cgroups are created using the mkdir system call or shell | |
301 | -command. The properties of a cgroup, such as its flags, are | |
302 | -modified by writing to the appropriate file in that cgroups | |
303 | -directory, as listed above. | |
304 | - | |
305 | -The named hierarchical structure of nested cgroups allows partitioning | |
306 | -a large system into nested, dynamically changeable, "soft-partitions". | |
307 | - | |
308 | -The attachment of each task, automatically inherited at fork by any | |
309 | -children of that task, to a cgroup allows organizing the work load | |
310 | -on a system into related sets of tasks. A task may be re-attached to | |
311 | -any other cgroup, if allowed by the permissions on the necessary | |
312 | -cgroup file system directories. | |
313 | - | |
314 | -When a task is moved from one cgroup to another, it gets a new | |
315 | -css_set pointer - if there's an already existing css_set with the | |
316 | -desired collection of cgroups then that group is reused, else a new | |
317 | -css_set is allocated. Note that the current implementation uses a | |
318 | -linear search to locate an appropriate existing css_set, so isn't | |
319 | -very efficient. A future version will use a hash table for better | |
320 | -performance. | |
321 | - | |
322 | -To allow access from a cgroup to the css_sets (and hence tasks) | |
323 | -that comprise it, a set of cg_cgroup_link objects form a lattice; | |
324 | -each cg_cgroup_link is linked into a list of cg_cgroup_links for | |
325 | -a single cgroup on its cgrp_link_list field, and a list of | |
326 | -cg_cgroup_links for a single css_set on its cg_link_list. | |
327 | - | |
328 | -Thus the set of tasks in a cgroup can be listed by iterating over | |
329 | -each css_set that references the cgroup, and sub-iterating over | |
330 | -each css_set's task set. | |
331 | - | |
332 | -The use of a Linux virtual file system (vfs) to represent the | |
333 | -cgroup hierarchy provides for a familiar permission and name space | |
334 | -for cgroups, with a minimum of additional kernel code. | |
335 | - | |
336 | -1.4 What does notify_on_release do ? | |
337 | ------------------------------------- | |
338 | - | |
339 | -If the notify_on_release flag is enabled (1) in a cgroup, then | |
340 | -whenever the last task in the cgroup leaves (exits or attaches to | |
341 | -some other cgroup) and the last child cgroup of that cgroup | |
342 | -is removed, then the kernel runs the command specified by the contents | |
343 | -of the "release_agent" file in that hierarchy's root directory, | |
344 | -supplying the pathname (relative to the mount point of the cgroup | |
345 | -file system) of the abandoned cgroup. This enables automatic | |
346 | -removal of abandoned cgroups. The default value of | |
347 | -notify_on_release in the root cgroup at system boot is disabled | |
348 | -(0). The default value of other cgroups at creation is the current | |
349 | -value of their parents notify_on_release setting. The default value of | |
350 | -a cgroup hierarchy's release_agent path is empty. | |
351 | - | |
352 | -1.5 How do I use cgroups ? | |
353 | --------------------------- | |
354 | - | |
355 | -To start a new job that is to be contained within a cgroup, using | |
356 | -the "cpuset" cgroup subsystem, the steps are something like: | |
357 | - | |
358 | - 1) mkdir /dev/cgroup | |
359 | - 2) mount -t cgroup -ocpuset cpuset /dev/cgroup | |
360 | - 3) Create the new cgroup by doing mkdir's and write's (or echo's) in | |
361 | - the /dev/cgroup virtual file system. | |
362 | - 4) Start a task that will be the "founding father" of the new job. | |
363 | - 5) Attach that task to the new cgroup by writing its pid to the | |
364 | - /dev/cgroup tasks file for that cgroup. | |
365 | - 6) fork, exec or clone the job tasks from this founding father task. | |
366 | - | |
367 | -For example, the following sequence of commands will setup a cgroup | |
368 | -named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, | |
369 | -and then start a subshell 'sh' in that cgroup: | |
370 | - | |
371 | - mount -t cgroup cpuset -ocpuset /dev/cgroup | |
372 | - cd /dev/cgroup | |
373 | - mkdir Charlie | |
374 | - cd Charlie | |
375 | - /bin/echo 2-3 > cpuset.cpus | |
376 | - /bin/echo 1 > cpuset.mems | |
377 | - /bin/echo $$ > tasks | |
378 | - sh | |
379 | - # The subshell 'sh' is now running in cgroup Charlie | |
380 | - # The next line should display '/Charlie' | |
381 | - cat /proc/self/cgroup | |
382 | - | |
383 | -2. Usage Examples and Syntax | |
384 | -============================ | |
385 | - | |
386 | -2.1 Basic Usage | |
387 | ---------------- | |
388 | - | |
389 | -Creating, modifying, using the cgroups can be done through the cgroup | |
390 | -virtual filesystem. | |
391 | - | |
392 | -To mount a cgroup hierarchy will all available subsystems, type: | |
393 | -# mount -t cgroup xxx /dev/cgroup | |
394 | - | |
395 | -The "xxx" is not interpreted by the cgroup code, but will appear in | |
396 | -/proc/mounts so may be any useful identifying string that you like. | |
397 | - | |
398 | -To mount a cgroup hierarchy with just the cpuset and numtasks | |
399 | -subsystems, type: | |
400 | -# mount -t cgroup -o cpuset,numtasks hier1 /dev/cgroup | |
401 | - | |
402 | -To change the set of subsystems bound to a mounted hierarchy, just | |
403 | -remount with different options: | |
404 | - | |
405 | -# mount -o remount,cpuset,ns /dev/cgroup | |
406 | - | |
407 | -Note that changing the set of subsystems is currently only supported | |
408 | -when the hierarchy consists of a single (root) cgroup. Supporting | |
409 | -the ability to arbitrarily bind/unbind subsystems from an existing | |
410 | -cgroup hierarchy is intended to be implemented in the future. | |
411 | - | |
412 | -Then under /dev/cgroup you can find a tree that corresponds to the | |
413 | -tree of the cgroups in the system. For instance, /dev/cgroup | |
414 | -is the cgroup that holds the whole system. | |
415 | - | |
416 | -If you want to create a new cgroup under /dev/cgroup: | |
417 | -# cd /dev/cgroup | |
418 | -# mkdir my_cgroup | |
419 | - | |
420 | -Now you want to do something with this cgroup. | |
421 | -# cd my_cgroup | |
422 | - | |
423 | -In this directory you can find several files: | |
424 | -# ls | |
425 | -notify_on_release releasable tasks | |
426 | -(plus whatever files added by the attached subsystems) | |
427 | - | |
428 | -Now attach your shell to this cgroup: | |
429 | -# /bin/echo $$ > tasks | |
430 | - | |
431 | -You can also create cgroups inside your cgroup by using mkdir in this | |
432 | -directory. | |
433 | -# mkdir my_sub_cs | |
434 | - | |
435 | -To remove a cgroup, just use rmdir: | |
436 | -# rmdir my_sub_cs | |
437 | - | |
438 | -This will fail if the cgroup is in use (has cgroups inside, or | |
439 | -has processes attached, or is held alive by other subsystem-specific | |
440 | -reference). | |
441 | - | |
442 | -2.2 Attaching processes | |
443 | ------------------------ | |
444 | - | |
445 | -# /bin/echo PID > tasks | |
446 | - | |
447 | -Note that it is PID, not PIDs. You can only attach ONE task at a time. | |
448 | -If you have several tasks to attach, you have to do it one after another: | |
449 | - | |
450 | -# /bin/echo PID1 > tasks | |
451 | -# /bin/echo PID2 > tasks | |
452 | - ... | |
453 | -# /bin/echo PIDn > tasks | |
454 | - | |
455 | -You can attach the current shell task by echoing 0: | |
456 | - | |
457 | -# echo 0 > tasks | |
458 | - | |
459 | -3. Kernel API | |
460 | -============= | |
461 | - | |
462 | -3.1 Overview | |
463 | ------------- | |
464 | - | |
465 | -Each kernel subsystem that wants to hook into the generic cgroup | |
466 | -system needs to create a cgroup_subsys object. This contains | |
467 | -various methods, which are callbacks from the cgroup system, along | |
468 | -with a subsystem id which will be assigned by the cgroup system. | |
469 | - | |
470 | -Other fields in the cgroup_subsys object include: | |
471 | - | |
472 | -- subsys_id: a unique array index for the subsystem, indicating which | |
473 | - entry in cgroup->subsys[] this subsystem should be managing. | |
474 | - | |
475 | -- name: should be initialized to a unique subsystem name. Should be | |
476 | - no longer than MAX_CGROUP_TYPE_NAMELEN. | |
477 | - | |
478 | -- early_init: indicate if the subsystem needs early initialization | |
479 | - at system boot. | |
480 | - | |
481 | -Each cgroup object created by the system has an array of pointers, | |
482 | -indexed by subsystem id; this pointer is entirely managed by the | |
483 | -subsystem; the generic cgroup code will never touch this pointer. | |
484 | - | |
485 | -3.2 Synchronization | |
486 | -------------------- | |
487 | - | |
488 | -There is a global mutex, cgroup_mutex, used by the cgroup | |
489 | -system. This should be taken by anything that wants to modify a | |
490 | -cgroup. It may also be taken to prevent cgroups from being | |
491 | -modified, but more specific locks may be more appropriate in that | |
492 | -situation. | |
493 | - | |
494 | -See kernel/cgroup.c for more details. | |
495 | - | |
496 | -Subsystems can take/release the cgroup_mutex via the functions | |
497 | -cgroup_lock()/cgroup_unlock(). | |
498 | - | |
499 | -Accessing a task's cgroup pointer may be done in the following ways: | |
500 | -- while holding cgroup_mutex | |
501 | -- while holding the task's alloc_lock (via task_lock()) | |
502 | -- inside an rcu_read_lock() section via rcu_dereference() | |
503 | - | |
504 | -3.3 Subsystem API | |
505 | ------------------ | |
506 | - | |
507 | -Each subsystem should: | |
508 | - | |
509 | -- add an entry in linux/cgroup_subsys.h | |
510 | -- define a cgroup_subsys object called <name>_subsys | |
511 | - | |
512 | -Each subsystem may export the following methods. The only mandatory | |
513 | -methods are create/destroy. Any others that are null are presumed to | |
514 | -be successful no-ops. | |
515 | - | |
516 | -struct cgroup_subsys_state *create(struct cgroup_subsys *ss, | |
517 | - struct cgroup *cgrp) | |
518 | -(cgroup_mutex held by caller) | |
519 | - | |
520 | -Called to create a subsystem state object for a cgroup. The | |
521 | -subsystem should allocate its subsystem state object for the passed | |
522 | -cgroup, returning a pointer to the new object on success or a | |
523 | -negative error code. On success, the subsystem pointer should point to | |
524 | -a structure of type cgroup_subsys_state (typically embedded in a | |
525 | -larger subsystem-specific object), which will be initialized by the | |
526 | -cgroup system. Note that this will be called at initialization to | |
527 | -create the root subsystem state for this subsystem; this case can be | |
528 | -identified by the passed cgroup object having a NULL parent (since | |
529 | -it's the root of the hierarchy) and may be an appropriate place for | |
530 | -initialization code. | |
531 | - | |
532 | -void destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) | |
533 | -(cgroup_mutex held by caller) | |
534 | - | |
535 | -The cgroup system is about to destroy the passed cgroup; the subsystem | |
536 | -should do any necessary cleanup and free its subsystem state | |
537 | -object. By the time this method is called, the cgroup has already been | |
538 | -unlinked from the file system and from the child list of its parent; | |
539 | -cgroup->parent is still valid. (Note - can also be called for a | |
540 | -newly-created cgroup if an error occurs after this subsystem's | |
541 | -create() method has been called for the new cgroup). | |
542 | - | |
543 | -void pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); | |
544 | -(cgroup_mutex held by caller) | |
545 | - | |
546 | -Called before checking the reference count on each subsystem. This may | |
547 | -be useful for subsystems which have some extra references even if | |
548 | -there are not tasks in the cgroup. | |
549 | - | |
550 | -int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | |
551 | - struct task_struct *task) | |
552 | -(cgroup_mutex held by caller) | |
553 | - | |
554 | -Called prior to moving a task into a cgroup; if the subsystem | |
555 | -returns an error, this will abort the attach operation. If a NULL | |
556 | -task is passed, then a successful result indicates that *any* | |
557 | -unspecified task can be moved into the cgroup. Note that this isn't | |
558 | -called on a fork. If this method returns 0 (success) then this should | |
559 | -remain valid while the caller holds cgroup_mutex. | |
560 | - | |
561 | -void attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | |
562 | - struct cgroup *old_cgrp, struct task_struct *task) | |
563 | - | |
564 | -Called after the task has been attached to the cgroup, to allow any | |
565 | -post-attachment activity that requires memory allocations or blocking. | |
566 | - | |
567 | -void fork(struct cgroup_subsy *ss, struct task_struct *task) | |
568 | - | |
569 | -Called when a task is forked into a cgroup. | |
570 | - | |
571 | -void exit(struct cgroup_subsys *ss, struct task_struct *task) | |
572 | - | |
573 | -Called during task exit. | |
574 | - | |
575 | -int populate(struct cgroup_subsys *ss, struct cgroup *cgrp) | |
576 | - | |
577 | -Called after creation of a cgroup to allow a subsystem to populate | |
578 | -the cgroup directory with file entries. The subsystem should make | |
579 | -calls to cgroup_add_file() with objects of type cftype (see | |
580 | -include/linux/cgroup.h for details). Note that although this | |
581 | -method can return an error code, the error code is currently not | |
582 | -always handled well. | |
583 | - | |
584 | -void post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp) | |
585 | - | |
586 | -Called at the end of cgroup_clone() to do any paramater | |
587 | -initialization which might be required before a task could attach. For | |
588 | -example in cpusets, no task may attach before 'cpus' and 'mems' are set | |
589 | -up. | |
590 | - | |
591 | -void bind(struct cgroup_subsys *ss, struct cgroup *root) | |
592 | -(cgroup_mutex held by caller) | |
593 | - | |
594 | -Called when a cgroup subsystem is rebound to a different hierarchy | |
595 | -and root cgroup. Currently this will only involve movement between | |
596 | -the default hierarchy (which never has sub-cgroups) and a hierarchy | |
597 | -that is being created/destroyed (and hence has no sub-cgroups). | |
598 | - | |
599 | -4. Questions | |
600 | -============ | |
601 | - | |
602 | -Q: what's up with this '/bin/echo' ? | |
603 | -A: bash's builtin 'echo' command does not check calls to write() against | |
604 | - errors. If you use it in the cgroup file system, you won't be | |
605 | - able to tell whether a command succeeded or failed. | |
606 | - | |
607 | -Q: When I attach processes, only the first of the line gets really attached ! | |
608 | -A: We can only return one error code per call to write(). So you should also | |
609 | - put only ONE pid. | |
610 | - | |
611 | --- /dev/null | |
612 | +++ b/Documentation/cgroups/cgroups.txt | |
613 | @@ -0,0 +1,548 @@ | |
614 | + CGROUPS | |
615 | + ------- | |
616 | + | |
617 | +Written by Paul Menage <menage@google.com> based on Documentation/cpusets.txt | |
618 | + | |
619 | +Original copyright statements from cpusets.txt: | |
620 | +Portions Copyright (C) 2004 BULL SA. | |
621 | +Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. | |
622 | +Modified by Paul Jackson <pj@sgi.com> | |
623 | +Modified by Christoph Lameter <clameter@sgi.com> | |
624 | + | |
625 | +CONTENTS: | |
626 | +========= | |
627 | + | |
628 | +1. Control Groups | |
629 | + 1.1 What are cgroups ? | |
630 | + 1.2 Why are cgroups needed ? | |
631 | + 1.3 How are cgroups implemented ? | |
632 | + 1.4 What does notify_on_release do ? | |
633 | + 1.5 How do I use cgroups ? | |
634 | +2. Usage Examples and Syntax | |
635 | + 2.1 Basic Usage | |
636 | + 2.2 Attaching processes | |
637 | +3. Kernel API | |
638 | + 3.1 Overview | |
639 | + 3.2 Synchronization | |
640 | + 3.3 Subsystem API | |
641 | +4. Questions | |
642 | + | |
643 | +1. Control Groups | |
644 | +================= | |
645 | + | |
646 | +1.1 What are cgroups ? | |
647 | +---------------------- | |
648 | + | |
649 | +Control Groups provide a mechanism for aggregating/partitioning sets of | |
650 | +tasks, and all their future children, into hierarchical groups with | |
651 | +specialized behaviour. | |
652 | + | |
653 | +Definitions: | |
654 | + | |
655 | +A *cgroup* associates a set of tasks with a set of parameters for one | |
656 | +or more subsystems. | |
657 | + | |
658 | +A *subsystem* is a module that makes use of the task grouping | |
659 | +facilities provided by cgroups to treat groups of tasks in | |
660 | +particular ways. A subsystem is typically a "resource controller" that | |
661 | +schedules a resource or applies per-cgroup limits, but it may be | |
662 | +anything that wants to act on a group of processes, e.g. a | |
663 | +virtualization subsystem. | |
664 | + | |
665 | +A *hierarchy* is a set of cgroups arranged in a tree, such that | |
666 | +every task in the system is in exactly one of the cgroups in the | |
667 | +hierarchy, and a set of subsystems; each subsystem has system-specific | |
668 | +state attached to each cgroup in the hierarchy. Each hierarchy has | |
669 | +an instance of the cgroup virtual filesystem associated with it. | |
670 | + | |
671 | +At any one time there may be multiple active hierarchies of task | |
672 | +cgroups. Each hierarchy is a partition of all tasks in the system. | |
673 | + | |
674 | +User level code may create and destroy cgroups by name in an | |
675 | +instance of the cgroup virtual file system, specify and query to | |
676 | +which cgroup a task is assigned, and list the task pids assigned to | |
677 | +a cgroup. Those creations and assignments only affect the hierarchy | |
678 | +associated with that instance of the cgroup file system. | |
679 | + | |
680 | +On their own, the only use for cgroups is for simple job | |
681 | +tracking. The intention is that other subsystems hook into the generic | |
682 | +cgroup support to provide new attributes for cgroups, such as | |
683 | +accounting/limiting the resources which processes in a cgroup can | |
684 | +access. For example, cpusets (see Documentation/cpusets.txt) allows | |
685 | +you to associate a set of CPUs and a set of memory nodes with the | |
686 | +tasks in each cgroup. | |
687 | + | |
688 | +1.2 Why are cgroups needed ? | |
689 | +---------------------------- | |
690 | + | |
691 | +There are multiple efforts to provide process aggregations in the | |
692 | +Linux kernel, mainly for resource tracking purposes. Such efforts | |
693 | +include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server | |
694 | +namespaces. These all require the basic notion of a | |
695 | +grouping/partitioning of processes, with newly forked processes ending | |
696 | +in the same group (cgroup) as their parent process. | |
697 | + | |
698 | +The kernel cgroup patch provides the minimum essential kernel | |
699 | +mechanisms required to efficiently implement such groups. It has | |
700 | +minimal impact on the system fast paths, and provides hooks for | |
701 | +specific subsystems such as cpusets to provide additional behaviour as | |
702 | +desired. | |
703 | + | |
704 | +Multiple hierarchy support is provided to allow for situations where | |
705 | +the division of tasks into cgroups is distinctly different for | |
706 | +different subsystems - having parallel hierarchies allows each | |
707 | +hierarchy to be a natural division of tasks, without having to handle | |
708 | +complex combinations of tasks that would be present if several | |
709 | +unrelated subsystems needed to be forced into the same tree of | |
710 | +cgroups. | |
711 | + | |
712 | +At one extreme, each resource controller or subsystem could be in a | |
713 | +separate hierarchy; at the other extreme, all subsystems | |
714 | +would be attached to the same hierarchy. | |
715 | + | |
716 | +As an example of a scenario (originally proposed by vatsa@in.ibm.com) | |
717 | +that can benefit from multiple hierarchies, consider a large | |
718 | +university server with various users - students, professors, system | |
719 | +tasks etc. The resource planning for this server could be along the | |
720 | +following lines: | |
721 | + | |
722 | + CPU : Top cpuset | |
723 | + / \ | |
724 | + CPUSet1 CPUSet2 | |
725 | + | | | |
726 | + (Profs) (Students) | |
727 | + | |
728 | + In addition (system tasks) are attached to topcpuset (so | |
729 | + that they can run anywhere) with a limit of 20% | |
730 | + | |
731 | + Memory : Professors (50%), students (30%), system (20%) | |
732 | + | |
733 | + Disk : Prof (50%), students (30%), system (20%) | |
734 | + | |
735 | + Network : WWW browsing (20%), Network File System (60%), others (20%) | |
736 | + / \ | |
737 | + Prof (15%) students (5%) | |
738 | + | |
739 | +Browsers like firefox/lynx go into the WWW network class, while (k)nfsd go | |
740 | +into NFS network class. | |
741 | + | |
742 | +At the same time firefox/lynx will share an appropriate CPU/Memory class | |
743 | +depending on who launched it (prof/student). | |
744 | + | |
745 | +With the ability to classify tasks differently for different resources | |
746 | +(by putting those resource subsystems in different hierarchies) then | |
747 | +the admin can easily set up a script which receives exec notifications | |
748 | +and depending on who is launching the browser he can | |
749 | + | |
750 | + # echo browser_pid > /mnt/<restype>/<userclass>/tasks | |
751 | + | |
752 | +With only a single hierarchy, he now would potentially have to create | |
753 | +a separate cgroup for every browser launched and associate it with | |
754 | +appropriate network and other resource class. This may lead to | |
755 | +proliferation of such cgroups. | |
756 | + | |
757 | +Also let's say that the administrator would like to give enhanced network | |
758 | +access temporarily to a student's browser (since it is night and the user | |
759 | +wants to do online gaming :)) OR give one of the student's simulation | |
760 | +apps enhanced CPU power, | |
761 | + | |
762 | +With ability to write pids directly to resource classes, it's just a | |
763 | +matter of : | |
764 | + | |
765 | + # echo pid > /mnt/network/<new_class>/tasks | |
766 | + (after some time) | |
767 | + # echo pid > /mnt/network/<orig_class>/tasks | |
768 | + | |
769 | +Without this ability, he would have to split the cgroup into | |
770 | +multiple separate ones and then associate the new cgroups with the | |
771 | +new resource classes. | |
772 | + | |
773 | + | |
774 | + | |
775 | +1.3 How are cgroups implemented ? | |
776 | +--------------------------------- | |
777 | + | |
778 | +Control Groups extends the kernel as follows: | |
779 | + | |
780 | + - Each task in the system has a reference-counted pointer to a | |
781 | + css_set. | |
782 | + | |
783 | + - A css_set contains a set of reference-counted pointers to | |
784 | + cgroup_subsys_state objects, one for each cgroup subsystem | |
785 | + registered in the system. There is no direct link from a task to | |
786 | + the cgroup of which it's a member in each hierarchy, but this | |
787 | + can be determined by following pointers through the | |
788 | + cgroup_subsys_state objects. This is because accessing the | |
789 | + subsystem state is something that's expected to happen frequently | |
790 | + and in performance-critical code, whereas operations that require a | |
791 | + task's actual cgroup assignments (in particular, moving between | |
792 | + cgroups) are less common. A linked list runs through the cg_list | |
793 | + field of each task_struct using the css_set, anchored at | |
794 | + css_set->tasks. | |
795 | + | |
796 | + - A cgroup hierarchy filesystem can be mounted for browsing and | |
797 | + manipulation from user space. | |
798 | + | |
799 | + - You can list all the tasks (by pid) attached to any cgroup. | |
800 | + | |
801 | +The implementation of cgroups requires a few, simple hooks | |
802 | +into the rest of the kernel, none in performance critical paths: | |
803 | + | |
804 | + - in init/main.c, to initialize the root cgroups and initial | |
805 | + css_set at system boot. | |
806 | + | |
807 | + - in fork and exit, to attach and detach a task from its css_set. | |
808 | + | |
809 | +In addition a new file system, of type "cgroup" may be mounted, to | |
810 | +enable browsing and modifying the cgroups presently known to the | |
811 | +kernel. When mounting a cgroup hierarchy, you may specify a | |
812 | +comma-separated list of subsystems to mount as the filesystem mount | |
813 | +options. By default, mounting the cgroup filesystem attempts to | |
814 | +mount a hierarchy containing all registered subsystems. | |
815 | + | |
816 | +If an active hierarchy with exactly the same set of subsystems already | |
817 | +exists, it will be reused for the new mount. If no existing hierarchy | |
818 | +matches, and any of the requested subsystems are in use in an existing | |
819 | +hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy | |
820 | +is activated, associated with the requested subsystems. | |
821 | + | |
822 | +It's not currently possible to bind a new subsystem to an active | |
823 | +cgroup hierarchy, or to unbind a subsystem from an active cgroup | |
824 | +hierarchy. This may be possible in future, but is fraught with nasty | |
825 | +error-recovery issues. | |
826 | + | |
827 | +When a cgroup filesystem is unmounted, if there are any | |
828 | +child cgroups created below the top-level cgroup, that hierarchy | |
829 | +will remain active even though unmounted; if there are no | |
830 | +child cgroups then the hierarchy will be deactivated. | |
831 | + | |
832 | +No new system calls are added for cgroups - all support for | |
833 | +querying and modifying cgroups is via this cgroup file system. | |
834 | + | |
835 | +Each task under /proc has an added file named 'cgroup' displaying, | |
836 | +for each active hierarchy, the subsystem names and the cgroup name | |
837 | +as the path relative to the root of the cgroup file system. | |
838 | + | |
839 | +Each cgroup is represented by a directory in the cgroup file system | |
840 | +containing the following files describing that cgroup: | |
841 | + | |
842 | + - tasks: list of tasks (by pid) attached to that cgroup | |
843 | + - releasable flag: cgroup currently removable? | |
844 | + - notify_on_release flag: run the release agent on exit? | |
845 | + - release_agent: the path to use for release notifications (this file | |
846 | + exists in the top cgroup only) | |
847 | + | |
848 | +Other subsystems such as cpusets may add additional files in each | |
849 | +cgroup dir. | |
850 | + | |
851 | +New cgroups are created using the mkdir system call or shell | |
852 | +command. The properties of a cgroup, such as its flags, are | |
853 | +modified by writing to the appropriate file in that cgroup's | |
854 | +directory, as listed above. | |
855 | + | |
856 | +The named hierarchical structure of nested cgroups allows partitioning | |
857 | +a large system into nested, dynamically changeable, "soft-partitions". | |
858 | + | |
859 | +The attachment of each task, automatically inherited at fork by any | |
860 | +children of that task, to a cgroup allows organizing the work load | |
861 | +on a system into related sets of tasks. A task may be re-attached to | |
862 | +any other cgroup, if allowed by the permissions on the necessary | |
863 | +cgroup file system directories. | |
864 | + | |
865 | +When a task is moved from one cgroup to another, it gets a new | |
866 | +css_set pointer - if there's an already existing css_set with the | |
867 | +desired collection of cgroups then that group is reused, else a new | |
868 | +css_set is allocated. Note that the current implementation uses a | |
869 | +linear search to locate an appropriate existing css_set, so isn't | |
870 | +very efficient. A future version will use a hash table for better | |
871 | +performance. | |
872 | + | |
873 | +To allow access from a cgroup to the css_sets (and hence tasks) | |
874 | +that comprise it, a set of cg_cgroup_link objects form a lattice; | |
875 | +each cg_cgroup_link is linked into a list of cg_cgroup_links for | |
876 | +a single cgroup on its cgrp_link_list field, and a list of | |
877 | +cg_cgroup_links for a single css_set on its cg_link_list. | |
878 | + | |
879 | +Thus the set of tasks in a cgroup can be listed by iterating over | |
880 | +each css_set that references the cgroup, and sub-iterating over | |
881 | +each css_set's task set. | |
882 | + | |
883 | +The use of a Linux virtual file system (vfs) to represent the | |
884 | +cgroup hierarchy provides for a familiar permission and name space | |
885 | +for cgroups, with a minimum of additional kernel code. | |
886 | + | |
887 | +1.4 What does notify_on_release do ? | |
888 | +------------------------------------ | |
889 | + | |
890 | +If the notify_on_release flag is enabled (1) in a cgroup, then | |
891 | +whenever the last task in the cgroup leaves (exits or attaches to | |
892 | +some other cgroup) and the last child cgroup of that cgroup | |
893 | +is removed, then the kernel runs the command specified by the contents | |
894 | +of the "release_agent" file in that hierarchy's root directory, | |
895 | +supplying the pathname (relative to the mount point of the cgroup | |
896 | +file system) of the abandoned cgroup. This enables automatic | |
897 | +removal of abandoned cgroups. The default value of | |
898 | +notify_on_release in the root cgroup at system boot is disabled | |
899 | +(0). The default value of other cgroups at creation is the current | |
900 | +value of their parent's notify_on_release setting. The default value of | |
901 | +a cgroup hierarchy's release_agent path is empty. | |
902 | + | |
903 | +1.5 How do I use cgroups ? | |
904 | +-------------------------- | |
905 | + | |
906 | +To start a new job that is to be contained within a cgroup, using | |
907 | +the "cpuset" cgroup subsystem, the steps are something like: | |
908 | + | |
909 | + 1) mkdir /dev/cgroup | |
910 | + 2) mount -t cgroup -ocpuset cpuset /dev/cgroup | |
911 | + 3) Create the new cgroup by doing mkdir's and write's (or echo's) in | |
912 | + the /dev/cgroup virtual file system. | |
913 | + 4) Start a task that will be the "founding father" of the new job. | |
914 | + 5) Attach that task to the new cgroup by writing its pid to the | |
915 | + /dev/cgroup tasks file for that cgroup. | |
916 | + 6) fork, exec or clone the job tasks from this founding father task. | |
917 | + | |
918 | +For example, the following sequence of commands will setup a cgroup | |
919 | +named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, | |
920 | +and then start a subshell 'sh' in that cgroup: | |
921 | + | |
922 | + mount -t cgroup cpuset -ocpuset /dev/cgroup | |
923 | + cd /dev/cgroup | |
924 | + mkdir Charlie | |
925 | + cd Charlie | |
926 | + /bin/echo 2-3 > cpuset.cpus | |
927 | + /bin/echo 1 > cpuset.mems | |
928 | + /bin/echo $$ > tasks | |
929 | + sh | |
930 | + # The subshell 'sh' is now running in cgroup Charlie | |
931 | + # The next line should display '/Charlie' | |
932 | + cat /proc/self/cgroup | |
933 | + | |
934 | +2. Usage Examples and Syntax | |
935 | +============================ | |
936 | + | |
937 | +2.1 Basic Usage | |
938 | +--------------- | |
939 | + | |
940 | +Creating, modifying, using the cgroups can be done through the cgroup | |
941 | +virtual filesystem. | |
942 | + | |
943 | +To mount a cgroup hierarchy with all available subsystems, type: | |
944 | +# mount -t cgroup xxx /dev/cgroup | |
945 | + | |
946 | +The "xxx" is not interpreted by the cgroup code, but will appear in | |
947 | +/proc/mounts so may be any useful identifying string that you like. | |
948 | + | |
949 | +To mount a cgroup hierarchy with just the cpuset and numtasks | |
950 | +subsystems, type: | |
951 | +# mount -t cgroup -o cpuset,numtasks hier1 /dev/cgroup | |
952 | + | |
953 | +To change the set of subsystems bound to a mounted hierarchy, just | |
954 | +remount with different options: | |
955 | + | |
956 | +# mount -o remount,cpuset,ns /dev/cgroup | |
957 | + | |
958 | +Note that changing the set of subsystems is currently only supported | |
959 | +when the hierarchy consists of a single (root) cgroup. Supporting | |
960 | +the ability to arbitrarily bind/unbind subsystems from an existing | |
961 | +cgroup hierarchy is intended to be implemented in the future. | |
962 | + | |
963 | +Then under /dev/cgroup you can find a tree that corresponds to the | |
964 | +tree of the cgroups in the system. For instance, /dev/cgroup | |
965 | +is the cgroup that holds the whole system. | |
966 | + | |
967 | +If you want to create a new cgroup under /dev/cgroup: | |
968 | +# cd /dev/cgroup | |
969 | +# mkdir my_cgroup | |
970 | + | |
971 | +Now you want to do something with this cgroup. | |
972 | +# cd my_cgroup | |
973 | + | |
974 | +In this directory you can find several files: | |
975 | +# ls | |
976 | +notify_on_release releasable tasks | |
977 | +(plus whatever files added by the attached subsystems) | |
978 | + | |
979 | +Now attach your shell to this cgroup: | |
980 | +# /bin/echo $$ > tasks | |
981 | + | |
982 | +You can also create cgroups inside your cgroup by using mkdir in this | |
983 | +directory. | |
984 | +# mkdir my_sub_cs | |
985 | + | |
986 | +To remove a cgroup, just use rmdir: | |
987 | +# rmdir my_sub_cs | |
988 | + | |
989 | +This will fail if the cgroup is in use (has cgroups inside, or | |
990 | +has processes attached, or is held alive by other subsystem-specific | |
991 | +reference). | |
992 | + | |
993 | +2.2 Attaching processes | |
994 | +----------------------- | |
995 | + | |
996 | +# /bin/echo PID > tasks | |
997 | + | |
998 | +Note that it is PID, not PIDs. You can only attach ONE task at a time. | |
999 | +If you have several tasks to attach, you have to do it one after another: | |
1000 | + | |
1001 | +# /bin/echo PID1 > tasks | |
1002 | +# /bin/echo PID2 > tasks | |
1003 | + ... | |
1004 | +# /bin/echo PIDn > tasks | |
1005 | + | |
1006 | +You can attach the current shell task by echoing 0: | |
1007 | + | |
1008 | +# echo 0 > tasks | |
1009 | + | |
1010 | +3. Kernel API | |
1011 | +============= | |
1012 | + | |
1013 | +3.1 Overview | |
1014 | +------------ | |
1015 | + | |
1016 | +Each kernel subsystem that wants to hook into the generic cgroup | |
1017 | +system needs to create a cgroup_subsys object. This contains | |
1018 | +various methods, which are callbacks from the cgroup system, along | |
1019 | +with a subsystem id which will be assigned by the cgroup system. | |
1020 | + | |
1021 | +Other fields in the cgroup_subsys object include: | |
1022 | + | |
1023 | +- subsys_id: a unique array index for the subsystem, indicating which | |
1024 | + entry in cgroup->subsys[] this subsystem should be managing. | |
1025 | + | |
1026 | +- name: should be initialized to a unique subsystem name. Should be | |
1027 | + no longer than MAX_CGROUP_TYPE_NAMELEN. | |
1028 | + | |
1029 | +- early_init: indicate if the subsystem needs early initialization | |
1030 | + at system boot. | |
1031 | + | |
1032 | +Each cgroup object created by the system has an array of pointers, | |
1033 | +indexed by subsystem id; this pointer is entirely managed by the | |
1034 | +subsystem; the generic cgroup code will never touch this pointer. | |
1035 | + | |
1036 | +3.2 Synchronization | |
1037 | +------------------- | |
1038 | + | |
1039 | +There is a global mutex, cgroup_mutex, used by the cgroup | |
1040 | +system. This should be taken by anything that wants to modify a | |
1041 | +cgroup. It may also be taken to prevent cgroups from being | |
1042 | +modified, but more specific locks may be more appropriate in that | |
1043 | +situation. | |
1044 | + | |
1045 | +See kernel/cgroup.c for more details. | |
1046 | + | |
1047 | +Subsystems can take/release the cgroup_mutex via the functions | |
1048 | +cgroup_lock()/cgroup_unlock(). | |
1049 | + | |
1050 | +Accessing a task's cgroup pointer may be done in the following ways: | |
1051 | +- while holding cgroup_mutex | |
1052 | +- while holding the task's alloc_lock (via task_lock()) | |
1053 | +- inside an rcu_read_lock() section via rcu_dereference() | |
1054 | + | |
1055 | +3.3 Subsystem API | |
1056 | +----------------- | |
1057 | + | |
1058 | +Each subsystem should: | |
1059 | + | |
1060 | +- add an entry in linux/cgroup_subsys.h | |
1061 | +- define a cgroup_subsys object called <name>_subsys | |
1062 | + | |
1063 | +Each subsystem may export the following methods. The only mandatory | |
1064 | +methods are create/destroy. Any others that are null are presumed to | |
1065 | +be successful no-ops. | |
1066 | + | |
1067 | +struct cgroup_subsys_state *create(struct cgroup_subsys *ss, | |
1068 | + struct cgroup *cgrp) | |
1069 | +(cgroup_mutex held by caller) | |
1070 | + | |
1071 | +Called to create a subsystem state object for a cgroup. The | |
1072 | +subsystem should allocate its subsystem state object for the passed | |
1073 | +cgroup, returning a pointer to the new object on success or a | |
1074 | +negative error code. On success, the subsystem pointer should point to | |
1075 | +a structure of type cgroup_subsys_state (typically embedded in a | |
1076 | +larger subsystem-specific object), which will be initialized by the | |
1077 | +cgroup system. Note that this will be called at initialization to | |
1078 | +create the root subsystem state for this subsystem; this case can be | |
1079 | +identified by the passed cgroup object having a NULL parent (since | |
1080 | +it's the root of the hierarchy) and may be an appropriate place for | |
1081 | +initialization code. | |
1082 | + | |
1083 | +void destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) | |
1084 | +(cgroup_mutex held by caller) | |
1085 | + | |
1086 | +The cgroup system is about to destroy the passed cgroup; the subsystem | |
1087 | +should do any necessary cleanup and free its subsystem state | |
1088 | +object. By the time this method is called, the cgroup has already been | |
1089 | +unlinked from the file system and from the child list of its parent; | |
1090 | +cgroup->parent is still valid. (Note - can also be called for a | |
1091 | +newly-created cgroup if an error occurs after this subsystem's | |
1092 | +create() method has been called for the new cgroup). | |
1093 | + | |
1094 | +void pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); | |
1095 | +(cgroup_mutex held by caller) | |
1096 | + | |
1097 | +Called before checking the reference count on each subsystem. This may | |
1098 | +be useful for subsystems which have some extra references even if | |
1099 | +there are no tasks in the cgroup. | |
1100 | + | |
1101 | +int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | |
1102 | + struct task_struct *task) | |
1103 | +(cgroup_mutex held by caller) | |
1104 | + | |
1105 | +Called prior to moving a task into a cgroup; if the subsystem | |
1106 | +returns an error, this will abort the attach operation. If a NULL | |
1107 | +task is passed, then a successful result indicates that *any* | |
1108 | +unspecified task can be moved into the cgroup. Note that this isn't | |
1109 | +called on a fork. If this method returns 0 (success) then this should | |
1110 | +remain valid while the caller holds cgroup_mutex. | |
1111 | + | |
1112 | +void attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | |
1113 | + struct cgroup *old_cgrp, struct task_struct *task) | |
1114 | + | |
1115 | +Called after the task has been attached to the cgroup, to allow any | |
1116 | +post-attachment activity that requires memory allocations or blocking. | |
1117 | + | |
1118 | +void fork(struct cgroup_subsys *ss, struct task_struct *task) | |
1119 | + | |
1120 | +Called when a task is forked into a cgroup. | |
1121 | + | |
1122 | +void exit(struct cgroup_subsys *ss, struct task_struct *task) | |
1123 | + | |
1124 | +Called during task exit. | |
1125 | + | |
1126 | +int populate(struct cgroup_subsys *ss, struct cgroup *cgrp) | |
1127 | + | |
1128 | +Called after creation of a cgroup to allow a subsystem to populate | |
1129 | +the cgroup directory with file entries. The subsystem should make | |
1130 | +calls to cgroup_add_file() with objects of type cftype (see | |
1131 | +include/linux/cgroup.h for details). Note that although this | |
1132 | +method can return an error code, the error code is currently not | |
1133 | +always handled well. | |
1134 | + | |
1135 | +void post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp) | |
1136 | + | |
1137 | +Called at the end of cgroup_clone() to do any parameter | |
1138 | +initialization which might be required before a task could attach. For | |
1139 | +example in cpusets, no task may attach before 'cpus' and 'mems' are set | |
1140 | +up. | |
1141 | + | |
1142 | +void bind(struct cgroup_subsys *ss, struct cgroup *root) | |
1143 | +(cgroup_mutex held by caller) | |
1144 | + | |
1145 | +Called when a cgroup subsystem is rebound to a different hierarchy | |
1146 | +and root cgroup. Currently this will only involve movement between | |
1147 | +the default hierarchy (which never has sub-cgroups) and a hierarchy | |
1148 | +that is being created/destroyed (and hence has no sub-cgroups). | |
1149 | + | |
1150 | +4. Questions | |
1151 | +============ | |
1152 | + | |
1153 | +Q: what's up with this '/bin/echo' ? | |
1154 | +A: bash's builtin 'echo' command does not check calls to write() against | |
1155 | + errors. If you use it in the cgroup file system, you won't be | |
1156 | + able to tell whether a command succeeded or failed. | |
1157 | + | |
1158 | +Q: When I attach processes, only the first of the line gets really attached ! | |
1159 | +A: We can only return one error code per call to write(). So you should also | |
1160 | + put only ONE pid. | |
1161 | + | |
1162 | --- /dev/null | |
1163 | +++ b/Documentation/cgroups/freezer-subsystem.txt | |
1164 | @@ -0,0 +1,102 @@ | |
1165 | +The cgroup freezer is useful to batch job management systems which start | |
1166 | +and stop sets of tasks in order to schedule the resources of a machine | |
1167 | +according to the desires of a system administrator. This sort of program | |
1168 | +is often used on HPC clusters to schedule access to the cluster as a | |
1169 | +whole. The cgroup freezer uses cgroups to describe the set of tasks to | |
1170 | +be started/stopped by the batch job management system. It also provides | |
1171 | +a means to start and stop the tasks composing the job. | |
1172 | + | |
1173 | +The cgroup freezer will also be useful for checkpointing running groups | |
1174 | +of tasks. The freezer allows the checkpoint code to obtain a consistent | |
1175 | +image of the tasks by attempting to force the tasks in a cgroup into a | |
1176 | +quiescent state. Once the tasks are quiescent another task can | |
1177 | +walk /proc or invoke a kernel interface to gather information about the | |
1178 | +quiesced tasks. Checkpointed tasks can be restarted later should a | |
1179 | +recoverable error occur. This also allows the checkpointed tasks to be | |
1180 | +migrated between nodes in a cluster by copying the gathered information | |
1181 | +to another node and restarting the tasks there. | |
1182 | + | |
1183 | +Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping | |
1184 | +and resuming tasks in userspace. Both of these signals are observable | |
1185 | +from within the tasks we wish to freeze. While SIGSTOP cannot be caught, | |
1186 | +blocked, or ignored it can be seen by waiting or ptracing parent tasks. | |
1187 | +SIGCONT is especially unsuitable since it can be caught by the task. Any | |
1188 | +programs designed to watch for SIGSTOP and SIGCONT could be broken by | |
1189 | +attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can | |
1190 | +demonstrate this problem using nested bash shells: | |
1191 | + | |
1192 | + $ echo $$ | |
1193 | + 16644 | |
1194 | + $ bash | |
1195 | + $ echo $$ | |
1196 | + 16690 | |
1197 | + | |
1198 | + From a second, unrelated bash shell: | |
1199 | + $ kill -SIGSTOP 16690 | |
1200 | + $ kill -SIGCONT 16690 | |
1201 | + | |
1202 | + <at this point 16690 exits and causes 16644 to exit too> | |
1203 | + | |
1204 | +This happens because bash can observe both signals and choose how it | |
1205 | +responds to them. | |
1206 | + | |
1207 | +Another example of a program which catches and responds to these | |
1208 | +signals is gdb. In fact any program designed to use ptrace is likely to | |
1209 | +have a problem with this method of stopping and resuming tasks. | |
1210 | + | |
1211 | +In contrast, the cgroup freezer uses the kernel freezer code to | |
1212 | +prevent the freeze/unfreeze cycle from becoming visible to the tasks | |
1213 | +being frozen. This allows the bash example above and gdb to run as | |
1214 | +expected. | |
1215 | + | |
1216 | +The freezer subsystem in the container filesystem defines a file named | |
1217 | +freezer.state. Writing "FROZEN" to the state file will freeze all tasks in the | |
1218 | +cgroup. Subsequently writing "THAWED" will unfreeze the tasks in the cgroup. | |
1219 | +Reading will return the current state. | |
1220 | + | |
1221 | +Note freezer.state doesn't exist in root cgroup, which means root cgroup | |
1222 | +is non-freezable. | |
1223 | + | |
1224 | +* Examples of usage : | |
1225 | + | |
1226 | + # mkdir /containers | |
1227 | + # mount -t cgroup -ofreezer freezer /containers | |
1228 | + # mkdir /containers/0 | |
1229 | + # echo $some_pid > /containers/0/tasks | |
1230 | + | |
1231 | +to get status of the freezer subsystem : | |
1232 | + | |
1233 | + # cat /containers/0/freezer.state | |
1234 | + THAWED | |
1235 | + | |
1236 | +to freeze all tasks in the container : | |
1237 | + | |
1238 | + # echo FROZEN > /containers/0/freezer.state | |
1239 | + # cat /containers/0/freezer.state | |
1240 | + FREEZING | |
1241 | + # cat /containers/0/freezer.state | |
1242 | + FROZEN | |
1243 | + | |
1244 | +to unfreeze all tasks in the container : | |
1245 | + | |
1246 | + # echo THAWED > /containers/0/freezer.state | |
1247 | + # cat /containers/0/freezer.state | |
1248 | + THAWED | |
1249 | + | |
1250 | +This is the basic mechanism which should do the right thing for user space tasks | |
1251 | +in a simple scenario. | |
1252 | + | |
1253 | +It's important to note that freezing can be incomplete. In that case we return | |
1254 | +EBUSY. This means that some tasks in the cgroup are busy doing something that | |
1255 | +prevents us from completely freezing the cgroup at this time. After EBUSY, | |
1256 | +the cgroup will remain partially frozen -- reflected by freezer.state reporting | |
1257 | +"FREEZING" when read. The state will remain "FREEZING" until one of these | |
1258 | +things happens: | |
1259 | + | |
1260 | + 1) Userspace cancels the freezing operation by writing "THAWED" to | |
1261 | + the freezer.state file | |
1262 | + 2) Userspace retries the freezing operation by writing "FROZEN" to | |
1263 | + the freezer.state file (writing "FREEZING" is not legal | |
1264 | + and returns EINVAL) | |
1265 | + 3) The tasks that blocked the cgroup from entering the "FROZEN" | |
1266 | + state disappear from the cgroup's set of tasks. | |
1267 | --- a/Documentation/cpusets.txt | |
1268 | +++ b/Documentation/cpusets.txt | |
1269 | @@ -48,7 +48,7 @@ hooks, beyond what is already present, r | |
1270 | job placement on large systems. | |
1271 | ||
1272 | Cpusets use the generic cgroup subsystem described in | |
1273 | -Documentation/cgroup.txt. | |
1274 | +Documentation/cgroups/cgroups.txt. | |
1275 | ||
1276 | Requests by a task, using the sched_setaffinity(2) system call to | |
1277 | include CPUs in its CPU affinity mask, and using the mbind(2) and | |
1278 | --- a/arch/alpha/Kconfig | |
1279 | +++ b/arch/alpha/Kconfig | |
1280 | @@ -72,6 +72,7 @@ config ARCH_SUPPORTS_AOUT | |
1281 | def_bool y | |
1282 | ||
1283 | source "init/Kconfig" | |
1284 | +source "kernel/Kconfig.freezer" | |
1285 | ||
1286 | ||
1287 | menu "System setup" | |
1288 | --- a/arch/alpha/include/asm/thread_info.h | |
1289 | +++ b/arch/alpha/include/asm/thread_info.h | |
1290 | @@ -74,12 +74,14 @@ register struct thread_info *__current_t | |
1291 | #define TIF_UAC_SIGBUS 7 | |
1292 | #define TIF_MEMDIE 8 | |
1293 | #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal */ | |
1294 | +#define TIF_FREEZE 16 /* is freezing for suspend */ | |
1295 | ||
1296 | #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) | |
1297 | #define _TIF_SIGPENDING (1<<TIF_SIGPENDING) | |
1298 | #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) | |
1299 | #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) | |
1300 | #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK) | |
1301 | +#define _TIF_FREEZE (1<<TIF_FREEZE) | |
1302 | ||
1303 | /* Work to do on interrupt/exception return. */ | |
1304 | #define _TIF_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED) | |
1305 | --- a/arch/arm/Kconfig | |
1306 | +++ b/arch/arm/Kconfig | |
1307 | @@ -190,6 +190,8 @@ config VECTORS_BASE | |
1308 | ||
1309 | source "init/Kconfig" | |
1310 | ||
1311 | +source "kernel/Kconfig.freezer" | |
1312 | + | |
1313 | menu "System Type" | |
1314 | ||
1315 | choice | |
1316 | --- a/arch/avr32/Kconfig | |
1317 | +++ b/arch/avr32/Kconfig | |
1318 | @@ -72,6 +72,8 @@ config GENERIC_BUG | |
1319 | ||
1320 | source "init/Kconfig" | |
1321 | ||
1322 | +source "kernel/Kconfig.freezer" | |
1323 | + | |
1324 | menu "System Type and features" | |
1325 | ||
1326 | source "kernel/time/Kconfig" | |
1327 | --- a/arch/avr32/include/asm/thread_info.h | |
1328 | +++ b/arch/avr32/include/asm/thread_info.h | |
1329 | @@ -96,6 +96,7 @@ static inline struct thread_info *curren | |
1330 | #define _TIF_MEMDIE (1 << TIF_MEMDIE) | |
1331 | #define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) | |
1332 | #define _TIF_CPU_GOING_TO_SLEEP (1 << TIF_CPU_GOING_TO_SLEEP) | |
1333 | +#define _TIF_FREEZE (1 << TIF_FREEZE) | |
1334 | ||
1335 | /* Note: The masks below must never span more than 16 bits! */ | |
1336 | ||
1337 | --- a/arch/blackfin/Kconfig | |
1338 | +++ b/arch/blackfin/Kconfig | |
1339 | @@ -64,8 +64,11 @@ config HARDWARE_PM | |
1340 | depends on OPROFILE | |
1341 | ||
1342 | source "init/Kconfig" | |
1343 | + | |
1344 | source "kernel/Kconfig.preempt" | |
1345 | ||
1346 | +source "kernel/Kconfig.freezer" | |
1347 | + | |
1348 | menu "Blackfin Processor Options" | |
1349 | ||
1350 | comment "Processor and Board Settings" | |
1351 | --- a/arch/cris/Kconfig | |
1352 | +++ b/arch/cris/Kconfig | |
1353 | @@ -62,6 +62,8 @@ config HZ | |
1354 | ||
1355 | source "init/Kconfig" | |
1356 | ||
1357 | +source "kernel/Kconfig.freezer" | |
1358 | + | |
1359 | menu "General setup" | |
1360 | ||
1361 | source "fs/Kconfig.binfmt" | |
1362 | --- a/arch/frv/Kconfig | |
1363 | +++ b/arch/frv/Kconfig | |
1364 | @@ -66,6 +66,8 @@ mainmenu "Fujitsu FR-V Kernel Configurat | |
1365 | ||
1366 | source "init/Kconfig" | |
1367 | ||
1368 | +source "kernel/Kconfig.freezer" | |
1369 | + | |
1370 | ||
1371 | menu "Fujitsu FR-V system setup" | |
1372 | ||
1373 | --- a/arch/h8300/Kconfig | |
1374 | +++ b/arch/h8300/Kconfig | |
1375 | @@ -89,6 +89,8 @@ config HZ | |
1376 | ||
1377 | source "init/Kconfig" | |
1378 | ||
1379 | +source "kernel/Kconfig.freezer" | |
1380 | + | |
1381 | source "arch/h8300/Kconfig.cpu" | |
1382 | ||
1383 | menu "Executable file formats" | |
1384 | --- a/arch/h8300/include/asm/thread_info.h | |
1385 | +++ b/arch/h8300/include/asm/thread_info.h | |
1386 | @@ -89,6 +89,7 @@ static inline struct thread_info *curren | |
1387 | TIF_NEED_RESCHED */ | |
1388 | #define TIF_MEMDIE 4 | |
1389 | #define TIF_RESTORE_SIGMASK 5 /* restore signal mask in do_signal() */ | |
1390 | +#define TIF_FREEZE 16 /* is freezing for suspend */ | |
1391 | ||
1392 | /* as above, but as bit values */ | |
1393 | #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) | |
1394 | @@ -96,6 +97,7 @@ static inline struct thread_info *curren | |
1395 | #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) | |
1396 | #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) | |
1397 | #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK) | |
1398 | +#define _TIF_FREEZE (1<<TIF_FREEZE) | |
1399 | ||
1400 | #define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */ | |
1401 | ||
1402 | --- a/arch/ia64/Kconfig | |
1403 | +++ b/arch/ia64/Kconfig | |
1404 | @@ -7,6 +7,8 @@ mainmenu "IA-64 Linux Kernel Configurati | |
1405 | ||
1406 | source "init/Kconfig" | |
1407 | ||
1408 | +source "kernel/Kconfig.freezer" | |
1409 | + | |
1410 | menu "Processor type and features" | |
1411 | ||
1412 | config IA64 | |
1413 | --- a/arch/m32r/Kconfig | |
1414 | +++ b/arch/m32r/Kconfig | |
1415 | @@ -45,6 +45,8 @@ config HZ | |
1416 | ||
1417 | source "init/Kconfig" | |
1418 | ||
1419 | +source "kernel/Kconfig.freezer" | |
1420 | + | |
1421 | ||
1422 | menu "Processor type and features" | |
1423 | ||
1424 | --- a/arch/m68k/Kconfig | |
1425 | +++ b/arch/m68k/Kconfig | |
1426 | @@ -64,6 +64,8 @@ mainmenu "Linux/68k Kernel Configuration | |
1427 | ||
1428 | source "init/Kconfig" | |
1429 | ||
1430 | +source "kernel/Kconfig.freezer" | |
1431 | + | |
1432 | menu "Platform dependent setup" | |
1433 | ||
1434 | config EISA | |
1435 | --- a/arch/m68knommu/Kconfig | |
1436 | +++ b/arch/m68knommu/Kconfig | |
1437 | @@ -82,6 +82,8 @@ config ARCH_SUPPORTS_AOUT | |
1438 | ||
1439 | source "init/Kconfig" | |
1440 | ||
1441 | +source "kernel/Kconfig.freezer" | |
1442 | + | |
1443 | menu "Processor type and features" | |
1444 | ||
1445 | choice | |
1446 | --- a/arch/m68knommu/include/asm/thread_info.h | |
1447 | +++ b/arch/m68knommu/include/asm/thread_info.h | |
1448 | @@ -84,12 +84,14 @@ static inline struct thread_info *curren | |
1449 | #define TIF_POLLING_NRFLAG 3 /* true if poll_idle() is polling | |
1450 | TIF_NEED_RESCHED */ | |
1451 | #define TIF_MEMDIE 4 | |
1452 | +#define TIF_FREEZE 16 /* is freezing for suspend */ | |
1453 | ||
1454 | /* as above, but as bit values */ | |
1455 | #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) | |
1456 | #define _TIF_SIGPENDING (1<<TIF_SIGPENDING) | |
1457 | #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) | |
1458 | #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) | |
1459 | +#define _TIF_FREEZE (1<<TIF_FREEZE) | |
1460 | ||
1461 | #define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */ | |
1462 | ||
1463 | --- a/arch/mips/Kconfig | |
1464 | +++ b/arch/mips/Kconfig | |
1465 | @@ -1885,6 +1885,8 @@ config PROBE_INITRD_HEADER | |
1466 | add initrd or initramfs image to the kernel image. | |
1467 | Otherwise, say N. | |
1468 | ||
1469 | +source "kernel/Kconfig.freezer" | |
1470 | + | |
1471 | menu "Bus options (PCI, PCMCIA, EISA, ISA, TC)" | |
1472 | ||
1473 | config HW_HAS_EISA | |
1474 | --- a/arch/mn10300/Kconfig | |
1475 | +++ b/arch/mn10300/Kconfig | |
1476 | @@ -71,6 +71,8 @@ mainmenu "Matsushita MN10300/AM33 Kernel | |
1477 | ||
1478 | source "init/Kconfig" | |
1479 | ||
1480 | +source "kernel/Kconfig.freezer" | |
1481 | + | |
1482 | ||
1483 | menu "Matsushita MN10300 system setup" | |
1484 | ||
1485 | --- a/arch/parisc/Kconfig | |
1486 | +++ b/arch/parisc/Kconfig | |
1487 | @@ -93,6 +93,8 @@ config ARCH_MAY_HAVE_PC_FDC | |
1488 | ||
1489 | source "init/Kconfig" | |
1490 | ||
1491 | +source "kernel/Kconfig.freezer" | |
1492 | + | |
1493 | ||
1494 | menu "Processor type and features" | |
1495 | ||
1496 | --- a/arch/powerpc/Kconfig | |
1497 | +++ b/arch/powerpc/Kconfig | |
1498 | @@ -228,6 +228,8 @@ config PPC_OF_PLATFORM_PCI | |
1499 | ||
1500 | source "init/Kconfig" | |
1501 | ||
1502 | +source "kernel/Kconfig.freezer" | |
1503 | + | |
1504 | source "arch/powerpc/sysdev/Kconfig" | |
1505 | source "arch/powerpc/platforms/Kconfig" | |
1506 | ||
1507 | --- a/arch/s390/Kconfig | |
1508 | +++ b/arch/s390/Kconfig | |
1509 | @@ -79,6 +79,8 @@ config S390 | |
1510 | ||
1511 | source "init/Kconfig" | |
1512 | ||
1513 | +source "kernel/Kconfig.freezer" | |
1514 | + | |
1515 | menu "Base setup" | |
1516 | ||
1517 | comment "Processor type and features" | |
1518 | --- a/arch/s390/include/asm/thread_info.h | |
1519 | +++ b/arch/s390/include/asm/thread_info.h | |
1520 | @@ -98,6 +98,7 @@ static inline struct thread_info *curren | |
1521 | #define TIF_31BIT 18 /* 32bit process */ | |
1522 | #define TIF_MEMDIE 19 | |
1523 | #define TIF_RESTORE_SIGMASK 20 /* restore signal mask in do_signal() */ | |
1524 | +#define TIF_FREEZE 21 | |
1525 | ||
1526 | #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) | |
1527 | #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK) | |
1528 | @@ -110,6 +111,7 @@ static inline struct thread_info *curren | |
1529 | #define _TIF_USEDFPU (1<<TIF_USEDFPU) | |
1530 | #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) | |
1531 | #define _TIF_31BIT (1<<TIF_31BIT) | |
1532 | +#define _TIF_FREEZE (1<<TIF_FREEZE) | |
1533 | ||
1534 | #endif /* __KERNEL__ */ | |
1535 | ||
1536 | --- a/arch/sh/Kconfig | |
1537 | +++ b/arch/sh/Kconfig | |
1538 | @@ -106,6 +106,8 @@ config IO_TRAPPED | |
1539 | ||
1540 | source "init/Kconfig" | |
1541 | ||
1542 | +source "kernel/Kconfig.freezer" | |
1543 | + | |
1544 | menu "System type" | |
1545 | ||
1546 | # | |
1547 | --- a/arch/sparc/Kconfig | |
1548 | +++ b/arch/sparc/Kconfig | |
1549 | @@ -32,6 +32,8 @@ config HZ | |
1550 | ||
1551 | source "init/Kconfig" | |
1552 | ||
1553 | +source "kernel/Kconfig.freezer" | |
1554 | + | |
1555 | menu "General machine setup" | |
1556 | ||
1557 | config SMP | |
1558 | --- a/arch/sparc/include/asm/thread_info_32.h | |
1559 | +++ b/arch/sparc/include/asm/thread_info_32.h | |
1560 | @@ -139,6 +139,7 @@ BTFIXUPDEF_CALL(void, free_thread_info, | |
1561 | #define TIF_POLLING_NRFLAG 9 /* true if poll_idle() is polling | |
1562 | * TIF_NEED_RESCHED */ | |
1563 | #define TIF_MEMDIE 10 | |
1564 | +#define TIF_FREEZE 11 /* is freezing for suspend */ | |
1565 | ||
1566 | /* as above, but as bit values */ | |
1567 | #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) | |
1568 | @@ -152,6 +153,7 @@ BTFIXUPDEF_CALL(void, free_thread_info, | |
1569 | #define _TIF_DO_NOTIFY_RESUME_MASK (_TIF_NOTIFY_RESUME | \ | |
1570 | _TIF_SIGPENDING | \ | |
1571 | _TIF_RESTORE_SIGMASK) | |
1572 | +#define _TIF_FREEZE (1<<TIF_FREEZE) | |
1573 | ||
1574 | #endif /* __KERNEL__ */ | |
1575 | ||
1576 | --- a/arch/sparc64/Kconfig | |
1577 | +++ b/arch/sparc64/Kconfig | |
1578 | @@ -85,6 +85,7 @@ config GENERIC_HARDIRQS_NO__DO_IRQ | |
1579 | def_bool y | |
1580 | ||
1581 | source "init/Kconfig" | |
1582 | +source "kernel/Kconfig.freezer" | |
1583 | ||
1584 | menu "Processor type and features" | |
1585 | ||
1586 | --- a/arch/um/Kconfig | |
1587 | +++ b/arch/um/Kconfig | |
1588 | @@ -229,6 +229,8 @@ endmenu | |
1589 | ||
1590 | source "init/Kconfig" | |
1591 | ||
1592 | +source "kernel/Kconfig.freezer" | |
1593 | + | |
1594 | source "drivers/block/Kconfig" | |
1595 | ||
1596 | source "arch/um/Kconfig.char" | |
1597 | --- a/arch/x86/Kconfig | |
1598 | +++ b/arch/x86/Kconfig | |
1599 | @@ -208,6 +208,7 @@ config X86_TRAMPOLINE | |
1600 | config KTIME_SCALAR | |
1601 | def_bool X86_32 | |
1602 | source "init/Kconfig" | |
1603 | +source "kernel/Kconfig.freezer" | |
1604 | ||
1605 | menu "Processor type and features" | |
1606 | ||
1607 | --- a/arch/xtensa/Kconfig | |
1608 | +++ b/arch/xtensa/Kconfig | |
1609 | @@ -55,6 +55,7 @@ config HZ | |
1610 | default 100 | |
1611 | ||
1612 | source "init/Kconfig" | |
1613 | +source "kernel/Kconfig.freezer" | |
1614 | ||
1615 | menu "Processor type and features" | |
1616 | ||
1617 | --- a/include/asm-cris/thread_info.h | |
1618 | +++ b/include/asm-cris/thread_info.h | |
1619 | @@ -88,6 +88,7 @@ struct thread_info { | |
1620 | #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */ | |
1621 | #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ | |
1622 | #define TIF_MEMDIE 17 | |
1623 | +#define TIF_FREEZE 18 /* is freezing for suspend */ | |
1624 | ||
1625 | #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) | |
1626 | #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) | |
1627 | @@ -95,6 +96,7 @@ struct thread_info { | |
1628 | #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) | |
1629 | #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK) | |
1630 | #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) | |
1631 | +#define _TIF_FREEZE (1<<TIF_FREEZE) | |
1632 | ||
1633 | #define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */ | |
1634 | #define _TIF_ALLWORK_MASK 0x0000FFFF /* work to do on any return to u-space */ | |
1635 | --- a/include/asm-m68k/thread_info.h | |
1636 | +++ b/include/asm-m68k/thread_info.h | |
1637 | @@ -52,5 +52,6 @@ struct thread_info { | |
1638 | #define TIF_DELAYED_TRACE 14 /* single step a syscall */ | |
1639 | #define TIF_SYSCALL_TRACE 15 /* syscall trace active */ | |
1640 | #define TIF_MEMDIE 16 | |
1641 | +#define TIF_FREEZE 17 /* thread is freezing for suspend */ | |
1642 | ||
1643 | #endif /* _ASM_M68K_THREAD_INFO_H */ | |
1644 | --- a/include/asm-parisc/thread_info.h | |
1645 | +++ b/include/asm-parisc/thread_info.h | |
1646 | @@ -58,6 +58,7 @@ struct thread_info { | |
1647 | #define TIF_32BIT 4 /* 32 bit binary */ | |
1648 | #define TIF_MEMDIE 5 | |
1649 | #define TIF_RESTORE_SIGMASK 6 /* restore saved signal mask */ | |
1650 | +#define TIF_FREEZE 7 /* is freezing for suspend */ | |
1651 | ||
1652 | #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) | |
1653 | #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) | |
1654 | @@ -65,6 +66,7 @@ struct thread_info { | |
1655 | #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) | |
1656 | #define _TIF_32BIT (1 << TIF_32BIT) | |
1657 | #define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) | |
1658 | +#define _TIF_FREEZE (1 << TIF_FREEZE) | |
1659 | ||
1660 | #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | \ | |
1661 | _TIF_NEED_RESCHED | _TIF_RESTORE_SIGMASK) | |
1662 | --- a/include/asm-um/thread_info.h | |
1663 | +++ b/include/asm-um/thread_info.h | |
1664 | @@ -69,6 +69,7 @@ static inline struct thread_info *curren | |
1665 | #define TIF_MEMDIE 5 | |
1666 | #define TIF_SYSCALL_AUDIT 6 | |
1667 | #define TIF_RESTORE_SIGMASK 7 | |
1668 | +#define TIF_FREEZE 16 /* is freezing for suspend */ | |
1669 | ||
1670 | #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) | |
1671 | #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) | |
1672 | @@ -77,5 +78,6 @@ static inline struct thread_info *curren | |
1673 | #define _TIF_MEMDIE (1 << TIF_MEMDIE) | |
1674 | #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) | |
1675 | #define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) | |
1676 | +#define _TIF_FREEZE (1 << TIF_FREEZE) | |
1677 | ||
1678 | #endif | |
1679 | --- a/include/asm-xtensa/thread_info.h | |
1680 | +++ b/include/asm-xtensa/thread_info.h | |
1681 | @@ -134,6 +134,7 @@ static inline struct thread_info *curren | |
1682 | #define TIF_MEMDIE 5 | |
1683 | #define TIF_RESTORE_SIGMASK 6 /* restore signal mask in do_signal() */ | |
1684 | #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ | |
1685 | +#define TIF_FREEZE 17 /* is freezing for suspend */ | |
1686 | ||
1687 | #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) | |
1688 | #define _TIF_SIGPENDING (1<<TIF_SIGPENDING) | |
1689 | @@ -142,6 +143,7 @@ static inline struct thread_info *curren | |
1690 | #define _TIF_IRET (1<<TIF_IRET) | |
1691 | #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) | |
1692 | #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK) | |
1693 | +#define _TIF_FREEZE (1<<TIF_FREEZE) | |
1694 | ||
1695 | #define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */ | |
1696 | #define _TIF_ALLWORK_MASK 0x0000FFFF /* work to do on any return to u-space */ | |
1697 | --- a/include/linux/cgroup_subsys.h | |
1698 | +++ b/include/linux/cgroup_subsys.h | |
1699 | @@ -48,3 +48,9 @@ SUBSYS(devices) | |
1700 | #endif | |
1701 | ||
1702 | /* */ | |
1703 | + | |
1704 | +#ifdef CONFIG_CGROUP_FREEZER | |
1705 | +SUBSYS(freezer) | |
1706 | +#endif | |
1707 | + | |
1708 | +/* */ | |
1709 | --- a/include/linux/freezer.h | |
1710 | +++ b/include/linux/freezer.h | |
1711 | @@ -6,7 +6,7 @@ | |
1712 | #include <linux/sched.h> | |
1713 | #include <linux/wait.h> | |
1714 | ||
1715 | -#ifdef CONFIG_PM_SLEEP | |
1716 | +#ifdef CONFIG_FREEZER | |
1717 | /* | |
1718 | * Check if a process has been frozen | |
1719 | */ | |
1720 | @@ -39,29 +39,14 @@ static inline void clear_freeze_flag(str | |
1721 | clear_tsk_thread_flag(p, TIF_FREEZE); | |
1722 | } | |
1723 | ||
1724 | -/* | |
1725 | - * Wake up a frozen process | |
1726 | - * | |
1727 | - * task_lock() is taken to prevent the race with refrigerator() which may | |
1728 | - * occur if the freezing of tasks fails. Namely, without the lock, if the | |
1729 | - * freezing of tasks failed, thaw_tasks() might have run before a task in | |
1730 | - * refrigerator() could call frozen_process(), in which case the task would be | |
1731 | - * frozen and no one would thaw it. | |
1732 | - */ | |
1733 | -static inline int thaw_process(struct task_struct *p) | |
1734 | -{ | |
1735 | - task_lock(p); | |
1736 | - if (frozen(p)) { | |
1737 | - p->flags &= ~PF_FROZEN; | |
1738 | - task_unlock(p); | |
1739 | - wake_up_process(p); | |
1740 | - return 1; | |
1741 | - } | |
1742 | - clear_freeze_flag(p); | |
1743 | - task_unlock(p); | |
1744 | - return 0; | |
1745 | +static inline bool should_send_signal(struct task_struct *p) | |
1746 | +{ | |
1747 | + return !(p->flags & PF_FREEZER_NOSIG); | |
1748 | } | |
1749 | ||
1750 | +/* Takes and releases task alloc lock using task_lock() */ | |
1751 | +extern int thaw_process(struct task_struct *p); | |
1752 | + | |
1753 | extern void refrigerator(void); | |
1754 | extern int freeze_processes(void); | |
1755 | extern void thaw_processes(void); | |
1756 | @@ -75,6 +60,15 @@ static inline int try_to_freeze(void) | |
1757 | return 0; | |
1758 | } | |
1759 | ||
1760 | +extern bool freeze_task(struct task_struct *p, bool sig_only); | |
1761 | +extern void cancel_freezing(struct task_struct *p); | |
1762 | + | |
1763 | +#ifdef CONFIG_CGROUP_FREEZER | |
1764 | +extern int cgroup_frozen(struct task_struct *task); | |
1765 | +#else /* !CONFIG_CGROUP_FREEZER */ | |
1766 | +static inline int cgroup_frozen(struct task_struct *task) { return 0; } | |
1767 | +#endif /* !CONFIG_CGROUP_FREEZER */ | |
1768 | + | |
1769 | /* | |
1770 | * The PF_FREEZER_SKIP flag should be set by a vfork parent right before it | |
1771 | * calls wait_for_completion(&vfork) and reset right after it returns from this | |
1772 | @@ -166,7 +160,7 @@ static inline void set_freezable_with_si | |
1773 | } while (try_to_freeze()); \ | |
1774 | __retval; \ | |
1775 | }) | |
1776 | -#else /* !CONFIG_PM_SLEEP */ | |
1777 | +#else /* !CONFIG_FREEZER */ | |
1778 | static inline int frozen(struct task_struct *p) { return 0; } | |
1779 | static inline int freezing(struct task_struct *p) { return 0; } | |
1780 | static inline void set_freeze_flag(struct task_struct *p) {} | |
1781 | @@ -191,6 +185,6 @@ static inline void set_freezable_with_si | |
1782 | #define wait_event_freezable_timeout(wq, condition, timeout) \ | |
1783 | wait_event_interruptible_timeout(wq, condition, timeout) | |
1784 | ||
1785 | -#endif /* !CONFIG_PM_SLEEP */ | |
1786 | +#endif /* !CONFIG_FREEZER */ | |
1787 | ||
1788 | #endif /* FREEZER_H_INCLUDED */ | |
1789 | --- a/init/Kconfig | |
1790 | +++ b/init/Kconfig | |
1791 | @@ -303,6 +303,13 @@ config CGROUP_NS | |
1792 | for instance virtual servers and checkpoint/restart | |
1793 | jobs. | |
1794 | ||
1795 | +config CGROUP_FREEZER | |
1796 | + bool "control group freezer subsystem" | |
1797 | + depends on CGROUPS | |
1798 | + help | |
1799 | + Provides a way to freeze and unfreeze all tasks in a | |
1800 | + cgroup. | |
1801 | + | |
1802 | config CGROUP_DEVICE | |
1803 | bool "Device controller for cgroups" | |
1804 | depends on CGROUPS && EXPERIMENTAL | |
1805 | --- /dev/null | |
1806 | +++ b/kernel/Kconfig.freezer | |
1807 | @@ -0,0 +1,2 @@ | |
1808 | +config FREEZER | |
1809 | + def_bool PM_SLEEP || CGROUP_FREEZER | |
1810 | --- a/kernel/Makefile | |
1811 | +++ b/kernel/Makefile | |
1812 | @@ -22,6 +22,7 @@ CFLAGS_REMOVE_sched_clock.o = -pg | |
1813 | CFLAGS_REMOVE_sched.o = -pg | |
1814 | endif | |
1815 | ||
1816 | +obj-$(CONFIG_FREEZER) += freezer.o | |
1817 | obj-$(CONFIG_PROFILING) += profile.o | |
1818 | obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o | |
1819 | obj-$(CONFIG_STACKTRACE) += stacktrace.o | |
1820 | @@ -54,6 +55,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += bac | |
1821 | obj-$(CONFIG_COMPAT) += compat.o | |
1822 | obj-$(CONFIG_CGROUPS) += cgroup.o | |
1823 | obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o | |
1824 | +obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o | |
1825 | obj-$(CONFIG_CPUSETS) += cpuset.o | |
1826 | obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o | |
1827 | obj-$(CONFIG_UTS_NS) += utsname.o | |
1828 | --- /dev/null | |
1829 | +++ b/kernel/cgroup_freezer.c | |
1830 | @@ -0,0 +1,379 @@ | |
1831 | +/* | |
1832 | + * cgroup_freezer.c - control group freezer subsystem | |
1833 | + * | |
1834 | + * Copyright IBM Corporation, 2007 | |
1835 | + * | |
1836 | + * Author : Cedric Le Goater <clg@fr.ibm.com> | |
1837 | + * | |
1838 | + * This program is free software; you can redistribute it and/or modify it | |
1839 | + * under the terms of version 2.1 of the GNU Lesser General Public License | |
1840 | + * as published by the Free Software Foundation. | |
1841 | + * | |
1842 | + * This program is distributed in the hope that it would be useful, but | |
1843 | + * WITHOUT ANY WARRANTY; without even the implied warranty of | |
1844 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | |
1845 | + */ | |
1846 | + | |
1847 | +#include <linux/module.h> | |
1848 | +#include <linux/cgroup.h> | |
1849 | +#include <linux/fs.h> | |
1850 | +#include <linux/uaccess.h> | |
1851 | +#include <linux/freezer.h> | |
1852 | +#include <linux/seq_file.h> | |
1853 | + | |
1854 | +enum freezer_state { | |
1855 | + CGROUP_THAWED = 0, | |
1856 | + CGROUP_FREEZING, | |
1857 | + CGROUP_FROZEN, | |
1858 | +}; | |
1859 | + | |
1860 | +struct freezer { | |
1861 | + struct cgroup_subsys_state css; | |
1862 | + enum freezer_state state; | |
1863 | + spinlock_t lock; /* protects _writes_ to state */ | |
1864 | +}; | |
1865 | + | |
1866 | +static inline struct freezer *cgroup_freezer( | |
1867 | + struct cgroup *cgroup) | |
1868 | +{ | |
1869 | + return container_of( | |
1870 | + cgroup_subsys_state(cgroup, freezer_subsys_id), | |
1871 | + struct freezer, css); | |
1872 | +} | |
1873 | + | |
1874 | +static inline struct freezer *task_freezer(struct task_struct *task) | |
1875 | +{ | |
1876 | + return container_of(task_subsys_state(task, freezer_subsys_id), | |
1877 | + struct freezer, css); | |
1878 | +} | |
1879 | + | |
1880 | +int cgroup_frozen(struct task_struct *task) | |
1881 | +{ | |
1882 | + struct freezer *freezer; | |
1883 | + enum freezer_state state; | |
1884 | + | |
1885 | + task_lock(task); | |
1886 | + freezer = task_freezer(task); | |
1887 | + state = freezer->state; | |
1888 | + task_unlock(task); | |
1889 | + | |
1890 | + return state == CGROUP_FROZEN; | |
1891 | +} | |
1892 | + | |
1893 | +/* | |
1894 | + * cgroups_write_string() limits the size of freezer state strings to | |
1895 | + * CGROUP_LOCAL_BUFFER_SIZE | |
1896 | + */ | |
1897 | +static const char *freezer_state_strs[] = { | |
1898 | + "THAWED", | |
1899 | + "FREEZING", | |
1900 | + "FROZEN", | |
1901 | +}; | |
1902 | + | |
1903 | +/* | |
1904 | + * State diagram | |
1905 | + * Transitions are caused by userspace writes to the freezer.state file. | |
1906 | + * The values in parentheses are state labels. The rest are edge labels. | |
1907 | + * | |
1908 | + * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN) | |
1909 | + *  ^ ^                    |                     | | |
1910 | + *  | \_______THAWED_______/                     | | |
1911 | + *  \__________________________THAWED____________/ | |
1912 | + */ | |
1913 | + | |
1914 | +struct cgroup_subsys freezer_subsys; | |
1915 | + | |
1916 | +/* Locks taken and their ordering | |
1917 | + * ------------------------------ | |
1918 | + * css_set_lock | |
1919 | + * cgroup_mutex (AKA cgroup_lock) | |
1920 | + * task->alloc_lock (AKA task_lock) | |
1921 | + * freezer->lock | |
1922 | + * task->sighand->siglock | |
1923 | + * | |
1924 | + * cgroup code forces css_set_lock to be taken before task->alloc_lock | |
1925 | + * | |
1926 | + * freezer_create(), freezer_destroy(): | |
1927 | + * cgroup_mutex [ by cgroup core ] | |
1928 | + * | |
1929 | + * can_attach(): | |
1930 | + * cgroup_mutex | |
1931 | + * | |
1932 | + * cgroup_frozen(): | |
1933 | + * task->alloc_lock (to get task's cgroup) | |
1934 | + * | |
1935 | + * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): | |
1936 | + * task->alloc_lock (to get task's cgroup) | |
1937 | + * freezer->lock | |
1938 | + * sighand->siglock (if the cgroup is freezing) | |
1939 | + * | |
1940 | + * freezer_read(): | |
1941 | + * cgroup_mutex | |
1942 | + * freezer->lock | |
1943 | + * read_lock css_set_lock (cgroup iterator start) | |
1944 | + * | |
1945 | + * freezer_write() (freeze): | |
1946 | + * cgroup_mutex | |
1947 | + * freezer->lock | |
1948 | + * read_lock css_set_lock (cgroup iterator start) | |
1949 | + * sighand->siglock | |
1950 | + * | |
1951 | + * freezer_write() (unfreeze): | |
1952 | + * cgroup_mutex | |
1953 | + * freezer->lock | |
1954 | + * read_lock css_set_lock (cgroup iterator start) | |
1955 | + * task->alloc_lock (to prevent races with freeze_task()) | |
1956 | + * sighand->siglock | |
1957 | + */ | |
1958 | +static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, | |
1959 | + struct cgroup *cgroup) | |
1960 | +{ | |
1961 | + struct freezer *freezer; | |
1962 | + | |
1963 | + freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); | |
1964 | + if (!freezer) | |
1965 | + return ERR_PTR(-ENOMEM); | |
1966 | + | |
1967 | + spin_lock_init(&freezer->lock); | |
1968 | + freezer->state = CGROUP_THAWED; | |
1969 | + return &freezer->css; | |
1970 | +} | |
1971 | + | |
1972 | +static void freezer_destroy(struct cgroup_subsys *ss, | |
1973 | + struct cgroup *cgroup) | |
1974 | +{ | |
1975 | + kfree(cgroup_freezer(cgroup)); | |
1976 | +} | |
1977 | + | |
1978 | +/* Task is frozen or will freeze immediately when next it gets woken */ | |
1979 | +static bool is_task_frozen_enough(struct task_struct *task) | |
1980 | +{ | |
1981 | + return frozen(task) || | |
1982 | + (task_is_stopped_or_traced(task) && freezing(task)); | |
1983 | +} | |
1984 | + | |
1985 | +/* | |
1986 | + * The call to cgroup_lock() in the freezer.state write method prevents | |
1987 | + * a write to that file racing against an attach, and hence the | |
1988 | + * can_attach() result will remain valid until the attach completes. | |
1989 | + */ | |
1990 | +static int freezer_can_attach(struct cgroup_subsys *ss, | |
1991 | + struct cgroup *new_cgroup, | |
1992 | + struct task_struct *task) | |
1993 | +{ | |
1994 | + struct freezer *freezer; | |
1995 | + | |
1996 | + /* | |
1997 | + * Anything frozen can't move or be moved to/from. | |
1998 | + * | |
1999 | + * orig_freezer->state == FROZEN implies that @task has been | |
2000 | + * frozen, so it's sufficient to check the latter condition. | |
2001 | + */ | |
2002 | + | |
2003 | + if (is_task_frozen_enough(task)) | |
2004 | + return -EBUSY; | |
2005 | + | |
2006 | + freezer = cgroup_freezer(new_cgroup); | |
2007 | + if (freezer->state == CGROUP_FROZEN) | |
2008 | + return -EBUSY; | |
2009 | + | |
2010 | + return 0; | |
2011 | +} | |
2012 | + | |
2013 | +static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) | |
2014 | +{ | |
2015 | + struct freezer *freezer; | |
2016 | + | |
2017 | + /* | |
2018 | + * No lock is needed, since the task isn't on tasklist yet, | |
2019 | + * so it can't be moved to another cgroup, which means the | |
2020 | + * freezer won't be removed and will be valid during this | |
2021 | + * function call. | |
2022 | + */ | |
2023 | + freezer = task_freezer(task); | |
2024 | + | |
2025 | + /* | |
2026 | + * The root cgroup is non-freezable, so we can skip the | |
2027 | + * following check. | |
2028 | + */ | |
2029 | + if (!freezer->css.cgroup->parent) | |
2030 | + return; | |
2031 | + | |
2032 | + spin_lock_irq(&freezer->lock); | |
2033 | + BUG_ON(freezer->state == CGROUP_FROZEN); | |
2034 | + | |
2035 | + /* Locking avoids race with FREEZING -> THAWED transitions. */ | |
2036 | + if (freezer->state == CGROUP_FREEZING) | |
2037 | + freeze_task(task, true); | |
2038 | + spin_unlock_irq(&freezer->lock); | |
2039 | +} | |
2040 | + | |
2041 | +/* | |
2042 | + * caller must hold freezer->lock | |
2043 | + */ | |
2044 | +static void update_freezer_state(struct cgroup *cgroup, | |
2045 | + struct freezer *freezer) | |
2046 | +{ | |
2047 | + struct cgroup_iter it; | |
2048 | + struct task_struct *task; | |
2049 | + unsigned int nfrozen = 0, ntotal = 0; | |
2050 | + | |
2051 | + cgroup_iter_start(cgroup, &it); | |
2052 | + while ((task = cgroup_iter_next(cgroup, &it))) { | |
2053 | + ntotal++; | |
2054 | + if (is_task_frozen_enough(task)) | |
2055 | + nfrozen++; | |
2056 | + } | |
2057 | + | |
2058 | + /* | |
2059 | + * Transition to FROZEN when no new tasks can be added ensures | |
2060 | + * that we never exist in the FROZEN state while there are unfrozen | |
2061 | + * tasks. | |
2062 | + */ | |
2063 | + if (nfrozen == ntotal) | |
2064 | + freezer->state = CGROUP_FROZEN; | |
2065 | + else if (nfrozen > 0) | |
2066 | + freezer->state = CGROUP_FREEZING; | |
2067 | + else | |
2068 | + freezer->state = CGROUP_THAWED; | |
2069 | + cgroup_iter_end(cgroup, &it); | |
2070 | +} | |
2071 | + | |
2072 | +static int freezer_read(struct cgroup *cgroup, struct cftype *cft, | |
2073 | + struct seq_file *m) | |
2074 | +{ | |
2075 | + struct freezer *freezer; | |
2076 | + enum freezer_state state; | |
2077 | + | |
2078 | + if (!cgroup_lock_live_group(cgroup)) | |
2079 | + return -ENODEV; | |
2080 | + | |
2081 | + freezer = cgroup_freezer(cgroup); | |
2082 | + spin_lock_irq(&freezer->lock); | |
2083 | + state = freezer->state; | |
2084 | + if (state == CGROUP_FREEZING) { | |
2085 | + /* We change from FREEZING to FROZEN lazily if the cgroup was | |
2086 | + * only partially frozen when we exited write. */ | |
2087 | + update_freezer_state(cgroup, freezer); | |
2088 | + state = freezer->state; | |
2089 | + } | |
2090 | + spin_unlock_irq(&freezer->lock); | |
2091 | + cgroup_unlock(); | |
2092 | + | |
2093 | + seq_puts(m, freezer_state_strs[state]); | |
2094 | + seq_putc(m, '\n'); | |
2095 | + return 0; | |
2096 | +} | |
2097 | + | |
2098 | +static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) | |
2099 | +{ | |
2100 | + struct cgroup_iter it; | |
2101 | + struct task_struct *task; | |
2102 | + unsigned int num_cant_freeze_now = 0; | |
2103 | + | |
2104 | + freezer->state = CGROUP_FREEZING; | |
2105 | + cgroup_iter_start(cgroup, &it); | |
2106 | + while ((task = cgroup_iter_next(cgroup, &it))) { | |
2107 | + if (!freeze_task(task, true)) | |
2108 | + continue; | |
2109 | + if (is_task_frozen_enough(task)) | |
2110 | + continue; | |
2111 | + if (!freezing(task) && !freezer_should_skip(task)) | |
2112 | + num_cant_freeze_now++; | |
2113 | + } | |
2114 | + cgroup_iter_end(cgroup, &it); | |
2115 | + | |
2116 | + return num_cant_freeze_now ? -EBUSY : 0; | |
2117 | +} | |
2118 | + | |
2119 | +static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) | |
2120 | +{ | |
2121 | + struct cgroup_iter it; | |
2122 | + struct task_struct *task; | |
2123 | + | |
2124 | + cgroup_iter_start(cgroup, &it); | |
2125 | + while ((task = cgroup_iter_next(cgroup, &it))) { | |
2126 | + thaw_process(task); | |
2127 | + } | |
2128 | + cgroup_iter_end(cgroup, &it); | |
2129 | + | |
2130 | + freezer->state = CGROUP_THAWED; | |
2131 | +} | |
2132 | + | |
2133 | +static int freezer_change_state(struct cgroup *cgroup, | |
2134 | + enum freezer_state goal_state) | |
2135 | +{ | |
2136 | + struct freezer *freezer; | |
2137 | + int retval = 0; | |
2138 | + | |
2139 | + freezer = cgroup_freezer(cgroup); | |
2140 | + | |
2141 | + spin_lock_irq(&freezer->lock); | |
2142 | + | |
2143 | + update_freezer_state(cgroup, freezer); | |
2144 | + if (goal_state == freezer->state) | |
2145 | + goto out; | |
2146 | + | |
2147 | + switch (goal_state) { | |
2148 | + case CGROUP_THAWED: | |
2149 | + unfreeze_cgroup(cgroup, freezer); | |
2150 | + break; | |
2151 | + case CGROUP_FROZEN: | |
2152 | + retval = try_to_freeze_cgroup(cgroup, freezer); | |
2153 | + break; | |
2154 | + default: | |
2155 | + BUG(); | |
2156 | + } | |
2157 | +out: | |
2158 | + spin_unlock_irq(&freezer->lock); | |
2159 | + | |
2160 | + return retval; | |
2161 | +} | |
2162 | + | |
2163 | +static int freezer_write(struct cgroup *cgroup, | |
2164 | + struct cftype *cft, | |
2165 | + const char *buffer) | |
2166 | +{ | |
2167 | + int retval; | |
2168 | + enum freezer_state goal_state; | |
2169 | + | |
2170 | + if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) | |
2171 | + goal_state = CGROUP_THAWED; | |
2172 | + else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) | |
2173 | + goal_state = CGROUP_FROZEN; | |
2174 | + else | |
2175 | + return -EINVAL; | |
2176 | + | |
2177 | + if (!cgroup_lock_live_group(cgroup)) | |
2178 | + return -ENODEV; | |
2179 | + retval = freezer_change_state(cgroup, goal_state); | |
2180 | + cgroup_unlock(); | |
2181 | + return retval; | |
2182 | +} | |
2183 | + | |
2184 | +static struct cftype files[] = { | |
2185 | + { | |
2186 | + .name = "state", | |
2187 | + .read_seq_string = freezer_read, | |
2188 | + .write_string = freezer_write, | |
2189 | + }, | |
2190 | +}; | |
2191 | + | |
2192 | +static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup) | |
2193 | +{ | |
2194 | + if (!cgroup->parent) | |
2195 | + return 0; | |
2196 | + return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files)); | |
2197 | +} | |
2198 | + | |
2199 | +struct cgroup_subsys freezer_subsys = { | |
2200 | + .name = "freezer", | |
2201 | + .create = freezer_create, | |
2202 | + .destroy = freezer_destroy, | |
2203 | + .populate = freezer_populate, | |
2204 | + .subsys_id = freezer_subsys_id, | |
2205 | + .can_attach = freezer_can_attach, | |
2206 | + .attach = NULL, | |
2207 | + .fork = freezer_fork, | |
2208 | + .exit = NULL, | |
2209 | +}; | |
2210 | --- /dev/null | |
2211 | +++ b/kernel/freezer.c | |
2212 | @@ -0,0 +1,154 @@ | |
2213 | +/* | |
2214 | + * kernel/freezer.c - Function to freeze a process | |
2215 | + * | |
2216 | + * Originally from kernel/power/process.c | |
2217 | + */ | |
2218 | + | |
2219 | +#include <linux/interrupt.h> | |
2220 | +#include <linux/suspend.h> | |
2221 | +#include <linux/module.h> | |
2222 | +#include <linux/syscalls.h> | |
2223 | +#include <linux/freezer.h> | |
2224 | + | |
2225 | +/* | |
2226 | + * freezing is complete, mark current process as frozen | |
2227 | + */ | |
2228 | +static inline void frozen_process(void) | |
2229 | +{ | |
2230 | + if (!unlikely(current->flags & PF_NOFREEZE)) { | |
2231 | + current->flags |= PF_FROZEN; | |
2232 | + wmb(); | |
2233 | + } | |
2234 | + clear_freeze_flag(current); | |
2235 | +} | |
2236 | + | |
2237 | +/* Refrigerator is place where frozen processes are stored :-). */ | |
2238 | +void refrigerator(void) | |
2239 | +{ | |
2240 | + /* Hmm, should we be allowed to suspend when there are realtime | |
2241 | + processes around? */ | |
2242 | + long save; | |
2243 | + | |
2244 | + task_lock(current); | |
2245 | + if (freezing(current)) { | |
2246 | + frozen_process(); | |
2247 | + task_unlock(current); | |
2248 | + } else { | |
2249 | + task_unlock(current); | |
2250 | + return; | |
2251 | + } | |
2252 | + save = current->state; | |
2253 | + pr_debug("%s entered refrigerator\n", current->comm); | |
2254 | + | |
2255 | + spin_lock_irq(&current->sighand->siglock); | |
2256 | + recalc_sigpending(); /* We sent fake signal, clean it up */ | |
2257 | + spin_unlock_irq(&current->sighand->siglock); | |
2258 | + | |
2259 | + for (;;) { | |
2260 | + set_current_state(TASK_UNINTERRUPTIBLE); | |
2261 | + if (!frozen(current)) | |
2262 | + break; | |
2263 | + schedule(); | |
2264 | + } | |
2265 | + pr_debug("%s left refrigerator\n", current->comm); | |
2266 | + __set_current_state(save); | |
2267 | +} | |
2268 | +EXPORT_SYMBOL(refrigerator); | |
2269 | + | |
2270 | +static void fake_signal_wake_up(struct task_struct *p) | |
2271 | +{ | |
2272 | + unsigned long flags; | |
2273 | + | |
2274 | + spin_lock_irqsave(&p->sighand->siglock, flags); | |
2275 | + signal_wake_up(p, 0); | |
2276 | + spin_unlock_irqrestore(&p->sighand->siglock, flags); | |
2277 | +} | |
2278 | + | |
2279 | +/** | |
2280 | + * freeze_task - send a freeze request to given task | |
2281 | + * @p: task to send the request to | |
2282 | + * @sig_only: if set, the request will only be sent if the task has the | |
2283 | + * PF_FREEZER_NOSIG flag unset | |
2284 | + * Return value: 'false', if @sig_only is set and the task has | |
2285 | + * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise | |
2286 | + * | |
2287 | + * The freeze request is sent by setting the tasks's TIF_FREEZE flag and | |
2288 | + * either sending a fake signal to it or waking it up, depending on whether | |
2289 | + * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task | |
2290 | + * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its | |
2291 | + * TIF_FREEZE flag will not be set. | |
2292 | + */ | |
2293 | +bool freeze_task(struct task_struct *p, bool sig_only) | |
2294 | +{ | |
2295 | + /* | |
2296 | + * We first check if the task is freezing and next if it has already | |
2297 | + * been frozen to avoid the race with frozen_process() which first marks | |
2298 | + * the task as frozen and next clears its TIF_FREEZE. | |
2299 | + */ | |
2300 | + if (!freezing(p)) { | |
2301 | + rmb(); | |
2302 | + if (frozen(p)) | |
2303 | + return false; | |
2304 | + | |
2305 | + if (!sig_only || should_send_signal(p)) | |
2306 | + set_freeze_flag(p); | |
2307 | + else | |
2308 | + return false; | |
2309 | + } | |
2310 | + | |
2311 | + if (should_send_signal(p)) { | |
2312 | + if (!signal_pending(p)) | |
2313 | + fake_signal_wake_up(p); | |
2314 | + } else if (sig_only) { | |
2315 | + return false; | |
2316 | + } else { | |
2317 | + wake_up_state(p, TASK_INTERRUPTIBLE); | |
2318 | + } | |
2319 | + | |
2320 | + return true; | |
2321 | +} | |
2322 | + | |
2323 | +void cancel_freezing(struct task_struct *p) | |
2324 | +{ | |
2325 | + unsigned long flags; | |
2326 | + | |
2327 | + if (freezing(p)) { | |
2328 | + pr_debug(" clean up: %s\n", p->comm); | |
2329 | + clear_freeze_flag(p); | |
2330 | + spin_lock_irqsave(&p->sighand->siglock, flags); | |
2331 | + recalc_sigpending_and_wake(p); | |
2332 | + spin_unlock_irqrestore(&p->sighand->siglock, flags); | |
2333 | + } | |
2334 | +} | |
2335 | + | |
2336 | +static int __thaw_process(struct task_struct *p) | |
2337 | +{ | |
2338 | + if (frozen(p)) { | |
2339 | + p->flags &= ~PF_FROZEN; | |
2340 | + return 1; | |
2341 | + } | |
2342 | + clear_freeze_flag(p); | |
2343 | + return 0; | |
2344 | +} | |
2345 | + | |
2346 | +/* | |
2347 | + * Wake up a frozen process | |
2348 | + * | |
2349 | + * task_lock() is needed to prevent the race with refrigerator() which may | |
2350 | + * occur if the freezing of tasks fails. Namely, without the lock, if the | |
2351 | + * freezing of tasks failed, thaw_tasks() might have run before a task in | |
2352 | + * refrigerator() could call frozen_process(), in which case the task would be | |
2353 | + * frozen and no one would thaw it. | |
2354 | + */ | |
2355 | +int thaw_process(struct task_struct *p) | |
2356 | +{ | |
2357 | + task_lock(p); | |
2358 | + if (__thaw_process(p) == 1) { | |
2359 | + task_unlock(p); | |
2360 | + wake_up_process(p); | |
2361 | + return 1; | |
2362 | + } | |
2363 | + task_unlock(p); | |
2364 | + return 0; | |
2365 | +} | |
2366 | +EXPORT_SYMBOL(thaw_process); | |
2367 | --- a/kernel/power/process.c | |
2368 | +++ b/kernel/power/process.c | |
2369 | @@ -28,121 +28,6 @@ static inline int freezeable(struct task | |
2370 | return 1; | |
2371 | } | |
2372 | ||
2373 | -/* | |
2374 | - * freezing is complete, mark current process as frozen | |
2375 | - */ | |
2376 | -static inline void frozen_process(void) | |
2377 | -{ | |
2378 | - if (!unlikely(current->flags & PF_NOFREEZE)) { | |
2379 | - current->flags |= PF_FROZEN; | |
2380 | - wmb(); | |
2381 | - } | |
2382 | - clear_freeze_flag(current); | |
2383 | -} | |
2384 | - | |
2385 | -/* Refrigerator is place where frozen processes are stored :-). */ | |
2386 | -void refrigerator(void) | |
2387 | -{ | |
2388 | - /* Hmm, should we be allowed to suspend when there are realtime | |
2389 | - processes around? */ | |
2390 | - long save; | |
2391 | - | |
2392 | - task_lock(current); | |
2393 | - if (freezing(current)) { | |
2394 | - frozen_process(); | |
2395 | - task_unlock(current); | |
2396 | - } else { | |
2397 | - task_unlock(current); | |
2398 | - return; | |
2399 | - } | |
2400 | - save = current->state; | |
2401 | - pr_debug("%s entered refrigerator\n", current->comm); | |
2402 | - | |
2403 | - spin_lock_irq(&current->sighand->siglock); | |
2404 | - recalc_sigpending(); /* We sent fake signal, clean it up */ | |
2405 | - spin_unlock_irq(&current->sighand->siglock); | |
2406 | - | |
2407 | - for (;;) { | |
2408 | - set_current_state(TASK_UNINTERRUPTIBLE); | |
2409 | - if (!frozen(current)) | |
2410 | - break; | |
2411 | - schedule(); | |
2412 | - } | |
2413 | - pr_debug("%s left refrigerator\n", current->comm); | |
2414 | - __set_current_state(save); | |
2415 | -} | |
2416 | - | |
2417 | -static void fake_signal_wake_up(struct task_struct *p) | |
2418 | -{ | |
2419 | - unsigned long flags; | |
2420 | - | |
2421 | - spin_lock_irqsave(&p->sighand->siglock, flags); | |
2422 | - signal_wake_up(p, 0); | |
2423 | - spin_unlock_irqrestore(&p->sighand->siglock, flags); | |
2424 | -} | |
2425 | - | |
2426 | -static inline bool should_send_signal(struct task_struct *p) | |
2427 | -{ | |
2428 | - return !(p->flags & PF_FREEZER_NOSIG); | |
2429 | -} | |
2430 | - | |
2431 | -/** | |
2432 | - * freeze_task - send a freeze request to given task | |
2433 | - * @p: task to send the request to | |
2434 | - * @sig_only: if set, the request will only be sent if the task has the | |
2435 | - * PF_FREEZER_NOSIG flag unset | |
2436 | - * Return value: 'false', if @sig_only is set and the task has | |
2437 | - * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise | |
2438 | - * | |
2439 | - * The freeze request is sent by setting the tasks's TIF_FREEZE flag and | |
2440 | - * either sending a fake signal to it or waking it up, depending on whether | |
2441 | - * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task | |
2442 | - * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its | |
2443 | - * TIF_FREEZE flag will not be set. | |
2444 | - */ | |
2445 | -static bool freeze_task(struct task_struct *p, bool sig_only) | |
2446 | -{ | |
2447 | - /* | |
2448 | - * We first check if the task is freezing and next if it has already | |
2449 | - * been frozen to avoid the race with frozen_process() which first marks | |
2450 | - * the task as frozen and next clears its TIF_FREEZE. | |
2451 | - */ | |
2452 | - if (!freezing(p)) { | |
2453 | - rmb(); | |
2454 | - if (frozen(p)) | |
2455 | - return false; | |
2456 | - | |
2457 | - if (!sig_only || should_send_signal(p)) | |
2458 | - set_freeze_flag(p); | |
2459 | - else | |
2460 | - return false; | |
2461 | - } | |
2462 | - | |
2463 | - if (should_send_signal(p)) { | |
2464 | - if (!signal_pending(p)) | |
2465 | - fake_signal_wake_up(p); | |
2466 | - } else if (sig_only) { | |
2467 | - return false; | |
2468 | - } else { | |
2469 | - wake_up_state(p, TASK_INTERRUPTIBLE); | |
2470 | - } | |
2471 | - | |
2472 | - return true; | |
2473 | -} | |
2474 | - | |
2475 | -static void cancel_freezing(struct task_struct *p) | |
2476 | -{ | |
2477 | - unsigned long flags; | |
2478 | - | |
2479 | - if (freezing(p)) { | |
2480 | - pr_debug(" clean up: %s\n", p->comm); | |
2481 | - clear_freeze_flag(p); | |
2482 | - spin_lock_irqsave(&p->sighand->siglock, flags); | |
2483 | - recalc_sigpending_and_wake(p); | |
2484 | - spin_unlock_irqrestore(&p->sighand->siglock, flags); | |
2485 | - } | |
2486 | -} | |
2487 | - | |
2488 | static int try_to_freeze_tasks(bool sig_only) | |
2489 | { | |
2490 | struct task_struct *g, *p; | |
2491 | @@ -250,6 +135,9 @@ static void thaw_tasks(bool nosig_only) | |
2492 | if (nosig_only && should_send_signal(p)) | |
2493 | continue; | |
2494 | ||
2495 | + if (cgroup_frozen(p)) | |
2496 | + continue; | |
2497 | + | |
2498 | thaw_process(p); | |
2499 | } while_each_thread(g, p); | |
2500 | read_unlock(&tasklist_lock); | |
2501 | @@ -264,4 +152,3 @@ void thaw_processes(void) | |
2502 | printk("done.\n"); | |
2503 | } | |
2504 | ||
2505 | -EXPORT_SYMBOL(refrigerator); |