/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *		      Nauman Rafique <nauman@google.com>
 *
 * For policy-specific per-blkcg data:
 * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
 *                    Arianna Avanzini <avanzini.arianna@gmail.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/blk-cgroup.h>
#include "blk.h"

#define MAX_KEY_LEN 100

/*
 * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
 * blkcg_pol_register_mutex nests outside of it and synchronizes entire
 * policy [un]register operations including cgroup file additions /
 * removals.  Putting cgroup file registration outside blkcg_pol_mutex
 * allows grabbing it from cgroup callbacks.
 */
static DEFINE_MUTEX(blkcg_pol_register_mutex);
static DEFINE_MUTEX(blkcg_pol_mutex);

struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);

struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;

static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */

static bool blkcg_debug_stats = false;

static bool blkcg_policy_enabled(struct request_queue *q,
				 const struct blkcg_policy *pol)
{
	return pol && test_bit(pol->plid, q->blkcg_pols);
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkcg_gq *blkg)
{
	int i;

	if (!blkg)
		return;

	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (blkg->pd[i])
			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);

	if (blkg->blkcg != &blkcg_root)
		blk_exit_rl(blkg->q, &blkg->rl);

	blkg_rwstat_exit(&blkg->stat_ios);
	blkg_rwstat_exit(&blkg->stat_bytes);
	kfree(blkg);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 * @gfp_mask: allocation mask to use
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
				   gfp_t gfp_mask)
{
	struct blkcg_gq *blkg;
	int i;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
	if (!blkg)
		return NULL;

	if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
	    blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
		goto err_free;

	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
	blkg->blkcg = blkcg;
	atomic_set(&blkg->refcnt, 1);

	/* root blkg uses @q->root_rl, init rl only for !root blkgs */
	if (blkcg != &blkcg_root) {
		if (blk_init_rl(&blkg->rl, q, gfp_mask))
			goto err_free;
		blkg->rl.blkg = blkg;
	}

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkg_policy_data *pd;

		if (!blkcg_policy_enabled(q, pol))
			continue;

		/* alloc per-policy data and attach it to blkg */
		pd = pol->pd_alloc_fn(gfp_mask, q->node);
		if (!pd)
			goto err_free;

		blkg->pd[i] = pd;
		pd->blkg = blkg;
		pd->plid = i;
	}

	return blkg;

err_free:
	blkg_free(blkg);
	return NULL;
}

struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
				      struct request_queue *q, bool update_hint)
{
	struct blkcg_gq *blkg;

	/*
	 * Hint didn't match.  Look up from the radix tree.  Note that the
	 * hint can only be updated under queue_lock as otherwise @blkg
	 * could have already been removed from blkg_tree.  The caller is
	 * responsible for grabbing queue_lock if @update_hint.
	 */
	blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
	if (blkg && blkg->q == q) {
		if (update_hint) {
			lockdep_assert_held(q->queue_lock);
			rcu_assign_pointer(blkcg->blkg_hint, blkg);
		}
		return blkg;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);

/*
 * If @new_blkg is %NULL, this function tries to allocate a new one as
 * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
 */
static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
				    struct request_queue *q,
				    struct blkcg_gq *new_blkg)
{
	struct blkcg_gq *blkg;
	struct bdi_writeback_congested *wb_congested;
	int i, ret;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	/* blkg holds a reference to blkcg */
	if (!css_tryget_online(&blkcg->css)) {
		ret = -ENODEV;
		goto err_free_blkg;
	}

	wb_congested = wb_congested_get_create(q->backing_dev_info,
					       blkcg->css.id,
					       GFP_NOWAIT | __GFP_NOWARN);
	if (!wb_congested) {
		ret = -ENOMEM;
		goto err_put_css;
	}

	/* allocate */
	if (!new_blkg) {
		new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
		if (unlikely(!new_blkg)) {
			ret = -ENOMEM;
			goto err_put_congested;
		}
	}
	blkg = new_blkg;
	blkg->wb_congested = wb_congested;

	/* link parent */
	if (blkcg_parent(blkcg)) {
		blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
		if (WARN_ON_ONCE(!blkg->parent)) {
			ret = -ENODEV;
			goto err_put_congested;
		}
		blkg_get(blkg->parent);
	}

	/* invoke per-policy init */
	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (blkg->pd[i] && pol->pd_init_fn)
			pol->pd_init_fn(blkg->pd[i]);
	}

	/* insert */
	spin_lock(&blkcg->lock);
	ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
	if (likely(!ret)) {
		hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
		list_add(&blkg->q_node, &q->blkg_list);

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

			if (blkg->pd[i] && pol->pd_online_fn)
				pol->pd_online_fn(blkg->pd[i]);
		}
	}
	blkg->online = true;
	spin_unlock(&blkcg->lock);

	if (!ret)
		return blkg;

	/* @blkg failed to be fully initialized, use the usual release path */
	blkg_put(blkg);
	return ERR_PTR(ret);

err_put_congested:
	wb_congested_put(wb_congested);
err_put_css:
	css_put(&blkcg->css);
err_free_blkg:
	blkg_free(new_blkg);
	return ERR_PTR(ret);
}

/**
 * blkg_lookup_create - lookup blkg, try to create one if not there
 * @blkcg: blkcg of interest
 * @q: request_queue of interest
 *
 * Lookup blkg for the @blkcg - @q pair.  If it doesn't exist, try to
 * create one.  blkg creation is performed recursively from blkcg_root such
 * that all non-root blkg's have access to the parent blkg.  This function
 * should be called under RCU read lock and @q->queue_lock.
 *
 * Returns pointer to the looked up or created blkg on success, ERR_PTR()
 * value on error.  If @q is dying, returns ERR_PTR(-ENODEV).  If @q is not
 * dying but bypassing, returns ERR_PTR(-EBUSY).
 */
struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
				    struct request_queue *q)
{
	struct blkcg_gq *blkg;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	/*
	 * This could be the first entry point of blkcg implementation and
	 * we shouldn't allow anything to go through for a bypassing queue.
	 */
	if (unlikely(blk_queue_bypass(q)))
		return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);

	blkg = __blkg_lookup(blkcg, q, true);
	if (blkg)
		return blkg;

	/*
	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
	 * non-root blkgs have access to their parents.
	 */
	while (true) {
		struct blkcg *pos = blkcg;
		struct blkcg *parent = blkcg_parent(blkcg);

		while (parent && !__blkg_lookup(parent, q, false)) {
			pos = parent;
			parent = blkcg_parent(parent);
		}

		blkg = blkg_create(pos, q, NULL);
		if (pos == blkcg || IS_ERR(blkg))
			return blkg;
	}
}

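/*
 * Illustrative sketch (not part of the original file): the typical calling
 * pattern for blkg_lookup_create().  example_use_blkg() is a hypothetical
 * helper shown only to demonstrate the required locking and error handling;
 * the real I/O fast path (see blkcg_bio_issue_check() in blk-cgroup.h) tries
 * a plain blkg_lookup() first and only falls back to this create path on a
 * miss.
 */
static void __maybe_unused example_use_blkg(struct blkcg *blkcg,
					    struct request_queue *q)
{
	struct blkcg_gq *blkg;

	/* blkg_lookup_create() must be called with both of these held */
	rcu_read_lock();
	spin_lock_irq(q->queue_lock);

	blkg = blkg_lookup_create(blkcg, q);
	if (!IS_ERR(blkg)) {
		/*
		 * The blkg is valid while the locks are held; this is where
		 * a caller would charge stats or look at blkg->pd[].
		 */
	}

	spin_unlock_irq(q->queue_lock);
	rcu_read_unlock();
}
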
static void blkg_pd_offline(struct blkcg_gq *blkg)
{
	int i;

	lockdep_assert_held(blkg->q->queue_lock);
	lockdep_assert_held(&blkg->blkcg->lock);

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (blkg->pd[i] && !blkg->pd[i]->offline &&
		    pol->pd_offline_fn) {
			pol->pd_offline_fn(blkg->pd[i]);
			blkg->pd[i]->offline = true;
		}
	}
}

static void blkg_destroy(struct blkcg_gq *blkg)
{
	struct blkcg *blkcg = blkg->blkcg;
	struct blkcg_gq *parent = blkg->parent;

	lockdep_assert_held(blkg->q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/* Something wrong if we are trying to remove same group twice */
	WARN_ON_ONCE(list_empty(&blkg->q_node));
	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));

	if (parent) {
		blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
		blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
	}

	blkg->online = false;

	radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
	list_del_init(&blkg->q_node);
	hlist_del_init_rcu(&blkg->blkcg_node);

	/*
	 * Both setting lookup hint to and clearing it from @blkg are done
	 * under queue_lock.  If it's not pointing to @blkg now, it never
	 * will.  Hint assignment itself can race safely.
	 */
	if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
		rcu_assign_pointer(blkcg->blkg_hint, NULL);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	blkg_put(blkg);
}

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 *
 * Destroy all blkgs associated with @q.
 */
static void blkg_destroy_all(struct request_queue *q)
{
	struct blkcg_gq *blkg, *n;

	lockdep_assert_held(q->queue_lock);

	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct blkcg *blkcg = blkg->blkcg;

		spin_lock(&blkcg->lock);
		blkg_pd_offline(blkg);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);
	}

	q->root_blkg = NULL;
	q->root_rl.blkg = NULL;
}

/*
 * A group is RCU protected, but having an rcu lock does not mean that one
 * can access all the fields of blkg and assume these are valid.  For
 * example, don't try to follow throtl_data and request queue links.
 *
 * Having a reference to blkg under an rcu allows accesses to only values
 * local to groups like group stats and group rate limits.
 */
void __blkg_release_rcu(struct rcu_head *rcu_head)
{
	struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);

	/* release the blkcg and parent blkg refs this blkg has been holding */
	css_put(&blkg->blkcg->css);
	if (blkg->parent)
		blkg_put(blkg->parent);

	wb_congested_put(blkg->wb_congested);

	blkg_free(blkg);
}
EXPORT_SYMBOL_GPL(__blkg_release_rcu);

/*
 * The next function used by blk_queue_for_each_rl().  It's a bit tricky
 * because the root blkg uses @q->root_rl instead of its own rl.
 */
struct request_list *__blk_queue_next_rl(struct request_list *rl,
					 struct request_queue *q)
{
	struct list_head *ent;
	struct blkcg_gq *blkg;

	/*
	 * Determine the current blkg list_head.  The first entry is
	 * root_rl which is off @q->blkg_list and mapped to the head.
	 */
	if (rl == &q->root_rl) {
		ent = &q->blkg_list;
		/* There are no more block groups, hence no request lists */
		if (list_empty(ent))
			return NULL;
	} else {
		blkg = container_of(rl, struct blkcg_gq, rl);
		ent = &blkg->q_node;
	}

	/* walk to the next list_head, skip root blkcg */
	ent = ent->next;
	if (ent == &q->root_blkg->q_node)
		ent = ent->next;
	if (ent == &q->blkg_list)
		return NULL;

	blkg = container_of(ent, struct blkcg_gq, q_node);
	return &blkg->rl;
}

static int blkcg_reset_stats(struct cgroup_subsys_state *css,
			     struct cftype *cftype, u64 val)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkcg_gq *blkg;
	int i;

	mutex_lock(&blkcg_pol_mutex);
	spin_lock_irq(&blkcg->lock);

	/*
	 * Note that stat reset is racy - it doesn't synchronize against
	 * stat updates.  This is a debug feature which shouldn't exist
	 * anyway.  If you get hit by a race, retry.
	 */
	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
		blkg_rwstat_reset(&blkg->stat_bytes);
		blkg_rwstat_reset(&blkg->stat_ios);

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

			if (blkg->pd[i] && pol->pd_reset_stats_fn)
				pol->pd_reset_stats_fn(blkg->pd[i]);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	mutex_unlock(&blkcg_pol_mutex);
	return 0;
}

const char *blkg_dev_name(struct blkcg_gq *blkg)
{
	/* some drivers (floppy) instantiate a queue w/o disk registered */
	if (blkg->q->backing_dev_info->dev)
		return dev_name(blkg->q->backing_dev_info->dev);
	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_dev_name);

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 * policy data and @data and the matching queue lock held.  If @show_total
 * is %true, the sum of the return values from @prfill is printed with
 * "Total" label at the end.
 *
 * This is to be used to construct print functions for
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
		       u64 (*prfill)(struct seq_file *,
				     struct blkg_policy_data *, int),
		       const struct blkcg_policy *pol, int data,
		       bool show_total)
{
	struct blkcg_gq *blkg;
	u64 total = 0;

	rcu_read_lock();
	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		spin_lock_irq(blkg->q->queue_lock);
		if (blkcg_policy_enabled(blkg->q, pol))
			total += prfill(sf, blkg->pd[pol->plid], data);
		spin_unlock_irq(blkg->q->queue_lock);
	}
	rcu_read_unlock();

	if (show_total)
		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);

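/*
 * Illustrative sketch (not part of the original file): a seq_show callback
 * built on blkcg_print_blkgs().  example_policy and struct example_pd are
 * hypothetical (see the policy sketch near blkcg_policy_register()), so the
 * block is guarded with #if 0.  blkg_prfill_stat() and blkg_prfill_rwstat()
 * below are the stock prfill callbacks such a handler would normally use.
 */
#if 0
static u64 example_prfill_limit(struct seq_file *sf,
				struct blkg_policy_data *pd, int off)
{
	struct example_pd *epd = container_of(pd, struct example_pd, pd);

	/* print "<dev> <limit>" for this blkg */
	return __blkg_prfill_u64(sf, pd, epd->limit);
}

static int example_print_limit(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), example_prfill_limit,
			  &example_policy, 0, false);
	return 0;
}
#endif
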
/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
	const char *dname = blkg_dev_name(pd->blkg);

	if (!dname)
		return 0;

	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
	return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/**
 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @rwstat: rwstat to print
 *
 * Print @rwstat to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
			 const struct blkg_rwstat *rwstat)
{
	static const char *rwstr[] = {
		[BLKG_RWSTAT_READ]	= "Read",
		[BLKG_RWSTAT_WRITE]	= "Write",
		[BLKG_RWSTAT_SYNC]	= "Sync",
		[BLKG_RWSTAT_ASYNC]	= "Async",
	};
	const char *dname = blkg_dev_name(pd->blkg);
	u64 v;
	int i;

	if (!dname)
		return 0;

	for (i = 0; i < BLKG_RWSTAT_NR; i++)
		seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
			   (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));

	v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
		atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]);
	seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
	return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);

/**
 * blkg_prfill_stat - prfill callback for blkg_stat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @off: offset to the blkg_stat in @pd
 *
 * prfill callback for printing a blkg_stat.
 */
u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
{
	return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
}
EXPORT_SYMBOL_GPL(blkg_prfill_stat);

/**
 * blkg_prfill_rwstat - prfill callback for blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @off: offset to the blkg_rwstat in @pd
 *
 * prfill callback for printing a blkg_rwstat.
 */
u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
		       int off)
{
	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);

	return __blkg_prfill_rwstat(sf, pd, &rwstat);
}
EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);

static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
				    struct blkg_policy_data *pd, int off)
{
	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off);

	return __blkg_prfill_rwstat(sf, pd, &rwstat);
}

/**
 * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes
 * @sf: seq_file to print to
 * @v: unused
 *
 * To be used as cftype->seq_show to print blkg->stat_bytes.
 * cftype->private must be set to the blkcg_policy.
 */
int blkg_print_stat_bytes(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
			  offsetof(struct blkcg_gq, stat_bytes), true);
	return 0;
}
EXPORT_SYMBOL_GPL(blkg_print_stat_bytes);

/**
 * blkg_print_stat_ios - seq_show callback for blkg->stat_ios
 * @sf: seq_file to print to
 * @v: unused
 *
 * To be used as cftype->seq_show to print blkg->stat_ios.  cftype->private
 * must be set to the blkcg_policy.
 */
int blkg_print_stat_ios(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
			  offsetof(struct blkcg_gq, stat_ios), true);
	return 0;
}
EXPORT_SYMBOL_GPL(blkg_print_stat_ios);

static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
					      struct blkg_policy_data *pd,
					      int off)
{
	struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg,
							      NULL, off);
	return __blkg_prfill_rwstat(sf, pd, &rwstat);
}

/**
 * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes
 * @sf: seq_file to print to
 * @v: unused
 */
int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  blkg_prfill_rwstat_field_recursive,
			  (void *)seq_cft(sf)->private,
			  offsetof(struct blkcg_gq, stat_bytes), true);
	return 0;
}
EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive);

/**
 * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios
 * @sf: seq_file to print to
 * @v: unused
 */
int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  blkg_prfill_rwstat_field_recursive,
			  (void *)seq_cft(sf)->private,
			  offsetof(struct blkcg_gq, stat_ios), true);
	return 0;
}
EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);

/**
 * blkg_stat_recursive_sum - collect hierarchical blkg_stat
 * @blkg: blkg of interest
 * @pol: blkcg_policy which contains the blkg_stat
 * @off: offset to the blkg_stat in blkg_policy_data or @blkg
 *
 * Collect the blkg_stat specified by @blkg, @pol and @off and all its
 * online descendants and their aux counts.  The caller must be holding the
 * queue lock for online tests.
 *
 * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
 * at @off bytes into @blkg's blkg_policy_data of the policy.
 */
u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
			    struct blkcg_policy *pol, int off)
{
	struct blkcg_gq *pos_blkg;
	struct cgroup_subsys_state *pos_css;
	u64 sum = 0;

	lockdep_assert_held(blkg->q->queue_lock);

	rcu_read_lock();
	blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
		struct blkg_stat *stat;

		if (!pos_blkg->online)
			continue;

		if (pol)
			stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
		else
			stat = (void *)blkg + off;

		sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt);
	}
	rcu_read_unlock();

	return sum;
}
EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);

/**
 * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
 * @blkg: blkg of interest
 * @pol: blkcg_policy which contains the blkg_rwstat
 * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
 *
 * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
 * online descendants and their aux counts.  The caller must be holding the
 * queue lock for online tests.
 *
 * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
 * is at @off bytes into @blkg's blkg_policy_data of the policy.
 */
struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
					     struct blkcg_policy *pol, int off)
{
	struct blkcg_gq *pos_blkg;
	struct cgroup_subsys_state *pos_css;
	struct blkg_rwstat sum = { };
	int i;

	lockdep_assert_held(blkg->q->queue_lock);

	rcu_read_lock();
	blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
		struct blkg_rwstat *rwstat;

		if (!pos_blkg->online)
			continue;

		if (pol)
			rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
		else
			rwstat = (void *)pos_blkg + off;

		for (i = 0; i < BLKG_RWSTAT_NR; i++)
			atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) +
				percpu_counter_sum_positive(&rwstat->cpu_cnt[i]),
				&sum.aux_cnt[i]);
	}
	rcu_read_unlock();

	return sum;
}
EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);

/* Performs queue bypass and policy enabled checks then looks up blkg. */
static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
					  const struct blkcg_policy *pol,
					  struct request_queue *q)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	if (!blkcg_policy_enabled(q, pol))
		return ERR_PTR(-EOPNOTSUPP);

	/*
	 * This could be the first entry point of blkcg implementation and
	 * we shouldn't allow anything to go through for a bypassing queue.
	 */
	if (unlikely(blk_queue_bypass(q)))
		return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);

	return __blkg_lookup(blkcg, q, true /* update_hint */);
}

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @pol: target policy
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse per-blkg config update from @input and initialize @ctx with the
 * result.  @ctx->blkg points to the blkg to be updated and @ctx->body the
 * part of @input following MAJ:MIN.  This function returns with RCU read
 * lock and queue lock held and must be paired with blkg_conf_finish().
 */
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
		   char *input, struct blkg_conf_ctx *ctx)
	__acquires(rcu) __acquires(disk->queue->queue_lock)
{
	struct gendisk *disk;
	struct request_queue *q;
	struct blkcg_gq *blkg;
	unsigned int major, minor;
	int key_len, part, ret;
	char *body;

	if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
		return -EINVAL;

	body = input + key_len;
	if (!isspace(*body))
		return -EINVAL;
	body = skip_spaces(body);

	disk = get_gendisk(MKDEV(major, minor), &part);
	if (!disk)
		return -ENODEV;
	if (part) {
		ret = -ENODEV;
		goto fail;
	}

	q = disk->queue;

	rcu_read_lock();
	spin_lock_irq(q->queue_lock);

	blkg = blkg_lookup_check(blkcg, pol, q);
	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		goto fail_unlock;
	}

	if (blkg)
		goto success;

	/*
	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
	 * non-root blkgs have access to their parents.
	 */
	while (true) {
		struct blkcg *pos = blkcg;
		struct blkcg *parent;
		struct blkcg_gq *new_blkg;

		parent = blkcg_parent(blkcg);
		while (parent && !__blkg_lookup(parent, q, false)) {
			pos = parent;
			parent = blkcg_parent(parent);
		}

		/* Drop locks to do new blkg allocation with GFP_KERNEL. */
		spin_unlock_irq(q->queue_lock);
		rcu_read_unlock();

		new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
		if (unlikely(!new_blkg)) {
			ret = -ENOMEM;
			goto fail;
		}

		rcu_read_lock();
		spin_lock_irq(q->queue_lock);

		blkg = blkg_lookup_check(pos, pol, q);
		if (IS_ERR(blkg)) {
			ret = PTR_ERR(blkg);
			goto fail_unlock;
		}

		if (blkg) {
			blkg_free(new_blkg);
		} else {
			blkg = blkg_create(pos, q, new_blkg);
			if (unlikely(IS_ERR(blkg))) {
				ret = PTR_ERR(blkg);
				goto fail_unlock;
			}
		}

		if (pos == blkcg)
			goto success;
	}
success:
	ctx->disk = disk;
	ctx->blkg = blkg;
	ctx->body = body;
	return 0;

fail_unlock:
	spin_unlock_irq(q->queue_lock);
	rcu_read_unlock();
fail:
	put_disk_and_module(disk);
	/*
	 * If queue was bypassing, we should retry.  Do so after a
	 * short msleep().  It isn't strictly necessary but queue
	 * can be bypassing for some time and it's always nice to
	 * avoid busy looping.
	 */
	if (ret == -EBUSY) {
		msleep(10);
		ret = restart_syscall();
	}
	return ret;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 *
 * Finish up after per-blkg config update.  This function must be paired
 * with blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
	__releases(ctx->disk->queue->queue_lock) __releases(rcu)
{
	spin_unlock_irq(ctx->disk->queue->queue_lock);
	rcu_read_unlock();
	put_disk_and_module(ctx->disk);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);

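/*
 * Illustrative sketch (not part of the original file): how a policy's
 * cgroup file write handler typically pairs blkg_conf_prep() with
 * blkg_conf_finish().  example_policy, struct example_pd and
 * example_set_limit() are hypothetical, so the block is guarded with
 * #if 0; the overall shape mirrors real per-blkg knobs such as the
 * throttling limits.
 */
#if 0
static ssize_t example_set_limit(struct kernfs_open_file *of,
				 char *buf, size_t nbytes, loff_t off)
{
	struct blkcg *blkcg = css_to_blkcg(of_css(of));
	struct blkg_conf_ctx ctx;
	u64 limit;
	int ret;

	/* parses "MAJ:MIN <body>" and returns with RCU + queue_lock held */
	ret = blkg_conf_prep(blkcg, &example_policy, buf, &ctx);
	if (ret)
		return ret;

	ret = -EINVAL;
	if (sscanf(ctx.body, "%llu", &limit) == 1) {
		struct blkg_policy_data *pd =
			blkg_to_pd(ctx.blkg, &example_policy);

		/* update the hypothetical per-blkg field */
		container_of(pd, struct example_pd, pd)->limit = limit;
		ret = 0;
	}

	blkg_conf_finish(&ctx);	/* drops the locks taken by blkg_conf_prep() */
	return ret ?: nbytes;
}
#endif
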
static int blkcg_print_stat(struct seq_file *sf, void *v)
{
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
	struct blkcg_gq *blkg;

	rcu_read_lock();

	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		const char *dname;
		char *buf;
		struct blkg_rwstat rwstat;
		u64 rbytes, wbytes, rios, wios;
		size_t size = seq_get_buf(sf, &buf), off = 0;
		int i;
		bool has_stats = false;

		dname = blkg_dev_name(blkg);
		if (!dname)
			continue;

		/*
		 * Hooray string manipulation, count is the size written NOT
		 * INCLUDING THE \0, so size is now count+1 less than what we
		 * had before, but we want to start writing the next bit from
		 * the \0 so we only add count to buf.
		 */
		off += scnprintf(buf+off, size-off, "%s ", dname);

		spin_lock_irq(blkg->q->queue_lock);

		rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
					offsetof(struct blkcg_gq, stat_bytes));
		rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
		wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);

		rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
					offsetof(struct blkcg_gq, stat_ios));
		rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
		wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);

		spin_unlock_irq(blkg->q->queue_lock);

		if (rbytes || wbytes || rios || wios) {
			has_stats = true;
			off += scnprintf(buf+off, size-off,
					 "rbytes=%llu wbytes=%llu rios=%llu wios=%llu",
					 rbytes, wbytes, rios, wios);
		}

		if (!blkcg_debug_stats)
			goto next;

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];
			size_t written;

			if (!blkg->pd[i] || !pol->pd_stat_fn)
				continue;

			written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off);
			if (written)
				has_stats = true;
			off += written;
		}
next:
		if (has_stats) {
			off += scnprintf(buf+off, size-off, "\n");
			seq_commit(sf, off);
		}
	}

	rcu_read_unlock();
	return 0;
}

static struct cftype blkcg_files[] = {
	{
		.name = "stat",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = blkcg_print_stat,
	},
	{ }	/* terminate */
};

static struct cftype blkcg_legacy_files[] = {
	{
		.name = "reset_stats",
		.write_u64 = blkcg_reset_stats,
	},
	{ }	/* terminate */
};

/**
 * blkcg_css_offline - cgroup css_offline callback
 * @css: css of interest
 *
 * This function is called when @css is about to go away and is responsible
 * for offlining the pd of all blkgs and killing all wbs associated with
 * @css.  Offlining a blkg's pd must be done while holding both the q and
 * blkcg locks.  As the blkcg lock is nested inside the q lock, this
 * function performs reverse double lock dancing.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
static void blkcg_css_offline(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkcg_gq *blkg;

	spin_lock_irq(&blkcg->lock);

	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
		struct request_queue *q = blkg->q;

		if (spin_trylock(q->queue_lock)) {
			blkg_pd_offline(blkg);
			spin_unlock(q->queue_lock);
		} else {
			spin_unlock_irq(&blkcg->lock);
			cpu_relax();
			spin_lock_irq(&blkcg->lock);
		}
	}

	spin_unlock_irq(&blkcg->lock);

	wb_blkcg_offline(blkcg);
}

/**
 * blkcg_destroy_all_blkgs - destroy all blkgs associated with a blkcg
 * @blkcg: blkcg of interest
 *
 * This function is called when the blkcg css is about to be freed and is
 * responsible for destroying all blkgs associated with @blkcg.
 * blkgs must be removed while holding both the q and blkcg locks.  As the
 * blkcg lock is nested inside the q lock, this function performs reverse
 * double lock dancing.
 */
static void blkcg_destroy_all_blkgs(struct blkcg *blkcg)
{
	spin_lock_irq(&blkcg->lock);
	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
						    struct blkcg_gq,
						    blkcg_node);
		struct request_queue *q = blkg->q;

		if (spin_trylock(q->queue_lock)) {
			blkg_destroy(blkg);
			spin_unlock(q->queue_lock);
		} else {
			spin_unlock_irq(&blkcg->lock);
			cpu_relax();
			spin_lock_irq(&blkcg->lock);
		}
	}
	spin_unlock_irq(&blkcg->lock);
}

static void blkcg_css_free(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	int i;

	blkcg_destroy_all_blkgs(blkcg);

	mutex_lock(&blkcg_pol_mutex);

	list_del(&blkcg->all_blkcgs_node);

	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (blkcg->cpd[i])
			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

	mutex_unlock(&blkcg_pol_mutex);

	kfree(blkcg);
}

static struct cgroup_subsys_state *
blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct blkcg *blkcg;
	struct cgroup_subsys_state *ret;
	int i;

	mutex_lock(&blkcg_pol_mutex);

	if (!parent_css) {
		blkcg = &blkcg_root;
	} else {
		blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
		if (!blkcg) {
			ret = ERR_PTR(-ENOMEM);
			goto unlock;
		}
	}

	for (i = 0; i < BLKCG_MAX_POLS ; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkcg_policy_data *cpd;

		/*
		 * If the policy hasn't been attached yet, wait for it
		 * to be attached before doing anything else. Otherwise,
		 * check if the policy requires any specific per-cgroup
		 * data: if it does, allocate and initialize it.
		 */
		if (!pol || !pol->cpd_alloc_fn)
			continue;

		cpd = pol->cpd_alloc_fn(GFP_KERNEL);
		if (!cpd) {
			ret = ERR_PTR(-ENOMEM);
			goto free_pd_blkcg;
		}
		blkcg->cpd[i] = cpd;
		cpd->blkcg = blkcg;
		cpd->plid = i;
		if (pol->cpd_init_fn)
			pol->cpd_init_fn(cpd);
	}

	spin_lock_init(&blkcg->lock);
	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
	INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
	INIT_LIST_HEAD(&blkcg->cgwb_list);
#endif
	list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);

	mutex_unlock(&blkcg_pol_mutex);
	return &blkcg->css;

free_pd_blkcg:
	for (i--; i >= 0; i--)
		if (blkcg->cpd[i])
			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

	if (blkcg != &blkcg_root)
		kfree(blkcg);
unlock:
	mutex_unlock(&blkcg_pol_mutex);
	return ret;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
	struct blkcg_gq *new_blkg, *blkg;
	bool preloaded;
	int ret;

	new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
	if (!new_blkg)
		return -ENOMEM;

	preloaded = !radix_tree_preload(GFP_KERNEL);

	/* Make sure the root blkg exists. */
	rcu_read_lock();
	spin_lock_irq(q->queue_lock);
	blkg = blkg_create(&blkcg_root, q, new_blkg);
	if (IS_ERR(blkg))
		goto err_unlock;
	q->root_blkg = blkg;
	q->root_rl.blkg = blkg;
	spin_unlock_irq(q->queue_lock);
	rcu_read_unlock();

	if (preloaded)
		radix_tree_preload_end();

	ret = blk_throtl_init(q);
	if (ret) {
		spin_lock_irq(q->queue_lock);
		blkg_destroy_all(q);
		spin_unlock_irq(q->queue_lock);
	}
	return ret;

err_unlock:
	spin_unlock_irq(q->queue_lock);
	rcu_read_unlock();
	if (preloaded)
		radix_tree_preload_end();
	return PTR_ERR(blkg);
}

/**
 * blkcg_drain_queue - drain blkcg part of request_queue
 * @q: request_queue to drain
 *
 * Called from blk_drain_queue().  Responsible for draining blkcg part.
 */
void blkcg_drain_queue(struct request_queue *q)
{
	lockdep_assert_held(q->queue_lock);

	/*
	 * @q could be exiting and already have destroyed all blkgs as
	 * indicated by NULL root_blkg.  If so, don't confuse policies.
	 */
	if (!q->root_blkg)
		return;

	blk_throtl_drain(q);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue().  Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
	spin_lock_irq(q->queue_lock);
	blkg_destroy_all(q);
	spin_unlock_irq(q->queue_lock);

	blk_throtl_exit(q);
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkcg_can_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *dst_css;
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	cgroup_taskset_for_each(task, dst_css, tset) {
		task_lock(task);
		ioc = task->io_context;
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
			ret = -EINVAL;
		task_unlock(task);
		if (ret)
			break;
	}
	return ret;
}

static void blkcg_bind(struct cgroup_subsys_state *root_css)
{
	int i;

	mutex_lock(&blkcg_pol_mutex);

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkcg *blkcg;

		if (!pol || !pol->cpd_bind_fn)
			continue;

		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
			if (blkcg->cpd[pol->plid])
				pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
	}
	mutex_unlock(&blkcg_pol_mutex);
}

struct cgroup_subsys io_cgrp_subsys = {
	.css_alloc = blkcg_css_alloc,
	.css_offline = blkcg_css_offline,
	.css_free = blkcg_css_free,
	.can_attach = blkcg_can_attach,
	.bind = blkcg_bind,
	.dfl_cftypes = blkcg_files,
	.legacy_cftypes = blkcg_legacy_files,
	.legacy_name = "blkio",
#ifdef CONFIG_MEMCG
	/*
	 * This ensures that, if available, memcg is automatically enabled
	 * together on the default hierarchy so that the owner cgroup can
	 * be retrieved from writeback pages.
	 */
	.depends_on = 1 << memory_cgrp_id,
#endif
};
EXPORT_SYMBOL_GPL(io_cgrp_subsys);

/**
 * blkcg_activate_policy - activate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to activate
 *
 * Activate @pol on @q.  Requires %GFP_KERNEL context.  @q goes through
 * bypass mode to populate its blkgs with policy_data for @pol.
 *
 * Activation happens with @q bypassed, so nobody would be accessing blkgs
 * from IO path.  Update of each blkg is protected by both queue and blkcg
 * locks so that holding either lock and testing blkcg_policy_enabled() is
 * always enough for dereferencing policy data.
 *
 * The caller is responsible for synchronizing [de]activations and policy
 * [un]registrations.  Returns 0 on success, -errno on failure.
 */
int blkcg_activate_policy(struct request_queue *q,
			  const struct blkcg_policy *pol)
{
	struct blkg_policy_data *pd_prealloc = NULL;
	struct blkcg_gq *blkg;
	int ret;

	if (blkcg_policy_enabled(q, pol))
		return 0;

	if (q->mq_ops)
		blk_mq_freeze_queue(q);
	else
		blk_queue_bypass_start(q);
pd_prealloc:
	if (!pd_prealloc) {
		pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
		if (!pd_prealloc) {
			ret = -ENOMEM;
			goto out_bypass_end;
		}
	}

	spin_lock_irq(q->queue_lock);

	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		struct blkg_policy_data *pd;

		if (blkg->pd[pol->plid])
			continue;

		pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q->node);
		if (!pd)
			swap(pd, pd_prealloc);
		if (!pd) {
			spin_unlock_irq(q->queue_lock);
			goto pd_prealloc;
		}

		blkg->pd[pol->plid] = pd;
		pd->blkg = blkg;
		pd->plid = pol->plid;
		if (pol->pd_init_fn)
			pol->pd_init_fn(pd);
	}

	__set_bit(pol->plid, q->blkcg_pols);
	ret = 0;

	spin_unlock_irq(q->queue_lock);
out_bypass_end:
	if (q->mq_ops)
		blk_mq_unfreeze_queue(q);
	else
		blk_queue_bypass_end(q);
	if (pd_prealloc)
		pol->pd_free_fn(pd_prealloc);
	return ret;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);

/**
 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to deactivate
 *
 * Deactivate @pol on @q.  Follows the same synchronization rules as
 * blkcg_activate_policy().
 */
void blkcg_deactivate_policy(struct request_queue *q,
			     const struct blkcg_policy *pol)
{
	struct blkcg_gq *blkg;

	if (!blkcg_policy_enabled(q, pol))
		return;

	if (q->mq_ops)
		blk_mq_freeze_queue(q);
	else
		blk_queue_bypass_start(q);

	spin_lock_irq(q->queue_lock);

	__clear_bit(pol->plid, q->blkcg_pols);

	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		if (blkg->pd[pol->plid]) {
			if (!blkg->pd[pol->plid]->offline &&
			    pol->pd_offline_fn) {
				pol->pd_offline_fn(blkg->pd[pol->plid]);
				blkg->pd[pol->plid]->offline = true;
			}
			pol->pd_free_fn(blkg->pd[pol->plid]);
			blkg->pd[pol->plid] = NULL;
		}
	}

	spin_unlock_irq(q->queue_lock);

	if (q->mq_ops)
		blk_mq_unfreeze_queue(q);
	else
		blk_queue_bypass_end(q);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);

/**
 * blkcg_policy_register - register a blkcg policy
 * @pol: blkcg policy to register
 *
 * Register @pol with blkcg core.  Might sleep and @pol may be modified on
 * successful registration.  Returns 0 on success and -errno on failure.
 */
int blkcg_policy_register(struct blkcg_policy *pol)
{
	struct blkcg *blkcg;
	int i, ret;

	mutex_lock(&blkcg_pol_register_mutex);
	mutex_lock(&blkcg_pol_mutex);

	/* find an empty slot */
	ret = -ENOSPC;
	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (!blkcg_policy[i])
			break;
	if (i >= BLKCG_MAX_POLS)
		goto err_unlock;

	/* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn come in pairs */
	if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
		(!pol->pd_alloc_fn ^ !pol->pd_free_fn))
		goto err_unlock;

	/* register @pol */
	pol->plid = i;
	blkcg_policy[pol->plid] = pol;

	/* allocate and install cpd's */
	if (pol->cpd_alloc_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			struct blkcg_policy_data *cpd;

			cpd = pol->cpd_alloc_fn(GFP_KERNEL);
			if (!cpd)
				goto err_free_cpds;

			blkcg->cpd[pol->plid] = cpd;
			cpd->blkcg = blkcg;
			cpd->plid = pol->plid;
			pol->cpd_init_fn(cpd);
		}
	}

	mutex_unlock(&blkcg_pol_mutex);

	/* everything is in place, add intf files for the new policy */
	if (pol->dfl_cftypes)
		WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
					       pol->dfl_cftypes));
	if (pol->legacy_cftypes)
		WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
						  pol->legacy_cftypes));
	mutex_unlock(&blkcg_pol_register_mutex);
	return 0;

err_free_cpds:
	if (pol->cpd_free_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			if (blkcg->cpd[pol->plid]) {
				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
				blkcg->cpd[pol->plid] = NULL;
			}
		}
	}
	blkcg_policy[pol->plid] = NULL;
err_unlock:
	mutex_unlock(&blkcg_pol_mutex);
	mutex_unlock(&blkcg_pol_register_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(blkcg_policy_register);

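/*
 * Illustrative sketch (not part of the original file): the minimal shape of
 * a blkcg policy and its registration from module init.  struct example_pd,
 * example_pd_alloc/free(), example_cftypes and example_print_limit() are all
 * hypothetical, so the block is guarded with #if 0; compare with real
 * policies such as the throttling code for a complete picture.  Once
 * registered, a policy is enabled on a particular queue with
 * blkcg_activate_policy(q, &example_policy) and torn down with
 * blkcg_deactivate_policy().
 */
#if 0
struct example_pd {
	struct blkg_policy_data pd;	/* handed back to blkcg core */
	u64 limit;			/* hypothetical per-blkg setting */
};

static struct blkg_policy_data *example_pd_alloc(gfp_t gfp, int node)
{
	struct example_pd *epd = kzalloc_node(sizeof(*epd), gfp, node);

	return epd ? &epd->pd : NULL;
}

static void example_pd_free(struct blkg_policy_data *pd)
{
	kfree(container_of(pd, struct example_pd, pd));
}

static struct cftype example_cftypes[] = {
	{
		.name = "example_limit",
		.seq_show = example_print_limit,	/* see sketch above */
	},
	{ }	/* terminate */
};

static struct blkcg_policy example_policy = {
	.dfl_cftypes	= example_cftypes,
	.pd_alloc_fn	= example_pd_alloc,
	.pd_free_fn	= example_pd_free,
};

static int __init example_init(void)
{
	/* grabs a free plid and adds the cgroup interface files */
	return blkcg_policy_register(&example_policy);
}

static void __exit example_exit(void)
{
	blkcg_policy_unregister(&example_policy);
}
#endif
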
/**
 * blkcg_policy_unregister - unregister a blkcg policy
 * @pol: blkcg policy to unregister
 *
 * Undo blkcg_policy_register(@pol).  Might sleep.
 */
void blkcg_policy_unregister(struct blkcg_policy *pol)
{
	struct blkcg *blkcg;

	mutex_lock(&blkcg_pol_register_mutex);

	if (WARN_ON(blkcg_policy[pol->plid] != pol))
		goto out_unlock;

	/* kill the intf files first */
	if (pol->dfl_cftypes)
		cgroup_rm_cftypes(pol->dfl_cftypes);
	if (pol->legacy_cftypes)
		cgroup_rm_cftypes(pol->legacy_cftypes);

	/* remove cpds and unregister */
	mutex_lock(&blkcg_pol_mutex);

	if (pol->cpd_free_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			if (blkcg->cpd[pol->plid]) {
				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
				blkcg->cpd[pol->plid] = NULL;
			}
		}
	}
	blkcg_policy[pol->plid] = NULL;

	mutex_unlock(&blkcg_pol_mutex);
out_unlock:
	mutex_unlock(&blkcg_pol_register_mutex);
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);

module_param(blkcg_debug_stats, bool, 0644);
MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");