1da177e4 1/*
1da177e4
LT
2 * CFQ, or complete fairness queueing, disk scheduler.
3 *
4 * Based on ideas from a previously unfinished I/O scheduler (round-robin
5 * per-process disk scheduling) and on ideas from Andrea Arcangeli.
6 *
0fe23479 7 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
1da177e4 8 */
1da177e4 9#include <linux/module.h>
5a0e3ad6 10#include <linux/slab.h>
e6017571 11#include <linux/sched/clock.h>
1cc9be68
AV
12#include <linux/blkdev.h>
13#include <linux/elevator.h>
9a7f38c4 14#include <linux/ktime.h>
1da177e4 15#include <linux/rbtree.h>
22e2c507 16#include <linux/ioprio.h>
7b679138 17#include <linux/blktrace_api.h>
eea8f41c 18#include <linux/blk-cgroup.h>
6e736be7 19#include "blk.h"
87760e5e 20#include "blk-wbt.h"
1da177e4
LT
21
22/*
23 * tunables
24 */
fe094d98 25/* max queue in one round of service */
abc3c744 26static const int cfq_quantum = 8;
9a7f38c4 27static const u64 cfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
fe094d98
JA
28/* maximum backwards seek, in KiB */
29static const int cfq_back_max = 16 * 1024;
30/* penalty of a backwards seek */
31static const int cfq_back_penalty = 2;
9a7f38c4
JM
32static const u64 cfq_slice_sync = NSEC_PER_SEC / 10;
33static u64 cfq_slice_async = NSEC_PER_SEC / 25;
64100099 34static const int cfq_slice_async_rq = 2;
9a7f38c4
JM
35static u64 cfq_slice_idle = NSEC_PER_SEC / 125;
36static u64 cfq_group_idle = NSEC_PER_SEC / 125;
37static const u64 cfq_target_latency = (u64)NSEC_PER_SEC * 3/10; /* 300 ms */
5db5d642 38static const int cfq_hist_divisor = 4;
22e2c507 39
d9e7620e 40/*
5be6b756 41 * offset from end of queue service tree for idle class
d9e7620e 42 */
9a7f38c4 43#define CFQ_IDLE_DELAY (NSEC_PER_SEC / 5)
5be6b756
HT
44/* offset from end of group service tree under time slice mode */
45#define CFQ_SLICE_MODE_GROUP_DELAY (NSEC_PER_SEC / 5)
46/* offset from end of group service tree under IOPS mode */
47#define CFQ_IOPS_MODE_GROUP_DELAY (HZ / 5)
d9e7620e
JA
48
49/*
50 * below this threshold, we consider thinktime immediate
51 */
9a7f38c4 52#define CFQ_MIN_TT (2 * NSEC_PER_SEC / HZ)
d9e7620e 53
22e2c507 54#define CFQ_SLICE_SCALE (5)
45333d5a 55#define CFQ_HW_QUEUE_MIN (5)
25bc6b07 56#define CFQ_SERVICE_SHIFT 12
22e2c507 57
3dde36dd 58#define CFQQ_SEEK_THR (sector_t)(8 * 100)
e9ce335d 59#define CFQQ_CLOSE_THR (sector_t)(8 * 1024)
41647e7a 60#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
3dde36dd 61#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
ae54abed 62
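/*
 * Rough illustration (an assumption based on how seek_history is used
 * elsewhere in this file): seek_history acts as a 32-bit sliding window
 * with one bit per recent request, set when that request needed a large
 * seek, so CFQQ_SEEKY() flags a queue once more than 32/8 = 4 of its
 * last 32 requests were seeky.
 */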
a612fddf
TH
63#define RQ_CIC(rq) icq_to_cic((rq)->elv.icq)
64#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elv.priv[0])
65#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elv.priv[1])
1da177e4 66
e18b890b 67static struct kmem_cache *cfq_pool;
1da177e4 68
22e2c507
JA
69#define CFQ_PRIO_LISTS IOPRIO_BE_NR
70#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
22e2c507
JA
71#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
72
206dc69b 73#define sample_valid(samples) ((samples) > 80)
1fa8f6d6 74#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
206dc69b 75
e48453c3 76/* blkio-related constants */
3ecca629
TH
77#define CFQ_WEIGHT_LEGACY_MIN 10
78#define CFQ_WEIGHT_LEGACY_DFL 500
79#define CFQ_WEIGHT_LEGACY_MAX 1000
e48453c3 80
c5869807 81struct cfq_ttime {
9a7f38c4 82 u64 last_end_request;
c5869807 83
9a7f38c4
JM
84 u64 ttime_total;
85 u64 ttime_mean;
c5869807 86 unsigned long ttime_samples;
c5869807
TH
87};
88
cc09e299
JA
89/*
90 * Most of our rbtree usage is for sorting with min extraction, so
91 * if we cache the leftmost node we don't have to walk down the tree
92 * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
93 * move this into the elevator for the rq sorting as well.
94 */
95struct cfq_rb_root {
09663c86 96 struct rb_root_cached rb;
aa6f6a3d 97 unsigned count;
1fa8f6d6 98 u64 min_vdisktime;
f5f2b6ce 99 struct cfq_ttime ttime;
cc09e299 100};
09663c86 101#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT_CACHED, \
9a7f38c4 102 .ttime = {.last_end_request = ktime_get_ns(),},}
cc09e299 103
6118b70b
JA
104/*
105 * Per process-grouping structure
106 */
107struct cfq_queue {
108 /* reference count */
30d7b944 109 int ref;
6118b70b
JA
110 /* various state flags, see below */
111 unsigned int flags;
112 /* parent cfq_data */
113 struct cfq_data *cfqd;
114 /* service_tree member */
115 struct rb_node rb_node;
116 /* service_tree key */
9a7f38c4 117 u64 rb_key;
6118b70b
JA
118 /* prio tree member */
119 struct rb_node p_node;
120 /* prio tree root we belong to, if any */
121 struct rb_root *p_root;
122 /* sorted list of pending requests */
123 struct rb_root sort_list;
124 /* if fifo isn't expired, next request to serve */
125 struct request *next_rq;
126 /* requests queued in sort_list */
127 int queued[2];
128 /* currently allocated requests */
129 int allocated[2];
130 /* fifo list of requests in sort_list */
131 struct list_head fifo;
132
dae739eb 133 /* time when queue got scheduled in to dispatch first request. */
9a7f38c4
JM
134 u64 dispatch_start;
135 u64 allocated_slice;
136 u64 slice_dispatch;
dae739eb 137 /* time when first request from queue completed and slice started. */
9a7f38c4
JM
138 u64 slice_start;
139 u64 slice_end;
93fdf147 140 s64 slice_resid;
6118b70b 141
65299a3b
CH
142 /* pending priority requests */
143 int prio_pending;
6118b70b
JA
144 /* number of requests that are on the dispatch list or inside driver */
145 int dispatched;
146
147 /* io prio of this group */
148 unsigned short ioprio, org_ioprio;
b8269db4 149 unsigned short ioprio_class, org_ioprio_class;
6118b70b 150
c4081ba5
RK
151 pid_t pid;
152
3dde36dd 153 u32 seek_history;
b2c18e1e
JM
154 sector_t last_request_pos;
155
aa6f6a3d 156 struct cfq_rb_root *service_tree;
df5fe3e8 157 struct cfq_queue *new_cfqq;
cdb16e8f 158 struct cfq_group *cfqg;
c4e7893e
VG
159 /* Number of sectors dispatched from queue in single dispatch round */
160 unsigned long nr_sectors;
6118b70b
JA
161};
162
c0324a02 163/*
718eee05 164 * First index in the service_trees.
c0324a02
CZ
165 * IDLE is handled separately, so it has negative index
166 */
3bf10fea 167enum wl_class_t {
c0324a02 168 BE_WORKLOAD = 0,
615f0259
VG
169 RT_WORKLOAD = 1,
170 IDLE_WORKLOAD = 2,
b4627321 171 CFQ_PRIO_NR,
c0324a02
CZ
172};
173
718eee05
CZ
174/*
175 * Second index in the service_trees.
176 */
177enum wl_type_t {
178 ASYNC_WORKLOAD = 0,
179 SYNC_NOIDLE_WORKLOAD = 1,
180 SYNC_WORKLOAD = 2
181};
182
155fead9
TH
183struct cfqg_stats {
184#ifdef CONFIG_CFQ_GROUP_IOSCHED
155fead9
TH
185 /* number of ios merged */
186 struct blkg_rwstat merged;
187 /* total time spent on device in ns, may not be accurate w/ queueing */
188 struct blkg_rwstat service_time;
189 /* total time spent waiting in scheduler queue in ns */
190 struct blkg_rwstat wait_time;
191 /* number of IOs queued up */
192 struct blkg_rwstat queued;
155fead9
TH
193 /* total disk time and nr sectors dispatched by this group */
194 struct blkg_stat time;
195#ifdef CONFIG_DEBUG_BLK_CGROUP
196 /* time not charged to this cgroup */
197 struct blkg_stat unaccounted_time;
198 /* sum of number of ios queued across all samples */
199 struct blkg_stat avg_queue_size_sum;
200 /* count of samples taken for average */
201 struct blkg_stat avg_queue_size_samples;
202 /* how many times this group has been removed from service tree */
203 struct blkg_stat dequeue;
204 /* total time spent waiting for it to be assigned a timeslice. */
205 struct blkg_stat group_wait_time;
3c798398 206 /* time spent idling for this blkcg_gq */
155fead9
TH
207 struct blkg_stat idle_time;
208 /* total time with empty current active q with other requests queued */
209 struct blkg_stat empty_time;
210 /* fields after this shouldn't be cleared on stat reset */
211 uint64_t start_group_wait_time;
212 uint64_t start_idle_time;
213 uint64_t start_empty_time;
214 uint16_t flags;
215#endif /* CONFIG_DEBUG_BLK_CGROUP */
216#endif /* CONFIG_CFQ_GROUP_IOSCHED */
217};
218
e48453c3
AA
219/* Per-cgroup data */
220struct cfq_group_data {
221 /* must be the first member */
81437648 222 struct blkcg_policy_data cpd;
e48453c3
AA
223
224 unsigned int weight;
225 unsigned int leaf_weight;
226};
227
cdb16e8f
VG
228/* This is per cgroup per device grouping structure */
229struct cfq_group {
f95a04af
TH
230 /* must be the first member */
231 struct blkg_policy_data pd;
232
1fa8f6d6
VG
233 /* group service_tree member */
234 struct rb_node rb_node;
235
236 /* group service_tree key */
237 u64 vdisktime;
e71357e1 238
7918ffb5
TH
239 /*
240 * The number of active cfqgs and sum of their weights under this
241 * cfqg. This covers this cfqg's leaf_weight and all children's
242 * weights, but does not cover weights of further descendants.
243 *
244 * If a cfqg is on the service tree, it's active. An active cfqg
245 * also activates its parent and contributes to the children_weight
246 * of the parent.
247 */
248 int nr_active;
249 unsigned int children_weight;
250
1d3650f7
TH
251 /*
252 * vfraction is the fraction of vdisktime that the tasks in this
253 * cfqg are entitled to. This is determined by compounding the
254 * ratios walking up from this cfqg to the root.
255 *
256 * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
257 * vfractions on a service tree is approximately 1. The sum may
258 * deviate a bit due to rounding errors and fluctuations caused by
259 * cfqgs entering and leaving the service tree.
260 */
261 unsigned int vfraction;
262
e71357e1
TH
263 /*
264 * There are two weights - (internal) weight is the weight of this
265 * cfqg against the sibling cfqgs. leaf_weight is the weight of
266 * this cfqg against the child cfqgs. For the root cfqg, both
267 * weights are kept in sync for backward compatibility.
268 */
25bc6b07 269 unsigned int weight;
8184f93e 270 unsigned int new_weight;
3381cb8d 271 unsigned int dev_weight;
1fa8f6d6 272
e71357e1
TH
273 unsigned int leaf_weight;
274 unsigned int new_leaf_weight;
275 unsigned int dev_leaf_weight;
276
1fa8f6d6
VG
277 /* number of cfqq currently on this group */
278 int nr_cfqq;
279
cdb16e8f 280 /*
4495a7d4 281 * Per group busy queues average. Useful for workload slice calc. We
b4627321
VG
282 * create the array for each prio class but at run time it is used
283 * only for RT and BE class and slot for IDLE class remains unused.
284 * This is primarily done to avoid confusion and a gcc warning.
285 */
286 unsigned int busy_queues_avg[CFQ_PRIO_NR];
287 /*
288 * rr lists of queues with requests. We maintain service trees for
289 * RT and BE classes. These trees are subdivided in subclasses
290 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
291 * class there is no subclassification and all the cfq queues go on
292 * a single tree service_tree_idle.
cdb16e8f
VG
293 * Counts are embedded in the cfq_rb_root
294 */
295 struct cfq_rb_root service_trees[2][3];
296 struct cfq_rb_root service_tree_idle;
dae739eb 297
9a7f38c4 298 u64 saved_wl_slice;
4d2ceea4
VG
299 enum wl_type_t saved_wl_type;
300 enum wl_class_t saved_wl_class;
4eef3049 301
80bdf0c7
VG
302 /* number of requests that are on the dispatch list or inside driver */
303 int dispatched;
7700fc4f 304 struct cfq_ttime ttime;
0b39920b 305 struct cfqg_stats stats; /* stats for this cfqg */
60a83707
TH
306
307 /* async queue for each priority case */
308 struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
309 struct cfq_queue *async_idle_cfqq;
310
cdb16e8f 311};
718eee05 312
c5869807
TH
313struct cfq_io_cq {
314 struct io_cq icq; /* must be the first member */
315 struct cfq_queue *cfqq[2];
316 struct cfq_ttime ttime;
598971bf
TH
317 int ioprio; /* the current ioprio */
318#ifdef CONFIG_CFQ_GROUP_IOSCHED
f4da8072 319 uint64_t blkcg_serial_nr; /* the current blkcg serial */
598971bf 320#endif
c5869807
TH
321};
322
22e2c507
JA
323/*
324 * Per block device queue structure
325 */
1da177e4 326struct cfq_data {
165125e1 327 struct request_queue *queue;
1fa8f6d6
VG
328 /* Root service tree for cfq_groups */
329 struct cfq_rb_root grp_service_tree;
f51b802c 330 struct cfq_group *root_group;
22e2c507 331
c0324a02
CZ
332 /*
333 * The priority currently being served
22e2c507 334 */
4d2ceea4
VG
335 enum wl_class_t serving_wl_class;
336 enum wl_type_t serving_wl_type;
9a7f38c4 337 u64 workload_expires;
cdb16e8f 338 struct cfq_group *serving_group;
a36e71f9
JA
339
340 /*
341 * Each priority tree is sorted by next_request position. These
342 * trees are used when determining if two or more queues are
343 * interleaving requests (see cfq_close_cooperator).
344 */
345 struct rb_root prio_trees[CFQ_PRIO_LISTS];
346
22e2c507 347 unsigned int busy_queues;
ef8a41df 348 unsigned int busy_sync_queues;
22e2c507 349
53c583d2
CZ
350 int rq_in_driver;
351 int rq_in_flight[2];
45333d5a
AC
352
353 /*
354 * queue-depth detection
355 */
356 int rq_queued;
25776e35 357 int hw_tag;
e459dd08
CZ
358 /*
359 * hw_tag can be
360 * -1 => indeterminate (cfq will behave as if NCQ is present, to allow better detection)
361 * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
362 * 0 => no NCQ
363 */
364 int hw_tag_est_depth;
365 unsigned int hw_tag_samples;
1da177e4 366
22e2c507
JA
367 /*
368 * idle window management
369 */
91148325 370 struct hrtimer idle_slice_timer;
23e018a1 371 struct work_struct unplug_work;
1da177e4 372
22e2c507 373 struct cfq_queue *active_queue;
c5869807 374 struct cfq_io_cq *active_cic;
22e2c507 375
6d048f53 376 sector_t last_position;
1da177e4 377
1da177e4
LT
378 /*
379 * tunables, see top of file
380 */
381 unsigned int cfq_quantum;
1da177e4
LT
382 unsigned int cfq_back_penalty;
383 unsigned int cfq_back_max;
22e2c507 384 unsigned int cfq_slice_async_rq;
963b72fc 385 unsigned int cfq_latency;
9a7f38c4
JM
386 u64 cfq_fifo_expire[2];
387 u64 cfq_slice[2];
388 u64 cfq_slice_idle;
389 u64 cfq_group_idle;
390 u64 cfq_target_latency;
d9ff4187 391
6118b70b
JA
392 /*
393 * Fallback dummy cfqq for extreme OOM conditions
394 */
395 struct cfq_queue oom_cfqq;
365722bb 396
9a7f38c4 397 u64 last_delayed_sync;
1da177e4
LT
398};
399
25fb5169 400static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
60a83707 401static void cfq_put_queue(struct cfq_queue *cfqq);
25fb5169 402
34b98d03 403static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
3bf10fea 404 enum wl_class_t class,
65b32a57 405 enum wl_type_t type)
c0324a02 406{
1fa8f6d6
VG
407 if (!cfqg)
408 return NULL;
409
3bf10fea 410 if (class == IDLE_WORKLOAD)
cdb16e8f 411 return &cfqg->service_tree_idle;
c0324a02 412
3bf10fea 413 return &cfqg->service_trees[class][type];
c0324a02
CZ
414}
415
3b18152c 416enum cfqq_state_flags {
b0b8d749
JA
417 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */
418 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */
b029195d 419 CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */
b0b8d749 420 CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
b0b8d749
JA
421 CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
422 CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */
423 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */
44f7c160 424 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */
91fac317 425 CFQ_CFQQ_FLAG_sync, /* synchronous queue */
b3b6d040 426 CFQ_CFQQ_FLAG_coop, /* cfqq is shared */
ae54abed 427 CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be split */
76280aff 428 CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */
f75edf2d 429 CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */
3b18152c
JA
430};
431
432#define CFQ_CFQQ_FNS(name) \
433static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \
434{ \
fe094d98 435 (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \
3b18152c
JA
436} \
437static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \
438{ \
fe094d98 439 (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \
3b18152c
JA
440} \
441static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \
442{ \
fe094d98 443 return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \
3b18152c
JA
444}
445
446CFQ_CFQQ_FNS(on_rr);
447CFQ_CFQQ_FNS(wait_request);
b029195d 448CFQ_CFQQ_FNS(must_dispatch);
3b18152c 449CFQ_CFQQ_FNS(must_alloc_slice);
3b18152c
JA
450CFQ_CFQQ_FNS(fifo_expire);
451CFQ_CFQQ_FNS(idle_window);
452CFQ_CFQQ_FNS(prio_changed);
44f7c160 453CFQ_CFQQ_FNS(slice_new);
91fac317 454CFQ_CFQQ_FNS(sync);
a36e71f9 455CFQ_CFQQ_FNS(coop);
ae54abed 456CFQ_CFQQ_FNS(split_coop);
76280aff 457CFQ_CFQQ_FNS(deep);
f75edf2d 458CFQ_CFQQ_FNS(wait_busy);
3b18152c
JA
459#undef CFQ_CFQQ_FNS
460
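/*
 * Roughly speaking, each CFQ_CFQQ_FNS(name) invocation above expands to a
 * mark/clear/test helper triplet for one flag in enum cfqq_state_flags.
 * CFQ_CFQQ_FNS(on_rr), for example, produces approximately:
 *
 *	static inline void cfq_mark_cfqq_on_rr(struct cfq_queue *cfqq)
 *	{
 *		cfqq->flags |= (1 << CFQ_CFQQ_FLAG_on_rr);
 *	}
 *	static inline void cfq_clear_cfqq_on_rr(struct cfq_queue *cfqq)
 *	{
 *		cfqq->flags &= ~(1 << CFQ_CFQQ_FLAG_on_rr);
 *	}
 *	static inline int cfq_cfqq_on_rr(const struct cfq_queue *cfqq)
 *	{
 *		return (cfqq->flags & (1 << CFQ_CFQQ_FLAG_on_rr)) != 0;
 *	}
 */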
629ed0b1 461#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
2ce4d50f 462
155fead9
TH
463/* cfqg stats flags */
464enum cfqg_stats_flags {
465 CFQG_stats_waiting = 0,
466 CFQG_stats_idling,
467 CFQG_stats_empty,
629ed0b1
TH
468};
469
155fead9
TH
470#define CFQG_FLAG_FNS(name) \
471static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats) \
629ed0b1 472{ \
155fead9 473 stats->flags |= (1 << CFQG_stats_##name); \
629ed0b1 474} \
155fead9 475static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats) \
629ed0b1 476{ \
155fead9 477 stats->flags &= ~(1 << CFQG_stats_##name); \
629ed0b1 478} \
155fead9 479static inline int cfqg_stats_##name(struct cfqg_stats *stats) \
629ed0b1 480{ \
155fead9 481 return (stats->flags & (1 << CFQG_stats_##name)) != 0; \
629ed0b1
TH
482} \
483
155fead9
TH
484CFQG_FLAG_FNS(waiting)
485CFQG_FLAG_FNS(idling)
486CFQG_FLAG_FNS(empty)
487#undef CFQG_FLAG_FNS
629ed0b1
TH
488
489/* This should be called with the queue_lock held. */
155fead9 490static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
629ed0b1
TH
491{
492 unsigned long long now;
493
155fead9 494 if (!cfqg_stats_waiting(stats))
629ed0b1
TH
495 return;
496
497 now = sched_clock();
498 if (time_after64(now, stats->start_group_wait_time))
499 blkg_stat_add(&stats->group_wait_time,
500 now - stats->start_group_wait_time);
155fead9 501 cfqg_stats_clear_waiting(stats);
629ed0b1
TH
502}
503
504/* This should be called with the queue_lock held. */
155fead9
TH
505static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
506 struct cfq_group *curr_cfqg)
629ed0b1 507{
155fead9 508 struct cfqg_stats *stats = &cfqg->stats;
629ed0b1 509
155fead9 510 if (cfqg_stats_waiting(stats))
629ed0b1 511 return;
155fead9 512 if (cfqg == curr_cfqg)
629ed0b1 513 return;
155fead9
TH
514 stats->start_group_wait_time = sched_clock();
515 cfqg_stats_mark_waiting(stats);
629ed0b1
TH
516}
517
518/* This should be called with the queue_lock held. */
155fead9 519static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
629ed0b1
TH
520{
521 unsigned long long now;
522
155fead9 523 if (!cfqg_stats_empty(stats))
629ed0b1
TH
524 return;
525
526 now = sched_clock();
527 if (time_after64(now, stats->start_empty_time))
528 blkg_stat_add(&stats->empty_time,
529 now - stats->start_empty_time);
155fead9 530 cfqg_stats_clear_empty(stats);
629ed0b1
TH
531}
532
155fead9 533static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
629ed0b1 534{
155fead9 535 blkg_stat_add(&cfqg->stats.dequeue, 1);
629ed0b1
TH
536}
537
155fead9 538static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
629ed0b1 539{
155fead9 540 struct cfqg_stats *stats = &cfqg->stats;
629ed0b1 541
4d5e80a7 542 if (blkg_rwstat_total(&stats->queued))
629ed0b1
TH
543 return;
544
545 /*
546 * group is already marked empty. This can happen if cfqq got new
547 * request in parent group and moved to this group while being added
548 * to service tree. Just ignore the event and move on.
549 */
155fead9 550 if (cfqg_stats_empty(stats))
629ed0b1
TH
551 return;
552
553 stats->start_empty_time = sched_clock();
155fead9 554 cfqg_stats_mark_empty(stats);
629ed0b1
TH
555}
556
155fead9 557static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
629ed0b1 558{
155fead9 559 struct cfqg_stats *stats = &cfqg->stats;
629ed0b1 560
155fead9 561 if (cfqg_stats_idling(stats)) {
629ed0b1
TH
562 unsigned long long now = sched_clock();
563
564 if (time_after64(now, stats->start_idle_time))
565 blkg_stat_add(&stats->idle_time,
566 now - stats->start_idle_time);
155fead9 567 cfqg_stats_clear_idling(stats);
629ed0b1
TH
568 }
569}
570
155fead9 571static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
629ed0b1 572{
155fead9 573 struct cfqg_stats *stats = &cfqg->stats;
629ed0b1 574
155fead9 575 BUG_ON(cfqg_stats_idling(stats));
629ed0b1
TH
576
577 stats->start_idle_time = sched_clock();
155fead9 578 cfqg_stats_mark_idling(stats);
629ed0b1
TH
579}
580
155fead9 581static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
629ed0b1 582{
155fead9 583 struct cfqg_stats *stats = &cfqg->stats;
629ed0b1
TH
584
585 blkg_stat_add(&stats->avg_queue_size_sum,
4d5e80a7 586 blkg_rwstat_total(&stats->queued));
629ed0b1 587 blkg_stat_add(&stats->avg_queue_size_samples, 1);
155fead9 588 cfqg_stats_update_group_wait_time(stats);
629ed0b1
TH
589}
590
591#else /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
592
f48ec1d7
TH
593static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
594static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
595static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
596static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
597static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
598static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
599static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
629ed0b1
TH
600
601#endif /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
602
603#ifdef CONFIG_CFQ_GROUP_IOSCHED
2ce4d50f 604
4ceab71b
JA
605static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
606{
607 return pd ? container_of(pd, struct cfq_group, pd) : NULL;
608}
609
610static struct cfq_group_data
611*cpd_to_cfqgd(struct blkcg_policy_data *cpd)
612{
81437648 613 return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL;
4ceab71b
JA
614}
615
616static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
617{
618 return pd_to_blkg(&cfqg->pd);
619}
620
ffea73fc
TH
621static struct blkcg_policy blkcg_policy_cfq;
622
623static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
624{
625 return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
626}
627
e48453c3
AA
628static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg)
629{
630 return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq));
631}
632
d02f7aa8 633static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
7918ffb5 634{
d02f7aa8 635 struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
7918ffb5 636
d02f7aa8 637 return pblkg ? blkg_to_cfqg(pblkg) : NULL;
7918ffb5
TH
638}
639
3984aa55
JK
640static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
641 struct cfq_group *ancestor)
642{
643 return cgroup_is_descendant(cfqg_to_blkg(cfqg)->blkcg->css.cgroup,
644 cfqg_to_blkg(ancestor)->blkcg->css.cgroup);
645}
646
eb7d8c07
TH
647static inline void cfqg_get(struct cfq_group *cfqg)
648{
649 return blkg_get(cfqg_to_blkg(cfqg));
650}
651
652static inline void cfqg_put(struct cfq_group *cfqg)
653{
654 return blkg_put(cfqg_to_blkg(cfqg));
655}
656
54e7ed12 657#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) do { \
35fe6d76
SL
658 blk_add_cgroup_trace_msg((cfqd)->queue, \
659 cfqg_to_blkg((cfqq)->cfqg)->blkcg, \
660 "cfq%d%c%c " fmt, (cfqq)->pid, \
b226e5c4
VG
661 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
662 cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
35fe6d76 663 ##args); \
54e7ed12
TH
664} while (0)
665
666#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do { \
35fe6d76
SL
667 blk_add_cgroup_trace_msg((cfqd)->queue, \
668 cfqg_to_blkg(cfqg)->blkcg, fmt, ##args); \
54e7ed12 669} while (0)
2868ef7b 670
155fead9 671static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
ef295ecf
CH
672 struct cfq_group *curr_cfqg,
673 unsigned int op)
2ce4d50f 674{
ef295ecf 675 blkg_rwstat_add(&cfqg->stats.queued, op, 1);
155fead9
TH
676 cfqg_stats_end_empty_time(&cfqg->stats);
677 cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
2ce4d50f
TH
678}
679
155fead9 680static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
9a7f38c4 681 uint64_t time, unsigned long unaccounted_time)
2ce4d50f 682{
155fead9 683 blkg_stat_add(&cfqg->stats.time, time);
629ed0b1 684#ifdef CONFIG_DEBUG_BLK_CGROUP
155fead9 685 blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
629ed0b1 686#endif
2ce4d50f
TH
687}
688
ef295ecf
CH
689static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
690 unsigned int op)
2ce4d50f 691{
ef295ecf 692 blkg_rwstat_add(&cfqg->stats.queued, op, -1);
2ce4d50f
TH
693}
694
ef295ecf
CH
695static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
696 unsigned int op)
2ce4d50f 697{
ef295ecf 698 blkg_rwstat_add(&cfqg->stats.merged, op, 1);
2ce4d50f
TH
699}
700
155fead9 701static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
ef295ecf
CH
702 uint64_t start_time, uint64_t io_start_time,
703 unsigned int op)
2ce4d50f 704{
155fead9 705 struct cfqg_stats *stats = &cfqg->stats;
629ed0b1 706 unsigned long long now = sched_clock();
629ed0b1
TH
707
708 if (time_after64(now, io_start_time))
ef295ecf 709 blkg_rwstat_add(&stats->service_time, op, now - io_start_time);
629ed0b1 710 if (time_after64(io_start_time, start_time))
ef295ecf 711 blkg_rwstat_add(&stats->wait_time, op,
629ed0b1 712 io_start_time - start_time);
2ce4d50f
TH
713}
714
689665af
TH
715/* @stats = 0 */
716static void cfqg_stats_reset(struct cfqg_stats *stats)
155fead9 717{
155fead9 718 /* queued stats shouldn't be cleared */
155fead9
TH
719 blkg_rwstat_reset(&stats->merged);
720 blkg_rwstat_reset(&stats->service_time);
721 blkg_rwstat_reset(&stats->wait_time);
722 blkg_stat_reset(&stats->time);
723#ifdef CONFIG_DEBUG_BLK_CGROUP
724 blkg_stat_reset(&stats->unaccounted_time);
725 blkg_stat_reset(&stats->avg_queue_size_sum);
726 blkg_stat_reset(&stats->avg_queue_size_samples);
727 blkg_stat_reset(&stats->dequeue);
728 blkg_stat_reset(&stats->group_wait_time);
729 blkg_stat_reset(&stats->idle_time);
730 blkg_stat_reset(&stats->empty_time);
731#endif
732}
733
0b39920b 734/* @to += @from */
e6269c44 735static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from)
0b39920b
TH
736{
737 /* queued stats shouldn't be cleared */
e6269c44
TH
738 blkg_rwstat_add_aux(&to->merged, &from->merged);
739 blkg_rwstat_add_aux(&to->service_time, &from->service_time);
740 blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
741 blkg_stat_add_aux(&to->time, &from->time);
0b39920b 742#ifdef CONFIG_DEBUG_BLK_CGROUP
e6269c44
TH
743 blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
744 blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
745 blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
746 blkg_stat_add_aux(&to->dequeue, &from->dequeue);
747 blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
748 blkg_stat_add_aux(&to->idle_time, &from->idle_time);
749 blkg_stat_add_aux(&to->empty_time, &from->empty_time);
0b39920b
TH
750#endif
751}
752
753/*
e6269c44 754 * Transfer @cfqg's stats to its parent's aux counts so that the ancestors'
0b39920b
TH
755 * recursive stats can still account for the amount used by this cfqg after
756 * it's gone.
757 */
758static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
759{
760 struct cfq_group *parent = cfqg_parent(cfqg);
761
762 lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock);
763
764 if (unlikely(!parent))
765 return;
766
e6269c44 767 cfqg_stats_add_aux(&parent->stats, &cfqg->stats);
0b39920b 768 cfqg_stats_reset(&cfqg->stats);
0b39920b
TH
769}
770
eb7d8c07
TH
771#else /* CONFIG_CFQ_GROUP_IOSCHED */
772
d02f7aa8 773static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; }
3984aa55
JK
774static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
775 struct cfq_group *ancestor)
776{
777 return true;
778}
eb7d8c07
TH
779static inline void cfqg_get(struct cfq_group *cfqg) { }
780static inline void cfqg_put(struct cfq_group *cfqg) { }
781
7b679138 782#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
b226e5c4
VG
783 blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \
784 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
785 cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
786 ##args)
4495a7d4 787#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0)
eb7d8c07 788
155fead9 789static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
ef295ecf 790 struct cfq_group *curr_cfqg, unsigned int op) { }
155fead9 791static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
9a7f38c4 792 uint64_t time, unsigned long unaccounted_time) { }
ef295ecf
CH
793static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
794 unsigned int op) { }
795static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
796 unsigned int op) { }
155fead9 797static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
ef295ecf
CH
798 uint64_t start_time, uint64_t io_start_time,
799 unsigned int op) { }
2ce4d50f 800
eb7d8c07
TH
801#endif /* CONFIG_CFQ_GROUP_IOSCHED */
802
7b679138
JA
803#define cfq_log(cfqd, fmt, args...) \
804 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
805
615f0259
VG
806/* Traverses through cfq group service trees */
807#define for_each_cfqg_st(cfqg, i, j, st) \
808 for (i = 0; i <= IDLE_WORKLOAD; i++) \
809 for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
810 : &cfqg->service_tree_idle; \
811 (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
812 (i == IDLE_WORKLOAD && j == 0); \
813 j++, st = i < IDLE_WORKLOAD ? \
814 &cfqg->service_trees[i][j]: NULL) \
815
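/*
 * As a rough illustration, for_each_cfqg_st() above walks all seven
 * per-group service trees in order: the three BE_WORKLOAD trees
 * (ASYNC, SYNC_NOIDLE, SYNC), then the three RT_WORKLOAD trees, and
 * finally the single service_tree_idle used for the IDLE class.
 */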
f5f2b6ce
SL
816static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
817 struct cfq_ttime *ttime, bool group_idle)
818{
9a7f38c4 819 u64 slice;
f5f2b6ce
SL
820 if (!sample_valid(ttime->ttime_samples))
821 return false;
822 if (group_idle)
823 slice = cfqd->cfq_group_idle;
824 else
825 slice = cfqd->cfq_slice_idle;
826 return ttime->ttime_mean > slice;
827}
615f0259 828
02b35081
VG
829static inline bool iops_mode(struct cfq_data *cfqd)
830{
831 /*
832 * If we are not idling on queues and it is an NCQ drive, requests
833 * are executed in parallel and measuring time is not possible in
834 * most cases unless we drive shallow queue depths, which in turn
835 * becomes a performance bottleneck. In such cases, switch to
836 * providing fairness in terms of number of IOs.
837 */
838 if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
839 return true;
840 else
841 return false;
842}
843
3bf10fea 844static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq)
c0324a02
CZ
845{
846 if (cfq_class_idle(cfqq))
847 return IDLE_WORKLOAD;
848 if (cfq_class_rt(cfqq))
849 return RT_WORKLOAD;
850 return BE_WORKLOAD;
851}
852
718eee05
CZ
853
854static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
855{
856 if (!cfq_cfqq_sync(cfqq))
857 return ASYNC_WORKLOAD;
858 if (!cfq_cfqq_idle_window(cfqq))
859 return SYNC_NOIDLE_WORKLOAD;
860 return SYNC_WORKLOAD;
861}
862
3bf10fea 863static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
58ff82f3
VG
864 struct cfq_data *cfqd,
865 struct cfq_group *cfqg)
c0324a02 866{
3bf10fea 867 if (wl_class == IDLE_WORKLOAD)
cdb16e8f 868 return cfqg->service_tree_idle.count;
c0324a02 869
34b98d03
VG
870 return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count +
871 cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count +
872 cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
c0324a02
CZ
873}
874
f26bd1f0
VG
875static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
876 struct cfq_group *cfqg)
877{
34b98d03
VG
878 return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count +
879 cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
f26bd1f0
VG
880}
881
165125e1 882static void cfq_dispatch_insert(struct request_queue *, struct request *);
4f85cb96 883static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
2da8de0b 884 struct cfq_io_cq *cic, struct bio *bio);
91fac317 885
c5869807
TH
886static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
887{
888 /* cic->icq is the first member, %NULL will convert to %NULL */
889 return container_of(icq, struct cfq_io_cq, icq);
890}
891
47fdd4ca
TH
892static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
893 struct io_context *ioc)
894{
895 if (ioc)
896 return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
897 return NULL;
898}
899
c5869807 900static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
91fac317 901{
a6151c3a 902 return cic->cfqq[is_sync];
91fac317
VT
903}
904
c5869807
TH
905static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
906 bool is_sync)
91fac317 907{
a6151c3a 908 cic->cfqq[is_sync] = cfqq;
91fac317
VT
909}
910
c5869807 911static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
bca4b914 912{
c5869807 913 return cic->icq.q->elevator->elevator_data;
bca4b914
KK
914}
915
99f95e52
AM
916/*
917 * scheduler run of queue, if there are requests pending and no one in the
918 * driver that will restart queueing
919 */
23e018a1 920static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
99f95e52 921{
7b679138
JA
922 if (cfqd->busy_queues) {
923 cfq_log(cfqd, "schedule dispatch");
59c3d45e 924 kblockd_schedule_work(&cfqd->unplug_work);
7b679138 925 }
99f95e52
AM
926}
927
44f7c160
JA
928/*
929 * Scale schedule slice based on io priority. Use the sync time slice only
930 * if a queue is marked sync and has sync io queued. A sync queue with async
931 * io only, should not get full sync slice length.
932 */
9a7f38c4 933static inline u64 cfq_prio_slice(struct cfq_data *cfqd, bool sync,
d9e7620e 934 unsigned short prio)
44f7c160 935{
9a7f38c4
JM
936 u64 base_slice = cfqd->cfq_slice[sync];
937 u64 slice = div_u64(base_slice, CFQ_SLICE_SCALE);
44f7c160 938
d9e7620e
JA
939 WARN_ON(prio >= IOPRIO_BE_NR);
940
9a7f38c4 941 return base_slice + (slice * (4 - prio));
d9e7620e 942}
44f7c160 943
9a7f38c4 944static inline u64
d9e7620e
JA
945cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
946{
947 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
44f7c160
JA
948}
949
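/*
 * Rough worked example, assuming the default tunables above
 * (cfq_slice_sync = 100ms, CFQ_SLICE_SCALE = 5): for a sync queue the
 * per-priority step is 100ms / 5 = 20ms, so
 *
 *	prio 0 (highest) -> 100ms + 20ms * (4 - 0) = 180ms
 *	prio 4 (default) -> 100ms + 20ms * (4 - 4) = 100ms
 *	prio 7 (lowest)  -> 100ms + 20ms * (4 - 7) =  40ms
 *
 * i.e. every ioprio level above or below the default shifts the slice
 * by one fifth of the base slice.
 */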
1d3650f7
TH
950/**
951 * cfqg_scale_charge - scale disk time charge according to cfqg weight
952 * @charge: disk time being charged
953 * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
954 *
955 * Scale @charge according to @vfraction, which is in range (0, 1]. The
956 * scaling is inversely proportional.
957 *
958 * scaled = charge / vfraction
959 *
960 * The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
961 */
9a7f38c4 962static inline u64 cfqg_scale_charge(u64 charge,
1d3650f7 963 unsigned int vfraction)
25bc6b07 964{
1d3650f7 965 u64 c = charge << CFQ_SERVICE_SHIFT; /* make it fixed point */
25bc6b07 966
1d3650f7
TH
967 /* charge / vfraction */
968 c <<= CFQ_SERVICE_SHIFT;
9a7f38c4 969 return div_u64(c, vfraction);
25bc6b07
VG
970}
971
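/*
 * Rough worked example for cfqg_scale_charge(): with CFQ_SERVICE_SHIFT
 * of 12, a vfraction of 1 << 12 means "entitled to the whole device",
 * so the charge comes back unchanged, merely converted to fixed point:
 *
 *	(charge << 24) / (1 << 12) == charge << 12
 *
 * A group entitled to half of the device (vfraction = 1 << 11) is
 * charged twice as much vdisktime for the same amount of disk time:
 *
 *	(charge << 24) / (1 << 11) == 2 * (charge << 12)
 */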
972static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
973{
974 s64 delta = (s64)(vdisktime - min_vdisktime);
975 if (delta > 0)
976 min_vdisktime = vdisktime;
977
978 return min_vdisktime;
979}
980
25bc6b07
VG
981static void update_min_vdisktime(struct cfq_rb_root *st)
982{
09663c86
DB
983 if (!RB_EMPTY_ROOT(&st->rb.rb_root)) {
984 struct cfq_group *cfqg = rb_entry_cfqg(st->rb.rb_leftmost);
25bc6b07 985
a6032710
GJ
986 st->min_vdisktime = max_vdisktime(st->min_vdisktime,
987 cfqg->vdisktime);
25bc6b07 988 }
25bc6b07
VG
989}
990
5db5d642
CZ
991/*
992 * get averaged number of queues of RT/BE priority.
993 * The average is updated with a formula that gives more weight to the higher
994 * number, so it follows sudden increases quickly and decreases slowly.
995 */
996
58ff82f3
VG
997static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
998 struct cfq_group *cfqg, bool rt)
5869619c 999{
5db5d642
CZ
1000 unsigned min_q, max_q;
1001 unsigned mult = cfq_hist_divisor - 1;
1002 unsigned round = cfq_hist_divisor / 2;
58ff82f3 1003 unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
5db5d642 1004
58ff82f3
VG
1005 min_q = min(cfqg->busy_queues_avg[rt], busy);
1006 max_q = max(cfqg->busy_queues_avg[rt], busy);
1007 cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
5db5d642 1008 cfq_hist_divisor;
58ff82f3
VG
1009 return cfqg->busy_queues_avg[rt];
1010}
1011
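/*
 * Rough worked example with the default cfq_hist_divisor of 4
 * (mult = 3, round = 2): if the stored average is 1 and the group
 * suddenly has 5 busy queues, the new average is
 * (3 * 5 + 1 + 2) / 4 = 4, so spikes are picked up almost at once.
 * If the average is 4 and the busy count drops to 1, the new average
 * is (3 * 4 + 1 + 2) / 4 = 3, so the estimate only decays one step at
 * a time.
 */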
9a7f38c4 1012static inline u64
58ff82f3
VG
1013cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
1014{
41cad6ab 1015 return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT;
5db5d642
CZ
1016}
1017
9a7f38c4 1018static inline u64
ba5bd520 1019cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
44f7c160 1020{
9a7f38c4 1021 u64 slice = cfq_prio_to_slice(cfqd, cfqq);
5db5d642 1022 if (cfqd->cfq_latency) {
58ff82f3
VG
1023 /*
1024 * interested queues (we consider only the ones with the same
1025 * priority class in the cfq group)
1026 */
1027 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
1028 cfq_class_rt(cfqq));
9a7f38c4
JM
1029 u64 sync_slice = cfqd->cfq_slice[1];
1030 u64 expect_latency = sync_slice * iq;
1031 u64 group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
58ff82f3
VG
1032
1033 if (expect_latency > group_slice) {
9a7f38c4
JM
1034 u64 base_low_slice = 2 * cfqd->cfq_slice_idle;
1035 u64 low_slice;
1036
5db5d642
CZ
1037 /* scale low_slice according to IO priority
1038 * and sync vs async */
9a7f38c4
JM
1039 low_slice = div64_u64(base_low_slice*slice, sync_slice);
1040 low_slice = min(slice, low_slice);
5db5d642
CZ
1041 /* the adapted slice value is scaled to fit all iqs
1042 * into the target latency */
9a7f38c4
JM
1043 slice = div64_u64(slice*group_slice, expect_latency);
1044 slice = max(slice, low_slice);
5db5d642
CZ
1045 }
1046 }
c553f8e3
SL
1047 return slice;
1048}
1049
1050static inline void
1051cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1052{
9a7f38c4
JM
1053 u64 slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
1054 u64 now = ktime_get_ns();
c553f8e3 1055
9a7f38c4
JM
1056 cfqq->slice_start = now;
1057 cfqq->slice_end = now + slice;
f75edf2d 1058 cfqq->allocated_slice = slice;
9a7f38c4 1059 cfq_log_cfqq(cfqd, cfqq, "set_slice=%llu", cfqq->slice_end - now);
44f7c160
JA
1060}
1061
1062/*
1063 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
1064 * isn't valid until the first request from the dispatch is activated
1065 * and the slice time set.
1066 */
a6151c3a 1067static inline bool cfq_slice_used(struct cfq_queue *cfqq)
44f7c160
JA
1068{
1069 if (cfq_cfqq_slice_new(cfqq))
c1e44756 1070 return false;
9a7f38c4 1071 if (ktime_get_ns() < cfqq->slice_end)
c1e44756 1072 return false;
44f7c160 1073
c1e44756 1074 return true;
44f7c160
JA
1075}
1076
1da177e4 1077/*
5e705374 1078 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
1da177e4 1079 * We choose the request that is closest to the head right now. Distance
e8a99053 1080 * behind the head is penalized and only allowed to a certain extent.
1da177e4 1081 */
5e705374 1082static struct request *
cf7c25cf 1083cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
1da177e4 1084{
cf7c25cf 1085 sector_t s1, s2, d1 = 0, d2 = 0;
1da177e4 1086 unsigned long back_max;
e8a99053
AM
1087#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */
1088#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */
1089 unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1da177e4 1090
5e705374
JA
1091 if (rq1 == NULL || rq1 == rq2)
1092 return rq2;
1093 if (rq2 == NULL)
1094 return rq1;
9c2c38a1 1095
229836bd
NK
1096 if (rq_is_sync(rq1) != rq_is_sync(rq2))
1097 return rq_is_sync(rq1) ? rq1 : rq2;
1098
65299a3b
CH
1099 if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)
1100 return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;
b53d1ed7 1101
83096ebf
TH
1102 s1 = blk_rq_pos(rq1);
1103 s2 = blk_rq_pos(rq2);
1da177e4 1104
1da177e4
LT
1105 /*
1106 * by definition, 1KiB is 2 sectors
1107 */
1108 back_max = cfqd->cfq_back_max * 2;
1109
1110 /*
1111 * Strict one way elevator _except_ in the case where we allow
1112 * short backward seeks which are biased as twice the cost of a
1113 * similar forward seek.
1114 */
1115 if (s1 >= last)
1116 d1 = s1 - last;
1117 else if (s1 + back_max >= last)
1118 d1 = (last - s1) * cfqd->cfq_back_penalty;
1119 else
e8a99053 1120 wrap |= CFQ_RQ1_WRAP;
1da177e4
LT
1121
1122 if (s2 >= last)
1123 d2 = s2 - last;
1124 else if (s2 + back_max >= last)
1125 d2 = (last - s2) * cfqd->cfq_back_penalty;
1126 else
e8a99053 1127 wrap |= CFQ_RQ2_WRAP;
1da177e4
LT
1128
1129 /* Found required data */
e8a99053
AM
1130
1131 /*
1132 * By doing switch() on the bit mask "wrap" we avoid having to
1133 * check two variables for all permutations: --> faster!
1134 */
1135 switch (wrap) {
5e705374 1136 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
e8a99053 1137 if (d1 < d2)
5e705374 1138 return rq1;
e8a99053 1139 else if (d2 < d1)
5e705374 1140 return rq2;
e8a99053
AM
1141 else {
1142 if (s1 >= s2)
5e705374 1143 return rq1;
e8a99053 1144 else
5e705374 1145 return rq2;
e8a99053 1146 }
1da177e4 1147
e8a99053 1148 case CFQ_RQ2_WRAP:
5e705374 1149 return rq1;
e8a99053 1150 case CFQ_RQ1_WRAP:
5e705374
JA
1151 return rq2;
1152 case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
e8a99053
AM
1153 default:
1154 /*
1155 * Since both rqs are wrapped,
1156 * start with the one that's further behind head
1157 * (--> only *one* back seek required),
1158 * since back seek takes more time than forward.
1159 */
1160 if (s1 <= s2)
5e705374 1161 return rq1;
1da177e4 1162 else
5e705374 1163 return rq2;
1da177e4
LT
1164 }
1165}
1166
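/*
 * Rough worked example for cfq_choose_req(), assuming the default
 * cfq_back_penalty of 2 and a head position (last) of sector 1000:
 * a request at sector 1200 has a forward distance of 200, while a
 * request at sector 900 gets a penalized backward distance of
 * (1000 - 900) * 2 = 200, so the tie is broken in favour of the
 * forward request. A request at sector 960, however, scores
 * (1000 - 960) * 2 = 80 and wins despite sitting behind the head.
 * Requests more than cfq_back_max KiB behind the head are treated as
 * wrapped and only win if both candidates wrap.
 */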
0871714e 1167static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
cc09e299 1168{
615f0259
VG
1169 /* Service tree is empty */
1170 if (!root->count)
1171 return NULL;
1172
09663c86 1173 return rb_entry(rb_first_cached(&root->rb), struct cfq_queue, rb_node);
cc09e299
JA
1174}
1175
1fa8f6d6
VG
1176static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
1177{
09663c86 1178 return rb_entry_cfqg(rb_first_cached(&root->rb));
1fa8f6d6
VG
1179}
1180
09663c86 1181static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
a36e71f9 1182{
09663c86 1183 rb_erase_cached(n, &root->rb);
a36e71f9 1184 RB_CLEAR_NODE(n);
a36e71f9 1185
aa6f6a3d 1186 --root->count;
cc09e299
JA
1187}
1188
1da177e4
LT
1189/*
1190 * would be nice to take fifo expire time into account as well
1191 */
5e705374
JA
1192static struct request *
1193cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1194 struct request *last)
1da177e4 1195{
21183b07
JA
1196 struct rb_node *rbnext = rb_next(&last->rb_node);
1197 struct rb_node *rbprev = rb_prev(&last->rb_node);
5e705374 1198 struct request *next = NULL, *prev = NULL;
1da177e4 1199
21183b07 1200 BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1da177e4
LT
1201
1202 if (rbprev)
5e705374 1203 prev = rb_entry_rq(rbprev);
1da177e4 1204
21183b07 1205 if (rbnext)
5e705374 1206 next = rb_entry_rq(rbnext);
21183b07
JA
1207 else {
1208 rbnext = rb_first(&cfqq->sort_list);
1209 if (rbnext && rbnext != &last->rb_node)
5e705374 1210 next = rb_entry_rq(rbnext);
21183b07 1211 }
1da177e4 1212
cf7c25cf 1213 return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
1da177e4
LT
1214}
1215
9a7f38c4
JM
1216static u64 cfq_slice_offset(struct cfq_data *cfqd,
1217 struct cfq_queue *cfqq)
1da177e4 1218{
d9e7620e
JA
1219 /*
1220 * just an approximation, should be ok.
1221 */
cdb16e8f 1222 return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
464191c6 1223 cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
d9e7620e
JA
1224}
1225
1fa8f6d6
VG
1226static inline s64
1227cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
1228{
1229 return cfqg->vdisktime - st->min_vdisktime;
1230}
1231
1232static void
1233__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1234{
09663c86 1235 struct rb_node **node = &st->rb.rb_root.rb_node;
1fa8f6d6
VG
1236 struct rb_node *parent = NULL;
1237 struct cfq_group *__cfqg;
1238 s64 key = cfqg_key(st, cfqg);
09663c86 1239 bool leftmost = true;
1fa8f6d6
VG
1240
1241 while (*node != NULL) {
1242 parent = *node;
1243 __cfqg = rb_entry_cfqg(parent);
1244
1245 if (key < cfqg_key(st, __cfqg))
1246 node = &parent->rb_left;
1247 else {
1248 node = &parent->rb_right;
09663c86 1249 leftmost = false;
1fa8f6d6
VG
1250 }
1251 }
1252
1fa8f6d6 1253 rb_link_node(&cfqg->rb_node, parent, node);
09663c86 1254 rb_insert_color_cached(&cfqg->rb_node, &st->rb, leftmost);
1fa8f6d6
VG
1255}
1256
7b5af5cf
TM
1257/*
1258 * This has to be called only on activation of cfqg
1259 */
1fa8f6d6 1260static void
8184f93e
JT
1261cfq_update_group_weight(struct cfq_group *cfqg)
1262{
3381cb8d 1263 if (cfqg->new_weight) {
8184f93e 1264 cfqg->weight = cfqg->new_weight;
3381cb8d 1265 cfqg->new_weight = 0;
8184f93e 1266 }
e15693ef
TM
1267}
1268
1269static void
1270cfq_update_group_leaf_weight(struct cfq_group *cfqg)
1271{
1272 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
e71357e1
TH
1273
1274 if (cfqg->new_leaf_weight) {
1275 cfqg->leaf_weight = cfqg->new_leaf_weight;
1276 cfqg->new_leaf_weight = 0;
1277 }
8184f93e
JT
1278}
1279
1280static void
1281cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1282{
1d3650f7 1283 unsigned int vfr = 1 << CFQ_SERVICE_SHIFT; /* start with 1 */
7918ffb5 1284 struct cfq_group *pos = cfqg;
1d3650f7 1285 struct cfq_group *parent;
7918ffb5
TH
1286 bool propagate;
1287
1288 /* add to the service tree */
8184f93e
JT
1289 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1290
7b5af5cf
TM
1291 /*
1292 * Update leaf_weight. We cannot update weight at this point
1293 * because cfqg might already have been activated and is
1294 * contributing its current weight to the parent's child_weight.
1295 */
e15693ef 1296 cfq_update_group_leaf_weight(cfqg);
8184f93e 1297 __cfq_group_service_tree_add(st, cfqg);
7918ffb5
TH
1298
1299 /*
1d3650f7
TH
1300 * Activate @cfqg and calculate the portion of vfraction @cfqg is
1301 * entitled to. vfraction is calculated by walking the tree
1302 * towards the root calculating the fraction it has at each level.
1303 * The compounded ratio is how much vfraction @cfqg owns.
1304 *
1305 * Start with the proportion tasks in this cfqg has against active
1306 * children cfqgs - its leaf_weight against children_weight.
7918ffb5
TH
1307 */
1308 propagate = !pos->nr_active++;
1309 pos->children_weight += pos->leaf_weight;
1d3650f7 1310 vfr = vfr * pos->leaf_weight / pos->children_weight;
7918ffb5 1311
1d3650f7
TH
1312 /*
1313 * Compound ->weight walking up the tree. Both activation and
1314 * vfraction calculation are done in the same loop. Propagation
1315 * stops once an already activated node is met. vfraction
1316 * calculation should always continue to the root.
1317 */
d02f7aa8 1318 while ((parent = cfqg_parent(pos))) {
1d3650f7 1319 if (propagate) {
e15693ef 1320 cfq_update_group_weight(pos);
1d3650f7
TH
1321 propagate = !parent->nr_active++;
1322 parent->children_weight += pos->weight;
1323 }
1324 vfr = vfr * pos->weight / parent->children_weight;
7918ffb5
TH
1325 pos = parent;
1326 }
1d3650f7
TH
1327
1328 cfqg->vfraction = max_t(unsigned, vfr, 1);
8184f93e
JT
1329}
1330
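/*
 * Rough worked example of the vfraction compounding above, assuming a
 * single group A (weight and leaf_weight 500) directly below the root
 * group (leaf_weight 1000), with both A and the root's own queues
 * active: at A's level vfr = 4096 * 500 / 500 = 4096, and one level up
 * vfr = 4096 * 500 / (1000 + 500) ~= 1365, i.e. A is entitled to about
 * a third of the disk time while the root's own tasks get the other
 * two thirds.
 */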
5be6b756
HT
1331static inline u64 cfq_get_cfqg_vdisktime_delay(struct cfq_data *cfqd)
1332{
1333 if (!iops_mode(cfqd))
1334 return CFQ_SLICE_MODE_GROUP_DELAY;
1335 else
1336 return CFQ_IOPS_MODE_GROUP_DELAY;
1337}
1338
8184f93e
JT
1339static void
1340cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
1fa8f6d6
VG
1341{
1342 struct cfq_rb_root *st = &cfqd->grp_service_tree;
1343 struct cfq_group *__cfqg;
1344 struct rb_node *n;
1345
1346 cfqg->nr_cfqq++;
760701bf 1347 if (!RB_EMPTY_NODE(&cfqg->rb_node))
1fa8f6d6
VG
1348 return;
1349
1350 /*
1351 * Currently put the group at the end. Later implement something
1352 * so that groups get a smaller vtime offset based on their weights, so that
25985edc 1353 * a group does not lose everything if it was not continuously backlogged.
1fa8f6d6 1354 */
09663c86 1355 n = rb_last(&st->rb.rb_root);
1fa8f6d6
VG
1356 if (n) {
1357 __cfqg = rb_entry_cfqg(n);
5be6b756
HT
1358 cfqg->vdisktime = __cfqg->vdisktime +
1359 cfq_get_cfqg_vdisktime_delay(cfqd);
1fa8f6d6
VG
1360 } else
1361 cfqg->vdisktime = st->min_vdisktime;
8184f93e
JT
1362 cfq_group_service_tree_add(st, cfqg);
1363}
1fa8f6d6 1364
8184f93e
JT
1365static void
1366cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
1367{
7918ffb5
TH
1368 struct cfq_group *pos = cfqg;
1369 bool propagate;
1370
1371 /*
1372 * Undo activation from cfq_group_service_tree_add(). Deactivate
1373 * @cfqg and propagate deactivation upwards.
1374 */
1375 propagate = !--pos->nr_active;
1376 pos->children_weight -= pos->leaf_weight;
1377
1378 while (propagate) {
d02f7aa8 1379 struct cfq_group *parent = cfqg_parent(pos);
7918ffb5
TH
1380
1381 /* @pos has 0 nr_active at this point */
1382 WARN_ON_ONCE(pos->children_weight);
1d3650f7 1383 pos->vfraction = 0;
7918ffb5
TH
1384
1385 if (!parent)
1386 break;
1387
1388 propagate = !--parent->nr_active;
1389 parent->children_weight -= pos->weight;
1390 pos = parent;
1391 }
1392
1393 /* remove from the service tree */
8184f93e
JT
1394 if (!RB_EMPTY_NODE(&cfqg->rb_node))
1395 cfq_rb_erase(&cfqg->rb_node, st);
1fa8f6d6
VG
1396}
1397
1398static void
8184f93e 1399cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
1fa8f6d6
VG
1400{
1401 struct cfq_rb_root *st = &cfqd->grp_service_tree;
1402
1403 BUG_ON(cfqg->nr_cfqq < 1);
1404 cfqg->nr_cfqq--;
25bc6b07 1405
1fa8f6d6
VG
1406 /* If there are other cfq queues under this group, don't delete it */
1407 if (cfqg->nr_cfqq)
1408 return;
1409
2868ef7b 1410 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
8184f93e 1411 cfq_group_service_tree_del(st, cfqg);
4d2ceea4 1412 cfqg->saved_wl_slice = 0;
155fead9 1413 cfqg_stats_update_dequeue(cfqg);
dae739eb
VG
1414}
1415
9a7f38c4
JM
1416static inline u64 cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
1417 u64 *unaccounted_time)
dae739eb 1418{
9a7f38c4
JM
1419 u64 slice_used;
1420 u64 now = ktime_get_ns();
dae739eb
VG
1421
1422 /*
1423 * Queue got expired before even a single request completed or
1424 * got expired immediately after first request completion.
1425 */
9a7f38c4 1426 if (!cfqq->slice_start || cfqq->slice_start == now) {
dae739eb
VG
1427 /*
1428 * Also charge the seek time incurred to the group, otherwise
1429 * if there are multiple queues in the group, each can dispatch
1430 * a single request on seeky media and cause lots of seek time
1431 * and group will never know it.
1432 */
0b31c10c
JK
1433 slice_used = max_t(u64, (now - cfqq->dispatch_start),
1434 jiffies_to_nsecs(1));
dae739eb 1435 } else {
9a7f38c4 1436 slice_used = now - cfqq->slice_start;
167400d3
JT
1437 if (slice_used > cfqq->allocated_slice) {
1438 *unaccounted_time = slice_used - cfqq->allocated_slice;
f75edf2d 1439 slice_used = cfqq->allocated_slice;
167400d3 1440 }
9a7f38c4 1441 if (cfqq->slice_start > cfqq->dispatch_start)
167400d3
JT
1442 *unaccounted_time += cfqq->slice_start -
1443 cfqq->dispatch_start;
dae739eb
VG
1444 }
1445
dae739eb
VG
1446 return slice_used;
1447}
1448
1449static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
e5ff082e 1450 struct cfq_queue *cfqq)
dae739eb
VG
1451{
1452 struct cfq_rb_root *st = &cfqd->grp_service_tree;
9a7f38c4 1453 u64 used_sl, charge, unaccounted_sl = 0;
f26bd1f0
VG
1454 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
1455 - cfqg->service_tree_idle.count;
1d3650f7 1456 unsigned int vfr;
9a7f38c4 1457 u64 now = ktime_get_ns();
f26bd1f0
VG
1458
1459 BUG_ON(nr_sync < 0);
167400d3 1460 used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
dae739eb 1461
02b35081
VG
1462 if (iops_mode(cfqd))
1463 charge = cfqq->slice_dispatch;
1464 else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
1465 charge = cfqq->allocated_slice;
dae739eb 1466
1d3650f7
TH
1467 /*
1468 * Can't update vdisktime while on service tree and cfqg->vfraction
1469 * is valid only while on it. Cache vfr, leave the service tree,
1470 * update vdisktime and go back on. The re-addition to the tree
1471 * will also update the weights as necessary.
1472 */
1473 vfr = cfqg->vfraction;
8184f93e 1474 cfq_group_service_tree_del(st, cfqg);
1d3650f7 1475 cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
8184f93e 1476 cfq_group_service_tree_add(st, cfqg);
dae739eb
VG
1477
1478 /* This group is being expired. Save the context */
9a7f38c4
JM
1479 if (cfqd->workload_expires > now) {
1480 cfqg->saved_wl_slice = cfqd->workload_expires - now;
4d2ceea4
VG
1481 cfqg->saved_wl_type = cfqd->serving_wl_type;
1482 cfqg->saved_wl_class = cfqd->serving_wl_class;
dae739eb 1483 } else
4d2ceea4 1484 cfqg->saved_wl_slice = 0;
2868ef7b
VG
1485
1486 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
1487 st->min_vdisktime);
fd16d263 1488 cfq_log_cfqq(cfqq->cfqd, cfqq,
9a7f38c4 1489 "sl_used=%llu disp=%llu charge=%llu iops=%u sect=%lu",
fd16d263
JP
1490 used_sl, cfqq->slice_dispatch, charge,
1491 iops_mode(cfqd), cfqq->nr_sectors);
155fead9
TH
1492 cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl);
1493 cfqg_stats_set_start_empty_time(cfqg);
1fa8f6d6
VG
1494}
1495
f51b802c
TH
1496/**
1497 * cfq_init_cfqg_base - initialize base part of a cfq_group
1498 * @cfqg: cfq_group to initialize
1499 *
1500 * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED
1501 * is enabled or not.
1502 */
1503static void cfq_init_cfqg_base(struct cfq_group *cfqg)
1504{
1505 struct cfq_rb_root *st;
1506 int i, j;
1507
1508 for_each_cfqg_st(cfqg, i, j, st)
1509 *st = CFQ_RB_ROOT;
1510 RB_CLEAR_NODE(&cfqg->rb_node);
1511
9a7f38c4 1512 cfqg->ttime.last_end_request = ktime_get_ns();
f51b802c
TH
1513}
1514
25fb5169 1515#ifdef CONFIG_CFQ_GROUP_IOSCHED
69d7fde5
TH
1516static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
1517 bool on_dfl, bool reset_dev, bool is_leaf_weight);
1518
24bdb8ef 1519static void cfqg_stats_exit(struct cfqg_stats *stats)
90d3839b 1520{
24bdb8ef
TH
1521 blkg_rwstat_exit(&stats->merged);
1522 blkg_rwstat_exit(&stats->service_time);
1523 blkg_rwstat_exit(&stats->wait_time);
1524 blkg_rwstat_exit(&stats->queued);
24bdb8ef
TH
1525 blkg_stat_exit(&stats->time);
1526#ifdef CONFIG_DEBUG_BLK_CGROUP
1527 blkg_stat_exit(&stats->unaccounted_time);
1528 blkg_stat_exit(&stats->avg_queue_size_sum);
1529 blkg_stat_exit(&stats->avg_queue_size_samples);
1530 blkg_stat_exit(&stats->dequeue);
1531 blkg_stat_exit(&stats->group_wait_time);
1532 blkg_stat_exit(&stats->idle_time);
1533 blkg_stat_exit(&stats->empty_time);
1534#endif
1535}
1536
1537static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp)
1538{
77ea7338 1539 if (blkg_rwstat_init(&stats->merged, gfp) ||
24bdb8ef
TH
1540 blkg_rwstat_init(&stats->service_time, gfp) ||
1541 blkg_rwstat_init(&stats->wait_time, gfp) ||
1542 blkg_rwstat_init(&stats->queued, gfp) ||
24bdb8ef
TH
1543 blkg_stat_init(&stats->time, gfp))
1544 goto err;
90d3839b
PZ
1545
1546#ifdef CONFIG_DEBUG_BLK_CGROUP
24bdb8ef
TH
1547 if (blkg_stat_init(&stats->unaccounted_time, gfp) ||
1548 blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
1549 blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
1550 blkg_stat_init(&stats->dequeue, gfp) ||
1551 blkg_stat_init(&stats->group_wait_time, gfp) ||
1552 blkg_stat_init(&stats->idle_time, gfp) ||
1553 blkg_stat_init(&stats->empty_time, gfp))
1554 goto err;
90d3839b 1555#endif
24bdb8ef
TH
1556 return 0;
1557err:
1558 cfqg_stats_exit(stats);
1559 return -ENOMEM;
90d3839b
PZ
1560}
1561
e4a9bde9
TH
1562static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
1563{
1564 struct cfq_group_data *cgd;
1565
ebc4ff66 1566 cgd = kzalloc(sizeof(*cgd), gfp);
e4a9bde9
TH
1567 if (!cgd)
1568 return NULL;
1569 return &cgd->cpd;
1570}
1571
81437648 1572static void cfq_cpd_init(struct blkcg_policy_data *cpd)
e48453c3 1573{
81437648 1574 struct cfq_group_data *cgd = cpd_to_cfqgd(cpd);
9e10a130 1575 unsigned int weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
69d7fde5 1576 CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
e48453c3 1577
69d7fde5
TH
1578 if (cpd_to_blkcg(cpd) == &blkcg_root)
1579 weight *= 2;
1580
1581 cgd->weight = weight;
1582 cgd->leaf_weight = weight;
e48453c3
AA
1583}
1584
e4a9bde9
TH
1585static void cfq_cpd_free(struct blkcg_policy_data *cpd)
1586{
1587 kfree(cpd_to_cfqgd(cpd));
1588}
1589
69d7fde5
TH
1590static void cfq_cpd_bind(struct blkcg_policy_data *cpd)
1591{
1592 struct blkcg *blkcg = cpd_to_blkcg(cpd);
9e10a130 1593 bool on_dfl = cgroup_subsys_on_dfl(io_cgrp_subsys);
69d7fde5
TH
1594 unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
1595
1596 if (blkcg == &blkcg_root)
1597 weight *= 2;
1598
1599 WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false));
1600 WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true));
1601}
1602
001bea73
TH
1603static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node)
1604{
b2ce2643
TH
1605 struct cfq_group *cfqg;
1606
1607 cfqg = kzalloc_node(sizeof(*cfqg), gfp, node);
1608 if (!cfqg)
1609 return NULL;
1610
1611 cfq_init_cfqg_base(cfqg);
24bdb8ef
TH
1612 if (cfqg_stats_init(&cfqg->stats, gfp)) {
1613 kfree(cfqg);
1614 return NULL;
1615 }
b2ce2643
TH
1616
1617 return &cfqg->pd;
001bea73
TH
1618}
1619
a9520cd6 1620static void cfq_pd_init(struct blkg_policy_data *pd)
f469a7b4 1621{
a9520cd6
TH
1622 struct cfq_group *cfqg = pd_to_cfqg(pd);
1623 struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg);
25fb5169 1624
e48453c3
AA
1625 cfqg->weight = cgd->weight;
1626 cfqg->leaf_weight = cgd->leaf_weight;
25fb5169
VG
1627}
1628
a9520cd6 1629static void cfq_pd_offline(struct blkg_policy_data *pd)
0b39920b 1630{
a9520cd6 1631 struct cfq_group *cfqg = pd_to_cfqg(pd);
60a83707
TH
1632 int i;
1633
1634 for (i = 0; i < IOPRIO_BE_NR; i++) {
1635 if (cfqg->async_cfqq[0][i])
1636 cfq_put_queue(cfqg->async_cfqq[0][i]);
1637 if (cfqg->async_cfqq[1][i])
1638 cfq_put_queue(cfqg->async_cfqq[1][i]);
1639 }
1640
1641 if (cfqg->async_idle_cfqq)
1642 cfq_put_queue(cfqg->async_idle_cfqq);
1643
0b39920b
TH
1644 /*
1645 * @blkg is going offline and will be ignored by
1646 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
1647 * that they don't get lost. If IOs complete after this point, the
1648 * stats for them will be lost. Oh well...
1649 */
60a83707 1650 cfqg_stats_xfer_dead(cfqg);
0b39920b
TH
1651}
1652
001bea73
TH
1653static void cfq_pd_free(struct blkg_policy_data *pd)
1654{
24bdb8ef
TH
1655 struct cfq_group *cfqg = pd_to_cfqg(pd);
1656
1657 cfqg_stats_exit(&cfqg->stats);
1658 return kfree(cfqg);
001bea73
TH
1659}
1660
a9520cd6 1661static void cfq_pd_reset_stats(struct blkg_policy_data *pd)
689665af 1662{
a9520cd6 1663 struct cfq_group *cfqg = pd_to_cfqg(pd);
689665af
TH
1664
1665 cfqg_stats_reset(&cfqg->stats);
25fb5169
VG
1666}
1667
ae118896
TH
1668static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
1669 struct blkcg *blkcg)
25fb5169 1670{
ae118896 1671 struct blkcg_gq *blkg;
f469a7b4 1672
ae118896
TH
1673 blkg = blkg_lookup(blkcg, cfqd->queue);
1674 if (likely(blkg))
1675 return blkg_to_cfqg(blkg);
1676 return NULL;
25fb5169
VG
1677}
1678
1679static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1680{
25fb5169 1681 cfqq->cfqg = cfqg;
b1c35769 1682 /* cfqq reference on cfqg */
eb7d8c07 1683 cfqg_get(cfqg);
b1c35769
VG
1684}
1685
f95a04af
TH
1686static u64 cfqg_prfill_weight_device(struct seq_file *sf,
1687 struct blkg_policy_data *pd, int off)
60c2bc2d 1688{
f95a04af 1689 struct cfq_group *cfqg = pd_to_cfqg(pd);
3381cb8d
TH
1690
1691 if (!cfqg->dev_weight)
60c2bc2d 1692 return 0;
f95a04af 1693 return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
60c2bc2d
TH
1694}
1695
2da8ca82 1696static int cfqg_print_weight_device(struct seq_file *sf, void *v)
60c2bc2d 1697{
2da8ca82
TH
1698 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1699 cfqg_prfill_weight_device, &blkcg_policy_cfq,
1700 0, false);
60c2bc2d
TH
1701 return 0;
1702}
1703
e71357e1
TH
1704static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
1705 struct blkg_policy_data *pd, int off)
1706{
1707 struct cfq_group *cfqg = pd_to_cfqg(pd);
1708
1709 if (!cfqg->dev_leaf_weight)
1710 return 0;
1711 return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
1712}
1713
2da8ca82 1714static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v)
e71357e1 1715{
2da8ca82
TH
1716 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1717 cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq,
1718 0, false);
e71357e1
TH
1719 return 0;
1720}
1721
2da8ca82 1722static int cfq_print_weight(struct seq_file *sf, void *v)
60c2bc2d 1723{
e48453c3 1724 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
9470e4a6
JA
1725 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
1726 unsigned int val = 0;
e48453c3 1727
9470e4a6
JA
1728 if (cgd)
1729 val = cgd->weight;
1730
1731 seq_printf(sf, "%u\n", val);
60c2bc2d
TH
1732 return 0;
1733}
1734
2da8ca82 1735static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
e71357e1 1736{
e48453c3 1737 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
9470e4a6
JA
1738 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
1739 unsigned int val = 0;
1740
1741 if (cgd)
1742 val = cgd->leaf_weight;
e48453c3 1743
9470e4a6 1744 seq_printf(sf, "%u\n", val);
e71357e1
TH
1745 return 0;
1746}
1747
451af504
TH
1748static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
1749 char *buf, size_t nbytes, loff_t off,
2ee867dc 1750 bool on_dfl, bool is_leaf_weight)
60c2bc2d 1751{
69d7fde5
TH
1752 unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
1753 unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
451af504 1754 struct blkcg *blkcg = css_to_blkcg(of_css(of));
60c2bc2d 1755 struct blkg_conf_ctx ctx;
3381cb8d 1756 struct cfq_group *cfqg;
e48453c3 1757 struct cfq_group_data *cfqgd;
60c2bc2d 1758 int ret;
36aa9e5f 1759 u64 v;
60c2bc2d 1760
3c798398 1761 ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
60c2bc2d
TH
1762 if (ret)
1763 return ret;
1764
2ee867dc
TH
1765 if (sscanf(ctx.body, "%llu", &v) == 1) {
1766 /* require "default" on dfl */
1767 ret = -ERANGE;
1768 if (!v && on_dfl)
1769 goto out_finish;
1770 } else if (!strcmp(strim(ctx.body), "default")) {
1771 v = 0;
1772 } else {
1773 ret = -EINVAL;
36aa9e5f 1774 goto out_finish;
2ee867dc 1775 }
36aa9e5f 1776
3381cb8d 1777 cfqg = blkg_to_cfqg(ctx.blkg);
e48453c3 1778 cfqgd = blkcg_to_cfqgd(blkcg);
ae994ea9 1779
20386ce0 1780 ret = -ERANGE;
69d7fde5 1781 if (!v || (v >= min && v <= max)) {
e71357e1 1782 if (!is_leaf_weight) {
36aa9e5f
TH
1783 cfqg->dev_weight = v;
1784 cfqg->new_weight = v ?: cfqgd->weight;
e71357e1 1785 } else {
36aa9e5f
TH
1786 cfqg->dev_leaf_weight = v;
1787 cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight;
e71357e1 1788 }
60c2bc2d
TH
1789 ret = 0;
1790 }
36aa9e5f 1791out_finish:
60c2bc2d 1792 blkg_conf_finish(&ctx);
451af504 1793 return ret ?: nbytes;
60c2bc2d
TH
1794}
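/*
 * Illustrative usage on the legacy hierarchy (device numbers made up):
 *
 *   echo "8:16 300" > blkio.weight_device   # per-device override
 *   echo "8:16 0"   > blkio.weight_device   # drop it, fall back to the
 *                                           # group-wide weight
 *
 * Non-zero values must lie within
 * [CFQ_WEIGHT_LEGACY_MIN, CFQ_WEIGHT_LEGACY_MAX].
 */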
1795
451af504
TH
1796static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of,
1797 char *buf, size_t nbytes, loff_t off)
e71357e1 1798{
2ee867dc 1799 return __cfqg_set_weight_device(of, buf, nbytes, off, false, false);
e71357e1
TH
1800}
1801
451af504
TH
1802static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of,
1803 char *buf, size_t nbytes, loff_t off)
e71357e1 1804{
2ee867dc 1805 return __cfqg_set_weight_device(of, buf, nbytes, off, false, true);
e71357e1
TH
1806}
1807
dd165eb3 1808static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
69d7fde5 1809 bool on_dfl, bool reset_dev, bool is_leaf_weight)
60c2bc2d 1810{
69d7fde5
TH
1811 unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
1812 unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
182446d0 1813 struct blkcg *blkcg = css_to_blkcg(css);
3c798398 1814 struct blkcg_gq *blkg;
e48453c3 1815 struct cfq_group_data *cfqgd;
ae994ea9 1816 int ret = 0;
60c2bc2d 1817
69d7fde5
TH
1818 if (val < min || val > max)
1819 return -ERANGE;
60c2bc2d
TH
1820
1821 spin_lock_irq(&blkcg->lock);
e48453c3 1822 cfqgd = blkcg_to_cfqgd(blkcg);
ae994ea9
JA
1823 if (!cfqgd) {
1824 ret = -EINVAL;
1825 goto out;
1826 }
e71357e1
TH
1827
1828 if (!is_leaf_weight)
e48453c3 1829 cfqgd->weight = val;
e71357e1 1830 else
e48453c3 1831 cfqgd->leaf_weight = val;
60c2bc2d 1832
b67bfe0d 1833 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
3381cb8d 1834 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
60c2bc2d 1835
e71357e1
TH
1836 if (!cfqg)
1837 continue;
1838
1839 if (!is_leaf_weight) {
69d7fde5
TH
1840 if (reset_dev)
1841 cfqg->dev_weight = 0;
e71357e1 1842 if (!cfqg->dev_weight)
e48453c3 1843 cfqg->new_weight = cfqgd->weight;
e71357e1 1844 } else {
69d7fde5
TH
1845 if (reset_dev)
1846 cfqg->dev_leaf_weight = 0;
e71357e1 1847 if (!cfqg->dev_leaf_weight)
e48453c3 1848 cfqg->new_leaf_weight = cfqgd->leaf_weight;
e71357e1 1849 }
60c2bc2d
TH
1850 }
1851
ae994ea9 1852out:
60c2bc2d 1853 spin_unlock_irq(&blkcg->lock);
ae994ea9 1854 return ret;
60c2bc2d
TH
1855}
1856
182446d0
TH
1857static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
1858 u64 val)
e71357e1 1859{
69d7fde5 1860 return __cfq_set_weight(css, val, false, false, false);
e71357e1
TH
1861}
1862
182446d0
TH
1863static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
1864 struct cftype *cft, u64 val)
e71357e1 1865{
69d7fde5 1866 return __cfq_set_weight(css, val, false, false, true);
e71357e1
TH
1867}
1868
2da8ca82 1869static int cfqg_print_stat(struct seq_file *sf, void *v)
5bc4afb1 1870{
2da8ca82
TH
1871 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
1872 &blkcg_policy_cfq, seq_cft(sf)->private, false);
5bc4afb1
TH
1873 return 0;
1874}
1875
2da8ca82 1876static int cfqg_print_rwstat(struct seq_file *sf, void *v)
5bc4afb1 1877{
2da8ca82
TH
1878 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
1879 &blkcg_policy_cfq, seq_cft(sf)->private, true);
5bc4afb1
TH
1880 return 0;
1881}
1882
43114018
TH
1883static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
1884 struct blkg_policy_data *pd, int off)
1885{
f12c74ca
TH
1886 u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
1887 &blkcg_policy_cfq, off);
43114018
TH
1888 return __blkg_prfill_u64(sf, pd, sum);
1889}
1890
1891static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
1892 struct blkg_policy_data *pd, int off)
1893{
f12c74ca
TH
1894 struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
1895 &blkcg_policy_cfq, off);
43114018
TH
1896 return __blkg_prfill_rwstat(sf, pd, &sum);
1897}
1898
2da8ca82 1899static int cfqg_print_stat_recursive(struct seq_file *sf, void *v)
43114018 1900{
2da8ca82
TH
1901 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1902 cfqg_prfill_stat_recursive, &blkcg_policy_cfq,
1903 seq_cft(sf)->private, false);
43114018
TH
1904 return 0;
1905}
1906
2da8ca82 1907static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
43114018 1908{
2da8ca82
TH
1909 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1910 cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq,
1911 seq_cft(sf)->private, true);
43114018
TH
1912 return 0;
1913}
1914
702747ca
TH
1915static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
1916 int off)
1917{
1918 u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
1919
1920 return __blkg_prfill_u64(sf, pd, sum >> 9);
1921}
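/*
 * The byte counters are exported in 512-byte sectors (sum >> 9), so e.g.
 * 1 MiB of serviced bytes is reported as 2048.
 */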
1922
1923static int cfqg_print_stat_sectors(struct seq_file *sf, void *v)
1924{
1925 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1926 cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false);
1927 return 0;
1928}
1929
1930static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf,
1931 struct blkg_policy_data *pd, int off)
1932{
1933 struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
1934 offsetof(struct blkcg_gq, stat_bytes));
1935 u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
1936 atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
1937
1938 return __blkg_prfill_u64(sf, pd, sum >> 9);
1939}
1940
1941static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
1942{
1943 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1944 cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0,
1945 false);
1946 return 0;
1947}
1948
60c2bc2d 1949#ifdef CONFIG_DEBUG_BLK_CGROUP
f95a04af
TH
1950static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
1951 struct blkg_policy_data *pd, int off)
60c2bc2d 1952{
f95a04af 1953 struct cfq_group *cfqg = pd_to_cfqg(pd);
155fead9 1954 u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);
60c2bc2d
TH
1955 u64 v = 0;
1956
1957 if (samples) {
155fead9 1958 v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
f3cff25f 1959 v = div64_u64(v, samples);
60c2bc2d 1960 }
f95a04af 1961 __blkg_prfill_u64(sf, pd, v);
60c2bc2d
TH
1962 return 0;
1963}
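/*
 * Worked example: with avg_queue_size_sum = 120 and
 * avg_queue_size_samples = 40, the reported avg_queue_size is
 * 120 / 40 = 3.
 */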
1964
1965/* print avg_queue_size */
2da8ca82 1966static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v)
60c2bc2d 1967{
2da8ca82
TH
1968 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1969 cfqg_prfill_avg_queue_size, &blkcg_policy_cfq,
1970 0, false);
60c2bc2d
TH
1971 return 0;
1972}
1973#endif /* CONFIG_DEBUG_BLK_CGROUP */
1974
880f50e2 1975static struct cftype cfq_blkcg_legacy_files[] = {
1d3650f7 1976 /* on root, weight is mapped to leaf_weight */
60c2bc2d
TH
1977 {
1978 .name = "weight_device",
1d3650f7 1979 .flags = CFTYPE_ONLY_ON_ROOT,
2da8ca82 1980 .seq_show = cfqg_print_leaf_weight_device,
451af504 1981 .write = cfqg_set_leaf_weight_device,
60c2bc2d
TH
1982 },
1983 {
1984 .name = "weight",
1d3650f7 1985 .flags = CFTYPE_ONLY_ON_ROOT,
2da8ca82 1986 .seq_show = cfq_print_leaf_weight,
1d3650f7 1987 .write_u64 = cfq_set_leaf_weight,
60c2bc2d 1988 },
e71357e1 1989
1d3650f7 1990 /* no such mapping necessary for !roots */
60c2bc2d
TH
1991 {
1992 .name = "weight_device",
1d3650f7 1993 .flags = CFTYPE_NOT_ON_ROOT,
2da8ca82 1994 .seq_show = cfqg_print_weight_device,
451af504 1995 .write = cfqg_set_weight_device,
60c2bc2d
TH
1996 },
1997 {
1998 .name = "weight",
1d3650f7 1999 .flags = CFTYPE_NOT_ON_ROOT,
2da8ca82 2000 .seq_show = cfq_print_weight,
3381cb8d 2001 .write_u64 = cfq_set_weight,
60c2bc2d 2002 },
e71357e1 2003
e71357e1
TH
2004 {
2005 .name = "leaf_weight_device",
2da8ca82 2006 .seq_show = cfqg_print_leaf_weight_device,
451af504 2007 .write = cfqg_set_leaf_weight_device,
e71357e1
TH
2008 },
2009 {
2010 .name = "leaf_weight",
2da8ca82 2011 .seq_show = cfq_print_leaf_weight,
e71357e1
TH
2012 .write_u64 = cfq_set_leaf_weight,
2013 },
2014
43114018 2015 /* statistics, covers only the tasks in the cfqg */
60c2bc2d
TH
2016 {
2017 .name = "time",
5bc4afb1 2018 .private = offsetof(struct cfq_group, stats.time),
2da8ca82 2019 .seq_show = cfqg_print_stat,
60c2bc2d
TH
2020 },
2021 {
2022 .name = "sectors",
702747ca 2023 .seq_show = cfqg_print_stat_sectors,
60c2bc2d
TH
2024 },
2025 {
2026 .name = "io_service_bytes",
77ea7338
TH
2027 .private = (unsigned long)&blkcg_policy_cfq,
2028 .seq_show = blkg_print_stat_bytes,
60c2bc2d
TH
2029 },
2030 {
2031 .name = "io_serviced",
77ea7338
TH
2032 .private = (unsigned long)&blkcg_policy_cfq,
2033 .seq_show = blkg_print_stat_ios,
60c2bc2d
TH
2034 },
2035 {
2036 .name = "io_service_time",
5bc4afb1 2037 .private = offsetof(struct cfq_group, stats.service_time),
2da8ca82 2038 .seq_show = cfqg_print_rwstat,
60c2bc2d
TH
2039 },
2040 {
2041 .name = "io_wait_time",
5bc4afb1 2042 .private = offsetof(struct cfq_group, stats.wait_time),
2da8ca82 2043 .seq_show = cfqg_print_rwstat,
60c2bc2d
TH
2044 },
2045 {
2046 .name = "io_merged",
5bc4afb1 2047 .private = offsetof(struct cfq_group, stats.merged),
2da8ca82 2048 .seq_show = cfqg_print_rwstat,
60c2bc2d
TH
2049 },
2050 {
2051 .name = "io_queued",
5bc4afb1 2052 .private = offsetof(struct cfq_group, stats.queued),
2da8ca82 2053 .seq_show = cfqg_print_rwstat,
60c2bc2d 2054 },
43114018
TH
2055
 2056 /* the same statistics which cover the cfqg and its descendants */
2057 {
2058 .name = "time_recursive",
2059 .private = offsetof(struct cfq_group, stats.time),
2da8ca82 2060 .seq_show = cfqg_print_stat_recursive,
43114018
TH
2061 },
2062 {
2063 .name = "sectors_recursive",
702747ca 2064 .seq_show = cfqg_print_stat_sectors_recursive,
43114018
TH
2065 },
2066 {
2067 .name = "io_service_bytes_recursive",
77ea7338
TH
2068 .private = (unsigned long)&blkcg_policy_cfq,
2069 .seq_show = blkg_print_stat_bytes_recursive,
43114018
TH
2070 },
2071 {
2072 .name = "io_serviced_recursive",
77ea7338
TH
2073 .private = (unsigned long)&blkcg_policy_cfq,
2074 .seq_show = blkg_print_stat_ios_recursive,
43114018
TH
2075 },
2076 {
2077 .name = "io_service_time_recursive",
2078 .private = offsetof(struct cfq_group, stats.service_time),
2da8ca82 2079 .seq_show = cfqg_print_rwstat_recursive,
43114018
TH
2080 },
2081 {
2082 .name = "io_wait_time_recursive",
2083 .private = offsetof(struct cfq_group, stats.wait_time),
2da8ca82 2084 .seq_show = cfqg_print_rwstat_recursive,
43114018
TH
2085 },
2086 {
2087 .name = "io_merged_recursive",
2088 .private = offsetof(struct cfq_group, stats.merged),
2da8ca82 2089 .seq_show = cfqg_print_rwstat_recursive,
43114018
TH
2090 },
2091 {
2092 .name = "io_queued_recursive",
2093 .private = offsetof(struct cfq_group, stats.queued),
2da8ca82 2094 .seq_show = cfqg_print_rwstat_recursive,
43114018 2095 },
60c2bc2d
TH
2096#ifdef CONFIG_DEBUG_BLK_CGROUP
2097 {
2098 .name = "avg_queue_size",
2da8ca82 2099 .seq_show = cfqg_print_avg_queue_size,
60c2bc2d
TH
2100 },
2101 {
2102 .name = "group_wait_time",
5bc4afb1 2103 .private = offsetof(struct cfq_group, stats.group_wait_time),
2da8ca82 2104 .seq_show = cfqg_print_stat,
60c2bc2d
TH
2105 },
2106 {
2107 .name = "idle_time",
5bc4afb1 2108 .private = offsetof(struct cfq_group, stats.idle_time),
2da8ca82 2109 .seq_show = cfqg_print_stat,
60c2bc2d
TH
2110 },
2111 {
2112 .name = "empty_time",
5bc4afb1 2113 .private = offsetof(struct cfq_group, stats.empty_time),
2da8ca82 2114 .seq_show = cfqg_print_stat,
60c2bc2d
TH
2115 },
2116 {
2117 .name = "dequeue",
5bc4afb1 2118 .private = offsetof(struct cfq_group, stats.dequeue),
2da8ca82 2119 .seq_show = cfqg_print_stat,
60c2bc2d
TH
2120 },
2121 {
2122 .name = "unaccounted_time",
5bc4afb1 2123 .private = offsetof(struct cfq_group, stats.unaccounted_time),
2da8ca82 2124 .seq_show = cfqg_print_stat,
60c2bc2d
TH
2125 },
2126#endif /* CONFIG_DEBUG_BLK_CGROUP */
2127 { } /* terminate */
2128};
2ee867dc
TH
2129
2130static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v)
2131{
2132 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2133 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
2134
2135 seq_printf(sf, "default %u\n", cgd->weight);
2136 blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device,
2137 &blkcg_policy_cfq, 0, false);
2138 return 0;
2139}
2140
2141static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of,
2142 char *buf, size_t nbytes, loff_t off)
2143{
2144 char *endp;
2145 int ret;
2146 u64 v;
2147
2148 buf = strim(buf);
2149
2150 /* "WEIGHT" or "default WEIGHT" sets the default weight */
2151 v = simple_strtoull(buf, &endp, 0);
2152 if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
69d7fde5 2153 ret = __cfq_set_weight(of_css(of), v, true, false, false);
2ee867dc
TH
2154 return ret ?: nbytes;
2155 }
2156
2157 /* "MAJ:MIN WEIGHT" */
2158 return __cfqg_set_weight_device(of, buf, nbytes, off, true, false);
2159}
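/*
 * Illustrative usage on the unified hierarchy (device numbers made up):
 *
 *   echo "default 300"  > io.weight   # group-wide weight
 *   echo "8:16 50"      > io.weight   # per-device override
 *   echo "8:16 default" > io.weight   # drop the override
 */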
2160
2161static struct cftype cfq_blkcg_files[] = {
2162 {
2163 .name = "weight",
2164 .flags = CFTYPE_NOT_ON_ROOT,
2165 .seq_show = cfq_print_weight_on_dfl,
2166 .write = cfq_set_weight_on_dfl,
2167 },
2168 { } /* terminate */
2169};
2170
25fb5169 2171#else /* GROUP_IOSCHED */
ae118896
TH
2172static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
2173 struct blkcg *blkcg)
25fb5169 2174{
f51b802c 2175 return cfqd->root_group;
25fb5169 2176}
7f1dc8a2 2177
25fb5169
VG
2178static inline void
2179cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
2180 cfqq->cfqg = cfqg;
2181}
2182
2183#endif /* GROUP_IOSCHED */
2184
498d3aa2 2185/*
c0324a02 2186 * The cfqd->service_trees holds all pending cfq_queue's that have
498d3aa2
JA
2187 * requests waiting to be processed. It is sorted in the order that
2188 * we will service the queues.
2189 */
a36e71f9 2190static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
a6151c3a 2191 bool add_front)
d9e7620e 2192{
0871714e
JA
2193 struct rb_node **p, *parent;
2194 struct cfq_queue *__cfqq;
9a7f38c4 2195 u64 rb_key;
34b98d03 2196 struct cfq_rb_root *st;
09663c86 2197 bool leftmost = true;
dae739eb 2198 int new_cfqq = 1;
9a7f38c4 2199 u64 now = ktime_get_ns();
ae30c286 2200
34b98d03 2201 st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq));
0871714e
JA
2202 if (cfq_class_idle(cfqq)) {
2203 rb_key = CFQ_IDLE_DELAY;
09663c86 2204 parent = rb_last(&st->rb.rb_root);
0871714e
JA
2205 if (parent && parent != &cfqq->rb_node) {
2206 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
2207 rb_key += __cfqq->rb_key;
2208 } else
9a7f38c4 2209 rb_key += now;
0871714e 2210 } else if (!add_front) {
b9c8946b
JA
2211 /*
2212 * Get our rb key offset. Subtract any residual slice
2213 * value carried from last service. A negative resid
2214 * count indicates slice overrun, and this should position
2215 * the next service time further away in the tree.
2216 */
9a7f38c4 2217 rb_key = cfq_slice_offset(cfqd, cfqq) + now;
b9c8946b 2218 rb_key -= cfqq->slice_resid;
edd75ffd 2219 cfqq->slice_resid = 0;
48e025e6 2220 } else {
9a7f38c4 2221 rb_key = -NSEC_PER_SEC;
34b98d03 2222 __cfqq = cfq_rb_first(st);
9a7f38c4 2223 rb_key += __cfqq ? __cfqq->rb_key : now;
48e025e6 2224 }
1da177e4 2225
d9e7620e 2226 if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
dae739eb 2227 new_cfqq = 0;
99f9628a 2228 /*
d9e7620e 2229 * same position, nothing more to do
99f9628a 2230 */
34b98d03 2231 if (rb_key == cfqq->rb_key && cfqq->service_tree == st)
d9e7620e 2232 return;
1da177e4 2233
aa6f6a3d
CZ
2234 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
2235 cfqq->service_tree = NULL;
1da177e4 2236 }
d9e7620e 2237
0871714e 2238 parent = NULL;
34b98d03 2239 cfqq->service_tree = st;
09663c86 2240 p = &st->rb.rb_root.rb_node;
d9e7620e
JA
2241 while (*p) {
2242 parent = *p;
2243 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
2244
0c534e0a 2245 /*
c0324a02 2246 * sort by key, that represents service time.
0c534e0a 2247 */
9a7f38c4 2248 if (rb_key < __cfqq->rb_key)
1f23f121 2249 p = &parent->rb_left;
c0324a02 2250 else {
1f23f121 2251 p = &parent->rb_right;
09663c86 2252 leftmost = false;
c0324a02 2253 }
d9e7620e
JA
2254 }
2255
2256 cfqq->rb_key = rb_key;
2257 rb_link_node(&cfqq->rb_node, parent, p);
09663c86 2258 rb_insert_color_cached(&cfqq->rb_node, &st->rb, leftmost);
34b98d03 2259 st->count++;
20359f27 2260 if (add_front || !new_cfqq)
dae739eb 2261 return;
8184f93e 2262 cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
1da177e4
LT
2263}
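/*
 * Rough rb_key arithmetic with made-up numbers: a sync queue re-added at
 * now = 100s with a slice offset of 40ms and 10ms of unused slice left
 * over is keyed at 100s + 40ms - 10ms; an idle-class queue is keyed
 * CFQ_IDLE_DELAY past the current last entry, and an add_front queue is
 * keyed roughly one second before the current leftmost entry so that it
 * is served next.
 */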
2264
a36e71f9 2265static struct cfq_queue *
f2d1f0ae
JA
2266cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
2267 sector_t sector, struct rb_node **ret_parent,
2268 struct rb_node ***rb_link)
a36e71f9 2269{
a36e71f9
JA
2270 struct rb_node **p, *parent;
2271 struct cfq_queue *cfqq = NULL;
2272
2273 parent = NULL;
2274 p = &root->rb_node;
2275 while (*p) {
2276 struct rb_node **n;
2277
2278 parent = *p;
2279 cfqq = rb_entry(parent, struct cfq_queue, p_node);
2280
2281 /*
2282 * Sort strictly based on sector. Smallest to the left,
2283 * largest to the right.
2284 */
2e46e8b2 2285 if (sector > blk_rq_pos(cfqq->next_rq))
a36e71f9 2286 n = &(*p)->rb_right;
2e46e8b2 2287 else if (sector < blk_rq_pos(cfqq->next_rq))
a36e71f9
JA
2288 n = &(*p)->rb_left;
2289 else
2290 break;
2291 p = n;
3ac6c9f8 2292 cfqq = NULL;
a36e71f9
JA
2293 }
2294
2295 *ret_parent = parent;
2296 if (rb_link)
2297 *rb_link = p;
3ac6c9f8 2298 return cfqq;
a36e71f9
JA
2299}
2300
2301static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2302{
a36e71f9
JA
2303 struct rb_node **p, *parent;
2304 struct cfq_queue *__cfqq;
2305
f2d1f0ae
JA
2306 if (cfqq->p_root) {
2307 rb_erase(&cfqq->p_node, cfqq->p_root);
2308 cfqq->p_root = NULL;
2309 }
a36e71f9
JA
2310
2311 if (cfq_class_idle(cfqq))
2312 return;
2313 if (!cfqq->next_rq)
2314 return;
2315
f2d1f0ae 2316 cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
2e46e8b2
TH
2317 __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
2318 blk_rq_pos(cfqq->next_rq), &parent, &p);
3ac6c9f8
JA
2319 if (!__cfqq) {
2320 rb_link_node(&cfqq->p_node, parent, p);
f2d1f0ae
JA
2321 rb_insert_color(&cfqq->p_node, cfqq->p_root);
2322 } else
2323 cfqq->p_root = NULL;
a36e71f9
JA
2324}
2325
498d3aa2
JA
2326/*
2327 * Update cfqq's position in the service tree.
2328 */
edd75ffd 2329static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
6d048f53 2330{
6d048f53
JA
2331 /*
2332 * Resorting requires the cfqq to be on the RR list already.
2333 */
a36e71f9 2334 if (cfq_cfqq_on_rr(cfqq)) {
edd75ffd 2335 cfq_service_tree_add(cfqd, cfqq, 0);
a36e71f9
JA
2336 cfq_prio_tree_add(cfqd, cfqq);
2337 }
6d048f53
JA
2338}
2339
1da177e4
LT
2340/*
2341 * add to busy list of queues for service, trying to be fair in ordering
22e2c507 2342 * the pending list according to last request service
1da177e4 2343 */
febffd61 2344static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1da177e4 2345{
7b679138 2346 cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
3b18152c
JA
2347 BUG_ON(cfq_cfqq_on_rr(cfqq));
2348 cfq_mark_cfqq_on_rr(cfqq);
1da177e4 2349 cfqd->busy_queues++;
ef8a41df
SL
2350 if (cfq_cfqq_sync(cfqq))
2351 cfqd->busy_sync_queues++;
1da177e4 2352
edd75ffd 2353 cfq_resort_rr_list(cfqd, cfqq);
1da177e4
LT
2354}
2355
498d3aa2
JA
2356/*
2357 * Called when the cfqq no longer has requests pending, remove it from
2358 * the service tree.
2359 */
febffd61 2360static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1da177e4 2361{
7b679138 2362 cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
3b18152c
JA
2363 BUG_ON(!cfq_cfqq_on_rr(cfqq));
2364 cfq_clear_cfqq_on_rr(cfqq);
1da177e4 2365
aa6f6a3d
CZ
2366 if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
2367 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
2368 cfqq->service_tree = NULL;
2369 }
f2d1f0ae
JA
2370 if (cfqq->p_root) {
2371 rb_erase(&cfqq->p_node, cfqq->p_root);
2372 cfqq->p_root = NULL;
2373 }
d9e7620e 2374
8184f93e 2375 cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
1da177e4
LT
2376 BUG_ON(!cfqd->busy_queues);
2377 cfqd->busy_queues--;
ef8a41df
SL
2378 if (cfq_cfqq_sync(cfqq))
2379 cfqd->busy_sync_queues--;
1da177e4
LT
2380}
2381
2382/*
2383 * rb tree support functions
2384 */
febffd61 2385static void cfq_del_rq_rb(struct request *rq)
1da177e4 2386{
5e705374 2387 struct cfq_queue *cfqq = RQ_CFQQ(rq);
5e705374 2388 const int sync = rq_is_sync(rq);
1da177e4 2389
b4878f24
JA
2390 BUG_ON(!cfqq->queued[sync]);
2391 cfqq->queued[sync]--;
1da177e4 2392
5e705374 2393 elv_rb_del(&cfqq->sort_list, rq);
1da177e4 2394
f04a6424
VG
2395 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
2396 /*
2397 * Queue will be deleted from service tree when we actually
2398 * expire it later. Right now just remove it from prio tree
2399 * as it is empty.
2400 */
2401 if (cfqq->p_root) {
2402 rb_erase(&cfqq->p_node, cfqq->p_root);
2403 cfqq->p_root = NULL;
2404 }
2405 }
1da177e4
LT
2406}
2407
5e705374 2408static void cfq_add_rq_rb(struct request *rq)
1da177e4 2409{
5e705374 2410 struct cfq_queue *cfqq = RQ_CFQQ(rq);
1da177e4 2411 struct cfq_data *cfqd = cfqq->cfqd;
796d5116 2412 struct request *prev;
1da177e4 2413
5380a101 2414 cfqq->queued[rq_is_sync(rq)]++;
1da177e4 2415
796d5116 2416 elv_rb_add(&cfqq->sort_list, rq);
5fccbf61
JA
2417
2418 if (!cfq_cfqq_on_rr(cfqq))
2419 cfq_add_cfqq_rr(cfqd, cfqq);
5044eed4
JA
2420
2421 /*
2422 * check if this request is a better next-serve candidate
2423 */
a36e71f9 2424 prev = cfqq->next_rq;
cf7c25cf 2425 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
a36e71f9
JA
2426
2427 /*
2428 * adjust priority tree position, if ->next_rq changes
2429 */
2430 if (prev != cfqq->next_rq)
2431 cfq_prio_tree_add(cfqd, cfqq);
2432
5044eed4 2433 BUG_ON(!cfqq->next_rq);
1da177e4
LT
2434}
2435
febffd61 2436static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
1da177e4 2437{
5380a101
JA
2438 elv_rb_del(&cfqq->sort_list, rq);
2439 cfqq->queued[rq_is_sync(rq)]--;
ef295ecf 2440 cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
5e705374 2441 cfq_add_rq_rb(rq);
155fead9 2442 cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,
ef295ecf 2443 rq->cmd_flags);
1da177e4
LT
2444}
2445
206dc69b
JA
2446static struct request *
2447cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
1da177e4 2448{
206dc69b 2449 struct task_struct *tsk = current;
c5869807 2450 struct cfq_io_cq *cic;
206dc69b 2451 struct cfq_queue *cfqq;
1da177e4 2452
4ac845a2 2453 cic = cfq_cic_lookup(cfqd, tsk->io_context);
91fac317
VT
2454 if (!cic)
2455 return NULL;
2456
aa39ebd4 2457 cfqq = cic_to_cfqq(cic, op_is_sync(bio->bi_opf));
f73a1c7d
KO
2458 if (cfqq)
2459 return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio));
1da177e4 2460
1da177e4
LT
2461 return NULL;
2462}
2463
165125e1 2464static void cfq_activate_request(struct request_queue *q, struct request *rq)
1da177e4 2465{
22e2c507 2466 struct cfq_data *cfqd = q->elevator->elevator_data;
3b18152c 2467
53c583d2 2468 cfqd->rq_in_driver++;
7b679138 2469 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
53c583d2 2470 cfqd->rq_in_driver);
25776e35 2471
5b93629b 2472 cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
1da177e4
LT
2473}
2474
165125e1 2475static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
1da177e4 2476{
b4878f24
JA
2477 struct cfq_data *cfqd = q->elevator->elevator_data;
2478
53c583d2
CZ
2479 WARN_ON(!cfqd->rq_in_driver);
2480 cfqd->rq_in_driver--;
7b679138 2481 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
53c583d2 2482 cfqd->rq_in_driver);
1da177e4
LT
2483}
2484
b4878f24 2485static void cfq_remove_request(struct request *rq)
1da177e4 2486{
5e705374 2487 struct cfq_queue *cfqq = RQ_CFQQ(rq);
21183b07 2488
5e705374
JA
2489 if (cfqq->next_rq == rq)
2490 cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
1da177e4 2491
b4878f24 2492 list_del_init(&rq->queuelist);
5e705374 2493 cfq_del_rq_rb(rq);
374f84ac 2494
45333d5a 2495 cfqq->cfqd->rq_queued--;
ef295ecf 2496 cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
65299a3b
CH
2497 if (rq->cmd_flags & REQ_PRIO) {
2498 WARN_ON(!cfqq->prio_pending);
2499 cfqq->prio_pending--;
b53d1ed7 2500 }
1da177e4
LT
2501}
2502
34fe7c05 2503static enum elv_merge cfq_merge(struct request_queue *q, struct request **req,
165125e1 2504 struct bio *bio)
1da177e4
LT
2505{
2506 struct cfq_data *cfqd = q->elevator->elevator_data;
2507 struct request *__rq;
1da177e4 2508
206dc69b 2509 __rq = cfq_find_rq_fmerge(cfqd, bio);
72ef799b 2510 if (__rq && elv_bio_merge_ok(__rq, bio)) {
9817064b
JA
2511 *req = __rq;
2512 return ELEVATOR_FRONT_MERGE;
1da177e4
LT
2513 }
2514
2515 return ELEVATOR_NO_MERGE;
1da177e4
LT
2516}
2517
165125e1 2518static void cfq_merged_request(struct request_queue *q, struct request *req,
34fe7c05 2519 enum elv_merge type)
1da177e4 2520{
21183b07 2521 if (type == ELEVATOR_FRONT_MERGE) {
5e705374 2522 struct cfq_queue *cfqq = RQ_CFQQ(req);
1da177e4 2523
5e705374 2524 cfq_reposition_rq_rb(cfqq, req);
1da177e4 2525 }
1da177e4
LT
2526}
2527
812d4026
DS
2528static void cfq_bio_merged(struct request_queue *q, struct request *req,
2529 struct bio *bio)
2530{
ef295ecf 2531 cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_opf);
812d4026
DS
2532}
2533
1da177e4 2534static void
165125e1 2535cfq_merged_requests(struct request_queue *q, struct request *rq,
1da177e4
LT
2536 struct request *next)
2537{
cf7c25cf 2538 struct cfq_queue *cfqq = RQ_CFQQ(rq);
4a0b75c7
SL
2539 struct cfq_data *cfqd = q->elevator->elevator_data;
2540
22e2c507
JA
2541 /*
2542 * reposition in fifo if next is older than rq
2543 */
2544 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
9a7f38c4 2545 next->fifo_time < rq->fifo_time &&
3d106fba 2546 cfqq == RQ_CFQQ(next)) {
22e2c507 2547 list_move(&rq->queuelist, &next->queuelist);
8b4922d3 2548 rq->fifo_time = next->fifo_time;
30996f40 2549 }
22e2c507 2550
cf7c25cf
CZ
2551 if (cfqq->next_rq == next)
2552 cfqq->next_rq = rq;
b4878f24 2553 cfq_remove_request(next);
ef295ecf 2554 cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);
4a0b75c7
SL
2555
2556 cfqq = RQ_CFQQ(next);
2557 /*
 2558 * All requests of this queue have been merged into other queues; delete it
 2559 * from the service tree. If it is the active_queue,
 2560 * cfq_dispatch_requests() will choose to expire it or to idle.
2561 */
2562 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
2563 cfqq != cfqd->active_queue)
2564 cfq_del_cfqq_rr(cfqd, cfqq);
22e2c507
JA
2565}
2566
72ef799b
TE
2567static int cfq_allow_bio_merge(struct request_queue *q, struct request *rq,
2568 struct bio *bio)
da775265
JA
2569{
2570 struct cfq_data *cfqd = q->elevator->elevator_data;
aa39ebd4 2571 bool is_sync = op_is_sync(bio->bi_opf);
c5869807 2572 struct cfq_io_cq *cic;
da775265 2573 struct cfq_queue *cfqq;
da775265
JA
2574
2575 /*
ec8acb69 2576 * Disallow merge of a sync bio into an async request.
da775265 2577 */
aa39ebd4 2578 if (is_sync && !rq_is_sync(rq))
a6151c3a 2579 return false;
da775265
JA
2580
2581 /*
f1a4f4d3 2582 * Lookup the cfqq that this bio will be queued with and allow
07c2bd37 2583 * merge only if rq is queued there.
f1a4f4d3 2584 */
07c2bd37
TH
2585 cic = cfq_cic_lookup(cfqd, current->io_context);
2586 if (!cic)
2587 return false;
719d3402 2588
aa39ebd4 2589 cfqq = cic_to_cfqq(cic, is_sync);
a6151c3a 2590 return cfqq == RQ_CFQQ(rq);
da775265
JA
2591}
2592
72ef799b
TE
2593static int cfq_allow_rq_merge(struct request_queue *q, struct request *rq,
2594 struct request *next)
2595{
2596 return RQ_CFQQ(rq) == RQ_CFQQ(next);
2597}
2598
812df48d
DS
2599static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2600{
91148325 2601 hrtimer_try_to_cancel(&cfqd->idle_slice_timer);
155fead9 2602 cfqg_stats_update_idle_time(cfqq->cfqg);
812df48d
DS
2603}
2604
febffd61
JA
2605static void __cfq_set_active_queue(struct cfq_data *cfqd,
2606 struct cfq_queue *cfqq)
22e2c507
JA
2607{
2608 if (cfqq) {
3bf10fea 2609 cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d",
4d2ceea4 2610 cfqd->serving_wl_class, cfqd->serving_wl_type);
155fead9 2611 cfqg_stats_update_avg_queue_size(cfqq->cfqg);
62a37f6b 2612 cfqq->slice_start = 0;
9a7f38c4 2613 cfqq->dispatch_start = ktime_get_ns();
62a37f6b
JT
2614 cfqq->allocated_slice = 0;
2615 cfqq->slice_end = 0;
2616 cfqq->slice_dispatch = 0;
2617 cfqq->nr_sectors = 0;
2618
2619 cfq_clear_cfqq_wait_request(cfqq);
2620 cfq_clear_cfqq_must_dispatch(cfqq);
2621 cfq_clear_cfqq_must_alloc_slice(cfqq);
2622 cfq_clear_cfqq_fifo_expire(cfqq);
2623 cfq_mark_cfqq_slice_new(cfqq);
2624
2625 cfq_del_timer(cfqd, cfqq);
22e2c507
JA
2626 }
2627
2628 cfqd->active_queue = cfqq;
2629}
2630
7b14e3b5
JA
2631/*
2632 * current cfqq expired its slice (or was too idle), select new one
2633 */
2634static void
2635__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
e5ff082e 2636 bool timed_out)
7b14e3b5 2637{
7b679138
JA
2638 cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
2639
7b14e3b5 2640 if (cfq_cfqq_wait_request(cfqq))
812df48d 2641 cfq_del_timer(cfqd, cfqq);
7b14e3b5 2642
7b14e3b5 2643 cfq_clear_cfqq_wait_request(cfqq);
f75edf2d 2644 cfq_clear_cfqq_wait_busy(cfqq);
7b14e3b5 2645
ae54abed
SL
2646 /*
2647 * If this cfqq is shared between multiple processes, check to
2648 * make sure that those processes are still issuing I/Os within
2649 * the mean seek distance. If not, it may be time to break the
2650 * queues apart again.
2651 */
2652 if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
2653 cfq_mark_cfqq_split_coop(cfqq);
2654
7b14e3b5 2655 /*
6084cdda 2656 * store what was left of this slice, if the queue idled/timed out
7b14e3b5 2657 */
c553f8e3
SL
2658 if (timed_out) {
2659 if (cfq_cfqq_slice_new(cfqq))
ba5bd520 2660 cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
c553f8e3 2661 else
9a7f38c4 2662 cfqq->slice_resid = cfqq->slice_end - ktime_get_ns();
93fdf147 2663 cfq_log_cfqq(cfqd, cfqq, "resid=%lld", cfqq->slice_resid);
7b679138 2664 }
7b14e3b5 2665
e5ff082e 2666 cfq_group_served(cfqd, cfqq->cfqg, cfqq);
dae739eb 2667
f04a6424
VG
2668 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
2669 cfq_del_cfqq_rr(cfqd, cfqq);
2670
edd75ffd 2671 cfq_resort_rr_list(cfqd, cfqq);
7b14e3b5
JA
2672
2673 if (cfqq == cfqd->active_queue)
2674 cfqd->active_queue = NULL;
2675
2676 if (cfqd->active_cic) {
11a3122f 2677 put_io_context(cfqd->active_cic->icq.ioc);
7b14e3b5
JA
2678 cfqd->active_cic = NULL;
2679 }
7b14e3b5
JA
2680}
2681
e5ff082e 2682static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
7b14e3b5
JA
2683{
2684 struct cfq_queue *cfqq = cfqd->active_queue;
2685
2686 if (cfqq)
e5ff082e 2687 __cfq_slice_expired(cfqd, cfqq, timed_out);
7b14e3b5
JA
2688}
2689
498d3aa2
JA
2690/*
2691 * Get next queue for service. Unless we have a queue preemption,
2692 * we'll simply select the first cfqq in the service tree.
2693 */
6d048f53 2694static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
22e2c507 2695{
34b98d03
VG
2696 struct cfq_rb_root *st = st_for(cfqd->serving_group,
2697 cfqd->serving_wl_class, cfqd->serving_wl_type);
d9e7620e 2698
f04a6424
VG
2699 if (!cfqd->rq_queued)
2700 return NULL;
2701
1fa8f6d6 2702 /* There is nothing to dispatch */
34b98d03 2703 if (!st)
1fa8f6d6 2704 return NULL;
09663c86 2705 if (RB_EMPTY_ROOT(&st->rb.rb_root))
c0324a02 2706 return NULL;
34b98d03 2707 return cfq_rb_first(st);
6d048f53
JA
2708}
2709
f04a6424
VG
2710static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
2711{
25fb5169 2712 struct cfq_group *cfqg;
f04a6424
VG
2713 struct cfq_queue *cfqq;
2714 int i, j;
2715 struct cfq_rb_root *st;
2716
2717 if (!cfqd->rq_queued)
2718 return NULL;
2719
25fb5169
VG
2720 cfqg = cfq_get_next_cfqg(cfqd);
2721 if (!cfqg)
2722 return NULL;
2723
1cf41753
ME
2724 for_each_cfqg_st(cfqg, i, j, st) {
2725 cfqq = cfq_rb_first(st);
2726 if (cfqq)
f04a6424 2727 return cfqq;
1cf41753 2728 }
f04a6424
VG
2729 return NULL;
2730}
2731
498d3aa2
JA
2732/*
2733 * Get and set a new active queue for service.
2734 */
a36e71f9
JA
2735static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
2736 struct cfq_queue *cfqq)
6d048f53 2737{
e00ef799 2738 if (!cfqq)
a36e71f9 2739 cfqq = cfq_get_next_queue(cfqd);
6d048f53 2740
22e2c507 2741 __cfq_set_active_queue(cfqd, cfqq);
3b18152c 2742 return cfqq;
22e2c507
JA
2743}
2744
d9e7620e
JA
2745static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
2746 struct request *rq)
2747{
83096ebf
TH
2748 if (blk_rq_pos(rq) >= cfqd->last_position)
2749 return blk_rq_pos(rq) - cfqd->last_position;
d9e7620e 2750 else
83096ebf 2751 return cfqd->last_position - blk_rq_pos(rq);
d9e7620e
JA
2752}
2753
b2c18e1e 2754static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
e9ce335d 2755 struct request *rq)
6d048f53 2756{
e9ce335d 2757 return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
6d048f53
JA
2758}
2759
a36e71f9
JA
2760static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
2761 struct cfq_queue *cur_cfqq)
2762{
f2d1f0ae 2763 struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
a36e71f9
JA
2764 struct rb_node *parent, *node;
2765 struct cfq_queue *__cfqq;
2766 sector_t sector = cfqd->last_position;
2767
2768 if (RB_EMPTY_ROOT(root))
2769 return NULL;
2770
2771 /*
2772 * First, if we find a request starting at the end of the last
2773 * request, choose it.
2774 */
f2d1f0ae 2775 __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
a36e71f9
JA
2776 if (__cfqq)
2777 return __cfqq;
2778
2779 /*
2780 * If the exact sector wasn't found, the parent of the NULL leaf
2781 * will contain the closest sector.
2782 */
2783 __cfqq = rb_entry(parent, struct cfq_queue, p_node);
e9ce335d 2784 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
a36e71f9
JA
2785 return __cfqq;
2786
2e46e8b2 2787 if (blk_rq_pos(__cfqq->next_rq) < sector)
a36e71f9
JA
2788 node = rb_next(&__cfqq->p_node);
2789 else
2790 node = rb_prev(&__cfqq->p_node);
2791 if (!node)
2792 return NULL;
2793
2794 __cfqq = rb_entry(node, struct cfq_queue, p_node);
e9ce335d 2795 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
a36e71f9
JA
2796 return __cfqq;
2797
2798 return NULL;
2799}
2800
2801/*
2802 * cfqd - obvious
2803 * cur_cfqq - passed in so that we don't decide that the current queue is
2804 * closely cooperating with itself.
2805 *
 2806 * So, basically we're assuming that cur_cfqq has dispatched at least
2807 * one request, and that cfqd->last_position reflects a position on the disk
2808 * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid
2809 * assumption.
2810 */
2811static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
b3b6d040 2812 struct cfq_queue *cur_cfqq)
6d048f53 2813{
a36e71f9
JA
2814 struct cfq_queue *cfqq;
2815
39c01b21
DS
2816 if (cfq_class_idle(cur_cfqq))
2817 return NULL;
e6c5bc73
JM
2818 if (!cfq_cfqq_sync(cur_cfqq))
2819 return NULL;
2820 if (CFQQ_SEEKY(cur_cfqq))
2821 return NULL;
2822
b9d8f4c7
GJ
2823 /*
2824 * Don't search priority tree if it's the only queue in the group.
2825 */
2826 if (cur_cfqq->cfqg->nr_cfqq == 1)
2827 return NULL;
2828
6d048f53 2829 /*
d9e7620e
JA
 2830 * We should notice if some of the queues are cooperating, e.g.
 2831 * working closely on the same area of the disk. In that case,
 2832 * we can group them together and not waste time idling.
6d048f53 2833 */
a36e71f9
JA
2834 cfqq = cfqq_close(cfqd, cur_cfqq);
2835 if (!cfqq)
2836 return NULL;
2837
8682e1f1
VG
2838 /* If new queue belongs to different cfq_group, don't choose it */
2839 if (cur_cfqq->cfqg != cfqq->cfqg)
2840 return NULL;
2841
df5fe3e8
JM
2842 /*
2843 * It only makes sense to merge sync queues.
2844 */
2845 if (!cfq_cfqq_sync(cfqq))
2846 return NULL;
e6c5bc73
JM
2847 if (CFQQ_SEEKY(cfqq))
2848 return NULL;
df5fe3e8 2849
c0324a02
CZ
2850 /*
2851 * Do not merge queues of different priority classes
2852 */
2853 if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
2854 return NULL;
2855
a36e71f9 2856 return cfqq;
6d048f53
JA
2857}
2858
a6d44e98
CZ
2859/*
2860 * Determine whether we should enforce idle window for this queue.
2861 */
2862
2863static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2864{
3bf10fea 2865 enum wl_class_t wl_class = cfqq_class(cfqq);
34b98d03 2866 struct cfq_rb_root *st = cfqq->service_tree;
a6d44e98 2867
34b98d03
VG
2868 BUG_ON(!st);
2869 BUG_ON(!st->count);
f04a6424 2870
b6508c16
VG
2871 if (!cfqd->cfq_slice_idle)
2872 return false;
2873
a6d44e98 2874 /* We never do for idle class queues. */
3bf10fea 2875 if (wl_class == IDLE_WORKLOAD)
a6d44e98
CZ
2876 return false;
2877
2878 /* We do for queues that were marked with idle window flag. */
3c764b7a
SL
2879 if (cfq_cfqq_idle_window(cfqq) &&
2880 !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
a6d44e98
CZ
2881 return true;
2882
2883 /*
2884 * Otherwise, we do only if they are the last ones
2885 * in their service tree.
2886 */
34b98d03
VG
2887 if (st->count == 1 && cfq_cfqq_sync(cfqq) &&
2888 !cfq_io_thinktime_big(cfqd, &st->ttime, false))
c1e44756 2889 return true;
34b98d03 2890 cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count);
c1e44756 2891 return false;
a6d44e98
CZ
2892}
2893
6d048f53 2894static void cfq_arm_slice_timer(struct cfq_data *cfqd)
22e2c507 2895{
1792669c 2896 struct cfq_queue *cfqq = cfqd->active_queue;
e795421e 2897 struct cfq_rb_root *st = cfqq->service_tree;
c5869807 2898 struct cfq_io_cq *cic;
9a7f38c4
JM
2899 u64 sl, group_idle = 0;
2900 u64 now = ktime_get_ns();
7b14e3b5 2901
a68bbddb 2902 /*
f7d7b7a7
JA
2903 * SSD device without seek penalty, disable idling. But only do so
2904 * for devices that support queuing, otherwise we still have a problem
2905 * with sync vs async workloads.
a68bbddb 2906 */
b3193bc0
RH
2907 if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag &&
2908 !cfqd->cfq_group_idle)
a68bbddb
JA
2909 return;
2910
dd67d051 2911 WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
6d048f53 2912 WARN_ON(cfq_cfqq_slice_new(cfqq));
22e2c507
JA
2913
2914 /*
2915 * idle is disabled, either manually or by past process history
2916 */
80bdf0c7
VG
2917 if (!cfq_should_idle(cfqd, cfqq)) {
2918 /* no queue idling. Check for group idling */
2919 if (cfqd->cfq_group_idle)
2920 group_idle = cfqd->cfq_group_idle;
2921 else
2922 return;
2923 }
6d048f53 2924
7b679138 2925 /*
8e550632 2926 * still active requests from this queue, don't idle
7b679138 2927 */
8e550632 2928 if (cfqq->dispatched)
7b679138
JA
2929 return;
2930
22e2c507
JA
2931 /*
2932 * task has exited, don't wait
2933 */
206dc69b 2934 cic = cfqd->active_cic;
f6e8d01b 2935 if (!cic || !atomic_read(&cic->icq.ioc->active_ref))
6d048f53
JA
2936 return;
2937
355b659c
CZ
2938 /*
2939 * If our average think time is larger than the remaining time
2940 * slice, then don't idle. This avoids overrunning the allotted
2941 * time slice.
2942 */
383cd721 2943 if (sample_valid(cic->ttime.ttime_samples) &&
9a7f38c4
JM
2944 (cfqq->slice_end - now < cic->ttime.ttime_mean)) {
2945 cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%llu",
383cd721 2946 cic->ttime.ttime_mean);
355b659c 2947 return;
b1ffe737 2948 }
355b659c 2949
e795421e
JK
2950 /*
2951 * There are other queues in the group or this is the only group and
2952 * it has too big thinktime, don't do group idle.
2953 */
2954 if (group_idle &&
2955 (cfqq->cfqg->nr_cfqq > 1 ||
2956 cfq_io_thinktime_big(cfqd, &st->ttime, true)))
80bdf0c7
VG
2957 return;
2958
3b18152c 2959 cfq_mark_cfqq_wait_request(cfqq);
22e2c507 2960
80bdf0c7
VG
2961 if (group_idle)
2962 sl = cfqd->cfq_group_idle;
2963 else
2964 sl = cfqd->cfq_slice_idle;
206dc69b 2965
91148325
JK
2966 hrtimer_start(&cfqd->idle_slice_timer, ns_to_ktime(sl),
2967 HRTIMER_MODE_REL);
155fead9 2968 cfqg_stats_set_start_idle_time(cfqq->cfqg);
9a7f38c4 2969 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %llu group_idle: %d", sl,
80bdf0c7 2970 group_idle ? 1 : 0);
1da177e4
LT
2971}
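/*
 * Example of the think-time check above: with 3ms of slice remaining and
 * a mean think time of 5ms, arming the idle timer would likely overrun
 * the slice, so idling is skipped; otherwise the hrtimer is armed for
 * cfq_slice_idle (or cfq_group_idle) nanoseconds.
 */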
2972
498d3aa2
JA
2973/*
2974 * Move request from internal lists to the request queue dispatch list.
2975 */
165125e1 2976static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
1da177e4 2977{
3ed9a296 2978 struct cfq_data *cfqd = q->elevator->elevator_data;
5e705374 2979 struct cfq_queue *cfqq = RQ_CFQQ(rq);
22e2c507 2980
7b679138
JA
2981 cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
2982
06d21886 2983 cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
5380a101 2984 cfq_remove_request(rq);
6d048f53 2985 cfqq->dispatched++;
80bdf0c7 2986 (RQ_CFQG(rq))->dispatched++;
5380a101 2987 elv_dispatch_sort(q, rq);
3ed9a296 2988
53c583d2 2989 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
c4e7893e 2990 cfqq->nr_sectors += blk_rq_sectors(rq);
1da177e4
LT
2991}
2992
2993/*
2994 * return expired entry, or NULL to just start from scratch in rbtree
2995 */
febffd61 2996static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
1da177e4 2997{
30996f40 2998 struct request *rq = NULL;
1da177e4 2999
3b18152c 3000 if (cfq_cfqq_fifo_expire(cfqq))
1da177e4 3001 return NULL;
cb887411
JA
3002
3003 cfq_mark_cfqq_fifo_expire(cfqq);
3004
89850f7e
JA
3005 if (list_empty(&cfqq->fifo))
3006 return NULL;
1da177e4 3007
89850f7e 3008 rq = rq_entry_fifo(cfqq->fifo.next);
9a7f38c4 3009 if (ktime_get_ns() < rq->fifo_time)
7b679138 3010 rq = NULL;
1da177e4 3011
6d048f53 3012 return rq;
1da177e4
LT
3013}
3014
22e2c507
JA
3015static inline int
3016cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3017{
3018 const int base_rq = cfqd->cfq_slice_async_rq;
1da177e4 3019
22e2c507 3020 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
1da177e4 3021
b9f8ce05 3022 return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
1da177e4
LT
3023}
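/*
 * Worked example with the default cfq_slice_async_rq of 2: an ioprio 4
 * best-effort queue may dispatch up to 2 * 2 * (8 - 4) = 16 requests per
 * slice, while ioprio 7 is capped at 4.
 */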
3024
df5fe3e8
JM
3025/*
3026 * Must be called with the queue_lock held.
3027 */
3028static int cfqq_process_refs(struct cfq_queue *cfqq)
3029{
3030 int process_refs, io_refs;
3031
3032 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
30d7b944 3033 process_refs = cfqq->ref - io_refs;
df5fe3e8
JM
3034 BUG_ON(process_refs < 0);
3035 return process_refs;
3036}
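/*
 * Example: a cfqq with ref == 7 and four allocated requests (say three
 * READ and one WRITE) has 7 - 4 = 3 process references.
 */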
3037
3038static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
3039{
e6c5bc73 3040 int process_refs, new_process_refs;
df5fe3e8
JM
3041 struct cfq_queue *__cfqq;
3042
c10b61f0
JM
3043 /*
3044 * If there are no process references on the new_cfqq, then it is
3045 * unsafe to follow the ->new_cfqq chain as other cfqq's in the
3046 * chain may have dropped their last reference (not just their
3047 * last process reference).
3048 */
3049 if (!cfqq_process_refs(new_cfqq))
3050 return;
3051
df5fe3e8
JM
3052 /* Avoid a circular list and skip interim queue merges */
3053 while ((__cfqq = new_cfqq->new_cfqq)) {
3054 if (__cfqq == cfqq)
3055 return;
3056 new_cfqq = __cfqq;
3057 }
3058
3059 process_refs = cfqq_process_refs(cfqq);
c10b61f0 3060 new_process_refs = cfqq_process_refs(new_cfqq);
df5fe3e8
JM
3061 /*
3062 * If the process for the cfqq has gone away, there is no
3063 * sense in merging the queues.
3064 */
c10b61f0 3065 if (process_refs == 0 || new_process_refs == 0)
df5fe3e8
JM
3066 return;
3067
e6c5bc73
JM
3068 /*
3069 * Merge in the direction of the lesser amount of work.
3070 */
e6c5bc73
JM
3071 if (new_process_refs >= process_refs) {
3072 cfqq->new_cfqq = new_cfqq;
30d7b944 3073 new_cfqq->ref += process_refs;
e6c5bc73
JM
3074 } else {
3075 new_cfqq->new_cfqq = cfqq;
30d7b944 3076 cfqq->ref += new_process_refs;
e6c5bc73 3077 }
df5fe3e8
JM
3078}
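/*
 * Example: if cfqq holds 3 process references and new_cfqq holds 5, the
 * merge is recorded as cfqq->new_cfqq = new_cfqq and new_cfqq->ref is
 * bumped by 3, i.e. the less referenced queue is folded into the busier
 * one.
 */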
3079
6d816ec7 3080static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd,
3bf10fea 3081 struct cfq_group *cfqg, enum wl_class_t wl_class)
718eee05
CZ
3082{
3083 struct cfq_queue *queue;
3084 int i;
3085 bool key_valid = false;
9a7f38c4 3086 u64 lowest_key = 0;
718eee05
CZ
3087 enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
3088
65b32a57
VG
3089 for (i = 0; i <= SYNC_WORKLOAD; ++i) {
3090 /* select the one with lowest rb_key */
34b98d03 3091 queue = cfq_rb_first(st_for(cfqg, wl_class, i));
718eee05 3092 if (queue &&
9a7f38c4 3093 (!key_valid || queue->rb_key < lowest_key)) {
718eee05
CZ
3094 lowest_key = queue->rb_key;
3095 cur_best = i;
3096 key_valid = true;
3097 }
3098 }
3099
3100 return cur_best;
3101}
3102
6d816ec7
VG
3103static void
3104choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg)
718eee05 3105{
9a7f38c4 3106 u64 slice;
718eee05 3107 unsigned count;
cdb16e8f 3108 struct cfq_rb_root *st;
9a7f38c4 3109 u64 group_slice;
4d2ceea4 3110 enum wl_class_t original_class = cfqd->serving_wl_class;
9a7f38c4 3111 u64 now = ktime_get_ns();
1fa8f6d6 3112
718eee05 3113 /* Choose next priority. RT > BE > IDLE */
58ff82f3 3114 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
4d2ceea4 3115 cfqd->serving_wl_class = RT_WORKLOAD;
58ff82f3 3116 else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
4d2ceea4 3117 cfqd->serving_wl_class = BE_WORKLOAD;
718eee05 3118 else {
4d2ceea4 3119 cfqd->serving_wl_class = IDLE_WORKLOAD;
9a7f38c4 3120 cfqd->workload_expires = now + jiffies_to_nsecs(1);
718eee05
CZ
3121 return;
3122 }
3123
4d2ceea4 3124 if (original_class != cfqd->serving_wl_class)
e4ea0c16
SL
3125 goto new_workload;
3126
718eee05
CZ
3127 /*
3128 * For RT and BE, we have to choose also the type
3129 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
3130 * expiration time
3131 */
34b98d03 3132 st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
cdb16e8f 3133 count = st->count;
718eee05
CZ
3134
3135 /*
65b32a57 3136 * check workload expiration, and that we still have other queues ready
718eee05 3137 */
9a7f38c4 3138 if (count && !(now > cfqd->workload_expires))
718eee05
CZ
3139 return;
3140
e4ea0c16 3141new_workload:
718eee05 3142 /* otherwise select new workload type */
6d816ec7 3143 cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg,
4d2ceea4 3144 cfqd->serving_wl_class);
34b98d03 3145 st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
cdb16e8f 3146 count = st->count;
718eee05
CZ
3147
3148 /*
3149 * the workload slice is computed as a fraction of target latency
3150 * proportional to the number of queues in that workload, over
3151 * all the queues in the same priority class
3152 */
58ff82f3
VG
3153 group_slice = cfq_group_slice(cfqd, cfqg);
3154
9a7f38c4 3155 slice = div_u64(group_slice * count,
4d2ceea4
VG
3156 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class],
3157 cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd,
9a7f38c4 3158 cfqg)));
718eee05 3159
4d2ceea4 3160 if (cfqd->serving_wl_type == ASYNC_WORKLOAD) {
9a7f38c4 3161 u64 tmp;
f26bd1f0
VG
3162
3163 /*
 3164 * Async queues are currently system wide. Just taking the
 3165 * proportion of queues within the same group will lead to a higher
 3166 * async ratio system wide, as the root group generally has a
 3167 * higher weight. A more accurate approach would be to
 3168 * calculate a system wide async/sync ratio.
3169 */
5bf14c07
TM
3170 tmp = cfqd->cfq_target_latency *
3171 cfqg_busy_async_queues(cfqd, cfqg);
9a7f38c4
JM
3172 tmp = div_u64(tmp, cfqd->busy_queues);
3173 slice = min_t(u64, slice, tmp);
f26bd1f0 3174
718eee05
CZ
3175 /* async workload slice is scaled down according to
3176 * the sync/async slice ratio. */
9a7f38c4 3177 slice = div64_u64(slice*cfqd->cfq_slice[0], cfqd->cfq_slice[1]);
f26bd1f0 3178 } else
718eee05
CZ
3179 /* sync workload slice is at least 2 * cfq_slice_idle */
3180 slice = max(slice, 2 * cfqd->cfq_slice_idle);
3181
9a7f38c4
JM
3182 slice = max_t(u64, slice, CFQ_MIN_TT);
3183 cfq_log(cfqd, "workload slice:%llu", slice);
3184 cfqd->workload_expires = now + slice;
718eee05
CZ
3185}
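/*
 * Illustrative slice math with made-up numbers: given a 300ms group
 * slice and 2 busy queues in the chosen workload out of 6 busy queues of
 * that class, the workload gets about 300ms * 2 / 6 = 100ms before the
 * next workload type is picked; async workloads are additionally scaled
 * by cfq_slice[0]/cfq_slice[1] and the result never drops below
 * CFQ_MIN_TT.
 */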
3186
1fa8f6d6
VG
3187static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
3188{
3189 struct cfq_rb_root *st = &cfqd->grp_service_tree;
25bc6b07 3190 struct cfq_group *cfqg;
1fa8f6d6 3191
09663c86 3192 if (RB_EMPTY_ROOT(&st->rb.rb_root))
1fa8f6d6 3193 return NULL;
25bc6b07 3194 cfqg = cfq_rb_first_group(st);
25bc6b07
VG
3195 update_min_vdisktime(st);
3196 return cfqg;
1fa8f6d6
VG
3197}
3198
cdb16e8f
VG
3199static void cfq_choose_cfqg(struct cfq_data *cfqd)
3200{
1fa8f6d6 3201 struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
9a7f38c4 3202 u64 now = ktime_get_ns();
1fa8f6d6
VG
3203
3204 cfqd->serving_group = cfqg;
dae739eb
VG
3205
3206 /* Restore the workload type data */
4d2ceea4 3207 if (cfqg->saved_wl_slice) {
9a7f38c4 3208 cfqd->workload_expires = now + cfqg->saved_wl_slice;
4d2ceea4
VG
3209 cfqd->serving_wl_type = cfqg->saved_wl_type;
3210 cfqd->serving_wl_class = cfqg->saved_wl_class;
66ae2919 3211 } else
9a7f38c4 3212 cfqd->workload_expires = now - 1;
66ae2919 3213
6d816ec7 3214 choose_wl_class_and_type(cfqd, cfqg);
cdb16e8f
VG
3215}
3216
22e2c507 3217/*
498d3aa2
JA
3218 * Select a queue for service. If we have a current active queue,
3219 * check whether to continue servicing it, or retrieve and set a new one.
22e2c507 3220 */
1b5ed5e1 3221static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1da177e4 3222{
a36e71f9 3223 struct cfq_queue *cfqq, *new_cfqq = NULL;
9a7f38c4 3224 u64 now = ktime_get_ns();
1da177e4 3225
22e2c507
JA
3226 cfqq = cfqd->active_queue;
3227 if (!cfqq)
3228 goto new_queue;
1da177e4 3229
f04a6424
VG
3230 if (!cfqd->rq_queued)
3231 return NULL;
c244bb50
VG
3232
3233 /*
3234 * We were waiting for group to get backlogged. Expire the queue
3235 */
3236 if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
3237 goto expire;
3238
22e2c507 3239 /*
6d048f53 3240 * The active queue has run out of time, expire it and select new.
22e2c507 3241 */
7667aa06
VG
3242 if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
3243 /*
 3244 * If the slice had not expired at the completion of the last
 3245 * request, we might not have turned on the wait_busy flag. Don't
 3246 * expire the queue yet. Allow the group to get backlogged.
 3247 *
 3248 * The very fact that we have used the slice means we have been
 3249 * idling all along on this queue, so it should be ok to wait for
 3250 * this request to complete.
3251 */
82bbbf28
VG
3252 if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
3253 && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
3254 cfqq = NULL;
7667aa06 3255 goto keep_queue;
82bbbf28 3256 } else
80bdf0c7 3257 goto check_group_idle;
7667aa06 3258 }
1da177e4 3259
22e2c507 3260 /*
6d048f53
JA
3261 * The active queue has requests and isn't expired, allow it to
3262 * dispatch.
22e2c507 3263 */
dd67d051 3264 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
22e2c507 3265 goto keep_queue;
6d048f53 3266
a36e71f9
JA
3267 /*
3268 * If another queue has a request waiting within our mean seek
3269 * distance, let it run. The expire code will check for close
3270 * cooperators and put the close queue at the front of the service
df5fe3e8 3271 * tree. If possible, merge the expiring queue with the new cfqq.
a36e71f9 3272 */
b3b6d040 3273 new_cfqq = cfq_close_cooperator(cfqd, cfqq);
df5fe3e8
JM
3274 if (new_cfqq) {
3275 if (!cfqq->new_cfqq)
3276 cfq_setup_merge(cfqq, new_cfqq);
a36e71f9 3277 goto expire;
df5fe3e8 3278 }
a36e71f9 3279
6d048f53
JA
3280 /*
3281 * No requests pending. If the active queue still has requests in
3282 * flight or is idling for a new request, allow either of these
3283 * conditions to happen (or time out) before selecting a new queue.
3284 */
91148325 3285 if (hrtimer_active(&cfqd->idle_slice_timer)) {
80bdf0c7
VG
3286 cfqq = NULL;
3287 goto keep_queue;
3288 }
3289
8e1ac665
SL
3290 /*
3291 * This is a deep seek queue, but the device is much faster than
 3292 * the queue can deliver, so don't idle.
 3293 */
3294 if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
3295 (cfq_cfqq_slice_new(cfqq) ||
9a7f38c4 3296 (cfqq->slice_end - now > now - cfqq->slice_start))) {
8e1ac665
SL
3297 cfq_clear_cfqq_deep(cfqq);
3298 cfq_clear_cfqq_idle_window(cfqq);
3299 }
3300
80bdf0c7
VG
3301 if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
3302 cfqq = NULL;
3303 goto keep_queue;
3304 }
3305
3306 /*
3307 * If group idle is enabled and there are requests dispatched from
3308 * this group, wait for requests to complete.
3309 */
3310check_group_idle:
7700fc4f
SL
3311 if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 &&
3312 cfqq->cfqg->dispatched &&
3313 !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {
caaa5f9f
JA
3314 cfqq = NULL;
3315 goto keep_queue;
22e2c507
JA
3316 }
3317
3b18152c 3318expire:
e5ff082e 3319 cfq_slice_expired(cfqd, 0);
3b18152c 3320new_queue:
718eee05
CZ
3321 /*
3322 * Current queue expired. Check if we have to switch to a new
3323 * service tree
3324 */
3325 if (!new_cfqq)
cdb16e8f 3326 cfq_choose_cfqg(cfqd);
718eee05 3327
a36e71f9 3328 cfqq = cfq_set_active_queue(cfqd, new_cfqq);
22e2c507 3329keep_queue:
3b18152c 3330 return cfqq;
22e2c507
JA
3331}
3332
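/*
 * Move every request still queued in this cfqq to the dispatch list and
 * expire the queue explicitly once it is drained.
 */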
febffd61 3333static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
d9e7620e
JA
3334{
3335 int dispatched = 0;
3336
3337 while (cfqq->next_rq) {
3338 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
3339 dispatched++;
3340 }
3341
3342 BUG_ON(!list_empty(&cfqq->fifo));
f04a6424
VG
3343
3344 /* By default cfqq is not expired if it is empty. Do it explicitly */
e5ff082e 3345 __cfq_slice_expired(cfqq->cfqd, cfqq, 0);
d9e7620e
JA
3346 return dispatched;
3347}
3348
498d3aa2
JA
3349/*
3350 * Drain our current requests. Used for barriers and when switching
3351 * io schedulers on-the-fly.
3352 */
d9e7620e 3353static int cfq_forced_dispatch(struct cfq_data *cfqd)
1b5ed5e1 3354{
0871714e 3355 struct cfq_queue *cfqq;
d9e7620e 3356 int dispatched = 0;
cdb16e8f 3357
3440c49f 3358 /* Expire the timeslice of the current active queue first */
e5ff082e 3359 cfq_slice_expired(cfqd, 0);
3440c49f
DS
3360 while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
3361 __cfq_set_active_queue(cfqd, cfqq);
f04a6424 3362 dispatched += __cfq_forced_dispatch_cfqq(cfqq);
3440c49f 3363 }
1b5ed5e1 3364
1b5ed5e1
TH
3365 BUG_ON(cfqd->busy_queues);
3366
6923715a 3367 cfq_log(cfqd, "forced_dispatch=%d", dispatched);
1b5ed5e1
TH
3368 return dispatched;
3369}
3370
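/*
 * Estimate whether the slice will be used up soon: assume each request
 * already dispatched costs roughly one cfq_slice_idle to complete.
 */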
abc3c744
SL
3371static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
3372 struct cfq_queue *cfqq)
3373{
9a7f38c4
JM
3374 u64 now = ktime_get_ns();
3375
abc3c744
SL
3376 /* the queue hasn't finished any request, can't estimate */
3377 if (cfq_cfqq_slice_new(cfqq))
c1e44756 3378 return true;
9a7f38c4 3379 if (now + cfqd->cfq_slice_idle * cfqq->dispatched > cfqq->slice_end)
c1e44756 3380 return true;
abc3c744 3381
c1e44756 3382 return false;
abc3c744
SL
3383}
3384
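/*
 * Decide whether cfqq may dispatch another request now, based on the
 * sync/async mix in flight, the queue's class and a per-queue dispatch
 * budget derived from cfq_quantum.
 */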
0b182d61 3385static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2f5cb738 3386{
2f5cb738 3387 unsigned int max_dispatch;
22e2c507 3388
3932a86b
GC
3389 if (cfq_cfqq_must_dispatch(cfqq))
3390 return true;
3391
5ad531db
JA
3392 /*
3393 * Drain async requests before we start sync IO
3394 */
53c583d2 3395 if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
0b182d61 3396 return false;
5ad531db 3397
2f5cb738
JA
3398 /*
3399 * If this is an async queue and we have sync IO in flight, let it wait
3400 */
53c583d2 3401 if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
0b182d61 3402 return false;
2f5cb738 3403
abc3c744 3404 max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
2f5cb738
JA
3405 if (cfq_class_idle(cfqq))
3406 max_dispatch = 1;
b4878f24 3407
2f5cb738
JA
3408 /*
3409 * Does this cfqq already have too much IO in flight?
3410 */
3411 if (cfqq->dispatched >= max_dispatch) {
ef8a41df 3412 bool promote_sync = false;
2f5cb738
JA
3413 /*
3414 * idle queue must always only have a single IO in flight
3415 */
3ed9a296 3416 if (cfq_class_idle(cfqq))
0b182d61 3417 return false;
3ed9a296 3418
ef8a41df 3419 /*
c4ade94f
LS
3420 * If there is only one sync queue
3421 * we can ignore async queue here and give the sync
ef8a41df
SL
 3422 * queue no dispatch limit. The reason is that a sync queue can
 3423 * preempt an async queue, so limiting the sync queue doesn't make
 3424 * sense. This is useful for the aiostress test.
3425 */
c4ade94f
LS
3426 if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
3427 promote_sync = true;
ef8a41df 3428
2f5cb738
JA
3429 /*
3430 * We have other queues, don't allow more IO from this one
3431 */
ef8a41df
SL
3432 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
3433 !promote_sync)
0b182d61 3434 return false;
9ede209e 3435
365722bb 3436 /*
474b18cc 3437 * Sole queue user, no limit
365722bb 3438 */
ef8a41df 3439 if (cfqd->busy_queues == 1 || promote_sync)
abc3c744
SL
3440 max_dispatch = -1;
3441 else
3442 /*
3443 * Normally we start throttling cfqq when cfq_quantum/2
3444 * requests have been dispatched. But we can drive
 3445 * deeper queue depths at the beginning of the slice,
 3446 * subject to the upper limit of cfq_quantum.
 3447 */
3448 max_dispatch = cfqd->cfq_quantum;
8e296755
JA
3449 }
3450
3451 /*
3452 * Async queues must wait a bit before being allowed dispatch.
3453 * We also ramp up the dispatch depth gradually for async IO,
3454 * based on the last sync IO we serviced
3455 */
963b72fc 3456 if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
9a7f38c4 3457 u64 last_sync = ktime_get_ns() - cfqd->last_delayed_sync;
8e296755 3458 unsigned int depth;
365722bb 3459
9a7f38c4 3460 depth = div64_u64(last_sync, cfqd->cfq_slice[1]);
e00c54c3
JA
3461 if (!depth && !cfqq->dispatched)
3462 depth = 1;
8e296755
JA
3463 if (depth < max_dispatch)
3464 max_dispatch = depth;
2f5cb738 3465 }
3ed9a296 3466
0b182d61
JA
3467 /*
3468 * If we're below the current max, allow a dispatch
3469 */
3470 return cfqq->dispatched < max_dispatch;
3471}
3472
3473/*
3474 * Dispatch a request from cfqq, moving them to the request queue
3475 * dispatch list.
3476 */
3477static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3478{
3479 struct request *rq;
3480
3481 BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
3482
3932a86b
GC
3483 rq = cfq_check_fifo(cfqq);
3484 if (rq)
3485 cfq_mark_cfqq_must_dispatch(cfqq);
3486
0b182d61
JA
3487 if (!cfq_may_dispatch(cfqd, cfqq))
3488 return false;
3489
3490 /*
3491 * follow expired path, else get first next available
3492 */
0b182d61
JA
3493 if (!rq)
3494 rq = cfqq->next_rq;
3932a86b
GC
3495 else
3496 cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
0b182d61
JA
3497
3498 /*
3499 * insert request into driver dispatch list
3500 */
3501 cfq_dispatch_insert(cfqd->queue, rq);
3502
3503 if (!cfqd->active_cic) {
c5869807 3504 struct cfq_io_cq *cic = RQ_CIC(rq);
0b182d61 3505
c5869807 3506 atomic_long_inc(&cic->icq.ioc->refcount);
0b182d61
JA
3507 cfqd->active_cic = cic;
3508 }
3509
3510 return true;
3511}
3512
3513/*
3514 * Find the cfqq that we need to service and move a request from that to the
3515 * dispatch list
3516 */
3517static int cfq_dispatch_requests(struct request_queue *q, int force)
3518{
3519 struct cfq_data *cfqd = q->elevator->elevator_data;
3520 struct cfq_queue *cfqq;
3521
3522 if (!cfqd->busy_queues)
3523 return 0;
3524
3525 if (unlikely(force))
3526 return cfq_forced_dispatch(cfqd);
3527
3528 cfqq = cfq_select_queue(cfqd);
3529 if (!cfqq)
8e296755
JA
3530 return 0;
3531
2f5cb738 3532 /*
0b182d61 3533 * Dispatch a request from this cfqq, if it is allowed
2f5cb738 3534 */
0b182d61
JA
3535 if (!cfq_dispatch_request(cfqd, cfqq))
3536 return 0;
3537
2f5cb738 3538 cfqq->slice_dispatch++;
b029195d 3539 cfq_clear_cfqq_must_dispatch(cfqq);
22e2c507 3540
2f5cb738
JA
3541 /*
 3542 * expire an async queue immediately if it has used up its slice. An
 3543 * idle queue always expires after one dispatch round.
3544 */
3545 if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
3546 cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
3547 cfq_class_idle(cfqq))) {
9a7f38c4 3548 cfqq->slice_end = ktime_get_ns() + 1;
e5ff082e 3549 cfq_slice_expired(cfqd, 0);
1da177e4
LT
3550 }
3551
b217a903 3552 cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
2f5cb738 3553 return 1;
1da177e4
LT
3554}
3555
1da177e4 3556/*
5e705374
JA
3557 * task holds one reference to the queue, dropped when task exits. each rq
3558 * in-flight on this queue also holds a reference, dropped when rq is freed.
1da177e4 3559 *
b1c35769 3560 * Each cfq queue took a reference on the parent group. Drop it now.
1da177e4
LT
3561 * queue lock must be held here.
3562 */
3563static void cfq_put_queue(struct cfq_queue *cfqq)
3564{
22e2c507 3565 struct cfq_data *cfqd = cfqq->cfqd;
0bbfeb83 3566 struct cfq_group *cfqg;
22e2c507 3567
30d7b944 3568 BUG_ON(cfqq->ref <= 0);
1da177e4 3569
30d7b944
SL
3570 cfqq->ref--;
3571 if (cfqq->ref)
1da177e4
LT
3572 return;
3573
7b679138 3574 cfq_log_cfqq(cfqd, cfqq, "put_queue");
1da177e4 3575 BUG_ON(rb_first(&cfqq->sort_list));
22e2c507 3576 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
b1c35769 3577 cfqg = cfqq->cfqg;
1da177e4 3578
28f95cbc 3579 if (unlikely(cfqd->active_queue == cfqq)) {
e5ff082e 3580 __cfq_slice_expired(cfqd, cfqq, 0);
23e018a1 3581 cfq_schedule_dispatch(cfqd);
28f95cbc 3582 }
22e2c507 3583
f04a6424 3584 BUG_ON(cfq_cfqq_on_rr(cfqq));
1da177e4 3585 kmem_cache_free(cfq_pool, cfqq);
eb7d8c07 3586 cfqg_put(cfqg);
1da177e4
LT
3587}
3588
d02a2c07 3589static void cfq_put_cooperator(struct cfq_queue *cfqq)
1da177e4 3590{
df5fe3e8
JM
3591 struct cfq_queue *__cfqq, *next;
3592
df5fe3e8
JM
3593 /*
3594 * If this queue was scheduled to merge with another queue, be
3595 * sure to drop the reference taken on that queue (and others in
3596 * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs.
3597 */
3598 __cfqq = cfqq->new_cfqq;
3599 while (__cfqq) {
3600 if (__cfqq == cfqq) {
3601 WARN(1, "cfqq->new_cfqq loop detected\n");
3602 break;
3603 }
3604 next = __cfqq->new_cfqq;
3605 cfq_put_queue(__cfqq);
3606 __cfqq = next;
3607 }
d02a2c07
SL
3608}
3609
3610static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3611{
3612 if (unlikely(cfqq == cfqd->active_queue)) {
3613 __cfq_slice_expired(cfqd, cfqq, 0);
3614 cfq_schedule_dispatch(cfqd);
3615 }
3616
3617 cfq_put_cooperator(cfqq);
df5fe3e8 3618
89850f7e
JA
3619 cfq_put_queue(cfqq);
3620}
22e2c507 3621
9b84cacd
TH
3622static void cfq_init_icq(struct io_cq *icq)
3623{
3624 struct cfq_io_cq *cic = icq_to_cic(icq);
3625
9a7f38c4 3626 cic->ttime.last_end_request = ktime_get_ns();
9b84cacd
TH
3627}
3628
c5869807 3629static void cfq_exit_icq(struct io_cq *icq)
89850f7e 3630{
c5869807 3631 struct cfq_io_cq *cic = icq_to_cic(icq);
283287a5 3632 struct cfq_data *cfqd = cic_to_cfqd(cic);
4faa3c81 3633
563180a4
TH
3634 if (cic_to_cfqq(cic, false)) {
3635 cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false));
3636 cic_set_cfqq(cic, NULL, false);
12a05732
AV
3637 }
3638
563180a4
TH
3639 if (cic_to_cfqq(cic, true)) {
3640 cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true));
3641 cic_set_cfqq(cic, NULL, true);
12a05732 3642 }
89850f7e
JA
3643}
3644
abede6da 3645static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
22e2c507
JA
3646{
3647 struct task_struct *tsk = current;
3648 int ioprio_class;
3649
3b18152c 3650 if (!cfq_cfqq_prio_changed(cfqq))
22e2c507
JA
3651 return;
3652
598971bf 3653 ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
22e2c507 3654 switch (ioprio_class) {
fe094d98
JA
3655 default:
3656 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
3657 case IOPRIO_CLASS_NONE:
3658 /*
6d63c275 3659 * no prio set, inherit CPU scheduling settings
fe094d98
JA
3660 */
3661 cfqq->ioprio = task_nice_ioprio(tsk);
6d63c275 3662 cfqq->ioprio_class = task_nice_ioclass(tsk);
fe094d98
JA
3663 break;
3664 case IOPRIO_CLASS_RT:
598971bf 3665 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
fe094d98
JA
3666 cfqq->ioprio_class = IOPRIO_CLASS_RT;
3667 break;
3668 case IOPRIO_CLASS_BE:
598971bf 3669 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
fe094d98
JA
3670 cfqq->ioprio_class = IOPRIO_CLASS_BE;
3671 break;
3672 case IOPRIO_CLASS_IDLE:
3673 cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
3674 cfqq->ioprio = 7;
3675 cfq_clear_cfqq_idle_window(cfqq);
3676 break;
22e2c507
JA
3677 }
3678
3679 /*
3680 * keep track of original prio settings in case we have to temporarily
3681 * elevate the priority of this queue
3682 */
3683 cfqq->org_ioprio = cfqq->ioprio;
b8269db4 3684 cfqq->org_ioprio_class = cfqq->ioprio_class;
3b18152c 3685 cfq_clear_cfqq_prio_changed(cfqq);
22e2c507
JA
3686}
3687
598971bf 3688static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
22e2c507 3689{
598971bf 3690 int ioprio = cic->icq.ioc->ioprio;
bca4b914 3691 struct cfq_data *cfqd = cic_to_cfqd(cic);
478a82b0 3692 struct cfq_queue *cfqq;
35e6077c 3693
598971bf
TH
3694 /*
3695 * Check whether ioprio has changed. The condition may trigger
3696 * spuriously on a newly created cic but there's no harm.
3697 */
3698 if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
caaa5f9f
JA
3699 return;
3700
563180a4 3701 cfqq = cic_to_cfqq(cic, false);
caaa5f9f 3702 if (cfqq) {
563180a4 3703 cfq_put_queue(cfqq);
2da8de0b 3704 cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio);
563180a4 3705 cic_set_cfqq(cic, cfqq, false);
22e2c507 3706 }
caaa5f9f 3707
563180a4 3708 cfqq = cic_to_cfqq(cic, true);
caaa5f9f
JA
3709 if (cfqq)
3710 cfq_mark_cfqq_prio_changed(cfqq);
598971bf
TH
3711
3712 cic->ioprio = ioprio;
22e2c507
JA
3713}
3714
d5036d77 3715static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
a6151c3a 3716 pid_t pid, bool is_sync)
d5036d77
JA
3717{
3718 RB_CLEAR_NODE(&cfqq->rb_node);
3719 RB_CLEAR_NODE(&cfqq->p_node);
3720 INIT_LIST_HEAD(&cfqq->fifo);
3721
30d7b944 3722 cfqq->ref = 0;
d5036d77
JA
3723 cfqq->cfqd = cfqd;
3724
3725 cfq_mark_cfqq_prio_changed(cfqq);
3726
3727 if (is_sync) {
3728 if (!cfq_class_idle(cfqq))
3729 cfq_mark_cfqq_idle_window(cfqq);
3730 cfq_mark_cfqq_sync(cfqq);
3731 }
3732 cfqq->pid = pid;
3733}
3734
24610333 3735#ifdef CONFIG_CFQ_GROUP_IOSCHED
142bbdfc 3736static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
24610333 3737{
bca4b914 3738 struct cfq_data *cfqd = cic_to_cfqd(cic);
60a83707 3739 struct cfq_queue *cfqq;
f4da8072 3740 uint64_t serial_nr;
24610333 3741
598971bf 3742 rcu_read_lock();
f4da8072 3743 serial_nr = bio_blkcg(bio)->css.serial_nr;
598971bf 3744 rcu_read_unlock();
24610333 3745
598971bf
TH
3746 /*
3747 * Check whether blkcg has changed. The condition may trigger
3748 * spuriously on a newly created cic but there's no harm.
3749 */
f4da8072 3750 if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
142bbdfc 3751 return;
87760e5e 3752
60a83707
TH
3753 /*
3754 * Drop reference to queues. New queues will be assigned in new
3755 * group upon arrival of fresh requests.
3756 */
3757 cfqq = cic_to_cfqq(cic, false);
3758 if (cfqq) {
3759 cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
3760 cic_set_cfqq(cic, NULL, false);
3761 cfq_put_queue(cfqq);
3762 }
3763
3764 cfqq = cic_to_cfqq(cic, true);
3765 if (cfqq) {
3766 cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
3767 cic_set_cfqq(cic, NULL, true);
3768 cfq_put_queue(cfqq);
24610333 3769 }
598971bf 3770
f4da8072 3771 cic->blkcg_serial_nr = serial_nr;
24610333 3772}
598971bf 3773#else
142bbdfc 3774static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
5d7f5ce1 3775{
5d7f5ce1 3776}
24610333
VG
3777#endif /* CONFIG_CFQ_GROUP_IOSCHED */
3778
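/*
 * Return the slot in the group's async queue table that matches the
 * given ioprio class and level.
 */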
c2dea2d1 3779static struct cfq_queue **
60a83707 3780cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio)
c2dea2d1 3781{
fe094d98 3782 switch (ioprio_class) {
c2dea2d1 3783 case IOPRIO_CLASS_RT:
60a83707 3784 return &cfqg->async_cfqq[0][ioprio];
598971bf
TH
3785 case IOPRIO_CLASS_NONE:
3786 ioprio = IOPRIO_NORM;
3787 /* fall through */
c2dea2d1 3788 case IOPRIO_CLASS_BE:
60a83707 3789 return &cfqg->async_cfqq[1][ioprio];
c2dea2d1 3790 case IOPRIO_CLASS_IDLE:
60a83707 3791 return &cfqg->async_idle_cfqq;
c2dea2d1
VT
3792 default:
3793 BUG();
3794 }
3795}
3796
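/*
 * Look up (for async I/O) or allocate the cfq_queue a request should use,
 * falling back to the embedded oom_cfqq if the group lookup or the
 * allocation fails. Returns with a reference held on the queue.
 */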
15c31be4 3797static struct cfq_queue *
abede6da 3798cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
2da8de0b 3799 struct bio *bio)
15c31be4 3800{
c6ce1943
JM
3801 int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3802 int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
d4aad7ff 3803 struct cfq_queue **async_cfqq = NULL;
4ebc1c61 3804 struct cfq_queue *cfqq;
322731ed
TH
3805 struct cfq_group *cfqg;
3806
3807 rcu_read_lock();
ae118896 3808 cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio));
322731ed
TH
3809 if (!cfqg) {
3810 cfqq = &cfqd->oom_cfqq;
3811 goto out;
3812 }
15c31be4 3813
c2dea2d1 3814 if (!is_sync) {
c6ce1943
JM
3815 if (!ioprio_valid(cic->ioprio)) {
3816 struct task_struct *tsk = current;
3817 ioprio = task_nice_ioprio(tsk);
3818 ioprio_class = task_nice_ioclass(tsk);
3819 }
60a83707 3820 async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio);
c2dea2d1 3821 cfqq = *async_cfqq;
4ebc1c61
TH
3822 if (cfqq)
3823 goto out;
c2dea2d1
VT
3824 }
3825
e00f4f4d
TH
3826 cfqq = kmem_cache_alloc_node(cfq_pool,
3827 GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
d4aad7ff
TH
3828 cfqd->queue->node);
3829 if (!cfqq) {
3830 cfqq = &cfqd->oom_cfqq;
3831 goto out;
3832 }
3833
4d608baa
AP
3834 /* cfq_init_cfqq() assumes cfqq->ioprio_class is initialized. */
3835 cfqq->ioprio_class = IOPRIO_CLASS_NONE;
d4aad7ff
TH
3836 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
3837 cfq_init_prio_data(cfqq, cic);
3838 cfq_link_cfqq_cfqg(cfqq, cfqg);
3839 cfq_log_cfqq(cfqd, cfqq, "alloced");
15c31be4 3840
d4aad7ff
TH
3841 if (async_cfqq) {
3842 /* a new async queue is created, pin and remember */
30d7b944 3843 cfqq->ref++;
c2dea2d1 3844 *async_cfqq = cfqq;
15c31be4 3845 }
4ebc1c61 3846out:
30d7b944 3847 cfqq->ref++;
322731ed 3848 rcu_read_unlock();
15c31be4
JA
3849 return cfqq;
3850}
3851
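/*
 * Think time is tracked as an exponentially weighted moving average:
 * each new sample gets a 1/8 weight and is capped at twice slice_idle so
 * a single long pause cannot skew the mean.
 */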
22e2c507 3852static void
9a7f38c4 3853__cfq_update_io_thinktime(struct cfq_ttime *ttime, u64 slice_idle)
1da177e4 3854{
9a7f38c4 3855 u64 elapsed = ktime_get_ns() - ttime->last_end_request;
383cd721 3856 elapsed = min(elapsed, 2UL * slice_idle);
db3b5848 3857
383cd721 3858 ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
9a7f38c4
JM
3859 ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8);
3860 ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
3861 ttime->ttime_samples);
383cd721
SL
3862}
3863
3864static void
3865cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
c5869807 3866 struct cfq_io_cq *cic)
383cd721 3867{
f5f2b6ce 3868 if (cfq_cfqq_sync(cfqq)) {
383cd721 3869 __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
f5f2b6ce
SL
3870 __cfq_update_io_thinktime(&cfqq->service_tree->ttime,
3871 cfqd->cfq_slice_idle);
3872 }
7700fc4f
SL
3873#ifdef CONFIG_CFQ_GROUP_IOSCHED
3874 __cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle);
3875#endif
22e2c507 3876}
1da177e4 3877
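/*
 * Shift a new bit into the queue's seek history: set it when the seek
 * distance exceeds CFQQ_SEEK_THR or, on non-rotational devices, when the
 * request is smaller than CFQQ_SECT_THR_NONROT.
 */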
206dc69b 3878static void
b2c18e1e 3879cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
6d048f53 3880 struct request *rq)
206dc69b 3881{
3dde36dd 3882 sector_t sdist = 0;
41647e7a 3883 sector_t n_sec = blk_rq_sectors(rq);
3dde36dd
CZ
3884 if (cfqq->last_request_pos) {
3885 if (cfqq->last_request_pos < blk_rq_pos(rq))
3886 sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
3887 else
3888 sdist = cfqq->last_request_pos - blk_rq_pos(rq);
3889 }
206dc69b 3890
3dde36dd 3891 cfqq->seek_history <<= 1;
41647e7a
CZ
3892 if (blk_queue_nonrot(cfqd->queue))
3893 cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
3894 else
3895 cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
206dc69b 3896}
1da177e4 3897
a2b80967
CH
3898static inline bool req_noidle(struct request *req)
3899{
3900 return req_op(req) == REQ_OP_WRITE &&
3901 (req->cmd_flags & (REQ_SYNC | REQ_IDLE)) == REQ_SYNC;
3902}
3903
22e2c507
JA
3904/*
3905 * Disable idle window if the process thinks too long or seeks so much that
3906 * it doesn't matter
3907 */
3908static void
3909cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
c5869807 3910 struct cfq_io_cq *cic)
22e2c507 3911{
7b679138 3912 int old_idle, enable_idle;
1be92f2f 3913
0871714e
JA
3914 /*
3915 * Don't idle for async or idle io prio class
3916 */
3917 if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
1be92f2f
JA
3918 return;
3919
c265a7f4 3920 enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
1da177e4 3921
76280aff
CZ
3922 if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3923 cfq_mark_cfqq_deep(cfqq);
3924
a2b80967 3925 if (cfqq->next_rq && req_noidle(cfqq->next_rq))
749ef9f8 3926 enable_idle = 0;
f6e8d01b 3927 else if (!atomic_read(&cic->icq.ioc->active_ref) ||
c5869807
TH
3928 !cfqd->cfq_slice_idle ||
3929 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
22e2c507 3930 enable_idle = 0;
383cd721
SL
3931 else if (sample_valid(cic->ttime.ttime_samples)) {
3932 if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
22e2c507
JA
3933 enable_idle = 0;
3934 else
3935 enable_idle = 1;
1da177e4
LT
3936 }
3937
7b679138
JA
3938 if (old_idle != enable_idle) {
3939 cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
3940 if (enable_idle)
3941 cfq_mark_cfqq_idle_window(cfqq);
3942 else
3943 cfq_clear_cfqq_idle_window(cfqq);
3944 }
22e2c507 3945}
1da177e4 3946
22e2c507
JA
3947/*
3948 * Check if new_cfqq should preempt the currently active queue. Return 0 for
3949 * no or if we aren't sure, a 1 will cause a preempt.
3950 */
a6151c3a 3951static bool
22e2c507 3952cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
5e705374 3953 struct request *rq)
22e2c507 3954{
6d048f53 3955 struct cfq_queue *cfqq;
22e2c507 3956
6d048f53
JA
3957 cfqq = cfqd->active_queue;
3958 if (!cfqq)
a6151c3a 3959 return false;
22e2c507 3960
6d048f53 3961 if (cfq_class_idle(new_cfqq))
a6151c3a 3962 return false;
22e2c507
JA
3963
3964 if (cfq_class_idle(cfqq))
a6151c3a 3965 return true;
1e3335de 3966
875feb63
DS
3967 /*
3968 * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
3969 */
3970 if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
3971 return false;
3972
374f84ac
JA
3973 /*
3974 * if the new request is sync, but the currently running queue is
3975 * not, let the sync request have priority.
3976 */
3932a86b 3977 if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq))
a6151c3a 3978 return true;
1e3335de 3979
3984aa55
JK
3980 /*
3981 * Treat ancestors of current cgroup the same way as current cgroup.
3982 * For anybody else we disallow preemption to guarantee service
3983 * fairness among cgroups.
3984 */
3985 if (!cfqg_is_descendant(cfqq->cfqg, new_cfqq->cfqg))
8682e1f1
VG
3986 return false;
3987
3988 if (cfq_slice_used(cfqq))
3989 return true;
3990
6c80731c
JK
3991 /*
3992 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
3993 */
3994 if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
3995 return true;
3996
3997 WARN_ON_ONCE(cfqq->ioprio_class != new_cfqq->ioprio_class);
8682e1f1 3998 /* Allow preemption only if we are idling on sync-noidle tree */
4d2ceea4 3999 if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
8682e1f1 4000 cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
8682e1f1
VG
4001 RB_EMPTY_ROOT(&cfqq->sort_list))
4002 return true;
4003
b53d1ed7
JA
4004 /*
4005 * So both queues are sync. Let the new request get disk time if
4006 * it's a metadata request and the current queue is doing regular IO.
4007 */
65299a3b 4008 if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
b53d1ed7
JA
4009 return true;
4010
d2d59e18
SL
 4011 /* The active queue is empty but should not be idling now, for some reason */
4012 if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
4013 return true;
4014
1e3335de 4015 if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
a6151c3a 4016 return false;
1e3335de
JA
4017
4018 /*
4019 * if this request is as-good as one we would expect from the
4020 * current cfqq, let it preempt
4021 */
e9ce335d 4022 if (cfq_rq_close(cfqd, cfqq, rq))
a6151c3a 4023 return true;
1e3335de 4024
a6151c3a 4025 return false;
22e2c507
JA
4026}
4027
4028/*
4029 * cfqq preempts the active queue. if we allowed preempt with no slice left,
4030 * let it have half of its nominal slice.
4031 */
4032static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
4033{
df0793ab
SL
4034 enum wl_type_t old_type = cfqq_type(cfqd->active_queue);
4035
7b679138 4036 cfq_log_cfqq(cfqd, cfqq, "preempt");
df0793ab 4037 cfq_slice_expired(cfqd, 1);
22e2c507 4038
f8ae6e3e
SL
4039 /*
4040 * workload type is changed, don't save slice, otherwise preempt
4041 * doesn't happen
4042 */
df0793ab 4043 if (old_type != cfqq_type(cfqq))
4d2ceea4 4044 cfqq->cfqg->saved_wl_slice = 0;
f8ae6e3e 4045
bf572256
JA
4046 /*
 4047 * Put the new queue at the front of the current list,
4048 * so we know that it will be selected next.
4049 */
4050 BUG_ON(!cfq_cfqq_on_rr(cfqq));
edd75ffd
JA
4051
4052 cfq_service_tree_add(cfqd, cfqq, 1);
eda5e0c9 4053
62a37f6b
JT
4054 cfqq->slice_end = 0;
4055 cfq_mark_cfqq_slice_new(cfqq);
22e2c507
JA
4056}
4057
22e2c507 4058/*
5e705374 4059 * Called when a new fs request (rq) is added (to cfqq). Check if there's
22e2c507
JA
4060 * something we should do about it
4061 */
4062static void
5e705374
JA
4063cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
4064 struct request *rq)
22e2c507 4065{
c5869807 4066 struct cfq_io_cq *cic = RQ_CIC(rq);
12e9fddd 4067
45333d5a 4068 cfqd->rq_queued++;
65299a3b
CH
4069 if (rq->cmd_flags & REQ_PRIO)
4070 cfqq->prio_pending++;
374f84ac 4071
383cd721 4072 cfq_update_io_thinktime(cfqd, cfqq, cic);
b2c18e1e 4073 cfq_update_io_seektime(cfqd, cfqq, rq);
9c2c38a1
JA
4074 cfq_update_idle_window(cfqd, cfqq, cic);
4075
b2c18e1e 4076 cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
22e2c507
JA
4077
4078 if (cfqq == cfqd->active_queue) {
4079 /*
b029195d
JA
4080 * Remember that we saw a request from this process, but
4081 * don't start queuing just yet. Otherwise we risk seeing lots
4082 * of tiny requests, because we disrupt the normal plugging
d6ceb25e
JA
4083 * and merging. If the request is already larger than a single
4084 * page, let it rip immediately. For that case we assume that
2d870722
JA
4085 * merging is already done. Ditto for a busy system that
4086 * has other work pending, don't risk delaying until the
4087 * idle timer unplug to continue working.
22e2c507 4088 */
d6ceb25e 4089 if (cfq_cfqq_wait_request(cfqq)) {
09cbfeaf 4090 if (blk_rq_bytes(rq) > PAGE_SIZE ||
2d870722 4091 cfqd->busy_queues > 1) {
812df48d 4092 cfq_del_timer(cfqd, cfqq);
554554f6 4093 cfq_clear_cfqq_wait_request(cfqq);
24ecfbe2 4094 __blk_run_queue(cfqd->queue);
a11cdaa7 4095 } else {
155fead9 4096 cfqg_stats_update_idle_time(cfqq->cfqg);
bf791937 4097 cfq_mark_cfqq_must_dispatch(cfqq);
a11cdaa7 4098 }
d6ceb25e 4099 }
5e705374 4100 } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
22e2c507
JA
4101 /*
4102 * not the active queue - expire current slice if it is
 4103 * idle and has expired its mean thinktime, or this new queue
3a9a3f6c
DS
4104 * has some old slice time left and is of higher priority or
4105 * this new queue is RT and the current one is BE
22e2c507
JA
4106 */
4107 cfq_preempt_queue(cfqd, cfqq);
24ecfbe2 4108 __blk_run_queue(cfqd->queue);
22e2c507 4109 }
1da177e4
LT
4110}
4111
165125e1 4112static void cfq_insert_request(struct request_queue *q, struct request *rq)
1da177e4 4113{
b4878f24 4114 struct cfq_data *cfqd = q->elevator->elevator_data;
5e705374 4115 struct cfq_queue *cfqq = RQ_CFQQ(rq);
22e2c507 4116
7b679138 4117 cfq_log_cfqq(cfqd, cfqq, "insert_request");
abede6da 4118 cfq_init_prio_data(cfqq, RQ_CIC(rq));
1da177e4 4119
9a7f38c4 4120 rq->fifo_time = ktime_get_ns() + cfqd->cfq_fifo_expire[rq_is_sync(rq)];
22e2c507 4121 list_add_tail(&rq->queuelist, &cfqq->fifo);
aa6f6a3d 4122 cfq_add_rq_rb(rq);
ef295ecf 4123 cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,
155fead9 4124 rq->cmd_flags);
5e705374 4125 cfq_rq_enqueued(cfqd, cfqq, rq);
1da177e4
LT
4126}
4127
45333d5a
AC
4128/*
4129 * Update hw_tag based on peak queue depth over 50 samples under
4130 * sufficient load.
4131 */
4132static void cfq_update_hw_tag(struct cfq_data *cfqd)
4133{
1a1238a7
SL
4134 struct cfq_queue *cfqq = cfqd->active_queue;
4135
53c583d2
CZ
4136 if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
4137 cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
e459dd08
CZ
4138
4139 if (cfqd->hw_tag == 1)
4140 return;
45333d5a
AC
4141
4142 if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
53c583d2 4143 cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
45333d5a
AC
4144 return;
4145
1a1238a7
SL
4146 /*
 4147 * If the active queue doesn't have enough requests and can idle, cfq
 4148 * might not dispatch sufficient requests to the hardware. Don't zero
 4149 * hw_tag in this case.
4150 */
4151 if (cfqq && cfq_cfqq_idle_window(cfqq) &&
4152 cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
53c583d2 4153 CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
1a1238a7
SL
4154 return;
4155
45333d5a
AC
4156 if (cfqd->hw_tag_samples++ < 50)
4157 return;
4158
e459dd08 4159 if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
45333d5a
AC
4160 cfqd->hw_tag = 1;
4161 else
4162 cfqd->hw_tag = 0;
45333d5a
AC
4163}
4164
7667aa06
VG
4165static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
4166{
c5869807 4167 struct cfq_io_cq *cic = cfqd->active_cic;
9a7f38c4 4168 u64 now = ktime_get_ns();
7667aa06 4169
02a8f01b
JT
4170 /* If the queue already has requests, don't wait */
4171 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
4172 return false;
4173
7667aa06
VG
4174 /* If there are other queues in the group, don't wait */
4175 if (cfqq->cfqg->nr_cfqq > 1)
4176 return false;
4177
7700fc4f
SL
4178 /* the only queue in the group, but think time is big */
4179 if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true))
4180 return false;
4181
7667aa06
VG
4182 if (cfq_slice_used(cfqq))
4183 return true;
4184
4185 /* if slice left is less than think time, wait busy */
383cd721 4186 if (cic && sample_valid(cic->ttime.ttime_samples)
9a7f38c4 4187 && (cfqq->slice_end - now < cic->ttime.ttime_mean))
7667aa06
VG
4188 return true;
4189
4190 /*
 4191 * If the think time is less than a jiffy then ttime_mean=0 and the above
 4192 * will not be true. It might happen that the slice has not expired yet
4193 * but will expire soon (4-5 ns) during select_queue(). To cover the
4194 * case where think time is less than a jiffy, mark the queue wait
4195 * busy if only 1 jiffy is left in the slice.
4196 */
9a7f38c4 4197 if (cfqq->slice_end - now <= jiffies_to_nsecs(1))
7667aa06
VG
4198 return true;
4199
4200 return false;
4201}
4202
165125e1 4203static void cfq_completed_request(struct request_queue *q, struct request *rq)
1da177e4 4204{
5e705374 4205 struct cfq_queue *cfqq = RQ_CFQQ(rq);
b4878f24 4206 struct cfq_data *cfqd = cfqq->cfqd;
5380a101 4207 const int sync = rq_is_sync(rq);
9a7f38c4 4208 u64 now = ktime_get_ns();
1da177e4 4209
a2b80967 4210 cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", req_noidle(rq));
1da177e4 4211
45333d5a
AC
4212 cfq_update_hw_tag(cfqd);
4213
53c583d2 4214 WARN_ON(!cfqd->rq_in_driver);
6d048f53 4215 WARN_ON(!cfqq->dispatched);
53c583d2 4216 cfqd->rq_in_driver--;
6d048f53 4217 cfqq->dispatched--;
80bdf0c7 4218 (RQ_CFQG(rq))->dispatched--;
155fead9 4219 cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq),
ef295ecf 4220 rq_io_start_time_ns(rq), rq->cmd_flags);
1da177e4 4221
53c583d2 4222 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3ed9a296 4223
365722bb 4224 if (sync) {
34b98d03 4225 struct cfq_rb_root *st;
f5f2b6ce 4226
383cd721 4227 RQ_CIC(rq)->ttime.last_end_request = now;
f5f2b6ce
SL
4228
4229 if (cfq_cfqq_on_rr(cfqq))
34b98d03 4230 st = cfqq->service_tree;
f5f2b6ce 4231 else
34b98d03
VG
4232 st = st_for(cfqq->cfqg, cfqq_class(cfqq),
4233 cfqq_type(cfqq));
4234
4235 st->ttime.last_end_request = now;
149321a6
JK
4236 /*
4237 * We have to do this check in jiffies since start_time is in
4238 * jiffies and it is not trivial to convert to ns. If
 4239 * cfq_fifo_expire[1] ever comes close to 1 jiffy, this test
4240 * will become problematic but so far we are fine (the default
4241 * is 128 ms).
4242 */
4243 if (!time_after(rq->start_time +
4244 nsecs_to_jiffies(cfqd->cfq_fifo_expire[1]),
4245 jiffies))
573412b2 4246 cfqd->last_delayed_sync = now;
365722bb 4247 }
caaa5f9f 4248
7700fc4f
SL
4249#ifdef CONFIG_CFQ_GROUP_IOSCHED
4250 cfqq->cfqg->ttime.last_end_request = now;
4251#endif
4252
caaa5f9f
JA
4253 /*
4254 * If this is the active queue, check if it needs to be expired,
4255 * or if we want to idle in case it has no pending requests.
4256 */
4257 if (cfqd->active_queue == cfqq) {
a36e71f9
JA
4258 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
4259
44f7c160
JA
4260 if (cfq_cfqq_slice_new(cfqq)) {
4261 cfq_set_prio_slice(cfqd, cfqq);
4262 cfq_clear_cfqq_slice_new(cfqq);
4263 }
f75edf2d
VG
4264
4265 /*
7667aa06
VG
4266 * Should we wait for next request to come in before we expire
4267 * the queue.
f75edf2d 4268 */
7667aa06 4269 if (cfq_should_wait_busy(cfqd, cfqq)) {
9a7f38c4 4270 u64 extend_sl = cfqd->cfq_slice_idle;
80bdf0c7
VG
4271 if (!cfqd->cfq_slice_idle)
4272 extend_sl = cfqd->cfq_group_idle;
9a7f38c4 4273 cfqq->slice_end = now + extend_sl;
f75edf2d 4274 cfq_mark_cfqq_wait_busy(cfqq);
b1ffe737 4275 cfq_log_cfqq(cfqd, cfqq, "will busy wait");
f75edf2d
VG
4276 }
4277
a36e71f9 4278 /*
8e550632
CZ
4279 * Idling is not enabled on:
4280 * - expired queues
4281 * - idle-priority queues
4282 * - async queues
4283 * - queues with still some requests queued
4284 * - when there is a close cooperator
a36e71f9 4285 */
0871714e 4286 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
e5ff082e 4287 cfq_slice_expired(cfqd, 1);
8e550632
CZ
4288 else if (sync && cfqq_empty &&
4289 !cfq_close_cooperator(cfqd, cfqq)) {
749ef9f8 4290 cfq_arm_slice_timer(cfqd);
8e550632 4291 }
caaa5f9f 4292 }
6d048f53 4293
53c583d2 4294 if (!cfqd->rq_in_driver)
23e018a1 4295 cfq_schedule_dispatch(cfqd);
1da177e4
LT
4296}
4297
ef295ecf 4298static void cfqq_boost_on_prio(struct cfq_queue *cfqq, unsigned int op)
b8269db4
JA
4299{
4300 /*
4301 * If REQ_PRIO is set, boost class and prio level, if it's below
4302 * BE/NORM. If prio is not set, restore the potentially boosted
4303 * class/prio level.
4304 */
ef295ecf 4305 if (!(op & REQ_PRIO)) {
b8269db4
JA
4306 cfqq->ioprio_class = cfqq->org_ioprio_class;
4307 cfqq->ioprio = cfqq->org_ioprio;
4308 } else {
4309 if (cfq_class_idle(cfqq))
4310 cfqq->ioprio_class = IOPRIO_CLASS_BE;
4311 if (cfqq->ioprio > IOPRIO_NORM)
4312 cfqq->ioprio = IOPRIO_NORM;
4313 }
4314}
4315
89850f7e 4316static inline int __cfq_may_queue(struct cfq_queue *cfqq)
22e2c507 4317{
1b379d8d 4318 if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
3b18152c 4319 cfq_mark_cfqq_must_alloc_slice(cfqq);
22e2c507 4320 return ELV_MQUEUE_MUST;
3b18152c 4321 }
1da177e4 4322
22e2c507 4323 return ELV_MQUEUE_MAY;
22e2c507
JA
4324}
4325
ef295ecf 4326static int cfq_may_queue(struct request_queue *q, unsigned int op)
22e2c507
JA
4327{
4328 struct cfq_data *cfqd = q->elevator->elevator_data;
4329 struct task_struct *tsk = current;
c5869807 4330 struct cfq_io_cq *cic;
22e2c507
JA
4331 struct cfq_queue *cfqq;
4332
4333 /*
4334 * don't force setup of a queue from here, as a call to may_queue
4335 * does not necessarily imply that a request actually will be queued.
4336 * so just lookup a possibly existing queue, or return 'may queue'
4337 * if that fails
4338 */
4ac845a2 4339 cic = cfq_cic_lookup(cfqd, tsk->io_context);
91fac317
VT
4340 if (!cic)
4341 return ELV_MQUEUE_MAY;
4342
ef295ecf 4343 cfqq = cic_to_cfqq(cic, op_is_sync(op));
22e2c507 4344 if (cfqq) {
abede6da 4345 cfq_init_prio_data(cfqq, cic);
ef295ecf 4346 cfqq_boost_on_prio(cfqq, op);
22e2c507 4347
89850f7e 4348 return __cfq_may_queue(cfqq);
22e2c507
JA
4349 }
4350
4351 return ELV_MQUEUE_MAY;
1da177e4
LT
4352}
4353
1da177e4
LT
4354/*
4355 * queue lock held here
4356 */
bb37b94c 4357static void cfq_put_request(struct request *rq)
1da177e4 4358{
5e705374 4359 struct cfq_queue *cfqq = RQ_CFQQ(rq);
1da177e4 4360
5e705374 4361 if (cfqq) {
22e2c507 4362 const int rw = rq_data_dir(rq);
1da177e4 4363
22e2c507
JA
4364 BUG_ON(!cfqq->allocated[rw]);
4365 cfqq->allocated[rw]--;
1da177e4 4366
7f1dc8a2 4367 /* Put down rq reference on cfqg */
eb7d8c07 4368 cfqg_put(RQ_CFQG(rq));
a612fddf
TH
4369 rq->elv.priv[0] = NULL;
4370 rq->elv.priv[1] = NULL;
7f1dc8a2 4371
1da177e4
LT
4372 cfq_put_queue(cfqq);
4373 }
4374}
4375
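/*
 * Switch the cic over to the queue cfqq was scheduled to merge with,
 * mark that queue as a cooperator and drop our reference on cfqq.
 */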
df5fe3e8 4376static struct cfq_queue *
c5869807 4377cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,
df5fe3e8
JM
4378 struct cfq_queue *cfqq)
4379{
4380 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
4381 cic_set_cfqq(cic, cfqq->new_cfqq, 1);
b3b6d040 4382 cfq_mark_cfqq_coop(cfqq->new_cfqq);
df5fe3e8
JM
4383 cfq_put_queue(cfqq);
4384 return cic_to_cfqq(cic, 1);
4385}
4386
e6c5bc73
JM
4387/*
4388 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
4389 * was the last process referring to said cfqq.
4390 */
4391static struct cfq_queue *
c5869807 4392split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
e6c5bc73
JM
4393{
4394 if (cfqq_process_refs(cfqq) == 1) {
e6c5bc73
JM
4395 cfqq->pid = current->pid;
4396 cfq_clear_cfqq_coop(cfqq);
ae54abed 4397 cfq_clear_cfqq_split_coop(cfqq);
e6c5bc73
JM
4398 return cfqq;
4399 }
4400
4401 cic_set_cfqq(cic, NULL, 1);
d02a2c07
SL
4402
4403 cfq_put_cooperator(cfqq);
4404
e6c5bc73
JM
4405 cfq_put_queue(cfqq);
4406 return NULL;
4407}
1da177e4 4408/*
22e2c507 4409 * Allocate cfq data structures associated with this request.
1da177e4 4410 */
22e2c507 4411static int
852c788f
TH
4412cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
4413 gfp_t gfp_mask)
1da177e4
LT
4414{
4415 struct cfq_data *cfqd = q->elevator->elevator_data;
f1f8cc94 4416 struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
1da177e4 4417 const int rw = rq_data_dir(rq);
a6151c3a 4418 const bool is_sync = rq_is_sync(rq);
22e2c507 4419 struct cfq_queue *cfqq;
1da177e4 4420
216284c3 4421 spin_lock_irq(q->queue_lock);
f1f8cc94 4422
598971bf 4423 check_ioprio_changed(cic, bio);
142bbdfc 4424 check_blkcg_changed(cic, bio);
e6c5bc73 4425new_queue:
91fac317 4426 cfqq = cic_to_cfqq(cic, is_sync);
32f2e807 4427 if (!cfqq || cfqq == &cfqd->oom_cfqq) {
bce6133b
TH
4428 if (cfqq)
4429 cfq_put_queue(cfqq);
2da8de0b 4430 cfqq = cfq_get_queue(cfqd, is_sync, cic, bio);
91fac317 4431 cic_set_cfqq(cic, cfqq, is_sync);
df5fe3e8 4432 } else {
e6c5bc73
JM
4433 /*
4434 * If the queue was seeky for too long, break it apart.
4435 */
ae54abed 4436 if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
e6c5bc73
JM
4437 cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
4438 cfqq = split_cfqq(cic, cfqq);
4439 if (!cfqq)
4440 goto new_queue;
4441 }
4442
df5fe3e8
JM
4443 /*
4444 * Check to see if this queue is scheduled to merge with
4445 * another, closely cooperating queue. The merging of
4446 * queues happens here as it must be done in process context.
4447 * The reference on new_cfqq was taken in merge_cfqqs.
4448 */
4449 if (cfqq->new_cfqq)
4450 cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
91fac317 4451 }
1da177e4
LT
4452
4453 cfqq->allocated[rw]++;
1da177e4 4454
6fae9c25 4455 cfqq->ref++;
eb7d8c07 4456 cfqg_get(cfqq->cfqg);
a612fddf 4457 rq->elv.priv[0] = cfqq;
1adaf3dd 4458 rq->elv.priv[1] = cfqq->cfqg;
216284c3 4459 spin_unlock_irq(q->queue_lock);
5d7f5ce1 4460
5e705374 4461 return 0;
1da177e4
LT
4462}
4463
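/*
 * Work handler for unplug_work: re-run the request queue under the queue
 * lock so that a scheduled dispatch can make progress.
 */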
65f27f38 4464static void cfq_kick_queue(struct work_struct *work)
22e2c507 4465{
65f27f38 4466 struct cfq_data *cfqd =
23e018a1 4467 container_of(work, struct cfq_data, unplug_work);
165125e1 4468 struct request_queue *q = cfqd->queue;
22e2c507 4469
40bb54d1 4470 spin_lock_irq(q->queue_lock);
24ecfbe2 4471 __blk_run_queue(cfqd->queue);
40bb54d1 4472 spin_unlock_irq(q->queue_lock);
22e2c507
JA
4473}
4474
4475/*
4476 * Timer running if the active_queue is currently idling inside its time slice
4477 */
91148325 4478static enum hrtimer_restart cfq_idle_slice_timer(struct hrtimer *timer)
22e2c507 4479{
91148325
JK
4480 struct cfq_data *cfqd = container_of(timer, struct cfq_data,
4481 idle_slice_timer);
22e2c507
JA
4482 struct cfq_queue *cfqq;
4483 unsigned long flags;
3c6bd2f8 4484 int timed_out = 1;
22e2c507 4485
7b679138
JA
4486 cfq_log(cfqd, "idle timer fired");
4487
22e2c507
JA
4488 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
4489
fe094d98
JA
4490 cfqq = cfqd->active_queue;
4491 if (cfqq) {
3c6bd2f8
JA
4492 timed_out = 0;
4493
b029195d
JA
4494 /*
4495 * We saw a request before the queue expired, let it through
4496 */
4497 if (cfq_cfqq_must_dispatch(cfqq))
4498 goto out_kick;
4499
22e2c507
JA
4500 /*
4501 * expired
4502 */
44f7c160 4503 if (cfq_slice_used(cfqq))
22e2c507
JA
4504 goto expire;
4505
4506 /*
4507 * only expire and reinvoke request handler, if there are
4508 * other queues with pending requests
4509 */
caaa5f9f 4510 if (!cfqd->busy_queues)
22e2c507 4511 goto out_cont;
22e2c507
JA
4512
4513 /*
4514 * not expired and it has a request pending, let it dispatch
4515 */
75e50984 4516 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
22e2c507 4517 goto out_kick;
76280aff
CZ
4518
4519 /*
 4520 * The queue depth flag is reset only when idling didn't succeed
4521 */
4522 cfq_clear_cfqq_deep(cfqq);
22e2c507
JA
4523 }
4524expire:
e5ff082e 4525 cfq_slice_expired(cfqd, timed_out);
22e2c507 4526out_kick:
23e018a1 4527 cfq_schedule_dispatch(cfqd);
22e2c507
JA
4528out_cont:
4529 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
91148325 4530 return HRTIMER_NORESTART;
22e2c507
JA
4531}
4532
3b18152c
JA
4533static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
4534{
91148325 4535 hrtimer_cancel(&cfqd->idle_slice_timer);
23e018a1 4536 cancel_work_sync(&cfqd->unplug_work);
3b18152c 4537}
22e2c507 4538
b374d18a 4539static void cfq_exit_queue(struct elevator_queue *e)
1da177e4 4540{
22e2c507 4541 struct cfq_data *cfqd = e->elevator_data;
165125e1 4542 struct request_queue *q = cfqd->queue;
22e2c507 4543
3b18152c 4544 cfq_shutdown_timer_wq(cfqd);
e2d74ac0 4545
d9ff4187 4546 spin_lock_irq(q->queue_lock);
e2d74ac0 4547
d9ff4187 4548 if (cfqd->active_queue)
e5ff082e 4549 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
e2d74ac0 4550
03aa264a
TH
4551 spin_unlock_irq(q->queue_lock);
4552
a90d742e
AV
4553 cfq_shutdown_timer_wq(cfqd);
4554
ffea73fc
TH
4555#ifdef CONFIG_CFQ_GROUP_IOSCHED
4556 blkcg_deactivate_policy(q, &blkcg_policy_cfq);
4557#else
f51b802c 4558 kfree(cfqd->root_group);
2abae55f 4559#endif
56edf7d7 4560 kfree(cfqd);
1da177e4
LT
4561}
4562
d50235b7 4563static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
1da177e4
LT
4564{
4565 struct cfq_data *cfqd;
3c798398 4566 struct blkcg_gq *blkg __maybe_unused;
a2b1693b 4567 int i, ret;
d50235b7
JM
4568 struct elevator_queue *eq;
4569
4570 eq = elevator_alloc(q, e);
4571 if (!eq)
4572 return -ENOMEM;
1da177e4 4573
c1b511eb 4574 cfqd = kzalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node);
d50235b7
JM
4575 if (!cfqd) {
4576 kobject_put(&eq->kobj);
b2fab5ac 4577 return -ENOMEM;
d50235b7
JM
4578 }
4579 eq->elevator_data = cfqd;
80b15c73 4580
f51b802c 4581 cfqd->queue = q;
d50235b7
JM
4582 spin_lock_irq(q->queue_lock);
4583 q->elevator = eq;
4584 spin_unlock_irq(q->queue_lock);
f51b802c 4585
1fa8f6d6
VG
4586 /* Init root service tree */
4587 cfqd->grp_service_tree = CFQ_RB_ROOT;
4588
f51b802c 4589 /* Init root group and prefer root group over other groups by default */
25fb5169 4590#ifdef CONFIG_CFQ_GROUP_IOSCHED
3c798398 4591 ret = blkcg_activate_policy(q, &blkcg_policy_cfq);
a2b1693b
TH
4592 if (ret)
4593 goto out_free;
f51b802c 4594
a2b1693b 4595 cfqd->root_group = blkg_to_cfqg(q->root_blkg);
f51b802c 4596#else
a2b1693b 4597 ret = -ENOMEM;
f51b802c
TH
4598 cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
4599 GFP_KERNEL, cfqd->queue->node);
a2b1693b
TH
4600 if (!cfqd->root_group)
4601 goto out_free;
5624a4e4 4602
a2b1693b 4603 cfq_init_cfqg_base(cfqd->root_group);
3ecca629
TH
4604 cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
4605 cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
69d7fde5 4606#endif
5624a4e4 4607
26a2ac00
JA
4608 /*
4609 * Not strictly needed (since RB_ROOT just clears the node and we
4610 * zeroed cfqd on alloc), but better be safe in case someone decides
4611 * to add magic to the rb code
4612 */
4613 for (i = 0; i < CFQ_PRIO_LISTS; i++)
4614 cfqd->prio_trees[i] = RB_ROOT;
4615
6118b70b 4616 /*
d4aad7ff 4617 * Our fallback cfqq if cfq_get_queue() runs into OOM issues.
6118b70b 4618 * Grab a permanent reference to it, so that the normal code flow
f51b802c
TH
4619 * will not attempt to free it. oom_cfqq is linked to root_group
4620 * but shouldn't hold a reference as it'll never be unlinked. Lose
4621 * the reference from linking right away.
6118b70b
JA
4622 */
4623 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
30d7b944 4624 cfqd->oom_cfqq.ref++;
1adaf3dd
TH
4625
4626 spin_lock_irq(q->queue_lock);
f51b802c 4627 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);
eb7d8c07 4628 cfqg_put(cfqd->root_group);
1adaf3dd 4629 spin_unlock_irq(q->queue_lock);
1da177e4 4630
91148325
JK
4631 hrtimer_init(&cfqd->idle_slice_timer, CLOCK_MONOTONIC,
4632 HRTIMER_MODE_REL);
22e2c507 4633 cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
22e2c507 4634
23e018a1 4635 INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
22e2c507 4636
1da177e4 4637 cfqd->cfq_quantum = cfq_quantum;
22e2c507
JA
4638 cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
4639 cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
1da177e4
LT
4640 cfqd->cfq_back_max = cfq_back_max;
4641 cfqd->cfq_back_penalty = cfq_back_penalty;
22e2c507
JA
4642 cfqd->cfq_slice[0] = cfq_slice_async;
4643 cfqd->cfq_slice[1] = cfq_slice_sync;
5bf14c07 4644 cfqd->cfq_target_latency = cfq_target_latency;
22e2c507 4645 cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
0bb97947 4646 cfqd->cfq_slice_idle = cfq_slice_idle;
80bdf0c7 4647 cfqd->cfq_group_idle = cfq_group_idle;
963b72fc 4648 cfqd->cfq_latency = 1;
e459dd08 4649 cfqd->hw_tag = -1;
edc71131
CZ
4650 /*
 4651 * we optimistically start assuming sync ops weren't delayed in the last
 4652 * second, in order to have a larger depth for async operations.
4653 */
9a7f38c4 4654 cfqd->last_delayed_sync = ktime_get_ns() - NSEC_PER_SEC;
b2fab5ac 4655 return 0;
a2b1693b
TH
4656
4657out_free:
4658 kfree(cfqd);
d50235b7 4659 kobject_put(&eq->kobj);
a2b1693b 4660 return ret;
1da177e4
LT
4661}
4662
0bb97947
JA
4663static void cfq_registered_queue(struct request_queue *q)
4664{
4665 struct elevator_queue *e = q->elevator;
4666 struct cfq_data *cfqd = e->elevator_data;
4667
4668 /*
4669 * Default to IOPS mode with no idling for SSDs
4670 */
4671 if (blk_queue_nonrot(q))
4672 cfqd->cfq_slice_idle = 0;
142bbdfc 4673 wbt_disable_default(q);
0bb97947
JA
4674}
4675
1da177e4
LT
4676/*
4677 * sysfs parts below -->
4678 */
1da177e4
LT
4679static ssize_t
4680cfq_var_show(unsigned int var, char *page)
4681{
176167ad 4682 return sprintf(page, "%u\n", var);
1da177e4
LT
4683}
4684
235f8da1 4685static void
4686cfq_var_store(unsigned int *var, const char *page)
1da177e4
LT
4687{
4688 char *p = (char *) page;
4689
4690 *var = simple_strtoul(p, &p, 10);
1da177e4
LT
4691}
4692
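/*
 * Generate the sysfs show handlers; when __CONV is set the internal
 * nanosecond value is reported in milliseconds.
 */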
1da177e4 4693#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
b374d18a 4694static ssize_t __FUNC(struct elevator_queue *e, char *page) \
1da177e4 4695{ \
3d1ab40f 4696 struct cfq_data *cfqd = e->elevator_data; \
9a7f38c4 4697 u64 __data = __VAR; \
1da177e4 4698 if (__CONV) \
9a7f38c4 4699 __data = div_u64(__data, NSEC_PER_MSEC); \
1da177e4
LT
4700 return cfq_var_show(__data, (page)); \
4701}
4702SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
22e2c507
JA
4703SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
4704SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
e572ec7e
AV
4705SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
4706SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
22e2c507 4707SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
80bdf0c7 4708SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
22e2c507
JA
4709SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
4710SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
4711SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
963b72fc 4712SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
5bf14c07 4713SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1);
1da177e4
LT
4714#undef SHOW_FUNCTION
4715
d2d481d0
JM
4716#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \
4717static ssize_t __FUNC(struct elevator_queue *e, char *page) \
4718{ \
4719 struct cfq_data *cfqd = e->elevator_data; \
4720 u64 __data = __VAR; \
4721 __data = div_u64(__data, NSEC_PER_USEC); \
4722 return cfq_var_show(__data, (page)); \
4723}
4724USEC_SHOW_FUNCTION(cfq_slice_idle_us_show, cfqd->cfq_slice_idle);
4725USEC_SHOW_FUNCTION(cfq_group_idle_us_show, cfqd->cfq_group_idle);
4726USEC_SHOW_FUNCTION(cfq_slice_sync_us_show, cfqd->cfq_slice[1]);
4727USEC_SHOW_FUNCTION(cfq_slice_async_us_show, cfqd->cfq_slice[0]);
4728USEC_SHOW_FUNCTION(cfq_target_latency_us_show, cfqd->cfq_target_latency);
4729#undef USEC_SHOW_FUNCTION
4730
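/*
 * Generate the sysfs store handlers: clamp the input to [MIN, MAX] and,
 * when __CONV is set, convert the milliseconds supplied by the user back
 * to nanoseconds.
 */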
1da177e4 4731#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
b374d18a 4732static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
1da177e4 4733{ \
3d1ab40f 4734 struct cfq_data *cfqd = e->elevator_data; \
1da177e4 4735 unsigned int __data; \
235f8da1 4736 cfq_var_store(&__data, (page)); \
1da177e4
LT
4737 if (__data < (MIN)) \
4738 __data = (MIN); \
4739 else if (__data > (MAX)) \
4740 __data = (MAX); \
4741 if (__CONV) \
9a7f38c4 4742 *(__PTR) = (u64)__data * NSEC_PER_MSEC; \
1da177e4
LT
4743 else \
4744 *(__PTR) = __data; \
235f8da1 4745 return count; \
1da177e4
LT
4746}
4747STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
fe094d98
JA
4748STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
4749 UINT_MAX, 1);
4750STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
4751 UINT_MAX, 1);
e572ec7e 4752STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
fe094d98
JA
4753STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
4754 UINT_MAX, 0);
22e2c507 4755STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
80bdf0c7 4756STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
22e2c507
JA
4757STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
4758STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
fe094d98
JA
4759STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
4760 UINT_MAX, 0);
963b72fc 4761STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
5bf14c07 4762STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1);
1da177e4
LT
4763#undef STORE_FUNCTION
4764
d2d481d0
JM
4765#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
4766static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
4767{ \
4768 struct cfq_data *cfqd = e->elevator_data; \
4769 unsigned int __data; \
235f8da1 4770 cfq_var_store(&__data, (page)); \
d2d481d0
JM
4771 if (__data < (MIN)) \
4772 __data = (MIN); \
4773 else if (__data > (MAX)) \
4774 __data = (MAX); \
4775 *(__PTR) = (u64)__data * NSEC_PER_USEC; \
235f8da1 4776 return count; \
d2d481d0
JM
4777}
4778USEC_STORE_FUNCTION(cfq_slice_idle_us_store, &cfqd->cfq_slice_idle, 0, UINT_MAX);
4779USEC_STORE_FUNCTION(cfq_group_idle_us_store, &cfqd->cfq_group_idle, 0, UINT_MAX);
4780USEC_STORE_FUNCTION(cfq_slice_sync_us_store, &cfqd->cfq_slice[1], 1, UINT_MAX);
4781USEC_STORE_FUNCTION(cfq_slice_async_us_store, &cfqd->cfq_slice[0], 1, UINT_MAX);
4782USEC_STORE_FUNCTION(cfq_target_latency_us_store, &cfqd->cfq_target_latency, 1, UINT_MAX);
4783#undef USEC_STORE_FUNCTION
4784
e572ec7e
AV
4785#define CFQ_ATTR(name) \
4786 __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
4787
4788static struct elv_fs_entry cfq_attrs[] = {
4789 CFQ_ATTR(quantum),
e572ec7e
AV
4790 CFQ_ATTR(fifo_expire_sync),
4791 CFQ_ATTR(fifo_expire_async),
4792 CFQ_ATTR(back_seek_max),
4793 CFQ_ATTR(back_seek_penalty),
4794 CFQ_ATTR(slice_sync),
d2d481d0 4795 CFQ_ATTR(slice_sync_us),
e572ec7e 4796 CFQ_ATTR(slice_async),
d2d481d0 4797 CFQ_ATTR(slice_async_us),
e572ec7e
AV
4798 CFQ_ATTR(slice_async_rq),
4799 CFQ_ATTR(slice_idle),
d2d481d0 4800 CFQ_ATTR(slice_idle_us),
80bdf0c7 4801 CFQ_ATTR(group_idle),
d2d481d0 4802 CFQ_ATTR(group_idle_us),
963b72fc 4803 CFQ_ATTR(low_latency),
5bf14c07 4804 CFQ_ATTR(target_latency),
d2d481d0 4805 CFQ_ATTR(target_latency_us),
e572ec7e 4806 __ATTR_NULL
1da177e4
LT
4807};
4808
1da177e4 4809static struct elevator_type iosched_cfq = {
c51ca6cf 4810 .ops.sq = {
1da177e4
LT
4811 .elevator_merge_fn = cfq_merge,
4812 .elevator_merged_fn = cfq_merged_request,
4813 .elevator_merge_req_fn = cfq_merged_requests,
72ef799b
TE
4814 .elevator_allow_bio_merge_fn = cfq_allow_bio_merge,
4815 .elevator_allow_rq_merge_fn = cfq_allow_rq_merge,
812d4026 4816 .elevator_bio_merged_fn = cfq_bio_merged,
b4878f24 4817 .elevator_dispatch_fn = cfq_dispatch_requests,
1da177e4 4818 .elevator_add_req_fn = cfq_insert_request,
b4878f24 4819 .elevator_activate_req_fn = cfq_activate_request,
1da177e4 4820 .elevator_deactivate_req_fn = cfq_deactivate_request,
1da177e4 4821 .elevator_completed_req_fn = cfq_completed_request,
21183b07
JA
4822 .elevator_former_req_fn = elv_rb_former_request,
4823 .elevator_latter_req_fn = elv_rb_latter_request,
9b84cacd 4824 .elevator_init_icq_fn = cfq_init_icq,
7e5a8794 4825 .elevator_exit_icq_fn = cfq_exit_icq,
1da177e4
LT
4826 .elevator_set_req_fn = cfq_set_request,
4827 .elevator_put_req_fn = cfq_put_request,
4828 .elevator_may_queue_fn = cfq_may_queue,
4829 .elevator_init_fn = cfq_init_queue,
4830 .elevator_exit_fn = cfq_exit_queue,
0bb97947 4831 .elevator_registered_fn = cfq_registered_queue,
1da177e4 4832 },
3d3c2379
TH
4833 .icq_size = sizeof(struct cfq_io_cq),
4834 .icq_align = __alignof__(struct cfq_io_cq),
3d1ab40f 4835 .elevator_attrs = cfq_attrs,
3d3c2379 4836 .elevator_name = "cfq",
1da177e4
LT
4837 .elevator_owner = THIS_MODULE,
4838};
4839
3e252066 4840#ifdef CONFIG_CFQ_GROUP_IOSCHED
3c798398 4841static struct blkcg_policy blkcg_policy_cfq = {
2ee867dc 4842 .dfl_cftypes = cfq_blkcg_files,
880f50e2 4843 .legacy_cftypes = cfq_blkcg_legacy_files,
f9fcc2d3 4844
e4a9bde9 4845 .cpd_alloc_fn = cfq_cpd_alloc,
e48453c3 4846 .cpd_init_fn = cfq_cpd_init,
e4a9bde9 4847 .cpd_free_fn = cfq_cpd_free,
69d7fde5 4848 .cpd_bind_fn = cfq_cpd_bind,
e4a9bde9 4849
001bea73 4850 .pd_alloc_fn = cfq_pd_alloc,
f9fcc2d3 4851 .pd_init_fn = cfq_pd_init,
0b39920b 4852 .pd_offline_fn = cfq_pd_offline,
001bea73 4853 .pd_free_fn = cfq_pd_free,
f9fcc2d3 4854 .pd_reset_stats_fn = cfq_pd_reset_stats,
3e252066 4855};
3e252066
VG
4856#endif
4857
1da177e4
LT
4858static int __init cfq_init(void)
4859{
3d3c2379
TH
4860 int ret;
4861
80bdf0c7 4862#ifdef CONFIG_CFQ_GROUP_IOSCHED
3c798398 4863 ret = blkcg_policy_register(&blkcg_policy_cfq);
8bd435b3
TH
4864 if (ret)
4865 return ret;
ffea73fc
TH
4866#else
4867 cfq_group_idle = 0;
4868#endif
8bd435b3 4869
fd794956 4870 ret = -ENOMEM;
3d3c2379
TH
4871 cfq_pool = KMEM_CACHE(cfq_queue, 0);
4872 if (!cfq_pool)
8bd435b3 4873 goto err_pol_unreg;
1da177e4 4874
3d3c2379 4875 ret = elv_register(&iosched_cfq);
8bd435b3
TH
4876 if (ret)
4877 goto err_free_pool;
3d3c2379 4878
2fdd82bd 4879 return 0;
8bd435b3
TH
4880
4881err_free_pool:
4882 kmem_cache_destroy(cfq_pool);
4883err_pol_unreg:
ffea73fc 4884#ifdef CONFIG_CFQ_GROUP_IOSCHED
3c798398 4885 blkcg_policy_unregister(&blkcg_policy_cfq);
ffea73fc 4886#endif
8bd435b3 4887 return ret;
1da177e4
LT
4888}
4889
4890static void __exit cfq_exit(void)
4891{
ffea73fc 4892#ifdef CONFIG_CFQ_GROUP_IOSCHED
3c798398 4893 blkcg_policy_unregister(&blkcg_policy_cfq);
ffea73fc 4894#endif
1da177e4 4895 elv_unregister(&iosched_cfq);
3d3c2379 4896 kmem_cache_destroy(cfq_pool);
1da177e4
LT
4897}
4898
4899module_init(cfq_init);
4900module_exit(cfq_exit);
4901
4902MODULE_AUTHOR("Jens Axboe");
4903MODULE_LICENSE("GPL");
4904MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");