]>
Commit | Line | Data |
---|---|---|
97bb1c57 AF |
1 | From: Eric Dumazet <edumazet@google.com> |
2 | Date: Thu, 10 May 2012 07:51:25 +0000 | |
3 | Subject: [PATCH 1/7] codel: Controlled Delay AQM | |
4 | ||
5 | commit 76e3cc126bb223013a6b9a0e2a51238d1ef2e409 upstream. | |
6 | ||
7 | An implementation of CoDel AQM, from Kathleen Nichols and Van Jacobson. | |
8 | ||
9 | http://queue.acm.org/detail.cfm?id=2209336 | |
10 | ||
11 | This AQM main input is no longer queue size in bytes or packets, but the | |
12 | delay packets stay in (FIFO) queue. | |
13 | ||
14 | As we don't have infinite memory, we still can drop packets in enqueue() | |
15 | in case of massive load, but mean of CoDel is to drop packets in | |
16 | dequeue(), using a control law based on two simple parameters : | |
17 | ||
18 | target : target sojourn time (default 5ms) | |
19 | interval : width of moving time window (default 100ms) | |
20 | ||
21 | Based on initial work from Dave Taht. | |
22 | ||
23 | Refactored to help future codel inclusion as a plugin for other linux | |
24 | qdisc (FQ_CODEL, ...), like RED. | |
25 | ||
26 | include/net/codel.h contains codel algorithm as close as possible than | |
27 | Kathleen reference. | |
28 | ||
29 | net/sched/sch_codel.c contains the linux qdisc specific glue. | |
30 | ||
31 | Separate structures permit a memory efficient implementation of fq_codel | |
32 | (to be sent as a separate work) : Each flow has its own struct | |
33 | codel_vars. | |
34 | ||
35 | timestamps are taken at enqueue() time with 1024 ns precision, allowing | |
36 | a range of 2199 seconds in queue, and 100Gb links support. iproute2 uses | |
37 | usec as base unit. | |
38 | ||
39 | Selected packets are dropped, unless ECN is enabled and packets can get | |
40 | ECN mark instead. | |
41 | ||
42 | Tested from 2Mb to 10Gb speeds with no particular problems, on ixgbe and | |
43 | tg3 drivers (BQL enabled). | |
44 | ||
45 | Usage: tc qdisc ... codel [ limit PACKETS ] [ target TIME ] | |
46 | [ interval TIME ] [ ecn ] | |
47 | ||
48 | qdisc codel 10: parent 1:1 limit 2000p target 3.0ms interval 60.0ms ecn | |
49 | Sent 13347099587 bytes 8815805 pkt (dropped 0, overlimits 0 requeues 0) | |
50 | rate 202365Kbit 16708pps backlog 113550b 75p requeues 0 | |
51 | count 116 lastcount 98 ldelay 4.3ms dropping drop_next 816us | |
52 | maxpacket 1514 ecn_mark 84399 drop_overlimit 0 | |
53 | ||
54 | CoDel must be seen as a base module, and should be used keeping in mind | |
55 | there is still a FIFO queue. So a typical setup will probably need a | |
56 | hierarchy of several qdiscs and packet classifiers to be able to meet | |
57 | whatever constraints a user might have. | |
58 | ||
59 | One possible example would be to use fq_codel, which combines Fair | |
60 | Queueing and CoDel, in replacement of sfq / sfq_red. | |
61 | ||
62 | Signed-off-by: Eric Dumazet <edumazet@google.com> | |
63 | Signed-off-by: Dave Taht <dave.taht@bufferbloat.net> | |
64 | Cc: Kathleen Nichols <nichols@pollere.com> | |
65 | Cc: Van Jacobson <van@pollere.net> | |
66 | Cc: Tom Herbert <therbert@google.com> | |
67 | Cc: Matt Mathis <mattmathis@google.com> | |
68 | Cc: Yuchung Cheng <ycheng@google.com> | |
69 | Cc: Stephen Hemminger <shemminger@vyatta.com> | |
70 | Signed-off-by: David S. Miller <davem@davemloft.net> | |
71 | --- | |
72 | include/linux/pkt_sched.h | 26 ++++ | |
73 | include/net/codel.h | 332 +++++++++++++++++++++++++++++++++++++++++++++ | |
74 | net/sched/Kconfig | 11 ++ | |
75 | net/sched/Makefile | 1 + | |
76 | net/sched/sch_codel.c | 275 +++++++++++++++++++++++++++++++++++++ | |
77 | 5 files changed, 645 insertions(+) | |
78 | create mode 100644 include/net/codel.h | |
79 | create mode 100644 net/sched/sch_codel.c | |
80 | ||
81 | diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h | |
82 | index ffe975c..cde56c2 100644 | |
83 | --- a/include/linux/pkt_sched.h | |
84 | +++ b/include/linux/pkt_sched.h | |
85 | @@ -655,4 +655,30 @@ struct tc_qfq_stats { | |
86 | __u32 lmax; | |
87 | }; | |
88 | ||
89 | +/* CODEL */ | |
90 | + | |
91 | +enum { | |
92 | + TCA_CODEL_UNSPEC, | |
93 | + TCA_CODEL_TARGET, | |
94 | + TCA_CODEL_LIMIT, | |
95 | + TCA_CODEL_INTERVAL, | |
96 | + TCA_CODEL_ECN, | |
97 | + __TCA_CODEL_MAX | |
98 | +}; | |
99 | + | |
100 | +#define TCA_CODEL_MAX (__TCA_CODEL_MAX - 1) | |
101 | + | |
102 | +struct tc_codel_xstats { | |
103 | + __u32 maxpacket; /* largest packet we've seen so far */ | |
104 | + __u32 count; /* how many drops we've done since the last time we | |
105 | + * entered dropping state | |
106 | + */ | |
107 | + __u32 lastcount; /* count at entry to dropping state */ | |
108 | + __u32 ldelay; /* in-queue delay seen by most recently dequeued packet */ | |
109 | + __s32 drop_next; /* time to drop next packet */ | |
110 | + __u32 drop_overlimit; /* number of time max qdisc packet limit was hit */ | |
111 | + __u32 ecn_mark; /* number of packets we ECN marked instead of dropped */ | |
112 | + __u32 dropping; /* are we in dropping state ? */ | |
113 | +}; | |
114 | + | |
115 | #endif | |
116 | diff --git a/include/net/codel.h b/include/net/codel.h | |
117 | new file mode 100644 | |
118 | index 0000000..bce2cef | |
119 | --- /dev/null | |
120 | +++ b/include/net/codel.h | |
121 | @@ -0,0 +1,332 @@ | |
122 | +#ifndef __NET_SCHED_CODEL_H | |
123 | +#define __NET_SCHED_CODEL_H | |
124 | + | |
125 | +/* | |
126 | + * Codel - The Controlled-Delay Active Queue Management algorithm | |
127 | + * | |
128 | + * Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com> | |
129 | + * Copyright (C) 2011-2012 Van Jacobson <van@pollere.net> | |
130 | + * Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net> | |
131 | + * Copyright (C) 2012 Eric Dumazet <edumazet@google.com> | |
132 | + * | |
133 | + * Redistribution and use in source and binary forms, with or without | |
134 | + * modification, are permitted provided that the following conditions | |
135 | + * are met: | |
136 | + * 1. Redistributions of source code must retain the above copyright | |
137 | + * notice, this list of conditions, and the following disclaimer, | |
138 | + * without modification. | |
139 | + * 2. Redistributions in binary form must reproduce the above copyright | |
140 | + * notice, this list of conditions and the following disclaimer in the | |
141 | + * documentation and/or other materials provided with the distribution. | |
142 | + * 3. The names of the authors may not be used to endorse or promote products | |
143 | + * derived from this software without specific prior written permission. | |
144 | + * | |
145 | + * Alternatively, provided that this notice is retained in full, this | |
146 | + * software may be distributed under the terms of the GNU General | |
147 | + * Public License ("GPL") version 2, in which case the provisions of the | |
148 | + * GPL apply INSTEAD OF those given above. | |
149 | + * | |
150 | + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
151 | + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
152 | + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
153 | + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
154 | + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
155 | + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
156 | + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
157 | + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
158 | + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
159 | + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
160 | + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH | |
161 | + * DAMAGE. | |
162 | + * | |
163 | + */ | |
164 | + | |
165 | +#include <linux/types.h> | |
166 | +#include <linux/ktime.h> | |
167 | +#include <linux/skbuff.h> | |
168 | +#include <net/pkt_sched.h> | |
169 | +#include <net/inet_ecn.h> | |
170 | + | |
171 | +/* Controlling Queue Delay (CoDel) algorithm | |
172 | + * ========================================= | |
173 | + * Source : Kathleen Nichols and Van Jacobson | |
174 | + * http://queue.acm.org/detail.cfm?id=2209336 | |
175 | + * | |
176 | + * Implemented on linux by Dave Taht and Eric Dumazet | |
177 | + */ | |
178 | + | |
179 | + | |
180 | +/* CoDel uses a 1024 nsec clock, encoded in u32 | |
181 | + * This gives a range of 2199 seconds, because of signed compares | |
182 | + */ | |
183 | +typedef u32 codel_time_t; | |
184 | +typedef s32 codel_tdiff_t; | |
185 | +#define CODEL_SHIFT 10 | |
186 | +#define MS2TIME(a) ((a * NSEC_PER_MSEC) >> CODEL_SHIFT) | |
187 | + | |
188 | +static inline codel_time_t codel_get_time(void) | |
189 | +{ | |
190 | + u64 ns = ktime_to_ns(ktime_get()); | |
191 | + | |
192 | + return ns >> CODEL_SHIFT; | |
193 | +} | |
194 | + | |
195 | +#define codel_time_after(a, b) ((s32)(a) - (s32)(b) > 0) | |
196 | +#define codel_time_after_eq(a, b) ((s32)(a) - (s32)(b) >= 0) | |
197 | +#define codel_time_before(a, b) ((s32)(a) - (s32)(b) < 0) | |
198 | +#define codel_time_before_eq(a, b) ((s32)(a) - (s32)(b) <= 0) | |
199 | + | |
200 | +/* Qdiscs using codel plugin must use codel_skb_cb in their own cb[] */ | |
201 | +struct codel_skb_cb { | |
202 | + codel_time_t enqueue_time; | |
203 | +}; | |
204 | + | |
205 | +static struct codel_skb_cb *get_codel_cb(const struct sk_buff *skb) | |
206 | +{ | |
207 | + qdisc_cb_private_validate(skb, sizeof(struct codel_skb_cb)); | |
208 | + return (struct codel_skb_cb *)qdisc_skb_cb(skb)->data; | |
209 | +} | |
210 | + | |
211 | +static codel_time_t codel_get_enqueue_time(const struct sk_buff *skb) | |
212 | +{ | |
213 | + return get_codel_cb(skb)->enqueue_time; | |
214 | +} | |
215 | + | |
216 | +static void codel_set_enqueue_time(struct sk_buff *skb) | |
217 | +{ | |
218 | + get_codel_cb(skb)->enqueue_time = codel_get_time(); | |
219 | +} | |
220 | + | |
221 | +static inline u32 codel_time_to_us(codel_time_t val) | |
222 | +{ | |
223 | + u64 valns = ((u64)val << CODEL_SHIFT); | |
224 | + | |
225 | + do_div(valns, NSEC_PER_USEC); | |
226 | + return (u32)valns; | |
227 | +} | |
228 | + | |
229 | +/** | |
230 | + * struct codel_params - contains codel parameters | |
231 | + * @target: target queue size (in time units) | |
232 | + * @interval: width of moving time window | |
233 | + * @ecn: is Explicit Congestion Notification enabled | |
234 | + */ | |
235 | +struct codel_params { | |
236 | + codel_time_t target; | |
237 | + codel_time_t interval; | |
238 | + bool ecn; | |
239 | +}; | |
240 | + | |
241 | +/** | |
242 | + * struct codel_vars - contains codel variables | |
243 | + * @count: how many drops we've done since the last time we | |
244 | + * entered dropping state | |
245 | + * @lastcount: count at entry to dropping state | |
246 | + * @dropping: set to true if in dropping state | |
247 | + * @first_above_time: when we went (or will go) continuously above target | |
248 | + * for interval | |
249 | + * @drop_next: time to drop next packet, or when we dropped last | |
250 | + * @ldelay: sojourn time of last dequeued packet | |
251 | + */ | |
252 | +struct codel_vars { | |
253 | + u32 count; | |
254 | + u32 lastcount; | |
255 | + bool dropping; | |
256 | + codel_time_t first_above_time; | |
257 | + codel_time_t drop_next; | |
258 | + codel_time_t ldelay; | |
259 | +}; | |
260 | + | |
261 | +/** | |
262 | + * struct codel_stats - contains codel shared variables and stats | |
263 | + * @maxpacket: largest packet we've seen so far | |
264 | + * @drop_count: temp count of dropped packets in dequeue() | |
265 | + * ecn_mark: number of packets we ECN marked instead of dropping | |
266 | + */ | |
267 | +struct codel_stats { | |
268 | + u32 maxpacket; | |
269 | + u32 drop_count; | |
270 | + u32 ecn_mark; | |
271 | +}; | |
272 | + | |
273 | +static void codel_params_init(struct codel_params *params) | |
274 | +{ | |
275 | + params->interval = MS2TIME(100); | |
276 | + params->target = MS2TIME(5); | |
277 | + params->ecn = false; | |
278 | +} | |
279 | + | |
280 | +static void codel_vars_init(struct codel_vars *vars) | |
281 | +{ | |
282 | + vars->drop_next = 0; | |
283 | + vars->first_above_time = 0; | |
284 | + vars->dropping = false; /* exit dropping state */ | |
285 | + vars->count = 0; | |
286 | + vars->lastcount = 0; | |
287 | +} | |
288 | + | |
289 | +static void codel_stats_init(struct codel_stats *stats) | |
290 | +{ | |
291 | + stats->maxpacket = 256; | |
292 | +} | |
293 | + | |
294 | +/* return interval/sqrt(x) with good precision | |
295 | + * relies on int_sqrt(unsigned long x) kernel implementation | |
296 | + */ | |
297 | +static u32 codel_inv_sqrt(u32 _interval, u32 _x) | |
298 | +{ | |
299 | + u64 interval = _interval; | |
300 | + unsigned long x = _x; | |
301 | + | |
302 | + /* Scale operands for max precision */ | |
303 | + | |
304 | +#if BITS_PER_LONG == 64 | |
305 | + x <<= 32; /* On 64bit arches, we can prescale x by 32bits */ | |
306 | + interval <<= 16; | |
307 | +#endif | |
308 | + | |
309 | + while (x < (1UL << (BITS_PER_LONG - 2))) { | |
310 | + x <<= 2; | |
311 | + interval <<= 1; | |
312 | + } | |
313 | + do_div(interval, int_sqrt(x)); | |
314 | + return (u32)interval; | |
315 | +} | |
316 | + | |
317 | +static codel_time_t codel_control_law(codel_time_t t, | |
318 | + codel_time_t interval, | |
319 | + u32 count) | |
320 | +{ | |
321 | + return t + codel_inv_sqrt(interval, count); | |
322 | +} | |
323 | + | |
324 | + | |
325 | +static bool codel_should_drop(struct sk_buff *skb, | |
326 | + unsigned int *backlog, | |
327 | + struct codel_vars *vars, | |
328 | + struct codel_params *params, | |
329 | + struct codel_stats *stats, | |
330 | + codel_time_t now) | |
331 | +{ | |
332 | + bool ok_to_drop; | |
333 | + | |
334 | + if (!skb) { | |
335 | + vars->first_above_time = 0; | |
336 | + return false; | |
337 | + } | |
338 | + | |
339 | + vars->ldelay = now - codel_get_enqueue_time(skb); | |
340 | + *backlog -= qdisc_pkt_len(skb); | |
341 | + | |
342 | + if (unlikely(qdisc_pkt_len(skb) > stats->maxpacket)) | |
343 | + stats->maxpacket = qdisc_pkt_len(skb); | |
344 | + | |
345 | + if (codel_time_before(vars->ldelay, params->target) || | |
346 | + *backlog <= stats->maxpacket) { | |
347 | + /* went below - stay below for at least interval */ | |
348 | + vars->first_above_time = 0; | |
349 | + return false; | |
350 | + } | |
351 | + ok_to_drop = false; | |
352 | + if (vars->first_above_time == 0) { | |
353 | + /* just went above from below. If we stay above | |
354 | + * for at least interval we'll say it's ok to drop | |
355 | + */ | |
356 | + vars->first_above_time = now + params->interval; | |
357 | + } else if (codel_time_after(now, vars->first_above_time)) { | |
358 | + ok_to_drop = true; | |
359 | + } | |
360 | + return ok_to_drop; | |
361 | +} | |
362 | + | |
363 | +typedef struct sk_buff * (*codel_skb_dequeue_t)(struct codel_vars *vars, | |
364 | + struct Qdisc *sch); | |
365 | + | |
366 | +static struct sk_buff *codel_dequeue(struct Qdisc *sch, | |
367 | + struct codel_params *params, | |
368 | + struct codel_vars *vars, | |
369 | + struct codel_stats *stats, | |
370 | + codel_skb_dequeue_t dequeue_func, | |
371 | + u32 *backlog) | |
372 | +{ | |
373 | + struct sk_buff *skb = dequeue_func(vars, sch); | |
374 | + codel_time_t now; | |
375 | + bool drop; | |
376 | + | |
377 | + if (!skb) { | |
378 | + vars->dropping = false; | |
379 | + return skb; | |
380 | + } | |
381 | + now = codel_get_time(); | |
382 | + drop = codel_should_drop(skb, backlog, vars, params, stats, now); | |
383 | + if (vars->dropping) { | |
384 | + if (!drop) { | |
385 | + /* sojourn time below target - leave dropping state */ | |
386 | + vars->dropping = false; | |
387 | + } else if (codel_time_after_eq(now, vars->drop_next)) { | |
388 | + /* It's time for the next drop. Drop the current | |
389 | + * packet and dequeue the next. The dequeue might | |
390 | + * take us out of dropping state. | |
391 | + * If not, schedule the next drop. | |
392 | + * A large backlog might result in drop rates so high | |
393 | + * that the next drop should happen now, | |
394 | + * hence the while loop. | |
395 | + */ | |
396 | + while (vars->dropping && | |
397 | + codel_time_after_eq(now, vars->drop_next)) { | |
398 | + if (++vars->count == 0) /* avoid zero divides */ | |
399 | + vars->count = ~0U; | |
400 | + if (params->ecn && INET_ECN_set_ce(skb)) { | |
401 | + stats->ecn_mark++; | |
402 | + vars->drop_next = | |
403 | + codel_control_law(vars->drop_next, | |
404 | + params->interval, | |
405 | + vars->count); | |
406 | + goto end; | |
407 | + } | |
408 | + qdisc_drop(skb, sch); | |
409 | + stats->drop_count++; | |
410 | + skb = dequeue_func(vars, sch); | |
411 | + if (!codel_should_drop(skb, backlog, | |
412 | + vars, params, stats, now)) { | |
413 | + /* leave dropping state */ | |
414 | + vars->dropping = false; | |
415 | + } else { | |
416 | + /* and schedule the next drop */ | |
417 | + vars->drop_next = | |
418 | + codel_control_law(vars->drop_next, | |
419 | + params->interval, | |
420 | + vars->count); | |
421 | + } | |
422 | + } | |
423 | + } | |
424 | + } else if (drop) { | |
425 | + if (params->ecn && INET_ECN_set_ce(skb)) { | |
426 | + stats->ecn_mark++; | |
427 | + } else { | |
428 | + qdisc_drop(skb, sch); | |
429 | + stats->drop_count++; | |
430 | + | |
431 | + skb = dequeue_func(vars, sch); | |
432 | + drop = codel_should_drop(skb, backlog, vars, params, | |
433 | + stats, now); | |
434 | + } | |
435 | + vars->dropping = true; | |
436 | + /* if min went above target close to when we last went below it | |
437 | + * assume that the drop rate that controlled the queue on the | |
438 | + * last cycle is a good starting point to control it now. | |
439 | + */ | |
440 | + if (codel_time_before(now - vars->drop_next, | |
441 | + 16 * params->interval)) { | |
442 | + vars->count = (vars->count - vars->lastcount) | 1; | |
443 | + } else { | |
444 | + vars->count = 1; | |
445 | + } | |
446 | + vars->lastcount = vars->count; | |
447 | + vars->drop_next = codel_control_law(now, params->interval, | |
448 | + vars->count); | |
449 | + } | |
450 | +end: | |
451 | + return skb; | |
452 | +} | |
453 | +#endif | |
454 | diff --git a/net/sched/Kconfig b/net/sched/Kconfig | |
455 | index 75b58f8..fadd252 100644 | |
456 | --- a/net/sched/Kconfig | |
457 | +++ b/net/sched/Kconfig | |
458 | @@ -250,6 +250,17 @@ config NET_SCH_QFQ | |
459 | ||
460 | If unsure, say N. | |
461 | ||
462 | +config NET_SCH_CODEL | |
463 | + tristate "Controlled Delay AQM (CODEL)" | |
464 | + help | |
465 | + Say Y here if you want to use the Controlled Delay (CODEL) | |
466 | + packet scheduling algorithm. | |
467 | + | |
468 | + To compile this driver as a module, choose M here: the module | |
469 | + will be called sch_codel. | |
470 | + | |
471 | + If unsure, say N. | |
472 | + | |
473 | config NET_SCH_INGRESS | |
474 | tristate "Ingress Qdisc" | |
475 | depends on NET_CLS_ACT | |
476 | diff --git a/net/sched/Makefile b/net/sched/Makefile | |
477 | index 8cdf4e2..30fab03 100644 | |
478 | --- a/net/sched/Makefile | |
479 | +++ b/net/sched/Makefile | |
480 | @@ -37,6 +37,7 @@ obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o | |
481 | obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o | |
482 | obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o | |
483 | obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o | |
484 | +obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o | |
485 | ||
486 | obj-$(CONFIG_NET_CLS_U32) += cls_u32.o | |
487 | obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o | |
488 | diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c | |
489 | new file mode 100644 | |
490 | index 0000000..b4a1a81 | |
491 | --- /dev/null | |
492 | +++ b/net/sched/sch_codel.c | |
493 | @@ -0,0 +1,275 @@ | |
494 | +/* | |
495 | + * Codel - The Controlled-Delay Active Queue Management algorithm | |
496 | + * | |
497 | + * Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com> | |
498 | + * Copyright (C) 2011-2012 Van Jacobson <van@pollere.net> | |
499 | + * | |
500 | + * Implemented on linux by : | |
501 | + * Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net> | |
502 | + * Copyright (C) 2012 Eric Dumazet <edumazet@google.com> | |
503 | + * | |
504 | + * Redistribution and use in source and binary forms, with or without | |
505 | + * modification, are permitted provided that the following conditions | |
506 | + * are met: | |
507 | + * 1. Redistributions of source code must retain the above copyright | |
508 | + * notice, this list of conditions, and the following disclaimer, | |
509 | + * without modification. | |
510 | + * 2. Redistributions in binary form must reproduce the above copyright | |
511 | + * notice, this list of conditions and the following disclaimer in the | |
512 | + * documentation and/or other materials provided with the distribution. | |
513 | + * 3. The names of the authors may not be used to endorse or promote products | |
514 | + * derived from this software without specific prior written permission. | |
515 | + * | |
516 | + * Alternatively, provided that this notice is retained in full, this | |
517 | + * software may be distributed under the terms of the GNU General | |
518 | + * Public License ("GPL") version 2, in which case the provisions of the | |
519 | + * GPL apply INSTEAD OF those given above. | |
520 | + * | |
521 | + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
522 | + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
523 | + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
524 | + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
525 | + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
526 | + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
527 | + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
528 | + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
529 | + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
530 | + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
531 | + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH | |
532 | + * DAMAGE. | |
533 | + * | |
534 | + */ | |
535 | + | |
536 | +#include <linux/module.h> | |
537 | +#include <linux/slab.h> | |
538 | +#include <linux/types.h> | |
539 | +#include <linux/kernel.h> | |
540 | +#include <linux/errno.h> | |
541 | +#include <linux/skbuff.h> | |
542 | +#include <net/pkt_sched.h> | |
543 | +#include <net/codel.h> | |
544 | + | |
545 | + | |
546 | +#define DEFAULT_CODEL_LIMIT 1000 | |
547 | + | |
548 | +struct codel_sched_data { | |
549 | + struct codel_params params; | |
550 | + struct codel_vars vars; | |
551 | + struct codel_stats stats; | |
552 | + u32 drop_overlimit; | |
553 | +}; | |
554 | + | |
555 | +/* This is the specific function called from codel_dequeue() | |
556 | + * to dequeue a packet from queue. Note: backlog is handled in | |
557 | + * codel, we dont need to reduce it here. | |
558 | + */ | |
559 | +static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch) | |
560 | +{ | |
561 | + struct sk_buff *skb = __skb_dequeue(&sch->q); | |
562 | + | |
563 | + prefetch(&skb->end); /* we'll need skb_shinfo() */ | |
564 | + return skb; | |
565 | +} | |
566 | + | |
567 | +static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch) | |
568 | +{ | |
569 | + struct codel_sched_data *q = qdisc_priv(sch); | |
570 | + struct sk_buff *skb; | |
571 | + | |
572 | + skb = codel_dequeue(sch, &q->params, &q->vars, &q->stats, | |
573 | + dequeue, &sch->qstats.backlog); | |
574 | + /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0, | |
575 | + * or HTB crashes. Defer it for next round. | |
576 | + */ | |
577 | + if (q->stats.drop_count && sch->q.qlen) { | |
578 | + qdisc_tree_decrease_qlen(sch, q->stats.drop_count); | |
579 | + q->stats.drop_count = 0; | |
580 | + } | |
581 | + if (skb) | |
582 | + qdisc_bstats_update(sch, skb); | |
583 | + return skb; | |
584 | +} | |
585 | + | |
586 | +static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) | |
587 | +{ | |
588 | + struct codel_sched_data *q; | |
589 | + | |
590 | + if (likely(qdisc_qlen(sch) < sch->limit)) { | |
591 | + codel_set_enqueue_time(skb); | |
592 | + return qdisc_enqueue_tail(skb, sch); | |
593 | + } | |
594 | + q = qdisc_priv(sch); | |
595 | + q->drop_overlimit++; | |
596 | + return qdisc_drop(skb, sch); | |
597 | +} | |
598 | + | |
599 | +static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { | |
600 | + [TCA_CODEL_TARGET] = { .type = NLA_U32 }, | |
601 | + [TCA_CODEL_LIMIT] = { .type = NLA_U32 }, | |
602 | + [TCA_CODEL_INTERVAL] = { .type = NLA_U32 }, | |
603 | + [TCA_CODEL_ECN] = { .type = NLA_U32 }, | |
604 | +}; | |
605 | + | |
606 | +static int codel_change(struct Qdisc *sch, struct nlattr *opt) | |
607 | +{ | |
608 | + struct codel_sched_data *q = qdisc_priv(sch); | |
609 | + struct nlattr *tb[TCA_CODEL_MAX + 1]; | |
610 | + unsigned int qlen; | |
611 | + int err; | |
612 | + | |
613 | + if (!opt) | |
614 | + return -EINVAL; | |
615 | + | |
616 | + err = nla_parse_nested(tb, TCA_CODEL_MAX, opt, codel_policy); | |
617 | + if (err < 0) | |
618 | + return err; | |
619 | + | |
620 | + sch_tree_lock(sch); | |
621 | + | |
622 | + if (tb[TCA_CODEL_TARGET]) { | |
623 | + u32 target = nla_get_u32(tb[TCA_CODEL_TARGET]); | |
624 | + | |
625 | + q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT; | |
626 | + } | |
627 | + | |
628 | + if (tb[TCA_CODEL_INTERVAL]) { | |
629 | + u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]); | |
630 | + | |
631 | + q->params.interval = ((u64)interval * NSEC_PER_USEC) >> CODEL_SHIFT; | |
632 | + } | |
633 | + | |
634 | + if (tb[TCA_CODEL_LIMIT]) | |
635 | + sch->limit = nla_get_u32(tb[TCA_CODEL_LIMIT]); | |
636 | + | |
637 | + if (tb[TCA_CODEL_ECN]) | |
638 | + q->params.ecn = !!nla_get_u32(tb[TCA_CODEL_ECN]); | |
639 | + | |
640 | + qlen = sch->q.qlen; | |
641 | + while (sch->q.qlen > sch->limit) { | |
642 | + struct sk_buff *skb = __skb_dequeue(&sch->q); | |
643 | + | |
644 | + sch->qstats.backlog -= qdisc_pkt_len(skb); | |
645 | + qdisc_drop(skb, sch); | |
646 | + } | |
647 | + qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); | |
648 | + | |
649 | + sch_tree_unlock(sch); | |
650 | + return 0; | |
651 | +} | |
652 | + | |
653 | +static int codel_init(struct Qdisc *sch, struct nlattr *opt) | |
654 | +{ | |
655 | + struct codel_sched_data *q = qdisc_priv(sch); | |
656 | + | |
657 | + sch->limit = DEFAULT_CODEL_LIMIT; | |
658 | + | |
659 | + codel_params_init(&q->params); | |
660 | + codel_vars_init(&q->vars); | |
661 | + codel_stats_init(&q->stats); | |
662 | + | |
663 | + if (opt) { | |
664 | + int err = codel_change(sch, opt); | |
665 | + | |
666 | + if (err) | |
667 | + return err; | |
668 | + } | |
669 | + | |
670 | + if (sch->limit >= 1) | |
671 | + sch->flags |= TCQ_F_CAN_BYPASS; | |
672 | + else | |
673 | + sch->flags &= ~TCQ_F_CAN_BYPASS; | |
674 | + | |
675 | + return 0; | |
676 | +} | |
677 | + | |
678 | +static int codel_dump(struct Qdisc *sch, struct sk_buff *skb) | |
679 | +{ | |
680 | + struct codel_sched_data *q = qdisc_priv(sch); | |
681 | + struct nlattr *opts; | |
682 | + | |
683 | + opts = nla_nest_start(skb, TCA_OPTIONS); | |
684 | + if (opts == NULL) | |
685 | + goto nla_put_failure; | |
686 | + | |
687 | + if (nla_put_u32(skb, TCA_CODEL_TARGET, | |
688 | + codel_time_to_us(q->params.target)) || | |
689 | + nla_put_u32(skb, TCA_CODEL_LIMIT, | |
690 | + sch->limit) || | |
691 | + nla_put_u32(skb, TCA_CODEL_INTERVAL, | |
692 | + codel_time_to_us(q->params.interval)) || | |
693 | + nla_put_u32(skb, TCA_CODEL_ECN, | |
694 | + q->params.ecn)) | |
695 | + goto nla_put_failure; | |
696 | + | |
697 | + return nla_nest_end(skb, opts); | |
698 | + | |
699 | +nla_put_failure: | |
700 | + nla_nest_cancel(skb, opts); | |
701 | + return -1; | |
702 | +} | |
703 | + | |
704 | +static int codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) | |
705 | +{ | |
706 | + const struct codel_sched_data *q = qdisc_priv(sch); | |
707 | + struct tc_codel_xstats st = { | |
708 | + .maxpacket = q->stats.maxpacket, | |
709 | + .count = q->vars.count, | |
710 | + .lastcount = q->vars.lastcount, | |
711 | + .drop_overlimit = q->drop_overlimit, | |
712 | + .ldelay = codel_time_to_us(q->vars.ldelay), | |
713 | + .dropping = q->vars.dropping, | |
714 | + .ecn_mark = q->stats.ecn_mark, | |
715 | + }; | |
716 | + | |
717 | + if (q->vars.dropping) { | |
718 | + codel_tdiff_t delta = q->vars.drop_next - codel_get_time(); | |
719 | + | |
720 | + if (delta >= 0) | |
721 | + st.drop_next = codel_time_to_us(delta); | |
722 | + else | |
723 | + st.drop_next = -codel_time_to_us(-delta); | |
724 | + } | |
725 | + | |
726 | + return gnet_stats_copy_app(d, &st, sizeof(st)); | |
727 | +} | |
728 | + | |
729 | +static void codel_reset(struct Qdisc *sch) | |
730 | +{ | |
731 | + struct codel_sched_data *q = qdisc_priv(sch); | |
732 | + | |
733 | + qdisc_reset_queue(sch); | |
734 | + codel_vars_init(&q->vars); | |
735 | +} | |
736 | + | |
737 | +static struct Qdisc_ops codel_qdisc_ops __read_mostly = { | |
738 | + .id = "codel", | |
739 | + .priv_size = sizeof(struct codel_sched_data), | |
740 | + | |
741 | + .enqueue = codel_qdisc_enqueue, | |
742 | + .dequeue = codel_qdisc_dequeue, | |
743 | + .peek = qdisc_peek_dequeued, | |
744 | + .init = codel_init, | |
745 | + .reset = codel_reset, | |
746 | + .change = codel_change, | |
747 | + .dump = codel_dump, | |
748 | + .dump_stats = codel_dump_stats, | |
749 | + .owner = THIS_MODULE, | |
750 | +}; | |
751 | + | |
752 | +static int __init codel_module_init(void) | |
753 | +{ | |
754 | + return register_qdisc(&codel_qdisc_ops); | |
755 | +} | |
756 | + | |
757 | +static void __exit codel_module_exit(void) | |
758 | +{ | |
759 | + unregister_qdisc(&codel_qdisc_ops); | |
760 | +} | |
761 | + | |
762 | +module_init(codel_module_init) | |
763 | +module_exit(codel_module_exit) | |
764 | + | |
765 | +MODULE_DESCRIPTION("Controlled Delay queue discipline"); | |
766 | +MODULE_AUTHOR("Dave Taht"); | |
767 | +MODULE_AUTHOR("Eric Dumazet"); | |
768 | +MODULE_LICENSE("Dual BSD/GPL"); | |
769 | -- | |
770 | 1.7.10 | |
771 |