]>
Commit | Line | Data |
---|---|---|
e898660a AF |
1 | From d4b36210c2e6ecef0ce52fb6c18c51144f5c2d88 Mon Sep 17 00:00:00 2001 |
2 | From: Vijay Subramanian <vijaynsu@cisco.com> | |
3 | Date: Sat, 4 Jan 2014 17:33:55 -0800 | |
4 | Subject: net: pkt_sched: PIE AQM scheme | |
5 | ||
6 | Proportional Integral controller Enhanced (PIE) is a scheduler to address the | |
7 | bufferbloat problem. | |
8 | ||
9 | From the IETF draft below: | |
10 | " Bufferbloat is a phenomenon where excess buffers in the network cause high | |
11 | latency and jitter. As more and more interactive applications (e.g. voice over | |
12 | IP, real time video streaming and financial transactions) run in the Internet, | |
13 | high latency and jitter degrade application performance. There is a pressing | |
14 | need to design intelligent queue management schemes that can control latency and | |
15 | jitter; and hence provide desirable quality of service to users. | |
16 | ||
17 | We present here a lightweight design, PIE(Proportional Integral controller | |
18 | Enhanced) that can effectively control the average queueing latency to a target | |
19 | value. Simulation results, theoretical analysis and Linux testbed results have | |
20 | shown that PIE can ensure low latency and achieve high link utilization under | |
21 | various congestion situations. The design does not require per-packet | |
22 | timestamp, so it incurs very small overhead and is simple enough to implement | |
23 | in both hardware and software. " | |
24 | ||
25 | Many thanks to Dave Taht for extensive feedback, reviews, testing and | |
26 | suggestions. Thanks also to Stephen Hemminger and Eric Dumazet for reviews and | |
27 | suggestions. Naeem Khademi and Dave Taht independently contributed to ECN | |
28 | support. | |
29 | ||
30 | For more information, please see technical paper about PIE in the IEEE | |
31 | Conference on High Performance Switching and Routing 2013. A copy of the paper | |
32 | can be found at ftp://ftpeng.cisco.com/pie/. | |
33 | ||
34 | Please also refer to the IETF draft submission at | |
35 | http://tools.ietf.org/html/draft-pan-tsvwg-pie-00 | |
36 | ||
37 | All relevant code, documents and test scripts and results can be found at | |
38 | ftp://ftpeng.cisco.com/pie/. | |
39 | ||
40 | For problems with the iproute2/tc or Linux kernel code, please contact Vijay | |
41 | Subramanian (vijaynsu@cisco.com or subramanian.vijay@gmail.com) Mythili Prabhu | |
42 | (mysuryan@cisco.com) | |
43 | ||
44 | Signed-off-by: Vijay Subramanian <subramanian.vijay@gmail.com> | |
45 | Signed-off-by: Mythili Prabhu <mysuryan@cisco.com> | |
46 | CC: Dave Taht <dave.taht@bufferbloat.net> | |
47 | Signed-off-by: David S. Miller <davem@davemloft.net> | |
48 | ||
49 | diff -Naur linux-3.10.39.org/include/uapi/linux/pkt_sched.h linux-3.10.39/include/uapi/linux/pkt_sched.h | |
50 | --- linux-3.10.39.org/include/uapi/linux/pkt_sched.h 2014-05-06 16:56:24.000000000 +0200 | |
51 | +++ linux-3.10.39/include/uapi/linux/pkt_sched.h 2014-05-15 10:33:08.296828477 +0200 | |
52 | @@ -744,4 +744,29 @@ | |
53 | }; | |
54 | }; | |
55 | ||
56 | +/* PIE */ | |
57 | +enum { | |
58 | + TCA_PIE_UNSPEC, | |
59 | + TCA_PIE_TARGET, | |
60 | + TCA_PIE_LIMIT, | |
61 | + TCA_PIE_TUPDATE, | |
62 | + TCA_PIE_ALPHA, | |
63 | + TCA_PIE_BETA, | |
64 | + TCA_PIE_ECN, | |
65 | + TCA_PIE_BYTEMODE, | |
66 | + __TCA_PIE_MAX | |
67 | +}; | |
68 | +#define TCA_PIE_MAX (__TCA_PIE_MAX - 1) | |
69 | + | |
70 | +struct tc_pie_xstats { | |
71 | + __u32 prob; /* current probability */ | |
72 | + __u32 delay; /* current delay in ms */ | |
73 | + __u32 avg_dq_rate; /* current average dq_rate in bits/pie_time */ | |
74 | + __u32 packets_in; /* total number of packets enqueued */ | |
75 | + __u32 dropped; /* packets dropped due to pie_action */ | |
76 | + __u32 overlimit; /* dropped due to lack of space in queue */ | |
77 | + __u32 maxq; /* maximum queue size */ | |
78 | + __u32 ecn_mark; /* packets marked with ecn */ | |
79 | +}; | |
80 | + | |
81 | #endif | |
82 | diff -Naur linux-3.10.39.org/net/sched/Kconfig linux-3.10.39/net/sched/Kconfig | |
83 | --- linux-3.10.39.org/net/sched/Kconfig 2014-05-06 16:56:24.000000000 +0200 | |
84 | +++ linux-3.10.39/net/sched/Kconfig 2014-05-15 09:30:29.866632326 +0200 | |
85 | @@ -272,6 +272,19 @@ | |
86 | ||
87 | If unsure, say N. | |
88 | ||
89 | +config NET_SCH_PIE | |
90 | + tristate "Proportional Integral controller Enhanced (PIE) scheduler" | |
91 | + help | |
92 | + Say Y here if you want to use the Proportional Integral controller | |
93 | + Enhanced scheduler packet scheduling algorithm. | |
94 | + For more information, please see | |
95 | + http://tools.ietf.org/html/draft-pan-tsvwg-pie-00 | |
96 | + | |
97 | + To compile this driver as a module, choose M here: the module | |
98 | + will be called sch_pie. | |
99 | + | |
100 | + If unsure, say N. | |
101 | + | |
102 | config NET_SCH_INGRESS | |
103 | tristate "Ingress Qdisc" | |
104 | depends on NET_CLS_ACT | |
105 | diff -Naur linux-3.10.39.org/net/sched/Makefile linux-3.10.39/net/sched/Makefile | |
106 | --- linux-3.10.39.org/net/sched/Makefile 2014-05-06 16:56:24.000000000 +0200 | |
107 | +++ linux-3.10.39/net/sched/Makefile 2014-05-15 10:34:55.533502406 +0200 | |
108 | @@ -39,6 +39,7 @@ | |
109 | obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o | |
110 | obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o | |
111 | obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o | |
112 | +obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o | |
113 | ||
114 | obj-$(CONFIG_NET_CLS_U32) += cls_u32.o | |
115 | obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o | |
116 | diff -Naur linux-3.10.39.org/net/sched/sch_pie.c linux-3.10.39/net/sched/sch_pie.c | |
117 | --- linux-3.10.39.org/net/sched/sch_pie.c 1970-01-01 01:00:00.000000000 +0100 | |
118 | +++ linux-3.10.39/net/sched/sch_pie.c 2014-05-15 09:30:29.869966724 +0200 | |
119 | @@ -0,0 +1,555 @@ | |
120 | +/* Copyright (C) 2013 Cisco Systems, Inc, 2013. | |
121 | + * | |
122 | + * This program is free software; you can redistribute it and/or | |
123 | + * modify it under the terms of the GNU General Public License | |
124 | + * as published by the Free Software Foundation; either version 2 | |
125 | + * of the License. | |
126 | + * | |
127 | + * This program is distributed in the hope that it will be useful, | |
128 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
129 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
130 | + * GNU General Public License for more details. | |
131 | + * | |
132 | + * Author: Vijay Subramanian <vijaynsu@cisco.com> | |
133 | + * Author: Mythili Prabhu <mysuryan@cisco.com> | |
134 | + * | |
135 | + * ECN support is added by Naeem Khademi <naeemk@ifi.uio.no> | |
136 | + * University of Oslo, Norway. | |
137 | + */ | |
138 | + | |
139 | +#include <linux/module.h> | |
140 | +#include <linux/slab.h> | |
141 | +#include <linux/types.h> | |
142 | +#include <linux/kernel.h> | |
143 | +#include <linux/errno.h> | |
144 | +#include <linux/skbuff.h> | |
145 | +#include <net/pkt_sched.h> | |
146 | +#include <net/inet_ecn.h> | |
147 | + | |
148 | +#define QUEUE_THRESHOLD 10000 | |
149 | +#define DQCOUNT_INVALID -1 | |
150 | +#define MAX_PROB 0xffffffff | |
151 | +#define PIE_SCALE 8 | |
152 | + | |
153 | +/* parameters used */ | |
154 | +struct pie_params { | |
155 | + psched_time_t target; /* user specified target delay in pschedtime */ | |
156 | + u32 tupdate; /* timer frequency (in jiffies) */ | |
157 | + u32 limit; /* number of packets that can be enqueued */ | |
158 | + u32 alpha; /* alpha and beta are between -4 and 4 */ | |
159 | + u32 beta; /* and are used for shift relative to 1 */ | |
160 | + bool ecn; /* true if ecn is enabled */ | |
161 | + bool bytemode; /* to scale drop early prob based on pkt size */ | |
162 | +}; | |
163 | + | |
164 | +/* variables used */ | |
165 | +struct pie_vars { | |
166 | + u32 prob; /* probability but scaled by u32 limit. */ | |
167 | + psched_time_t burst_time; | |
168 | + psched_time_t qdelay; | |
169 | + psched_time_t qdelay_old; | |
170 | + u64 dq_count; /* measured in bytes */ | |
171 | + psched_time_t dq_tstamp; /* drain rate */ | |
172 | + u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */ | |
173 | + u32 qlen_old; /* in bytes */ | |
174 | +}; | |
175 | + | |
176 | +/* statistics gathering */ | |
177 | +struct pie_stats { | |
178 | + u32 packets_in; /* total number of packets enqueued */ | |
179 | + u32 dropped; /* packets dropped due to pie_action */ | |
180 | + u32 overlimit; /* dropped due to lack of space in queue */ | |
181 | + u32 maxq; /* maximum queue size */ | |
182 | + u32 ecn_mark; /* packets marked with ECN */ | |
183 | +}; | |
184 | + | |
185 | +/* private data for the Qdisc */ | |
186 | +struct pie_sched_data { | |
187 | + struct pie_params params; | |
188 | + struct pie_vars vars; | |
189 | + struct pie_stats stats; | |
190 | + struct timer_list adapt_timer; | |
191 | +}; | |
192 | + | |
193 | +static void pie_params_init(struct pie_params *params) | |
194 | +{ | |
195 | + params->alpha = 2; | |
196 | + params->beta = 20; | |
197 | + params->tupdate = usecs_to_jiffies(30 * USEC_PER_MSEC); /* 30 ms */ | |
198 | + params->limit = 1000; /* default of 1000 packets */ | |
199 | + params->target = PSCHED_NS2TICKS(20 * NSEC_PER_MSEC); /* 20 ms */ | |
200 | + params->ecn = false; | |
201 | + params->bytemode = false; | |
202 | +} | |
203 | + | |
204 | +static void pie_vars_init(struct pie_vars *vars) | |
205 | +{ | |
206 | + vars->dq_count = DQCOUNT_INVALID; | |
207 | + vars->avg_dq_rate = 0; | |
208 | + /* default of 100 ms in pschedtime */ | |
209 | + vars->burst_time = PSCHED_NS2TICKS(100 * NSEC_PER_MSEC); | |
210 | +} | |
211 | + | |
212 | +static bool drop_early(struct Qdisc *sch, u32 packet_size) | |
213 | +{ | |
214 | + struct pie_sched_data *q = qdisc_priv(sch); | |
215 | + u32 rnd; | |
216 | + u32 local_prob = q->vars.prob; | |
217 | + u32 mtu = psched_mtu(qdisc_dev(sch)); | |
218 | + | |
219 | + /* If there is still burst allowance left skip random early drop */ | |
220 | + if (q->vars.burst_time > 0) | |
221 | + return false; | |
222 | + | |
223 | + /* If current delay is less than half of target, and | |
224 | + * if drop prob is low already, disable early_drop | |
225 | + */ | |
226 | + if ((q->vars.qdelay < q->params.target / 2) | |
227 | + && (q->vars.prob < MAX_PROB / 5)) | |
228 | + return false; | |
229 | + | |
230 | + /* If we have fewer than 2 mtu-sized packets, disable drop_early, | |
231 | + * similar to min_th in RED | |
232 | + */ | |
233 | + if (sch->qstats.backlog < 2 * mtu) | |
234 | + return false; | |
235 | + | |
236 | + /* If bytemode is turned on, use packet size to compute new | |
237 | + * probability. Smaller packets will have lower drop prob in this case | |
238 | + */ | |
239 | + if (q->params.bytemode && packet_size <= mtu) | |
240 | + local_prob = (local_prob / mtu) * packet_size; | |
241 | + else | |
242 | + local_prob = q->vars.prob; | |
243 | + | |
244 | + rnd = net_random(); | |
245 | + if (rnd < local_prob) | |
246 | + return true; | |
247 | + | |
248 | + return false; | |
249 | +} | |
250 | + | |
251 | +static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) | |
252 | +{ | |
253 | + struct pie_sched_data *q = qdisc_priv(sch); | |
254 | + bool enqueue = false; | |
255 | + | |
256 | + if (unlikely(qdisc_qlen(sch) >= sch->limit)) { | |
257 | + q->stats.overlimit++; | |
258 | + goto out; | |
259 | + } | |
260 | + | |
261 | + if (!drop_early(sch, skb->len)) { | |
262 | + enqueue = true; | |
263 | + } else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) && | |
264 | + INET_ECN_set_ce(skb)) { | |
265 | + /* If packet is ecn capable, mark it if drop probability | |
266 | + * is lower than 10%, else drop it. | |
267 | + */ | |
268 | + q->stats.ecn_mark++; | |
269 | + enqueue = true; | |
270 | + } | |
271 | + | |
272 | + /* we can enqueue the packet */ | |
273 | + if (enqueue) { | |
274 | + q->stats.packets_in++; | |
275 | + if (qdisc_qlen(sch) > q->stats.maxq) | |
276 | + q->stats.maxq = qdisc_qlen(sch); | |
277 | + | |
278 | + return qdisc_enqueue_tail(skb, sch); | |
279 | + } | |
280 | + | |
281 | +out: | |
282 | + q->stats.dropped++; | |
283 | + return qdisc_drop(skb, sch); | |
284 | +} | |
285 | + | |
286 | +static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { | |
287 | + [TCA_PIE_TARGET] = {.type = NLA_U32}, | |
288 | + [TCA_PIE_LIMIT] = {.type = NLA_U32}, | |
289 | + [TCA_PIE_TUPDATE] = {.type = NLA_U32}, | |
290 | + [TCA_PIE_ALPHA] = {.type = NLA_U32}, | |
291 | + [TCA_PIE_BETA] = {.type = NLA_U32}, | |
292 | + [TCA_PIE_ECN] = {.type = NLA_U32}, | |
293 | + [TCA_PIE_BYTEMODE] = {.type = NLA_U32}, | |
294 | +}; | |
295 | + | |
296 | +static int pie_change(struct Qdisc *sch, struct nlattr *opt) | |
297 | +{ | |
298 | + struct pie_sched_data *q = qdisc_priv(sch); | |
299 | + struct nlattr *tb[TCA_PIE_MAX + 1]; | |
300 | + unsigned int qlen; | |
301 | + int err; | |
302 | + | |
303 | + if (!opt) | |
304 | + return -EINVAL; | |
305 | + | |
306 | + err = nla_parse_nested(tb, TCA_PIE_MAX, opt, pie_policy); | |
307 | + if (err < 0) | |
308 | + return err; | |
309 | + | |
310 | + sch_tree_lock(sch); | |
311 | + | |
312 | + /* convert from microseconds to pschedtime */ | |
313 | + if (tb[TCA_PIE_TARGET]) { | |
314 | + /* target is in us */ | |
315 | + u32 target = nla_get_u32(tb[TCA_PIE_TARGET]); | |
316 | + | |
317 | + /* convert to pschedtime */ | |
318 | + q->params.target = PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC); | |
319 | + } | |
320 | + | |
321 | + /* tupdate is in jiffies */ | |
322 | + if (tb[TCA_PIE_TUPDATE]) | |
323 | + q->params.tupdate = usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE])); | |
324 | + | |
325 | + if (tb[TCA_PIE_LIMIT]) { | |
326 | + u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]); | |
327 | + | |
328 | + q->params.limit = limit; | |
329 | + sch->limit = limit; | |
330 | + } | |
331 | + | |
332 | + if (tb[TCA_PIE_ALPHA]) | |
333 | + q->params.alpha = nla_get_u32(tb[TCA_PIE_ALPHA]); | |
334 | + | |
335 | + if (tb[TCA_PIE_BETA]) | |
336 | + q->params.beta = nla_get_u32(tb[TCA_PIE_BETA]); | |
337 | + | |
338 | + if (tb[TCA_PIE_ECN]) | |
339 | + q->params.ecn = nla_get_u32(tb[TCA_PIE_ECN]); | |
340 | + | |
341 | + if (tb[TCA_PIE_BYTEMODE]) | |
342 | + q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]); | |
343 | + | |
344 | + /* Drop excess packets if new limit is lower */ | |
345 | + qlen = sch->q.qlen; | |
346 | + while (sch->q.qlen > sch->limit) { | |
347 | + struct sk_buff *skb = __skb_dequeue(&sch->q); | |
348 | + | |
349 | + sch->qstats.backlog -= qdisc_pkt_len(skb); | |
350 | + qdisc_drop(skb, sch); | |
351 | + } | |
352 | + qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); | |
353 | + | |
354 | + sch_tree_unlock(sch); | |
355 | + return 0; | |
356 | +} | |
357 | + | |
358 | +static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) | |
359 | +{ | |
360 | + | |
361 | + struct pie_sched_data *q = qdisc_priv(sch); | |
362 | + int qlen = sch->qstats.backlog; /* current queue size in bytes */ | |
363 | + | |
364 | + /* If current queue is about 10 packets or more and dq_count is unset | |
365 | + * we have enough packets to calculate the drain rate. Save | |
366 | + * current time as dq_tstamp and start measurement cycle. | |
367 | + */ | |
368 | + if (qlen >= QUEUE_THRESHOLD && q->vars.dq_count == DQCOUNT_INVALID) { | |
369 | + q->vars.dq_tstamp = psched_get_time(); | |
370 | + q->vars.dq_count = 0; | |
371 | + } | |
372 | + | |
373 | + /* Calculate the average drain rate from this value. If queue length | |
374 | + * has receded to a small value viz., <= QUEUE_THRESHOLD bytes, reset | |
375 | + * the dq_count to -1 as we don't have enough packets to calculate the | |
376 | + * drain rate anymore. The following if block is entered only when we | |
377 | + * have a substantial queue built up (QUEUE_THRESHOLD bytes or more) | |
378 | + * and we calculate the drain rate for the threshold here. dq_count is | |
379 | + * in bytes, time difference in psched_time, hence rate is in | |
380 | + * bytes/psched_time. | |
381 | + */ | |
382 | + if (q->vars.dq_count != DQCOUNT_INVALID) { | |
383 | + q->vars.dq_count += skb->len; | |
384 | + | |
385 | + if (q->vars.dq_count >= QUEUE_THRESHOLD) { | |
386 | + psched_time_t now = psched_get_time(); | |
387 | + u32 dtime = now - q->vars.dq_tstamp; | |
388 | + u32 count = q->vars.dq_count << PIE_SCALE; | |
389 | + | |
390 | + if (dtime == 0) | |
391 | + return; | |
392 | + | |
393 | + count = count / dtime; | |
394 | + | |
395 | + if (q->vars.avg_dq_rate == 0) | |
396 | + q->vars.avg_dq_rate = count; | |
397 | + else | |
398 | + q->vars.avg_dq_rate = | |
399 | + (q->vars.avg_dq_rate - | |
400 | + (q->vars.avg_dq_rate >> 3)) + (count >> 3); | |
401 | + | |
402 | + /* If the queue has receded below the threshold, we hold | |
403 | + * on to the last drain rate calculated, else we reset | |
404 | + * dq_count to 0 to re-enter the if block when the next | |
405 | + * packet is dequeued | |
406 | + */ | |
407 | + if (qlen < QUEUE_THRESHOLD) | |
408 | + q->vars.dq_count = DQCOUNT_INVALID; | |
409 | + else { | |
410 | + q->vars.dq_count = 0; | |
411 | + q->vars.dq_tstamp = psched_get_time(); | |
412 | + } | |
413 | + | |
414 | + if (q->vars.burst_time > 0) { | |
415 | + if (q->vars.burst_time > dtime) | |
416 | + q->vars.burst_time -= dtime; | |
417 | + else | |
418 | + q->vars.burst_time = 0; | |
419 | + } | |
420 | + } | |
421 | + } | |
422 | +} | |
423 | + | |
424 | +static void calculate_probability(struct Qdisc *sch) | |
425 | +{ | |
426 | + struct pie_sched_data *q = qdisc_priv(sch); | |
427 | + u32 qlen = sch->qstats.backlog; /* queue size in bytes */ | |
428 | + psched_time_t qdelay = 0; /* in pschedtime */ | |
429 | + psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */ | |
430 | + s32 delta = 0; /* determines the change in probability */ | |
431 | + u32 oldprob; | |
432 | + u32 alpha, beta; | |
433 | + bool update_prob = true; | |
434 | + | |
435 | + q->vars.qdelay_old = q->vars.qdelay; | |
436 | + | |
437 | + if (q->vars.avg_dq_rate > 0) | |
438 | + qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate; | |
439 | + else | |
440 | + qdelay = 0; | |
441 | + | |
442 | + /* If qdelay is zero and qlen is not, it means qlen is very small, less | |
443 | + * than dequeue_rate, so we do not update probability in this round | |
444 | + */ | |
445 | + if (qdelay == 0 && qlen != 0) | |
446 | + update_prob = false; | |
447 | + | |
448 | + /* Add ranges for alpha and beta, more aggressive for high dropping | |
449 | + * mode and gentle steps for light dropping mode | |
450 | + * In light dropping mode, take gentle steps; in medium dropping mode, | |
451 | + * take medium steps; in high dropping mode, take big steps. | |
452 | + */ | |
453 | + if (q->vars.prob < MAX_PROB / 100) { | |
454 | + alpha = | |
455 | + (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7; | |
456 | + beta = | |
457 | + (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7; | |
458 | + } else if (q->vars.prob < MAX_PROB / 10) { | |
459 | + alpha = | |
460 | + (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5; | |
461 | + beta = | |
462 | + (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5; | |
463 | + } else { | |
464 | + alpha = | |
465 | + (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; | |
466 | + beta = | |
467 | + (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; | |
468 | + } | |
469 | + | |
470 | + /* alpha and beta should be between 0 and 32, in multiples of 1/16 */ | |
471 | + delta += alpha * ((qdelay - q->params.target)); | |
472 | + delta += beta * ((qdelay - qdelay_old)); | |
473 | + | |
474 | + oldprob = q->vars.prob; | |
475 | + | |
476 | + /* to ensure we increase probability in steps of no more than 2% */ | |
477 | + if (delta > (s32) (MAX_PROB / (100 / 2)) && | |
478 | + q->vars.prob >= MAX_PROB / 10) | |
479 | + delta = (MAX_PROB / 100) * 2; | |
480 | + | |
481 | + /* Non-linear drop: | |
482 | + * Tune drop probability to increase quickly for high delays(>= 250ms) | |
483 | + * 250ms is derived through experiments and provides error protection | |
484 | + */ | |
485 | + | |
486 | + if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC))) | |
487 | + delta += MAX_PROB / (100 / 2); | |
488 | + | |
489 | + q->vars.prob += delta; | |
490 | + | |
491 | + if (delta > 0) { | |
492 | + /* prevent overflow */ | |
493 | + if (q->vars.prob < oldprob) { | |
494 | + q->vars.prob = MAX_PROB; | |
495 | + /* Prevent normalization error. If probability is at | |
496 | + * maximum value already, we normalize it here, and | |
497 | + * skip the check to do a non-linear drop in the next | |
498 | + * section. | |
499 | + */ | |
500 | + update_prob = false; | |
501 | + } | |
502 | + } else { | |
503 | + /* prevent underflow */ | |
504 | + if (q->vars.prob > oldprob) | |
505 | + q->vars.prob = 0; | |
506 | + } | |
507 | + | |
508 | + /* Non-linear drop in probability: Reduce drop probability quickly if | |
509 | + * delay is 0 for 2 consecutive Tupdate periods. | |
510 | + */ | |
511 | + | |
512 | + if ((qdelay == 0) && (qdelay_old == 0) && update_prob) | |
513 | + q->vars.prob = (q->vars.prob * 98) / 100; | |
514 | + | |
515 | + q->vars.qdelay = qdelay; | |
516 | + q->vars.qlen_old = qlen; | |
517 | + | |
518 | + /* We restart the measurement cycle if the following conditions are met | |
519 | + * 1. If the delay has been low for 2 consecutive Tupdate periods | |
520 | + * 2. Calculated drop probability is zero | |
521 | + * 3. We have at least one estimate for the avg_dq_rate i.e., | |
522 | + * is a non-zero value | |
523 | + */ | |
524 | + if ((q->vars.qdelay < q->params.target / 2) && | |
525 | + (q->vars.qdelay_old < q->params.target / 2) && | |
526 | + (q->vars.prob == 0) && | |
527 | + (q->vars.avg_dq_rate > 0)) | |
528 | + pie_vars_init(&q->vars); | |
529 | +} | |
530 | + | |
531 | +static void pie_timer(unsigned long arg) | |
532 | +{ | |
533 | + struct Qdisc *sch = (struct Qdisc *)arg; | |
534 | + struct pie_sched_data *q = qdisc_priv(sch); | |
535 | + spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); | |
536 | + | |
537 | + spin_lock(root_lock); | |
538 | + calculate_probability(sch); | |
539 | + | |
540 | + /* reset the timer to fire after 'tupdate'. tupdate is in jiffies. */ | |
541 | + if (q->params.tupdate) | |
542 | + mod_timer(&q->adapt_timer, jiffies + q->params.tupdate); | |
543 | + spin_unlock(root_lock); | |
544 | + | |
545 | +} | |
546 | + | |
547 | +static int pie_init(struct Qdisc *sch, struct nlattr *opt) | |
548 | +{ | |
549 | + struct pie_sched_data *q = qdisc_priv(sch); | |
550 | + | |
551 | + pie_params_init(&q->params); | |
552 | + pie_vars_init(&q->vars); | |
553 | + sch->limit = q->params.limit; | |
554 | + | |
555 | + setup_timer(&q->adapt_timer, pie_timer, (unsigned long)sch); | |
556 | + mod_timer(&q->adapt_timer, jiffies + HZ / 2); | |
557 | + | |
558 | + if (opt) { | |
559 | + int err = pie_change(sch, opt); | |
560 | + | |
561 | + if (err) | |
562 | + return err; | |
563 | + } | |
564 | + | |
565 | + return 0; | |
566 | +} | |
567 | + | |
568 | +static int pie_dump(struct Qdisc *sch, struct sk_buff *skb) | |
569 | +{ | |
570 | + struct pie_sched_data *q = qdisc_priv(sch); | |
571 | + struct nlattr *opts; | |
572 | + | |
573 | + opts = nla_nest_start(skb, TCA_OPTIONS); | |
574 | + if (opts == NULL) | |
575 | + goto nla_put_failure; | |
576 | + | |
577 | + /* convert target from pschedtime to us */ | |
578 | + if (nla_put_u32(skb, TCA_PIE_TARGET, | |
579 | + ((u32) PSCHED_TICKS2NS(q->params.target)) / | |
580 | + NSEC_PER_USEC) || | |
581 | + nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) || | |
582 | + nla_put_u32(skb, TCA_PIE_TUPDATE, jiffies_to_usecs(q->params.tupdate)) || | |
583 | + nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) || | |
584 | + nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) || | |
585 | + nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) || | |
586 | + nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode)) | |
587 | + goto nla_put_failure; | |
588 | + | |
589 | + return nla_nest_end(skb, opts); | |
590 | + | |
591 | +nla_put_failure: | |
592 | + nla_nest_cancel(skb, opts); | |
593 | + return -1; | |
594 | + | |
595 | +} | |
596 | + | |
597 | +static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) | |
598 | +{ | |
599 | + struct pie_sched_data *q = qdisc_priv(sch); | |
600 | + struct tc_pie_xstats st = { | |
601 | + .prob = q->vars.prob, | |
602 | + .delay = ((u32) PSCHED_TICKS2NS(q->vars.qdelay)) / | |
603 | + NSEC_PER_USEC, | |
604 | + /* unscale and return dq_rate in bytes per sec */ | |
605 | + .avg_dq_rate = q->vars.avg_dq_rate * | |
606 | + (PSCHED_TICKS_PER_SEC) >> PIE_SCALE, | |
607 | + .packets_in = q->stats.packets_in, | |
608 | + .overlimit = q->stats.overlimit, | |
609 | + .maxq = q->stats.maxq, | |
610 | + .dropped = q->stats.dropped, | |
611 | + .ecn_mark = q->stats.ecn_mark, | |
612 | + }; | |
613 | + | |
614 | + return gnet_stats_copy_app(d, &st, sizeof(st)); | |
615 | +} | |
616 | + | |
617 | +static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch) | |
618 | +{ | |
619 | + struct sk_buff *skb; | |
620 | + skb = __qdisc_dequeue_head(sch, &sch->q); | |
621 | + | |
622 | + if (!skb) | |
623 | + return NULL; | |
624 | + | |
625 | + pie_process_dequeue(sch, skb); | |
626 | + return skb; | |
627 | +} | |
628 | + | |
629 | +static void pie_reset(struct Qdisc *sch) | |
630 | +{ | |
631 | + struct pie_sched_data *q = qdisc_priv(sch); | |
632 | + qdisc_reset_queue(sch); | |
633 | + pie_vars_init(&q->vars); | |
634 | +} | |
635 | + | |
636 | +static void pie_destroy(struct Qdisc *sch) | |
637 | +{ | |
638 | + struct pie_sched_data *q = qdisc_priv(sch); | |
639 | + q->params.tupdate = 0; | |
640 | + del_timer_sync(&q->adapt_timer); | |
641 | +} | |
642 | + | |
643 | +static struct Qdisc_ops pie_qdisc_ops __read_mostly = { | |
644 | + .id = "pie", | |
645 | + .priv_size = sizeof(struct pie_sched_data), | |
646 | + .enqueue = pie_qdisc_enqueue, | |
647 | + .dequeue = pie_qdisc_dequeue, | |
648 | + .peek = qdisc_peek_dequeued, | |
649 | + .init = pie_init, | |
650 | + .destroy = pie_destroy, | |
651 | + .reset = pie_reset, | |
652 | + .change = pie_change, | |
653 | + .dump = pie_dump, | |
654 | + .dump_stats = pie_dump_stats, | |
655 | + .owner = THIS_MODULE, | |
656 | +}; | |
657 | + | |
658 | +static int __init pie_module_init(void) | |
659 | +{ | |
660 | + return register_qdisc(&pie_qdisc_ops); | |
661 | +} | |
662 | + | |
663 | +static void __exit pie_module_exit(void) | |
664 | +{ | |
665 | + unregister_qdisc(&pie_qdisc_ops); | |
666 | +} | |
667 | + | |
668 | +module_init(pie_module_init); | |
669 | +module_exit(pie_module_exit); | |
670 | + | |
671 | +MODULE_DESCRIPTION("Proportional Integral controller Enhanced (PIE) scheduler"); | |
672 | +MODULE_AUTHOR("Vijay Subramanian"); | |
673 | +MODULE_AUTHOR("Mythili Prabhu"); | |
674 | +MODULE_LICENSE("GPL"); |