1 /* $OpenBSD: altq_rio.c,v 1.10 2007/06/17 19:58:58 jasper Exp $ */
2 /* $KAME: altq_rio.c,v 1.8 2000/12/14 08:12:46 thorpej Exp $ */
3
4 /*
5 * Copyright (C) 1998-2000
6 * Sony Computer Science Laboratories Inc. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29 /*
30 * Copyright (c) 1990-1994 Regents of the University of California.
31 * All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the Computer Systems
44 * Engineering Group at Lawrence Berkeley Laboratory.
45 * 4. Neither the name of the University nor of the Laboratory may be used
46 * to endorse or promote products derived from this software without
47 * specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 */
61
62 #include <sys/param.h>
63 #include <sys/malloc.h>
64 #include <sys/mbuf.h>
65 #include <sys/socket.h>
66 #include <sys/systm.h>
67 #include <sys/errno.h>
68
69 #include <net/if.h>
70 #include <net/if_types.h>
71
72 #include <netinet/in.h>
73 #include <netinet/in_systm.h>
74 #include <netinet/ip.h>
75 #ifdef INET6
76 #include <netinet/ip6.h>
77 #endif
78
79 #include <net/pfvar.h>
80 #include <altq/altq.h>
81 #include <altq/altq_cdnr.h>
82 #include <altq/altq_red.h>
83 #include <altq/altq_rio.h>
84
85 /*
86 * RIO: RED with IN/OUT bit
87 * described in
88 * "Explicit Allocation of Best Effort Packet Delivery Service"
89 * David D. Clark and Wenjia Fang, MIT Lab for Computer Science
90 * http://diffserv.lcs.mit.edu/Papers/exp-alloc-ddc-wf.{ps,pdf}
91 *
92 * this implementation is extended to support more than 2 drop precedence
93 * values as described in RFC2597 (Assured Forwarding PHB Group).
94 *
95 */
96 /*
97 * AF DS (differentiated service) codepoints.
98 * (classes can be mapped to CBQ or H-FSC classes.)
99 *
100 * 0 1 2 3 4 5 6 7
101 * +---+---+---+---+---+---+---+---+
102 * | CLASS |DropPre| 0 | CU |
103 * +---+---+---+---+---+---+---+---+
104 *
105 * class 1: 001
106 * class 2: 010
107 * class 3: 011
108 * class 4: 100
109 *
110 * low drop prec: 01
111 * medium drop prec: 10
112 * high drop prec: 01
113 */
114
115 /* normal red parameters */
116 #define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */
117 /* q_weight = 0.00195 */
118
119 /* red parameters for a slow link */
120 #define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */
121 /* q_weight = 0.0078125 */
122
123 /* red parameters for a very slow link (e.g., dialup) */
124 #define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */
125 /* q_weight = 0.015625 */
126
127 /* fixed-point uses 12-bit decimal places */
128 #define FP_SHIFT 12 /* fixed-point shift */
129
130 /* red parameters for drop probability */
131 #define INV_P_MAX 10 /* inverse of max drop probability */
132 #define TH_MIN 5 /* min threshold */
133 #define TH_MAX 15 /* max threshold */
134
135 #define RIO_LIMIT 60 /* default max queue length */
136 #define RIO_STATS /* collect statistics */
137
138 #define TV_DELTA(a, b, delta) { \
139 int xxs; \
140 \
141 delta = (a)->tv_usec - (b)->tv_usec; \
142 if ((xxs = (a)->tv_sec - (b)->tv_sec) != 0) { \
143 if (xxs < 0) { \
144 delta = 60000000; \
145 } else if (xxs > 4) { \
146 if (xxs > 60) \
147 delta = 60000000; \
148 else \
149 delta += xxs * 1000000; \
150 } else while (xxs > 0) { \
151 delta += 1000000; \
152 xxs--; \
153 } \
154 } \
155 }
156
157 /* default rio parameter values */
158 static struct redparams default_rio_params[RIO_NDROPPREC] = {
159 /* th_min, th_max, inv_pmax */
160 { TH_MAX * 2 + TH_MIN, TH_MAX * 3, INV_P_MAX }, /* low drop precedence */
161 { TH_MAX + TH_MIN, TH_MAX * 2, INV_P_MAX }, /* medium drop precedence */
162 { TH_MIN, TH_MAX, INV_P_MAX } /* high drop precedence */
163 };
164
165 /* internal function prototypes */
166 static int dscp2index(u_int8_t);
167
168 rio_t *
169 rio_alloc(int weight, struct redparams *params, int flags, int pkttime)
170 {
171 rio_t *rp;
172 int w, i;
173 int npkts_per_sec;
174
175 MALLOC(rp, rio_t *, sizeof(rio_t), M_DEVBUF, M_WAITOK);
176 if (rp == NULL)
177 return (NULL);
178 bzero(rp, sizeof(rio_t));
179
180 rp->rio_flags = flags;
181 if (pkttime == 0)
182 /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
183 rp->rio_pkttime = 800;
184 else
185 rp->rio_pkttime = pkttime;
186
187 if (weight != 0)
188 rp->rio_weight = weight;
189 else {
190 /* use default */
191 rp->rio_weight = W_WEIGHT;
192
193 /* when the link is very slow, adjust red parameters */
194 npkts_per_sec = 1000000 / rp->rio_pkttime;
195 if (npkts_per_sec < 50) {
196 /* up to about 400Kbps */
197 rp->rio_weight = W_WEIGHT_2;
198 } else if (npkts_per_sec < 300) {
199 /* up to about 2.4Mbps */
200 rp->rio_weight = W_WEIGHT_1;
201 }
202 }
203
204 /* calculate wshift. weight must be power of 2 */
205 w = rp->rio_weight;
206 for (i = 0; w > 1; i++)
207 w = w >> 1;
208 rp->rio_wshift = i;
209 w = 1 << rp->rio_wshift;
210 if (w != rp->rio_weight) {
211 printf("invalid weight value %d for red! use %d\n",
212 rp->rio_weight, w);
213 rp->rio_weight = w;
214 }
215
216 /* allocate weight table */
217 rp->rio_wtab = wtab_alloc(rp->rio_weight);
218
219 for (i = 0; i < RIO_NDROPPREC; i++) {
220 struct dropprec_state *prec = &rp->rio_precstate[i];
221
222 prec->avg = 0;
223 prec->idle = 1;
224
225 if (params == NULL || params[i].inv_pmax == 0)
226 prec->inv_pmax = default_rio_params[i].inv_pmax;
227 else
228 prec->inv_pmax = params[i].inv_pmax;
229 if (params == NULL || params[i].th_min == 0)
230 prec->th_min = default_rio_params[i].th_min;
231 else
232 prec->th_min = params[i].th_min;
233 if (params == NULL || params[i].th_max == 0)
234 prec->th_max = default_rio_params[i].th_max;
235 else
236 prec->th_max = params[i].th_max;
237
238 /*
239 * th_min_s and th_max_s are scaled versions of th_min
240 * and th_max to be compared with avg.
241 */
242 prec->th_min_s = prec->th_min << (rp->rio_wshift + FP_SHIFT);
243 prec->th_max_s = prec->th_max << (rp->rio_wshift + FP_SHIFT);
244
245 /*
246 * precompute probability denominator
247 * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
248 */
249 prec->probd = (2 * (prec->th_max - prec->th_min)
250 * prec->inv_pmax) << FP_SHIFT;
251
252 microtime(&prec->last);
253 }
254
255 return (rp);
256 }
257
258 void
259 rio_destroy(rio_t *rp)
260 {
261 wtab_destroy(rp->rio_wtab);
262 FREE(rp, M_DEVBUF);
263 }
264
265 void
266 rio_getstats(rio_t *rp, struct redstats *sp)
267 {
268 int i;
269
270 for (i = 0; i < RIO_NDROPPREC; i++) {
271 bcopy(&rp->q_stats[i], sp, sizeof(struct redstats));
272 sp->q_avg = rp->rio_precstate[i].avg >> rp->rio_wshift;
273 sp++;
274 }
275 }
276
277 #if (RIO_NDROPPREC == 3)
278 /*
279 * internally, a drop precedence value is converted to an index
280 * starting from 0.
281 */
282 static int
283 dscp2index(u_int8_t dscp)
284 {
285 int dpindex = dscp & AF_DROPPRECMASK;
286
287 if (dpindex == 0)
288 return (0);
289 return ((dpindex >> 3) - 1);
290 }
291 #endif
292
293 #if 1
294 /*
295 * kludge: when a packet is dequeued, we need to know its drop precedence
296 * in order to keep the queue length of each drop precedence.
297 * use m_pkthdr.rcvif to pass this info.
298 */
299 #define RIOM_SET_PRECINDEX(m, idx) \
300 do { (m)->m_pkthdr.rcvif = (struct ifnet *)((long)(idx)); } while (0)
301 #define RIOM_GET_PRECINDEX(m) \
302 ({ long idx; idx = (long)((m)->m_pkthdr.rcvif); \
303 (m)->m_pkthdr.rcvif = NULL; idx; })
304 #endif
305
306 int
307 rio_addq(rio_t *rp, class_queue_t *q, struct mbuf *m,
308 struct altq_pktattr *pktattr)
309 {
310 int avg, droptype;
311 u_int8_t dsfield, odsfield;
312 int dpindex, i, n, t;
313 struct timeval now;
314 struct dropprec_state *prec;
315
316 dsfield = odsfield = read_dsfield(m, pktattr);
317 dpindex = dscp2index(dsfield);
318
319 /*
320 * update avg of the precedence states whose drop precedence
321 * is larger than or equal to the drop precedence of the packet
322 */
323 now.tv_sec = 0;
324 for (i = dpindex; i < RIO_NDROPPREC; i++) {
325 prec = &rp->rio_precstate[i];
326 avg = prec->avg;
327 if (prec->idle) {
328 prec->idle = 0;
329 if (now.tv_sec == 0)
330 microtime(&now);
331 t = (now.tv_sec - prec->last.tv_sec);
332 if (t > 60)
333 avg = 0;
334 else {
335 t = t * 1000000 +
336 (now.tv_usec - prec->last.tv_usec);
337 n = t / rp->rio_pkttime;
338 /* calculate (avg = (1 - Wq)^n * avg) */
339 if (n > 0)
340 avg = (avg >> FP_SHIFT) *
341 pow_w(rp->rio_wtab, n);
342 }
343 }
344
345 /* run estimator. (avg is scaled by WEIGHT in fixed-point) */
346 avg += (prec->qlen << FP_SHIFT) - (avg >> rp->rio_wshift);
347 prec->avg = avg; /* save the new value */
348 /*
349 * count keeps a tally of arriving traffic that has not
350 * been dropped.
351 */
352 prec->count++;
353 }
354
355 prec = &rp->rio_precstate[dpindex];
356 avg = prec->avg;
357
358 /* see if we drop early */
359 droptype = DTYPE_NODROP;
360 if (avg >= prec->th_min_s && prec->qlen > 1) {
361 if (avg >= prec->th_max_s) {
362 /* avg >= th_max: forced drop */
363 droptype = DTYPE_FORCED;
364 } else if (prec->old == 0) {
365 /* first exceeds th_min */
366 prec->count = 1;
367 prec->old = 1;
368 } else if (drop_early((avg - prec->th_min_s) >> rp->rio_wshift,
369 prec->probd, prec->count)) {
370 /* unforced drop by red */
371 droptype = DTYPE_EARLY;
372 }
373 } else {
374 /* avg < th_min */
375 prec->old = 0;
376 }
377
378 /*
379 * if the queue length hits the hard limit, it's a forced drop.
380 */
381 if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
382 droptype = DTYPE_FORCED;
383
384 if (droptype != DTYPE_NODROP) {
385 /* always drop incoming packet (as opposed to randomdrop) */
386 for (i = dpindex; i < RIO_NDROPPREC; i++)
387 rp->rio_precstate[i].count = 0;
388 #ifdef RIO_STATS
389 if (droptype == DTYPE_EARLY)
390 rp->q_stats[dpindex].drop_unforced++;
391 else
392 rp->q_stats[dpindex].drop_forced++;
393 PKTCNTR_ADD(&rp->q_stats[dpindex].drop_cnt, m_pktlen(m));
394 #endif
395 m_freem(m);
396 return (-1);
397 }
398
399 for (i = dpindex; i < RIO_NDROPPREC; i++)
400 rp->rio_precstate[i].qlen++;
401
402 /* save drop precedence index in mbuf hdr */
403 RIOM_SET_PRECINDEX(m, dpindex);
404
405 if (rp->rio_flags & RIOF_CLEARDSCP)
406 dsfield &= ~DSCP_MASK;
407
408 if (dsfield != odsfield)
409 write_dsfield(m, pktattr, dsfield);
410
411 _addq(q, m);
412
413 #ifdef RIO_STATS
414 PKTCNTR_ADD(&rp->q_stats[dpindex].xmit_cnt, m_pktlen(m));
415 #endif
416 return (0);
417 }
418
419 struct mbuf *
420 rio_getq(rio_t *rp, class_queue_t *q)
421 {
422 struct mbuf *m;
423 int dpindex, i;
424
425 if ((m = _getq(q)) == NULL)
426 return NULL;
427
428 dpindex = RIOM_GET_PRECINDEX(m);
429 for (i = dpindex; i < RIO_NDROPPREC; i++) {
430 if (--rp->rio_precstate[i].qlen == 0) {
431 if (rp->rio_precstate[i].idle == 0) {
432 rp->rio_precstate[i].idle = 1;
433 microtime(&rp->rio_precstate[i].last);
434 }
435 }
436 }
437 return (m);
438 }