1 /* $OpenBSD: tcp_timer.c,v 1.39 2007/06/15 18:23:07 markus Exp $ */
2 /* $NetBSD: tcp_timer.c,v 1.14 1996/02/13 23:44:09 christos Exp $ */
3
4 /*
5 * Copyright (c) 1982, 1986, 1988, 1990, 1993
6 * The Regents of the University of California. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * @(#)tcp_timer.c 8.1 (Berkeley) 6/10/93
33 */
34
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/mbuf.h>
38 #include <sys/socket.h>
39 #include <sys/socketvar.h>
40 #include <sys/protosw.h>
41 #include <sys/kernel.h>
42
43 #include <net/route.h>
44
45 #include <netinet/in.h>
46 #include <netinet/in_systm.h>
47 #include <netinet/ip.h>
48 #include <netinet/in_pcb.h>
49 #include <netinet/ip_var.h>
50 #include <netinet/tcp.h>
51 #include <netinet/tcp_fsm.h>
52 #include <netinet/tcp_timer.h>
53 #include <netinet/tcp_var.h>
54 #include <netinet/ip_icmp.h>
55 #include <netinet/tcp_seq.h>
56
/* Idle time before the first keepalive probe is sent (see tcp_timer_init()). */
int	tcp_keepidle;
/* Interval between successive keepalive probes. */
int	tcp_keepintvl;
int	tcp_maxpersistidle;	/* max idle time in persist */
/* Upper bound on probe time; recomputed every tcp_slowtimo() tick. */
int	tcp_maxidle;

/*
 * Time to delay the ACK.  This is initialized in tcp_init(), unless
 * it's patched.
 */
int	tcp_delack_ticks;
67
/* Callout handlers for the four per-connection TCP timers. */
void	tcp_timer_rexmt(void *);
void	tcp_timer_persist(void *);
void	tcp_timer_keep(void *);
void	tcp_timer_2msl(void *);

/*
 * Dispatch table for the timer callouts; entry order must match the
 * TCPT_* timer indices used with TCP_TIMER_ARM()/TCP_TIMER_DISARM()
 * below (presumably TCPT_REXMT..TCPT_2MSL — defined in tcp_timer.h).
 */
const tcp_timer_func_t tcp_timer_funcs[TCPT_NTIMERS] = {
	tcp_timer_rexmt,
	tcp_timer_persist,
	tcp_timer_keep,
	tcp_timer_2msl,
};
79
80 /*
81 * Timer state initialization, called from tcp_init().
82 */
83 void
84 tcp_timer_init(void)
85 {
86
87 if (tcp_keepidle == 0)
88 tcp_keepidle = TCPTV_KEEP_IDLE;
89
90 if (tcp_keepintvl == 0)
91 tcp_keepintvl = TCPTV_KEEPINTVL;
92
93 if (tcp_maxpersistidle == 0)
94 tcp_maxpersistidle = TCPTV_KEEP_IDLE;
95
96 if (tcp_delack_ticks == 0)
97 tcp_delack_ticks = TCP_DELACK_TICKS;
98 }
99
100 /*
101 * Callout to process delayed ACKs for a TCPCB.
102 */
103 void
104 tcp_delack(void *arg)
105 {
106 struct tcpcb *tp = arg;
107 int s;
108
109 /*
110 * If tcp_output() wasn't able to transmit the ACK
111 * for whatever reason, it will restart the delayed
112 * ACK callout.
113 */
114
115 s = splsoftnet();
116 if (tp->t_flags & TF_DEAD) {
117 splx(s);
118 return;
119 }
120 tp->t_flags |= TF_ACKNOW;
121 (void) tcp_output(tp);
122 splx(s);
123 }
124
125 /*
126 * Tcp protocol timeout routine called every 500 ms.
127 * Updates the timers in all active tcb's and
128 * causes finite state machine actions if timers expire.
129 */
130 void
131 tcp_slowtimo()
132 {
133 int s;
134
135 s = splsoftnet();
136 tcp_maxidle = TCPTV_KEEPCNT * tcp_keepintvl;
137 #ifdef TCP_COMPAT_42
138 tcp_iss += TCP_ISSINCR/PR_SLOWHZ; /* increment iss */
139 if ((int)tcp_iss < 0)
140 tcp_iss = 0; /* XXX */
141 #else
142 tcp_iss += TCP_ISSINCR2/PR_SLOWHZ; /* increment iss */
143 #endif /* TCP_COMPAT_42 */
144 tcp_now++; /* for timestamps */
145 splx(s);
146 }
147
148 /*
149 * Cancel all timers for TCP tp.
150 */
151 void
152 tcp_canceltimers(tp)
153 struct tcpcb *tp;
154 {
155 int i;
156
157 for (i = 0; i < TCPT_NTIMERS; i++)
158 TCP_TIMER_DISARM(tp, i);
159 }
160
/*
 * Retransmit backoff multipliers, indexed by t_rxtshift; doubles up
 * to 64 and stays there for the remaining shifts.
 */
int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };

int tcp_totbackoff = 511;	/* sum of tcp_backoff[] (63 + 7*64) */
165
166 /*
167 * TCP timer processing.
168 */
169
#ifdef TCP_SACK
void	tcp_timer_freesack(struct tcpcb *);

/*
 * Release every SACK hole attached to the connection.  Called from
 * the 2MSL and REXMT timers before the scoreboard becomes stale.
 */
void
tcp_timer_freesack(struct tcpcb *tp)
{
	struct sackhole *cur, *next;

	for (cur = tp->snd_holes; cur != NULL; cur = next) {
		next = cur->next;
		pool_put(&sackhl_pool, cur);
	}
	tp->snd_holes = 0;
#ifdef TCP_FACK
	/* Reset FACK bookkeeping along with the scoreboard. */
	tp->snd_fack = tp->snd_una;
	tp->retran_data = 0;
	tp->snd_awnd = 0;
#endif /* TCP_FACK */
}
#endif /* TCP_SACK */
194
/*
 * Retransmission timer: the oldest unacknowledged segment was not
 * ACKed in time.  Back the timer off exponentially, collapse the
 * congestion window to one segment, and retransmit from snd_una.
 * Drops the connection after TCP_MAXRXTSHIFT consecutive timeouts.
 */
void
tcp_timer_rexmt(void *arg)
{
	struct tcpcb *tp = arg;
	uint32_t rto;
	int s;

	s = splsoftnet();
	/* Connection is being torn down elsewhere; bail out. */
	if (tp->t_flags & TF_DEAD) {
		splx(s);
		return;
	}

	/*
	 * A deferred path-MTU decrease is pending and the segment that
	 * triggered it is the one we are about to retransmit: apply the
	 * smaller MTU now (via a locally constructed ICMP message)
	 * rather than retransmitting at the old, too-large size.
	 */
	if ((tp->t_flags & TF_PMTUD_PEND) && tp->t_inpcb &&
	    SEQ_GEQ(tp->t_pmtud_th_seq, tp->snd_una) &&
	    SEQ_LT(tp->t_pmtud_th_seq, (int)(tp->snd_una + tp->t_maxseg))) {
		extern struct sockaddr_in icmpsrc;
		struct icmp icmp;

		tp->t_flags &= ~TF_PMTUD_PEND;

		/* XXX create fake icmp message with relevant entries */
		icmp.icmp_nextmtu = tp->t_pmtud_nextmtu;
		icmp.icmp_ip.ip_len = tp->t_pmtud_ip_len;
		icmp.icmp_ip.ip_hl = tp->t_pmtud_ip_hl;
		icmpsrc.sin_addr = tp->t_inpcb->inp_faddr;
		icmp_mtudisc(&icmp);

		/*
		 * Notify all connections to the same peer about
		 * new mss and trigger retransmit.
		 */
		in_pcbnotifyall(&tcbtable, sintosa(&icmpsrc), EMSGSIZE,
		    tcp_mtudisc);
		splx(s);
		return;
	}

#ifdef TCP_SACK
	/* The SACK scoreboard is stale once we fall back to timeout. */
	tcp_timer_freesack(tp);
#endif
	/* Too many consecutive timeouts: give up on the connection. */
	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
		tp->t_rxtshift = TCP_MAXRXTSHIFT;
		tcpstat.tcps_timeoutdrop++;
		/* tcp_drop() may free tp; only the cleanup below follows. */
		(void)tcp_drop(tp, tp->t_softerror ?
		    tp->t_softerror : ETIMEDOUT);
		goto out;
	}
	tcpstat.tcps_rexmttimeo++;
	/* Exponentially backed-off RTO, clamped to [t_rttmin, REXMTMAX]. */
	rto = TCP_REXMTVAL(tp);
	if (rto < tp->t_rttmin)
		rto = tp->t_rttmin;
	TCPT_RANGESET(tp->t_rxtcur,
	    rto * tcp_backoff[tp->t_rxtshift],
	    tp->t_rttmin, TCPTV_REXMTMAX);
	TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

	/*
	 * If we are losing and we are trying path MTU discovery,
	 * try turning it off.  This will avoid black holes in
	 * the network which suppress or fail to send "packet
	 * too big" ICMP messages.  We should ideally do
	 * lots more sophisticated searching to find the right
	 * value here...
	 */
	if (ip_mtudisc && tp->t_inpcb &&
	    TCPS_HAVEESTABLISHED(tp->t_state) &&
	    tp->t_rxtshift > TCP_MAXRXTSHIFT / 6) {
		struct inpcb *inp = tp->t_inpcb;
		struct rtentry *rt = NULL;
		struct sockaddr_in sin;

		/* No data to send means path mtu is not a problem */
		if (!inp->inp_socket->so_snd.sb_cc)
			goto leave;

		rt = in_pcbrtentry(inp);
		/* Check if path MTU discovery is disabled already */
		if (rt && (rt->rt_flags & RTF_HOST) &&
		    (rt->rt_rmx.rmx_locks & RTV_MTU))
			goto leave;

		rt = NULL;
		switch(tp->pf) {
#ifdef INET6
		case PF_INET6:
			/*
			 * We can not turn off path MTU for IPv6.
			 * Do nothing for now, maybe lower to
			 * minimum MTU.
			 */
			break;
#endif
		case PF_INET:
			bzero(&sin, sizeof(struct sockaddr_in));
			sin.sin_family = AF_INET;
			sin.sin_len = sizeof(struct sockaddr_in);
			sin.sin_addr = inp->inp_faddr;
			rt = icmp_mtudisc_clone(sintosa(&sin));
			break;
		}
		if (rt != NULL) {
			/* Disable path MTU discovery */
			if ((rt->rt_rmx.rmx_locks & RTV_MTU) == 0) {
				rt->rt_rmx.rmx_locks |= RTV_MTU;
				in_rtchange(inp, 0);
			}

			rtfree(rt);
		}
	leave:
		;
	}

	/*
	 * If losing, let the lower level know and try for
	 * a better route.  Also, if we backed off this far,
	 * our srtt estimate is probably bogus.  Clobber it
	 * so we'll take the next rtt measurement as our srtt;
	 * move the current srtt into rttvar to keep the current
	 * retransmit times until then.
	 */
	if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
		in_losing(tp->t_inpcb);
		tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
		tp->t_srtt = 0;
	}
	/* Retransmission restarts from the oldest unacked byte. */
	tp->snd_nxt = tp->snd_una;
#if defined(TCP_SACK)
	/*
	 * Note: We overload snd_last to function also as the
	 * snd_last variable described in RFC 2582
	 */
	tp->snd_last = tp->snd_max;
#endif /* TCP_SACK */
	/*
	 * If timing a segment in this window, stop the timer.
	 */
	tp->t_rtttime = 0;
#ifdef TCP_ECN
	/*
	 * if ECN is enabled, there might be a broken firewall which
	 * blocks ecn packets.  fall back to non-ecn.
	 */
	if ((tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED)
	    && tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
		tp->t_flags |= TF_DISABLE_ECN;
#endif
	/*
	 * Close the congestion window down to one segment
	 * (we'll open it by one segment for each ack we get).
	 * Since we probably have a window's worth of unacked
	 * data accumulated, this "slow start" keeps us from
	 * dumping all that data as back-to-back packets (which
	 * might overwhelm an intermediate gateway).
	 *
	 * There are two phases to the opening: Initially we
	 * open by one mss on each ack.  This makes the window
	 * size increase exponentially with time.  If the
	 * window is larger than the path can handle, this
	 * exponential growth results in dropped packet(s)
	 * almost immediately.  To get more time between
	 * drops but still "push" the network to take advantage
	 * of improving conditions, we switch from exponential
	 * to linear window opening at some threshold size.
	 * For a threshold, we use half the current window
	 * size, truncated to a multiple of the mss.
	 *
	 * (the minimum cwnd that will give us exponential
	 * growth is 2 mss.  We don't allow the threshold
	 * to go below this.)
	 */
	{
	u_long win = ulmin(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
	if (win < 2)
		win = 2;
	tp->snd_cwnd = tp->t_maxseg;
	tp->snd_ssthresh = win * tp->t_maxseg;
	tp->t_dupacks = 0;
#ifdef TCP_ECN
	tp->snd_last = tp->snd_max;
	tp->t_flags |= TF_SEND_CWR;
#endif
#if 1 /* TCP_ECN */
	/* NOTE(review): counted even when TCP_ECN is not defined -- confirm intent. */
	tcpstat.tcps_cwr_timeout++;
#endif
	}
	(void) tcp_output(tp);

 out:
	splx(s);
}
387
/*
 * Persist timer: the peer advertised a zero window.  Force out a
 * window probe and re-arm with backoff; give up only after a full
 * retransmit backoff's worth of unanswered probes.
 */
void
tcp_timer_persist(void *arg)
{
	struct tcpcb *tp = arg;
	uint32_t rto;
	int s;

	s = splsoftnet();
	/*
	 * Skip if the pcb is being destroyed, or if the retransmit
	 * timer is armed (rexmt takes precedence over persist).
	 */
	if ((tp->t_flags & TF_DEAD) ||
	    TCP_TIMER_ISARMED(tp, TCPT_REXMT)) {
		splx(s);
		return;
	}
	tcpstat.tcps_persisttimeo++;
	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed.  After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 */
	rto = TCP_REXMTVAL(tp);
	if (rto < tp->t_rttmin)
		rto = tp->t_rttmin;
	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    ((tcp_now - tp->t_rcvtime) >= tcp_maxpersistidle ||
	    (tcp_now - tp->t_rcvtime) >= rto * tcp_totbackoff)) {
		tcpstat.tcps_persistdrop++;
		/* tcp_drop() may free tp; tp is not used after this. */
		tp = tcp_drop(tp, ETIMEDOUT);
		goto out;
	}
	tcp_setpersist(tp);
	/* t_force makes tcp_output() send one byte despite the zero window. */
	tp->t_force = 1;
	(void) tcp_output(tp);
	tp->t_force = 0;
 out:
	splx(s);
}
426
/*
 * Keepalive timer.  Drops the connection if the handshake never
 * completed; otherwise, with SO_KEEPALIVE set, probes the peer and
 * re-arms, dropping after tcp_maxidle of unanswered probes.
 */
void
tcp_timer_keep(void *arg)
{
	struct tcpcb *tp = arg;
	int s;

	s = splsoftnet();
	/* Connection is being torn down; do nothing. */
	if (tp->t_flags & TF_DEAD) {
		splx(s);
		return;
	}

	tcpstat.tcps_keeptimeo++;
	/* Handshake never completed within the keepalive period. */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		goto dropit;
	if (tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE &&
	    tp->t_state <= TCPS_CLOSING) {
		/* Peer ignored probes for the whole tcp_maxidle window. */
		if ((tcp_maxidle > 0) &&
		    ((tcp_now - tp->t_rcvtime) >= tcp_keepidle + tcp_maxidle))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		tcpstat.tcps_keepprobe++;
#ifdef TCP_COMPAT_42
		/*
		 * The keepalive packet must have nonzero length
		 * to get a 4.2 host to respond.
		 */
		tcp_respond(tp, mtod(tp->t_template, caddr_t),
		    (struct mbuf *)NULL, tp->rcv_nxt - 1, tp->snd_una - 1, 0);
#else
		tcp_respond(tp, mtod(tp->t_template, caddr_t),
		    (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una - 1, 0);
#endif
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepintvl);
	} else
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

	splx(s);
	return;

dropit:
	tcpstat.tcps_keepdrops++;
	/* tcp_drop() may free tp; tp is not used after this. */
	tp = tcp_drop(tp, ETIMEDOUT);

	splx(s);
}
484
485 void
486 tcp_timer_2msl(void *arg)
487 {
488 struct tcpcb *tp = arg;
489 int s;
490
491 s = splsoftnet();
492 if (tp->t_flags & TF_DEAD) {
493 splx(s);
494 return;
495 }
496
497 #ifdef TCP_SACK
498 tcp_timer_freesack(tp);
499 #endif
500
501 if (tp->t_state != TCPS_TIME_WAIT &&
502 ((tcp_maxidle == 0) || ((tcp_now - tp->t_rcvtime) <= tcp_maxidle)))
503 TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_keepintvl);
504 else
505 tp = tcp_close(tp);
506
507 splx(s);
508 }