1 /* $OpenBSD: tcp_input.c,v 1.207 2007/06/15 18:23:06 markus Exp $ */
2 /* $NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $ */
3
4 /*
5 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
6 * The Regents of the University of California. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
33 *
34 * NRL grants permission for redistribution and use in source and binary
35 * forms, with or without modification, of the software and documentation
36 * created at NRL provided that the following conditions are met:
37 *
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 * 3. All advertising materials mentioning features or use of this software
44 * must display the following acknowledgements:
45 * This product includes software developed by the University of
46 * California, Berkeley and its contributors.
47 * This product includes software developed at the Information
48 * Technology Division, US Naval Research Laboratory.
49 * 4. Neither the name of the NRL nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
64 *
65 * The views and conclusions contained in the software and documentation
66 * are those of the authors and should not be interpreted as representing
67 * official policies, either expressed or implied, of the US Naval
68 * Research Laboratory (NRL).
69 */
70
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/mbuf.h>
74 #include <sys/protosw.h>
75 #include <sys/socket.h>
76 #include <sys/socketvar.h>
77 #include <sys/kernel.h>
78
79 #include <dev/rndvar.h>
80
81 #include <net/if.h>
82 #include <net/route.h>
83
84 #include <netinet/in.h>
85 #include <netinet/in_systm.h>
86 #include <netinet/ip.h>
87 #include <netinet/in_pcb.h>
88 #include <netinet/ip_var.h>
89 #include <netinet/tcp.h>
90 #include <netinet/tcp_fsm.h>
91 #include <netinet/tcp_seq.h>
92 #include <netinet/tcp_timer.h>
93 #include <netinet/tcp_var.h>
94 #include <netinet/tcpip.h>
95 #include <netinet/tcp_debug.h>
96
/* Scratch copy of the last IPv4 TCP/IP header, recorded for SO_DEBUG tracing. */
struct tcpiphdr tcp_saveti;

int tcp_mss_adv(struct ifnet *, int);	/* forward declaration */
100
#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

/* Scratch copy of the last IPv6 TCP/IP header, recorded for SO_DEBUG tracing. */
struct tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)      (((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)      (M_PH_LEN(m) - sizeof(struct ip6_hdr))	/* v6 payload len */
#define M_V4_LEN(m)      (M_PH_LEN(m) - sizeof(struct ip))	/* v4 payload len */
#endif /* INET6 */
112
/* Duplicate-ACK threshold that triggers fast retransmit (RFC 2581). */
int tcprexmtthresh = 3;
/* Keep-alive timeout applied while a connection is being established. */
int tcptv_keep_init = TCPTV_KEEP_INIT;

extern u_long sb_max;

/* Rate limit for RST segments we generate, with its counter and timestamp. */
int tcp_rst_ppslim = 100;		/* 100pps */
int tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

/* Rate limit applied when dropping ACKs, with its counter and timestamp. */
int tcp_ackdrop_ppslim = 100;		/* 100pps */
int tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

/* PAWS idle limit: 24 days, expressed in PR_SLOWHZ ticks (RFC 1323). */
#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))
135
136 /*
137 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
138 */
139 #ifdef INET6
140 #define ND6_HINT(tp) \
141 do { \
142 if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
143 tp->t_inpcb->inp_route6.ro_rt) { \
144 nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt, NULL, 0); \
145 } \
146 } while (0)
147 #else
148 #define ND6_HINT(tp)
149 #endif
150
151 #ifdef TCP_ECN
152 /*
153 * ECN (Explicit Congestion Notification) support based on RFC3168
154 * implementation note:
155 * snd_last is used to track a recovery phase.
156 * when cwnd is reduced, snd_last is set to snd_max.
157 * while snd_last > snd_una, the sender is in a recovery phase and
158 * its cwnd should not be reduced again.
159 * snd_last follows snd_una when not in a recovery phase.
160 */
161 #endif
162
163 /*
164 * Macro to compute ACK transmission behavior. Delay the ACK unless
165 * we have already delayed an ACK (must send an ACK every two segments).
166 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
167 * option is enabled.
168 */
169 #define TCP_SETUP_ACK(tp, tiflags) \
170 do { \
171 if ((tp)->t_flags & TF_DELACK || \
172 (tcp_ack_on_push && (tiflags) & TH_PUSH)) \
173 tp->t_flags |= TF_ACKNOW; \
174 else \
175 TCP_SET_DELACK(tp); \
176 } while (0)
177
178 /*
179 * Insert segment ti into reassembly queue of tcp with
180 * control block tp. Return TH_FIN if reassembly now includes
181 * a segment with FIN. The macro form does the common case inline
182 * (segment is the next to be received on an established connection,
183 * and the queue is empty), avoiding linkage into and removal
184 * from the queue and repetition of various conversions.
185 * Set DELACK for segments received in order, but ack immediately
186 * when segments are out of order (so fast retransmit can work).
187 */
188
189 int
190 tcp_reass(tp, th, m, tlen)
191 struct tcpcb *tp;
192 struct tcphdr *th;
193 struct mbuf *m;
194 int *tlen;
195 {
196 struct tcpqent *p, *q, *nq, *tiqe;
197 struct socket *so = tp->t_inpcb->inp_socket;
198 int flags;
199
200 /*
201 * Call with th==0 after become established to
202 * force pre-ESTABLISHED data up to user socket.
203 */
204 if (th == 0)
205 goto present;
206
207 /*
208 * Allocate a new queue entry, before we throw away any data.
209 * If we can't, just drop the packet. XXX
210 */
211 tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
212 if (tiqe == NULL) {
213 tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
214 if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
215 /* Reuse last entry since new segment fills a hole */
216 m_freem(tiqe->tcpqe_m);
217 TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
218 }
219 if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
220 /* Flush segment queue for this connection */
221 tcp_freeq(tp);
222 tcpstat.tcps_rcvmemdrop++;
223 m_freem(m);
224 return (0);
225 }
226 }
227
228 /*
229 * Find a segment which begins after this one does.
230 */
231 for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
232 p = q, q = TAILQ_NEXT(q, tcpqe_q))
233 if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
234 break;
235
236 /*
237 * If there is a preceding segment, it may provide some of
238 * our data already. If so, drop the data from the incoming
239 * segment. If it provides all of our data, drop us.
240 */
241 if (p != NULL) {
242 struct tcphdr *phdr = p->tcpqe_tcp;
243 int i;
244
245 /* conversion to int (in i) handles seq wraparound */
246 i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
247 if (i > 0) {
248 if (i >= *tlen) {
249 tcpstat.tcps_rcvduppack++;
250 tcpstat.tcps_rcvdupbyte += *tlen;
251 m_freem(m);
252 pool_put(&tcpqe_pool, tiqe);
253 return (0);
254 }
255 m_adj(m, i);
256 *tlen -= i;
257 th->th_seq += i;
258 }
259 }
260 tcpstat.tcps_rcvoopack++;
261 tcpstat.tcps_rcvoobyte += *tlen;
262
263 /*
264 * While we overlap succeeding segments trim them or,
265 * if they are completely covered, dequeue them.
266 */
267 for (; q != NULL; q = nq) {
268 struct tcphdr *qhdr = q->tcpqe_tcp;
269 int i = (th->th_seq + *tlen) - qhdr->th_seq;
270
271 if (i <= 0)
272 break;
273 if (i < qhdr->th_reseqlen) {
274 qhdr->th_seq += i;
275 qhdr->th_reseqlen -= i;
276 m_adj(q->tcpqe_m, i);
277 break;
278 }
279 nq = TAILQ_NEXT(q, tcpqe_q);
280 m_freem(q->tcpqe_m);
281 TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
282 pool_put(&tcpqe_pool, q);
283 }
284
285 /* Insert the new segment queue entry into place. */
286 tiqe->tcpqe_m = m;
287 th->th_reseqlen = *tlen;
288 tiqe->tcpqe_tcp = th;
289 if (p == NULL) {
290 TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
291 } else {
292 TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
293 }
294
295 present:
296 /*
297 * Present data to user, advancing rcv_nxt through
298 * completed sequence space.
299 */
300 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
301 return (0);
302 q = TAILQ_FIRST(&tp->t_segq);
303 if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
304 return (0);
305 if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
306 return (0);
307 do {
308 tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
309 flags = q->tcpqe_tcp->th_flags & TH_FIN;
310
311 nq = TAILQ_NEXT(q, tcpqe_q);
312 TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
313 ND6_HINT(tp);
314 if (so->so_state & SS_CANTRCVMORE)
315 m_freem(q->tcpqe_m);
316 else
317 sbappendstream(&so->so_rcv, q->tcpqe_m);
318 pool_put(&tcpqe_pool, q);
319 q = nq;
320 } while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
321 sorwakeup(so);
322 return (flags);
323 }
324
325 #ifdef INET6
326 int
327 tcp6_input(mp, offp, proto)
328 struct mbuf **mp;
329 int *offp, proto;
330 {
331 struct mbuf *m = *mp;
332
333 #if defined(NFAITH) && 0 < NFAITH
334 if (m->m_pkthdr.rcvif) {
335 if (m->m_pkthdr.rcvif->if_type == IFT_FAITH) {
336 /* XXX send icmp6 host/port unreach? */
337 m_freem(m);
338 return IPPROTO_DONE;
339 }
340 }
341 #endif
342
343 /*
344 * draft-itojun-ipv6-tcp-to-anycast
345 * better place to put this in?
346 */
347 if (m->m_flags & M_ANYCAST6) {
348 if (m->m_len >= sizeof(struct ip6_hdr)) {
349 struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
350 icmp6_error(m, ICMP6_DST_UNREACH,
351 ICMP6_DST_UNREACH_ADDR,
352 (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
353 } else
354 m_freem(m);
355 return IPPROTO_DONE;
356 }
357
358 tcp_input(m, *offp, proto);
359 return IPPROTO_DONE;
360 }
361 #endif
362
363 /*
364 * TCP input routine, follows pages 65-76 of the
365 * protocol specification dated September, 1981 very closely.
366 */
367 void
368 tcp_input(struct mbuf *m, ...)
369 {
370 struct ip *ip;
371 struct inpcb *inp;
372 u_int8_t *optp = NULL;
373 int optlen = 0;
374 int tlen, off;
375 struct tcpcb *tp = 0;
376 int tiflags;
377 struct socket *so = NULL;
378 int todrop, acked, ourfinisacked, needoutput = 0;
379 int hdroptlen = 0;
380 short ostate = 0;
381 tcp_seq iss, *reuse = NULL;
382 u_long tiwin;
383 struct tcp_opt_info opti;
384 int iphlen;
385 va_list ap;
386 struct tcphdr *th;
387 #ifdef INET6
388 struct ip6_hdr *ip6 = NULL;
389 #endif /* INET6 */
390 #ifdef IPSEC
391 struct m_tag *mtag;
392 struct tdb_ident *tdbi;
393 struct tdb *tdb;
394 int error, s;
395 #endif /* IPSEC */
396 int af;
397 #ifdef TCP_ECN
398 u_char iptos;
399 #endif
400
401 va_start(ap, m);
402 iphlen = va_arg(ap, int);
403 va_end(ap);
404
405 tcpstat.tcps_rcvtotal++;
406
407 opti.ts_present = 0;
408 opti.maxseg = 0;
409
410 /*
411 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
412 * See below for AF specific multicast.
413 */
414 if (m->m_flags & (M_BCAST|M_MCAST))
415 goto drop;
416
417 /*
418 * Before we do ANYTHING, we have to figure out if it's TCP/IPv6 or
419 * TCP/IPv4.
420 */
421 switch (mtod(m, struct ip *)->ip_v) {
422 #ifdef INET6
423 case 6:
424 af = AF_INET6;
425 break;
426 #endif
427 case 4:
428 af = AF_INET;
429 break;
430 default:
431 m_freem(m);
432 return; /*EAFNOSUPPORT*/
433 }
434
435 /*
436 * Get IP and TCP header together in first mbuf.
437 * Note: IP leaves IP header in first mbuf.
438 */
439 switch (af) {
440 case AF_INET:
441 #ifdef DIAGNOSTIC
442 if (iphlen < sizeof(struct ip)) {
443 m_freem(m);
444 return;
445 }
446 #endif /* DIAGNOSTIC */
447 break;
448 #ifdef INET6
449 case AF_INET6:
450 #ifdef DIAGNOSTIC
451 if (iphlen < sizeof(struct ip6_hdr)) {
452 m_freem(m);
453 return;
454 }
455 #endif /* DIAGNOSTIC */
456 break;
457 #endif
458 default:
459 m_freem(m);
460 return;
461 }
462
463 IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
464 if (!th) {
465 tcpstat.tcps_rcvshort++;
466 return;
467 }
468
469 tlen = m->m_pkthdr.len - iphlen;
470 ip = NULL;
471 #ifdef INET6
472 ip6 = NULL;
473 #endif
474 switch (af) {
475 case AF_INET:
476 ip = mtod(m, struct ip *);
477 if (IN_MULTICAST(ip->ip_dst.s_addr) ||
478 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
479 goto drop;
480 #ifdef TCP_ECN
481 /* save ip_tos before clearing it for checksum */
482 iptos = ip->ip_tos;
483 #endif
484 /*
485 * Checksum extended TCP header and data.
486 */
487 if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
488 if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
489 tcpstat.tcps_inhwcsum++;
490 tcpstat.tcps_rcvbadsum++;
491 goto drop;
492 }
493 if (in4_cksum(m, IPPROTO_TCP, iphlen, tlen) != 0) {
494 tcpstat.tcps_rcvbadsum++;
495 goto drop;
496 }
497 } else {
498 m->m_pkthdr.csum_flags &= ~M_TCP_CSUM_IN_OK;
499 tcpstat.tcps_inhwcsum++;
500 }
501 break;
502 #ifdef INET6
503 case AF_INET6:
504 ip6 = mtod(m, struct ip6_hdr *);
505 #ifdef TCP_ECN
506 iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
507 #endif
508
509 /* Be proactive about malicious use of IPv4 mapped address */
510 if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
511 IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
512 /* XXX stat */
513 goto drop;
514 }
515
516 /*
517 * Be proactive about unspecified IPv6 address in source.
518 * As we use all-zero to indicate unbounded/unconnected pcb,
519 * unspecified IPv6 address can be used to confuse us.
520 *
521 * Note that packets with unspecified IPv6 destination is
522 * already dropped in ip6_input.
523 */
524 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
525 /* XXX stat */
526 goto drop;
527 }
528
529 /* Discard packets to multicast */
530 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
531 /* XXX stat */
532 goto drop;
533 }
534
535 /*
536 * Checksum extended TCP header and data.
537 */
538 if (in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen)) {
539 tcpstat.tcps_rcvbadsum++;
540 goto drop;
541 }
542 break;
543 #endif
544 }
545
546 /*
547 * Check that TCP offset makes sense,
548 * pull out TCP options and adjust length. XXX
549 */
550 off = th->th_off << 2;
551 if (off < sizeof(struct tcphdr) || off > tlen) {
552 tcpstat.tcps_rcvbadoff++;
553 goto drop;
554 }
555 tlen -= off;
556 if (off > sizeof(struct tcphdr)) {
557 IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
558 if (!th) {
559 tcpstat.tcps_rcvshort++;
560 return;
561 }
562 optlen = off - sizeof(struct tcphdr);
563 optp = (u_int8_t *)(th + 1);
564 /*
565 * Do quick retrieval of timestamp options ("options
566 * prediction?"). If timestamp is the only option and it's
567 * formatted as recommended in RFC 1323 appendix A, we
568 * quickly get the values now and not bother calling
569 * tcp_dooptions(), etc.
570 */
571 if ((optlen == TCPOLEN_TSTAMP_APPA ||
572 (optlen > TCPOLEN_TSTAMP_APPA &&
573 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
574 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
575 (th->th_flags & TH_SYN) == 0) {
576 opti.ts_present = 1;
577 opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
578 opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
579 optp = NULL; /* we've parsed the options */
580 }
581 }
582 tiflags = th->th_flags;
583
584 /*
585 * Convert TCP protocol specific fields to host format.
586 */
587 NTOHL(th->th_seq);
588 NTOHL(th->th_ack);
589 NTOHS(th->th_win);
590 NTOHS(th->th_urp);
591
592 /*
593 * Locate pcb for segment.
594 */
595 findpcb:
596 switch (af) {
597 #ifdef INET6
598 case AF_INET6:
599 inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src, th->th_sport,
600 &ip6->ip6_dst, th->th_dport);
601 break;
602 #endif
603 case AF_INET:
604 inp = in_pcbhashlookup(&tcbtable, ip->ip_src, th->th_sport,
605 ip->ip_dst, th->th_dport);
606 break;
607 }
608 if (inp == 0) {
609 int inpl_flags = 0;
610 if (m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST)
611 inpl_flags = INPLOOKUP_WILDCARD;
612 ++tcpstat.tcps_pcbhashmiss;
613 switch (af) {
614 #ifdef INET6
615 case AF_INET6:
616 inp = in6_pcblookup_listen(&tcbtable,
617 &ip6->ip6_dst, th->th_dport, inpl_flags);
618 break;
619 #endif /* INET6 */
620 case AF_INET:
621 inp = in_pcblookup_listen(&tcbtable,
622 ip->ip_dst, th->th_dport, inpl_flags);
623 break;
624 }
625 /*
626 * If the state is CLOSED (i.e., TCB does not exist) then
627 * all data in the incoming segment is discarded.
628 * If the TCB exists but is in CLOSED state, it is embryonic,
629 * but should either do a listen or a connect soon.
630 */
631 if (inp == 0) {
632 ++tcpstat.tcps_noport;
633 goto dropwithreset_ratelim;
634 }
635 }
636
637 /* Check the minimum TTL for socket. */
638 if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
639 goto drop;
640
641 tp = intotcpcb(inp);
642 if (tp == 0)
643 goto dropwithreset_ratelim;
644 if (tp->t_state == TCPS_CLOSED)
645 goto drop;
646
647 /* Unscale the window into a 32-bit value. */
648 if ((tiflags & TH_SYN) == 0)
649 tiwin = th->th_win << tp->snd_scale;
650 else
651 tiwin = th->th_win;
652
653 so = inp->inp_socket;
654 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
655 union syn_cache_sa src;
656 union syn_cache_sa dst;
657
658 bzero(&src, sizeof(src));
659 bzero(&dst, sizeof(dst));
660 switch (af) {
661 #ifdef INET
662 case AF_INET:
663 src.sin.sin_len = sizeof(struct sockaddr_in);
664 src.sin.sin_family = AF_INET;
665 src.sin.sin_addr = ip->ip_src;
666 src.sin.sin_port = th->th_sport;
667
668 dst.sin.sin_len = sizeof(struct sockaddr_in);
669 dst.sin.sin_family = AF_INET;
670 dst.sin.sin_addr = ip->ip_dst;
671 dst.sin.sin_port = th->th_dport;
672 break;
673 #endif
674 #ifdef INET6
675 case AF_INET6:
676 src.sin6.sin6_len = sizeof(struct sockaddr_in6);
677 src.sin6.sin6_family = AF_INET6;
678 src.sin6.sin6_addr = ip6->ip6_src;
679 src.sin6.sin6_port = th->th_sport;
680
681 dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
682 dst.sin6.sin6_family = AF_INET6;
683 dst.sin6.sin6_addr = ip6->ip6_dst;
684 dst.sin6.sin6_port = th->th_dport;
685 break;
686 #endif /* INET6 */
687 default:
688 goto badsyn; /*sanity*/
689 }
690
691 if (so->so_options & SO_DEBUG) {
692 ostate = tp->t_state;
693 switch (af) {
694 #ifdef INET6
695 case AF_INET6:
696 bcopy(ip6, &tcp_saveti6.ti6_i, sizeof(*ip6));
697 bcopy(th, &tcp_saveti6.ti6_t, sizeof(*th));
698 break;
699 #endif
700 case AF_INET:
701 bcopy(ip, &tcp_saveti.ti_i, sizeof(*ip));
702 bcopy(th, &tcp_saveti.ti_t, sizeof(*th));
703 break;
704 }
705 }
706 if (so->so_options & SO_ACCEPTCONN) {
707 if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
708 if (tiflags & TH_RST) {
709 syn_cache_reset(&src.sa, &dst.sa, th);
710 } else if ((tiflags & (TH_ACK|TH_SYN)) ==
711 (TH_ACK|TH_SYN)) {
712 /*
713 * Received a SYN,ACK. This should
714 * never happen while we are in
715 * LISTEN. Send an RST.
716 */
717 goto badsyn;
718 } else if (tiflags & TH_ACK) {
719 so = syn_cache_get(&src.sa, &dst.sa,
720 th, iphlen, tlen, so, m);
721 if (so == NULL) {
722 /*
723 * We don't have a SYN for
724 * this ACK; send an RST.
725 */
726 goto badsyn;
727 } else if (so ==
728 (struct socket *)(-1)) {
729 /*
730 * We were unable to create
731 * the connection. If the
732 * 3-way handshake was
733 * completed, and RST has
734 * been sent to the peer.
735 * Since the mbuf might be
736 * in use for the reply,
737 * do not free it.
738 */
739 m = NULL;
740 } else {
741 /*
742 * We have created a
743 * full-blown connection.
744 */
745 tp = NULL;
746 inp = (struct inpcb *)so->so_pcb;
747 tp = intotcpcb(inp);
748 if (tp == NULL)
749 goto badsyn; /*XXX*/
750
751 /*
752 * Compute proper scaling
753 * value from buffer space
754 */
755 tcp_rscale(tp, so->so_rcv.sb_hiwat);
756 goto after_listen;
757 }
758 } else {
759 /*
760 * None of RST, SYN or ACK was set.
761 * This is an invalid packet for a
762 * TCB in LISTEN state. Send a RST.
763 */
764 goto badsyn;
765 }
766 } else {
767 /*
768 * Received a SYN.
769 */
770 #ifdef INET6
771 /*
772 * If deprecated address is forbidden, we do
773 * not accept SYN to deprecated interface
774 * address to prevent any new inbound
775 * connection from getting established.
776 * When we do not accept SYN, we send a TCP
777 * RST, with deprecated source address (instead
778 * of dropping it). We compromise it as it is
779 * much better for peer to send a RST, and
780 * RST will be the final packet for the
781 * exchange.
782 *
783 * If we do not forbid deprecated addresses, we
784 * accept the SYN packet. RFC2462 does not
785 * suggest dropping SYN in this case.
786 * If we decipher RFC2462 5.5.4, it says like
787 * this:
788 * 1. use of deprecated addr with existing
789 * communication is okay - "SHOULD continue
790 * to be used"
791 * 2. use of it with new communication:
792 * (2a) "SHOULD NOT be used if alternate
793 * address with sufficient scope is
794 * available"
795 * (2b) nothing mentioned otherwise.
796 * Here we fall into (2b) case as we have no
797 * choice in our source address selection - we
798 * must obey the peer.
799 *
800 * The wording in RFC2462 is confusing, and
801 * there are multiple description text for
802 * deprecated address handling - worse, they
803 * are not exactly the same. I believe 5.5.4
804 * is the best one, so we follow 5.5.4.
805 */
806 if (ip6 && !ip6_use_deprecated) {
807 struct in6_ifaddr *ia6;
808
809 if ((ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif,
810 &ip6->ip6_dst)) &&
811 (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
812 tp = NULL;
813 goto dropwithreset;
814 }
815 }
816 #endif
817
818 /*
819 * LISTEN socket received a SYN
820 * from itself? This can't possibly
821 * be valid; drop the packet.
822 */
823 if (th->th_dport == th->th_sport) {
824 switch (af) {
825 #ifdef INET6
826 case AF_INET6:
827 if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
828 &ip6->ip6_dst)) {
829 tcpstat.tcps_badsyn++;
830 goto drop;
831 }
832 break;
833 #endif /* INET6 */
834 case AF_INET:
835 if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
836 tcpstat.tcps_badsyn++;
837 goto drop;
838 }
839 break;
840 }
841 }
842
843 /*
844 * SYN looks ok; create compressed TCP
845 * state for it.
846 */
847 if (so->so_qlen <= so->so_qlimit &&
848 syn_cache_add(&src.sa, &dst.sa, th, iphlen,
849 so, m, optp, optlen, &opti, reuse))
850 m = NULL;
851 }
852 goto drop;
853 }
854 }
855
856 after_listen:
857 #ifdef DIAGNOSTIC
858 /*
859 * Should not happen now that all embryonic connections
860 * are handled with compressed state.
861 */
862 if (tp->t_state == TCPS_LISTEN)
863 panic("tcp_input: TCPS_LISTEN");
864 #endif
865
866 #ifdef IPSEC
867 /* Find most recent IPsec tag */
868 mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
869 s = splnet();
870 if (mtag != NULL) {
871 tdbi = (struct tdb_ident *)(mtag + 1);
872 tdb = gettdb(tdbi->spi, &tdbi->dst, tdbi->proto);
873 } else
874 tdb = NULL;
875 ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN,
876 tdb, inp);
877 if (error) {
878 splx(s);
879 goto drop;
880 }
881
882 /* Latch SA */
883 if (inp->inp_tdb_in != tdb) {
884 if (tdb) {
885 tdb_add_inp(tdb, inp, 1);
886 if (inp->inp_ipo == NULL) {
887 inp->inp_ipo = ipsec_add_policy(inp, af,
888 IPSP_DIRECTION_OUT);
889 if (inp->inp_ipo == NULL) {
890 splx(s);
891 goto drop;
892 }
893 }
894 if (inp->inp_ipo->ipo_dstid == NULL &&
895 tdb->tdb_srcid != NULL) {
896 inp->inp_ipo->ipo_dstid = tdb->tdb_srcid;
897 tdb->tdb_srcid->ref_count++;
898 }
899 if (inp->inp_ipsec_remotecred == NULL &&
900 tdb->tdb_remote_cred != NULL) {
901 inp->inp_ipsec_remotecred =
902 tdb->tdb_remote_cred;
903 tdb->tdb_remote_cred->ref_count++;
904 }
905 if (inp->inp_ipsec_remoteauth == NULL &&
906 tdb->tdb_remote_auth != NULL) {
907 inp->inp_ipsec_remoteauth =
908 tdb->tdb_remote_auth;
909 tdb->tdb_remote_auth->ref_count++;
910 }
911 } else { /* Just reset */
912 TAILQ_REMOVE(&inp->inp_tdb_in->tdb_inp_in, inp,
913 inp_tdb_in_next);
914 inp->inp_tdb_in = NULL;
915 }
916 }
917 splx(s);
918 #endif /* IPSEC */
919
920 /*
921 * Segment received on connection.
922 * Reset idle time and keep-alive timer.
923 */
924 tp->t_rcvtime = tcp_now;
925 if (TCPS_HAVEESTABLISHED(tp->t_state))
926 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
927
928 #ifdef TCP_SACK
929 if (tp->sack_enable)
930 tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
931 #endif /* TCP_SACK */
932
933 /*
934 * Process options.
935 */
936 #ifdef TCP_SIGNATURE
937 if (optp || (tp->t_flags & TF_SIGNATURE))
938 #else
939 if (optp)
940 #endif
941 if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti))
942 goto drop;
943
944 if (opti.ts_present && opti.ts_ecr) {
945 int rtt_test;
946
947 /* subtract out the tcp timestamp modulator */
948 opti.ts_ecr -= tp->ts_modulate;
949
950 /* make sure ts_ecr is sensible */
951 rtt_test = tcp_now - opti.ts_ecr;
952 if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
953 opti.ts_ecr = 0;
954 }
955
956 #ifdef TCP_ECN
957 /* if congestion experienced, set ECE bit in subsequent packets. */
958 if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
959 tp->t_flags |= TF_RCVD_CE;
960 tcpstat.tcps_ecn_rcvce++;
961 }
962 #endif
963 /*
964 * Header prediction: check for the two common cases
965 * of a uni-directional data xfer. If the packet has
966 * no control flags, is in-sequence, the window didn't
967 * change and we're not retransmitting, it's a
968 * candidate. If the length is zero and the ack moved
969 * forward, we're the sender side of the xfer. Just
970 * free the data acked & wake any higher level process
971 * that was blocked waiting for space. If the length
972 * is non-zero and the ack didn't move, we're the
973 * receiver side. If we're getting packets in-order
974 * (the reassembly queue is empty), add the data to
975 * the socket buffer and note that we need a delayed ack.
976 */
977 if (tp->t_state == TCPS_ESTABLISHED &&
978 #ifdef TCP_ECN
979 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
980 #else
981 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
982 #endif
983 (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
984 th->th_seq == tp->rcv_nxt &&
985 tiwin && tiwin == tp->snd_wnd &&
986 tp->snd_nxt == tp->snd_max) {
987
988 /*
989 * If last ACK falls within this segment's sequence numbers,
990 * record the timestamp.
991 * Fix from Braden, see Stevens p. 870
992 */
993 if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
994 tp->ts_recent_age = tcp_now;
995 tp->ts_recent = opti.ts_val;
996 }
997
998 if (tlen == 0) {
999 if (SEQ_GT(th->th_ack, tp->snd_una) &&
1000 SEQ_LEQ(th->th_ack, tp->snd_max) &&
1001 tp->snd_cwnd >= tp->snd_wnd &&
1002 tp->t_dupacks == 0) {
1003 /*
1004 * this is a pure ack for outstanding data.
1005 */
1006 ++tcpstat.tcps_predack;
1007 if (opti.ts_present && opti.ts_ecr)
1008 tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
1009 else if (tp->t_rtttime &&
1010 SEQ_GT(th->th_ack, tp->t_rtseq))
1011 tcp_xmit_timer(tp,
1012 tcp_now - tp->t_rtttime);
1013 acked = th->th_ack - tp->snd_una;
1014 tcpstat.tcps_rcvackpack++;
1015 tcpstat.tcps_rcvackbyte += acked;
1016 ND6_HINT(tp);
1017 sbdrop(&so->so_snd, acked);
1018
1019 /*
1020 * If we had a pending ICMP message that
1021 * referres to data that have just been
1022 * acknowledged, disregard the recorded ICMP
1023 * message.
1024 */
1025 if ((tp->t_flags & TF_PMTUD_PEND) &&
1026 SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
1027 tp->t_flags &= ~TF_PMTUD_PEND;
1028
1029 /*
1030 * Keep track of the largest chunk of data
1031 * acknowledged since last PMTU update
1032 */
1033 if (tp->t_pmtud_mss_acked < acked)
1034 tp->t_pmtud_mss_acked = acked;
1035
1036 tp->snd_una = th->th_ack;
1037 #if defined(TCP_SACK) || defined(TCP_ECN)
1038 /*
1039 * We want snd_last to track snd_una so
1040 * as to avoid sequence wraparound problems
1041 * for very large transfers.
1042 */
1043 #ifdef TCP_ECN
1044 if (SEQ_GT(tp->snd_una, tp->snd_last))
1045 #endif
1046 tp->snd_last = tp->snd_una;
1047 #endif /* TCP_SACK */
1048 #if defined(TCP_SACK) && defined(TCP_FACK)
1049 tp->snd_fack = tp->snd_una;
1050 tp->retran_data = 0;
1051 #endif /* TCP_FACK */
1052 m_freem(m);
1053
1054 /*
1055 * If all outstanding data are acked, stop
1056 * retransmit timer, otherwise restart timer
1057 * using current (possibly backed-off) value.
1058 * If process is waiting for space,
1059 * wakeup/selwakeup/signal. If data
1060 * are ready to send, let tcp_output
1061 * decide between more output or persist.
1062 */
1063 if (tp->snd_una == tp->snd_max)
1064 TCP_TIMER_DISARM(tp, TCPT_REXMT);
1065 else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
1066 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
1067
1068 if (sb_notify(&so->so_snd))
1069 sowwakeup(so);
1070 if (so->so_snd.sb_cc)
1071 (void) tcp_output(tp);
1072 return;
1073 }
1074 } else if (th->th_ack == tp->snd_una &&
1075 TAILQ_EMPTY(&tp->t_segq) &&
1076 tlen <= sbspace(&so->so_rcv)) {
1077 /*
1078 * This is a pure, in-sequence data packet
1079 * with nothing on the reassembly queue and
1080 * we have enough buffer space to take it.
1081 */
1082 #ifdef TCP_SACK
1083 /* Clean receiver SACK report if present */
1084 if (tp->sack_enable && tp->rcv_numsacks)
1085 tcp_clean_sackreport(tp);
1086 #endif /* TCP_SACK */
1087 ++tcpstat.tcps_preddat;
1088 tp->rcv_nxt += tlen;
1089 tcpstat.tcps_rcvpack++;
1090 tcpstat.tcps_rcvbyte += tlen;
1091 ND6_HINT(tp);
1092 /*
1093 * Drop TCP, IP headers and TCP options then add data
1094 * to socket buffer.
1095 */
1096 if (so->so_state & SS_CANTRCVMORE)
1097 m_freem(m);
1098 else {
1099 m_adj(m, iphlen + off);
1100 sbappendstream(&so->so_rcv, m);
1101 }
1102 sorwakeup(so);
1103 TCP_SETUP_ACK(tp, tiflags);
1104 if (tp->t_flags & TF_ACKNOW)
1105 (void) tcp_output(tp);
1106 return;
1107 }
1108 }
1109
1110 /*
1111 * Compute mbuf offset to TCP data segment.
1112 */
1113 hdroptlen = iphlen + off;
1114
1115 /*
1116 * Calculate amount of space in receive window,
1117 * and then do TCP input processing.
1118 * Receive window is amount of space in rcv queue,
1119 * but not less than advertised window.
1120 */
1121 { int win;
1122
1123 win = sbspace(&so->so_rcv);
1124 if (win < 0)
1125 win = 0;
1126 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
1127 }
1128
1129 switch (tp->t_state) {
1130
1131 /*
1132 * If the state is SYN_RECEIVED:
1133 * if seg contains SYN/ACK, send an RST.
1134 * if seg contains an ACK, but not for our SYN/ACK, send an RST
1135 */
1136
1137 case TCPS_SYN_RECEIVED:
1138 if (tiflags & TH_ACK) {
1139 if (tiflags & TH_SYN) {
1140 tcpstat.tcps_badsyn++;
1141 goto dropwithreset;
1142 }
1143 if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
1144 SEQ_GT(th->th_ack, tp->snd_max))
1145 goto dropwithreset;
1146 }
1147 break;
1148
1149 /*
1150 * If the state is SYN_SENT:
1151 * if seg contains an ACK, but not for our SYN, drop the input.
1152 * if seg contains a RST, then drop the connection.
1153 * if seg does not contain SYN, then drop it.
1154 * Otherwise this is an acceptable SYN segment
1155 * initialize tp->rcv_nxt and tp->irs
1156 * if seg contains ack then advance tp->snd_una
1157 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
1158 * arrange for segment to be acked (eventually)
1159 * continue processing rest of data/controls, beginning with URG
1160 */
1161 case TCPS_SYN_SENT:
1162 if ((tiflags & TH_ACK) &&
1163 (SEQ_LEQ(th->th_ack, tp->iss) ||
1164 SEQ_GT(th->th_ack, tp->snd_max)))
1165 goto dropwithreset;
1166 if (tiflags & TH_RST) {
1167 #ifdef TCP_ECN
1168 /* if ECN is enabled, fall back to non-ecn at rexmit */
1169 if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
1170 goto drop;
1171 #endif
1172 if (tiflags & TH_ACK)
1173 tp = tcp_drop(tp, ECONNREFUSED);
1174 goto drop;
1175 }
1176 if ((tiflags & TH_SYN) == 0)
1177 goto drop;
1178 if (tiflags & TH_ACK) {
1179 tp->snd_una = th->th_ack;
1180 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1181 tp->snd_nxt = tp->snd_una;
1182 }
1183 TCP_TIMER_DISARM(tp, TCPT_REXMT);
1184 tp->irs = th->th_seq;
1185 tcp_mss(tp, opti.maxseg);
1186 /* Reset initial window to 1 segment for retransmit */
1187 if (tp->t_rxtshift > 0)
1188 tp->snd_cwnd = tp->t_maxseg;
1189 tcp_rcvseqinit(tp);
1190 tp->t_flags |= TF_ACKNOW;
1191 #ifdef TCP_SACK
1192 /*
1193 * If we've sent a SACK_PERMITTED option, and the peer
1194 * also replied with one, then TF_SACK_PERMIT should have
1195 * been set in tcp_dooptions(). If it was not, disable SACKs.
1196 */
1197 if (tp->sack_enable)
1198 tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
1199 #endif
1200 #ifdef TCP_ECN
1201 /*
1202 * if ECE is set but CWR is not set for SYN-ACK, or
1203 * both ECE and CWR are set for simultaneous open,
1204 * peer is ECN capable.
1205 */
1206 if (tcp_do_ecn) {
1207 if ((tiflags & (TH_ACK|TH_ECE|TH_CWR))
1208 == (TH_ACK|TH_ECE) ||
1209 (tiflags & (TH_ACK|TH_ECE|TH_CWR))
1210 == (TH_ECE|TH_CWR)) {
1211 tp->t_flags |= TF_ECN_PERMIT;
1212 tiflags &= ~(TH_ECE|TH_CWR);
1213 tcpstat.tcps_ecn_accepts++;
1214 }
1215 }
1216 #endif
1217
1218 if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
1219 tcpstat.tcps_connects++;
1220 soisconnected(so);
1221 tp->t_state = TCPS_ESTABLISHED;
1222 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
1223 /* Do window scaling on this connection? */
1224 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1225 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1226 tp->snd_scale = tp->requested_s_scale;
1227 tp->rcv_scale = tp->request_r_scale;
1228 }
1229 tcp_reass_lock(tp);
1230 (void) tcp_reass(tp, (struct tcphdr *)0,
1231 (struct mbuf *)0, &tlen);
1232 tcp_reass_unlock(tp);
1233 /*
1234 * if we didn't have to retransmit the SYN,
1235 * use its rtt as our initial srtt & rtt var.
1236 */
1237 if (tp->t_rtttime)
1238 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
1239 /*
1240 * Since new data was acked (the SYN), open the
1241 * congestion window by one MSS. We do this
1242 * here, because we won't go through the normal
1243 * ACK processing below. And since this is the
1244 * start of the connection, we know we are in
1245 * the exponential phase of slow-start.
1246 */
1247 tp->snd_cwnd += tp->t_maxseg;
1248 } else
1249 tp->t_state = TCPS_SYN_RECEIVED;
1250
1251 #if 0
1252 trimthenstep6:
1253 #endif
1254 /*
1255 * Advance th->th_seq to correspond to first data byte.
1256 * If data, trim to stay within window,
1257 * dropping FIN if necessary.
1258 */
1259 th->th_seq++;
1260 if (tlen > tp->rcv_wnd) {
1261 todrop = tlen - tp->rcv_wnd;
1262 m_adj(m, -todrop);
1263 tlen = tp->rcv_wnd;
1264 tiflags &= ~TH_FIN;
1265 tcpstat.tcps_rcvpackafterwin++;
1266 tcpstat.tcps_rcvbyteafterwin += todrop;
1267 }
1268 tp->snd_wl1 = th->th_seq - 1;
1269 tp->rcv_up = th->th_seq;
1270 goto step6;
1271 /*
1272 * If a new connection request is received while in TIME_WAIT,
1273 * drop the old connection and start over if the
1274 * timestamp or the sequence numbers are above the previous
1275 * ones.
1276 */
1277 case TCPS_TIME_WAIT:
1278 if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
1279 ((opti.ts_present &&
1280 TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
1281 SEQ_GT(th->th_seq, tp->rcv_nxt))) {
1282 /*
1283 * Advance the iss by at least 32768, but
1284 * clear the msb in order to make sure
1285 * that SEQ_LT(snd_nxt, iss).
1286 */
1287 iss = tp->snd_nxt +
1288 ((arc4random() & 0x7fffffff) | 0x8000);
1289 reuse = &iss;
1290 tp = tcp_close(tp);
1291 goto findpcb;
1292 }
1293 }
1294
1295 /*
1296 * States other than LISTEN or SYN_SENT.
1297 * First check timestamp, if present.
1298 * Then check that at least some bytes of segment are within
1299 * receive window. If segment begins before rcv_nxt,
1300 * drop leading data (and SYN); if nothing left, just ack.
1301 *
1302 * RFC 1323 PAWS: If we have a timestamp reply on this segment
1303 * and it's less than opti.ts_recent, drop it.
1304 */
1305 if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
1306 TSTMP_LT(opti.ts_val, tp->ts_recent)) {
1307
1308 /* Check to see if ts_recent is over 24 days old. */
1309 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
1310 /*
1311 * Invalidate ts_recent. If this segment updates
1312 * ts_recent, the age will be reset later and ts_recent
1313 * will get a valid value. If it does not, setting
1314 * ts_recent to zero will at least satisfy the
1315 * requirement that zero be placed in the timestamp
1316 * echo reply when ts_recent isn't valid. The
1317 * age isn't reset until we get a valid ts_recent
1318 * because we don't want out-of-order segments to be
1319 * dropped when ts_recent is old.
1320 */
1321 tp->ts_recent = 0;
1322 } else {
1323 tcpstat.tcps_rcvduppack++;
1324 tcpstat.tcps_rcvdupbyte += tlen;
1325 tcpstat.tcps_pawsdrop++;
1326 goto dropafterack;
1327 }
1328 }
1329
1330 todrop = tp->rcv_nxt - th->th_seq;
1331 if (todrop > 0) {
1332 if (tiflags & TH_SYN) {
1333 tiflags &= ~TH_SYN;
1334 th->th_seq++;
1335 if (th->th_urp > 1)
1336 th->th_urp--;
1337 else
1338 tiflags &= ~TH_URG;
1339 todrop--;
1340 }
1341 if (todrop > tlen ||
1342 (todrop == tlen && (tiflags & TH_FIN) == 0)) {
1343 /*
1344 * Any valid FIN must be to the left of the
1345 * window. At this point, FIN must be a
1346 * duplicate or out-of-sequence, so drop it.
1347 */
1348 tiflags &= ~TH_FIN;
1349 /*
1350 * Send ACK to resynchronize, and drop any data,
1351 * but keep on processing for RST or ACK.
1352 */
1353 tp->t_flags |= TF_ACKNOW;
1354 tcpstat.tcps_rcvdupbyte += todrop = tlen;
1355 tcpstat.tcps_rcvduppack++;
1356 } else {
1357 tcpstat.tcps_rcvpartduppack++;
1358 tcpstat.tcps_rcvpartdupbyte += todrop;
1359 }
1360 hdroptlen += todrop; /* drop from head afterwards */
1361 th->th_seq += todrop;
1362 tlen -= todrop;
1363 if (th->th_urp > todrop)
1364 th->th_urp -= todrop;
1365 else {
1366 tiflags &= ~TH_URG;
1367 th->th_urp = 0;
1368 }
1369 }
1370
1371 /*
1372 * If new data are received on a connection after the
1373 * user processes are gone, then RST the other end.
1374 */
1375 if ((so->so_state & SS_NOFDREF) &&
1376 tp->t_state > TCPS_CLOSE_WAIT && tlen) {
1377 tp = tcp_close(tp);
1378 tcpstat.tcps_rcvafterclose++;
1379 goto dropwithreset;
1380 }
1381
1382 /*
1383 * If segment ends after window, drop trailing data
1384 * (and PUSH and FIN); if nothing left, just ACK.
1385 */
1386 todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
1387 if (todrop > 0) {
1388 tcpstat.tcps_rcvpackafterwin++;
1389 if (todrop >= tlen) {
1390 tcpstat.tcps_rcvbyteafterwin += tlen;
1391 /*
1392 * If window is closed can only take segments at
1393 * window edge, and have to drop data and PUSH from
1394 * incoming segments. Continue processing, but
1395 * remember to ack. Otherwise, drop segment
1396 * and ack.
1397 */
1398 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
1399 tp->t_flags |= TF_ACKNOW;
1400 tcpstat.tcps_rcvwinprobe++;
1401 } else
1402 goto dropafterack;
1403 } else
1404 tcpstat.tcps_rcvbyteafterwin += todrop;
1405 m_adj(m, -todrop);
1406 tlen -= todrop;
1407 tiflags &= ~(TH_PUSH|TH_FIN);
1408 }
1409
1410 /*
1411 * If last ACK falls within this segment's sequence numbers,
1412 * record its timestamp if it's more recent.
1413 * Cf fix from Braden, see Stevens p. 870
1414 */
1415 if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
1416 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
1417 if (SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
1418 ((tiflags & (TH_SYN|TH_FIN)) != 0)))
1419 tp->ts_recent = opti.ts_val;
1420 else
1421 tp->ts_recent = 0;
1422 tp->ts_recent_age = tcp_now;
1423 }
1424
1425 /*
1426 * If the RST bit is set examine the state:
1427 * SYN_RECEIVED STATE:
1428 * If passive open, return to LISTEN state.
1429 * If active open, inform user that connection was refused.
1430 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
1431 * Inform user that connection was reset, and close tcb.
1432 * CLOSING, LAST_ACK, TIME_WAIT STATES
1433 * Close the tcb.
1434 */
1435 if (tiflags & TH_RST) {
1436 if (th->th_seq != tp->last_ack_sent &&
1437 th->th_seq != tp->rcv_nxt &&
1438 th->th_seq != (tp->rcv_nxt + 1))
1439 goto drop;
1440
1441 switch (tp->t_state) {
1442 case TCPS_SYN_RECEIVED:
1443 #ifdef TCP_ECN
1444 /* if ECN is enabled, fall back to non-ecn at rexmit */
1445 if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
1446 goto drop;
1447 #endif
1448 so->so_error = ECONNREFUSED;
1449 goto close;
1450
1451 case TCPS_ESTABLISHED:
1452 case TCPS_FIN_WAIT_1:
1453 case TCPS_FIN_WAIT_2:
1454 case TCPS_CLOSE_WAIT:
1455 so->so_error = ECONNRESET;
1456 close:
1457 tp->t_state = TCPS_CLOSED;
1458 tcpstat.tcps_drops++;
1459 tp = tcp_close(tp);
1460 goto drop;
1461 case TCPS_CLOSING:
1462 case TCPS_LAST_ACK:
1463 case TCPS_TIME_WAIT:
1464 tp = tcp_close(tp);
1465 goto drop;
1466 }
1467 }
1468
1469 /*
1470 * If a SYN is in the window, then this is an
1471 * error and we ACK and drop the packet.
1472 */
1473 if (tiflags & TH_SYN)
1474 goto dropafterack_ratelim;
1475
1476 /*
1477 * If the ACK bit is off we drop the segment and return.
1478 */
1479 if ((tiflags & TH_ACK) == 0) {
1480 if (tp->t_flags & TF_ACKNOW)
1481 goto dropafterack;
1482 else
1483 goto drop;
1484 }
1485
1486 /*
1487 * Ack processing.
1488 */
1489 switch (tp->t_state) {
1490
1491 /*
1492 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
1493 * ESTABLISHED state and continue processing.
1494 * The ACK was checked above.
1495 */
1496 case TCPS_SYN_RECEIVED:
1497 tcpstat.tcps_connects++;
1498 soisconnected(so);
1499 tp->t_state = TCPS_ESTABLISHED;
1500 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
1501 /* Do window scaling? */
1502 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1503 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1504 tp->snd_scale = tp->requested_s_scale;
1505 tp->rcv_scale = tp->request_r_scale;
1506 }
1507 tcp_reass_lock(tp);
1508 (void) tcp_reass(tp, (struct tcphdr *)0, (struct mbuf *)0,
1509 &tlen);
1510 tcp_reass_unlock(tp);
1511 tp->snd_wl1 = th->th_seq - 1;
1512 /* fall into ... */
1513
1514 /*
1515 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
1516 * ACKs. If the ack is in the range
1517 * tp->snd_una < th->th_ack <= tp->snd_max
1518 * then advance tp->snd_una to th->th_ack and drop
1519 * data from the retransmission queue. If this ACK reflects
1520 * more up to date window information we update our window information.
1521 */
1522 case TCPS_ESTABLISHED:
1523 case TCPS_FIN_WAIT_1:
1524 case TCPS_FIN_WAIT_2:
1525 case TCPS_CLOSE_WAIT:
1526 case TCPS_CLOSING:
1527 case TCPS_LAST_ACK:
1528 case TCPS_TIME_WAIT:
1529 #ifdef TCP_ECN
1530 /*
1531 * if we receive ECE and are not already in recovery phase,
1532 * reduce cwnd by half but don't slow-start.
1533 * advance snd_last to snd_max not to reduce cwnd again
1534 * until all outstanding packets are acked.
1535 */
1536 if (tcp_do_ecn && (tiflags & TH_ECE)) {
1537 if ((tp->t_flags & TF_ECN_PERMIT) &&
1538 SEQ_GEQ(tp->snd_una, tp->snd_last)) {
1539 u_int win;
1540
1541 win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
1542 if (win > 1) {
1543 tp->snd_ssthresh = win / 2 * tp->t_maxseg;
1544 tp->snd_cwnd = tp->snd_ssthresh;
1545 tp->snd_last = tp->snd_max;
1546 tp->t_flags |= TF_SEND_CWR;
1547 tcpstat.tcps_cwr_ecn++;
1548 }
1549 }
1550 tcpstat.tcps_ecn_rcvece++;
1551 }
1552 /*
1553 * if we receive CWR, we know that the peer has reduced
1554 * its congestion window. stop sending ecn-echo.
1555 */
1556 if ((tiflags & TH_CWR)) {
1557 tp->t_flags &= ~TF_RCVD_CE;
1558 tcpstat.tcps_ecn_rcvcwr++;
1559 }
1560 #endif /* TCP_ECN */
1561
1562 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
1563 /*
1564 * Duplicate/old ACK processing.
1565 * Increments t_dupacks:
1566 * Pure duplicate (same seq/ack/window, no data)
1567 * Doesn't affect t_dupacks:
1568 * Data packets.
1569 * Normal window updates (window opens)
1570 * Resets t_dupacks:
1571 * New data ACKed.
1572 * Window shrinks
1573 * Old ACK
1574 */
1575 if (tlen) {
1576 /* Drop very old ACKs unless th_seq matches */
1577 if (th->th_seq != tp->rcv_nxt &&
1578 SEQ_LT(th->th_ack,
1579 tp->snd_una - tp->max_sndwnd)) {
1580 tcpstat.tcps_rcvacktooold++;
1581 goto drop;
1582 }
1583 break;
1584 }
1585 /*
1586 * If we get an old ACK, there is probably packet
1587 * reordering going on. Be conservative and reset
1588 * t_dupacks so that we are less aggressive in
1589 * doing a fast retransmit.
1590 */
1591 if (th->th_ack != tp->snd_una) {
1592 tp->t_dupacks = 0;
1593 break;
1594 }
1595 if (tiwin == tp->snd_wnd) {
1596 tcpstat.tcps_rcvdupack++;
1597 /*
1598 * If we have outstanding data (other than
1599 * a window probe), this is a completely
1600 * duplicate ack (ie, window info didn't
1601 * change), the ack is the biggest we've
1602 * seen and we've seen exactly our rexmt
1603 * threshold of them, assume a packet
1604 * has been dropped and retransmit it.
1605 * Kludge snd_nxt & the congestion
1606 * window so we send only this one
1607 * packet.
1608 *
1609 * We know we're losing at the current
1610 * window size so do congestion avoidance
1611 * (set ssthresh to half the current window
1612 * and pull our congestion window back to
1613 * the new ssthresh).
1614 *
1615 * Dup acks mean that packets have left the
1616 * network (they're now cached at the receiver)
1617 * so bump cwnd by the amount in the receiver
1618 * to keep a constant cwnd packets in the
1619 * network.
1620 */
1621 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
1622 tp->t_dupacks = 0;
1623 #if defined(TCP_SACK) && defined(TCP_FACK)
1624 /*
1625 * In FACK, can enter fast rec. if the receiver
1626 * reports a reass. queue longer than 3 segs.
1627 */
1628 else if (++tp->t_dupacks == tcprexmtthresh ||
1629 ((SEQ_GT(tp->snd_fack, tcprexmtthresh *
1630 tp->t_maxseg + tp->snd_una)) &&
1631 SEQ_GT(tp->snd_una, tp->snd_last))) {
1632 #else
1633 else if (++tp->t_dupacks == tcprexmtthresh) {
1634 #endif /* TCP_FACK */
1635 tcp_seq onxt = tp->snd_nxt;
1636 u_long win =
1637 ulmin(tp->snd_wnd, tp->snd_cwnd) /
1638 2 / tp->t_maxseg;
1639
1640 #if defined(TCP_SACK) || defined(TCP_ECN)
1641 if (SEQ_LT(th->th_ack, tp->snd_last)){
1642 /*
1643 * False fast retx after
1644 * timeout. Do not cut window.
1645 */
1646 tp->t_dupacks = 0;
1647 goto drop;
1648 }
1649 #endif
1650 if (win < 2)
1651 win = 2;
1652 tp->snd_ssthresh = win * tp->t_maxseg;
1653 #if defined(TCP_SACK)
1654 tp->snd_last = tp->snd_max;
1655 #endif
1656 #ifdef TCP_SACK
1657 if (tp->sack_enable) {
1658 TCP_TIMER_DISARM(tp, TCPT_REXMT);
1659 tp->t_rtttime = 0;
1660 #ifdef TCP_ECN
1661 tp->t_flags |= TF_SEND_CWR;
1662 #endif
1663 #if 1 /* TCP_ECN */
1664 tcpstat.tcps_cwr_frecovery++;
1665 #endif
1666 tcpstat.tcps_sack_recovery_episode++;
1667 #if defined(TCP_SACK) && defined(TCP_FACK)
1668 tp->t_dupacks = tcprexmtthresh;
1669 (void) tcp_output(tp);
1670 /*
1671 * During FR, snd_cwnd is held
1672 * constant for FACK.
1673 */
1674 tp->snd_cwnd = tp->snd_ssthresh;
1675 #else
1676 /*
1677 * tcp_output() will send
1678 * oldest SACK-eligible rtx.
1679 */
1680 (void) tcp_output(tp);
1681 tp->snd_cwnd = tp->snd_ssthresh+
1682 tp->t_maxseg * tp->t_dupacks;
1683 #endif /* TCP_FACK */
1684 goto drop;
1685 }
1686 #endif /* TCP_SACK */
1687 TCP_TIMER_DISARM(tp, TCPT_REXMT);
1688 tp->t_rtttime = 0;
1689 tp->snd_nxt = th->th_ack;
1690 tp->snd_cwnd = tp->t_maxseg;
1691 #ifdef TCP_ECN
1692 tp->t_flags |= TF_SEND_CWR;
1693 #endif
1694 #if 1 /* TCP_ECN */
1695 tcpstat.tcps_cwr_frecovery++;
1696 #endif
1697 tcpstat.tcps_sndrexmitfast++;
1698 (void) tcp_output(tp);
1699
1700 tp->snd_cwnd = tp->snd_ssthresh +
1701 tp->t_maxseg * tp->t_dupacks;
1702 if (SEQ_GT(onxt, tp->snd_nxt))
1703 tp->snd_nxt = onxt;
1704 goto drop;
1705 } else if (tp->t_dupacks > tcprexmtthresh) {
1706 #if defined(TCP_SACK) && defined(TCP_FACK)
1707 /*
1708 * while (awnd < cwnd)
1709 * sendsomething();
1710 */
1711 if (tp->sack_enable) {
1712 if (tp->snd_awnd < tp->snd_cwnd)
1713 tcp_output(tp);
1714 goto drop;
1715 }
1716 #endif /* TCP_FACK */
1717 tp->snd_cwnd += tp->t_maxseg;
1718 (void) tcp_output(tp);
1719 goto drop;
1720 }
1721 } else if (tiwin < tp->snd_wnd) {
1722 /*
1723 * The window was retracted! Previous dup
1724 * ACKs may have been due to packets arriving
1725 * after the shrunken window, not a missing
1726 * packet, so play it safe and reset t_dupacks
1727 */
1728 tp->t_dupacks = 0;
1729 }
1730 break;
1731 }
1732 /*
1733 * If the congestion window was inflated to account
1734 * for the other side's cached packets, retract it.
1735 */
1736 #if defined(TCP_SACK)
1737 if (tp->sack_enable) {
1738 if (tp->t_dupacks >= tcprexmtthresh) {
1739 /* Check for a partial ACK */
1740 if (tcp_sack_partialack(tp, th)) {
1741 #if defined(TCP_SACK) && defined(TCP_FACK)
1742 /* Force call to tcp_output */
1743 if (tp->snd_awnd < tp->snd_cwnd)
1744 needoutput = 1;
1745 #else
1746 tp->snd_cwnd += tp->t_maxseg;
1747 needoutput = 1;
1748 #endif /* TCP_FACK */
1749 } else {
1750 /* Out of fast recovery */
1751 tp->snd_cwnd = tp->snd_ssthresh;
1752 if (tcp_seq_subtract(tp->snd_max,
1753 th->th_ack) < tp->snd_ssthresh)
1754 tp->snd_cwnd =
1755 tcp_seq_subtract(tp->snd_max,
1756 th->th_ack);
1757 tp->t_dupacks = 0;
1758 #if defined(TCP_SACK) && defined(TCP_FACK)
1759 if (SEQ_GT(th->th_ack, tp->snd_fack))
1760 tp->snd_fack = th->th_ack;
1761 #endif /* TCP_FACK */
1762 }
1763 }
1764 } else {
1765 if (tp->t_dupacks >= tcprexmtthresh &&
1766 !tcp_newreno(tp, th)) {
1767 /* Out of fast recovery */
1768 tp->snd_cwnd = tp->snd_ssthresh;
1769 if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
1770 tp->snd_ssthresh)
1771 tp->snd_cwnd =
1772 tcp_seq_subtract(tp->snd_max,
1773 th->th_ack);
1774 tp->t_dupacks = 0;
1775 }
1776 }
1777 if (tp->t_dupacks < tcprexmtthresh)
1778 tp->t_dupacks = 0;
1779 #else /* else no TCP_SACK */
1780 if (tp->t_dupacks >= tcprexmtthresh &&
1781 tp->snd_cwnd > tp->snd_ssthresh)
1782 tp->snd_cwnd = tp->snd_ssthresh;
1783 tp->t_dupacks = 0;
1784 #endif
1785 if (SEQ_GT(th->th_ack, tp->snd_max)) {
1786 tcpstat.tcps_rcvacktoomuch++;
1787 goto dropafterack_ratelim;
1788 }
1789 acked = th->th_ack - tp->snd_una;
1790 tcpstat.tcps_rcvackpack++;
1791 tcpstat.tcps_rcvackbyte += acked;
1792
1793 /*
1794 * If we have a timestamp reply, update smoothed
1795 * round trip time. If no timestamp is present but
1796 * transmit timer is running and timed sequence
1797 * number was acked, update smoothed round trip time.
1798 * Since we now have an rtt measurement, cancel the
1799 * timer backoff (cf., Phil Karn's retransmit alg.).
1800 * Recompute the initial retransmit timer.
1801 */
1802 if (opti.ts_present && opti.ts_ecr)
1803 tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
1804 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
1805 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
1806
1807 /*
1808 * If all outstanding data is acked, stop retransmit
1809 * timer and remember to restart (more output or persist).
1810 * If there is more data to be acked, restart retransmit
1811 * timer, using current (possibly backed-off) value.
1812 */
1813 if (th->th_ack == tp->snd_max) {
1814 TCP_TIMER_DISARM(tp, TCPT_REXMT);
1815 needoutput = 1;
1816 } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
1817 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
1818 /*
1819 * When new data is acked, open the congestion window.
1820 * If the window gives us less than ssthresh packets
1821 * in flight, open exponentially (maxseg per packet).
1822 * Otherwise open linearly: maxseg per window
1823 * (maxseg^2 / cwnd per packet).
1824 */
1825 {
1826 u_int cw = tp->snd_cwnd;
1827 u_int incr = tp->t_maxseg;
1828
1829 if (cw > tp->snd_ssthresh)
1830 incr = incr * incr / cw;
1831 #if defined (TCP_SACK)
1832 if (tp->t_dupacks < tcprexmtthresh)
1833 #endif
1834 tp->snd_cwnd = ulmin(cw + incr, TCP_MAXWIN<<tp->snd_scale);
1835 }
1836 ND6_HINT(tp);
1837 if (acked > so->so_snd.sb_cc) {
1838 tp->snd_wnd -= so->so_snd.sb_cc;
1839 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
1840 ourfinisacked = 1;
1841 } else {
1842 sbdrop(&so->so_snd, acked);
1843 tp->snd_wnd -= acked;
1844 ourfinisacked = 0;
1845 }
1846 if (sb_notify(&so->so_snd))
1847 sowwakeup(so);
1848
1849 /*
1850 * If we had a pending ICMP message that referred to data
1851 * that have just been acknowledged, disregard the recorded
1852 * ICMP message.
1853 */
1854 if ((tp->t_flags & TF_PMTUD_PEND) &&
1855 SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
1856 tp->t_flags &= ~TF_PMTUD_PEND;
1857
1858 /*
1859 * Keep track of the largest chunk of data acknowledged
1860 * since last PMTU update
1861 */
1862 if (tp->t_pmtud_mss_acked < acked)
1863 tp->t_pmtud_mss_acked = acked;
1864
1865 tp->snd_una = th->th_ack;
1866 #ifdef TCP_ECN
1867 /* sync snd_last with snd_una */
1868 if (SEQ_GT(tp->snd_una, tp->snd_last))
1869 tp->snd_last = tp->snd_una;
1870 #endif
1871 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1872 tp->snd_nxt = tp->snd_una;
1873 #if defined (TCP_SACK) && defined (TCP_FACK)
1874 if (SEQ_GT(tp->snd_una, tp->snd_fack)) {
1875 tp->snd_fack = tp->snd_una;
1876 /* Update snd_awnd for partial ACK
1877 * without any SACK blocks.
1878 */
1879 tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt,
1880 tp->snd_fack) + tp->retran_data;
1881 }
1882 #endif
1883
1884 switch (tp->t_state) {
1885
1886 /*
1887 * In FIN_WAIT_1 STATE in addition to the processing
1888 * for the ESTABLISHED state if our FIN is now acknowledged
1889 * then enter FIN_WAIT_2.
1890 */
1891 case TCPS_FIN_WAIT_1:
1892 if (ourfinisacked) {
1893 /*
1894 * If we can't receive any more
1895 * data, then closing user can proceed.
1896 * Starting the timer is contrary to the
1897 * specification, but if we don't get a FIN
1898 * we'll hang forever.
1899 */
1900 if (so->so_state & SS_CANTRCVMORE) {
1901 soisdisconnected(so);
1902 TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
1903 }
1904 tp->t_state = TCPS_FIN_WAIT_2;
1905 }
1906 break;
1907
1908 /*
1909 * In CLOSING STATE in addition to the processing for
1910 * the ESTABLISHED state if the ACK acknowledges our FIN
1911 * then enter the TIME-WAIT state, otherwise ignore
1912 * the segment.
1913 */
1914 case TCPS_CLOSING:
1915 if (ourfinisacked) {
1916 tp->t_state = TCPS_TIME_WAIT;
1917 tcp_canceltimers(tp);
1918 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
1919 soisdisconnected(so);
1920 }
1921 break;
1922
1923 /*
1924 * In LAST_ACK, we may still be waiting for data to drain
1925 * and/or to be acked, as well as for the ack of our FIN.
1926 * If our FIN is now acknowledged, delete the TCB,
1927 * enter the closed state and return.
1928 */
1929 case TCPS_LAST_ACK:
1930 if (ourfinisacked) {
1931 tp = tcp_close(tp);
1932 goto drop;
1933 }
1934 break;
1935
1936 /*
1937 * In TIME_WAIT state the only thing that should arrive
1938 * is a retransmission of the remote FIN. Acknowledge
1939 * it and restart the finack timer.
1940 */
1941 case TCPS_TIME_WAIT:
1942 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
1943 goto dropafterack;
1944 }
1945 }
1946
1947 step6:
1948 /*
1949 * Update window information.
1950 * Don't look at window if no ACK: TAC's send garbage on first SYN.
1951 */
1952 if ((tiflags & TH_ACK) &&
1953 (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
1954 (SEQ_LT(tp->snd_wl2, th->th_ack) ||
1955 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
1956 /* keep track of pure window updates */
1957 if (tlen == 0 &&
1958 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
1959 tcpstat.tcps_rcvwinupd++;
1960 tp->snd_wnd = tiwin;
1961 tp->snd_wl1 = th->th_seq;
1962 tp->snd_wl2 = th->th_ack;
1963 if (tp->snd_wnd > tp->max_sndwnd)
1964 tp->max_sndwnd = tp->snd_wnd;
1965 needoutput = 1;
1966 }
1967
1968 /*
1969 * Process segments with URG.
1970 */
1971 if ((tiflags & TH_URG) && th->th_urp &&
1972 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1973 /*
1974 * This is a kludge, but if we receive and accept
1975 * random urgent pointers, we'll crash in
1976 * soreceive. It's hard to imagine someone
1977 * actually wanting to send this much urgent data.
1978 */
1979 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
1980 th->th_urp = 0; /* XXX */
1981 tiflags &= ~TH_URG; /* XXX */
1982 goto dodata; /* XXX */
1983 }
1984 /*
1985 * If this segment advances the known urgent pointer,
1986 * then mark the data stream. This should not happen
1987 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
1988 * a FIN has been received from the remote side.
1989 * In these states we ignore the URG.
1990 *
1991 * According to RFC961 (Assigned Protocols),
1992 * the urgent pointer points to the last octet
1993 * of urgent data. We continue, however,
1994 * to consider it to indicate the first octet
1995 * of data past the urgent section as the original
1996 * spec states (in one of two places).
1997 */
1998 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
1999 tp->rcv_up = th->th_seq + th->th_urp;
2000 so->so_oobmark = so->so_rcv.sb_cc +
2001 (tp->rcv_up - tp->rcv_nxt) - 1;
2002 if (so->so_oobmark == 0)
2003 so->so_state |= SS_RCVATMARK;
2004 sohasoutofband(so);
2005 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
2006 }
2007 /*
2008 * Remove out of band data so doesn't get presented to user.
2009 * This can happen independent of advancing the URG pointer,
2010 * but if two URG's are pending at once, some out-of-band
2011 * data may creep in... ick.
2012 */
2013 if (th->th_urp <= (u_int16_t) tlen
2014 #ifdef SO_OOBINLINE
2015 && (so->so_options & SO_OOBINLINE) == 0
2016 #endif
2017 )
2018 tcp_pulloutofband(so, th->th_urp, m, hdroptlen);
2019 } else
2020 /*
2021 * If no out of band data is expected,
2022 * pull receive urgent pointer along
2023 * with the receive window.
2024 */
2025 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
2026 tp->rcv_up = tp->rcv_nxt;
2027 dodata: /* XXX */
2028
2029 /*
2030 * Process the segment text, merging it into the TCP sequencing queue,
2031 * and arranging for acknowledgment of receipt if necessary.
2032 * This process logically involves adjusting tp->rcv_wnd as data
2033 * is presented to the user (this happens in tcp_usrreq.c,
2034 * case PRU_RCVD). If a FIN has already been received on this
2035 * connection then we just ignore the text.
2036 */
2037 if ((tlen || (tiflags & TH_FIN)) &&
2038 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2039 #ifdef TCP_SACK
2040 tcp_seq laststart = th->th_seq;
2041 tcp_seq lastend = th->th_seq + tlen;
2042 #endif
2043 tcp_reass_lock(tp);
2044 if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) &&
2045 tp->t_state == TCPS_ESTABLISHED) {
2046 tcp_reass_unlock(tp);
2047 TCP_SETUP_ACK(tp, tiflags);
2048 tp->rcv_nxt += tlen;
2049 tiflags = th->th_flags & TH_FIN;
2050 tcpstat.tcps_rcvpack++;
2051 tcpstat.tcps_rcvbyte += tlen;
2052 ND6_HINT(tp);
2053 if (so->so_state & SS_CANTRCVMORE)
2054 m_freem(m);
2055 else {
2056 m_adj(m, hdroptlen);
2057 sbappendstream(&so->so_rcv, m);
2058 }
2059 sorwakeup(so);
2060 } else {
2061 m_adj(m, hdroptlen);
2062 tiflags = tcp_reass(tp, th, m, &tlen);
2063 tcp_reass_unlock(tp);
2064 tp->t_flags |= TF_ACKNOW;
2065 }
2066 #ifdef TCP_SACK
2067 if (tp->sack_enable)
2068 tcp_update_sack_list(tp, laststart, lastend);
2069 #endif
2070
2071 /*
2072 * variable len never referenced again in modern BSD,
2073 * so why bother computing it ??
2074 */
2075 #if 0
2076 /*
2077 * Note the amount of data that peer has sent into
2078 * our window, in order to estimate the sender's
2079 * buffer size.
2080 */
2081 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
2082 #endif /* 0 */
2083 } else {
2084 m_freem(m);
2085 tiflags &= ~TH_FIN;
2086 }
2087
2088 /*
2089 * If FIN is received ACK the FIN and let the user know
2090 * that the connection is closing. Ignore a FIN received before
2091 * the connection is fully established.
2092 */
2093 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) {
2094 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2095 socantrcvmore(so);
2096 tp->t_flags |= TF_ACKNOW;
2097 tp->rcv_nxt++;
2098 }
2099 switch (tp->t_state) {
2100
2101 /*
2102 * In ESTABLISHED STATE enter the CLOSE_WAIT state.
2103 */
2104 case TCPS_ESTABLISHED:
2105 tp->t_state = TCPS_CLOSE_WAIT;
2106 break;
2107
2108 /*
2109 * If still in FIN_WAIT_1 STATE FIN has not been acked so
2110 * enter the CLOSING state.
2111 */
2112 case TCPS_FIN_WAIT_1:
2113 tp->t_state = TCPS_CLOSING;
2114 break;
2115
2116 /*
2117 * In FIN_WAIT_2 state enter the TIME_WAIT state,
2118 * starting the time-wait timer, turning off the other
2119 * standard timers.
2120 */
2121 case TCPS_FIN_WAIT_2:
2122 tp->t_state = TCPS_TIME_WAIT;
2123 tcp_canceltimers(tp);
2124 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
2125 soisdisconnected(so);
2126 break;
2127
2128 /*
2129 * In TIME_WAIT state restart the 2 MSL time_wait timer.
2130 */
2131 case TCPS_TIME_WAIT:
2132 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
2133 break;
2134 }
2135 }
2136 if (so->so_options & SO_DEBUG) {
2137 switch (tp->pf) {
2138 #ifdef INET6
2139 case PF_INET6:
2140 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6,
2141 0, tlen);
2142 break;
2143 #endif /* INET6 */
2144 case PF_INET:
2145 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti,
2146 0, tlen);
2147 break;
2148 }
2149 }
2150
2151 /*
2152 * Return any desired output.
2153 */
2154 if (needoutput || (tp->t_flags & TF_ACKNOW)) {
2155 (void) tcp_output(tp);
2156 }
2157 return;
2158
2159 badsyn:
2160 /*
2161 * Received a bad SYN. Increment counters and dropwithreset.
2162 */
2163 tcpstat.tcps_badsyn++;
2164 tp = NULL;
2165 goto dropwithreset;
2166
2167 dropafterack_ratelim:
2168 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count,
2169 tcp_ackdrop_ppslim) == 0) {
2170 /* XXX stat */
2171 goto drop;
2172 }
2173 /* ...fall into dropafterack... */
2174
2175 dropafterack:
2176 /*
2177 * Generate an ACK dropping incoming segment if it occupies
2178 * sequence space, where the ACK reflects our state.
2179 */
2180 if (tiflags & TH_RST)
2181 goto drop;
2182 m_freem(m);
2183 tp->t_flags |= TF_ACKNOW;
2184 (void) tcp_output(tp);
2185 return;
2186
2187 dropwithreset_ratelim:
2188 /*
2189 * We may want to rate-limit RSTs in certain situations,
2190 * particularly if we are sending an RST in response to
2191 * an attempt to connect to or otherwise communicate with
2192 * a port for which we have no socket.
2193 */
2194 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count,
2195 tcp_rst_ppslim) == 0) {
2196 /* XXX stat */
2197 goto drop;
2198 }
2199 /* ...fall into dropwithreset... */
2200
2201 dropwithreset:
2202 /*
2203 * Generate a RST, dropping incoming segment.
2204 * Make ACK acceptable to originator of segment.
2205 * Don't bother to respond to RST.
2206 */
2207 if (tiflags & TH_RST)
2208 goto drop;
2209 if (tiflags & TH_ACK) {
2210 tcp_respond(tp, mtod(m, caddr_t), m, (tcp_seq)0, th->th_ack,
2211 TH_RST);
2212 } else {
2213 if (tiflags & TH_SYN)
2214 tlen++;
2215 tcp_respond(tp, mtod(m, caddr_t), m, th->th_seq + tlen,
2216 (tcp_seq)0, TH_RST|TH_ACK);
2217 }
2218 return;
2219
2220 drop:
2221 /*
2222 * Drop space held by incoming segment and return.
2223 */
2224 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) {
2225 switch (tp->pf) {
2226 #ifdef INET6
2227 case PF_INET6:
2228 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6,
2229 0, tlen);
2230 break;
2231 #endif /* INET6 */
2232 case PF_INET:
2233 tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti,
2234 0, tlen);
2235 break;
2236 }
2237 }
2238
2239 m_freem(m);
2240 return;
2241 }
2242
/*
 * Parse the options in a TCP header.  Negotiation options (MSS, window
 * scale, timestamp, SACK-permitted) are honored only on SYN segments
 * arriving before a SYN has been received on this connection; parsed
 * values are returned through 'oi' and recorded in the tcpcb.
 * Returns 0 on success, or -1 when the segment must be dropped
 * (duplicate or failed TCP-MD5 signature).
 */
int
tcp_dooptions(tp, cp, cnt, th, m, iphlen, oi)
	struct tcpcb *tp;
	u_char *cp;			/* start of the TCP options area */
	int cnt;			/* number of option bytes */
	struct tcphdr *th;
	struct mbuf *m;
	int iphlen;
	struct tcp_opt_info *oi;	/* out: parsed MSS/timestamp values */
{
	u_int16_t mss = 0;
	int opt, optlen;
#ifdef TCP_SIGNATURE
	caddr_t sigp = NULL;		/* signature option data, if seen */
	struct tdb *tdb = NULL;		/* SA used to verify the signature */
#endif /* TCP_SIGNATURE */

	for (; cp && cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[0];
		if (opt == TCPOPT_EOL)
			break;
		if (opt == TCPOPT_NOP)
			optlen = 1;
		else {
			if (cnt < 2)
				break;
			optlen = cp[1];
			/* Malformed option length: stop parsing entirely. */
			if (optlen < 2 || optlen > cnt)
				break;
		}
		switch (opt) {

		default:
			continue;

		case TCPOPT_MAXSEG:
			if (optlen != TCPOLEN_MAXSEG)
				continue;
			/* MSS is only meaningful on a SYN. */
			if (!(th->th_flags & TH_SYN))
				continue;
			if (TCPS_HAVERCVDSYN(tp->t_state))
				continue;
			/* bcopy because option data may be unaligned. */
			bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
			NTOHS(mss);
			oi->maxseg = mss;
			break;

		case TCPOPT_WINDOW:
			if (optlen != TCPOLEN_WINDOW)
				continue;
			if (!(th->th_flags & TH_SYN))
				continue;
			if (TCPS_HAVERCVDSYN(tp->t_state))
				continue;
			tp->t_flags |= TF_RCVD_SCALE;
			/* Clamp peer's shift to what we can represent. */
			tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
			break;

		case TCPOPT_TIMESTAMP:
			if (optlen != TCPOLEN_TIMESTAMP)
				continue;
			/* Timestamps are reported for every segment... */
			oi->ts_present = 1;
			bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val));
			NTOHL(oi->ts_val);
			bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr));
			NTOHL(oi->ts_ecr);

			/* ...but negotiation state changes only on SYN. */
			if (!(th->th_flags & TH_SYN))
				continue;
			if (TCPS_HAVERCVDSYN(tp->t_state))
				continue;
			/*
			 * A timestamp received in a SYN makes
			 * it ok to send timestamp requests and replies.
			 */
			tp->t_flags |= TF_RCVD_TSTMP;
			tp->ts_recent = oi->ts_val;
			tp->ts_recent_age = tcp_now;
			break;

#ifdef TCP_SACK
		case TCPOPT_SACK_PERMITTED:
			if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED)
				continue;
			if (!(th->th_flags & TH_SYN))
				continue;
			if (TCPS_HAVERCVDSYN(tp->t_state))
				continue;
			/* MUST only be set on SYN */
			tp->t_flags |= TF_SACK_PERMIT;
			break;
		case TCPOPT_SACK:
			tcp_sack_option(tp, th, cp, optlen);
			break;
#endif
#ifdef TCP_SIGNATURE
		case TCPOPT_SIGNATURE:
			if (optlen != TCPOLEN_SIGNATURE)
				continue;

			/* Two differing signature options: drop segment. */
			if (sigp && bcmp(sigp, cp + 2, 16))
				return (-1);

			sigp = cp + 2;
			break;
#endif /* TCP_SIGNATURE */
		}
	}

#ifdef TCP_SIGNATURE
	if (tp->t_flags & TF_SIGNATURE) {
		union sockaddr_union src, dst;

		memset(&src, 0, sizeof(union sockaddr_union));
		memset(&dst, 0, sizeof(union sockaddr_union));

		/* Build the src/dst pair used to look up the TCP-MD5 SA. */
		switch (tp->pf) {
		case 0:
#ifdef INET
		case AF_INET:
			src.sa.sa_len = sizeof(struct sockaddr_in);
			src.sa.sa_family = AF_INET;
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sa.sa_len = sizeof(struct sockaddr_in);
			dst.sa.sa_family = AF_INET;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#endif
#ifdef INET6
		case AF_INET6:
			src.sa.sa_len = sizeof(struct sockaddr_in6);
			src.sa.sa_family = AF_INET6;
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sa.sa_len = sizeof(struct sockaddr_in6);
			dst.sa.sa_family = AF_INET6;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(0, &src, &dst, IPPROTO_TCP);

		/*
		 * We don't have an SA for this peer, so we turn off
		 * TF_SIGNATURE on the listen socket
		 */
		if (tdb == NULL && tp->t_state == TCPS_LISTEN)
			tp->t_flags &= ~TF_SIGNATURE;

	}

	/* Signature present XOR signature required: reject the segment. */
	if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) {
		tcpstat.tcps_rcvbadsig++;
		return (-1);
	}

	if (sigp) {
		char sig[16];

		if (tdb == NULL) {
			tcpstat.tcps_rcvbadsig++;
			return (-1);
		}

		/* Recompute the MD5 signature and compare to the option. */
		if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0)
			return (-1);

		if (bcmp(sig, sigp, 16)) {
			tcpstat.tcps_rcvbadsig++;
			return (-1);
		}

		tcpstat.tcps_rcvgoodsig++;
	}
#endif /* TCP_SIGNATURE */

	return (0);
}
2421
2422 #if defined(TCP_SACK)
2423 u_long
2424 tcp_seq_subtract(a, b)
2425 u_long a, b;
2426 {
2427 return ((long)(a - b));
2428 }
2429 #endif
2430
2431
2432 #ifdef TCP_SACK
2433 /*
2434 * This function is called upon receipt of new valid data (while not in header
2435 * prediction mode), and it updates the ordered list of sacks.
2436 */
void
tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart,
    tcp_seq rcv_lastend)
{
	/*
	 * First reported block MUST be the most recent one.  Subsequent
	 * blocks SHOULD be in the order in which they arrived at the
	 * receiver.  These two conditions make the implementation fully
	 * compliant with RFC 2018.
	 */
	int i, j = 0, count = 0, lastpos = -1;
	struct sackblk sack, firstsack, temp[MAX_SACK_BLKS];

	/* First clean up current list of sacks */
	for (i = 0; i < tp->rcv_numsacks; i++) {
		sack = tp->sackblks[i];
		/* (0,0) marks an empty slot throughout this function. */
		if (sack.start == 0 && sack.end == 0) {
			count++; /* count = number of blocks to be discarded */
			continue;
		}
		/* Blocks at or below rcv_nxt are cumulatively acked. */
		if (SEQ_LEQ(sack.end, tp->rcv_nxt)) {
			tp->sackblks[i].start = tp->sackblks[i].end = 0;
			count++;
		} else {
			/* Still-live blocks are compacted into temp[]. */
			temp[j].start = tp->sackblks[i].start;
			temp[j++].end = tp->sackblks[i].end;
		}
	}
	tp->rcv_numsacks -= count;
	if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */
		tcp_clean_sackreport(tp);
		if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) {
			/* ==> need first sack block */
			tp->sackblks[0].start = rcv_laststart;
			tp->sackblks[0].end = rcv_lastend;
			tp->rcv_numsacks = 1;
		}
		return;
	}
	/* Otherwise, sack blocks are already present. */
	for (i = 0; i < tp->rcv_numsacks; i++)
		tp->sackblks[i] = temp[i]; /* first copy back sack list */
	/* New data below rcv_nxt needs no SACK block of its own. */
	if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend))
		return; /* sack list remains unchanged */
	/*
	 * From here, segment just received should be (part of) the 1st sack.
	 * Go through list, possibly coalescing sack block entries.
	 */
	firstsack.start = rcv_laststart;
	firstsack.end = rcv_lastend;
	for (i = 0; i < tp->rcv_numsacks; i++) {
		sack = tp->sackblks[i];
		if (SEQ_LT(sack.end, firstsack.start) ||
		    SEQ_GT(sack.start, firstsack.end))
			continue; /* no overlap */
		if (sack.start == firstsack.start && sack.end == firstsack.end){
			/*
			 * identical block; delete it here since we will
			 * move it to the front of the list.
			 */
			tp->sackblks[i].start = tp->sackblks[i].end = 0;
			lastpos = i; /* last posn with a zero entry */
			continue;
		}
		if (SEQ_LEQ(sack.start, firstsack.start))
			firstsack.start = sack.start; /* merge blocks */
		if (SEQ_GEQ(sack.end, firstsack.end))
			firstsack.end = sack.end; /* merge blocks */
		/* Merged content is now in firstsack; zero this slot. */
		tp->sackblks[i].start = tp->sackblks[i].end = 0;
		lastpos = i; /* last posn with a zero entry */
	}
	if (lastpos != -1) { /* at least one merge */
		/* Compact survivors into temp[1..], slot 0 is reserved. */
		for (i = 0, j = 1; i < tp->rcv_numsacks; i++) {
			sack = tp->sackblks[i];
			if (sack.start == 0 && sack.end == 0)
				continue;
			temp[j++] = sack;
		}
		tp->rcv_numsacks = j; /* including first blk (added later) */
		for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */
			tp->sackblks[i] = temp[i];
	} else { /* no merges -- shift sacks by 1 */
		if (tp->rcv_numsacks < MAX_SACK_BLKS)
			tp->rcv_numsacks++;
		/* Oldest block falls off the end when the list is full. */
		for (i = tp->rcv_numsacks-1; i > 0; i--)
			tp->sackblks[i] = tp->sackblks[i-1];
	}
	/* Most recent block goes first, per RFC 2018. */
	tp->sackblks[0] = firstsack;
	return;
}
2527
2528 /*
2529 * Process the TCP SACK option. tp->snd_holes is an ordered list
2530 * of holes (oldest to newest, in terms of the sequence space).
2531 */
/*
 * Parse one TCPOPT_SACK option ('cp' points at the option kind byte,
 * 'optlen' is its total length) and update tp->snd_holes, the ordered
 * list of unacked gaps on the send side.  Silently ignores malformed,
 * old, or out-of-window SACK blocks.
 */
void
tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen)
{
	int tmp_olen;
	u_char *tmp_cp;
	struct sackhole *cur, *p, *temp;

	if (!tp->sack_enable)
		return;
	/* SACK without ACK doesn't make sense. */
	if ((th->th_flags & TH_ACK) == 0)
		return;
	/* Make sure the ACK on this segment is in [snd_una, snd_max]. */
	if (SEQ_LT(th->th_ack, tp->snd_una) ||
	    SEQ_GT(th->th_ack, tp->snd_max))
		return;
	/* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
	if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
		return;
	/* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
	tmp_cp = cp + 2;
	tmp_olen = optlen - 2;
	tcpstat.tcps_sack_rcv_opts++;
	if (tp->snd_numholes < 0)
		tp->snd_numholes = 0;
	if (tp->t_maxseg == 0)
		panic("tcp_sack_option"); /* Should never happen */
	/* Process each (start,end) pair in the option. */
	while (tmp_olen > 0) {
		struct sackblk sack;

		/* bcopy: option data may be unaligned. */
		bcopy(tmp_cp, (char *) &(sack.start), sizeof(tcp_seq));
		NTOHL(sack.start);
		bcopy(tmp_cp + sizeof(tcp_seq),
		    (char *) &(sack.end), sizeof(tcp_seq));
		NTOHL(sack.end);
		tmp_olen -= TCPOLEN_SACK;
		tmp_cp += TCPOLEN_SACK;
		if (SEQ_LEQ(sack.end, sack.start))
			continue; /* bad SACK fields */
		if (SEQ_LEQ(sack.end, tp->snd_una))
			continue; /* old block */
#if defined(TCP_SACK) && defined(TCP_FACK)
		/* Updates snd_fack. */
		if (SEQ_GT(sack.end, tp->snd_fack))
			tp->snd_fack = sack.end;
#endif /* TCP_FACK */
		if (SEQ_GT(th->th_ack, tp->snd_una)) {
			/* Block fully below the cumulative ack: stale. */
			if (SEQ_LT(sack.start, th->th_ack))
				continue;
		}
		if (SEQ_GT(sack.end, tp->snd_max))
			continue;
		if (tp->snd_holes == NULL) { /* first hole */
			tp->snd_holes = (struct sackhole *)
			    pool_get(&sackhl_pool, PR_NOWAIT);
			if (tp->snd_holes == NULL) {
				/* ENOBUFS, so ignore SACKed block for now*/
				goto done;
			}
			/* Hole spans the gap from the ack to this block. */
			cur = tp->snd_holes;
			cur->start = th->th_ack;
			cur->end = sack.start;
			cur->rxmit = cur->start;
			cur->next = NULL;
			tp->snd_numholes = 1;
			tp->rcv_lastsack = sack.end;
			/*
			 * dups is at least one.  If more data has been
			 * SACKed, it can be greater than one.
			 */
			cur->dups = min(tcprexmtthresh,
			    ((sack.end - cur->end)/tp->t_maxseg));
			if (cur->dups < 1)
				cur->dups = 1;
			continue; /* with next sack block */
		}
		/* Go thru list of holes:  p = previous,  cur = current */
		p = cur = tp->snd_holes;
		while (cur) {
			if (SEQ_LEQ(sack.end, cur->start))
				/* SACKs data before the current hole */
				break; /* no use going through more holes */
			if (SEQ_GEQ(sack.start, cur->end)) {
				/* SACKs data beyond the current hole */
				cur->dups++;
				if (((sack.end - cur->end)/tp->t_maxseg) >=
				    tcprexmtthresh)
					cur->dups = tcprexmtthresh;
				p = cur;
				cur = cur->next;
				continue;
			}
			if (SEQ_LEQ(sack.start, cur->start)) {
				/* Data acks at least the beginning of hole */
#if defined(TCP_SACK) && defined(TCP_FACK)
				if (SEQ_GT(sack.end, cur->rxmit))
					tp->retran_data -=
					    tcp_seq_subtract(cur->rxmit,
					    cur->start);
				else
					tp->retran_data -=
					    tcp_seq_subtract(sack.end,
					    cur->start);
#endif /* TCP_FACK */
				if (SEQ_GEQ(sack.end, cur->end)) {
					/* Acks entire hole, so delete hole */
					if (p != cur) {
						p->next = cur->next;
						pool_put(&sackhl_pool, cur);
						cur = p->next;
					} else {
						/* Deleting the list head. */
						cur = cur->next;
						pool_put(&sackhl_pool, p);
						p = cur;
						tp->snd_holes = p;
					}
					tp->snd_numholes--;
					continue;
				}
				/* otherwise, move start of hole forward */
				cur->start = sack.end;
				cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
				p = cur;
				cur = cur->next;
				continue;
			}
			/* move end of hole backward */
			if (SEQ_GEQ(sack.end, cur->end)) {
#if defined(TCP_SACK) && defined(TCP_FACK)
				if (SEQ_GT(cur->rxmit, sack.start))
					tp->retran_data -=
					    tcp_seq_subtract(cur->rxmit,
					    sack.start);
#endif /* TCP_FACK */
				cur->end = sack.start;
				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
				cur->dups++;
				if (((sack.end - cur->end)/tp->t_maxseg) >=
				    tcprexmtthresh)
					cur->dups = tcprexmtthresh;
				p = cur;
				cur = cur->next;
				continue;
			}
			if (SEQ_LT(cur->start, sack.start) &&
			    SEQ_GT(cur->end, sack.end)) {
				/*
				 * ACKs some data in middle of a hole; need to
				 * split current hole
				 */
				temp = (struct sackhole *)
				    pool_get(&sackhl_pool, PR_NOWAIT);
				if (temp == NULL)
					goto done; /* ENOBUFS */
#if defined(TCP_SACK) && defined(TCP_FACK)
				if (SEQ_GT(cur->rxmit, sack.end))
					tp->retran_data -=
					    tcp_seq_subtract(sack.end,
					    sack.start);
				else if (SEQ_GT(cur->rxmit, sack.start))
					tp->retran_data -=
					    tcp_seq_subtract(cur->rxmit,
					    sack.start);
#endif /* TCP_FACK */
				/* temp = upper fragment, cur = lower. */
				temp->next = cur->next;
				temp->start = sack.end;
				temp->end = cur->end;
				temp->dups = cur->dups;
				temp->rxmit = SEQ_MAX(cur->rxmit, temp->start);
				cur->end = sack.start;
				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
				cur->dups++;
				if (((sack.end - cur->end)/tp->t_maxseg) >=
					tcprexmtthresh)
					cur->dups = tcprexmtthresh;
				cur->next = temp;
				p = temp;
				cur = p->next;
				tp->snd_numholes++;
			}
		}
		/* At this point, p points to the last hole on the list */
		if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
			/*
			 * Need to append new hole at end.
			 * Last hole is p (and it's not NULL).
			 */
			temp = (struct sackhole *)
			    pool_get(&sackhl_pool, PR_NOWAIT);
			if (temp == NULL)
				goto done; /* ENOBUFS */
			temp->start = tp->rcv_lastsack;
			temp->end = sack.start;
			temp->dups = min(tcprexmtthresh,
			    ((sack.end - sack.start)/tp->t_maxseg));
			if (temp->dups < 1)
				temp->dups = 1;
			temp->rxmit = temp->start;
			temp->next = 0;
			p->next = temp;
			tp->rcv_lastsack = sack.end;
			tp->snd_numholes++;
		}
	}
done:
#if defined(TCP_SACK) && defined(TCP_FACK)
	/*
	 * Update retran_data and snd_awnd.  Go through the list of
	 * holes.   Increment retran_data by (hole->rxmit - hole->start).
	 */
	tp->retran_data = 0;
	cur = tp->snd_holes;
	while (cur) {
		tp->retran_data += cur->rxmit - cur->start;
		cur = cur->next;
	}
	tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) +
	    tp->retran_data;
#endif /* TCP_FACK */

	return;
}
2754
2755 /*
2756 * Delete stale (i.e, cumulatively ack'd) holes. Hole is deleted only if
2757 * it is completely acked; otherwise, tcp_sack_option(), called from
2758 * tcp_dooptions(), will fix up the hole.
2759 */
void
tcp_del_sackholes(tp, th)
	struct tcpcb *tp;
	struct tcphdr *th;
{
	if (tp->sack_enable && tp->t_state != TCPS_LISTEN) {
		/* max because this could be an older ack just arrived */
		tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
			th->th_ack : tp->snd_una;
		struct sackhole *cur = tp->snd_holes;
		struct sackhole *prev;
		/* Holes are ordered; stop at the first one not covered. */
		while (cur)
			if (SEQ_LEQ(cur->end, lastack)) {
				/* Hole fully acked: free it and advance. */
				prev = cur;
				cur = cur->next;
				pool_put(&sackhl_pool, prev);
				tp->snd_numholes--;
			} else if (SEQ_LT(cur->start, lastack)) {
				/* Partially acked: trim its lower edge. */
				cur->start = lastack;
				if (SEQ_LT(cur->rxmit, cur->start))
					cur->rxmit = cur->start;
				break;
			} else
				break;
		/* cur is the first surviving hole (or NULL). */
		tp->snd_holes = cur;
	}
}
2787
2788 /*
2789 * Delete all receiver-side SACK information.
2790 */
2791 void
2792 tcp_clean_sackreport(tp)
2793 struct tcpcb *tp;
2794 {
2795 int i;
2796
2797 tp->rcv_numsacks = 0;
2798 for (i = 0; i < MAX_SACK_BLKS; i++)
2799 tp->sackblks[i].start = tp->sackblks[i].end=0;
2800
2801 }
2802
2803 /*
2804 * Checks for partial ack. If partial ack arrives, turn off retransmission
2805 * timer, deflate the window, do not clear tp->t_dupacks, and return 1.
2806 * If the ack advances at least to tp->snd_last, return 0.
2807 */
int
tcp_sack_partialack(tp, th)
	struct tcpcb *tp;
	struct tcphdr *th;
{
	/* Ack below snd_last ==> this is only a partial ack. */
	if (SEQ_LT(th->th_ack, tp->snd_last)) {
		/* Turn off retx. timer (will start again next segment) */
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->t_rtttime = 0;	/* cancel the pending RTT sample */
#ifndef TCP_FACK
		/*
		 * Partial window deflation.  This statement relies on the
		 * fact that tp->snd_una has not been updated yet.  In FACK
		 * hold snd_cwnd constant during fast recovery.
		 */
		if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) {
			tp->snd_cwnd -= th->th_ack - tp->snd_una;
			tp->snd_cwnd += tp->t_maxseg;
		} else
			tp->snd_cwnd = tp->t_maxseg;
#endif
		return (1);
	}
	return (0);
}
2833 #endif /* TCP_SACK */
2834
2835 /*
2836 * Pull out of band byte out of a segment so
2837 * it doesn't appear in the user's data queue.
2838 * It is still reflected in the segment length for
2839 * sequencing purposes.
2840 */
void
tcp_pulloutofband(so, urgent, m, off)
	struct socket *so;
	u_int urgent;		/* urgent-pointer offset within the segment */
	struct mbuf *m;		/* mbuf chain holding the segment data */
	int off;		/* offset of segment data within the chain */
{
	/* Index of the out-of-band byte relative to the chain start. */
	int cnt = off + urgent - 1;

	while (cnt >= 0) {
		if (m->m_len > cnt) {
			/* The OOB byte lives in this mbuf. */
			char *cp = mtod(m, caddr_t) + cnt;
			struct tcpcb *tp = sototcpcb(so);

			/* Record the byte in the tcpcb for later retrieval. */
			tp->t_iobc = *cp;
			tp->t_oobflags |= TCPOOB_HAVEDATA;
			/* Close the gap so the byte vanishes from the
			 * in-band data stream. */
			bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
			m->m_len--;
			return;
		}
		cnt -= m->m_len;
		m = m->m_next;
		if (m == 0)
			break;
	}
	/* Caller guarantees the urgent byte lies within the chain. */
	panic("tcp_pulloutofband");
}
2868
2869 /*
2870 * Collect new round-trip time estimate
2871 * and update averages and current timeout.
2872 */
void
tcp_xmit_timer(tp, rtt)
	struct tcpcb *tp;
	short rtt;		/* measured round-trip time, in timer ticks */
{
	short delta;
	short rttmin;

	/* Clamp the sample to a sane range before smoothing. */
	if (rtt < 0)
		rtt = 0;
	else if (rtt > TCP_RTT_MAX)
		rtt = TCP_RTT_MAX;

	tcpstat.tcps_rttupdated++;
	if (tp->t_srtt != 0) {
		/*
		 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits
		 * after the binary point (scaled by 4), whereas
		 * srtt is stored as fixed point with 5 bits after the
		 * binary point (i.e., scaled by 32).  The following magic
		 * is equivalent to the smoothing algorithm in rfc793 with
		 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
		 * point).
		 */
		delta = (rtt << TCP_RTT_BASE_SHIFT) -
		    (tp->t_srtt >> TCP_RTT_SHIFT);
		/* Keep srtt strictly positive. */
		if ((tp->t_srtt += delta) <= 0)
			tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT;
		/*
		 * We accumulate a smoothed rtt variance (actually, a
		 * smoothed mean difference), then set the retransmit
		 * timer to smoothed rtt + 4 times the smoothed variance.
		 * rttvar is stored as fixed point with 4 bits after the
		 * binary point (scaled by 16).  The following is
		 * equivalent to rfc793 smoothing with an alpha of .75
		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
		 * rfc793's wired-in beta.
		 */
		if (delta < 0)
			delta = -delta;
		delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
		if ((tp->t_rttvar += delta) <= 0)
			tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT;
	} else {
		/*
		 * No rtt measurement yet - use the unsmoothed rtt.
		 * Set the variance to half the rtt (so our first
		 * retransmit happens at 3*rtt).
		 */
		tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT);
		tp->t_rttvar = (rtt + 1) <<
		    (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1);
	}
	/* Sample consumed: stop timing and reset the backoff shift. */
	tp->t_rtttime = 0;
	tp->t_rxtshift = 0;

	/*
	 * the retransmit should happen at rtt + 4 * rttvar.
	 * Because of the way we do the smoothing, srtt and rttvar
	 * will each average +1/2 tick of bias.  When we compute
	 * the retransmit timer, we want 1/2 tick of rounding and
	 * 1 extra tick because of +-1/2 tick uncertainty in the
	 * firing of the timer.  The bias will give us exactly the
	 * 1.5 tick we need.  But, because the bias is
	 * statistical, we have to test that we don't drop below
	 * the minimum feasible timer (which is 2 ticks).
	 */
	rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX);
	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX);

	/*
	 * We received an ack for a packet that wasn't retransmitted;
	 * it is probably safe to discard any error indications we've
	 * received recently.  This isn't quite right, but close enough
	 * for now (a route might have failed after we sent a segment,
	 * and the return path might not be symmetrical).
	 */
	tp->t_softerror = 0;
}
2952
2953 /*
2954 * Determine a reasonable value for maxseg size.
2955 * If the route is known, check route for mtu.
2956 * If none, use an mss that can be handled on the outgoing
2957 * interface without forcing IP to fragment; if bigger than
2958 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
2959 * to utilize large mbufs. If no route is found, route has no mtu,
2960 * or the destination isn't local, use a default, hopefully conservative
2961 * size (usually 512 or the default IP max size, but no more than the mtu
2962 * of the interface), as we can't discover anything about intervening
2963 * gateways or networks. We also initialize the congestion/slow start
2964 * window to be a single segment if the destination isn't local.
2965 * While looking at the routing entry, we also initialize other path-dependent
2966 * parameters from pre-set or cached values in the routing entry.
2967 *
2968 * Also take into account the space needed for options that we
2969 * send regularly. Make maxseg shorter by that amount to assure
2970 * that we can send maxseg amount of data even when the options
2971 * are present. Store the upper limit of the length of options plus
2972 * data in maxopd.
2973 *
2974 * NOTE: offer == -1 indicates that the maxseg size changed due to
2975 * Path MTU discovery.
2976 */
int
tcp_mss(tp, offer)
	struct tcpcb *tp;
	int offer;		/* peer's MSS offer; 0 = none; -1 = PMTU change */
{
	struct rtentry *rt;
	struct ifnet *ifp;
	int mss, mssopt;
	int iphlen;
	struct inpcb *inp;

	inp = tp->t_inpcb;

	/* Conservative defaults when no route information is available. */
	mssopt = mss = tcp_mssdflt;

	rt = in_pcbrtentry(inp);

	if (rt == NULL)
		goto out;

	ifp = rt->rt_ifp;

	switch (tp->pf) {
#ifdef INET6
	case AF_INET6:
		iphlen = sizeof(struct ip6_hdr);
		break;
#endif
	case AF_INET:
		iphlen = sizeof(struct ip);
		break;
	default:
		/* the family does not support path MTU discovery */
		goto out;
	}

#ifdef RTV_MTU
	/*
	 * if there's an mtu associated with the route and we support
	 * path MTU discovery for the underlying protocol family, use it.
	 */
	if (rt->rt_rmx.rmx_mtu) {
		/*
		 * One may wish to lower MSS to take into account options,
		 * especially security-related options.
		 */
		if (tp->pf == AF_INET6 && rt->rt_rmx.rmx_mtu < IPV6_MMTU) {
			/*
			 * RFC2460 section 5, last paragraph: if path MTU is
			 * smaller than 1280, use 1280 as packet size and
			 * attach fragment header.
			 */
			mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) -
			    sizeof(struct tcphdr);
		} else
			mss = rt->rt_rmx.rmx_mtu - iphlen - sizeof(struct tcphdr);
	} else
#endif /* RTV_MTU */
	if (!ifp)
		/*
		 * ifp may be null and rmx_mtu may be zero in certain
		 * v6 cases (e.g., if ND wasn't able to resolve the
		 * destination host.
		 */
		goto out;
	else if (ifp->if_flags & IFF_LOOPBACK)
		/* Loopback: the interface MTU is always usable. */
		mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
	else if (tp->pf == AF_INET) {
		if (ip_mtudisc)
			mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
		else if (inp && in_localaddr(inp->inp_faddr))
			/* Peer on a local net: no fragmentation expected. */
			mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
	}
#ifdef INET6
	else if (tp->pf == AF_INET6) {
		/*
		 * for IPv6, path MTU discovery is always turned on,
		 * or the node must use packet size <= 1280.
		 */
		mss = IN6_LINKMTU(ifp) - iphlen - sizeof(struct tcphdr);
	}
#endif /* INET6 */

	/* Calculate the value that we offer in TCPOPT_MAXSEG */
	if (offer != -1) {
#ifndef INET6
		mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
#else
		if (tp->pf == AF_INET6)
			mssopt = IN6_LINKMTU(ifp) - iphlen -
			    sizeof(struct tcphdr);
		else
			mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
#endif

		mssopt = max(tcp_mssdflt, mssopt);
	}

 out:
	/*
	 * The current mss, t_maxseg, is initialized to the default value.
	 * If we compute a smaller value, reduce the current mss.
	 * If we compute a larger value, return it for use in sending
	 * a max seg size option, but don't store it for use
	 * unless we received an offer at least that large from peer.
	 *
	 * However, do not accept offers lower than the minimum of
	 * the interface MTU and 216.
	 */
	if (offer > 0)
		tp->t_peermss = offer;
	if (tp->t_peermss)
		mss = min(mss, max(tp->t_peermss, 216));

	/* sanity - at least max opt. space */
	mss = max(mss, 64);

	/*
	 * maxopd stores the maximum length of data AND options
	 * in a segment; maxseg is the amount of data in a normal
	 * segment.  We need to store this value (maxopd) apart
	 * from maxseg, because now every segment carries options
	 * and thus we normally have somewhat less data in segments.
	 */
	tp->t_maxopd = mss;

	/* Reserve room for options we send on (nearly) every segment. */
	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
	    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
		mss -= TCPOLEN_TSTAMP_APPA;
#ifdef TCP_SIGNATURE
	if (tp->t_flags & TF_SIGNATURE)
		mss -= TCPOLEN_SIGLEN;
#endif

	if (offer == -1) {
		/* mss changed due to Path MTU discovery */
		tp->t_flags &= ~TF_PMTUD_PEND;
		tp->t_pmtud_mtu_sent = 0;
		tp->t_pmtud_mss_acked = 0;
		if (mss < tp->t_maxseg) {
			/*
			 * Follow suggestion in RFC 2414 to reduce the
			 * congestion window by the ratio of the old
			 * segment size to the new segment size.
			 */
			tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) *
					     mss, mss);
		}
	} else if (tcp_do_rfc3390) {
		/* increase initial window  */
		tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380));
	} else
		tp->snd_cwnd = mss;

	tp->t_maxseg = mss;

	/* Callers advertising an MSS want mssopt; others want the mss. */
	return (offer != -1 ? mssopt : mss);
}
3135
3136 u_int
3137 tcp_hdrsz(struct tcpcb *tp)
3138 {
3139 u_int hlen;
3140
3141 switch (tp->pf) {
3142 #ifdef INET6
3143 case AF_INET6:
3144 hlen = sizeof(struct ip6_hdr);
3145 break;
3146 #endif
3147 case AF_INET:
3148 hlen = sizeof(struct ip);
3149 break;
3150 default:
3151 hlen = 0;
3152 break;
3153 }
3154 hlen += sizeof(struct tcphdr);
3155
3156 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
3157 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
3158 hlen += TCPOLEN_TSTAMP_APPA;
3159 #ifdef TCP_SIGNATURE
3160 if (tp->t_flags & TF_SIGNATURE)
3161 hlen += TCPOLEN_SIGLEN;
3162 #endif
3163 return (hlen);
3164 }
3165
3166 /*
3167 * Set connection variables based on the effective MSS.
3168 * We are passed the TCPCB for the actual connection. If we
3169 * are the server, we are called by the compressed state engine
3170 * when the 3-way handshake is complete. If we are the client,
3171 * we are called when we receive the SYN,ACK from the server.
3172 *
3173 * NOTE: The t_maxseg value must be initialized in the TCPCB
3174 * before this routine is called!
3175 */
void
tcp_mss_update(tp)
	struct tcpcb *tp;
{
	int mss;
	u_long bufsize;
	struct rtentry *rt;
	struct socket *so;

	so = tp->t_inpcb->inp_socket;
	mss = tp->t_maxseg;

	rt = in_pcbrtentry(tp->t_inpcb);

	/* No route: nothing to size the buffers against. */
	if (rt == NULL)
		return;

	bufsize = so->so_snd.sb_hiwat;
	if (bufsize < mss) {
		/* Send buffer smaller than MSS: shrink the MSS instead. */
		mss = bufsize;
		/* Update t_maxseg and t_maxopd */
		tcp_mss(tp, mss);
	} else {
		/* Round the send buffer up to a whole number of segments. */
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		(void)sbreserve(&so->so_snd, bufsize);
	}

	/* Same rounding for the receive buffer. */
	bufsize = so->so_rcv.sb_hiwat;
	if (bufsize > mss) {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		(void)sbreserve(&so->so_rcv, bufsize);
	}

}
3214
3215 #if defined (TCP_SACK)
3216 /*
3217 * Checks for partial ack. If partial ack arrives, force the retransmission
3218 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
3219 * 1. By setting snd_nxt to ti_ack, this forces retransmission timer to
3220 * be started again. If the ack advances at least to tp->snd_last, return 0.
3221 */
int
tcp_newreno(tp, th)
	struct tcpcb *tp;
	struct tcphdr *th;
{
	/* Ack below snd_last ==> this is only a partial ack. */
	if (SEQ_LT(th->th_ack, tp->snd_last)) {
		/*
		 * snd_una has not been updated and the socket send buffer
		 * not yet drained of the acked data, so we have to leave
		 * snd_una as it was to get the correct data offset in
		 * tcp_output().
		 */
		tcp_seq onxt = tp->snd_nxt;
		u_long  ocwnd = tp->snd_cwnd;
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->t_rtttime = 0;
		/* Force retransmission of the first unacked segment. */
		tp->snd_nxt = th->th_ack;
		/*
		 * Set snd_cwnd to one segment beyond acknowledged offset
		 * (tp->snd_una not yet updated when this function is called)
		 */
		tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
		(void) tcp_output(tp);
		/* Restore cwnd and the highest sequence sent so far. */
		tp->snd_cwnd = ocwnd;
		if (SEQ_GT(onxt, tp->snd_nxt))
			tp->snd_nxt = onxt;
		/*
		 * Partial window deflation.  Relies on fact that tp->snd_una
		 * not updated yet.
		 */
		if (tp->snd_cwnd > th->th_ack - tp->snd_una)
			tp->snd_cwnd -= th->th_ack - tp->snd_una;
		else
			tp->snd_cwnd = 0;
		tp->snd_cwnd += tp->t_maxseg;

		return 1;
	}
	return 0;
}
3262 #endif /* TCP_SACK */
3263
3264 int
3265 tcp_mss_adv(struct ifnet *ifp, int af)
3266 {
3267 int mss = 0;
3268 int iphlen;
3269
3270 switch (af) {
3271 case AF_INET:
3272 if (ifp != NULL)
3273 mss = ifp->if_mtu;
3274 iphlen = sizeof(struct ip);
3275 break;
3276 #ifdef INET6
3277 case AF_INET6:
3278 if (ifp != NULL)
3279 mss = IN6_LINKMTU(ifp);
3280 iphlen = sizeof(struct ip6_hdr);
3281 break;
3282 #endif
3283 }
3284 mss = mss - iphlen - sizeof(struct tcphdr);
3285 return (max(mss, tcp_mssdflt));
3286 }
3287
3288 /*
3289 * TCP compressed state engine. Currently used to hold compressed
3290 * state for SYN_RECEIVED.
3291 */
3292
/* Total number of entries currently in the syn cache (all buckets). */
u_long	syn_cache_count;
/* Random hash keys; NOTE(review): presumably seeded elsewhere at init. */
u_int32_t syn_hash1, syn_hash2;

/* Keyed hash over IPv4 source address, source port and dest port. */
#define SYN_HASH(sa, sp, dp) \
	((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
				     ((u_int32_t)(sp)))^syn_hash2)))
#ifndef INET6
#define	SYN_HASHALL(hash, src, dst) \
do {									\
	hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr,	\
		((struct sockaddr_in *)(src))->sin_port,		\
		((struct sockaddr_in *)(dst))->sin_port);		\
} while (/*CONSTCOND*/ 0)
#else
/* IPv6 variant: mixes the first and last 32 bits of the address. */
#define SYN_HASH6(sa, sp, dp) \
	((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash2) * \
	  (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash1)) \
	 & 0x7fffffff)

/* Dispatch on the source sockaddr's family; unknown families hash to 0. */
#define SYN_HASHALL(hash, src, dst) \
do {									\
	switch ((src)->sa_family) {					\
	case AF_INET:							\
		hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \
			((struct sockaddr_in *)(src))->sin_port,	\
			((struct sockaddr_in *)(dst))->sin_port);	\
		break;							\
	case AF_INET6:							\
		hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \
			((struct sockaddr_in6 *)(src))->sin6_port,	\
			((struct sockaddr_in6 *)(dst))->sin6_port);	\
		break;							\
	default:							\
		hash = 0;						\
	}								\
} while (/*CONSTCOND*/0)
#endif /* INET6 */

/*
 * Unlink a syn-cache entry from its hash bucket and its parent tcpcb's
 * list, cancel its timer and mark it dead.  Does not free the entry;
 * pair with SYN_CACHE_PUT for that.
 */
#define	SYN_CACHE_RM(sc)						\
do {									\
	(sc)->sc_flags |= SCF_DEAD;					\
	TAILQ_REMOVE(&tcp_syn_cache[(sc)->sc_bucketidx].sch_bucket,	\
	    (sc), sc_bucketq);						\
	(sc)->sc_tp = NULL;						\
	LIST_REMOVE((sc), sc_tpq);					\
	tcp_syn_cache[(sc)->sc_bucketidx].sch_length--;			\
	timeout_del(&(sc)->sc_timer);					\
	syn_cache_count--;						\
} while (/*CONSTCOND*/0)

/*
 * Release a syn-cache entry's resources and schedule the entry itself
 * for destruction via the reaper timeout (freeing is deferred).
 */
#define	SYN_CACHE_PUT(sc)						\
do {									\
	if ((sc)->sc_ipopts)						\
		(void) m_free((sc)->sc_ipopts);				\
	if ((sc)->sc_route4.ro_rt != NULL)				\
		RTFREE((sc)->sc_route4.ro_rt);				\
	timeout_set(&(sc)->sc_timer, syn_cache_reaper, (sc));		\
	timeout_add(&(sc)->sc_timer, 0);				\
} while (/*CONSTCOND*/0)

/* Pool from which syn-cache entries are allocated. */
struct pool syn_cache_pool;
3354
3355 /*
3356 * We don't estimate RTT with SYNs, so each packet starts with the default
3357 * RTT and each timer step has a fixed timeout value.
3358 */
3359 #define SYN_CACHE_TIMER_ARM(sc) \
3360 do { \
3361 TCPT_RANGESET((sc)->sc_rxtcur, \
3362 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \
3363 TCPTV_REXMTMAX); \
3364 if (!timeout_initialized(&(sc)->sc_timer)) \
3365 timeout_set(&(sc)->sc_timer, syn_cache_timer, (sc)); \
3366 timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \
3367 } while (/*CONSTCOND*/0)
3368
3369 #define SYN_CACHE_TIMESTAMP(sc) tcp_now + (sc)->sc_modulate
3370
3371 void
3372 syn_cache_init()
3373 {
3374 int i;
3375
3376 /* Initialize the hash buckets. */
3377 for (i = 0; i < tcp_syn_cache_size; i++)
3378 TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);
3379
3380 /* Initialize the syn cache pool. */
3381 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
3382 "synpl", NULL);
3383 }
3384
/*
 * Insert a freshly-built syn cache entry "sc" into the hash table and
 * onto listener "tp"'s per-tcpcb list, evicting an old entry first if
 * either the per-bucket or the global limit has been reached, and arm
 * the entry's retransmit timer.
 */
void
syn_cache_insert(sc, tp)
	struct syn_cache *sc;
	struct tcpcb *tp;
{
	struct syn_cache_head *scp;
	struct syn_cache *sc2;
	int s;

	/*
	 * If there are no entries in the hash table, reinitialize
	 * the hash secrets.
	 */
	if (syn_cache_count == 0) {
		syn_hash1 = arc4random();
		syn_hash2 = arc4random();
	}

	/* Hash on (src addr, src port, dst port) and pick the bucket. */
	SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
	sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
	scp = &tcp_syn_cache[sc->sc_bucketidx];

	/*
	 * Make sure that we don't overflow the per-bucket
	 * limit or the total cache size limit.
	 */
	s = splsoftnet();
	if (scp->sch_length >= tcp_syn_bucket_limit) {
		tcpstat.tcps_sc_bucketoverflow++;
		/*
		 * The bucket is full. Toss the oldest element in the
		 * bucket. This will be the first entry in the bucket.
		 */
		sc2 = TAILQ_FIRST(&scp->sch_bucket);
#ifdef DIAGNOSTIC
		/*
		 * This should never happen; we should always find an
		 * entry in our bucket.
		 */
		if (sc2 == NULL)
			panic("syn_cache_insert: bucketoverflow: impossible");
#endif
		SYN_CACHE_RM(sc2);
		SYN_CACHE_PUT(sc2);
	} else if (syn_cache_count >= tcp_syn_cache_limit) {
		struct syn_cache_head *scp2, *sce;

		tcpstat.tcps_sc_overflowed++;
		/*
		 * The cache is full. Toss the oldest entry in the
		 * first non-empty bucket we can find.
		 *
		 * XXX We would really like to toss the oldest
		 * entry in the cache, but we hope that this
		 * condition doesn't happen very often.
		 */
		scp2 = scp;
		if (TAILQ_EMPTY(&scp2->sch_bucket)) {
			/* Circular scan of all buckets, wrapping at the end. */
			sce = &tcp_syn_cache[tcp_syn_cache_size];
			for (++scp2; scp2 != scp; scp2++) {
				if (scp2 >= sce)
					scp2 = &tcp_syn_cache[0];
				if (! TAILQ_EMPTY(&scp2->sch_bucket))
					break;
			}
#ifdef DIAGNOSTIC
			/*
			 * This should never happen; we should always find a
			 * non-empty bucket.
			 */
			if (scp2 == scp)
				panic("syn_cache_insert: cacheoverflow: "
				    "impossible");
#endif
		}
		sc2 = TAILQ_FIRST(&scp2->sch_bucket);
		SYN_CACHE_RM(sc2);
		SYN_CACHE_PUT(sc2);
	}

	/*
	 * Initialize the entry's timer.
	 */
	sc->sc_rxttot = 0;
	sc->sc_rxtshift = 0;
	SYN_CACHE_TIMER_ARM(sc);

	/* Link it from tcpcb entry */
	LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);

	/* Put it into the bucket. */
	TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
	scp->sch_length++;
	syn_cache_count++;

	tcpstat.tcps_sc_added++;
	splx(s);
}
3483
3484 /*
3485 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
3486 * If we have retransmitted an entry the maximum number of times, expire
3487 * that entry.
3488 */
3489 void
3490 syn_cache_timer(void *arg)
3491 {
3492 struct syn_cache *sc = arg;
3493 int s;
3494
3495 s = splsoftnet();
3496 if (sc->sc_flags & SCF_DEAD) {
3497 splx(s);
3498 return;
3499 }
3500
3501 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
3502 /* Drop it -- too many retransmissions. */
3503 goto dropit;
3504 }
3505
3506 /*
3507 * Compute the total amount of time this entry has
3508 * been on a queue. If this entry has been on longer
3509 * than the keep alive timer would allow, expire it.
3510 */
3511 sc->sc_rxttot += sc->sc_rxtcur;
3512 if (sc->sc_rxttot >= tcptv_keep_init)
3513 goto dropit;
3514
3515 tcpstat.tcps_sc_retransmitted++;
3516 (void) syn_cache_respond(sc, NULL);
3517
3518 /* Advance the timer back-off. */
3519 sc->sc_rxtshift++;
3520 SYN_CACHE_TIMER_ARM(sc);
3521
3522 splx(s);
3523 return;
3524
3525 dropit:
3526 tcpstat.tcps_sc_timed_out++;
3527 SYN_CACHE_RM(sc);
3528 SYN_CACHE_PUT(sc);
3529 splx(s);
3530 }
3531
3532 void
3533 syn_cache_reaper(void *arg)
3534 {
3535 struct syn_cache *sc = arg;
3536 int s;
3537
3538 s = splsoftnet();
3539 pool_put(&syn_cache_pool, (sc));
3540 splx(s);
3541 return;
3542 }
3543
3544 /*
3545 * Remove syn cache created by the specified tcb entry,
3546 * because this does not make sense to keep them
3547 * (if there's no tcb entry, syn cache entry will never be used)
3548 */
3549 void
3550 syn_cache_cleanup(tp)
3551 struct tcpcb *tp;
3552 {
3553 struct syn_cache *sc, *nsc;
3554 int s;
3555
3556 s = splsoftnet();
3557
3558 for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
3559 nsc = LIST_NEXT(sc, sc_tpq);
3560
3561 #ifdef DIAGNOSTIC
3562 if (sc->sc_tp != tp)
3563 panic("invalid sc_tp in syn_cache_cleanup");
3564 #endif
3565 SYN_CACHE_RM(sc);
3566 SYN_CACHE_PUT(sc);
3567 }
3568 /* just for safety */
3569 LIST_INIT(&tp->t_sc);
3570
3571 splx(s);
3572 }
3573
3574 /*
3575 * Find an entry in the syn cache.
3576 */
3577 struct syn_cache *
3578 syn_cache_lookup(src, dst, headp)
3579 struct sockaddr *src;
3580 struct sockaddr *dst;
3581 struct syn_cache_head **headp;
3582 {
3583 struct syn_cache *sc;
3584 struct syn_cache_head *scp;
3585 u_int32_t hash;
3586 int s;
3587
3588 SYN_HASHALL(hash, src, dst);
3589
3590 scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
3591 *headp = scp;
3592 s = splsoftnet();
3593 for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
3594 sc = TAILQ_NEXT(sc, sc_bucketq)) {
3595 if (sc->sc_hash != hash)
3596 continue;
3597 if (!bcmp(&sc->sc_src, src, src->sa_len) &&
3598 !bcmp(&sc->sc_dst, dst, dst->sa_len)) {
3599 splx(s);
3600 return (sc);
3601 }
3602 }
3603 splx(s);
3604 return (NULL);
3605 }
3606
3607 /*
3608 * This function gets called when we receive an ACK for a
3609 * socket in the LISTEN state. We look up the connection
3610 * in the syn cache, and if its there, we pull it out of
3611 * the cache and turn it into a full-blown connection in
3612 * the SYN-RECEIVED state.
3613 *
3614 * The return values may not be immediately obvious, and their effects
3615 * can be subtle, so here they are:
3616 *
3617 * NULL SYN was not found in cache; caller should drop the
3618 * packet and send an RST.
3619 *
3620 * -1 We were unable to create the new connection, and are
3621 * aborting it. An ACK,RST is being sent to the peer
3622 * (unless we got screwey sequence numbners; see below),
3623 * because the 3-way handshake has been completed. Caller
3624 * should not free the mbuf, since we may be using it. If
3625 * we are not, we will free it.
3626 *
3627 * Otherwise, the return value is a pointer to the new socket
3628 * associated with the connection.
3629 */
3630 struct socket *
3631 syn_cache_get(src, dst, th, hlen, tlen, so, m)
3632 struct sockaddr *src;
3633 struct sockaddr *dst;
3634 struct tcphdr *th;
3635 unsigned int hlen, tlen;
3636 struct socket *so;
3637 struct mbuf *m;
3638 {
3639 struct syn_cache *sc;
3640 struct syn_cache_head *scp;
3641 struct inpcb *inp = NULL;
3642 struct tcpcb *tp = 0;
3643 struct mbuf *am;
3644 int s;
3645 struct socket *oso;
3646
3647 s = splsoftnet();
3648 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
3649 splx(s);
3650 return (NULL);
3651 }
3652
3653 /*
3654 * Verify the sequence and ack numbers. Try getting the correct
3655 * response again.
3656 */
3657 if ((th->th_ack != sc->sc_iss + 1) ||
3658 SEQ_LEQ(th->th_seq, sc->sc_irs) ||
3659 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
3660 (void) syn_cache_respond(sc, m);
3661 splx(s);
3662 return ((struct socket *)(-1));
3663 }
3664
3665 /* Remove this cache entry */
3666 SYN_CACHE_RM(sc);
3667 splx(s);
3668
3669 /*
3670 * Ok, create the full blown connection, and set things up
3671 * as they would have been set up if we had created the
3672 * connection when the SYN arrived. If we can't create
3673 * the connection, abort it.
3674 */
3675 oso = so;
3676 so = sonewconn(so, SS_ISCONNECTED);
3677 if (so == NULL)
3678 goto resetandabort;
3679
3680 inp = sotoinpcb(oso);
3681 #ifdef IPSEC
3682 /*
3683 * We need to copy the required security levels
3684 * from the old pcb. Ditto for any other
3685 * IPsec-related information.
3686 */
3687 {
3688 struct inpcb *newinp = (struct inpcb *)so->so_pcb;
3689 bcopy(inp->inp_seclevel, newinp->inp_seclevel,
3690 sizeof(inp->inp_seclevel));
3691 newinp->inp_secrequire = inp->inp_secrequire;
3692 if (inp->inp_ipo != NULL) {
3693 newinp->inp_ipo = inp->inp_ipo;
3694 inp->inp_ipo->ipo_ref_count++;
3695 }
3696 if (inp->inp_ipsec_remotecred != NULL) {
3697 newinp->inp_ipsec_remotecred = inp->inp_ipsec_remotecred;
3698 inp->inp_ipsec_remotecred->ref_count++;
3699 }
3700 if (inp->inp_ipsec_remoteauth != NULL) {
3701 newinp->inp_ipsec_remoteauth
3702 = inp->inp_ipsec_remoteauth;
3703 inp->inp_ipsec_remoteauth->ref_count++;
3704 }
3705 }
3706 #endif /* IPSEC */
3707 #ifdef INET6
3708 /*
3709 * inp still has the OLD in_pcb stuff, set the
3710 * v6-related flags on the new guy, too.
3711 */
3712 {
3713 int flags = inp->inp_flags;
3714 struct inpcb *oldinpcb = inp;
3715
3716 inp = (struct inpcb *)so->so_pcb;
3717 inp->inp_flags |= (flags & INP_IPV6);
3718 if ((inp->inp_flags & INP_IPV6) != 0) {
3719 inp->inp_ipv6.ip6_hlim =
3720 oldinpcb->inp_ipv6.ip6_hlim;
3721 }
3722 }
3723 #else /* INET6 */
3724 inp = (struct inpcb *)so->so_pcb;
3725 #endif /* INET6 */
3726
3727 inp->inp_lport = th->th_dport;
3728 switch (src->sa_family) {
3729 #ifdef INET6
3730 case AF_INET6:
3731 inp->inp_laddr6 = ((struct sockaddr_in6 *)dst)->sin6_addr;
3732 break;
3733 #endif /* INET6 */
3734 case AF_INET:
3735
3736 inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr;
3737 inp->inp_options = ip_srcroute();
3738 if (inp->inp_options == NULL) {
3739 inp->inp_options = sc->sc_ipopts;
3740 sc->sc_ipopts = NULL;
3741 }
3742 break;
3743 }
3744 in_pcbrehash(inp);
3745
3746 /*
3747 * Give the new socket our cached route reference.
3748 */
3749 if (src->sa_family == AF_INET)
3750 inp->inp_route = sc->sc_route4; /* struct assignment */
3751 #ifdef INET6
3752 else
3753 inp->inp_route6 = sc->sc_route6;
3754 #endif
3755 sc->sc_route4.ro_rt = NULL;
3756
3757 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */
3758 if (am == NULL)
3759 goto resetandabort;
3760 am->m_len = src->sa_len;
3761 bcopy(src, mtod(am, caddr_t), src->sa_len);
3762
3763 switch (src->sa_family) {
3764 case AF_INET:
3765 /* drop IPv4 packet to AF_INET6 socket */
3766 if (inp->inp_flags & INP_IPV6) {
3767 (void) m_free(am);
3768 goto resetandabort;
3769 }
3770 if (in_pcbconnect(inp, am)) {
3771 (void) m_free(am);
3772 goto resetandabort;
3773 }
3774 break;
3775 #ifdef INET6
3776 case AF_INET6:
3777 if (in6_pcbconnect(inp, am)) {
3778 (void) m_free(am);
3779 goto resetandabort;
3780 }
3781 break;
3782 #endif
3783 }
3784 (void) m_free(am);
3785
3786 tp = intotcpcb(inp);
3787 tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
3788 if (sc->sc_request_r_scale != 15) {
3789 tp->requested_s_scale = sc->sc_requested_s_scale;
3790 tp->request_r_scale = sc->sc_request_r_scale;
3791 tp->snd_scale = sc->sc_requested_s_scale;
3792 tp->rcv_scale = sc->sc_request_r_scale;
3793 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
3794 }
3795 if (sc->sc_flags & SCF_TIMESTAMP)
3796 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
3797
3798 tp->t_template = tcp_template(tp);
3799 if (tp->t_template == 0) {
3800 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */
3801 so = NULL;
3802 m_freem(m);
3803 goto abort;
3804 }
3805 #ifdef TCP_SACK
3806 tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT;
3807 #endif
3808
3809 tp->ts_modulate = sc->sc_modulate;
3810 tp->iss = sc->sc_iss;
3811 tp->irs = sc->sc_irs;
3812 tcp_sendseqinit(tp);
3813 #if defined (TCP_SACK) || defined(TCP_ECN)
3814 tp->snd_last = tp->snd_una;
3815 #endif /* TCP_SACK */
3816 #if defined(TCP_SACK) && defined(TCP_FACK)
3817 tp->snd_fack = tp->snd_una;
3818 tp->retran_data = 0;
3819 tp->snd_awnd = 0;
3820 #endif /* TCP_FACK */
3821 #ifdef TCP_ECN
3822 if (sc->sc_flags & SCF_ECN_PERMIT) {
3823 tp->t_flags |= TF_ECN_PERMIT;
3824 tcpstat.tcps_ecn_accepts++;
3825 }
3826 #endif
3827 #ifdef TCP_SACK
3828 if (sc->sc_flags & SCF_SACK_PERMIT)
3829 tp->t_flags |= TF_SACK_PERMIT;
3830 #endif
3831 #ifdef TCP_SIGNATURE
3832 if (sc->sc_flags & SCF_SIGNATURE)
3833 tp->t_flags |= TF_SIGNATURE;
3834 #endif
3835 tcp_rcvseqinit(tp);
3836 tp->t_state = TCPS_SYN_RECEIVED;
3837 tp->t_rcvtime = tcp_now;
3838 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
3839 tcpstat.tcps_accepts++;
3840
3841 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */
3842 if (sc->sc_peermaxseg)
3843 tcp_mss_update(tp);
3844 /* Reset initial window to 1 segment for retransmit */
3845 if (sc->sc_rxtshift > 0)
3846 tp->snd_cwnd = tp->t_maxseg;
3847 tp->snd_wl1 = sc->sc_irs;
3848 tp->rcv_up = sc->sc_irs + 1;
3849
3850 /*
3851 * This is what whould have happened in tcp_output() when
3852 * the SYN,ACK was sent.
3853 */
3854 tp->snd_up = tp->snd_una;
3855 tp->snd_max = tp->snd_nxt = tp->iss+1;
3856 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
3857 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
3858 tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
3859 tp->last_ack_sent = tp->rcv_nxt;
3860
3861 tcpstat.tcps_sc_completed++;
3862 SYN_CACHE_PUT(sc);
3863 return (so);
3864
3865 resetandabort:
3866 tcp_respond(NULL, mtod(m, caddr_t), m, (tcp_seq)0, th->th_ack, TH_RST);
3867 abort:
3868 if (so != NULL)
3869 (void) soabort(so);
3870 SYN_CACHE_PUT(sc);
3871 tcpstat.tcps_sc_aborted++;
3872 return ((struct socket *)(-1));
3873 }
3874
3875 /*
3876 * This function is called when we get a RST for a
3877 * non-existent connection, so that we can see if the
3878 * connection is in the syn cache. If it is, zap it.
3879 */
3880
3881 void
3882 syn_cache_reset(src, dst, th)
3883 struct sockaddr *src;
3884 struct sockaddr *dst;
3885 struct tcphdr *th;
3886 {
3887 struct syn_cache *sc;
3888 struct syn_cache_head *scp;
3889 int s = splsoftnet();
3890
3891 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
3892 splx(s);
3893 return;
3894 }
3895 if (SEQ_LT(th->th_seq, sc->sc_irs) ||
3896 SEQ_GT(th->th_seq, sc->sc_irs+1)) {
3897 splx(s);
3898 return;
3899 }
3900 SYN_CACHE_RM(sc);
3901 splx(s);
3902 tcpstat.tcps_sc_reset++;
3903 SYN_CACHE_PUT(sc);
3904 }
3905
3906 void
3907 syn_cache_unreach(src, dst, th)
3908 struct sockaddr *src;
3909 struct sockaddr *dst;
3910 struct tcphdr *th;
3911 {
3912 struct syn_cache *sc;
3913 struct syn_cache_head *scp;
3914 int s;
3915
3916 s = splsoftnet();
3917 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
3918 splx(s);
3919 return;
3920 }
3921 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */
3922 if (ntohl (th->th_seq) != sc->sc_iss) {
3923 splx(s);
3924 return;
3925 }
3926
3927 /*
3928 * If we've retransmitted 3 times and this is our second error,
3929 * we remove the entry. Otherwise, we allow it to continue on.
3930 * This prevents us from incorrectly nuking an entry during a
3931 * spurious network outage.
3932 *
3933 * See tcp_notify().
3934 */
3935 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
3936 sc->sc_flags |= SCF_UNREACH;
3937 splx(s);
3938 return;
3939 }
3940
3941 SYN_CACHE_RM(sc);
3942 splx(s);
3943 tcpstat.tcps_sc_unreach++;
3944 SYN_CACHE_PUT(sc);
3945 }
3946
3947 /*
3948 * Given a LISTEN socket and an inbound SYN request, add
3949 * this to the syn cache, and send back a segment:
3950 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
3951 * to the source.
3952 *
3953 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
3954 * Doing so would require that we hold onto the data and deliver it
3955 * to the application. However, if we are the target of a SYN-flood
3956 * DoS attack, an attacker could send data which would eventually
3957 * consume all available buffer space if it were ACKed. By not ACKing
3958 * the data, we avoid this DoS scenario.
3959 */
3960
3961 int
3962 syn_cache_add(src, dst, th, iphlen, so, m, optp, optlen, oi, issp)
3963 struct sockaddr *src;
3964 struct sockaddr *dst;
3965 struct tcphdr *th;
3966 unsigned int iphlen;
3967 struct socket *so;
3968 struct mbuf *m;
3969 u_char *optp;
3970 int optlen;
3971 struct tcp_opt_info *oi;
3972 tcp_seq *issp;
3973 {
3974 struct tcpcb tb, *tp;
3975 long win;
3976 struct syn_cache *sc;
3977 struct syn_cache_head *scp;
3978 struct mbuf *ipopts;
3979
3980 tp = sototcpcb(so);
3981
3982 /*
3983 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
3984 *
3985 * Note this check is performed in tcp_input() very early on.
3986 */
3987
3988 /*
3989 * Initialize some local state.
3990 */
3991 win = sbspace(&so->so_rcv);
3992 if (win > TCP_MAXWIN)
3993 win = TCP_MAXWIN;
3994
3995 #ifdef TCP_SIGNATURE
3996 if (optp || (tp->t_flags & TF_SIGNATURE)) {
3997 #else
3998 if (optp) {
3999 #endif
4000 tb.pf = tp->pf;
4001 #ifdef TCP_SACK
4002 tb.sack_enable = tp->sack_enable;
4003 #endif
4004 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
4005 #ifdef TCP_SIGNATURE
4006 if (tp->t_flags & TF_SIGNATURE)
4007 tb.t_flags |= TF_SIGNATURE;
4008 #endif
4009 tb.t_state = TCPS_LISTEN;
4010 if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi))
4011 return (0);
4012 } else
4013 tb.t_flags = 0;
4014
4015 switch (src->sa_family) {
4016 #ifdef INET
4017 case AF_INET:
4018 /*
4019 * Remember the IP options, if any.
4020 */
4021 ipopts = ip_srcroute();
4022 break;
4023 #endif
4024 default:
4025 ipopts = NULL;
4026 }
4027
4028 /*
4029 * See if we already have an entry for this connection.
4030 * If we do, resend the SYN,ACK. We do not count this
4031 * as a retransmission (XXX though maybe we should).
4032 */
4033 if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
4034 tcpstat.tcps_sc_dupesyn++;
4035 if (ipopts) {
4036 /*
4037 * If we were remembering a previous source route,
4038 * forget it and use the new one we've been given.
4039 */
4040 if (sc->sc_ipopts)
4041 (void) m_free(sc->sc_ipopts);
4042 sc->sc_ipopts = ipopts;
4043 }
4044 sc->sc_timestamp = tb.ts_recent;
4045 if (syn_cache_respond(sc, m) == 0) {
4046 tcpstat.tcps_sndacks++;
4047 tcpstat.tcps_sndtotal++;
4048 }
4049 return (1);
4050 }
4051
4052 sc = pool_get(&syn_cache_pool, PR_NOWAIT);
4053 if (sc == NULL) {
4054 if (ipopts)
4055 (void) m_free(ipopts);
4056 return (0);
4057 }
4058
4059 /*
4060 * Fill in the cache, and put the necessary IP and TCP
4061 * options into the reply.
4062 */
4063 bzero(sc, sizeof(struct syn_cache));
4064 bzero(&sc->sc_timer, sizeof(sc->sc_timer));
4065 bcopy(src, &sc->sc_src, src->sa_len);
4066 bcopy(dst, &sc->sc_dst, dst->sa_len);
4067 sc->sc_flags = 0;
4068 sc->sc_ipopts = ipopts;
4069 sc->sc_irs = th->th_seq;
4070
4071 #ifdef TCP_COMPAT_42
4072 tcp_iss += TCP_ISSINCR/2;
4073 sc->sc_iss = tcp_iss;
4074 #else
4075 sc->sc_iss = issp ? *issp : arc4random();
4076 #endif
4077 sc->sc_peermaxseg = oi->maxseg;
4078 sc->sc_ourmaxseg = tcp_mss_adv(m->m_flags & M_PKTHDR ?
4079 m->m_pkthdr.rcvif : NULL, sc->sc_src.sa.sa_family);
4080 sc->sc_win = win;
4081 sc->sc_timestamp = tb.ts_recent;
4082 if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
4083 (TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
4084 sc->sc_flags |= SCF_TIMESTAMP;
4085 sc->sc_modulate = arc4random();
4086 }
4087 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
4088 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
4089 sc->sc_requested_s_scale = tb.requested_s_scale;
4090 sc->sc_request_r_scale = 0;
4091 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
4092 TCP_MAXWIN << sc->sc_request_r_scale <
4093 so->so_rcv.sb_hiwat)
4094 sc->sc_request_r_scale++;
4095 } else {
4096 sc->sc_requested_s_scale = 15;
4097 sc->sc_request_r_scale = 15;
4098 }
4099 #ifdef TCP_ECN
4100 /*
4101 * if both ECE and CWR flag bits are set, peer is ECN capable.
4102 */
4103 if (tcp_do_ecn &&
4104 (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
4105 sc->sc_flags |= SCF_ECN_PERMIT;
4106 #endif
4107 #ifdef TCP_SACK
4108 /*
4109 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
4110 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
4111 */
4112 if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT))
4113 sc->sc_flags |= SCF_SACK_PERMIT;
4114 #endif
4115 #ifdef TCP_SIGNATURE
4116 if (tb.t_flags & TF_SIGNATURE)
4117 sc->sc_flags |= SCF_SIGNATURE;
4118 #endif
4119 sc->sc_tp = tp;
4120 if (syn_cache_respond(sc, m) == 0) {
4121 syn_cache_insert(sc, tp);
4122 tcpstat.tcps_sndacks++;
4123 tcpstat.tcps_sndtotal++;
4124 } else {
4125 SYN_CACHE_PUT(sc);
4126 tcpstat.tcps_sc_dropped++;
4127 }
4128 return (1);
4129 }
4130
/*
 * Build and transmit the SYN,ACK for syn cache entry "sc" from scratch.
 * "m" is the mbuf of the triggering segment, if any; it is always freed
 * here (the reply is built in a fresh mbuf).  Returns 0 on success or
 * an errno from option/route/output processing.
 */
int
syn_cache_respond(sc, m)
	struct syn_cache *sc;
	struct mbuf *m;
{
	struct route *ro;
	u_int8_t *optp;
	int optlen, error;
	u_int16_t tlen;
	struct ip *ip = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif
	struct tcphdr *th;
	u_int hlen;
	struct inpcb *inp;

	/* Pick header length and cached route by address family. */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		hlen = sizeof(struct ip);
		ro = &sc->sc_route4;
		break;
#ifdef INET6
	case AF_INET6:
		hlen = sizeof(struct ip6_hdr);
		ro = (struct route *)&sc->sc_route6;
		break;
#endif
	default:
		if (m)
			m_freem(m);
		return (EAFNOSUPPORT);
	}

	/* Compute the size of the TCP options. */
	/* 4 bytes for MSS, plus 4 each for window scale / SACK-permitted,
	 * plus timestamp and signature options as negotiated. */
	optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
#ifdef TCP_SACK
	    ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) +
#endif
#ifdef TCP_SIGNATURE
	    ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
#endif
	    ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);

	tlen = hlen + sizeof(struct tcphdr) + optlen;

	/*
	 * Create the IP+TCP header from scratch.
	 */
	if (m)
		m_freem(m);
#ifdef DIAGNOSTIC
	if (max_linkhdr + tlen > MCLBYTES)
		return (ENOBUFS);
#endif
	/* New packet header mbuf; upgrade to a cluster if it won't fit. */
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && max_linkhdr + tlen > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return (ENOBUFS);

	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = NULL;
	memset(mtod(m, u_char *), 0, tlen);

	/* Reply goes back to the SYN's source, from the SYN's destination. */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		/* ip6_plen will be updated in ip6_output() */
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	default:
		/* Unreachable: sa_family was validated at the top. */
		th = NULL;
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
	/* Set ECE for SYN-ACK if peer supports ECN. */
	if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
		th->th_flags |= TH_ECE;
#endif
	th->th_win = htons(sc->sc_win);
	/* th_sum already 0 */
	/* th_urp already 0 */

	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

#ifdef TCP_SACK
	/* Include SACK_PERMIT_HDR option if peer has already done so. */
	if (sc->sc_flags & SCF_SACK_PERMIT) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}
#endif

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}

	if (sc->sc_flags & SCF_TIMESTAMP) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
		*lp   = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TSTAMP_APPA;
	}

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:	/*default to PF_INET*/
#ifdef INET
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#endif /* INET */
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		/* Need a security association to compute the MD5 signature. */
		tdb = gettdbbysrcdst(0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			if (m)
				m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			if (m)
				m_freem(m);
			return (EINVAL);
		}
		optp += 16;

		/* Pad options list to the next 32 bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	/* Compute the packet's checksum. */
	/* ip_len/ip6_plen hold the TCP length here purely as checksum
	 * pseudo-header input; they are overwritten just below. */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in_cksum(m, tlen);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		break;
#endif
	}

	/* use IPsec policy and ttl from listening socket, on SYN ACK */
	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;

	/*
	 * Fill in some straggling IP bits. Note the stack expects
	 * ip_len to be in host order, for convenience.
	 */
	/* NOTE(review): the comment above says host order but the code
	 * stores htons(tlen) -- presumably the stack convention changed;
	 * confirm against ip_output() before touching this. */
	switch (sc->sc_src.sa.sa_family) {
#ifdef INET
	case AF_INET:
		ip->ip_len = htons(tlen);
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		/* XXX tos? */
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* leave flowlabel = 0, it is legal and require no state mgmt */
		break;
#endif
	}

	switch (sc->sc_src.sa.sa_family) {
#ifdef INET
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, ro,
		    (ip_mtudisc ? IP_MTUDISC : 0),
		    (struct ip_moptions *)NULL, inp);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ip6->ip6_hlim = in6_selecthlim(NULL,
		    ro->ro_rt ? ro->ro_rt->rt_ifp : NULL);

		error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 0,
		    (struct ip6_moptions *)0, NULL, NULL);
		break;
#endif
	default:
		error = EAFNOSUPPORT;
		break;
	}
	return (error);
}