root/netinet/tcp_input.c


DEFINITIONS

This source file includes the following definitions:
  1. tcp_reass
  2. tcp6_input
  3. tcp_input
  4. tcp_dooptions
  5. tcp_seq_subtract
  6. tcp_update_sack_list
  7. tcp_sack_option
  8. tcp_del_sackholes
  9. tcp_clean_sackreport
  10. tcp_sack_partialack
  11. tcp_pulloutofband
  12. tcp_xmit_timer
  13. tcp_mss
  14. tcp_hdrsz
  15. tcp_mss_update
  16. tcp_newreno
  17. tcp_mss_adv
  18. syn_cache_init
  19. syn_cache_insert
  20. syn_cache_timer
  21. syn_cache_reaper
  22. syn_cache_cleanup
  23. syn_cache_lookup
  24. syn_cache_get
  25. syn_cache_reset
  26. syn_cache_unreach
  27. syn_cache_add
  28. syn_cache_respond

    1 /*      $OpenBSD: tcp_input.c,v 1.207 2007/06/15 18:23:06 markus Exp $  */
    2 /*      $NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $  */
    3 
    4 /*
    5  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
    6  *      The Regents of the University of California.  All rights reserved.
    7  *
    8  * Redistribution and use in source and binary forms, with or without
    9  * modification, are permitted provided that the following conditions
   10  * are met:
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in the
   15  *    documentation and/or other materials provided with the distribution.
   16  * 3. Neither the name of the University nor the names of its contributors
   17  *    may be used to endorse or promote products derived from this software
   18  *    without specific prior written permission.
   19  *
   20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   30  * SUCH DAMAGE.
   31  *
   32  *      @(#)COPYRIGHT   1.1 (NRL) 17 January 1995
   33  *
   34  * NRL grants permission for redistribution and use in source and binary
   35  * forms, with or without modification, of the software and documentation
   36  * created at NRL provided that the following conditions are met:
   37  *
   38  * 1. Redistributions of source code must retain the above copyright
   39  *    notice, this list of conditions and the following disclaimer.
   40  * 2. Redistributions in binary form must reproduce the above copyright
   41  *    notice, this list of conditions and the following disclaimer in the
   42  *    documentation and/or other materials provided with the distribution.
   43  * 3. All advertising materials mentioning features or use of this software
   44  *    must display the following acknowledgements:
   45  *      This product includes software developed by the University of
   46  *      California, Berkeley and its contributors.
   47  *      This product includes software developed at the Information
   48  *      Technology Division, US Naval Research Laboratory.
   49  * 4. Neither the name of the NRL nor the names of its contributors
   50  *    may be used to endorse or promote products derived from this software
   51  *    without specific prior written permission.
   52  *
   53  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
   54  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   55  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   56  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
   57  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   58  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   59  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   60  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   61  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   62  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   63  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   64  *
   65  * The views and conclusions contained in the software and documentation
   66  * are those of the authors and should not be interpreted as representing
   67  * official policies, either expressed or implied, of the US Naval
   68  * Research Laboratory (NRL).
   69  */
   70 
   71 #include <sys/param.h>
   72 #include <sys/systm.h>
   73 #include <sys/mbuf.h>
   74 #include <sys/protosw.h>
   75 #include <sys/socket.h>
   76 #include <sys/socketvar.h>
   77 #include <sys/kernel.h>
   78 
   79 #include <dev/rndvar.h>
   80 
   81 #include <net/if.h>
   82 #include <net/route.h>
   83 
   84 #include <netinet/in.h>
   85 #include <netinet/in_systm.h>
   86 #include <netinet/ip.h>
   87 #include <netinet/in_pcb.h>
   88 #include <netinet/ip_var.h>
   89 #include <netinet/tcp.h>
   90 #include <netinet/tcp_fsm.h>
   91 #include <netinet/tcp_seq.h>
   92 #include <netinet/tcp_timer.h>
   93 #include <netinet/tcp_var.h>
   94 #include <netinet/tcpip.h>
   95 #include <netinet/tcp_debug.h>
   96 
   97 struct  tcpiphdr tcp_saveti;
   98 
   99 int tcp_mss_adv(struct ifnet *, int);
  100 
  101 #ifdef INET6
  102 #include <netinet6/in6_var.h>
  103 #include <netinet6/nd6.h>
  104 
  105 struct  tcpipv6hdr tcp_saveti6;
  106 
  107 /* for the packet header length in the mbuf */
  108 #define M_PH_LEN(m)      (((struct mbuf *)(m))->m_pkthdr.len)
  109 #define M_V6_LEN(m)      (M_PH_LEN(m) - sizeof(struct ip6_hdr))
  110 #define M_V4_LEN(m)      (M_PH_LEN(m) - sizeof(struct ip))
  111 #endif /* INET6 */
  112 
  113 int     tcprexmtthresh = 3;
  114 int     tcptv_keep_init = TCPTV_KEEP_INIT;
  115 
  116 extern u_long sb_max;
  117 
  118 int tcp_rst_ppslim = 100;               /* 100pps */
  119 int tcp_rst_ppslim_count = 0;
  120 struct timeval tcp_rst_ppslim_last;
  121 
  122 int tcp_ackdrop_ppslim = 100;           /* 100pps */
  123 int tcp_ackdrop_ppslim_count = 0;
  124 struct timeval tcp_ackdrop_ppslim_last;
  125 
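       /*
        * PAWS idle threshold: 24 * 24h * 3600s, i.e. 24 days, expressed in
        * PR_SLOWHZ ticks, per the RFC 1323 rule that ts_recent must not be
        * trusted once a connection has been idle for more than 24 days.
        */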
  126 #define TCP_PAWS_IDLE   (24 * 24 * 60 * 60 * PR_SLOWHZ)
  127 
  128 /* for modulo comparisons of timestamps */
  129 #define TSTMP_LT(a,b)   ((int)((a)-(b)) < 0)
  130 #define TSTMP_GEQ(a,b)  ((int)((a)-(b)) >= 0)
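       /*
        * Example of the modulo comparison: once the 32-bit timestamp clock
        * has wrapped, TSTMP_LT(1, 0xfffffffe) is false, since
        * (int)(1 - 0xfffffffe) == 3, i.e. timestamp 1 is treated as newer
        * than 0xfffffffe.
        */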
  131 
  132 /* for TCP SACK comparisons */
  133 #define SEQ_MIN(a,b)    (SEQ_LT(a,b) ? (a) : (b))
  134 #define SEQ_MAX(a,b)    (SEQ_GT(a,b) ? (a) : (b))
  135 
  136 /*
  137  * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
  138  */
  139 #ifdef INET6
  140 #define ND6_HINT(tp) \
  141 do { \
  142         if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
  143             tp->t_inpcb->inp_route6.ro_rt) { \
  144                 nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt, NULL, 0); \
  145         } \
  146 } while (0)
  147 #else
  148 #define ND6_HINT(tp)
  149 #endif
  150 
  151 #ifdef TCP_ECN
  152 /*
  153  * ECN (Explicit Congestion Notification) support based on RFC3168
  154  * implementation note:
  155  *   snd_last is used to track a recovery phase.
  156  *   when cwnd is reduced, snd_last is set to snd_max.
  157  *   while snd_last > snd_una, the sender is in a recovery phase and
  158  *   its cwnd should not be reduced again.
  159  *   snd_last follows snd_una when not in a recovery phase.
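        *   Rough illustration: if a CE mark is acted on while snd_una is
        *   1000 and snd_max is 2000, cwnd is reduced once and snd_last is
        *   set to 2000; further marks leave cwnd alone until snd_una has
        *   advanced to 2000.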
  160  */
  161 #endif
  162 
  163 /*
  164  * Macro to compute ACK transmission behavior.  Delay the ACK unless
  165  * we have already delayed an ACK (must send an ACK every two segments).
  166  * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
  167  * option is enabled.
  168  */
  169 #define TCP_SETUP_ACK(tp, tiflags) \
  170 do { \
  171         if ((tp)->t_flags & TF_DELACK || \
  172             (tcp_ack_on_push && (tiflags) & TH_PUSH)) \
  173                 tp->t_flags |= TF_ACKNOW; \
  174         else \
  175                 TCP_SET_DELACK(tp); \
  176 } while (0)
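
       /*
        * Typical use, as in the header-prediction data path further below:
        *
        *      TCP_SETUP_ACK(tp, tiflags);
        *      if (tp->t_flags & TF_ACKNOW)
        *              (void) tcp_output(tp);
        *
        * i.e. either an ACK is forced out immediately or the delayed-ACK
        * timer is left to generate it.
        */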
  177 
  178 /*
   179  * Insert segment th into the reassembly queue of tcp with
  180  * control block tp.  Return TH_FIN if reassembly now includes
  181  * a segment with FIN.  The macro form does the common case inline
  182  * (segment is the next to be received on an established connection,
  183  * and the queue is empty), avoiding linkage into and removal
  184  * from the queue and repetition of various conversions.
  185  * Set DELACK for segments received in order, but ack immediately
  186  * when segments are out of order (so fast retransmit can work).
  187  */
  188 
  189 int
  190 tcp_reass(tp, th, m, tlen)
  191         struct tcpcb *tp;
  192         struct tcphdr *th;
  193         struct mbuf *m;
  194         int *tlen;
  195 {
  196         struct tcpqent *p, *q, *nq, *tiqe;
  197         struct socket *so = tp->t_inpcb->inp_socket;
  198         int flags;
  199 
  200         /*
   201          * Call with th==0 after becoming established to
  202          * force pre-ESTABLISHED data up to user socket.
  203          */
  204         if (th == 0)
  205                 goto present;
  206 
  207         /*
  208          * Allocate a new queue entry, before we throw away any data.
  209          * If we can't, just drop the packet.  XXX
  210          */
  211         tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
  212         if (tiqe == NULL) {
  213                 tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
  214                 if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
  215                         /* Reuse last entry since new segment fills a hole */
  216                         m_freem(tiqe->tcpqe_m);
  217                         TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
  218                 }
  219                 if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
  220                         /* Flush segment queue for this connection */
  221                         tcp_freeq(tp);
  222                         tcpstat.tcps_rcvmemdrop++;
  223                         m_freem(m);
  224                         return (0);
  225                 }
  226         }
  227 
  228         /*
  229          * Find a segment which begins after this one does.
  230          */
  231         for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
  232             p = q, q = TAILQ_NEXT(q, tcpqe_q))
  233                 if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
  234                         break;
  235 
  236         /*
  237          * If there is a preceding segment, it may provide some of
  238          * our data already.  If so, drop the data from the incoming
  239          * segment.  If it provides all of our data, drop us.
  240          */
  241         if (p != NULL) {
  242                 struct tcphdr *phdr = p->tcpqe_tcp;
  243                 int i;
  244 
  245                 /* conversion to int (in i) handles seq wraparound */
  246                 i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
  247                 if (i > 0) {
  248                         if (i >= *tlen) {
  249                                 tcpstat.tcps_rcvduppack++;
  250                                 tcpstat.tcps_rcvdupbyte += *tlen;
  251                                 m_freem(m);
  252                                 pool_put(&tcpqe_pool, tiqe);
  253                                 return (0);
  254                         }
  255                         m_adj(m, i);
  256                         *tlen -= i;
  257                         th->th_seq += i;
  258                 }
  259         }
  260         tcpstat.tcps_rcvoopack++;
  261         tcpstat.tcps_rcvoobyte += *tlen;
  262 
  263         /*
  264          * While we overlap succeeding segments trim them or,
  265          * if they are completely covered, dequeue them.
  266          */
  267         for (; q != NULL; q = nq) {
  268                 struct tcphdr *qhdr = q->tcpqe_tcp;
  269                 int i = (th->th_seq + *tlen) - qhdr->th_seq;
  270 
  271                 if (i <= 0)
  272                         break;
  273                 if (i < qhdr->th_reseqlen) {
  274                         qhdr->th_seq += i;
  275                         qhdr->th_reseqlen -= i;
  276                         m_adj(q->tcpqe_m, i);
  277                         break;
  278                 }
  279                 nq = TAILQ_NEXT(q, tcpqe_q);
  280                 m_freem(q->tcpqe_m);
  281                 TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
  282                 pool_put(&tcpqe_pool, q);
  283         }
  284 
  285         /* Insert the new segment queue entry into place. */
  286         tiqe->tcpqe_m = m;
  287         th->th_reseqlen = *tlen;
  288         tiqe->tcpqe_tcp = th;
  289         if (p == NULL) {
  290                 TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
  291         } else {
  292                 TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
  293         }
  294 
  295 present:
  296         /*
  297          * Present data to user, advancing rcv_nxt through
  298          * completed sequence space.
  299          */
  300         if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
  301                 return (0);
  302         q = TAILQ_FIRST(&tp->t_segq);
  303         if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
  304                 return (0);
  305         if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
  306                 return (0);
  307         do {
  308                 tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
  309                 flags = q->tcpqe_tcp->th_flags & TH_FIN;
  310 
  311                 nq = TAILQ_NEXT(q, tcpqe_q);
  312                 TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
  313                 ND6_HINT(tp);
  314                 if (so->so_state & SS_CANTRCVMORE)
  315                         m_freem(q->tcpqe_m);
  316                 else
  317                         sbappendstream(&so->so_rcv, q->tcpqe_m);
  318                 pool_put(&tcpqe_pool, q);
  319                 q = nq;
  320         } while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
  321         sorwakeup(so);
  322         return (flags);
  323 }
  324 
  325 #ifdef INET6
  326 int
  327 tcp6_input(mp, offp, proto)
  328         struct mbuf **mp;
  329         int *offp, proto;
  330 {
  331         struct mbuf *m = *mp;
  332 
  333 #if defined(NFAITH) && 0 < NFAITH
  334         if (m->m_pkthdr.rcvif) {
  335                 if (m->m_pkthdr.rcvif->if_type == IFT_FAITH) {
  336                         /* XXX send icmp6 host/port unreach? */
  337                         m_freem(m);
  338                         return IPPROTO_DONE;
  339                 }
  340         }
  341 #endif
  342 
  343         /*
  344          * draft-itojun-ipv6-tcp-to-anycast
  345          * better place to put this in?
  346          */
  347         if (m->m_flags & M_ANYCAST6) {
  348                 if (m->m_len >= sizeof(struct ip6_hdr)) {
  349                         struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
  350                         icmp6_error(m, ICMP6_DST_UNREACH,
  351                                 ICMP6_DST_UNREACH_ADDR,
  352                                 (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
  353                 } else
  354                         m_freem(m);
  355                 return IPPROTO_DONE;
  356         }
  357 
  358         tcp_input(m, *offp, proto);
  359         return IPPROTO_DONE;
  360 }
  361 #endif
  362 
  363 /*
  364  * TCP input routine, follows pages 65-76 of the
  365  * protocol specification dated September, 1981 very closely.
  366  */
  367 void
  368 tcp_input(struct mbuf *m, ...)
  369 {
  370         struct ip *ip;
  371         struct inpcb *inp;
  372         u_int8_t *optp = NULL;
  373         int optlen = 0;
  374         int tlen, off;
  375         struct tcpcb *tp = 0;
  376         int tiflags;
  377         struct socket *so = NULL;
  378         int todrop, acked, ourfinisacked, needoutput = 0;
  379         int hdroptlen = 0;
  380         short ostate = 0;
  381         tcp_seq iss, *reuse = NULL;
  382         u_long tiwin;
  383         struct tcp_opt_info opti;
  384         int iphlen;
  385         va_list ap;
  386         struct tcphdr *th;
  387 #ifdef INET6
  388         struct ip6_hdr *ip6 = NULL;
  389 #endif /* INET6 */
  390 #ifdef IPSEC
  391         struct m_tag *mtag;
  392         struct tdb_ident *tdbi;
  393         struct tdb *tdb;
  394         int error, s;
  395 #endif /* IPSEC */
  396         int af;
  397 #ifdef TCP_ECN
  398         u_char iptos;
  399 #endif
  400 
  401         va_start(ap, m);
  402         iphlen = va_arg(ap, int);
  403         va_end(ap);
  404 
  405         tcpstat.tcps_rcvtotal++;
  406 
  407         opti.ts_present = 0;
  408         opti.maxseg = 0;
  409 
  410         /*
  411          * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
  412          * See below for AF specific multicast.
  413          */
  414         if (m->m_flags & (M_BCAST|M_MCAST))
  415                 goto drop;
  416 
  417         /*
  418          * Before we do ANYTHING, we have to figure out if it's TCP/IPv6 or
  419          * TCP/IPv4.
  420          */
  421         switch (mtod(m, struct ip *)->ip_v) {
  422 #ifdef INET6
  423         case 6:
  424                 af = AF_INET6;
  425                 break;
  426 #endif
  427         case 4:
  428                 af = AF_INET;
  429                 break;
  430         default:
  431                 m_freem(m);
  432                 return; /*EAFNOSUPPORT*/
  433         }
  434 
  435         /*
  436          * Get IP and TCP header together in first mbuf.
  437          * Note: IP leaves IP header in first mbuf.
  438          */
  439         switch (af) {
  440         case AF_INET:
  441 #ifdef DIAGNOSTIC
  442                 if (iphlen < sizeof(struct ip)) {
  443                         m_freem(m);
  444                         return;
  445                 }
  446 #endif /* DIAGNOSTIC */
  447                 break;
  448 #ifdef INET6
  449         case AF_INET6:
  450 #ifdef DIAGNOSTIC
  451                 if (iphlen < sizeof(struct ip6_hdr)) {
  452                         m_freem(m);
  453                         return;
  454                 }
  455 #endif /* DIAGNOSTIC */
  456                 break;
  457 #endif
  458         default:
  459                 m_freem(m);
  460                 return;
  461         }
  462 
  463         IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
  464         if (!th) {
  465                 tcpstat.tcps_rcvshort++;
  466                 return;
  467         }
  468 
  469         tlen = m->m_pkthdr.len - iphlen;
  470         ip = NULL;
  471 #ifdef INET6
  472         ip6 = NULL;
  473 #endif
  474         switch (af) {
  475         case AF_INET:
  476                 ip = mtod(m, struct ip *);
  477                 if (IN_MULTICAST(ip->ip_dst.s_addr) ||
  478                     in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
  479                         goto drop;
  480 #ifdef TCP_ECN
  481                 /* save ip_tos before clearing it for checksum */
  482                 iptos = ip->ip_tos;
  483 #endif
  484                 /*
  485                  * Checksum extended TCP header and data.
  486                  */
  487                 if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
  488                         if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
  489                                 tcpstat.tcps_inhwcsum++;
  490                                 tcpstat.tcps_rcvbadsum++;
  491                                 goto drop;
  492                         }
  493                         if (in4_cksum(m, IPPROTO_TCP, iphlen, tlen) != 0) {
  494                                 tcpstat.tcps_rcvbadsum++;
  495                                 goto drop;
  496                         }
  497                 } else {
  498                         m->m_pkthdr.csum_flags &= ~M_TCP_CSUM_IN_OK;
  499                         tcpstat.tcps_inhwcsum++;
  500                 }
  501                 break;
  502 #ifdef INET6
  503         case AF_INET6:
  504                 ip6 = mtod(m, struct ip6_hdr *);
  505 #ifdef TCP_ECN
  506                 iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
  507 #endif
  508 
  509                 /* Be proactive about malicious use of IPv4 mapped address */
  510                 if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
  511                     IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
  512                         /* XXX stat */
  513                         goto drop;
  514                 }
  515 
  516                 /*
  517                  * Be proactive about unspecified IPv6 address in source.
   518          * As we use all-zero to indicate an unbound/unconnected pcb,
  519                  * unspecified IPv6 address can be used to confuse us.
  520                  *
   521          * Note that packets with an unspecified IPv6 destination are
  522                  * already dropped in ip6_input.
  523                  */
  524                 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
  525                         /* XXX stat */
  526                         goto drop;
  527                 }
  528 
  529                 /* Discard packets to multicast */
  530                 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
  531                         /* XXX stat */
  532                         goto drop;
  533                 }
  534 
  535                 /*
  536                  * Checksum extended TCP header and data.
  537                  */
  538                 if (in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen)) {
  539                         tcpstat.tcps_rcvbadsum++;
  540                         goto drop;
  541                 }
  542                 break;
  543 #endif
  544         }
  545 
  546         /*
  547          * Check that TCP offset makes sense,
  548          * pull out TCP options and adjust length.              XXX
  549          */
  550         off = th->th_off << 2;
  551         if (off < sizeof(struct tcphdr) || off > tlen) {
  552                 tcpstat.tcps_rcvbadoff++;
  553                 goto drop;
  554         }
  555         tlen -= off;
  556         if (off > sizeof(struct tcphdr)) {
  557                 IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
  558                 if (!th) {
  559                         tcpstat.tcps_rcvshort++;
  560                         return;
  561                 }
  562                 optlen = off - sizeof(struct tcphdr);
  563                 optp = (u_int8_t *)(th + 1);
  564                 /*
  565                  * Do quick retrieval of timestamp options ("options
  566                  * prediction?").  If timestamp is the only option and it's
  567                  * formatted as recommended in RFC 1323 appendix A, we
  568                  * quickly get the values now and not bother calling
  569                  * tcp_dooptions(), etc.
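                        * The appendix A layout is NOP, NOP, TIMESTAMP, 10
                        * followed by the two 32-bit timestamp values, so
                        * the leading word equals TCPOPT_TSTAMP_HDR
                        * (0x0101080a) and TCPOLEN_TSTAMP_APPA is 12 bytes.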
  570                  */
  571                 if ((optlen == TCPOLEN_TSTAMP_APPA ||
  572                      (optlen > TCPOLEN_TSTAMP_APPA &&
  573                         optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
  574                      *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
  575                      (th->th_flags & TH_SYN) == 0) {
  576                         opti.ts_present = 1;
  577                         opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
  578                         opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
  579                         optp = NULL;    /* we've parsed the options */
  580                 }
  581         }
  582         tiflags = th->th_flags;
  583 
  584         /*
  585          * Convert TCP protocol specific fields to host format.
  586          */
  587         NTOHL(th->th_seq);
  588         NTOHL(th->th_ack);
  589         NTOHS(th->th_win);
  590         NTOHS(th->th_urp);
  591 
  592         /*
  593          * Locate pcb for segment.
  594          */
  595 findpcb:
  596         switch (af) {
  597 #ifdef INET6
  598         case AF_INET6:
  599                 inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src, th->th_sport,
  600                     &ip6->ip6_dst, th->th_dport);
  601                 break;
  602 #endif
  603         case AF_INET:
  604                 inp = in_pcbhashlookup(&tcbtable, ip->ip_src, th->th_sport,
  605                     ip->ip_dst, th->th_dport);
  606                 break;
  607         }
  608         if (inp == 0) {
  609                 int     inpl_flags = 0;
  610                 if (m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST)
  611                         inpl_flags = INPLOOKUP_WILDCARD;
  612                 ++tcpstat.tcps_pcbhashmiss;
  613                 switch (af) {
  614 #ifdef INET6
  615                 case AF_INET6:
  616                         inp = in6_pcblookup_listen(&tcbtable,
  617                             &ip6->ip6_dst, th->th_dport, inpl_flags);
  618                         break;
  619 #endif /* INET6 */
  620                 case AF_INET:
  621                         inp = in_pcblookup_listen(&tcbtable,
  622                             ip->ip_dst, th->th_dport, inpl_flags);
  623                         break;
  624                 }
  625                 /*
  626                  * If the state is CLOSED (i.e., TCB does not exist) then
  627                  * all data in the incoming segment is discarded.
  628                  * If the TCB exists but is in CLOSED state, it is embryonic,
  629                  * but should either do a listen or a connect soon.
  630                  */
  631                 if (inp == 0) {
  632                         ++tcpstat.tcps_noport;
  633                         goto dropwithreset_ratelim;
  634                 }
  635         }
  636 
  637         /* Check the minimum TTL for socket. */
  638         if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
  639                 goto drop;
  640 
  641         tp = intotcpcb(inp);
  642         if (tp == 0)
  643                 goto dropwithreset_ratelim;
  644         if (tp->t_state == TCPS_CLOSED)
  645                 goto drop;
  646 
  647         /* Unscale the window into a 32-bit value. */
  648         if ((tiflags & TH_SYN) == 0)
  649                 tiwin = th->th_win << tp->snd_scale;
  650         else
  651                 tiwin = th->th_win;
  652 
  653         so = inp->inp_socket;
  654         if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
  655                 union syn_cache_sa src;
  656                 union syn_cache_sa dst;
  657 
  658                 bzero(&src, sizeof(src));
  659                 bzero(&dst, sizeof(dst));
  660                 switch (af) {
  661 #ifdef INET
  662                 case AF_INET:
  663                         src.sin.sin_len = sizeof(struct sockaddr_in);
  664                         src.sin.sin_family = AF_INET;
  665                         src.sin.sin_addr = ip->ip_src;
  666                         src.sin.sin_port = th->th_sport;
  667 
  668                         dst.sin.sin_len = sizeof(struct sockaddr_in);
  669                         dst.sin.sin_family = AF_INET;
  670                         dst.sin.sin_addr = ip->ip_dst;
  671                         dst.sin.sin_port = th->th_dport;
  672                         break;
  673 #endif
  674 #ifdef INET6
  675                 case AF_INET6:
  676                         src.sin6.sin6_len = sizeof(struct sockaddr_in6);
  677                         src.sin6.sin6_family = AF_INET6;
  678                         src.sin6.sin6_addr = ip6->ip6_src;
  679                         src.sin6.sin6_port = th->th_sport;
  680 
  681                         dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
  682                         dst.sin6.sin6_family = AF_INET6;
  683                         dst.sin6.sin6_addr = ip6->ip6_dst;
  684                         dst.sin6.sin6_port = th->th_dport;
  685                         break;
  686 #endif /* INET6 */
  687                 default:
  688                         goto badsyn;    /*sanity*/
  689                 }
  690 
  691                 if (so->so_options & SO_DEBUG) {
  692                         ostate = tp->t_state;
  693                         switch (af) {
  694 #ifdef INET6
  695                         case AF_INET6:
  696                                 bcopy(ip6, &tcp_saveti6.ti6_i, sizeof(*ip6));
  697                                 bcopy(th, &tcp_saveti6.ti6_t, sizeof(*th));
  698                                 break;
  699 #endif
  700                         case AF_INET:
  701                                 bcopy(ip, &tcp_saveti.ti_i, sizeof(*ip));
  702                                 bcopy(th, &tcp_saveti.ti_t, sizeof(*th));
  703                                 break;
  704                         }
  705                 }
  706                 if (so->so_options & SO_ACCEPTCONN) {
  707                         if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
  708                                 if (tiflags & TH_RST) {
  709                                         syn_cache_reset(&src.sa, &dst.sa, th);
  710                                 } else if ((tiflags & (TH_ACK|TH_SYN)) ==
  711                                     (TH_ACK|TH_SYN)) {
  712                                         /*
  713                                          * Received a SYN,ACK.  This should
  714                                          * never happen while we are in
  715                                          * LISTEN.  Send an RST.
  716                                          */
  717                                         goto badsyn;
  718                                 } else if (tiflags & TH_ACK) {
  719                                         so = syn_cache_get(&src.sa, &dst.sa,
  720                                                 th, iphlen, tlen, so, m);
  721                                         if (so == NULL) {
  722                                                 /*
  723                                                  * We don't have a SYN for
  724                                                  * this ACK; send an RST.
  725                                                  */
  726                                                 goto badsyn;
  727                                         } else if (so ==
  728                                             (struct socket *)(-1)) {
  729                                                 /*
  730                                                  * We were unable to create
  731                                                  * the connection.  If the
  732                                                  * 3-way handshake was
  733                                                  * completed, and RST has
   734                                          * completed, an RST has
  735                                                  * Since the mbuf might be
  736                                                  * in use for the reply,
  737                                                  * do not free it.
  738                                                  */
  739                                                 m = NULL;
  740                                         } else {
  741                                                 /*
  742                                                  * We have created a
  743                                                  * full-blown connection.
  744                                                  */
  745                                                 tp = NULL;
  746                                                 inp = (struct inpcb *)so->so_pcb;
  747                                                 tp = intotcpcb(inp);
  748                                                 if (tp == NULL)
  749                                                         goto badsyn;    /*XXX*/
  750 
  751                                                 /*
  752                                                  * Compute proper scaling
  753                                                  * value from buffer space
  754                                                  */
  755                                                 tcp_rscale(tp, so->so_rcv.sb_hiwat);
  756                                                 goto after_listen;
  757                                         }
  758                                 } else {
  759                                         /*
  760                                          * None of RST, SYN or ACK was set.
  761                                          * This is an invalid packet for a
  762                                          * TCB in LISTEN state.  Send a RST.
  763                                          */
  764                                         goto badsyn;
  765                                 }
  766                         } else {
  767                                 /*
  768                                  * Received a SYN.
  769                                  */
  770 #ifdef INET6
  771                                 /*
  772                                  * If deprecated address is forbidden, we do
  773                                  * not accept SYN to deprecated interface
  774                                  * address to prevent any new inbound
  775                                  * connection from getting established.
  776                                  * When we do not accept SYN, we send a TCP
  777                                  * RST, with deprecated source address (instead
   778                                  * of dropping it).  We compromise because it is
   779                                  * much better for the peer if we send an RST, and
   780                                  * the RST will be the final packet of the
  781                                  * exchange.
  782                                  *
  783                                  * If we do not forbid deprecated addresses, we
  784                                  * accept the SYN packet.  RFC2462 does not
  785                                  * suggest dropping SYN in this case.
   786                                  * If we decipher RFC2462 5.5.4, it says
   787                                  * the following:
  788                                  * 1. use of deprecated addr with existing
  789                                  *    communication is okay - "SHOULD continue
  790                                  *    to be used"
  791                                  * 2. use of it with new communication:
  792                                  *   (2a) "SHOULD NOT be used if alternate
  793                                  *        address with sufficient scope is
  794                                  *        available"
  795                                  *   (2b) nothing mentioned otherwise. 
  796                                  * Here we fall into (2b) case as we have no
  797                                  * choice in our source address selection - we
  798                                  * must obey the peer.
  799                                  *
  800                                  * The wording in RFC2462 is confusing, and
   801                                  * there are multiple descriptions of
  802                                  * deprecated address handling - worse, they
  803                                  * are not exactly the same.  I believe 5.5.4
  804                                  * is the best one, so we follow 5.5.4.
  805                                  */
  806                                 if (ip6 && !ip6_use_deprecated) {
  807                                         struct in6_ifaddr *ia6;
  808 
  809                                         if ((ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif,
  810                                             &ip6->ip6_dst)) &&
  811                                             (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
  812                                                 tp = NULL;
  813                                                 goto dropwithreset;
  814                                         }
  815                                 }
  816 #endif
  817 
  818                                 /*
  819                                  * LISTEN socket received a SYN
  820                                  * from itself?  This can't possibly
  821                                  * be valid; drop the packet.
  822                                  */
  823                                 if (th->th_dport == th->th_sport) {
  824                                         switch (af) {
  825 #ifdef INET6
  826                                         case AF_INET6:
  827                                                 if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
  828                                                     &ip6->ip6_dst)) {
  829                                                         tcpstat.tcps_badsyn++;
  830                                                         goto drop;
  831                                                 }
  832                                                 break;
  833 #endif /* INET6 */
  834                                         case AF_INET:
  835                                                 if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
  836                                                         tcpstat.tcps_badsyn++;
  837                                                         goto drop;
  838                                                 }
  839                                                 break;
  840                                         }
  841                                 }
  842 
  843                                 /*
  844                                  * SYN looks ok; create compressed TCP
  845                                  * state for it.
  846                                  */
  847                                 if (so->so_qlen <= so->so_qlimit &&
  848                                     syn_cache_add(&src.sa, &dst.sa, th, iphlen,
  849                                     so, m, optp, optlen, &opti, reuse))
  850                                         m = NULL;
  851                         }
  852                         goto drop;
  853                 }
  854         }
  855 
  856 after_listen:
  857 #ifdef DIAGNOSTIC
  858         /*
  859          * Should not happen now that all embryonic connections
  860          * are handled with compressed state.
  861          */
  862         if (tp->t_state == TCPS_LISTEN)
  863                 panic("tcp_input: TCPS_LISTEN");
  864 #endif
  865 
  866 #ifdef IPSEC
  867         /* Find most recent IPsec tag */
  868         mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
  869         s = splnet();
  870         if (mtag != NULL) {
  871                 tdbi = (struct tdb_ident *)(mtag + 1);
  872                 tdb = gettdb(tdbi->spi, &tdbi->dst, tdbi->proto);
  873         } else
  874                 tdb = NULL;
  875         ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN,
  876             tdb, inp);
  877         if (error) {
  878                 splx(s);
  879                 goto drop;
  880         }
  881 
  882         /* Latch SA */
  883         if (inp->inp_tdb_in != tdb) {
  884                 if (tdb) {
  885                         tdb_add_inp(tdb, inp, 1);
  886                         if (inp->inp_ipo == NULL) {
  887                                 inp->inp_ipo = ipsec_add_policy(inp, af,
  888                                     IPSP_DIRECTION_OUT);
  889                                 if (inp->inp_ipo == NULL) {
  890                                         splx(s);
  891                                         goto drop;
  892                                 }
  893                         }
  894                         if (inp->inp_ipo->ipo_dstid == NULL &&
  895                             tdb->tdb_srcid != NULL) {
  896                                 inp->inp_ipo->ipo_dstid = tdb->tdb_srcid;
  897                                 tdb->tdb_srcid->ref_count++;
  898                         }
  899                         if (inp->inp_ipsec_remotecred == NULL &&
  900                             tdb->tdb_remote_cred != NULL) {
  901                                 inp->inp_ipsec_remotecred =
  902                                     tdb->tdb_remote_cred;
  903                                 tdb->tdb_remote_cred->ref_count++;
  904                         }
  905                         if (inp->inp_ipsec_remoteauth == NULL &&
  906                             tdb->tdb_remote_auth != NULL) {
  907                                 inp->inp_ipsec_remoteauth =
  908                                     tdb->tdb_remote_auth;
  909                                 tdb->tdb_remote_auth->ref_count++;
  910                         }
  911                 } else { /* Just reset */
  912                         TAILQ_REMOVE(&inp->inp_tdb_in->tdb_inp_in, inp,
  913                                      inp_tdb_in_next);
  914                         inp->inp_tdb_in = NULL;
  915                 }
  916         }
  917         splx(s);
  918 #endif /* IPSEC */
  919 
  920         /*
  921          * Segment received on connection.
  922          * Reset idle time and keep-alive timer.
  923          */
  924         tp->t_rcvtime = tcp_now;
  925         if (TCPS_HAVEESTABLISHED(tp->t_state))
  926                 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
  927 
  928 #ifdef TCP_SACK
  929         if (tp->sack_enable)
  930                 tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
  931 #endif /* TCP_SACK */
  932 
  933         /*
  934          * Process options.
  935          */
  936 #ifdef TCP_SIGNATURE
  937         if (optp || (tp->t_flags & TF_SIGNATURE))
  938 #else
  939         if (optp)
  940 #endif
  941                 if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti))
  942                         goto drop;
  943 
  944         if (opti.ts_present && opti.ts_ecr) {
  945                 int rtt_test;
  946 
  947                 /* subtract out the tcp timestamp modulator */
  948                 opti.ts_ecr -= tp->ts_modulate;
  949                                                      
  950                 /* make sure ts_ecr is sensible */
  951                 rtt_test = tcp_now - opti.ts_ecr;
  952                 if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
  953                         opti.ts_ecr = 0;
  954         }
  955 
  956 #ifdef TCP_ECN
  957         /* if congestion experienced, set ECE bit in subsequent packets. */
  958         if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
  959                 tp->t_flags |= TF_RCVD_CE;
  960                 tcpstat.tcps_ecn_rcvce++;
  961         }
  962 #endif
  963         /*
  964          * Header prediction: check for the two common cases
  965          * of a uni-directional data xfer.  If the packet has
  966          * no control flags, is in-sequence, the window didn't
  967          * change and we're not retransmitting, it's a
  968          * candidate.  If the length is zero and the ack moved
  969          * forward, we're the sender side of the xfer.  Just
  970          * free the data acked & wake any higher level process
  971          * that was blocked waiting for space.  If the length
  972          * is non-zero and the ack didn't move, we're the
  973          * receiver side.  If we're getting packets in-order
  974          * (the reassembly queue is empty), add the data to
  975          * the socket buffer and note that we need a delayed ack.
  976          */
  977         if (tp->t_state == TCPS_ESTABLISHED &&
  978 #ifdef TCP_ECN
  979             (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
  980 #else
  981             (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
  982 #endif
  983             (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
  984             th->th_seq == tp->rcv_nxt &&
  985             tiwin && tiwin == tp->snd_wnd &&
  986             tp->snd_nxt == tp->snd_max) {
  987 
  988                 /*
  989                  * If last ACK falls within this segment's sequence numbers,
  990                  *  record the timestamp.
  991                  * Fix from Braden, see Stevens p. 870
  992                  */
  993                 if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
  994                         tp->ts_recent_age = tcp_now;
  995                         tp->ts_recent = opti.ts_val;
  996                 }
  997 
  998                 if (tlen == 0) {
  999                         if (SEQ_GT(th->th_ack, tp->snd_una) &&
 1000                             SEQ_LEQ(th->th_ack, tp->snd_max) &&
 1001                             tp->snd_cwnd >= tp->snd_wnd &&
 1002                             tp->t_dupacks == 0) {
 1003                                 /*
 1004                                  * this is a pure ack for outstanding data.
 1005                                  */
 1006                                 ++tcpstat.tcps_predack;
 1007                                 if (opti.ts_present && opti.ts_ecr)
 1008                                         tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
 1009                                 else if (tp->t_rtttime &&
 1010                                     SEQ_GT(th->th_ack, tp->t_rtseq))
 1011                                         tcp_xmit_timer(tp,
 1012                                             tcp_now - tp->t_rtttime);
 1013                                 acked = th->th_ack - tp->snd_una;
 1014                                 tcpstat.tcps_rcvackpack++;
 1015                                 tcpstat.tcps_rcvackbyte += acked;
 1016                                 ND6_HINT(tp);
 1017                                 sbdrop(&so->so_snd, acked);
 1018 
 1019                                 /*
 1020                                  * If we had a pending ICMP message that
  1021                                  * refers to data that has just been
 1022                                  * acknowledged, disregard the recorded ICMP 
 1023                                  * message.
 1024                                  */
 1025                                 if ((tp->t_flags & TF_PMTUD_PEND) && 
 1026                                     SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
 1027                                         tp->t_flags &= ~TF_PMTUD_PEND;
 1028 
 1029                                 /*
 1030                                  * Keep track of the largest chunk of data 
 1031                                  * acknowledged since last PMTU update
 1032                                  */
 1033                                 if (tp->t_pmtud_mss_acked < acked)
 1034                                         tp->t_pmtud_mss_acked = acked;
 1035 
 1036                                 tp->snd_una = th->th_ack;
 1037 #if defined(TCP_SACK) || defined(TCP_ECN)
 1038                                 /*
 1039                                  * We want snd_last to track snd_una so
 1040                                  * as to avoid sequence wraparound problems
 1041                                  * for very large transfers.
 1042                                  */
 1043 #ifdef TCP_ECN
 1044                                 if (SEQ_GT(tp->snd_una, tp->snd_last))
 1045 #endif
 1046                                 tp->snd_last = tp->snd_una;
 1047 #endif /* TCP_SACK */
 1048 #if defined(TCP_SACK) && defined(TCP_FACK)
 1049                                 tp->snd_fack = tp->snd_una;
 1050                                 tp->retran_data = 0;
 1051 #endif /* TCP_FACK */
 1052                                 m_freem(m);
 1053 
 1054                                 /*
 1055                                  * If all outstanding data are acked, stop
 1056                                  * retransmit timer, otherwise restart timer
 1057                                  * using current (possibly backed-off) value.
 1058                                  * If process is waiting for space,
 1059                                  * wakeup/selwakeup/signal.  If data
 1060                                  * are ready to send, let tcp_output
 1061                                  * decide between more output or persist.
 1062                                  */
 1063                                 if (tp->snd_una == tp->snd_max)
 1064                                         TCP_TIMER_DISARM(tp, TCPT_REXMT);
 1065                                 else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
 1066                                         TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
 1067 
 1068                                 if (sb_notify(&so->so_snd))
 1069                                         sowwakeup(so);
 1070                                 if (so->so_snd.sb_cc)
 1071                                         (void) tcp_output(tp);
 1072                                 return;
 1073                         }
 1074                 } else if (th->th_ack == tp->snd_una &&
 1075                     TAILQ_EMPTY(&tp->t_segq) &&
 1076                     tlen <= sbspace(&so->so_rcv)) {
 1077                         /*
 1078                          * This is a pure, in-sequence data packet
 1079                          * with nothing on the reassembly queue and
 1080                          * we have enough buffer space to take it.
 1081                          */
 1082 #ifdef TCP_SACK
 1083                         /* Clean receiver SACK report if present */
 1084                         if (tp->sack_enable && tp->rcv_numsacks)
 1085                                 tcp_clean_sackreport(tp);
 1086 #endif /* TCP_SACK */
 1087                         ++tcpstat.tcps_preddat;
 1088                         tp->rcv_nxt += tlen;
 1089                         tcpstat.tcps_rcvpack++;
 1090                         tcpstat.tcps_rcvbyte += tlen;
 1091                         ND6_HINT(tp);
 1092                         /*
 1093                          * Drop TCP, IP headers and TCP options then add data
 1094                          * to socket buffer.
 1095                          */
 1096                         if (so->so_state & SS_CANTRCVMORE)
 1097                                 m_freem(m);
 1098                         else {
 1099                                 m_adj(m, iphlen + off);
 1100                                 sbappendstream(&so->so_rcv, m);
 1101                         }
 1102                         sorwakeup(so);
 1103                         TCP_SETUP_ACK(tp, tiflags);
 1104                         if (tp->t_flags & TF_ACKNOW)
 1105                                 (void) tcp_output(tp);
 1106                         return;
 1107                 }
 1108         }
 1109 
 1110         /*
 1111          * Compute mbuf offset to TCP data segment.
 1112          */
 1113         hdroptlen = iphlen + off;
 1114 
 1115         /*
 1116          * Calculate amount of space in receive window,
 1117          * and then do TCP input processing.
 1118          * Receive window is amount of space in rcv queue,
 1119          * but not less than advertised window.
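                * For example, if sbspace() reports 8k free but 16k beyond
                * rcv_nxt were already advertised (rcv_adv - rcv_nxt), the
                * larger value is kept so the offered window is never
                * pulled back.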
 1120          */
 1121         { int win;
 1122 
 1123         win = sbspace(&so->so_rcv);
 1124         if (win < 0)
 1125                 win = 0;
 1126         tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
 1127         }
 1128 
 1129         switch (tp->t_state) {
 1130 
 1131         /*
 1132          * If the state is SYN_RECEIVED:
 1133          *      if seg contains SYN/ACK, send an RST.
 1134          *      if seg contains an ACK, but not for our SYN/ACK, send an RST
 1135          */
 1136 
 1137         case TCPS_SYN_RECEIVED:
 1138                 if (tiflags & TH_ACK) {
 1139                         if (tiflags & TH_SYN) {
 1140                                 tcpstat.tcps_badsyn++;
 1141                                 goto dropwithreset;
 1142                         }
 1143                         if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
 1144                             SEQ_GT(th->th_ack, tp->snd_max))
 1145                                 goto dropwithreset;
 1146                 }
 1147                 break;
 1148 
 1149         /*
 1150          * If the state is SYN_SENT:
 1151          *      if seg contains an ACK, but not for our SYN, drop the input.
 1152          *      if seg contains a RST, then drop the connection.
 1153          *      if seg does not contain SYN, then drop it.
 1154          * Otherwise this is an acceptable SYN segment
 1155          *      initialize tp->rcv_nxt and tp->irs
 1156          *      if seg contains ack then advance tp->snd_una
 1157          *      if SYN has been acked change to ESTABLISHED else SYN_RCVD state
 1158          *      arrange for segment to be acked (eventually)
 1159          *      continue processing rest of data/controls, beginning with URG
 1160          */
 1161         case TCPS_SYN_SENT:
 1162                 if ((tiflags & TH_ACK) &&
 1163                     (SEQ_LEQ(th->th_ack, tp->iss) ||
 1164                      SEQ_GT(th->th_ack, tp->snd_max)))
 1165                         goto dropwithreset;
 1166                 if (tiflags & TH_RST) {
 1167 #ifdef TCP_ECN
 1168                         /* if ECN is enabled, fall back to non-ecn at rexmit */
 1169                         if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
 1170                                 goto drop;
 1171 #endif
 1172                         if (tiflags & TH_ACK)
 1173                                 tp = tcp_drop(tp, ECONNREFUSED);
 1174                         goto drop;
 1175                 }
 1176                 if ((tiflags & TH_SYN) == 0)
 1177                         goto drop;
 1178                 if (tiflags & TH_ACK) {
 1179                         tp->snd_una = th->th_ack;
 1180                         if (SEQ_LT(tp->snd_nxt, tp->snd_una))
 1181                                 tp->snd_nxt = tp->snd_una;
 1182                 }
 1183                 TCP_TIMER_DISARM(tp, TCPT_REXMT);
 1184                 tp->irs = th->th_seq;
 1185                 tcp_mss(tp, opti.maxseg);
 1186                 /* Reset initial window to 1 segment for retransmit */
 1187                 if (tp->t_rxtshift > 0)
 1188                         tp->snd_cwnd = tp->t_maxseg;
 1189                 tcp_rcvseqinit(tp);
 1190                 tp->t_flags |= TF_ACKNOW;
 1191 #ifdef TCP_SACK
 1192                 /*
 1193                  * If we've sent a SACK_PERMITTED option, and the peer
 1194                  * also replied with one, then TF_SACK_PERMIT should have
 1195                  * been set in tcp_dooptions().  If it was not, disable SACKs.
 1196                  */
 1197                 if (tp->sack_enable)
 1198                         tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
 1199 #endif
 1200 #ifdef TCP_ECN
 1201                 /*
 1202                  * if ECE is set but CWR is not set for SYN-ACK, or
 1203                  * both ECE and CWR are set for simultaneous open,
 1204                  * peer is ECN capable.
 1205                  */
 1206                 if (tcp_do_ecn) {
 1207                         if ((tiflags & (TH_ACK|TH_ECE|TH_CWR))
 1208                             == (TH_ACK|TH_ECE) ||
 1209                             (tiflags & (TH_ACK|TH_ECE|TH_CWR))
 1210                             == (TH_ECE|TH_CWR)) {
 1211                                 tp->t_flags |= TF_ECN_PERMIT;
 1212                                 tiflags &= ~(TH_ECE|TH_CWR);
 1213                                 tcpstat.tcps_ecn_accepts++;
 1214                         }
 1215                 }
 1216 #endif
 1217 
 1218                 if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
 1219                         tcpstat.tcps_connects++;
 1220                         soisconnected(so);
 1221                         tp->t_state = TCPS_ESTABLISHED;
 1222                         TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
 1223                         /* Do window scaling on this connection? */
 1224                         if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 1225                                 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
 1226                                 tp->snd_scale = tp->requested_s_scale;
 1227                                 tp->rcv_scale = tp->request_r_scale;
 1228                         }
 1229                         tcp_reass_lock(tp);
 1230                         (void) tcp_reass(tp, (struct tcphdr *)0,
 1231                                 (struct mbuf *)0, &tlen);
 1232                         tcp_reass_unlock(tp);
 1233                         /*
 1234                          * if we didn't have to retransmit the SYN,
 1235                          * use its rtt as our initial srtt & rtt var.
 1236                          */
 1237                         if (tp->t_rtttime)
 1238                                 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
 1239                         /*
 1240                          * Since new data was acked (the SYN), open the
 1241                          * congestion window by one MSS.  We do this
 1242                          * here, because we won't go through the normal
 1243                          * ACK processing below.  And since this is the
 1244                          * start of the connection, we know we are in
 1245                          * the exponential phase of slow-start.
 1246                          */
 1247                         tp->snd_cwnd += tp->t_maxseg;
 1248                 } else
 1249                         tp->t_state = TCPS_SYN_RECEIVED;
 1250 
 1251 #if 0
 1252 trimthenstep6:
 1253 #endif
 1254                 /*
 1255                  * Advance th->th_seq to correspond to first data byte.
 1256                  * If data, trim to stay within window,
 1257                  * dropping FIN if necessary.
 1258                  */
 1259                 th->th_seq++;
 1260                 if (tlen > tp->rcv_wnd) {
 1261                         todrop = tlen - tp->rcv_wnd;
 1262                         m_adj(m, -todrop);
 1263                         tlen = tp->rcv_wnd;
 1264                         tiflags &= ~TH_FIN;
 1265                         tcpstat.tcps_rcvpackafterwin++;
 1266                         tcpstat.tcps_rcvbyteafterwin += todrop;
 1267                 }
 1268                 tp->snd_wl1 = th->th_seq - 1;
 1269                 tp->rcv_up = th->th_seq;
 1270                 goto step6;
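        /*
         * The check at the top of the SYN_SENT case accepts an ACK only if
         * it falls in (iss, snd_max].  The fragment below is a minimal,
         * standalone sketch of that test, kept under #if 0 so it is never
         * compiled here; the sketch_* names and the locally restated
         * sequence macros are illustrative only, not kernel symbols.
         */
#if 0
#include <stdint.h>

#define SKETCH_SEQ_LEQ(a, b)    ((int32_t)((a) - (b)) <= 0)
#define SKETCH_SEQ_GT(a, b)     ((int32_t)((a) - (b)) > 0)

/* Return 1 if an ACK seen in SYN_SENT acknowledges our SYN, else 0. */
static int
sketch_syn_ack_acceptable(uint32_t iss, uint32_t snd_max, uint32_t ack)
{
        if (SKETCH_SEQ_LEQ(ack, iss) || SKETCH_SEQ_GT(ack, snd_max))
                return (0);     /* the kernel path answers with dropwithreset */
        return (1);
}
#endif
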
 1271         /*
 1272          * If a new connection request is received while in TIME_WAIT,
 1273          * drop the old connection and start over if the
 1274          * timestamp or the sequence numbers are above the previous
 1275          * ones.
 1276          */
 1277         case TCPS_TIME_WAIT:
 1278                 if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
 1279                     ((opti.ts_present &&
 1280                     TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
 1281                     SEQ_GT(th->th_seq, tp->rcv_nxt))) {
 1282                         /*
 1283                          * Advance the iss by at least 32768, but
 1284                          * clear the msb in order to make sure
 1285                          * that SEQ_LT(snd_nxt, iss).
 1286                          */
 1287                         iss = tp->snd_nxt +
 1288                             ((arc4random() & 0x7fffffff) | 0x8000);
 1289                         reuse = &iss;
 1290                         tp = tcp_close(tp);
 1291                         goto findpcb;
 1292                 }
 1293         }
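        /*
         * A standalone sketch of the TIME_WAIT reuse arithmetic above: the
         * new iss is the old snd_nxt plus an offset in [0x8000, 0x7fffffff],
         * so the old snd_nxt always compares "less than" the new iss in
         * sequence space.  #if 0 guarded; the random value is passed in
         * instead of calling arc4random() so the fragment stands alone.
         */
#if 0
#include <stdint.h>

static uint32_t
sketch_timewait_new_iss(uint32_t old_snd_nxt, uint32_t rnd)
{
        /* at least 32768, and below 2^31 because the msb is cleared */
        uint32_t offset = (rnd & 0x7fffffff) | 0x8000;

        /* (int32_t)(old_snd_nxt - (old_snd_nxt + offset)) < 0 always holds */
        return (old_snd_nxt + offset);
}
#endif
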
 1294 
 1295         /*
 1296          * States other than LISTEN or SYN_SENT.
 1297          * First check timestamp, if present.
 1298          * Then check that at least some bytes of segment are within
 1299          * receive window.  If segment begins before rcv_nxt,
 1300          * drop leading data (and SYN); if nothing left, just ack.
 1301          *
 1302          * RFC 1323 PAWS: If we have a timestamp reply on this segment
 1303          * and it's less than opti.ts_recent, drop it.
 1304          */
 1305         if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
 1306             TSTMP_LT(opti.ts_val, tp->ts_recent)) {
 1307 
 1308                 /* Check to see if ts_recent is over 24 days old.  */
 1309                 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
 1310                         /*
 1311                          * Invalidate ts_recent.  If this segment updates
 1312                          * ts_recent, the age will be reset later and ts_recent
 1313                          * will get a valid value.  If it does not, setting
 1314                          * ts_recent to zero will at least satisfy the
 1315                          * requirement that zero be placed in the timestamp
 1316                          * echo reply when ts_recent isn't valid.  The
 1317                          * age isn't reset until we get a valid ts_recent
 1318                          * because we don't want out-of-order segments to be
 1319                          * dropped when ts_recent is old.
 1320                          */
 1321                         tp->ts_recent = 0;
 1322                 } else {
 1323                         tcpstat.tcps_rcvduppack++;
 1324                         tcpstat.tcps_rcvdupbyte += tlen;
 1325                         tcpstat.tcps_pawsdrop++;
 1326                         goto dropafterack;
 1327                 }
 1328         }
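        /*
         * A compact sketch of the PAWS decision above, under #if 0.  It
         * takes the idle threshold as a parameter rather than using
         * TCP_PAWS_IDLE, and it leaves out the RST exemption handled by the
         * caller; purely illustrative, not the kernel routine.
         */
#if 0
#include <stdint.h>

#define SKETCH_TSTMP_LT(a, b)   ((int32_t)((a) - (b)) < 0)

/* Return 1 if the segment should be dropped (and acked) by PAWS. */
static int
sketch_paws_reject(uint32_t ts_val, uint32_t ts_recent,
    uint32_t ts_recent_age, uint32_t now, int32_t paws_idle)
{
        if (ts_recent == 0 || !SKETCH_TSTMP_LT(ts_val, ts_recent))
                return (0);
        /* ts_recent too old to be trusted: invalidate instead of dropping */
        if ((int32_t)(now - ts_recent_age) > paws_idle)
                return (0);
        return (1);
}
#endif
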
 1329 
 1330         todrop = tp->rcv_nxt - th->th_seq;
 1331         if (todrop > 0) {
 1332                 if (tiflags & TH_SYN) {
 1333                         tiflags &= ~TH_SYN;
 1334                         th->th_seq++;
 1335                         if (th->th_urp > 1)
 1336                                 th->th_urp--;
 1337                         else
 1338                                 tiflags &= ~TH_URG;
 1339                         todrop--;
 1340                 }
 1341                 if (todrop > tlen ||
 1342                     (todrop == tlen && (tiflags & TH_FIN) == 0)) {
 1343                         /*
 1344                          * Any valid FIN must be to the left of the
 1345                          * window.  At this point, FIN must be a
 1346                          * duplicate or out-of-sequence, so drop it.
 1347                          */
 1348                         tiflags &= ~TH_FIN;
 1349                         /*
 1350                          * Send ACK to resynchronize, and drop any data,
 1351                          * but keep on processing for RST or ACK.
 1352                          */
 1353                         tp->t_flags |= TF_ACKNOW;
 1354                         tcpstat.tcps_rcvdupbyte += todrop = tlen;
 1355                         tcpstat.tcps_rcvduppack++;
 1356                 } else {
 1357                         tcpstat.tcps_rcvpartduppack++;
 1358                         tcpstat.tcps_rcvpartdupbyte += todrop;
 1359                 }
 1360                 hdroptlen += todrop;    /* drop from head afterwards */
 1361                 th->th_seq += todrop;
 1362                 tlen -= todrop;
 1363                 if (th->th_urp > todrop)
 1364                         th->th_urp -= todrop;
 1365                 else {
 1366                         tiflags &= ~TH_URG;
 1367                         th->th_urp = 0;
 1368                 }
 1369         }
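        /*
         * The block above trims data that falls below rcv_nxt.  The #if 0
         * sketch below shows only the core arithmetic (how many leading
         * bytes are already old); the SYN, FIN and urgent-pointer
         * adjustments done by the real code are deliberately omitted.
         * sketch_* is not a kernel symbol.
         */
#if 0
#include <stdint.h>

/* Bytes at the head of [seq, seq + len) that lie before rcv_nxt. */
static uint32_t
sketch_leading_trim(uint32_t seq, uint32_t len, uint32_t rcv_nxt)
{
        int32_t todrop = (int32_t)(rcv_nxt - seq);

        if (todrop <= 0)
                return (0);             /* segment starts in window */
        return ((uint32_t)todrop < len ? (uint32_t)todrop : len);
}
#endif
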
 1370 
 1371         /*
 1372          * If new data are received on a connection after the
 1373          * user processes are gone, then RST the other end.
 1374          */
 1375         if ((so->so_state & SS_NOFDREF) &&
 1376             tp->t_state > TCPS_CLOSE_WAIT && tlen) {
 1377                 tp = tcp_close(tp);
 1378                 tcpstat.tcps_rcvafterclose++;
 1379                 goto dropwithreset;
 1380         }
 1381 
 1382         /*
 1383          * If segment ends after window, drop trailing data
 1384          * (and PUSH and FIN); if nothing left, just ACK.
 1385          */
 1386         todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
 1387         if (todrop > 0) {
 1388                 tcpstat.tcps_rcvpackafterwin++;
 1389                 if (todrop >= tlen) {
 1390                         tcpstat.tcps_rcvbyteafterwin += tlen;
 1391                         /*
 1392                          * If window is closed can only take segments at
 1393                          * window edge, and have to drop data and PUSH from
 1394                          * incoming segments.  Continue processing, but
 1395                          * remember to ack.  Otherwise, drop segment
 1396                          * and ack.
 1397                          */
 1398                         if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
 1399                                 tp->t_flags |= TF_ACKNOW;
 1400                                 tcpstat.tcps_rcvwinprobe++;
 1401                         } else
 1402                                 goto dropafterack;
 1403                 } else
 1404                         tcpstat.tcps_rcvbyteafterwin += todrop;
 1405                 m_adj(m, -todrop);
 1406                 tlen -= todrop;
 1407                 tiflags &= ~(TH_PUSH|TH_FIN);
 1408         }
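        /*
         * When the receive window is closed, only a probe sitting exactly at
         * the window's left edge is kept (trimmed to nothing and acked).  A
         * one-line #if 0 sketch of that test follows; illustrative only.
         */
#if 0
#include <stdint.h>

static int
sketch_is_window_probe(uint32_t seq, uint32_t rcv_nxt, uint32_t rcv_wnd)
{
        return (rcv_wnd == 0 && seq == rcv_nxt);
}
#endif
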
 1409 
 1410         /*
 1411          * If last ACK falls within this segment's sequence numbers,
 1412          * record its timestamp if it's more recent.
 1413          * Cf fix from Braden, see Stevens p. 870
 1414          */
 1415         if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
 1416             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
 1417                 if (SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 1418                     ((tiflags & (TH_SYN|TH_FIN)) != 0)))
 1419                         tp->ts_recent = opti.ts_val;
 1420                 else
 1421                         tp->ts_recent = 0;
 1422                 tp->ts_recent_age = tcp_now;
 1423         }
 1424 
 1425         /*
 1426          * If the RST bit is set examine the state:
 1427          *    SYN_RECEIVED STATE:
 1428          *      If passive open, return to LISTEN state.
 1429          *      If active open, inform user that connection was refused.
 1430          *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
 1431          *      Inform user that connection was reset, and close tcb.
 1432          *    CLOSING, LAST_ACK, TIME_WAIT STATES
 1433          *      Close the tcb.
 1434          */
 1435         if (tiflags & TH_RST) {
 1436                 if (th->th_seq != tp->last_ack_sent &&
 1437                     th->th_seq != tp->rcv_nxt &&
 1438                     th->th_seq != (tp->rcv_nxt + 1))
 1439                         goto drop;
 1440 
 1441                 switch (tp->t_state) {
 1442                 case TCPS_SYN_RECEIVED:
 1443 #ifdef TCP_ECN
 1444                         /* if ECN is enabled, fall back to non-ecn at rexmit */
 1445                         if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
 1446                                 goto drop;
 1447 #endif
 1448                         so->so_error = ECONNREFUSED;
 1449                         goto close;
 1450 
 1451                 case TCPS_ESTABLISHED:
 1452                 case TCPS_FIN_WAIT_1:
 1453                 case TCPS_FIN_WAIT_2:
 1454                 case TCPS_CLOSE_WAIT:
 1455                         so->so_error = ECONNRESET;
 1456                 close:
 1457                         tp->t_state = TCPS_CLOSED;
 1458                         tcpstat.tcps_drops++;
 1459                         tp = tcp_close(tp);
 1460                         goto drop;
 1461                 case TCPS_CLOSING:
 1462                 case TCPS_LAST_ACK:
 1463                 case TCPS_TIME_WAIT:
 1464                         tp = tcp_close(tp);
 1465                         goto drop;
 1466                 }
 1467         }
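        /*
         * The RST handling above only honours a reset whose sequence number
         * matches one of three values; the #if 0 predicate below restates
         * that acceptance test in isolation (a sketch, not kernel code).
         */
#if 0
#include <stdint.h>

static int
sketch_rst_seq_acceptable(uint32_t seq, uint32_t last_ack_sent,
    uint32_t rcv_nxt)
{
        return (seq == last_ack_sent || seq == rcv_nxt ||
            seq == rcv_nxt + 1);
}
#endif
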
 1468 
 1469         /*
 1470          * If a SYN is in the window, then this is an
 1471          * error and we ACK and drop the packet.
 1472          */
 1473         if (tiflags & TH_SYN)
 1474                 goto dropafterack_ratelim;
 1475 
 1476         /*
 1477          * If the ACK bit is off we drop the segment and return.
 1478          */
 1479         if ((tiflags & TH_ACK) == 0) {
 1480                 if (tp->t_flags & TF_ACKNOW)
 1481                         goto dropafterack;
 1482                 else
 1483                         goto drop;
 1484         }
 1485 
 1486         /*
 1487          * Ack processing.
 1488          */
 1489         switch (tp->t_state) {
 1490 
 1491         /*
 1492          * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
 1493          * ESTABLISHED state and continue processing.
 1494          * The ACK was checked above.
 1495          */
 1496         case TCPS_SYN_RECEIVED:
 1497                 tcpstat.tcps_connects++;
 1498                 soisconnected(so);
 1499                 tp->t_state = TCPS_ESTABLISHED;
 1500                 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
 1501                 /* Do window scaling? */
 1502                 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 1503                         (TF_RCVD_SCALE|TF_REQ_SCALE)) {
 1504                         tp->snd_scale = tp->requested_s_scale;
 1505                         tp->rcv_scale = tp->request_r_scale;
 1506                 }
 1507                 tcp_reass_lock(tp);
 1508                 (void) tcp_reass(tp, (struct tcphdr *)0, (struct mbuf *)0,
 1509                                  &tlen);
 1510                 tcp_reass_unlock(tp);
 1511                 tp->snd_wl1 = th->th_seq - 1;
 1512                 /* fall into ... */
 1513 
 1514         /*
 1515          * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
 1516          * ACKs.  If the ack is in the range
 1517          *      tp->snd_una < th->th_ack <= tp->snd_max
 1518          * then advance tp->snd_una to th->th_ack and drop
 1519          * data from the retransmission queue.  If this ACK reflects
 1520          * more up-to-date window information, we update our window.
 1521          */
 1522         case TCPS_ESTABLISHED:
 1523         case TCPS_FIN_WAIT_1:
 1524         case TCPS_FIN_WAIT_2:
 1525         case TCPS_CLOSE_WAIT:
 1526         case TCPS_CLOSING:
 1527         case TCPS_LAST_ACK:
 1528         case TCPS_TIME_WAIT:
 1529 #ifdef TCP_ECN
 1530                 /*
 1531                  * if we receive ECE and are not already in recovery phase,
 1532                  * reduce cwnd by half but don't slow-start.
 1533                  * advance snd_last to snd_max so that cwnd is not reduced
 1534                  * again until all outstanding packets are acked.
 1535                  */
 1536                 if (tcp_do_ecn && (tiflags & TH_ECE)) {
 1537                         if ((tp->t_flags & TF_ECN_PERMIT) &&
 1538                             SEQ_GEQ(tp->snd_una, tp->snd_last)) {
 1539                                 u_int win;
 1540 
 1541                                 win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
 1542                                 if (win > 1) {
 1543                                         tp->snd_ssthresh = win / 2 * tp->t_maxseg;
 1544                                         tp->snd_cwnd = tp->snd_ssthresh;
 1545                                         tp->snd_last = tp->snd_max;
 1546                                         tp->t_flags |= TF_SEND_CWR;
 1547                                         tcpstat.tcps_cwr_ecn++;
 1548                                 }
 1549                         }
 1550                         tcpstat.tcps_ecn_rcvece++;
 1551                 }
 1552                 /*
 1553                  * if we receive CWR, we know that the peer has reduced
 1554                  * its congestion window.  stop sending ecn-echo.
 1555                  */
 1556                 if ((tiflags & TH_CWR)) {
 1557                         tp->t_flags &= ~TF_RCVD_CE;
 1558                         tcpstat.tcps_ecn_rcvcwr++;
 1559                 }
 1560 #endif /* TCP_ECN */
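        /*
         * A small #if 0 sketch of the ECE response above: shrink ssthresh to
         * half the current flight (in whole segments) and pull cwnd down to
         * it, but only when the window is larger than one segment.  Values
         * are plain parameters here; this is not the kernel code path.
         */
#if 0
#include <stdint.h>

static void
sketch_ecn_cwnd_reduce(uint32_t snd_wnd, uint32_t maxseg,
    uint32_t *cwnd, uint32_t *ssthresh)
{
        uint32_t win = (snd_wnd < *cwnd ? snd_wnd : *cwnd) / maxseg;

        if (win > 1) {
                *ssthresh = win / 2 * maxseg;
                *cwnd = *ssthresh;
        }
}
#endif
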
 1561 
 1562                 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
 1563                         /*
 1564                          * Duplicate/old ACK processing.
 1565                          * Increments t_dupacks:
 1566                          *      Pure duplicate (same seq/ack/window, no data)
 1567                          * Doesn't affect t_dupacks:
 1568                          *      Data packets.
 1569                          *      Normal window updates (window opens)
 1570                          * Resets t_dupacks:
 1571                          *      New data ACKed.
 1572                          *      Window shrinks
 1573                          *      Old ACK
 1574                          */
 1575                         if (tlen) {
 1576                                 /* Drop very old ACKs unless th_seq matches */
 1577                                 if (th->th_seq != tp->rcv_nxt &&
 1578                                    SEQ_LT(th->th_ack,
 1579                                    tp->snd_una - tp->max_sndwnd)) {
 1580                                         tcpstat.tcps_rcvacktooold++;
 1581                                         goto drop;
 1582                                 }
 1583                                 break;
 1584                         }
 1585                         /*
 1586                          * If we get an old ACK, there is probably packet
 1587                          * reordering going on.  Be conservative and reset
 1588                          * t_dupacks so that we are less aggressive in
 1589                          * doing a fast retransmit.
 1590                          */
 1591                         if (th->th_ack != tp->snd_una) {
 1592                                 tp->t_dupacks = 0;
 1593                                 break;
 1594                         }
 1595                         if (tiwin == tp->snd_wnd) {
 1596                                 tcpstat.tcps_rcvdupack++;
 1597                                 /*
 1598                                  * If we have outstanding data (other than
 1599                                  * a window probe), this is a completely
 1600                                  * duplicate ack (ie, window info didn't
 1601                                  * change), the ack is the biggest we've
 1602                                  * seen and we've seen exactly our rexmt
 1603                                  * threshold of them, assume a packet
 1604                                  * has been dropped and retransmit it.
 1605                                  * Kludge snd_nxt & the congestion
 1606                                  * window so we send only this one
 1607                                  * packet.
 1608                                  *
 1609                                  * We know we're losing at the current
 1610                                  * window size so do congestion avoidance
 1611                                  * (set ssthresh to half the current window
 1612                                  * and pull our congestion window back to
 1613                                  * the new ssthresh).
 1614                                  *
 1615                                  * Dup acks mean that packets have left the
 1616                                  * network (they're now cached at the receiver)
 1617                                  * so bump cwnd by the amount in the receiver
 1618                                  * to keep a constant cwnd packets in the
 1619                                  * network.
 1620                                  */
 1621                                 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
 1622                                         tp->t_dupacks = 0;
 1623 #if defined(TCP_SACK) && defined(TCP_FACK)
 1624                                 /*
 1625                                  * In FACK, can enter fast rec. if the receiver
 1626                                  * reports a reass. queue longer than 3 segs.
 1627                                  */
 1628                                 else if (++tp->t_dupacks == tcprexmtthresh ||
 1629                                     ((SEQ_GT(tp->snd_fack, tcprexmtthresh *
 1630                                     tp->t_maxseg + tp->snd_una)) &&
 1631                                     SEQ_GT(tp->snd_una, tp->snd_last))) {
 1632 #else
 1633                                 else if (++tp->t_dupacks == tcprexmtthresh) {
 1634 #endif /* TCP_FACK */
 1635                                         tcp_seq onxt = tp->snd_nxt;
 1636                                         u_long win =
 1637                                             ulmin(tp->snd_wnd, tp->snd_cwnd) /
 1638                                                 2 / tp->t_maxseg;
 1639 
 1640 #if defined(TCP_SACK) || defined(TCP_ECN)
 1641                                         if (SEQ_LT(th->th_ack, tp->snd_last)){
 1642                                                 /*
 1643                                                  * False fast retx after
 1644                                                  * timeout.  Do not cut window.
 1645                                                  */
 1646                                                 tp->t_dupacks = 0;
 1647                                                 goto drop;
 1648                                         }
 1649 #endif
 1650                                         if (win < 2)
 1651                                                 win = 2;
 1652                                         tp->snd_ssthresh = win * tp->t_maxseg;
 1653 #if defined(TCP_SACK)
 1654                                         tp->snd_last = tp->snd_max;
 1655 #endif
 1656 #ifdef TCP_SACK
 1657                                         if (tp->sack_enable) {
 1658                                                 TCP_TIMER_DISARM(tp, TCPT_REXMT);
 1659                                                 tp->t_rtttime = 0;
 1660 #ifdef TCP_ECN
 1661                                                 tp->t_flags |= TF_SEND_CWR;
 1662 #endif
 1663 #if 1 /* TCP_ECN */
 1664                                                 tcpstat.tcps_cwr_frecovery++;
 1665 #endif
 1666                                                 tcpstat.tcps_sack_recovery_episode++;
 1667 #if defined(TCP_SACK) && defined(TCP_FACK)
 1668                                                 tp->t_dupacks = tcprexmtthresh;
 1669                                                 (void) tcp_output(tp);
 1670                                                 /*
 1671                                                  * During FR, snd_cwnd is held
 1672                                                  * constant for FACK.
 1673                                                  */
 1674                                                 tp->snd_cwnd = tp->snd_ssthresh;
 1675 #else
 1676                                                 /*
 1677                                                  * tcp_output() will send
 1678                                                  * oldest SACK-eligible rtx.
 1679                                                  */
 1680                                                 (void) tcp_output(tp);
 1681                                                 tp->snd_cwnd = tp->snd_ssthresh+
 1682                                                    tp->t_maxseg * tp->t_dupacks;
 1683 #endif /* TCP_FACK */
 1684                                                 goto drop;
 1685                                         }
 1686 #endif /* TCP_SACK */
 1687                                         TCP_TIMER_DISARM(tp, TCPT_REXMT);
 1688                                         tp->t_rtttime = 0;
 1689                                         tp->snd_nxt = th->th_ack;
 1690                                         tp->snd_cwnd = tp->t_maxseg;
 1691 #ifdef TCP_ECN
 1692                                         tp->t_flags |= TF_SEND_CWR;
 1693 #endif
 1694 #if 1 /* TCP_ECN */
 1695                                         tcpstat.tcps_cwr_frecovery++;
 1696 #endif
 1697                                         tcpstat.tcps_sndrexmitfast++;
 1698                                         (void) tcp_output(tp);
 1699 
 1700                                         tp->snd_cwnd = tp->snd_ssthresh +
 1701                                             tp->t_maxseg * tp->t_dupacks;
 1702                                         if (SEQ_GT(onxt, tp->snd_nxt))
 1703                                                 tp->snd_nxt = onxt;
 1704                                         goto drop;
 1705                                 } else if (tp->t_dupacks > tcprexmtthresh) {
 1706 #if defined(TCP_SACK) && defined(TCP_FACK)
 1707                                         /*
 1708                                          * while (awnd < cwnd)
 1709                                          *         sendsomething();
 1710                                          */
 1711                                         if (tp->sack_enable) {
 1712                                                 if (tp->snd_awnd < tp->snd_cwnd)
 1713                                                         tcp_output(tp);
 1714                                                 goto drop;
 1715                                         }
 1716 #endif /* TCP_FACK */
 1717                                         tp->snd_cwnd += tp->t_maxseg;
 1718                                         (void) tcp_output(tp);
 1719                                         goto drop;
 1720                                 }
 1721                         } else if (tiwin < tp->snd_wnd) {
 1722                                 /*
 1723                                  * The window was retracted!  Previous dup
 1724                                  * ACKs may have been due to packets arriving
 1725                                  * after the shrunken window, not a missing
 1726                                  * packet, so play it safe and reset t_dupacks
 1727                                  */
 1728                                 tp->t_dupacks = 0;
 1729                         }
 1730                         break;
 1731                 }
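        /*
         * The fast-retransmit branch above sets ssthresh to half of the
         * smaller of the send window and cwnd, but never below two segments.
         * The #if 0 helper below restates just that computation as a
         * standalone sketch (sketch_* is not a kernel symbol).
         */
#if 0
#include <stdint.h>

static uint32_t
sketch_fast_rexmit_ssthresh(uint32_t snd_wnd, uint32_t cwnd, uint32_t maxseg)
{
        uint32_t win = (snd_wnd < cwnd ? snd_wnd : cwnd) / 2 / maxseg;

        if (win < 2)
                win = 2;
        return (win * maxseg);
}
#endif
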
 1732                 /*
 1733                  * If the congestion window was inflated to account
 1734                  * for the other side's cached packets, retract it.
 1735                  */
 1736 #if defined(TCP_SACK)
 1737                 if (tp->sack_enable) {
 1738                         if (tp->t_dupacks >= tcprexmtthresh) {
 1739                                 /* Check for a partial ACK */
 1740                                 if (tcp_sack_partialack(tp, th)) {
 1741 #if defined(TCP_SACK) && defined(TCP_FACK)
 1742                                         /* Force call to tcp_output */
 1743                                         if (tp->snd_awnd < tp->snd_cwnd)
 1744                                                 needoutput = 1;
 1745 #else
 1746                                         tp->snd_cwnd += tp->t_maxseg;
 1747                                         needoutput = 1;
 1748 #endif /* TCP_FACK */
 1749                                 } else {
 1750                                         /* Out of fast recovery */
 1751                                         tp->snd_cwnd = tp->snd_ssthresh;
 1752                                         if (tcp_seq_subtract(tp->snd_max,
 1753                                             th->th_ack) < tp->snd_ssthresh)
 1754                                                 tp->snd_cwnd =
 1755                                                    tcp_seq_subtract(tp->snd_max,
 1756                                                    th->th_ack);
 1757                                         tp->t_dupacks = 0;
 1758 #if defined(TCP_SACK) && defined(TCP_FACK)
 1759                                         if (SEQ_GT(th->th_ack, tp->snd_fack))
 1760                                                 tp->snd_fack = th->th_ack;
 1761 #endif /* TCP_FACK */
 1762                                 }
 1763                         }
 1764                 } else {
 1765                         if (tp->t_dupacks >= tcprexmtthresh &&
 1766                             !tcp_newreno(tp, th)) {
 1767                                 /* Out of fast recovery */
 1768                                 tp->snd_cwnd = tp->snd_ssthresh;
 1769                                 if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
 1770                                     tp->snd_ssthresh)
 1771                                         tp->snd_cwnd =
 1772                                             tcp_seq_subtract(tp->snd_max,
 1773                                             th->th_ack);
 1774                                 tp->t_dupacks = 0;
 1775                         }
 1776                 }
 1777                 if (tp->t_dupacks < tcprexmtthresh)
 1778                         tp->t_dupacks = 0;
 1779 #else /* else no TCP_SACK */
 1780                 if (tp->t_dupacks >= tcprexmtthresh &&
 1781                     tp->snd_cwnd > tp->snd_ssthresh)
 1782                         tp->snd_cwnd = tp->snd_ssthresh;
 1783                 tp->t_dupacks = 0;
 1784 #endif
 1785                 if (SEQ_GT(th->th_ack, tp->snd_max)) {
 1786                         tcpstat.tcps_rcvacktoomuch++;
 1787                         goto dropafterack_ratelim;
 1788                 }
 1789                 acked = th->th_ack - tp->snd_una;
 1790                 tcpstat.tcps_rcvackpack++;
 1791                 tcpstat.tcps_rcvackbyte += acked;
 1792 
 1793                 /*
 1794                  * If we have a timestamp reply, update smoothed
 1795                  * round trip time.  If no timestamp is present but
 1796                  * transmit timer is running and timed sequence
 1797                  * number was acked, update smoothed round trip time.
 1798                  * Since we now have an rtt measurement, cancel the
 1799                  * timer backoff (cf., Phil Karn's retransmit alg.).
 1800                  * Recompute the initial retransmit timer.
 1801                  */
 1802                 if (opti.ts_present && opti.ts_ecr)
 1803                         tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
 1804                 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
 1805                         tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
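        /*
         * tcp_xmit_timer() maintains the smoothed RTT and its variance in
         * scaled fixed point.  The #if 0 fragment below is only a textbook,
         * RFC 6298-style floating point sketch of that idea (gain 1/8 for
         * srtt, 1/4 for rttvar), not the kernel's arithmetic.
         */
#if 0
static void
sketch_rtt_update(double *srtt, double *rttvar, double sample)
{
        if (*srtt == 0.0) {
                /* first measurement */
                *srtt = sample;
                *rttvar = sample / 2.0;
        } else {
                double err = sample - *srtt;

                *srtt += err / 8.0;
                *rttvar += ((err < 0.0 ? -err : err) - *rttvar) / 4.0;
        }
        /* The RTO would then be srtt + 4 * rttvar, clamped to sane bounds. */
}
#endif
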
 1806 
 1807                 /*
 1808                  * If all outstanding data is acked, stop retransmit
 1809                  * timer and remember to restart (more output or persist).
 1810                  * If there is more data to be acked, restart retransmit
 1811                  * timer, using current (possibly backed-off) value.
 1812                  */
 1813                 if (th->th_ack == tp->snd_max) {
 1814                         TCP_TIMER_DISARM(tp, TCPT_REXMT);
 1815                         needoutput = 1;
 1816                 } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
 1817                         TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
 1818                 /*
 1819                  * When new data is acked, open the congestion window.
 1820                  * If the window gives us less than ssthresh packets
 1821                  * in flight, open exponentially (maxseg per packet).
 1822                  * Otherwise open linearly: maxseg per window
 1823                  * (maxseg^2 / cwnd per packet).
 1824                  */
 1825                 {
 1826                 u_int cw = tp->snd_cwnd;
 1827                 u_int incr = tp->t_maxseg;
 1828 
 1829                 if (cw > tp->snd_ssthresh)
 1830                         incr = incr * incr / cw;
 1831 #if defined (TCP_SACK)
 1832                 if (tp->t_dupacks < tcprexmtthresh)
 1833 #endif
 1834                 tp->snd_cwnd = ulmin(cw + incr, TCP_MAXWIN<<tp->snd_scale);
 1835                 }
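        /*
         * The brace block above grows cwnd by one full segment per ACK while
         * below ssthresh (slow start) and by roughly maxseg^2/cwnd per ACK
         * above it (congestion avoidance).  The #if 0 sketch restates that
         * update with the clamp passed in as a parameter; the SACK dup-ack
         * guard is omitted.  Illustrative only.
         */
#if 0
#include <stdint.h>

static uint32_t
sketch_cwnd_open(uint32_t cwnd, uint32_t ssthresh, uint32_t maxseg,
    uint32_t clamp)
{
        uint32_t incr = maxseg;

        if (cwnd > ssthresh)
                incr = incr * incr / cwnd;      /* ~one maxseg per RTT */
        cwnd += incr;
        return (cwnd < clamp ? cwnd : clamp);
}
#endif
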
 1836                 ND6_HINT(tp);
 1837                 if (acked > so->so_snd.sb_cc) {
 1838                         tp->snd_wnd -= so->so_snd.sb_cc;
 1839                         sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
 1840                         ourfinisacked = 1;
 1841                 } else {
 1842                         sbdrop(&so->so_snd, acked);
 1843                         tp->snd_wnd -= acked;
 1844                         ourfinisacked = 0;
 1845                 }
 1846                 if (sb_notify(&so->so_snd))
 1847                         sowwakeup(so);
 1848 
 1849                 /*
 1850                  * If we had a pending ICMP message that referred to data
 1851                  * that have just been acknowledged, disregard the recorded
 1852                  * ICMP message.
 1853                  */
 1854                 if ((tp->t_flags & TF_PMTUD_PEND) && 
 1855                     SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
 1856                         tp->t_flags &= ~TF_PMTUD_PEND;
 1857 
 1858                 /*
 1859                  * Keep track of the largest chunk of data acknowledged
 1860                  * since last PMTU update
 1861                  */
 1862                 if (tp->t_pmtud_mss_acked < acked)
 1863                         tp->t_pmtud_mss_acked = acked;
 1864 
 1865                 tp->snd_una = th->th_ack;
 1866 #ifdef TCP_ECN
 1867                 /* sync snd_last with snd_una */
 1868                 if (SEQ_GT(tp->snd_una, tp->snd_last))
 1869                         tp->snd_last = tp->snd_una;
 1870 #endif
 1871                 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
 1872                         tp->snd_nxt = tp->snd_una;
 1873 #if defined (TCP_SACK) && defined (TCP_FACK)
 1874                 if (SEQ_GT(tp->snd_una, tp->snd_fack)) {
 1875                         tp->snd_fack = tp->snd_una;
 1876                         /* Update snd_awnd for partial ACK
 1877                          * without any SACK blocks.
 1878                          */
 1879                         tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt,
 1880                                 tp->snd_fack) + tp->retran_data;
 1881                 }
 1882 #endif
 1883 
 1884                 switch (tp->t_state) {
 1885 
 1886                 /*
 1887                  * In FIN_WAIT_1 STATE in addition to the processing
 1888                  * for the ESTABLISHED state if our FIN is now acknowledged
 1889                  * then enter FIN_WAIT_2.
 1890                  */
 1891                 case TCPS_FIN_WAIT_1:
 1892                         if (ourfinisacked) {
 1893                                 /*
 1894                                  * If we can't receive any more
 1895                                  * data, then closing user can proceed.
 1896                                  * Starting the timer is contrary to the
 1897                                  * specification, but if we don't get a FIN
 1898                                  * we'll hang forever.
 1899                                  */
 1900                                 if (so->so_state & SS_CANTRCVMORE) {
 1901                                         soisdisconnected(so);
 1902                                         TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
 1903                                 }
 1904                                 tp->t_state = TCPS_FIN_WAIT_2;
 1905                         }
 1906                         break;
 1907 
 1908                 /*
 1909                  * In CLOSING STATE in addition to the processing for
 1910                  * the ESTABLISHED state if the ACK acknowledges our FIN
 1911                  * then enter the TIME-WAIT state, otherwise ignore
 1912                  * the segment.
 1913                  */
 1914                 case TCPS_CLOSING:
 1915                         if (ourfinisacked) {
 1916                                 tp->t_state = TCPS_TIME_WAIT;
 1917                                 tcp_canceltimers(tp);
 1918                                 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
 1919                                 soisdisconnected(so);
 1920                         }
 1921                         break;
 1922 
 1923                 /*
 1924                  * In LAST_ACK, we may still be waiting for data to drain
 1925                  * and/or to be acked, as well as for the ack of our FIN.
 1926                  * If our FIN is now acknowledged, delete the TCB,
 1927                  * enter the closed state and return.
 1928                  */
 1929                 case TCPS_LAST_ACK:
 1930                         if (ourfinisacked) {
 1931                                 tp = tcp_close(tp);
 1932                                 goto drop;
 1933                         }
 1934                         break;
 1935 
 1936                 /*
 1937                  * In TIME_WAIT state the only thing that should arrive
 1938                  * is a retransmission of the remote FIN.  Acknowledge
 1939                  * it and restart the finack timer.
 1940                  */
 1941                 case TCPS_TIME_WAIT:
 1942                         TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
 1943                         goto dropafterack;
 1944                 }
 1945         }
 1946 
 1947 step6:
 1948         /*
 1949          * Update window information.
 1950          * Don't look at window if no ACK: TACs send garbage on first SYN.
 1951          */
 1952         if ((tiflags & TH_ACK) &&
 1953             (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
 1954             (SEQ_LT(tp->snd_wl2, th->th_ack) ||
 1955             (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
 1956                 /* keep track of pure window updates */
 1957                 if (tlen == 0 &&
 1958                     tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
 1959                         tcpstat.tcps_rcvwinupd++;
 1960                 tp->snd_wnd = tiwin;
 1961                 tp->snd_wl1 = th->th_seq;
 1962                 tp->snd_wl2 = th->th_ack;
 1963                 if (tp->snd_wnd > tp->max_sndwnd)
 1964                         tp->max_sndwnd = tp->snd_wnd;
 1965                 needoutput = 1;
 1966         }
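        /*
         * The window update above follows the usual snd_wl1/snd_wl2 rule:
         * take the advertised window only from a segment that is newer than,
         * or as new as, the one the current window came from.  The #if 0
         * predicate below restates the condition in isolation; a sketch,
         * not the kernel test itself.
         */
#if 0
#include <stdint.h>

#define SKETCH_SEQ_LT(a, b)     ((int32_t)((a) - (b)) < 0)

static int
sketch_window_update_ok(uint32_t seq, uint32_t ack, uint32_t tiwin,
    uint32_t snd_wl1, uint32_t snd_wl2, uint32_t snd_wnd)
{
        return (SKETCH_SEQ_LT(snd_wl1, seq) ||
            (snd_wl1 == seq && (SKETCH_SEQ_LT(snd_wl2, ack) ||
            (snd_wl2 == ack && tiwin > snd_wnd))));
}
#endif
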
 1967 
 1968         /*
 1969          * Process segments with URG.
 1970          */
 1971         if ((tiflags & TH_URG) && th->th_urp &&
 1972             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 1973                 /*
 1974                  * This is a kludge, but if we receive and accept
 1975                  * random urgent pointers, we'll crash in
 1976                  * soreceive.  It's hard to imagine someone
 1977                  * actually wanting to send this much urgent data.
 1978                  */
 1979                 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
 1980                         th->th_urp = 0;                 /* XXX */
 1981                         tiflags &= ~TH_URG;             /* XXX */
 1982                         goto dodata;                    /* XXX */
 1983                 }
 1984                 /*
 1985                  * If this segment advances the known urgent pointer,
 1986                  * then mark the data stream.  This should not happen
 1987                  * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
 1988                  * a FIN has been received from the remote side.
 1989                  * In these states we ignore the URG.
 1990                  *
 1991                  * According to RFC961 (Assigned Protocols),
 1992                  * the urgent pointer points to the last octet
 1993                  * of urgent data.  We continue, however,
 1994                  * to consider it to indicate the first octet
 1995                  * of data past the urgent section as the original
 1996                  * spec states (in one of two places).
 1997                  */
 1998                 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
 1999                         tp->rcv_up = th->th_seq + th->th_urp;
 2000                         so->so_oobmark = so->so_rcv.sb_cc +
 2001                             (tp->rcv_up - tp->rcv_nxt) - 1;
 2002                         if (so->so_oobmark == 0)
 2003                                 so->so_state |= SS_RCVATMARK;
 2004                         sohasoutofband(so);
 2005                         tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
 2006                 }
 2007                 /*
 2008                  * Remove out-of-band data so it isn't presented to the user.
 2009                  * This can happen independent of advancing the URG pointer,
 2010                  * but if two URG's are pending at once, some out-of-band
 2011                  * data may creep in... ick.
 2012                  */
 2013                 if (th->th_urp <= (u_int16_t) tlen
 2014 #ifdef SO_OOBINLINE
 2015                      && (so->so_options & SO_OOBINLINE) == 0
 2016 #endif
 2017                      )
 2018                         tcp_pulloutofband(so, th->th_urp, m, hdroptlen);
 2019         } else
 2020                 /*
 2021                  * If no out of band data is expected,
 2022                  * pull receive urgent pointer along
 2023                  * with the receive window.
 2024                  */
 2025                 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
 2026                         tp->rcv_up = tp->rcv_nxt;
 2027 dodata:                                                 /* XXX */
 2028 
 2029         /*
 2030          * Process the segment text, merging it into the TCP sequencing queue,
 2031          * and arranging for acknowledgment of receipt if necessary.
 2032          * This process logically involves adjusting tp->rcv_wnd as data
 2033          * is presented to the user (this happens in tcp_usrreq.c,
 2034          * case PRU_RCVD).  If a FIN has already been received on this
 2035          * connection then we just ignore the text.
 2036          */
 2037         if ((tlen || (tiflags & TH_FIN)) &&
 2038             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 2039 #ifdef TCP_SACK
 2040                 tcp_seq laststart = th->th_seq;
 2041                 tcp_seq lastend = th->th_seq + tlen;
 2042 #endif
 2043                 tcp_reass_lock(tp);
 2044                 if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) &&
 2045                     tp->t_state == TCPS_ESTABLISHED) {
 2046                         tcp_reass_unlock(tp);
 2047                         TCP_SETUP_ACK(tp, tiflags);
 2048                         tp->rcv_nxt += tlen;
 2049                         tiflags = th->th_flags & TH_FIN;
 2050                         tcpstat.tcps_rcvpack++;
 2051                         tcpstat.tcps_rcvbyte += tlen;
 2052                         ND6_HINT(tp);
 2053                         if (so->so_state & SS_CANTRCVMORE)
 2054                                 m_freem(m);
 2055                         else {
 2056                                 m_adj(m, hdroptlen);
 2057                                 sbappendstream(&so->so_rcv, m);
 2058                         }
 2059                         sorwakeup(so);
 2060                 } else {
 2061                         m_adj(m, hdroptlen);
 2062                         tiflags = tcp_reass(tp, th, m, &tlen);
 2063                         tcp_reass_unlock(tp);
 2064                         tp->t_flags |= TF_ACKNOW;
 2065                 }
 2066 #ifdef TCP_SACK
 2067                 if (tp->sack_enable)
 2068                         tcp_update_sack_list(tp, laststart, lastend);
 2069 #endif
 2070 
 2071                 /*
 2072                  * The variable len is never referenced again in modern
 2073                  * BSD, so there is no point in computing it.
 2074                  */
 2075 #if 0
 2076                 /*
 2077                  * Note the amount of data that peer has sent into
 2078                  * our window, in order to estimate the sender's
 2079                  * buffer size.
 2080                  */
 2081                 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
 2082 #endif /* 0 */
 2083         } else {
 2084                 m_freem(m);
 2085                 tiflags &= ~TH_FIN;
 2086         }
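        /*
         * The data path above appends straight to the socket buffer only in
         * the common in-order case; everything else goes through
         * tcp_reass().  The #if 0 predicate below names that decision
         * explicitly (parameters are plain flags here, not the kernel
         * structures).
         */
#if 0
#include <stdint.h>

static int
sketch_can_append_in_order(uint32_t seq, uint32_t rcv_nxt,
    int reass_queue_empty, int established)
{
        return (seq == rcv_nxt && reass_queue_empty && established);
}
#endif
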
 2087 
 2088         /*
 2089          * If FIN is received ACK the FIN and let the user know
 2090          * that the connection is closing.  Ignore a FIN received before
 2091          * the connection is fully established.
 2092          */
 2093         if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) {
 2094                 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 2095                         socantrcvmore(so);
 2096                         tp->t_flags |= TF_ACKNOW;
 2097                         tp->rcv_nxt++;
 2098                 }
 2099                 switch (tp->t_state) {
 2100 
 2101                 /*
 2102                  * In ESTABLISHED STATE enter the CLOSE_WAIT state.
 2103                  */
 2104                 case TCPS_ESTABLISHED:
 2105                         tp->t_state = TCPS_CLOSE_WAIT;
 2106                         break;
 2107 
 2108                 /*
 2109                  * If still in FIN_WAIT_1 STATE FIN has not been acked so
 2110                  * enter the CLOSING state.
 2111                  */
 2112                 case TCPS_FIN_WAIT_1:
 2113                         tp->t_state = TCPS_CLOSING;
 2114                         break;
 2115 
 2116                 /*
 2117                  * In FIN_WAIT_2 state enter the TIME_WAIT state,
 2118                  * starting the time-wait timer, turning off the other
 2119                  * standard timers.
 2120                  */
 2121                 case TCPS_FIN_WAIT_2:
 2122                         tp->t_state = TCPS_TIME_WAIT;
 2123                         tcp_canceltimers(tp);
 2124                         TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
 2125                         soisdisconnected(so);
 2126                         break;
 2127 
 2128                 /*
 2129                  * In TIME_WAIT state restart the 2 MSL time_wait timer.
 2130                  */
 2131                 case TCPS_TIME_WAIT:
 2132                         TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
 2133                         break;
 2134                 }
 2135         }
 2136         if (so->so_options & SO_DEBUG) {
 2137                 switch (tp->pf) {
 2138 #ifdef INET6
 2139                 case PF_INET6:
 2140                         tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6,
 2141                             0, tlen);
 2142                         break;
 2143 #endif /* INET6 */
 2144                 case PF_INET:
 2145                         tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti,
 2146                             0, tlen);
 2147                         break;
 2148                 }
 2149         }
 2150 
 2151         /*
 2152          * Return any desired output.
 2153          */
 2154         if (needoutput || (tp->t_flags & TF_ACKNOW)) {
 2155                 (void) tcp_output(tp);
 2156         }
 2157         return;
 2158 
 2159 badsyn:
 2160         /*
 2161          * Received a bad SYN.  Increment counters and dropwithreset.
 2162          */
 2163         tcpstat.tcps_badsyn++;
 2164         tp = NULL;
 2165         goto dropwithreset;
 2166 
 2167 dropafterack_ratelim:
 2168         if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count,
 2169             tcp_ackdrop_ppslim) == 0) {
 2170                 /* XXX stat */
 2171                 goto drop;
 2172         }
 2173         /* ...fall into dropafterack... */
 2174 
 2175 dropafterack:
 2176         /*
 2177          * Generate an ACK dropping incoming segment if it occupies
 2178          * sequence space, where the ACK reflects our state.
 2179          */
 2180         if (tiflags & TH_RST)
 2181                 goto drop;
 2182         m_freem(m);
 2183         tp->t_flags |= TF_ACKNOW;
 2184         (void) tcp_output(tp);
 2185         return;
 2186 
 2187 dropwithreset_ratelim:
 2188         /*
 2189          * We may want to rate-limit RSTs in certain situations,
 2190          * particularly if we are sending an RST in response to
 2191          * an attempt to connect to or otherwise communicate with
 2192          * a port for which we have no socket.
 2193          */
 2194         if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count,
 2195             tcp_rst_ppslim) == 0) {
 2196                 /* XXX stat */
 2197                 goto drop;
 2198         }
 2199         /* ...fall into dropwithreset... */
 2200 
 2201 dropwithreset:
 2202         /*
 2203          * Generate a RST, dropping incoming segment.
 2204          * Make ACK acceptable to originator of segment.
 2205          * Don't bother to respond to RST.
 2206          */
 2207         if (tiflags & TH_RST)
 2208                 goto drop;
 2209         if (tiflags & TH_ACK) {
 2210                 tcp_respond(tp, mtod(m, caddr_t), m, (tcp_seq)0, th->th_ack,
 2211                     TH_RST);
 2212         } else {
 2213                 if (tiflags & TH_SYN)
 2214                         tlen++;
 2215                 tcp_respond(tp, mtod(m, caddr_t), m, th->th_seq + tlen,
 2216                     (tcp_seq)0, TH_RST|TH_ACK);
 2217         }
 2218         return;
 2219 
 2220 drop:
 2221         /*
 2222          * Drop space held by incoming segment and return.
 2223          */
 2224         if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) {
 2225                 switch (tp->pf) {
 2226 #ifdef INET6
 2227                 case PF_INET6:
 2228                         tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6,
 2229                             0, tlen);
 2230                         break;
 2231 #endif /* INET6 */
 2232                 case PF_INET:
 2233                         tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti,
 2234                             0, tlen);
 2235                         break;
 2236                 }
 2237         }
 2238 
 2239         m_freem(m);
 2240         return;
 2241 }
 2242 
 2243 int
 2244 tcp_dooptions(tp, cp, cnt, th, m, iphlen, oi)
 2245         struct tcpcb *tp;
 2246         u_char *cp;
 2247         int cnt;
 2248         struct tcphdr *th;
 2249         struct mbuf *m;
 2250         int iphlen;
 2251         struct tcp_opt_info *oi;
 2252 {
 2253         u_int16_t mss = 0;
 2254         int opt, optlen;
 2255 #ifdef TCP_SIGNATURE
 2256         caddr_t sigp = NULL;
 2257         struct tdb *tdb = NULL;
 2258 #endif /* TCP_SIGNATURE */
 2259 
 2260         for (; cp && cnt > 0; cnt -= optlen, cp += optlen) {
 2261                 opt = cp[0];
 2262                 if (opt == TCPOPT_EOL)
 2263                         break;
 2264                 if (opt == TCPOPT_NOP)
 2265                         optlen = 1;
 2266                 else {
 2267                         if (cnt < 2)
 2268                                 break;
 2269                         optlen = cp[1];
 2270                         if (optlen < 2 || optlen > cnt)
 2271                                 break;
 2272                 }
 2273                 switch (opt) {
 2274 
 2275                 default:
 2276                         continue;
 2277 
 2278                 case TCPOPT_MAXSEG:
 2279                         if (optlen != TCPOLEN_MAXSEG)
 2280                                 continue;
 2281                         if (!(th->th_flags & TH_SYN))
 2282                                 continue;
 2283                         if (TCPS_HAVERCVDSYN(tp->t_state))
 2284                                 continue;
 2285                         bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
 2286                         NTOHS(mss);
 2287                         oi->maxseg = mss;
 2288                         break;
 2289 
 2290                 case TCPOPT_WINDOW:
 2291                         if (optlen != TCPOLEN_WINDOW)
 2292                                 continue;
 2293                         if (!(th->th_flags & TH_SYN))
 2294                                 continue;
 2295                         if (TCPS_HAVERCVDSYN(tp->t_state))
 2296                                 continue;
 2297                         tp->t_flags |= TF_RCVD_SCALE;
 2298                         tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
 2299                         break;
 2300 
 2301                 case TCPOPT_TIMESTAMP:
 2302                         if (optlen != TCPOLEN_TIMESTAMP)
 2303                                 continue;
 2304                         oi->ts_present = 1;
 2305                         bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val));
 2306                         NTOHL(oi->ts_val);
 2307                         bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr));
 2308                         NTOHL(oi->ts_ecr);
 2309 
 2310                         if (!(th->th_flags & TH_SYN))
 2311                                 continue;
 2312                         if (TCPS_HAVERCVDSYN(tp->t_state))
 2313                                 continue;
 2314                         /*
 2315                          * A timestamp received in a SYN makes
 2316                          * it ok to send timestamp requests and replies.
 2317                          */
 2318                         tp->t_flags |= TF_RCVD_TSTMP;
 2319                         tp->ts_recent = oi->ts_val;
 2320                         tp->ts_recent_age = tcp_now;
 2321                         break;
 2322 
 2323 #ifdef TCP_SACK
 2324                 case TCPOPT_SACK_PERMITTED:
 2325                         if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED)
 2326                                 continue;
 2327                         if (!(th->th_flags & TH_SYN))
 2328                                 continue;
 2329                         if (TCPS_HAVERCVDSYN(tp->t_state))
 2330                                 continue;
 2331                         /* MUST only be set on SYN */
 2332                         tp->t_flags |= TF_SACK_PERMIT;
 2333                         break;
 2334                 case TCPOPT_SACK:
 2335                         tcp_sack_option(tp, th, cp, optlen);
 2336                         break;
 2337 #endif
 2338 #ifdef TCP_SIGNATURE
 2339                 case TCPOPT_SIGNATURE:
 2340                         if (optlen != TCPOLEN_SIGNATURE)
 2341                                 continue;
 2342 
 2343                         if (sigp && bcmp(sigp, cp + 2, 16))
 2344                                 return (-1);
 2345 
 2346                         sigp = cp + 2;
 2347                         break;
 2348 #endif /* TCP_SIGNATURE */
 2349                 }
 2350         }
 2351 
 2352 #ifdef TCP_SIGNATURE
 2353         if (tp->t_flags & TF_SIGNATURE) {
 2354                 union sockaddr_union src, dst;
 2355 
 2356                 memset(&src, 0, sizeof(union sockaddr_union));
 2357                 memset(&dst, 0, sizeof(union sockaddr_union));
 2358 
 2359                 switch (tp->pf) {
 2360                 case 0:
 2361 #ifdef INET
 2362                 case AF_INET:
 2363                         src.sa.sa_len = sizeof(struct sockaddr_in);
 2364                         src.sa.sa_family = AF_INET;
 2365                         src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
 2366                         dst.sa.sa_len = sizeof(struct sockaddr_in);
 2367                         dst.sa.sa_family = AF_INET;
 2368                         dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
 2369                         break;
 2370 #endif
 2371 #ifdef INET6
 2372                 case AF_INET6:
 2373                         src.sa.sa_len = sizeof(struct sockaddr_in6);
 2374                         src.sa.sa_family = AF_INET6;
 2375                         src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
 2376                         dst.sa.sa_len = sizeof(struct sockaddr_in6);
 2377                         dst.sa.sa_family = AF_INET6;
 2378                         dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
 2379                         break;
 2380 #endif /* INET6 */
 2381                 }
 2382 
 2383                 tdb = gettdbbysrcdst(0, &src, &dst, IPPROTO_TCP);
 2384 
 2385                 /*
 2386                  * We don't have an SA for this peer, so we turn off
 2387                  * TF_SIGNATURE on the listen socket
 2388                  */
 2389                 if (tdb == NULL && tp->t_state == TCPS_LISTEN)
 2390                         tp->t_flags &= ~TF_SIGNATURE;
 2391 
 2392         }
 2393 
 2394         if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) {
 2395                 tcpstat.tcps_rcvbadsig++;
 2396                 return (-1);
 2397         }
 2398 
 2399         if (sigp) {
 2400                 char sig[16];
 2401 
 2402                 if (tdb == NULL) {
 2403                         tcpstat.tcps_rcvbadsig++;
 2404                         return (-1);
 2405                 }
 2406 
 2407                 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0)
 2408                         return (-1);
 2409 
 2410                 if (bcmp(sig, sigp, 16)) {
 2411                         tcpstat.tcps_rcvbadsig++;
 2412                         return (-1);
 2413                 }
 2414 
 2415                 tcpstat.tcps_rcvgoodsig++;
 2416         }
 2417 #endif /* TCP_SIGNATURE */
 2418 
 2419         return (0);
 2420 }
 2421 
 2422 #if defined(TCP_SACK)
 2423 u_long
 2424 tcp_seq_subtract(a, b)
 2425         u_long a, b;
 2426 {
 2427         return ((long)(a - b));
 2428 }
 2429 #endif
 2430 
 2431 
 2432 #ifdef TCP_SACK
 2433 /*
 2434  * This function is called upon receipt of new valid data (while not in header
 2435  * prediction mode), and it updates the ordered list of sacks.
 2436  */
 2437 void
 2438 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart,
 2439     tcp_seq rcv_lastend)
 2440 {
 2441         /*
 2442          * First reported block MUST be the most recent one.  Subsequent
 2443          * blocks SHOULD be in the order in which they arrived at the
 2444          * receiver.  These two conditions make the implementation fully
 2445          * compliant with RFC 2018.
 2446          */
 2447         int i, j = 0, count = 0, lastpos = -1;
 2448         struct sackblk sack, firstsack, temp[MAX_SACK_BLKS];
 2449 
 2450         /* First clean up current list of sacks */
 2451         for (i = 0; i < tp->rcv_numsacks; i++) {
 2452                 sack = tp->sackblks[i];
 2453                 if (sack.start == 0 && sack.end == 0) {
 2454                         count++; /* count = number of blocks to be discarded */
 2455                         continue;
 2456                 }
 2457                 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) {
 2458                         tp->sackblks[i].start = tp->sackblks[i].end = 0;
 2459                         count++;
 2460                 } else {
 2461                         temp[j].start = tp->sackblks[i].start;
 2462                         temp[j++].end = tp->sackblks[i].end;
 2463                 }
 2464         }
 2465         tp->rcv_numsacks -= count;
 2466         if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */
 2467                 tcp_clean_sackreport(tp);
 2468                 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) {
 2469                         /* ==> need first sack block */
 2470                         tp->sackblks[0].start = rcv_laststart;
 2471                         tp->sackblks[0].end = rcv_lastend;
 2472                         tp->rcv_numsacks = 1;
 2473                 }
 2474                 return;
 2475         }
 2476         /* Otherwise, sack blocks are already present. */
 2477         for (i = 0; i < tp->rcv_numsacks; i++)
 2478                 tp->sackblks[i] = temp[i]; /* first copy back sack list */
 2479         if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend))
 2480                 return;     /* sack list remains unchanged */
 2481         /*
 2482          * From here, segment just received should be (part of) the 1st sack.
 2483          * Go through list, possibly coalescing sack block entries.
 2484          */
 2485         firstsack.start = rcv_laststart;
 2486         firstsack.end = rcv_lastend;
 2487         for (i = 0; i < tp->rcv_numsacks; i++) {
 2488                 sack = tp->sackblks[i];
 2489                 if (SEQ_LT(sack.end, firstsack.start) ||
 2490                     SEQ_GT(sack.start, firstsack.end))
 2491                         continue; /* no overlap */
 2492                 if (sack.start == firstsack.start && sack.end == firstsack.end){
 2493                         /*
 2494                          * identical block; delete it here since we will
 2495                          * move it to the front of the list.
 2496                          */
 2497                         tp->sackblks[i].start = tp->sackblks[i].end = 0;
 2498                         lastpos = i;    /* last posn with a zero entry */
 2499                         continue;
 2500                 }
 2501                 if (SEQ_LEQ(sack.start, firstsack.start))
 2502                         firstsack.start = sack.start; /* merge blocks */
 2503                 if (SEQ_GEQ(sack.end, firstsack.end))
 2504                         firstsack.end = sack.end;     /* merge blocks */
 2505                 tp->sackblks[i].start = tp->sackblks[i].end = 0;
 2506                 lastpos = i;    /* last posn with a zero entry */
 2507         }
 2508         if (lastpos != -1) {    /* at least one merge */
 2509                 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) {
 2510                         sack = tp->sackblks[i];
 2511                         if (sack.start == 0 && sack.end == 0)
 2512                                 continue;
 2513                         temp[j++] = sack;
 2514                 }
 2515                 tp->rcv_numsacks = j; /* including first blk (added later) */
 2516                 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */
 2517                         tp->sackblks[i] = temp[i];
 2518         } else {        /* no merges -- shift sacks by 1 */
 2519                 if (tp->rcv_numsacks < MAX_SACK_BLKS)
 2520                         tp->rcv_numsacks++;
 2521                 for (i = tp->rcv_numsacks-1; i > 0; i--)
 2522                         tp->sackblks[i] = tp->sackblks[i-1];
 2523         }
 2524         tp->sackblks[0] = firstsack;
 2525         return;
 2526 }
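
/*
 * Illustrative sketch (not part of the original file): the receiver-side
 * bookkeeping above, reduced to a stand-alone helper.  The most recently
 * reported block is kept first (RFC 2018) and any older block that overlaps
 * it is folded in; the rest shift down and the oldest falls off when the
 * array is full.  The sack_blk/sack_update names, the wrap-safe helpers and
 * the 4-entry limit are hypothetical stand-ins, and the rcv_nxt-based
 * trimming done above is omitted for brevity.
 */
#include <stdint.h>
#include <string.h>

#define EX_MAX_SACK_BLKS	4

struct sack_blk { uint32_t start, end; };

/* signed sequence-space comparisons, in the spirit of SEQ_LT()/SEQ_GEQ() */
static int seq_lt(uint32_t a, uint32_t b)  { return ((int32_t)(a - b) < 0); }
static int seq_geq(uint32_t a, uint32_t b) { return ((int32_t)(a - b) >= 0); }

/*
 * Report a newly received in-window range [start, end).  blks[] must have
 * room for EX_MAX_SACK_BLKS entries; nblks is how many are currently valid.
 * Returns the new count.
 */
static int
sack_update(struct sack_blk *blks, int nblks, uint32_t start, uint32_t end)
{
	struct sack_blk merged = { start, end };
	struct sack_blk keep[EX_MAX_SACK_BLKS];
	int i, nkeep = 0;

	for (i = 0; i < nblks; i++) {
		if (seq_lt(blks[i].end, merged.start) ||
		    seq_lt(merged.end, blks[i].start)) {
			/* no overlap: keep it, dropping the oldest if full */
			if (nkeep < EX_MAX_SACK_BLKS - 1)
				keep[nkeep++] = blks[i];
			continue;
		}
		/* overlap: coalesce the old block into the new first block */
		if (seq_lt(blks[i].start, merged.start))
			merged.start = blks[i].start;
		if (seq_geq(blks[i].end, merged.end))
			merged.end = blks[i].end;
	}
	blks[0] = merged;
	memcpy(blks + 1, keep, nkeep * sizeof(keep[0]));
	return (nkeep + 1);
}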
 2527 
 2528 /*
 2529  * Process the TCP SACK option.  tp->snd_holes is an ordered list
 2530  * of holes (oldest to newest, in terms of the sequence space).
 2531  */
 2532 void
 2533 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen)
 2534 {
 2535         int tmp_olen;
 2536         u_char *tmp_cp;
 2537         struct sackhole *cur, *p, *temp;
 2538 
 2539         if (!tp->sack_enable)
 2540                 return;
 2541         /* SACK without ACK doesn't make sense. */
 2542         if ((th->th_flags & TH_ACK) == 0)
 2543                 return;
 2544         /* Make sure the ACK on this segment is in [snd_una, snd_max]. */
 2545         if (SEQ_LT(th->th_ack, tp->snd_una) ||
 2546             SEQ_GT(th->th_ack, tp->snd_max))
 2547                 return;
 2548         /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
 2549         if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
 2550                 return;
 2552         tmp_cp = cp + 2;
 2553         tmp_olen = optlen - 2;
 2554         tcpstat.tcps_sack_rcv_opts++;
 2555         if (tp->snd_numholes < 0)
 2556                 tp->snd_numholes = 0;
 2557         if (tp->t_maxseg == 0)
 2558                 panic("tcp_sack_option"); /* Should never happen */
 2559         while (tmp_olen > 0) {
 2560                 struct sackblk sack;
 2561 
 2562                 bcopy(tmp_cp, (char *) &(sack.start), sizeof(tcp_seq));
 2563                 NTOHL(sack.start);
 2564                 bcopy(tmp_cp + sizeof(tcp_seq),
 2565                     (char *) &(sack.end), sizeof(tcp_seq));
 2566                 NTOHL(sack.end);
 2567                 tmp_olen -= TCPOLEN_SACK;
 2568                 tmp_cp += TCPOLEN_SACK;
 2569                 if (SEQ_LEQ(sack.end, sack.start))
 2570                         continue; /* bad SACK fields */
 2571                 if (SEQ_LEQ(sack.end, tp->snd_una))
 2572                         continue; /* old block */
 2573 #if defined(TCP_SACK) && defined(TCP_FACK)
 2574                 /* Updates snd_fack.  */
 2575                 if (SEQ_GT(sack.end, tp->snd_fack))
 2576                         tp->snd_fack = sack.end;
 2577 #endif /* TCP_FACK */
 2578                 if (SEQ_GT(th->th_ack, tp->snd_una)) {
 2579                         if (SEQ_LT(sack.start, th->th_ack))
 2580                                 continue;
 2581                 }
 2582                 if (SEQ_GT(sack.end, tp->snd_max))
 2583                         continue;
 2584                 if (tp->snd_holes == NULL) { /* first hole */
 2585                         tp->snd_holes = (struct sackhole *)
 2586                             pool_get(&sackhl_pool, PR_NOWAIT);
 2587                         if (tp->snd_holes == NULL) {
 2588                                 /* ENOBUFS, so ignore SACKed block for now */
 2589                                 goto done;
 2590                         }
 2591                         cur = tp->snd_holes;
 2592                         cur->start = th->th_ack;
 2593                         cur->end = sack.start;
 2594                         cur->rxmit = cur->start;
 2595                         cur->next = NULL;
 2596                         tp->snd_numholes = 1;
 2597                         tp->rcv_lastsack = sack.end;
 2598                         /*
 2599                          * dups is at least one.  If more data has been
 2600                          * SACKed, it can be greater than one.
 2601                          */
 2602                         cur->dups = min(tcprexmtthresh,
 2603                             ((sack.end - cur->end)/tp->t_maxseg));
 2604                         if (cur->dups < 1)
 2605                                 cur->dups = 1;
 2606                         continue; /* with next sack block */
 2607                 }
 2608                 /* Go thru list of holes:  p = previous,  cur = current */
 2609                 p = cur = tp->snd_holes;
 2610                 while (cur) {
 2611                         if (SEQ_LEQ(sack.end, cur->start))
 2612                                 /* SACKs data before the current hole */
 2613                                 break; /* no use going through more holes */
 2614                         if (SEQ_GEQ(sack.start, cur->end)) {
 2615                                 /* SACKs data beyond the current hole */
 2616                                 cur->dups++;
 2617                                 if (((sack.end - cur->end)/tp->t_maxseg) >=
 2618                                     tcprexmtthresh)
 2619                                         cur->dups = tcprexmtthresh;
 2620                                 p = cur;
 2621                                 cur = cur->next;
 2622                                 continue;
 2623                         }
 2624                         if (SEQ_LEQ(sack.start, cur->start)) {
 2625                                 /* Data acks at least the beginning of hole */
 2626 #if defined(TCP_SACK) && defined(TCP_FACK)
 2627                                 if (SEQ_GT(sack.end, cur->rxmit))
 2628                                         tp->retran_data -=
 2629                                             tcp_seq_subtract(cur->rxmit,
 2630                                             cur->start);
 2631                                 else
 2632                                         tp->retran_data -=
 2633                                             tcp_seq_subtract(sack.end,
 2634                                             cur->start);
 2635 #endif /* TCP_FACK */
 2636                                 if (SEQ_GEQ(sack.end, cur->end)) {
 2637                                         /* Acks entire hole, so delete hole */
 2638                                         if (p != cur) {
 2639                                                 p->next = cur->next;
 2640                                                 pool_put(&sackhl_pool, cur);
 2641                                                 cur = p->next;
 2642                                         } else {
 2643                                                 cur = cur->next;
 2644                                                 pool_put(&sackhl_pool, p);
 2645                                                 p = cur;
 2646                                                 tp->snd_holes = p;
 2647                                         }
 2648                                         tp->snd_numholes--;
 2649                                         continue;
 2650                                 }
 2651                                 /* otherwise, move start of hole forward */
 2652                                 cur->start = sack.end;
 2653                                 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
 2654                                 p = cur;
 2655                                 cur = cur->next;
 2656                                 continue;
 2657                         }
 2658                         /* move end of hole backward */
 2659                         if (SEQ_GEQ(sack.end, cur->end)) {
 2660 #if defined(TCP_SACK) && defined(TCP_FACK)
 2661                                 if (SEQ_GT(cur->rxmit, sack.start))
 2662                                         tp->retran_data -=
 2663                                             tcp_seq_subtract(cur->rxmit,
 2664                                             sack.start);
 2665 #endif /* TCP_FACK */
 2666                                 cur->end = sack.start;
 2667                                 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
 2668                                 cur->dups++;
 2669                                 if (((sack.end - cur->end)/tp->t_maxseg) >=
 2670                                     tcprexmtthresh)
 2671                                         cur->dups = tcprexmtthresh;
 2672                                 p = cur;
 2673                                 cur = cur->next;
 2674                                 continue;
 2675                         }
 2676                         if (SEQ_LT(cur->start, sack.start) &&
 2677                             SEQ_GT(cur->end, sack.end)) {
 2678                                 /*
 2679                                  * ACKs some data in middle of a hole; need to
 2680                                  * split current hole
 2681                                  */
 2682                                 temp = (struct sackhole *)
 2683                                     pool_get(&sackhl_pool, PR_NOWAIT);
 2684                                 if (temp == NULL)
 2685                                         goto done; /* ENOBUFS */
 2686 #if defined(TCP_SACK) && defined(TCP_FACK)
 2687                                 if (SEQ_GT(cur->rxmit, sack.end))
 2688                                         tp->retran_data -=
 2689                                             tcp_seq_subtract(sack.end,
 2690                                             sack.start);
 2691                                 else if (SEQ_GT(cur->rxmit, sack.start))
 2692                                         tp->retran_data -=
 2693                                             tcp_seq_subtract(cur->rxmit,
 2694                                             sack.start);
 2695 #endif /* TCP_FACK */
 2696                                 temp->next = cur->next;
 2697                                 temp->start = sack.end;
 2698                                 temp->end = cur->end;
 2699                                 temp->dups = cur->dups;
 2700                                 temp->rxmit = SEQ_MAX(cur->rxmit, temp->start);
 2701                                 cur->end = sack.start;
 2702                                 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
 2703                                 cur->dups++;
 2704                                 if (((sack.end - cur->end)/tp->t_maxseg) >=
 2705                                         tcprexmtthresh)
 2706                                         cur->dups = tcprexmtthresh;
 2707                                 cur->next = temp;
 2708                                 p = temp;
 2709                                 cur = p->next;
 2710                                 tp->snd_numholes++;
 2711                         }
 2712                 }
 2713                 /* At this point, p points to the last hole on the list */
 2714                 if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
 2715                         /*
 2716                          * Need to append new hole at end.
 2717                          * Last hole is p (and it's not NULL).
 2718                          */
 2719                         temp = (struct sackhole *)
 2720                             pool_get(&sackhl_pool, PR_NOWAIT);
 2721                         if (temp == NULL)
 2722                                 goto done; /* ENOBUFS */
 2723                         temp->start = tp->rcv_lastsack;
 2724                         temp->end = sack.start;
 2725                         temp->dups = min(tcprexmtthresh,
 2726                             ((sack.end - sack.start)/tp->t_maxseg));
 2727                         if (temp->dups < 1)
 2728                                 temp->dups = 1;
 2729                         temp->rxmit = temp->start;
 2730                         temp->next = 0;
 2731                         p->next = temp;
 2732                         tp->rcv_lastsack = sack.end;
 2733                         tp->snd_numholes++;
 2734                 }
 2735         }
 2736 done:
 2737 #if defined(TCP_SACK) && defined(TCP_FACK)
 2738         /*
 2739          * Update retran_data and snd_awnd.  Go through the list of
 2740          * holes.   Increment retran_data by (hole->rxmit - hole->start).
 2741          */
 2742         tp->retran_data = 0;
 2743         cur = tp->snd_holes;
 2744         while (cur) {
 2745                 tp->retran_data += cur->rxmit - cur->start;
 2746                 cur = cur->next;
 2747         }
 2748         tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) +
 2749             tp->retran_data;
 2750 #endif /* TCP_FACK */
 2751 
 2752         return;
 2753 }
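
/*
 * Illustrative sketch (not part of the original file): just the wire-format
 * parsing that the top of tcp_sack_option() performs.  After the kind and
 * length octets, the option body is a series of 8-byte blocks, each a pair
 * of 32-bit network-order sequence numbers (left edge, right edge).  The
 * ex_sackblk/parse_sack_blocks names are hypothetical.
 */
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>		/* ntohl() */

struct ex_sackblk { uint32_t start, end; };

static int
parse_sack_blocks(const unsigned char *cp, int optlen,
    struct ex_sackblk *out, int maxblks)
{
	const unsigned char *p = cp + 2;	/* skip kind and length */
	int olen = optlen - 2, n = 0;
	uint32_t edge;

	if (optlen <= 2 || olen % 8 != 0)
		return (-1);			/* malformed option */
	while (olen > 0 && n < maxblks) {
		memcpy(&edge, p, sizeof(edge));
		out[n].start = ntohl(edge);
		memcpy(&edge, p + 4, sizeof(edge));
		out[n].end = ntohl(edge);
		if ((int32_t)(out[n].end - out[n].start) > 0)
			n++;			/* keep only sane blocks */
		p += 8;
		olen -= 8;
	}
	return (n);
}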
 2754 
 2755 /*
 2756  * Delete stale (i.e., cumulatively ack'd) holes.  A hole is deleted only if
 2757  * it is completely acked; otherwise, tcp_sack_option(), called from
 2758  * tcp_dooptions(), will fix up the hole.
 2759  */
 2760 void
 2761 tcp_del_sackholes(tp, th)
 2762         struct tcpcb *tp;
 2763         struct tcphdr *th;
 2764 {
 2765         if (tp->sack_enable && tp->t_state != TCPS_LISTEN) {
 2766                 /* max because this could be an older ack that just arrived */
 2767                 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
 2768                         th->th_ack : tp->snd_una;
 2769                 struct sackhole *cur = tp->snd_holes;
 2770                 struct sackhole *prev;
 2771                 while (cur)
 2772                         if (SEQ_LEQ(cur->end, lastack)) {
 2773                                 prev = cur;
 2774                                 cur = cur->next;
 2775                                 pool_put(&sackhl_pool, prev);
 2776                                 tp->snd_numholes--;
 2777                         } else if (SEQ_LT(cur->start, lastack)) {
 2778                                 cur->start = lastack;
 2779                                 if (SEQ_LT(cur->rxmit, cur->start))
 2780                                         cur->rxmit = cur->start;
 2781                                 break;
 2782                         } else
 2783                                 break;
 2784                 tp->snd_holes = cur;
 2785         }
 2786 }
 2787 
 2788 /*
 2789  * Delete all receiver-side SACK information.
 2790  */
 2791 void
 2792 tcp_clean_sackreport(tp)
 2793         struct tcpcb *tp;
 2794 {
 2795         int i;
 2796 
 2797         tp->rcv_numsacks = 0;
 2798         for (i = 0; i < MAX_SACK_BLKS; i++)
 2799                 tp->sackblks[i].start = tp->sackblks[i].end = 0;
 2800 
 2801 }
 2802 
 2803 /*
 2804  * Checks for partial ack.  If partial ack arrives, turn off retransmission
 2805  * timer, deflate the window, do not clear tp->t_dupacks, and return 1.
 2806  * If the ack advances at least to tp->snd_last, return 0.
 2807  */
 2808 int
 2809 tcp_sack_partialack(tp, th)
 2810         struct tcpcb *tp;
 2811         struct tcphdr *th;
 2812 {
 2813         if (SEQ_LT(th->th_ack, tp->snd_last)) {
 2814                 /* Turn off retx. timer (will start again next segment) */
 2815                 TCP_TIMER_DISARM(tp, TCPT_REXMT);
 2816                 tp->t_rtttime = 0;
 2817 #ifndef TCP_FACK
 2818                  * Partial window deflation.  This relies on the fact that
 2819                  * tp->snd_una has not been updated yet.  With FACK, snd_cwnd
 2820                  * is instead held constant during fast recovery.
 2821                  * hold snd_cwnd constant during fast recovery.
 2822                  */
 2823                 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) {
 2824                         tp->snd_cwnd -= th->th_ack - tp->snd_una;
 2825                         tp->snd_cwnd += tp->t_maxseg;
 2826                 } else
 2827                         tp->snd_cwnd = tp->t_maxseg;
 2828 #endif
 2829                 return (1);
 2830         }
 2831         return (0);
 2832 }
 2833 #endif /* TCP_SACK */
 2834 
 2835 /*
 2836  * Pull the out-of-band byte out of a segment so
 2837  * it doesn't appear in the user's data queue.
 2838  * It is still reflected in the segment length for
 2839  * sequencing purposes.
 2840  */
 2841 void
 2842 tcp_pulloutofband(so, urgent, m, off)
 2843         struct socket *so;
 2844         u_int urgent;
 2845         struct mbuf *m;
 2846         int off;
 2847 {
 2848         int cnt = off + urgent - 1;
 2849 
 2850         while (cnt >= 0) {
 2851                 if (m->m_len > cnt) {
 2852                         char *cp = mtod(m, caddr_t) + cnt;
 2853                         struct tcpcb *tp = sototcpcb(so);
 2854 
 2855                         tp->t_iobc = *cp;
 2856                         tp->t_oobflags |= TCPOOB_HAVEDATA;
 2857                         bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
 2858                         m->m_len--;
 2859                         return;
 2860                 }
 2861                 cnt -= m->m_len;
 2862                 m = m->m_next;
 2863                 if (m == 0)
 2864                         break;
 2865         }
 2866         panic("tcp_pulloutofband");
 2867 }
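
/*
 * Illustrative sketch (not part of the original file): the same extraction
 * on a flat buffer instead of an mbuf chain.  The urgent byte at offset
 * `cnt` is saved and the bytes behind it are shifted down by one, which is
 * what the bcopy() above does within the mbuf that holds the byte.  The
 * pull_oob_byte name is hypothetical.
 */
#include <stddef.h>
#include <string.h>

static int
pull_oob_byte(unsigned char *buf, size_t *len, size_t cnt)
{
	unsigned char oob;

	if (cnt >= *len)
		return (-1);		/* offset beyond the buffered data */
	oob = buf[cnt];
	memmove(buf + cnt, buf + cnt + 1, *len - cnt - 1);
	(*len)--;
	return (oob);
}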
 2868 
 2869 /*
 2870  * Collect new round-trip time estimate
 2871  * and update averages and current timeout.
 2872  */
 2873 void
 2874 tcp_xmit_timer(tp, rtt)
 2875         struct tcpcb *tp;
 2876         short rtt;
 2877 {
 2878         short delta;
 2879         short rttmin;
 2880 
 2881         if (rtt < 0)
 2882                 rtt = 0;
 2883         else if (rtt > TCP_RTT_MAX)
 2884                 rtt = TCP_RTT_MAX;
 2885 
 2886         tcpstat.tcps_rttupdated++;
 2887         if (tp->t_srtt != 0) {
 2888                 /*
 2889                  * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits
 2890                  * after the binary point (scaled by 4), whereas
 2891                  * srtt is stored as fixed point with 5 bits after the
 2892                  * binary point (i.e., scaled by 32).  The following magic
 2893                  * is equivalent to the smoothing algorithm in rfc793 with
 2894                  * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
 2895                  * point).
 2896                  */
 2897                 delta = (rtt << TCP_RTT_BASE_SHIFT) -
 2898                     (tp->t_srtt >> TCP_RTT_SHIFT);
 2899                 if ((tp->t_srtt += delta) <= 0)
 2900                         tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT;
 2901                 /*
 2902                  * We accumulate a smoothed rtt variance (actually, a
 2903                  * smoothed mean difference), then set the retransmit
 2904                  * timer to smoothed rtt + 4 times the smoothed variance.
 2905                  * rttvar is stored as fixed point with 4 bits after the
 2906                  * binary point (scaled by 16).  The following is
 2907                  * equivalent to rfc793 smoothing with an alpha of .75
 2908                  * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
 2909                  * rfc793's wired-in beta.
 2910                  */
 2911                 if (delta < 0)
 2912                         delta = -delta;
 2913                 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
 2914                 if ((tp->t_rttvar += delta) <= 0)
 2915                         tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT;
 2916         } else {
 2917                 /*
 2918                  * No rtt measurement yet - use the unsmoothed rtt.
 2919                  * Set the variance to half the rtt (so our first
 2920                  * retransmit happens at 3*rtt).
 2921                  */
 2922                 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT);
 2923                 tp->t_rttvar = (rtt + 1) <<
 2924                     (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1);
 2925         }
 2926         tp->t_rtttime = 0;
 2927         tp->t_rxtshift = 0;
 2928 
 2929         /*
 2930          * the retransmit should happen at rtt + 4 * rttvar.
 2931          * Because of the way we do the smoothing, srtt and rttvar
 2932          * will each average +1/2 tick of bias.  When we compute
 2933          * the retransmit timer, we want 1/2 tick of rounding and
 2934          * 1 extra tick because of +-1/2 tick uncertainty in the
 2935          * firing of the timer.  The bias will give us exactly the
 2936          * 1.5 tick we need.  But, because the bias is
 2937          * statistical, we have to test that we don't drop below
 2938          * the minimum feasible timer (which is 2 ticks).
 2939          */
 2940         rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX);
 2941         TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX);
 2942 
 2943         /*
 2944          * We received an ack for a packet that wasn't retransmitted;
 2945          * it is probably safe to discard any error indications we've
 2946          * received recently.  This isn't quite right, but close enough
 2947          * for now (a route might have failed after we sent a segment,
 2948          * and the return path might not be symmetrical).
 2949          */
 2950         tp->t_softerror = 0;
 2951 }
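
/*
 * Illustrative sketch (not part of the original file): the same
 * Jacobson/Karels smoothing in a stand-alone form, with simpler scaling
 * than the kernel's (srtt kept scaled by 8, rttvar by 4) but the same
 * gains: srtt = 7/8 srtt + 1/8 rtt, rttvar = 3/4 rttvar + 1/4 |delta|,
 * RTO = srtt + 4*rttvar.  struct rtt_state and rtt_sample are hypothetical.
 */
struct rtt_state {
	int srtt;	/* smoothed RTT, scaled by 8 */
	int rttvar;	/* smoothed mean deviation, scaled by 4 */
};

static int
rtt_sample(struct rtt_state *rs, int rtt)	/* rtt in timer ticks */
{
	int delta;

	if (rs->srtt == 0) {
		/* first measurement: srtt = rtt, rttvar = rtt / 2 */
		rs->srtt = rtt << 3;
		rs->rttvar = rtt << 1;
	} else {
		delta = rtt - (rs->srtt >> 3);
		rs->srtt += delta;		/* == srtt += delta/8, unscaled */
		if (delta < 0)
			delta = -delta;
		rs->rttvar += delta - (rs->rttvar >> 2);
	}
	/* srtt unscaled, plus rttvar still scaled by 4 == 4 * rttvar */
	return ((rs->srtt >> 3) + rs->rttvar);
}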
 2952 
 2953 /*
 2954  * Determine a reasonable value for maxseg size.
 2955  * If the route is known, check route for mtu.
 2956  * If none, use an mss that can be handled on the outgoing
 2957  * interface without forcing IP to fragment; if bigger than
 2958  * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
 2959  * to utilize large mbufs.  If no route is found, route has no mtu,
 2960  * or the destination isn't local, use a default, hopefully conservative
 2961  * size (usually 512 or the default IP max size, but no more than the mtu
 2962  * of the interface), as we can't discover anything about intervening
 2963  * gateways or networks.  We also initialize the congestion/slow start
 2964  * window to be a single segment if the destination isn't local.
 2965  * While looking at the routing entry, we also initialize other path-dependent
 2966  * parameters from pre-set or cached values in the routing entry.
 2967  *
 2968  * Also take into account the space needed for options that we
 2969  * send regularly.  Make maxseg shorter by that amount to assure
 2970  * that we can send maxseg amount of data even when the options
 2971  * are present.  Store the upper limit of the length of options plus
 2972  * data in maxopd.
 2973  *
 2974  * NOTE: offer == -1 indicates that the maxseg size changed due to
 2975  * Path MTU discovery.
 2976  */
 2977 int
 2978 tcp_mss(tp, offer)
 2979         struct tcpcb *tp;
 2980         int offer;
 2981 {
 2982         struct rtentry *rt;
 2983         struct ifnet *ifp;
 2984         int mss, mssopt;
 2985         int iphlen;
 2986         struct inpcb *inp;
 2987 
 2988         inp = tp->t_inpcb;
 2989 
 2990         mssopt = mss = tcp_mssdflt;
 2991 
 2992         rt = in_pcbrtentry(inp);
 2993 
 2994         if (rt == NULL)
 2995                 goto out;
 2996 
 2997         ifp = rt->rt_ifp;
 2998 
 2999         switch (tp->pf) {
 3000 #ifdef INET6
 3001         case AF_INET6:
 3002                 iphlen = sizeof(struct ip6_hdr);
 3003                 break;
 3004 #endif
 3005         case AF_INET:
 3006                 iphlen = sizeof(struct ip);
 3007                 break;
 3008         default:
 3009                 /* the family does not support path MTU discovery */
 3010                 goto out;
 3011         }
 3012 
 3013 #ifdef RTV_MTU
 3014         /*
 3015          * if there's an mtu associated with the route and we support
 3016          * path MTU discovery for the underlying protocol family, use it.
 3017          */
 3018         if (rt->rt_rmx.rmx_mtu) {
 3019                 /*
 3020                  * One may wish to lower MSS to take into account options,
 3021                  * especially security-related options.
 3022                  */
 3023                 if (tp->pf == AF_INET6 && rt->rt_rmx.rmx_mtu < IPV6_MMTU) {
 3024                         /*
 3025                          * RFC2460 section 5, last paragraph: if path MTU is
 3026                          * smaller than 1280, use 1280 as packet size and
 3027                          * attach fragment header.
 3028                          */
 3029                         mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) -
 3030                             sizeof(struct tcphdr);
 3031                 } else
 3032                         mss = rt->rt_rmx.rmx_mtu - iphlen - sizeof(struct tcphdr);
 3033         } else
 3034 #endif /* RTV_MTU */
 3035         if (!ifp)
 3036                 /*
 3037                  * ifp may be null and rmx_mtu may be zero in certain
 3038                  * v6 cases (e.g., if ND wasn't able to resolve the
 3039                  * destination host).
 3040                  */
 3041                 goto out;
 3042         else if (ifp->if_flags & IFF_LOOPBACK)
 3043                 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
 3044         else if (tp->pf == AF_INET) {
 3045                 if (ip_mtudisc)
 3046                         mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
 3047                 else if (inp && in_localaddr(inp->inp_faddr))
 3048                         mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
 3049         }
 3050 #ifdef INET6
 3051         else if (tp->pf == AF_INET6) {
 3052                 /*
 3053                  * for IPv6, path MTU discovery is always turned on,
 3054                  * or the node must use packet size <= 1280.
 3055                  */
 3056                 mss = IN6_LINKMTU(ifp) - iphlen - sizeof(struct tcphdr);
 3057         }
 3058 #endif /* INET6 */
 3059 
 3060         /* Calculate the value that we offer in TCPOPT_MAXSEG */
 3061         if (offer != -1) {
 3062 #ifndef INET6
 3063                 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
 3064 #else
 3065                 if (tp->pf == AF_INET6)
 3066                         mssopt = IN6_LINKMTU(ifp) - iphlen -
 3067                             sizeof(struct tcphdr);
 3068                 else
 3069                         mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
 3070 #endif
 3071 
 3072                 mssopt = max(tcp_mssdflt, mssopt);
 3073         }
 3074 
 3075  out:
 3076         /*
 3077          * The current mss, t_maxseg, is initialized to the default value.
 3078          * If we compute a smaller value, reduce the current mss.
 3079          * If we compute a larger value, return it for use in sending
 3080          * a max seg size option, but don't store it for use
 3081          * unless we received an offer at least that large from peer.
 3082          * 
 3083          * However, do not accept offers lower than the minimum of
 3084          * the interface MTU and 216.
 3085          */
 3086         if (offer > 0)
 3087                 tp->t_peermss = offer;
 3088         if (tp->t_peermss)
 3089                 mss = min(mss, max(tp->t_peermss, 216));
 3090 
 3091         /* sanity - at least max opt. space */
 3092         mss = max(mss, 64);
 3093 
 3094         /*
 3095          * maxopd stores the maximum length of data AND options
 3096          * in a segment; maxseg is the amount of data in a normal
 3097          * segment.  We need to store this value (maxopd) apart
 3098          * from maxseg, because now every segment carries options
 3099          * and thus we normally have somewhat less data in segments.
 3100          */
 3101         tp->t_maxopd = mss;
 3102 
 3103         if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
 3104             (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
 3105                 mss -= TCPOLEN_TSTAMP_APPA;
 3106 #ifdef TCP_SIGNATURE
 3107         if (tp->t_flags & TF_SIGNATURE)
 3108                 mss -= TCPOLEN_SIGLEN;
 3109 #endif
 3110 
 3111         if (offer == -1) {
 3112                 /* mss changed due to Path MTU discovery */
 3113                 tp->t_flags &= ~TF_PMTUD_PEND;
 3114                 tp->t_pmtud_mtu_sent = 0;
 3115                 tp->t_pmtud_mss_acked = 0;
 3116                 if (mss < tp->t_maxseg) {
 3117                         /*
 3118                          * Follow suggestion in RFC 2414 to reduce the
 3119                          * congestion window by the ratio of the old
 3120                          * segment size to the new segment size.
 3121                          */
 3122                         tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) *
 3123                                              mss, mss);
 3124                 }
 3125         } else if (tcp_do_rfc3390) {
 3126                 /* increase initial window  */
 3127                 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380));
 3128         } else
 3129                 tp->snd_cwnd = mss;
 3130 
 3131         tp->t_maxseg = mss;
 3132 
 3133         return (offer != -1 ? mssopt : mss);
 3134 }
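
/*
 * Illustrative sketch (not part of the original file): the core arithmetic
 * of tcp_mss() in isolation, for the IPv4 case.  The usable MSS is the path
 * or interface MTU minus the IP and TCP headers, never larger than what the
 * peer offered and never smaller than a floor.  The ex_mss name and the
 * constants are illustrative simplifications, not the kernel's exact rules.
 */
#define EX_IP4_HDRLEN	20
#define EX_TCP_HDRLEN	20
#define EX_MSS_DFLT	512	/* conservative default, cf. tcp_mssdflt */
#define EX_MSS_FLOOR	64	/* keep room for options */

static int
ex_mss(int mtu, int peer_offer)
{
	int mss;

	if (mtu <= 0)
		mss = EX_MSS_DFLT;	/* no route or MTU information */
	else
		mss = mtu - EX_IP4_HDRLEN - EX_TCP_HDRLEN;
	if (peer_offer > 0 && peer_offer < mss)
		mss = peer_offer;	/* never send more than the peer's MSS */
	if (mss < EX_MSS_FLOOR)
		mss = EX_MSS_FLOOR;
	return (mss);
}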
 3135 
 3136 u_int
 3137 tcp_hdrsz(struct tcpcb *tp)
 3138 {
 3139         u_int hlen;
 3140 
 3141         switch (tp->pf) {
 3142 #ifdef INET6
 3143         case AF_INET6:
 3144                 hlen = sizeof(struct ip6_hdr);
 3145                 break;
 3146 #endif
 3147         case AF_INET:
 3148                 hlen = sizeof(struct ip);
 3149                 break;
 3150         default:
 3151                 hlen = 0;
 3152                 break;
 3153         }
 3154         hlen += sizeof(struct tcphdr);
 3155 
 3156         if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
 3157             (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
 3158                 hlen += TCPOLEN_TSTAMP_APPA;
 3159 #ifdef TCP_SIGNATURE
 3160         if (tp->t_flags & TF_SIGNATURE)
 3161                 hlen += TCPOLEN_SIGLEN;
 3162 #endif
 3163         return (hlen);
 3164 }
 3165 
 3166 /*
 3167  * Set connection variables based on the effective MSS.
 3168  * We are passed the TCPCB for the actual connection.  If we
 3169  * are the server, we are called by the compressed state engine
 3170  * when the 3-way handshake is complete.  If we are the client,
 3171  * we are called when we receive the SYN,ACK from the server.
 3172  *
 3173  * NOTE: The t_maxseg value must be initialized in the TCPCB
 3174  * before this routine is called!
 3175  */
 3176 void
 3177 tcp_mss_update(tp)
 3178         struct tcpcb *tp;
 3179 {
 3180         int mss;
 3181         u_long bufsize;
 3182         struct rtentry *rt;
 3183         struct socket *so;
 3184 
 3185         so = tp->t_inpcb->inp_socket;
 3186         mss = tp->t_maxseg;
 3187 
 3188         rt = in_pcbrtentry(tp->t_inpcb);
 3189 
 3190         if (rt == NULL)
 3191                 return;
 3192 
 3193         bufsize = so->so_snd.sb_hiwat;
 3194         if (bufsize < mss) {
 3195                 mss = bufsize;
 3196                 /* Update t_maxseg and t_maxopd */
 3197                 tcp_mss(tp, mss);
 3198         } else {
 3199                 bufsize = roundup(bufsize, mss);
 3200                 if (bufsize > sb_max)
 3201                         bufsize = sb_max;
 3202                 (void)sbreserve(&so->so_snd, bufsize);
 3203         }
 3204 
 3205         bufsize = so->so_rcv.sb_hiwat;
 3206         if (bufsize > mss) {
 3207                 bufsize = roundup(bufsize, mss);
 3208                 if (bufsize > sb_max)
 3209                         bufsize = sb_max;
 3210                 (void)sbreserve(&so->so_rcv, bufsize);
 3211         }
 3212 
 3213 }
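
/*
 * Illustrative sketch (not part of the original file): the socket buffer
 * sizing rule above in isolation.  The buffer is grown to a whole number of
 * segments and capped at a maximum, which is what the roundup()/sb_max
 * handling above does.  ex_roundup_buf is a hypothetical helper.
 */
static unsigned long
ex_roundup_buf(unsigned long bufsize, unsigned long mss, unsigned long maxbuf)
{
	if (mss == 0)
		return (bufsize);
	bufsize = ((bufsize + mss - 1) / mss) * mss;	/* roundup(bufsize, mss) */
	if (bufsize > maxbuf)
		bufsize = maxbuf;
	return (bufsize);
}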
 3214 
 3215 #if defined (TCP_SACK)
 3216 /*
 3217  * Checks for partial ack.  If partial ack arrives, force the retransmission
 3218  * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
 3219  * 1.  By setting snd_nxt to th_ack, this forces the retransmission timer to
 3220  * be started again.  If the ack advances at least to tp->snd_last, return 0.
 3221  */
 3222 int
 3223 tcp_newreno(tp, th)
 3224         struct tcpcb *tp;
 3225         struct tcphdr *th;
 3226 {
 3227         if (SEQ_LT(th->th_ack, tp->snd_last)) {
 3228                 /*
 3229                  * snd_una has not been updated and the socket send buffer
 3230                  * not yet drained of the acked data, so we have to leave
 3231                  * snd_una as it was to get the correct data offset in
 3232                  * tcp_output().
 3233                  */
 3234                 tcp_seq onxt = tp->snd_nxt;
 3235                 u_long  ocwnd = tp->snd_cwnd;
 3236                 TCP_TIMER_DISARM(tp, TCPT_REXMT);
 3237                 tp->t_rtttime = 0;
 3238                 tp->snd_nxt = th->th_ack;
 3239                 /*
 3240                  * Set snd_cwnd to one segment beyond acknowledged offset
 3241                  * (tp->snd_una not yet updated when this function is called)
 3242                  */
 3243                 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
 3244                 (void) tcp_output(tp);
 3245                 tp->snd_cwnd = ocwnd;
 3246                 if (SEQ_GT(onxt, tp->snd_nxt))
 3247                         tp->snd_nxt = onxt;
 3248                 /*
 3249                  * Partial window deflation.  Relies on fact that tp->snd_una
 3250                  * not updated yet.
 3251                  */
 3252                 if (tp->snd_cwnd > th->th_ack - tp->snd_una)
 3253                         tp->snd_cwnd -= th->th_ack - tp->snd_una;
 3254                 else
 3255                         tp->snd_cwnd = 0;
 3256                 tp->snd_cwnd += tp->t_maxseg;
 3257 
 3258                 return 1;
 3259         }
 3260         return 0;
 3261 }
 3262 #endif /* TCP_SACK */
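
/*
 * Illustrative sketch (not part of the original file): the window
 * arithmetic of the partial-ack path above, with sequence numbers as plain
 * integers.  During the single forced retransmission, cwnd is temporarily
 * one segment beyond the newly acked data (maxseg + acked); afterwards the
 * saved cwnd is deflated by the amount acked and one segment is added back.
 * struct ex_cc and ex_newreno_partialack are hypothetical names.
 */
#include <stdint.h>

struct ex_cc {
	uint32_t snd_una;	/* oldest unacknowledged sequence number */
	uint32_t snd_cwnd;	/* congestion window, in bytes */
	uint32_t maxseg;	/* segment size, in bytes */
};

static void
ex_newreno_partialack(struct ex_cc *cc, uint32_t th_ack)
{
	uint32_t acked = th_ack - cc->snd_una;	/* snd_una not yet advanced */

	if (cc->snd_cwnd > acked)
		cc->snd_cwnd -= acked;		/* partial window deflation */
	else
		cc->snd_cwnd = 0;
	cc->snd_cwnd += cc->maxseg;		/* let one new segment out */
}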
 3263 
 3264 int
 3265 tcp_mss_adv(struct ifnet *ifp, int af)
 3266 {
 3267         int mss = 0;
 3268         int iphlen;
 3269 
 3270         switch (af) {
 3271         case AF_INET:
 3272                 if (ifp != NULL)
 3273                         mss = ifp->if_mtu;
 3274                 iphlen = sizeof(struct ip);
 3275                 break;
 3276 #ifdef INET6
 3277         case AF_INET6: 
 3278                 if (ifp != NULL)
 3279                         mss = IN6_LINKMTU(ifp);
 3280                 iphlen = sizeof(struct ip6_hdr);
 3281                 break;
 3282 #endif  
 3283         }
 3284         mss = mss - iphlen - sizeof(struct tcphdr);
 3285         return (max(mss, tcp_mssdflt));
 3286 }
 3287 
 3288 /*
 3289  * TCP compressed state engine.  Currently used to hold compressed
 3290  * state for SYN_RECEIVED.
 3291  */
 3292 
 3293 u_long  syn_cache_count;
 3294 u_int32_t syn_hash1, syn_hash2;
 3295 
 3296 #define SYN_HASH(sa, sp, dp) \
 3297         ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
 3298                                      ((u_int32_t)(sp)))^syn_hash2)))
 3299 #ifndef INET6
 3300 #define SYN_HASHALL(hash, src, dst) \
 3301 do {                                                                    \
 3302         hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr,       \
 3303                 ((struct sockaddr_in *)(src))->sin_port,                \
 3304                 ((struct sockaddr_in *)(dst))->sin_port);               \
 3305 } while (/*CONSTCOND*/ 0)
 3306 #else
 3307 #define SYN_HASH6(sa, sp, dp) \
 3308         ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
 3309           (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
 3310          & 0x7fffffff)
 3311 
 3312 #define SYN_HASHALL(hash, src, dst) \
 3313 do {                                                                    \
 3314         switch ((src)->sa_family) {                                     \
 3315         case AF_INET:                                                   \
 3316                 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \
 3317                         ((struct sockaddr_in *)(src))->sin_port,        \
 3318                         ((struct sockaddr_in *)(dst))->sin_port);       \
 3319                 break;                                                  \
 3320         case AF_INET6:                                                  \
 3321                 hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \
 3322                         ((struct sockaddr_in6 *)(src))->sin6_port,      \
 3323                         ((struct sockaddr_in6 *)(dst))->sin6_port);     \
 3324                 break;                                                  \
 3325         default:                                                        \
 3326                 hash = 0;                                               \
 3327         }                                                               \
 3328 } while (/*CONSTCOND*/0)
 3329 #endif /* INET6 */
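
/*
 * Illustrative sketch (not part of the original file): the IPv4 SYN_HASH()
 * computation written out as a function.  Two random secrets, refreshed by
 * syn_cache_insert() whenever the cache empties, are mixed with the remote
 * address and the port pair so that remote hosts cannot deliberately target
 * one bucket; the bucket index is then hash % tcp_syn_cache_size.  The
 * ex_syn_hash name is hypothetical; the kernel feeds in the address and
 * ports exactly as stored in the sockaddr (network byte order).
 */
#include <stdint.h>

static uint32_t
ex_syn_hash(uint32_t src_addr, uint16_t src_port, uint16_t dst_port,
    uint32_t secret1, uint32_t secret2)
{
	uint32_t portpair = ((uint32_t)dst_port << 16) + src_port;

	return ((src_addr ^ secret1) * (portpair ^ secret2));
}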
 3330 
 3331 #define SYN_CACHE_RM(sc)                                                \
 3332 do {                                                                    \
 3333         (sc)->sc_flags |= SCF_DEAD;                                     \
 3334         TAILQ_REMOVE(&tcp_syn_cache[(sc)->sc_bucketidx].sch_bucket,     \
 3335             (sc), sc_bucketq);                                          \
 3336         (sc)->sc_tp = NULL;                                             \
 3337         LIST_REMOVE((sc), sc_tpq);                                      \
 3338         tcp_syn_cache[(sc)->sc_bucketidx].sch_length--;                 \
 3339         timeout_del(&(sc)->sc_timer);                                   \
 3340         syn_cache_count--;                                              \
 3341 } while (/*CONSTCOND*/0)
 3342 
 3343 #define SYN_CACHE_PUT(sc)                                               \
 3344 do {                                                                    \
 3345         if ((sc)->sc_ipopts)                                            \
 3346                 (void) m_free((sc)->sc_ipopts);                         \
 3347         if ((sc)->sc_route4.ro_rt != NULL)                              \
 3348                 RTFREE((sc)->sc_route4.ro_rt);                          \
 3349         timeout_set(&(sc)->sc_timer, syn_cache_reaper, (sc));           \
 3350         timeout_add(&(sc)->sc_timer, 0);                                \
 3351 } while (/*CONSTCOND*/0)
 3352 
 3353 struct pool syn_cache_pool;
 3354 
 3355 /*
 3356  * We don't estimate RTT with SYNs, so each packet starts with the default
 3357  * RTT and each timer step has a fixed timeout value.
 3358  */
 3359 #define SYN_CACHE_TIMER_ARM(sc)                                         \
 3360 do {                                                                    \
 3361         TCPT_RANGESET((sc)->sc_rxtcur,                                  \
 3362             TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \
 3363             TCPTV_REXMTMAX);                                            \
 3364         if (!timeout_initialized(&(sc)->sc_timer))                      \
 3365                 timeout_set(&(sc)->sc_timer, syn_cache_timer, (sc));    \
 3366         timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \
 3367 } while (/*CONSTCOND*/0)
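
/*
 * Illustrative sketch (not part of the original file): the retransmission
 * schedule the macro above produces.  Each arming multiplies a fixed default
 * RTT by a doubling backoff factor and clamps the result to a minimum and
 * maximum; no per-connection RTT estimate is involved.  The ex_syn_rto name
 * and the concrete constants (3-tick default, 1..64 clamp) are illustrative,
 * not necessarily the kernel's values.
 */
static int
ex_syn_rto(int rxtshift)
{
	static const int backoff[] = { 1, 2, 4, 8, 16, 32, 64, 64, 64 };
	const int nback = sizeof(backoff) / sizeof(backoff[0]);
	const int srtt_dflt = 3;		/* default RTT, slow-timer ticks */
	const int rto_min = 1, rto_max = 64;
	int rto;

	if (rxtshift < 0)
		rxtshift = 0;
	else if (rxtshift >= nback)
		rxtshift = nback - 1;
	rto = srtt_dflt * backoff[rxtshift];
	if (rto < rto_min)
		rto = rto_min;
	else if (rto > rto_max)
		rto = rto_max;
	return (rto);
}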
 3368 
 3369 #define SYN_CACHE_TIMESTAMP(sc) tcp_now + (sc)->sc_modulate
 3370 
 3371 void
 3372 syn_cache_init()
 3373 {
 3374         int i;
 3375 
 3376         /* Initialize the hash buckets. */
 3377         for (i = 0; i < tcp_syn_cache_size; i++)
 3378                 TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);
 3379 
 3380         /* Initialize the syn cache pool. */
 3381         pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
 3382             "synpl", NULL);
 3383 }
 3384 
 3385 void
 3386 syn_cache_insert(sc, tp)
 3387         struct syn_cache *sc;
 3388         struct tcpcb *tp;
 3389 {
 3390         struct syn_cache_head *scp;
 3391         struct syn_cache *sc2;
 3392         int s;
 3393 
 3394         /*
 3395          * If there are no entries in the hash table, reinitialize
 3396          * the hash secrets.
 3397          */
 3398         if (syn_cache_count == 0) {
 3399                 syn_hash1 = arc4random();
 3400                 syn_hash2 = arc4random();
 3401         }
 3402 
 3403         SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
 3404         sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
 3405         scp = &tcp_syn_cache[sc->sc_bucketidx];
 3406 
 3407         /*
 3408          * Make sure that we don't overflow the per-bucket
 3409          * limit or the total cache size limit.
 3410          */
 3411         s = splsoftnet();
 3412         if (scp->sch_length >= tcp_syn_bucket_limit) {
 3413                 tcpstat.tcps_sc_bucketoverflow++;
 3414                 /*
 3415                  * The bucket is full.  Toss the oldest element in the
 3416                  * bucket.  This will be the first entry in the bucket.
 3417                  */
 3418                 sc2 = TAILQ_FIRST(&scp->sch_bucket);
 3419 #ifdef DIAGNOSTIC
 3420                 /*
 3421                  * This should never happen; we should always find an
 3422                  * entry in our bucket.
 3423                  */
 3424                 if (sc2 == NULL)
 3425                         panic("syn_cache_insert: bucketoverflow: impossible");
 3426 #endif
 3427                 SYN_CACHE_RM(sc2);
 3428                 SYN_CACHE_PUT(sc2);
 3429         } else if (syn_cache_count >= tcp_syn_cache_limit) {
 3430                 struct syn_cache_head *scp2, *sce;
 3431 
 3432                 tcpstat.tcps_sc_overflowed++;
 3433                 /*
 3434                  * The cache is full.  Toss the oldest entry in the
 3435                  * first non-empty bucket we can find.
 3436                  *
 3437                  * XXX We would really like to toss the oldest
 3438                  * entry in the cache, but we hope that this
 3439                  * condition doesn't happen very often.
 3440                  */
 3441                 scp2 = scp;
 3442                 if (TAILQ_EMPTY(&scp2->sch_bucket)) {
 3443                         sce = &tcp_syn_cache[tcp_syn_cache_size];
 3444                         for (++scp2; scp2 != scp; scp2++) {
 3445                                 if (scp2 >= sce)
 3446                                         scp2 = &tcp_syn_cache[0];
 3447                                 if (! TAILQ_EMPTY(&scp2->sch_bucket))
 3448                                         break;
 3449                         }
 3450 #ifdef DIAGNOSTIC
 3451                         /*
 3452                          * This should never happen; we should always find a
 3453                          * non-empty bucket.
 3454                          */
 3455                         if (scp2 == scp)
 3456                                 panic("syn_cache_insert: cacheoverflow: "
 3457                                     "impossible");
 3458 #endif
 3459                 }
 3460                 sc2 = TAILQ_FIRST(&scp2->sch_bucket);
 3461                 SYN_CACHE_RM(sc2);
 3462                 SYN_CACHE_PUT(sc2);
 3463         }
 3464 
 3465         /*
 3466          * Initialize the entry's timer.
 3467          */
 3468         sc->sc_rxttot = 0;
 3469         sc->sc_rxtshift = 0;
 3470         SYN_CACHE_TIMER_ARM(sc);
 3471 
 3472         /* Link it from tcpcb entry */
 3473         LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);
 3474 
 3475         /* Put it into the bucket. */
 3476         TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
 3477         scp->sch_length++;
 3478         syn_cache_count++;
 3479 
 3480         tcpstat.tcps_sc_added++;
 3481         splx(s);
 3482 }
 3483 
 3484 /*
 3485  * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
 3486  * If we have retransmitted an entry the maximum number of times, expire
 3487  * that entry.
 3488  */
 3489 void
 3490 syn_cache_timer(void *arg)
 3491 {
 3492         struct syn_cache *sc = arg;
 3493         int s;
 3494 
 3495         s = splsoftnet();
 3496         if (sc->sc_flags & SCF_DEAD) {
 3497                 splx(s);
 3498                 return;
 3499         }
 3500 
 3501         if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
 3502                 /* Drop it -- too many retransmissions. */
 3503                 goto dropit;
 3504         }
 3505 
 3506         /*
 3507          * Compute the total amount of time this entry has
 3508          * been on a queue.  If this entry has been on longer
 3509          * than the keep alive timer would allow, expire it.
 3510          */
 3511         sc->sc_rxttot += sc->sc_rxtcur;
 3512         if (sc->sc_rxttot >= tcptv_keep_init)
 3513                 goto dropit;
 3514 
 3515         tcpstat.tcps_sc_retransmitted++;
 3516         (void) syn_cache_respond(sc, NULL);
 3517 
 3518         /* Advance the timer back-off. */
 3519         sc->sc_rxtshift++;
 3520         SYN_CACHE_TIMER_ARM(sc);
 3521 
 3522         splx(s);
 3523         return;
 3524 
 3525  dropit:
 3526         tcpstat.tcps_sc_timed_out++;
 3527         SYN_CACHE_RM(sc);
 3528         SYN_CACHE_PUT(sc);
 3529         splx(s);
 3530 }
 3531 
 3532 void
 3533 syn_cache_reaper(void *arg)
 3534 {
 3535         struct syn_cache *sc = arg;
 3536         int s;
 3537 
 3538         s = splsoftnet();
 3539         pool_put(&syn_cache_pool, (sc));
 3540         splx(s);
 3541         return;
 3542 }
 3543 
 3544 /*
 3545  * Remove the syn cache entries created by the specified tcb entry,
 3546  * since it makes no sense to keep them around
 3547  * (if there's no tcb entry, the syn cache entries will never be used).
 3548  */
 3549 void
 3550 syn_cache_cleanup(tp)
 3551         struct tcpcb *tp;
 3552 {
 3553         struct syn_cache *sc, *nsc;
 3554         int s;
 3555 
 3556         s = splsoftnet();
 3557 
 3558         for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
 3559                 nsc = LIST_NEXT(sc, sc_tpq);
 3560 
 3561 #ifdef DIAGNOSTIC
 3562                 if (sc->sc_tp != tp)
 3563                         panic("invalid sc_tp in syn_cache_cleanup");
 3564 #endif
 3565                 SYN_CACHE_RM(sc);
 3566                 SYN_CACHE_PUT(sc);
 3567         }
 3568         /* just for safety */
 3569         LIST_INIT(&tp->t_sc);
 3570 
 3571         splx(s);
 3572 }
 3573 
 3574 /*
 3575  * Find an entry in the syn cache.
 3576  */
 3577 struct syn_cache *
 3578 syn_cache_lookup(src, dst, headp)
 3579         struct sockaddr *src;
 3580         struct sockaddr *dst;
 3581         struct syn_cache_head **headp;
 3582 {
 3583         struct syn_cache *sc;
 3584         struct syn_cache_head *scp;
 3585         u_int32_t hash;
 3586         int s;
 3587 
 3588         SYN_HASHALL(hash, src, dst);
 3589 
 3590         scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
 3591         *headp = scp;
 3592         s = splsoftnet();
 3593         for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
 3594              sc = TAILQ_NEXT(sc, sc_bucketq)) {
 3595                 if (sc->sc_hash != hash)
 3596                         continue;
 3597                 if (!bcmp(&sc->sc_src, src, src->sa_len) &&
 3598                     !bcmp(&sc->sc_dst, dst, dst->sa_len)) {
 3599                         splx(s);
 3600                         return (sc);
 3601                 }
 3602         }
 3603         splx(s);
 3604         return (NULL);
 3605 }
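
/*
 * Illustrative sketch (not part of the original file): the shape of the
 * lookup above on a simplified cache.  Hash the address/port pair, pick a
 * bucket, then walk the bucket comparing the full key, since unrelated keys
 * can collide on the hash.  The ex_sc_* types and names are hypothetical
 * stand-ins for the kernel's syn_cache structures.
 */
#include <stdint.h>
#include <string.h>

struct ex_sc_key {
	uint32_t saddr, daddr;
	uint16_t sport, dport;
};

struct ex_sc_entry {
	struct ex_sc_entry *next;
	uint32_t hash;
	struct ex_sc_key key;
};

static struct ex_sc_entry *
ex_sc_lookup(struct ex_sc_entry **buckets, unsigned int nbuckets,
    uint32_t hash, const struct ex_sc_key *key)
{
	struct ex_sc_entry *sc;

	for (sc = buckets[hash % nbuckets]; sc != NULL; sc = sc->next) {
		if (sc->hash != hash)
			continue;	/* cheap reject before full compare */
		if (memcmp(&sc->key, key, sizeof(*key)) == 0)
			return (sc);
	}
	return (NULL);
}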
 3606 
 3607 /*
 3608  * This function gets called when we receive an ACK for a
 3609  * socket in the LISTEN state.  We look up the connection
 3610  * in the syn cache, and if it's there, we pull it out of
 3611  * the cache and turn it into a full-blown connection in
 3612  * the SYN-RECEIVED state.
 3613  *
 3614  * The return values may not be immediately obvious, and their effects
 3615  * can be subtle, so here they are:
 3616  *
 3617  *      NULL    SYN was not found in cache; caller should drop the
 3618  *              packet and send an RST.
 3619  *
 3620  *      -1      We were unable to create the new connection, and are
 3621  *              aborting it.  An ACK,RST is being sent to the peer
 3622  *              (unless we got screwy sequence numbers; see below),
 3623  *              because the 3-way handshake has been completed.  Caller
 3624  *              should not free the mbuf, since we may be using it.  If
 3625  *              we are not, we will free it.
 3626  *
 3627  *      Otherwise, the return value is a pointer to the new socket
 3628  *      associated with the connection.
 3629  */
 3630 struct socket *
 3631 syn_cache_get(src, dst, th, hlen, tlen, so, m)
 3632         struct sockaddr *src;
 3633         struct sockaddr *dst;
 3634         struct tcphdr *th;
 3635         unsigned int hlen, tlen;
 3636         struct socket *so;
 3637         struct mbuf *m;
 3638 {
 3639         struct syn_cache *sc;
 3640         struct syn_cache_head *scp;
 3641         struct inpcb *inp = NULL;
 3642         struct tcpcb *tp = 0;
 3643         struct mbuf *am;
 3644         int s;
 3645         struct socket *oso;
 3646 
 3647         s = splsoftnet();
 3648         if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
 3649                 splx(s);
 3650                 return (NULL);
 3651         }
 3652 
 3653         /*
 3654          * Verify the sequence and ack numbers.  Try getting the correct
 3655          * response again.
 3656          */
 3657         if ((th->th_ack != sc->sc_iss + 1) ||
 3658             SEQ_LEQ(th->th_seq, sc->sc_irs) ||
 3659             SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
 3660                 (void) syn_cache_respond(sc, m);
 3661                 splx(s);
 3662                 return ((struct socket *)(-1));
 3663         }
 3664 
 3665         /* Remove this cache entry */
 3666         SYN_CACHE_RM(sc);
 3667         splx(s);
 3668 
 3669         /*
 3670          * Ok, create the full blown connection, and set things up
 3671          * as they would have been set up if we had created the
 3672          * connection when the SYN arrived.  If we can't create
 3673          * the connection, abort it.
 3674          */
 3675         oso = so;
 3676         so = sonewconn(so, SS_ISCONNECTED);
 3677         if (so == NULL)
 3678                 goto resetandabort;
 3679 
 3680         inp = sotoinpcb(oso);
 3681 #ifdef IPSEC
 3682         /*
 3683          * We need to copy the required security levels
 3684          * from the old pcb. Ditto for any other
 3685          * IPsec-related information.
 3686          */
 3687         {
 3688           struct inpcb *newinp = (struct inpcb *)so->so_pcb;
 3689           bcopy(inp->inp_seclevel, newinp->inp_seclevel,
 3690                 sizeof(inp->inp_seclevel));
 3691           newinp->inp_secrequire = inp->inp_secrequire;
 3692           if (inp->inp_ipo != NULL) {
 3693                   newinp->inp_ipo = inp->inp_ipo;
 3694                   inp->inp_ipo->ipo_ref_count++;
 3695           }
 3696           if (inp->inp_ipsec_remotecred != NULL) {
 3697                   newinp->inp_ipsec_remotecred = inp->inp_ipsec_remotecred;
 3698                   inp->inp_ipsec_remotecred->ref_count++;
 3699           }
 3700           if (inp->inp_ipsec_remoteauth != NULL) {
 3701                   newinp->inp_ipsec_remoteauth
 3702                       = inp->inp_ipsec_remoteauth;
 3703                   inp->inp_ipsec_remoteauth->ref_count++;
 3704           }
 3705         }
 3706 #endif /* IPSEC */
 3707 #ifdef INET6
 3708         /*
 3709          * inp still has the OLD in_pcb stuff, set the
 3710          * v6-related flags on the new guy, too.
 3711          */
 3712         {
 3713           int flags = inp->inp_flags;
 3714           struct inpcb *oldinpcb = inp;
 3715 
 3716           inp = (struct inpcb *)so->so_pcb;
 3717           inp->inp_flags |= (flags & INP_IPV6);
 3718           if ((inp->inp_flags & INP_IPV6) != 0) {
 3719             inp->inp_ipv6.ip6_hlim =
 3720               oldinpcb->inp_ipv6.ip6_hlim;
 3721           }
 3722         }
 3723 #else /* INET6 */
 3724         inp = (struct inpcb *)so->so_pcb;
 3725 #endif /* INET6 */
 3726 
 3727         inp->inp_lport = th->th_dport;
 3728         switch (src->sa_family) {
 3729 #ifdef INET6
 3730         case AF_INET6:
 3731                 inp->inp_laddr6 = ((struct sockaddr_in6 *)dst)->sin6_addr;
 3732                 break;
 3733 #endif /* INET6 */
 3734         case AF_INET:
 3735 
 3736                 inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr;
 3737                 inp->inp_options = ip_srcroute();
 3738                 if (inp->inp_options == NULL) {
 3739                         inp->inp_options = sc->sc_ipopts;
 3740                         sc->sc_ipopts = NULL;
 3741                 }
 3742                 break;
 3743         }
 3744         in_pcbrehash(inp);
 3745 
 3746         /*
 3747          * Give the new socket our cached route reference.
 3748          */
 3749         if (src->sa_family == AF_INET)
 3750                 inp->inp_route = sc->sc_route4;         /* struct assignment */
 3751 #ifdef INET6
 3752         else
 3753                 inp->inp_route6 = sc->sc_route6;
 3754 #endif  
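              /*
               * Clear the cache entry's route pointer so that releasing the
               * entry below does not also release the reference we just
               * handed to the new pcb (sc_route4 and sc_route6 appear to
               * share the same storage, so clearing ro_rt once covers both).
               */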
 3755         sc->sc_route4.ro_rt = NULL;
 3756 
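              /*
               * in_pcbconnect()/in6_pcbconnect() take the foreign address
               * wrapped in an mbuf, so build a temporary MT_SONAME mbuf
               * holding a copy of the peer's sockaddr.
               */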
 3757         am = m_get(M_DONTWAIT, MT_SONAME);      /* XXX */
 3758         if (am == NULL)
 3759                 goto resetandabort;
 3760         am->m_len = src->sa_len;
 3761         bcopy(src, mtod(am, caddr_t), src->sa_len);
 3762 
 3763         switch (src->sa_family) {
 3764         case AF_INET:
 3765                 /* drop IPv4 packet to AF_INET6 socket */
 3766                 if (inp->inp_flags & INP_IPV6) {
 3767                         (void) m_free(am);
 3768                         goto resetandabort;
 3769                 }
 3770                 if (in_pcbconnect(inp, am)) {
 3771                         (void) m_free(am);
 3772                         goto resetandabort;
 3773                 }
 3774                 break;
 3775 #ifdef INET6
 3776         case AF_INET6:
 3777                 if (in6_pcbconnect(inp, am)) {
 3778                         (void) m_free(am);
 3779                         goto resetandabort;
 3780                 }
 3781                 break;
 3782 #endif
 3783         }
 3784         (void) m_free(am);
 3785 
 3786         tp = intotcpcb(inp);
 3787         tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
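              /*
               * A request_r_scale of 15 is the cache's sentinel for "window
               * scaling was not negotiated" (see syn_cache_add()); anything
               * else means both sides agreed to scale their windows.
               */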
 3788         if (sc->sc_request_r_scale != 15) {
 3789                 tp->requested_s_scale = sc->sc_requested_s_scale;
 3790                 tp->request_r_scale = sc->sc_request_r_scale;
 3791                 tp->snd_scale = sc->sc_requested_s_scale;
 3792                 tp->rcv_scale = sc->sc_request_r_scale;
 3793                 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
 3794         }
 3795         if (sc->sc_flags & SCF_TIMESTAMP)
 3796                 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
 3797 
 3798         tp->t_template = tcp_template(tp);
 3799         if (tp->t_template == 0) {
 3800                 tp = tcp_drop(tp, ENOBUFS);     /* destroys socket */
 3801                 so = NULL;
 3802                 m_freem(m);
 3803                 goto abort;
 3804         }
 3805 #ifdef TCP_SACK
 3806         tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT;
 3807 #endif
 3808 
 3809         tp->ts_modulate = sc->sc_modulate;
 3810         tp->iss = sc->sc_iss;
 3811         tp->irs = sc->sc_irs;
 3812         tcp_sendseqinit(tp);
 3813 #if defined (TCP_SACK) || defined(TCP_ECN)
 3814         tp->snd_last = tp->snd_una;
 3815 #endif /* TCP_SACK || TCP_ECN */
 3816 #if defined(TCP_SACK) && defined(TCP_FACK)
 3817         tp->snd_fack = tp->snd_una;
 3818         tp->retran_data = 0;
 3819         tp->snd_awnd = 0;
 3820 #endif /* TCP_FACK */
 3821 #ifdef TCP_ECN
 3822         if (sc->sc_flags & SCF_ECN_PERMIT) {
 3823                 tp->t_flags |= TF_ECN_PERMIT;
 3824                 tcpstat.tcps_ecn_accepts++;
 3825         }
 3826 #endif
 3827 #ifdef TCP_SACK
 3828         if (sc->sc_flags & SCF_SACK_PERMIT)
 3829                 tp->t_flags |= TF_SACK_PERMIT;
 3830 #endif
 3831 #ifdef TCP_SIGNATURE
 3832         if (sc->sc_flags & SCF_SIGNATURE)
 3833                 tp->t_flags |= TF_SIGNATURE;
 3834 #endif
 3835         tcp_rcvseqinit(tp);
 3836         tp->t_state = TCPS_SYN_RECEIVED;
 3837         tp->t_rcvtime = tcp_now;
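              /*
               * Arm the keepalive timer with the connection-establishment
               * timeout so a handshake that never completes is eventually
               * dropped.
               */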
 3838         TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
 3839         tcpstat.tcps_accepts++;
 3840 
 3841         tcp_mss(tp, sc->sc_peermaxseg);  /* sets t_maxseg */
 3842         if (sc->sc_peermaxseg)
 3843                 tcp_mss_update(tp);
 3844         /* Reset initial window to 1 segment for retransmit */
 3845         if (sc->sc_rxtshift > 0)
 3846                 tp->snd_cwnd = tp->t_maxseg;
 3847         tp->snd_wl1 = sc->sc_irs;
 3848         tp->rcv_up = sc->sc_irs + 1;
 3849 
 3850         /*
 3851          * This is what would have happened in tcp_output() when
 3852          * the SYN,ACK was sent.
 3853          */
 3854         tp->snd_up = tp->snd_una;
 3855         tp->snd_max = tp->snd_nxt = tp->iss+1;
 3856         TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
 3857         if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
 3858                 tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
 3859         tp->last_ack_sent = tp->rcv_nxt;
 3860 
 3861         tcpstat.tcps_sc_completed++;
 3862         SYN_CACHE_PUT(sc);
 3863         return (so);
 3864 
 3865 resetandabort:
 3866         tcp_respond(NULL, mtod(m, caddr_t), m, (tcp_seq)0, th->th_ack, TH_RST);
 3867 abort:
 3868         if (so != NULL)
 3869                 (void) soabort(so);
 3870         SYN_CACHE_PUT(sc);
 3871         tcpstat.tcps_sc_aborted++;
 3872         return ((struct socket *)(-1));
 3873 }
 3874 
 3875 /*
 3876  * This function is called when we get a RST for a
 3877  * non-existent connection, so that we can see if the
 3878  * connection is in the syn cache.  If it is, zap it.
 3879  */
 3880 
 3881 void
 3882 syn_cache_reset(src, dst, th)
 3883         struct sockaddr *src;
 3884         struct sockaddr *dst;
 3885         struct tcphdr *th;
 3886 {
 3887         struct syn_cache *sc;
 3888         struct syn_cache_head *scp;
 3889         int s = splsoftnet();
 3890 
 3891         if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
 3892                 splx(s);
 3893                 return;
 3894         }
 3895         if (SEQ_LT(th->th_seq, sc->sc_irs) ||
 3896             SEQ_GT(th->th_seq, sc->sc_irs+1)) {
 3897                 splx(s);
 3898                 return;
 3899         }
 3900         SYN_CACHE_RM(sc);
 3901         splx(s);
 3902         tcpstat.tcps_sc_reset++;
 3903         SYN_CACHE_PUT(sc);
 3904 }
 3905 
 3906 void
 3907 syn_cache_unreach(src, dst, th)
 3908         struct sockaddr *src;
 3909         struct sockaddr *dst;
 3910         struct tcphdr *th;
 3911 {
 3912         struct syn_cache *sc;
 3913         struct syn_cache_head *scp;
 3914         int s;
 3915 
 3916         s = splsoftnet();
 3917         if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
 3918                 splx(s);
 3919                 return;
 3920         }
 3921         /* If the sequence number != sc_iss, then it's a bogus ICMP msg */
 3922         if (ntohl (th->th_seq) != sc->sc_iss) {
 3923                 splx(s);
 3924                 return;
 3925         }
 3926 
 3927         /*
 3928          * If we've retransmitted 3 times and this is our second error,
 3929          * we remove the entry.  Otherwise, we allow it to continue on.
 3930          * This prevents us from incorrectly nuking an entry during a
 3931          * spurious network outage.
 3932          *
 3933          * See tcp_notify().
 3934          */
 3935         if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
 3936                 sc->sc_flags |= SCF_UNREACH;
 3937                 splx(s);
 3938                 return;
 3939         }
 3940 
 3941         SYN_CACHE_RM(sc);
 3942         splx(s);
 3943         tcpstat.tcps_sc_unreach++;
 3944         SYN_CACHE_PUT(sc);
 3945 }
 3946 
 3947 /*
 3948  * Given a LISTEN socket and an inbound SYN request, add
 3949  * this to the syn cache, and send back a segment:
 3950  *      <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
 3951  * to the source.
 3952  *
 3953  * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
 3954  * Doing so would require that we hold onto the data and deliver it
 3955  * to the application.  However, if we are the target of a SYN-flood
 3956  * DoS attack, an attacker could send data which would eventually
 3957  * consume all available buffer space if it were ACKed.  By not ACKing
 3958  * the data, we avoid this DoS scenario.
 3959  */
 3960 
 3961 int
 3962 syn_cache_add(src, dst, th, iphlen, so, m, optp, optlen, oi, issp)
 3963         struct sockaddr *src;
 3964         struct sockaddr *dst;
 3965         struct tcphdr *th;
 3966         unsigned int iphlen;
 3967         struct socket *so;
 3968         struct mbuf *m;
 3969         u_char *optp;
 3970         int optlen;
 3971         struct tcp_opt_info *oi;
 3972         tcp_seq *issp;
 3973 {
 3974         struct tcpcb tb, *tp;
 3975         long win;
 3976         struct syn_cache *sc;
 3977         struct syn_cache_head *scp;
 3978         struct mbuf *ipopts;
 3979 
 3980         tp = sototcpcb(so);
 3981 
 3982         /*
 3983          * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
 3984          *
 3985          * Note this check is performed in tcp_input() very early on.
 3986          */
 3987 
 3988         /*
 3989          * Initialize some local state.
 3990          */
 3991         win = sbspace(&so->so_rcv);
 3992         if (win > TCP_MAXWIN)
 3993                 win = TCP_MAXWIN;
 3994 
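              /*
               * Parse the peer's TCP options into a scratch tcpcb (tb) so we
               * can record what was offered (window scaling, timestamps,
               * SACK, signatures) without touching the listening socket's
               * tcpcb.
               */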
 3995 #ifdef TCP_SIGNATURE
 3996         if (optp || (tp->t_flags & TF_SIGNATURE)) {
 3997 #else
 3998         if (optp) {
 3999 #endif
 4000                 tb.pf = tp->pf;
 4001 #ifdef TCP_SACK
 4002                 tb.sack_enable = tp->sack_enable;
 4003 #endif
 4004                 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
 4005 #ifdef TCP_SIGNATURE
 4006                 if (tp->t_flags & TF_SIGNATURE)
 4007                         tb.t_flags |= TF_SIGNATURE;
 4008 #endif
 4009                 tb.t_state = TCPS_LISTEN;
 4010                 if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi))
 4011                         return (0);
 4012         } else
 4013                 tb.t_flags = 0;
 4014 
 4015         switch (src->sa_family) {
 4016 #ifdef INET
 4017         case AF_INET:
 4018                 /*
 4019                  * Remember the IP options, if any.
 4020                  */
 4021                 ipopts = ip_srcroute();
 4022                 break;
 4023 #endif
 4024         default:
 4025                 ipopts = NULL;
 4026         }
 4027 
 4028         /*
 4029          * See if we already have an entry for this connection.
 4030          * If we do, resend the SYN,ACK.  We do not count this
 4031          * as a retransmission (XXX though maybe we should).
 4032          */
 4033         if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
 4034                 tcpstat.tcps_sc_dupesyn++;
 4035                 if (ipopts) {
 4036                         /*
 4037                          * If we were remembering a previous source route,
 4038                          * forget it and use the new one we've been given.
 4039                          */
 4040                         if (sc->sc_ipopts)
 4041                                 (void) m_free(sc->sc_ipopts);
 4042                         sc->sc_ipopts = ipopts;
 4043                 }
 4044                 sc->sc_timestamp = tb.ts_recent;
 4045                 if (syn_cache_respond(sc, m) == 0) {
 4046                         tcpstat.tcps_sndacks++;
 4047                         tcpstat.tcps_sndtotal++;
 4048                 }
 4049                 return (1);
 4050         }
 4051 
 4052         sc = pool_get(&syn_cache_pool, PR_NOWAIT);
 4053         if (sc == NULL) {
 4054                 if (ipopts)
 4055                         (void) m_free(ipopts);
 4056                 return (0);
 4057         }
 4058 
 4059         /*
 4060          * Fill in the cache, and put the necessary IP and TCP
 4061          * options into the reply.
 4062          */
 4063         bzero(sc, sizeof(struct syn_cache));
 4064         bzero(&sc->sc_timer, sizeof(sc->sc_timer));
 4065         bcopy(src, &sc->sc_src, src->sa_len);
 4066         bcopy(dst, &sc->sc_dst, dst->sa_len);
 4067         sc->sc_flags = 0;
 4068         sc->sc_ipopts = ipopts;
 4069         sc->sc_irs = th->th_seq;
 4070 
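              /*
               * Pick our initial send sequence number: the 4.2BSD-style
               * incrementing ISS under TCP_COMPAT_42, otherwise the value
               * supplied by the caller (issp) or a fresh random one.
               */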
 4071 #ifdef TCP_COMPAT_42
 4072         tcp_iss += TCP_ISSINCR/2;
 4073         sc->sc_iss = tcp_iss;
 4074 #else
 4075         sc->sc_iss = issp ? *issp : arc4random();
 4076 #endif
 4077         sc->sc_peermaxseg = oi->maxseg;
 4078         sc->sc_ourmaxseg = tcp_mss_adv(m->m_flags & M_PKTHDR ?
 4079             m->m_pkthdr.rcvif : NULL, sc->sc_src.sa.sa_family);
 4080         sc->sc_win = win;
 4081         sc->sc_timestamp = tb.ts_recent;
 4082         if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
 4083             (TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
 4084                 sc->sc_flags |= SCF_TIMESTAMP;
 4085                 sc->sc_modulate = arc4random();
 4086         }
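              /*
               * If both sides do window scaling, pick the smallest receive
               * shift that lets the full receive buffer be advertised within
               * a 16-bit window; otherwise record 15 as the "no window
               * scaling" sentinel.
               */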
 4087         if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 4088             (TF_RCVD_SCALE|TF_REQ_SCALE)) {
 4089                 sc->sc_requested_s_scale = tb.requested_s_scale;
 4090                 sc->sc_request_r_scale = 0;
 4091                 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
 4092                     TCP_MAXWIN << sc->sc_request_r_scale <
 4093                     so->so_rcv.sb_hiwat)
 4094                         sc->sc_request_r_scale++;
 4095         } else {
 4096                 sc->sc_requested_s_scale = 15;
 4097                 sc->sc_request_r_scale = 15;
 4098         }
 4099 #ifdef TCP_ECN
 4100         /*
 4101          * if both ECE and CWR flag bits are set, peer is ECN capable.
 4102          */
 4103         if (tcp_do_ecn &&
 4104             (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
 4105                 sc->sc_flags |= SCF_ECN_PERMIT;
 4106 #endif
 4107 #ifdef TCP_SACK
 4108         /*
 4109          * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
 4110          * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
 4111          */
 4112         if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT))
 4113                 sc->sc_flags |= SCF_SACK_PERMIT;
 4114 #endif
 4115 #ifdef TCP_SIGNATURE
 4116         if (tb.t_flags & TF_SIGNATURE)
 4117                 sc->sc_flags |= SCF_SIGNATURE;
 4118 #endif
 4119         sc->sc_tp = tp;
 4120         if (syn_cache_respond(sc, m) == 0) {
 4121                 syn_cache_insert(sc, tp);
 4122                 tcpstat.tcps_sndacks++;
 4123                 tcpstat.tcps_sndtotal++;
 4124         } else {
 4125                 SYN_CACHE_PUT(sc);
 4126                 tcpstat.tcps_sc_dropped++;
 4127         }
 4128         return (1);
 4129 }
 4130 
 4131 int
 4132 syn_cache_respond(sc, m)
 4133         struct syn_cache *sc;
 4134         struct mbuf *m;
 4135 {
 4136         struct route *ro;
 4137         u_int8_t *optp;
 4138         int optlen, error;
 4139         u_int16_t tlen;
 4140         struct ip *ip = NULL;
 4141 #ifdef INET6
 4142         struct ip6_hdr *ip6 = NULL;
 4143 #endif
 4144         struct tcphdr *th;
 4145         u_int hlen;
 4146         struct inpcb *inp;
 4147 
 4148         switch (sc->sc_src.sa.sa_family) {
 4149         case AF_INET:
 4150                 hlen = sizeof(struct ip);
 4151                 ro = &sc->sc_route4;
 4152                 break;
 4153 #ifdef INET6
 4154         case AF_INET6:
 4155                 hlen = sizeof(struct ip6_hdr);
 4156                 ro = (struct route *)&sc->sc_route6;
 4157                 break;
 4158 #endif
 4159         default:
 4160                 if (m)
 4161                         m_freem(m);
 4162                 return (EAFNOSUPPORT);
 4163         }
 4164 
 4165         /* Compute the size of the TCP options. */
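              /*
               * The 4-byte MSS option is always sent; the others are included
               * only when the corresponding SCF_* flag was recorded for this
               * cache entry.
               */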
 4166         optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
 4167 #ifdef TCP_SACK
 4168             ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) +
 4169 #endif
 4170 #ifdef TCP_SIGNATURE
 4171             ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
 4172 #endif
 4173             ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);
 4174 
 4175         tlen = hlen + sizeof(struct tcphdr) + optlen;
 4176 
 4177         /*
 4178          * Create the IP+TCP header from scratch.
 4179          */
 4180         if (m)
 4181                 m_freem(m);
 4182 #ifdef DIAGNOSTIC
 4183         if (max_linkhdr + tlen > MCLBYTES)
 4184                 return (ENOBUFS);
 4185 #endif
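              /*
               * Grab a packet header mbuf, attaching a cluster when the link
               * header plus our IP+TCP headers and options will not fit in a
               * plain header mbuf.
               */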
 4186         MGETHDR(m, M_DONTWAIT, MT_DATA);
 4187         if (m && max_linkhdr + tlen > MHLEN) {
 4188                 MCLGET(m, M_DONTWAIT);
 4189                 if ((m->m_flags & M_EXT) == 0) {
 4190                         m_freem(m);
 4191                         m = NULL;
 4192                 }
 4193         }
 4194         if (m == NULL)
 4195                 return (ENOBUFS);
 4196 
 4197         /* Fixup the mbuf. */
 4198         m->m_data += max_linkhdr;
 4199         m->m_len = m->m_pkthdr.len = tlen;
 4200         m->m_pkthdr.rcvif = NULL;
 4201         memset(mtod(m, u_char *), 0, tlen);
 4202 
 4203         switch (sc->sc_src.sa.sa_family) {
 4204         case AF_INET:
 4205                 ip = mtod(m, struct ip *);
 4206                 ip->ip_dst = sc->sc_src.sin.sin_addr;
 4207                 ip->ip_src = sc->sc_dst.sin.sin_addr;
 4208                 ip->ip_p = IPPROTO_TCP;
 4209                 th = (struct tcphdr *)(ip + 1);
 4210                 th->th_dport = sc->sc_src.sin.sin_port;
 4211                 th->th_sport = sc->sc_dst.sin.sin_port;
 4212                 break;
 4213 #ifdef INET6
 4214         case AF_INET6:
 4215                 ip6 = mtod(m, struct ip6_hdr *);
 4216                 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
 4217                 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
 4218                 ip6->ip6_nxt = IPPROTO_TCP;
 4219                 /* ip6_plen will be updated in ip6_output() */
 4220                 th = (struct tcphdr *)(ip6 + 1);
 4221                 th->th_dport = sc->sc_src.sin6.sin6_port;
 4222                 th->th_sport = sc->sc_dst.sin6.sin6_port;
 4223                 break;
 4224 #endif
 4225         default:
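                      /*
                       * Not reached: unsupported address families were
                       * rejected at the top of this function.
                       */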
 4226                 th = NULL;
 4227         }
 4228 
 4229         th->th_seq = htonl(sc->sc_iss);
 4230         th->th_ack = htonl(sc->sc_irs + 1);
 4231         th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
 4232         th->th_flags = TH_SYN|TH_ACK;
 4233 #ifdef TCP_ECN
 4234         /* Set ECE for SYN-ACK if peer supports ECN. */
 4235         if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
 4236                 th->th_flags |= TH_ECE;
 4237 #endif
 4238         th->th_win = htons(sc->sc_win);
 4239         /* th_sum already 0 */
 4240         /* th_urp already 0 */
 4241 
 4242         /* Tack on the TCP options. */
 4243         optp = (u_int8_t *)(th + 1);
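              /* MSS option: kind, length 4, then our MSS high byte first. */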
 4244         *optp++ = TCPOPT_MAXSEG;
 4245         *optp++ = 4;
 4246         *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
 4247         *optp++ = sc->sc_ourmaxseg & 0xff;
 4248 
 4249 #ifdef TCP_SACK
 4250         /* Include the SACK_PERMITTED option if the peer offered SACK as well. */
 4251         if (sc->sc_flags & SCF_SACK_PERMIT) {
 4252                 *((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
 4253                 optp += 4;
 4254         }
 4255 #endif
 4256 
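              /*
               * Window scale option, preceded by a NOP so that kind, length
               * and shift count pack into a single 32-bit word; a shift of
               * 15 again means scaling was not negotiated and the option is
               * omitted.
               */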
 4257         if (sc->sc_request_r_scale != 15) {
 4258                 *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
 4259                     TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
 4260                     sc->sc_request_r_scale);
 4261                 optp += 4;
 4262         }
 4263 
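              /*
               * The timestamp option carries our own timestamp (modulated by
               * the per-connection sc_modulate value) as TSval and echoes the
               * peer's last timestamp, saved in sc_timestamp, as TSecr.
               */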
 4264         if (sc->sc_flags & SCF_TIMESTAMP) {
 4265                 u_int32_t *lp = (u_int32_t *)(optp);
 4266                 /* Form timestamp option as shown in appendix A of RFC 1323. */
 4267                 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
 4268                 *lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
 4269                 *lp   = htonl(sc->sc_timestamp);
 4270                 optp += TCPOLEN_TSTAMP_APPA;
 4271         }
 4272 
 4273 #ifdef TCP_SIGNATURE
 4274         if (sc->sc_flags & SCF_SIGNATURE) {
 4275                 union sockaddr_union src, dst;
 4276                 struct tdb *tdb;
 4277 
 4278                 bzero(&src, sizeof(union sockaddr_union));
 4279                 bzero(&dst, sizeof(union sockaddr_union));
 4280                 src.sa.sa_len = sc->sc_src.sa.sa_len;
 4281                 src.sa.sa_family = sc->sc_src.sa.sa_family;
 4282                 dst.sa.sa_len = sc->sc_dst.sa.sa_len;
 4283                 dst.sa.sa_family = sc->sc_dst.sa.sa_family;
 4284 
 4285                 switch (sc->sc_src.sa.sa_family) {
 4286                 case 0: /*default to PF_INET*/
 4287 #ifdef INET
 4288                 case AF_INET:
 4289                         src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
 4290                         dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
 4291                         break;
 4292 #endif /* INET */
 4293 #ifdef INET6
 4294                 case AF_INET6:
 4295                         src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
 4296                         dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
 4297                         break;
 4298 #endif /* INET6 */
 4299                 }
 4300 
 4301                 tdb = gettdbbysrcdst(0, &src, &dst, IPPROTO_TCP);
 4302                 if (tdb == NULL) {
 4303                         if (m)
 4304                                 m_freem(m);
 4305                         return (EPERM);
 4306                 }
 4307 
 4308                 /* Send signature option */
 4309                 *(optp++) = TCPOPT_SIGNATURE;
 4310                 *(optp++) = TCPOLEN_SIGNATURE;
 4311 
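                      /*
                       * tcp_signature() is expected to write the 16-byte MD5
                       * digest (TCP-MD5, RFC 2385) at optp, hence the
                       * optp += 16 below.
                       */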
 4312                 if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
 4313                     hlen, 0, optp) < 0) {
 4314                         if (m)
 4315                                 m_freem(m);
 4316                         return (EINVAL);
 4317                 }
 4318                 optp += 16;
 4319 
 4320                 /* Pad options list to the next 32 bit boundary and
 4321                  * terminate it.
 4322                  */
 4323                 *optp++ = TCPOPT_NOP;
 4324                 *optp++ = TCPOPT_EOL;
 4325         }
 4326 #endif /* TCP_SIGNATURE */
 4327 
 4328         /* Compute the packet's checksum. */
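              /*
               * For IPv4 this uses the usual trick: with the otherwise zeroed
               * IP header holding only the addresses, the protocol and ip_len
               * set to the TCP length, in_cksum() over the whole buffer
               * yields the TCP checksum including the pseudo-header.
               */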
 4329         switch (sc->sc_src.sa.sa_family) {
 4330         case AF_INET:
 4331                 ip->ip_len = htons(tlen - hlen);
 4332                 th->th_sum = 0;
 4333                 th->th_sum = in_cksum(m, tlen);
 4334                 break;
 4335 #ifdef INET6
 4336         case AF_INET6:
 4337                 ip6->ip6_plen = htons(tlen - hlen);
 4338                 th->th_sum = 0;
 4339                 th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
 4340                 break;
 4341 #endif
 4342         }
 4343 
 4344         /* use IPsec policy and ttl from listening socket, on SYN ACK */
 4345         inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;
 4346 
 4347         /*
 4348          * Fill in some straggling IP bits.  Note that ip_len is kept
 4349          * in network byte order here, as the output path expects.
 4350          */
 4351         switch (sc->sc_src.sa.sa_family) {
 4352 #ifdef INET
 4353         case AF_INET:
 4354                 ip->ip_len = htons(tlen);
 4355                 ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
 4356                 /* XXX tos? */
 4357                 break;
 4358 #endif
 4359 #ifdef INET6
 4360         case AF_INET6:
 4361                 ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
 4362                 ip6->ip6_vfc |= IPV6_VERSION;
 4363                 ip6->ip6_plen = htons(tlen - hlen);
 4364                 /* ip6_hlim will be initialized afterwards */
 4365                 /* leave flowlabel = 0, which is legal and requires no state mgmt */
 4366                 break;
 4367 #endif
 4368         }
 4369 
 4370         switch (sc->sc_src.sa.sa_family) {
 4371 #ifdef INET
 4372         case AF_INET:
 4373                 error = ip_output(m, sc->sc_ipopts, ro,
 4374                     (ip_mtudisc ? IP_MTUDISC : 0), 
 4375                     (struct ip_moptions *)NULL, inp);
 4376                 break;
 4377 #endif
 4378 #ifdef INET6
 4379         case AF_INET6:
 4380                 ip6->ip6_hlim = in6_selecthlim(NULL,
 4381                                 ro->ro_rt ? ro->ro_rt->rt_ifp : NULL);
 4382 
 4383                 error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 0,
 4384                         (struct ip6_moptions *)0, NULL, NULL);
 4385                 break;
 4386 #endif
 4387         default:
 4388                 error = EAFNOSUPPORT;
 4389                 break;
 4390         }
 4391         return (error);
 4392 }
