root/netinet/ip_mroute.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. mfc_find
  2. ip_mrouter_set
  3. ip_mrouter_get
  4. mrt_ioctl
  5. get_sg_cnt
  6. get_vif_cnt
  7. ip_mrouter_init
  8. ip_mrouter_done
  9. ip_mrouter_detach
  10. get_version
  11. set_assert
  12. get_assert
  13. set_api_config
  14. get_api_support
  15. get_api_config
  16. add_vif
  17. reset_vif
  18. del_vif
  19. vif_delete
  20. update_mfc_params
  21. init_mfc_params
  22. expire_mfc
  23. add_mfc
  24. collate
  25. del_mfc
  26. socket_send
  27. ip_mforward
  28. expire_upcalls
  29. ip_mdq
  30. legal_vif_num
  31. phyint_send
  32. encap_send
  33. tbf_control
  34. tbf_queue
  35. tbf_process_q
  36. tbf_reprocess_q
  37. tbf_dq_sel
  38. tbf_send_packet
  39. tbf_update_tokens
  40. priority
  41. ip_rsvp_vif_init
  42. ip_rsvp_vif_done
  43. ip_rsvp_force_done
  44. rsvp_input
  45. compute_bw_meter_flags
  46. add_bw_upcall
  47. free_bw_list
  48. del_bw_upcall
  49. bw_meter_receive_packet
  50. bw_meter_prepare_upcall
  51. bw_upcalls_send
  52. schedule_bw_meter
  53. unschedule_bw_meter
  54. bw_meter_process
  55. expire_bw_upcalls_send
  56. expire_bw_meter_process
  57. pim_register_send
  58. pim_register_prepare
  59. pim_register_send_upcall
  60. pim_register_send_rp
  61. pim_input

    1 /*      $OpenBSD: ip_mroute.c,v 1.48 2007/05/22 09:51:13 michele Exp $  */
    2 /*      $NetBSD: ip_mroute.c,v 1.85 2004/04/26 01:31:57 matt Exp $      */
    3 
    4 /*
    5  * Copyright (c) 1989 Stephen Deering
    6  * Copyright (c) 1992, 1993
    7  *      The Regents of the University of California.  All rights reserved.
    8  *
    9  * This code is derived from software contributed to Berkeley by
   10  * Stephen Deering of Stanford University.
   11  *
   12  * Redistribution and use in source and binary forms, with or without
   13  * modification, are permitted provided that the following conditions
   14  * are met:
   15  * 1. Redistributions of source code must retain the above copyright
   16  *    notice, this list of conditions and the following disclaimer.
   17  * 2. Redistributions in binary form must reproduce the above copyright
   18  *    notice, this list of conditions and the following disclaimer in the
   19  *    documentation and/or other materials provided with the distribution.
   20  * 3. Neither the name of the University nor the names of its contributors
   21  *    may be used to endorse or promote products derived from this software
   22  *    without specific prior written permission.
   23  *
   24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   34  * SUCH DAMAGE.
   35  *
   36  *      @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
   37  */
   38 
   39 /*
   40  * IP multicast forwarding procedures
   41  *
   42  * Written by David Waitzman, BBN Labs, August 1988.
   43  * Modified by Steve Deering, Stanford, February 1989.
   44  * Modified by Mark J. Steiglitz, Stanford, May, 1991
   45  * Modified by Van Jacobson, LBL, January 1993
   46  * Modified by Ajit Thyagarajan, PARC, August 1993
   47  * Modified by Bill Fenner, PARC, April 1994
   48  * Modified by Charles M. Hannum, NetBSD, May 1995.
   49  * Modified by Ahmed Helmy, SGI, June 1996
   50  * Modified by George Edmond Eddy (Rusty), ISI, February 1998
   51  * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000
   52  * Modified by Hitoshi Asaeda, WIDE, August 2000
   53  * Modified by Pavlin Radoslavov, ICSI, October 2002
   54  *
   55  * MROUTING Revision: 1.2
   56  * and PIM-SMv2 and PIM-DM support, advanced API support,
   57  * bandwidth metering and signaling
   58  */
   59 
   60 #ifdef PIM
   61 #define _PIM_VT 1
   62 #endif
   63 
   64 #include <sys/param.h>
   65 #include <sys/systm.h>
   66 #include <sys/mbuf.h>
   67 #include <sys/socket.h>
   68 #include <sys/socketvar.h>
   69 #include <sys/protosw.h>
   70 #include <sys/errno.h>
   71 #include <sys/time.h>
   72 #include <sys/kernel.h>
   73 #include <sys/ioctl.h>
   74 #include <sys/syslog.h>
   75 #include <sys/timeout.h>
   76 
   77 #include <net/if.h>
   78 #include <net/route.h>
   79 #include <net/raw_cb.h>
   80 
   81 #include <netinet/in.h>
   82 #include <netinet/in_var.h>
   83 #include <netinet/in_systm.h>
   84 #include <netinet/ip.h>
   85 #include <netinet/ip_var.h>
   86 #include <netinet/in_pcb.h>
   87 #include <netinet/udp.h>
   88 #include <netinet/igmp.h>
   89 #include <netinet/igmp_var.h>
   90 #include <netinet/ip_mroute.h>
   91 #ifdef PIM
   92 #include <netinet/pim.h>
   93 #include <netinet/pim_var.h>
   94 #endif
   95 
   96 #include <sys/stdarg.h>
   97 
   98 #define IP_MULTICASTOPTS 0
   99 #define M_PULLUP(m, len)                                                 \
  100         do {                                                             \
  101                 if ((m) && ((m)->m_flags & M_EXT || (m)->m_len < (len))) \
  102                         (m) = m_pullup((m), (len));                      \
  103         } while (/*CONSTCOND*/ 0)
  104 
  105 /*
  106  * Globals.  All but ip_mrouter and ip_mrtproto could be static,
  107  * except for netstat or debugging purposes.
  108  */
  109 struct socket  *ip_mrouter  = NULL;
  110 int             ip_mrtproto = IGMP_DVMRP;    /* for netstat only */
  111 
  112 #define NO_RTE_FOUND    0x1
  113 #define RTE_FOUND       0x2
  114 
  115 #define MFCHASH(a, g)                                                   \
  116         ((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^        \
  117             ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & mfchash)
  118 LIST_HEAD(mfchashhdr, mfc) *mfchashtbl;
  119 u_long  mfchash;
  120 
  121 u_char          nexpire[MFCTBLSIZ];
  122 struct vif      viftable[MAXVIFS];
  123 struct mrtstat  mrtstat;
  124 u_int           mrtdebug = 0;     /* debug level        */
  125 #define         DEBUG_MFC       0x02
  126 #define         DEBUG_FORWARD   0x04
  127 #define         DEBUG_EXPIRE    0x08
  128 #define         DEBUG_XMIT      0x10
  129 #define         DEBUG_PIM       0x20
  130 
  131 #define         VIFI_INVALID    ((vifi_t) -1)
  132 
  133 u_int           tbfdebug = 0;     /* tbf debug level    */
  134 #ifdef RSVP_ISI
  135 u_int           rsvpdebug = 0;    /* rsvp debug level   */
  136 extern struct socket *ip_rsvpd;
  137 extern int rsvp_on;
  138 #endif /* RSVP_ISI */
  139 
  140 #define         EXPIRE_TIMEOUT  (hz / 4)        /* 4x / second */
  141 #define         UPCALL_EXPIRE   6               /* number of timeouts */
  142 struct timeout  expire_upcalls_ch;
  143 
  144 /*
  145  * Define the token bucket filter structures
  146  */
  147 
  148 #define         TBF_REPROCESS   (hz / 100)      /* 100x / second */
  149 
  150 static int get_sg_cnt(struct sioc_sg_req *);
  151 static int get_vif_cnt(struct sioc_vif_req *);
  152 static int ip_mrouter_init(struct socket *, struct mbuf *);
  153 static int get_version(struct mbuf *);
  154 static int set_assert(struct mbuf *);
  155 static int get_assert(struct mbuf *);
  156 static int add_vif(struct mbuf *);
  157 static int del_vif(struct mbuf *);
  158 static void update_mfc_params(struct mfc *, struct mfcctl2 *);
  159 static void init_mfc_params(struct mfc *, struct mfcctl2 *);
  160 static void expire_mfc(struct mfc *);
  161 static int add_mfc(struct mbuf *);
  162 #ifdef UPCALL_TIMING
  163 static void collate(struct timeval *);
  164 #endif
  165 static int del_mfc(struct mbuf *);
  166 static int set_api_config(struct mbuf *); /* chose API capabilities */
  167 static int get_api_support(struct mbuf *);
  168 static int get_api_config(struct mbuf *);
  169 static int socket_send(struct socket *, struct mbuf *,
  170                             struct sockaddr_in *);
  171 static void expire_upcalls(void *);
  172 #ifdef RSVP_ISI
  173 static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t);
  174 #else
  175 static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *);
  176 #endif
  177 static void phyint_send(struct ip *, struct vif *, struct mbuf *);
  178 static void encap_send(struct ip *, struct vif *, struct mbuf *);
  179 static void tbf_control(struct vif *, struct mbuf *, struct ip *,
  180                              u_int32_t);
  181 static void tbf_queue(struct vif *, struct mbuf *);
  182 static void tbf_process_q(struct vif *);
  183 static void tbf_reprocess_q(void *);
  184 static int tbf_dq_sel(struct vif *, struct ip *);
  185 static void tbf_send_packet(struct vif *, struct mbuf *);
  186 static void tbf_update_tokens(struct vif *);
  187 static int priority(struct vif *, struct ip *);
  188 
  189 /*
  190  * Bandwidth monitoring
  191  */
  192 static void free_bw_list(struct bw_meter *);
  193 static int add_bw_upcall(struct mbuf *);
  194 static int del_bw_upcall(struct mbuf *);
  195 static void bw_meter_receive_packet(struct bw_meter *, int , struct timeval *);
  196 static void bw_meter_prepare_upcall(struct bw_meter *, struct timeval *);
  197 static void bw_upcalls_send(void);
  198 static void schedule_bw_meter(struct bw_meter *, struct timeval *);
  199 static void unschedule_bw_meter(struct bw_meter *);
  200 static void bw_meter_process(void);
  201 static void expire_bw_upcalls_send(void *);
  202 static void expire_bw_meter_process(void *);
  203 
  204 #ifdef PIM
  205 static int pim_register_send(struct ip *, struct vif *,
  206                 struct mbuf *, struct mfc *);
  207 static int pim_register_send_rp(struct ip *, struct vif *,
  208                 struct mbuf *, struct mfc *);
  209 static int pim_register_send_upcall(struct ip *, struct vif *,
  210                 struct mbuf *, struct mfc *);
  211 static struct mbuf *pim_register_prepare(struct ip *, struct mbuf *);
  212 #endif
  213 
  214 /*
  215  * 'Interfaces' associated with decapsulator (so we can tell
  216  * packets that went through it from ones that get reflected
  217  * by a broken gateway).  These interfaces are never linked into
  218  * the system ifnet list & no routes point to them.  I.e., packets
  219  * can't be sent this way.  They only exist as a placeholder for
  220  * multicast source verification.
  221  */
  222 #if 0
  223 struct ifnet multicast_decap_if[MAXVIFS];
  224 #endif
  225 
  226 #define ENCAP_TTL       64
  227 #define ENCAP_PROTO     IPPROTO_IPIP    /* 4 */
  228 
  229 /* prototype IP hdr for encapsulated packets */
  230 struct ip multicast_encap_iphdr = {
  231 #if BYTE_ORDER == LITTLE_ENDIAN
  232         sizeof(struct ip) >> 2, IPVERSION,
  233 #else
  234         IPVERSION, sizeof(struct ip) >> 2,
  235 #endif
  236         0,                              /* tos */
  237         sizeof(struct ip),              /* total length */
  238         0,                              /* id */
  239         0,                              /* frag offset */
  240         ENCAP_TTL, ENCAP_PROTO,
  241         0,                              /* checksum */
  242 };
  243 
  244 /*
  245  * Bandwidth meter variables and constants
  246  */
  247 
  248 /*
  249  * Pending timeouts are stored in a hash table, the key being the
  250  * expiration time. Periodically, the entries are analysed and processed.
  251  */
  252 #define BW_METER_BUCKETS        1024
  253 static struct bw_meter *bw_meter_timers[BW_METER_BUCKETS];
  254 struct timeout bw_meter_ch;
  255 #define BW_METER_PERIOD (hz)            /* periodical handling of bw meters */
  256 
  257 /*
  258  * Pending upcalls are stored in a vector which is flushed when
  259  * full, or periodically
  260  */
  261 static struct bw_upcall bw_upcalls[BW_UPCALLS_MAX];
  262 static u_int    bw_upcalls_n; /* # of pending upcalls */
  263 struct timeout  bw_upcalls_ch;
  264 #define BW_UPCALLS_PERIOD (hz)          /* periodical flush of bw upcalls */
  265 
  266 #ifdef PIM
  267 struct pimstat pimstat;
  268 
  269 /*
  270  * Note: the PIM Register encapsulation adds the following in front of a
  271  * data packet:
  272  *
  273  * struct pim_encap_hdr {
  274  *    struct ip ip;
  275  *    struct pim_encap_pimhdr  pim;
  276  * }
  277  *
  278  */
  279 
/*
 * PIM part of the Register encapsulation header: a PIM header followed
 * by a 32-bit flags word.  In a PIM Register packet it sits after the
 * outer IP header and in front of the original data packet.
 */
struct pim_encap_pimhdr {
        struct pim pim;         /* PIM header */
        uint32_t   flags;       /* flags word following the PIM header */
};
  284 
  285 static struct ip pim_encap_iphdr = {
  286 #if BYTE_ORDER == LITTLE_ENDIAN
  287         sizeof(struct ip) >> 2,
  288         IPVERSION,
  289 #else
  290         IPVERSION,
  291         sizeof(struct ip) >> 2,
  292 #endif
  293         0,                      /* tos */
  294         sizeof(struct ip),      /* total length */
  295         0,                      /* id */
  296         0,                      /* frag offset */ 
  297         ENCAP_TTL,
  298         IPPROTO_PIM,
  299         0,                      /* checksum */
  300 };
  301 
  302 static struct pim_encap_pimhdr pim_encap_pimhdr = {
  303     {
  304         PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */
  305         0,                      /* reserved */
  306         0,                      /* checksum */
  307     },
  308     0                           /* flags */
  309 };
  310 
  311 static struct ifnet multicast_register_if;
  312 static vifi_t reg_vif_num = VIFI_INVALID;
  313 #endif /* PIM */
  314 
  315 
  316 /*
  317  * Private variables.
  318  */
  319 static vifi_t      numvifs = 0;
  320 static int have_encap_tunnel = 0;
  321 
  322 /*
  323  * whether or not special PIM assert processing is enabled.
  324  */
  325 static int pim_assert;
  326 /*
  327  * Rate limit for assert notification messages, in usec
  328  */
  329 #define ASSERT_MSG_TIME         3000000
  330 
  331 /*
  332  * Kernel multicast routing API capabilities and setup.
  333  * If more API capabilities are added to the kernel, they should be
  334  * recorded in `mrt_api_support'.
  335  */
  336 static const u_int32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF |
  337                                           MRT_MFC_FLAGS_BORDER_VIF |
  338                                           MRT_MFC_RP |
  339                                           MRT_MFC_BW_UPCALL);
  340 static u_int32_t mrt_api_config = 0;
  341 
  342 /*
  343  * Find a route for a given origin IP address and Multicast group address
  344  * Type of service parameter to be added in the future!!!
  345  * Statistics are updated by the caller if needed
  346  * (mrtstat.mrts_mfc_lookups and mrtstat.mrts_mfc_misses)
  347  */
  348 static struct mfc *
  349 mfc_find(struct in_addr *o, struct in_addr *g)
  350 {
  351         struct mfc *rt;
  352 
  353         LIST_FOREACH(rt, &mfchashtbl[MFCHASH(*o, *g)], mfc_hash) {
  354                 if (in_hosteq(rt->mfc_origin, *o) &&
  355                     in_hosteq(rt->mfc_mcastgrp, *g) &&
  356                     (rt->mfc_stall == NULL))
  357                         break;
  358         }
  359 
  360         return (rt);
  361 }
  362 
  363 /*
  364  * Macros to compute elapsed time efficiently
  365  * Borrowed from Van Jacobson's scheduling code
  366  */
  367 #define TV_DELTA(a, b, delta) do {                                      \
  368         int xxs;                                                        \
  369         delta = (a).tv_usec - (b).tv_usec;                              \
  370         xxs = (a).tv_sec - (b).tv_sec;                                  \
  371         switch (xxs) {                                                  \
  372         case 2:                                                         \
  373                 delta += 1000000;                                       \
  374                 /* FALLTHROUGH */                                       \
  375         case 1:                                                         \
  376                 delta += 1000000;                                       \
  377                 /* FALLTHROUGH */                                       \
  378         case 0:                                                         \
  379                 break;                                                  \
  380         default:                                                        \
  381                 delta += (1000000 * xxs);                               \
  382                 break;                                                  \
  383         }                                                               \
  384 } while (/*CONSTCOND*/ 0)
  385 
  386 #ifdef UPCALL_TIMING
  387 u_int32_t upcall_data[51];
  388 #endif /* UPCALL_TIMING */
  389 
  390 /*
  391  * Handle MRT setsockopt commands to modify the multicast routing tables.
  392  */
  393 int
  394 ip_mrouter_set(struct socket *so, int optname, struct mbuf **m)
  395 {
  396         int error;
  397 
  398         if (optname != MRT_INIT && so != ip_mrouter)
  399                 error = ENOPROTOOPT;
  400         else
  401                 switch (optname) {
  402                 case MRT_INIT:
  403                         error = ip_mrouter_init(so, *m);
  404                         break;
  405                 case MRT_DONE:
  406                         error = ip_mrouter_done();
  407                         break;
  408                 case MRT_ADD_VIF:
  409                         error = add_vif(*m);
  410                         break;
  411                 case MRT_DEL_VIF:
  412                         error = del_vif(*m);
  413                         break;
  414                 case MRT_ADD_MFC:
  415                         error = add_mfc(*m);
  416                         break;
  417                 case MRT_DEL_MFC:
  418                         error = del_mfc(*m);
  419                         break;
  420                 case MRT_ASSERT:
  421                         error = set_assert(*m);
  422                         break;
  423                 case MRT_API_CONFIG:
  424                         error = set_api_config(*m);
  425                         break;
  426                 case MRT_ADD_BW_UPCALL:
  427                         error = add_bw_upcall(*m);
  428                         break;
  429                 case MRT_DEL_BW_UPCALL:
  430                         error = del_bw_upcall(*m);
  431                         break;
  432                 default:
  433                         error = ENOPROTOOPT;
  434                         break;
  435                 }
  436 
  437         if (*m)
  438                 m_free(*m);
  439         return (error);
  440 }
  441 
  442 /*
  443  * Handle MRT getsockopt commands
  444  */
  445 int
  446 ip_mrouter_get(struct socket *so, int optname, struct mbuf **m)
  447 {
  448         int error;
  449 
  450         if (so != ip_mrouter)
  451                 error = ENOPROTOOPT;
  452         else {
  453                 *m = m_get(M_WAIT, MT_SOOPTS);
  454 
  455                 switch (optname) {
  456                 case MRT_VERSION:
  457                         error = get_version(*m);
  458                         break;
  459                 case MRT_ASSERT:
  460                         error = get_assert(*m);
  461                         break;
  462                 case MRT_API_SUPPORT:
  463                         error = get_api_support(*m);
  464                         break;
  465                 case MRT_API_CONFIG:
  466                         error = get_api_config(*m);
  467                         break;
  468                 default:
  469                         error = ENOPROTOOPT;
  470                         break;
  471                 }
  472 
  473                 if (error)
  474                         m_free(*m);
  475         }
  476 
  477         return (error);
  478 }
  479 
  480 /*
  481  * Handle ioctl commands to obtain information from the cache
  482  */
  483 int
  484 mrt_ioctl(struct socket *so, u_long cmd, caddr_t data)
  485 {
  486         int error;
  487 
  488         if (so != ip_mrouter)
  489                 error = EINVAL;
  490         else
  491                 switch (cmd) {
  492                 case SIOCGETVIFCNT:
  493                         error = get_vif_cnt((struct sioc_vif_req *)data);
  494                         break;
  495                 case SIOCGETSGCNT:
  496                         error = get_sg_cnt((struct sioc_sg_req *)data);
  497                         break;
  498                 default:
  499                         error = EINVAL;
  500                         break;
  501                 }
  502 
  503         return (error);
  504 }
  505 
  506 /*
  507  * returns the packet, byte, rpf-failure count for the source group provided
  508  */
  509 static int
  510 get_sg_cnt(struct sioc_sg_req *req)
  511 {
  512         int s;
  513         struct mfc *rt;
  514 
  515         s = splsoftnet();
  516         rt = mfc_find(&req->src, &req->grp);
  517         if (rt == NULL) {
  518                 splx(s);
  519                 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
  520                 return (EADDRNOTAVAIL);
  521         }
  522         req->pktcnt = rt->mfc_pkt_cnt;
  523         req->bytecnt = rt->mfc_byte_cnt;
  524         req->wrong_if = rt->mfc_wrong_if;
  525         splx(s);
  526 
  527         return (0);
  528 }
  529 
  530 /*
  531  * returns the input and output packet and byte counts on the vif provided
  532  */
  533 static int
  534 get_vif_cnt(struct sioc_vif_req *req)
  535 {
  536         vifi_t vifi = req->vifi;
  537 
  538         if (vifi >= numvifs)
  539                 return (EINVAL);
  540 
  541         req->icount = viftable[vifi].v_pkt_in;
  542         req->ocount = viftable[vifi].v_pkt_out;
  543         req->ibytes = viftable[vifi].v_bytes_in;
  544         req->obytes = viftable[vifi].v_bytes_out;
  545 
  546         return (0);
  547 }
  548 
  549 /*
  550  * Enable multicast routing
  551  */
  552 static int
  553 ip_mrouter_init(struct socket *so, struct mbuf *m)
  554 {
  555         int *v;
  556 
  557         if (mrtdebug)
  558                 log(LOG_DEBUG,
  559                     "ip_mrouter_init: so_type = %d, pr_protocol = %d\n",
  560                     so->so_type, so->so_proto->pr_protocol);
  561 
  562         if (so->so_type != SOCK_RAW ||
  563             so->so_proto->pr_protocol != IPPROTO_IGMP)
  564                 return (EOPNOTSUPP);
  565 
  566         if (m == NULL || m->m_len < sizeof(int))
  567                 return (EINVAL);
  568 
  569         v = mtod(m, int *);
  570         if (*v != 1)
  571                 return (EINVAL);
  572 
  573         if (ip_mrouter != NULL)
  574                 return (EADDRINUSE);
  575 
  576         ip_mrouter = so;
  577 
  578         mfchashtbl = hashinit(MFCTBLSIZ, M_MRTABLE, M_WAITOK, &mfchash);
  579         bzero((caddr_t)nexpire, sizeof(nexpire));
  580 
  581         pim_assert = 0;
  582 
  583         timeout_set(&expire_upcalls_ch, expire_upcalls, NULL);
  584         timeout_add(&expire_upcalls_ch, EXPIRE_TIMEOUT);
  585 
  586         timeout_set(&bw_upcalls_ch, expire_bw_upcalls_send, NULL);
  587         timeout_add(&bw_upcalls_ch, BW_UPCALLS_PERIOD);
  588 
  589         timeout_set(&bw_meter_ch, expire_bw_meter_process, NULL);
  590         timeout_add(&bw_meter_ch, BW_METER_PERIOD);
  591 
  592         if (mrtdebug)
  593                 log(LOG_DEBUG, "ip_mrouter_init\n");
  594 
  595         return (0);
  596 }
  597 
  598 /*
  599  * Disable multicast routing
  600  */
  601 int
  602 ip_mrouter_done()
  603 {
  604         vifi_t vifi;
  605         struct vif *vifp;
  606         int i;
  607         int s;
  608 
  609         s = splsoftnet();
  610 
  611         /* Clear out all the vifs currently in use. */
  612         for (vifi = 0; vifi < numvifs; vifi++) {
  613                 vifp = &viftable[vifi];
  614                 if (!in_nullhost(vifp->v_lcl_addr))
  615                         reset_vif(vifp);
  616         }
  617 
  618         numvifs = 0;
  619         pim_assert = 0;
  620         mrt_api_config = 0;
  621 
  622         timeout_del(&expire_upcalls_ch);
  623         timeout_del(&bw_upcalls_ch);
  624         timeout_del(&bw_meter_ch);
  625 
  626         /*
  627          * Free all multicast forwarding cache entries.
  628          */
  629         for (i = 0; i < MFCTBLSIZ; i++) {
  630                 struct mfc *rt, *nrt;
  631 
  632                 for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) {
  633                         nrt = LIST_NEXT(rt, mfc_hash);
  634 
  635                         expire_mfc(rt);
  636                 }
  637         }
  638 
  639         bzero((caddr_t)nexpire, sizeof(nexpire));
  640         free(mfchashtbl, M_MRTABLE);
  641         mfchashtbl = NULL;
  642 
  643         bw_upcalls_n = 0;
  644         bzero(bw_meter_timers, sizeof(bw_meter_timers));
  645 
  646         /* Reset de-encapsulation cache. */
  647         have_encap_tunnel = 0;
  648 
  649         ip_mrouter = NULL;
  650 
  651         splx(s);
  652 
  653         if (mrtdebug)
  654                 log(LOG_DEBUG, "ip_mrouter_done\n");
  655 
  656         return (0);
  657 }
  658 
  659 void
  660 ip_mrouter_detach(struct ifnet *ifp)
  661 {
  662         int vifi, i;
  663         struct vif *vifp;
  664         struct mfc *rt;
  665         struct rtdetq *rte;
  666 
  667         /* XXX not sure about side effect to userland routing daemon */
  668         for (vifi = 0; vifi < numvifs; vifi++) {
  669                 vifp = &viftable[vifi];
  670                 if (vifp->v_ifp == ifp)
  671                         reset_vif(vifp);
  672         }
  673         for (i = 0; i < MFCTBLSIZ; i++) {
  674                 if (nexpire[i] == 0)
  675                         continue;
  676                 LIST_FOREACH(rt, &mfchashtbl[i], mfc_hash) {
  677                         for (rte = rt->mfc_stall; rte; rte = rte->next) {
  678                                 if (rte->ifp == ifp)
  679                                         rte->ifp = NULL;
  680                         }
  681                 }
  682         }
  683 }
  684 
  685 static int
  686 get_version(struct mbuf *m)
  687 {
  688         int *v = mtod(m, int *);
  689 
  690         *v = 0x0305;    /* XXX !!!! */
  691         m->m_len = sizeof(int);
  692         return (0);
  693 }
  694 
  695 /*
  696  * Set PIM assert processing global
  697  */
  698 static int
  699 set_assert(struct mbuf *m)
  700 {
  701         int *i;
  702 
  703         if (m == NULL || m->m_len < sizeof(int))
  704                 return (EINVAL);
  705 
  706         i = mtod(m, int *);
  707         pim_assert = !!*i;
  708         return (0);
  709 }
  710 
  711 /*
  712  * Get PIM assert processing global
  713  */
  714 static int
  715 get_assert(struct mbuf *m)
  716 {
  717         int *i = mtod(m, int *);
  718 
  719         *i = pim_assert;
  720         m->m_len = sizeof(int);
  721         return (0);
  722 }
  723 
  724 /*
  725  * Configure API capabilities
  726  */
  727 static int
  728 set_api_config(struct mbuf *m)
  729 {
  730         int i;
  731         u_int32_t *apival;
  732 
  733         if (m == NULL || m->m_len < sizeof(u_int32_t))
  734                 return (EINVAL);
  735 
  736         apival = mtod(m, u_int32_t *);
  737 
  738         /*
  739          * We can set the API capabilities only if it is the first operation
  740          * after MRT_INIT. I.e.:
  741          *  - there are no vifs installed
  742          *  - pim_assert is not enabled
  743          *  - the MFC table is empty
  744          */
  745         if (numvifs > 0) {
  746                 *apival = 0;
  747                 return (EPERM);
  748         }
  749         if (pim_assert) {
  750                 *apival = 0;
  751                 return (EPERM);
  752         }
  753         for (i = 0; i < MFCTBLSIZ; i++) {
  754                 if (LIST_FIRST(&mfchashtbl[i]) != NULL) {
  755                         *apival = 0;
  756                         return (EPERM);
  757                 }
  758         }
  759 
  760         mrt_api_config = *apival & mrt_api_support;
  761         *apival = mrt_api_config;
  762 
  763         return (0);
  764 }
  765 
  766 /*
  767  * Get API capabilities
  768  */
  769 static int
  770 get_api_support(struct mbuf *m)
  771 {
  772         u_int32_t *apival;
  773 
  774         if (m == NULL || m->m_len < sizeof(u_int32_t))
  775                 return (EINVAL);
  776 
  777         apival = mtod(m, u_int32_t *);
  778 
  779         *apival = mrt_api_support;
  780 
  781         return (0);
  782 }
  783 
  784 /*
  785  * Get API configured capabilities
  786  */
  787 static int
  788 get_api_config(struct mbuf *m)
  789 {
  790         u_int32_t *apival;
  791 
  792         if (m == NULL || m->m_len < sizeof(u_int32_t))
  793                 return (EINVAL);
  794 
  795         apival = mtod(m, u_int32_t *);
  796 
  797         *apival = mrt_api_config;
  798 
  799         return (0);
  800 }
  801 
  802 static struct sockaddr_in sin = { sizeof(sin), AF_INET };
  803 
  804 /*
  805  * Add a vif to the vif table
  806  */
  807 static int
  808 add_vif(struct mbuf *m)
  809 {
  810         struct vifctl *vifcp;
  811         struct vif *vifp;
  812         struct ifaddr *ifa;
  813         struct ifnet *ifp;
  814         struct ifreq ifr;
  815         int error, s;
  816 
  817         if (m == NULL || m->m_len < sizeof(struct vifctl))
  818                 return (EINVAL);
  819 
  820         vifcp = mtod(m, struct vifctl *);
  821         if (vifcp->vifc_vifi >= MAXVIFS)
  822                 return (EINVAL);
  823         if (in_nullhost(vifcp->vifc_lcl_addr))
  824                 return (EADDRNOTAVAIL);
  825 
  826         vifp = &viftable[vifcp->vifc_vifi];
  827         if (!in_nullhost(vifp->v_lcl_addr))
  828                 return (EADDRINUSE);
  829 
  830         /* Find the interface with an address in AF_INET family. */
  831 #ifdef PIM
  832         if (vifcp->vifc_flags & VIFF_REGISTER) {
  833                 /*
  834                  * XXX: Because VIFF_REGISTER does not really need a valid
  835                  * local interface (e.g. it could be 127.0.0.2), we don't
  836                  * check its address.
  837                  */
  838             ifp = NULL;
  839         } else
  840 #endif
  841         {
  842                 sin.sin_addr = vifcp->vifc_lcl_addr;
  843                 ifa = ifa_ifwithaddr(sintosa(&sin));
  844                 if (ifa == NULL)
  845                         return (EADDRNOTAVAIL);
  846                 ifp = ifa->ifa_ifp;
  847         }
  848 
  849         if (vifcp->vifc_flags & VIFF_TUNNEL) {
  850                 /* tunnels are no longer supported use gif(4) instead */
  851                 return (EOPNOTSUPP);
  852 #ifdef PIM
  853         } else if (vifcp->vifc_flags & VIFF_REGISTER) {
  854                 ifp = &multicast_register_if;
  855                 if (mrtdebug)
  856                         log(LOG_DEBUG, "Adding a register vif, ifp: %p\n",
  857                             (void *)ifp);
  858                 if (reg_vif_num == VIFI_INVALID) {
  859                         bzero(ifp, sizeof(*ifp));
  860                         snprintf(ifp->if_xname, sizeof ifp->if_xname,
  861                                  "register_vif");
  862                         ifp->if_flags = IFF_LOOPBACK;
  863                         bzero(&vifp->v_route, sizeof(vifp->v_route));
  864                         reg_vif_num = vifcp->vifc_vifi;
  865                 }
  866 #endif
  867         } else {
  868                 /* Use the physical interface associated with the address. */
  869                 ifp = ifa->ifa_ifp;
  870 
  871                 /* Make sure the interface supports multicast. */
  872                 if ((ifp->if_flags & IFF_MULTICAST) == 0)
  873                         return (EOPNOTSUPP);
  874 
  875                 /* Enable promiscuous reception of all IP multicasts. */
  876                 satosin(&ifr.ifr_addr)->sin_len = sizeof(struct sockaddr_in);
  877                 satosin(&ifr.ifr_addr)->sin_family = AF_INET;
  878                 satosin(&ifr.ifr_addr)->sin_addr = zeroin_addr;
  879                 error = (*ifp->if_ioctl)(ifp, SIOCADDMULTI, (caddr_t)&ifr);
  880                 if (error)
  881                         return (error);
  882         }
  883 
  884         s = splsoftnet();
  885 
  886         /* Define parameters for the tbf structure. */
  887         vifp->tbf_q = NULL;
  888         vifp->tbf_t = &vifp->tbf_q;
  889         microtime(&vifp->tbf_last_pkt_t);
  890         vifp->tbf_n_tok = 0;
  891         vifp->tbf_q_len = 0;
  892         vifp->tbf_max_q_len = MAXQSIZE;
  893 
  894         vifp->v_flags = vifcp->vifc_flags;
  895         vifp->v_threshold = vifcp->vifc_threshold;
  896         /* scaling up here allows division by 1024 in critical code */
  897         vifp->v_rate_limit = vifcp->vifc_rate_limit * 1024 / 1000;
  898         vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
  899         vifp->v_rmt_addr = vifcp->vifc_rmt_addr;
  900         vifp->v_ifp = ifp;
  901         /* Initialize per vif pkt counters. */
  902         vifp->v_pkt_in = 0;
  903         vifp->v_pkt_out = 0;
  904         vifp->v_bytes_in = 0;
  905         vifp->v_bytes_out = 0;
  906 
  907         timeout_del(&vifp->v_repq_ch);
  908 
  909 #ifdef RSVP_ISI
  910         vifp->v_rsvp_on = 0;
  911         vifp->v_rsvpd = NULL;
  912 #endif /* RSVP_ISI */
  913 
  914         splx(s);
  915 
  916         /* Adjust numvifs up if the vifi is higher than numvifs. */
  917         if (numvifs <= vifcp->vifc_vifi)
  918                 numvifs = vifcp->vifc_vifi + 1;
  919 
  920         if (mrtdebug)
  921                 log(LOG_DEBUG, "add_vif #%d, lcladdr %x, %s %x, "
  922                     "thresh %x, rate %d\n",
  923                     vifcp->vifc_vifi,
  924                     ntohl(vifcp->vifc_lcl_addr.s_addr),
  925                     (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
  926                     ntohl(vifcp->vifc_rmt_addr.s_addr),
  927                     vifcp->vifc_threshold,
  928                     vifcp->vifc_rate_limit);
  929 
  930         return (0);
  931 }
  932 
  933 void
  934 reset_vif(struct vif *vifp)
  935 {
  936         struct mbuf *m, *n;
  937         struct ifnet *ifp;
  938         struct ifreq ifr;
  939 
  940         timeout_set(&vifp->v_repq_ch, tbf_reprocess_q, vifp);
  941 
  942         /*
  943          * Free packets queued at the interface
  944          */
  945         for (m = vifp->tbf_q; m != NULL; m = n) {
  946                 n = m->m_nextpkt;
  947                 m_freem(m);
  948         }
  949 
  950         if (vifp->v_flags & VIFF_TUNNEL) {
  951                 /* empty */
  952         } else if (vifp->v_flags & VIFF_REGISTER) {
  953 #ifdef PIM
  954                 reg_vif_num = VIFI_INVALID;
  955 #endif
  956         } else {
  957                 satosin(&ifr.ifr_addr)->sin_len = sizeof(struct sockaddr_in);
  958                 satosin(&ifr.ifr_addr)->sin_family = AF_INET;
  959                 satosin(&ifr.ifr_addr)->sin_addr = zeroin_addr;
  960                 ifp = vifp->v_ifp;
  961                 (*ifp->if_ioctl)(ifp, SIOCDELMULTI, (caddr_t)&ifr);
  962         }
  963         bzero((caddr_t)vifp, sizeof(*vifp));
  964 }
  965 
  966 /*
  967  * Delete a vif from the vif table
  968  */
  969 static int
  970 del_vif(struct mbuf *m)
  971 {
  972         vifi_t *vifip;
  973         struct vif *vifp;
  974         vifi_t vifi;
  975         int s;
  976 
  977         if (m == NULL || m->m_len < sizeof(vifi_t))
  978                 return (EINVAL);
  979 
  980         vifip = mtod(m, vifi_t *);
  981         if (*vifip >= numvifs)
  982                 return (EINVAL);
  983 
  984         vifp = &viftable[*vifip];
  985         if (in_nullhost(vifp->v_lcl_addr))
  986                 return (EADDRNOTAVAIL);
  987 
  988         s = splsoftnet();
  989 
  990         reset_vif(vifp);
  991 
  992         /* Adjust numvifs down */
  993         for (vifi = numvifs; vifi > 0; vifi--)
  994                 if (!in_nullhost(viftable[vifi - 1].v_lcl_addr))
  995                         break;
  996         numvifs = vifi;
  997 
  998         splx(s);
  999 
 1000         if (mrtdebug)
 1001                 log(LOG_DEBUG, "del_vif %d, numvifs %d\n", *vifip, numvifs);
 1002 
 1003         return (0);
 1004 }
 1005 
 1006 void
 1007 vif_delete(struct ifnet *ifp)
 1008 {
 1009         int i;
 1010         struct vif *vifp;
 1011         struct mfc *rt;
 1012         struct rtdetq *rte;
 1013 
 1014         for (i = 0; i < numvifs; i++) {
 1015                 vifp = &viftable[i];
 1016                 if (vifp->v_ifp == ifp)
 1017                         bzero((caddr_t)vifp, sizeof *vifp);
 1018         }
 1019 
 1020         for (i = numvifs; i > 0; i--)
 1021                 if (!in_nullhost(viftable[i - 1].v_lcl_addr))
 1022                         break;
 1023         numvifs = i;
 1024 
 1025         for (i = 0; i < MFCTBLSIZ; i++) {
 1026                 if (nexpire[i] == 0)
 1027                         continue;
 1028                 LIST_FOREACH(rt, &mfchashtbl[i], mfc_hash) {
 1029                         for (rte = rt->mfc_stall; rte; rte = rte->next) {
 1030                                 if (rte->ifp == ifp)
 1031                                         rte->ifp = NULL;
 1032                         }
 1033                 }
 1034         }
 1035 }
 1036 
 1037 /*
 1038  * update an mfc entry without resetting counters and S,G addresses.
 1039  */
 1040 static void
 1041 update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
 1042 {
 1043         int i;
 1044 
 1045         rt->mfc_parent = mfccp->mfcc_parent;
 1046         for (i = 0; i < numvifs; i++) {
 1047                 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
 1048                 rt->mfc_flags[i] = mfccp->mfcc_flags[i] & mrt_api_config &
 1049                     MRT_MFC_FLAGS_ALL;
 1050         }
 1051         /* set the RP address */
 1052         if (mrt_api_config & MRT_MFC_RP)
 1053                 rt->mfc_rp = mfccp->mfcc_rp;
 1054         else
 1055                 rt->mfc_rp = zeroin_addr;
 1056 }
 1057 
 1058 /*
 1059  * fully initialize an mfc entry from the parameter.
 1060  */
 1061 static void
 1062 init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
 1063 {
 1064         rt->mfc_origin     = mfccp->mfcc_origin;
 1065         rt->mfc_mcastgrp   = mfccp->mfcc_mcastgrp;
 1066 
 1067         update_mfc_params(rt, mfccp);
 1068 
 1069         /* initialize pkt counters per src-grp */
 1070         rt->mfc_pkt_cnt    = 0;
 1071         rt->mfc_byte_cnt   = 0;
 1072         rt->mfc_wrong_if   = 0;
 1073         timerclear(&rt->mfc_last_assert);
 1074 }
 1075 
 1076 static void
 1077 expire_mfc(struct mfc *rt)
 1078 {
 1079         struct rtdetq *rte, *nrte;
 1080 
 1081         free_bw_list(rt->mfc_bw_meter);
 1082 
 1083         for (rte = rt->mfc_stall; rte != NULL; rte = nrte) {
 1084                 nrte = rte->next;
 1085                 m_freem(rte->m);
 1086                 free(rte, M_MRTABLE);
 1087         }
 1088 
 1089         LIST_REMOVE(rt, mfc_hash);
 1090         free(rt, M_MRTABLE);
 1091 }
 1092 
 1093 /*
 1094  * Add an mfc entry
 1095  */
 1096 static int
 1097 add_mfc(struct mbuf *m)
 1098 {
 1099         struct mfcctl2 mfcctl2;
 1100         struct mfcctl2 *mfccp;
 1101         struct mfc *rt;
 1102         u_int32_t hash = 0;
 1103         struct rtdetq *rte, *nrte;
 1104         u_short nstl;
 1105         int s;
 1106         int mfcctl_size = sizeof(struct mfcctl);
 1107 
 1108         if (mrt_api_config & MRT_API_FLAGS_ALL)
 1109                 mfcctl_size = sizeof(struct mfcctl2);
 1110 
 1111         if (m == NULL || m->m_len < mfcctl_size)
 1112                 return (EINVAL);
 1113 
 1114         /*
 1115          * select data size depending on API version.
 1116          */
 1117         if (mrt_api_config & MRT_API_FLAGS_ALL) {
 1118                 struct mfcctl2 *mp2 = mtod(m, struct mfcctl2 *);
 1119                 bcopy(mp2, (caddr_t)&mfcctl2, sizeof(*mp2));
 1120         } else {
 1121                 struct mfcctl *mp = mtod(m, struct mfcctl *);
 1122                 bcopy(mp, (caddr_t)&mfcctl2, sizeof(*mp));
 1123                 bzero((caddr_t)&mfcctl2 + sizeof(struct mfcctl),
 1124                     sizeof(mfcctl2) - sizeof(struct mfcctl));
 1125         }
 1126         mfccp = &mfcctl2;
 1127 
 1128         s = splsoftnet();
 1129         rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp);
 1130 
 1131         /* If an entry already exists, just update the fields */
 1132         if (rt) {
 1133                 if (mrtdebug & DEBUG_MFC)
 1134                         log(LOG_DEBUG, "add_mfc update o %x g %x p %x\n",
 1135                             ntohl(mfccp->mfcc_origin.s_addr),
 1136                             ntohl(mfccp->mfcc_mcastgrp.s_addr),
 1137                             mfccp->mfcc_parent);
 1138 
 1139                 update_mfc_params(rt, mfccp);
 1140 
 1141                 splx(s);
 1142                 return (0);
 1143         }
 1144 
 1145         /*
 1146          * Find the entry for which the upcall was made and update
 1147          */
 1148         nstl = 0;
 1149         hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp);
 1150         LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
 1151                 if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
 1152                     in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) &&
 1153                     rt->mfc_stall != NULL) {
 1154                         if (nstl++)
 1155                                 log(LOG_ERR, "add_mfc %s o %x g %x "
 1156                                     "p %x dbx %p\n",
 1157                                     "multiple kernel entries",
 1158                                     ntohl(mfccp->mfcc_origin.s_addr),
 1159                                     ntohl(mfccp->mfcc_mcastgrp.s_addr),
 1160                                     mfccp->mfcc_parent, rt->mfc_stall);
 1161 
 1162                         if (mrtdebug & DEBUG_MFC)
 1163                                 log(LOG_DEBUG, "add_mfc o %x g %x "
 1164                                     "p %x dbg %p\n",
 1165                                     ntohl(mfccp->mfcc_origin.s_addr),
 1166                                     ntohl(mfccp->mfcc_mcastgrp.s_addr),
 1167                                     mfccp->mfcc_parent, rt->mfc_stall);
 1168 
 1169                         rte = rt->mfc_stall;
 1170                         init_mfc_params(rt, mfccp);
 1171                         rt->mfc_stall = NULL;
 1172 
 1173                         rt->mfc_expire = 0; /* Don't clean this guy up */
 1174                         nexpire[hash]--;
 1175 
 1176                         /* free packets Qed at the end of this entry */
 1177                         for (; rte != NULL; rte = nrte) {
 1178                                 nrte = rte->next;
 1179                                 if (rte->ifp) {
 1180 #ifdef RSVP_ISI
 1181                                         ip_mdq(rte->m, rte->ifp, rt, -1);
 1182 #else
 1183                                         ip_mdq(rte->m, rte->ifp, rt);
 1184 #endif /* RSVP_ISI */
 1185                                 }
 1186                                 m_freem(rte->m);
 1187 #ifdef UPCALL_TIMING
 1188                                 collate(&rte->t);
 1189 #endif /* UPCALL_TIMING */
 1190                                 free(rte, M_MRTABLE);
 1191                         }
 1192                 }
 1193         }
 1194 
 1195         /*
 1196          * It is possible that an entry is being inserted without an upcall
 1197          */
 1198         if (nstl == 0) {
 1199                 /*
 1200                  * No mfc; make a new one
 1201                  */
 1202                 if (mrtdebug & DEBUG_MFC)
 1203                         log(LOG_DEBUG, "add_mfc no upcall o %x g %x p %x\n",
 1204                             ntohl(mfccp->mfcc_origin.s_addr),
 1205                             ntohl(mfccp->mfcc_mcastgrp.s_addr),
 1206                             mfccp->mfcc_parent);
 1207 
 1208                 LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
 1209                         if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
 1210                             in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) {
 1211                                 init_mfc_params(rt, mfccp);
 1212                                 if (rt->mfc_expire)
 1213                                         nexpire[hash]--;
 1214                                 rt->mfc_expire = 0;
 1215                                 break; /* XXX */
 1216                         }
 1217                 }
 1218                 if (rt == NULL) {       /* no upcall, so make a new entry */
 1219                         rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE,
 1220                             M_NOWAIT);
 1221                         if (rt == NULL) {
 1222                                 splx(s);
 1223                                 return (ENOBUFS);
 1224                         }
 1225 
 1226                         init_mfc_params(rt, mfccp);
 1227                         rt->mfc_expire  = 0;
 1228                         rt->mfc_stall   = NULL;
 1229                         rt->mfc_bw_meter = NULL;
 1230 
 1231                         /* insert new entry at head of hash chain */
 1232                         LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash);
 1233                 }
 1234         }
 1235 
 1236         splx(s);
 1237         return (0);
 1238 }
 1239 
 1240 #ifdef UPCALL_TIMING
 1241 /*
 1242  * collect delay statistics on the upcalls
 1243  */
 1244 static void
 1245 collate(struct timeval *t)
 1246 {
 1247         u_int32_t d;
 1248         struct timeval tp;
 1249         u_int32_t delta;
 1250 
 1251         microtime(&tp);
 1252 
 1253         if (timercmp(t, &tp, <)) {
 1254                 TV_DELTA(tp, *t, delta);
 1255 
 1256                 d = delta >> 10;
 1257                 if (d > 50)
 1258                         d = 50;
 1259 
 1260                 ++upcall_data[d];
 1261         }
 1262 }
 1263 #endif /* UPCALL_TIMING */
 1264 
 1265 /*
 1266  * Delete an mfc entry
 1267  */
 1268 static int
 1269 del_mfc(struct mbuf *m)
 1270 {
 1271         struct mfcctl2 mfcctl2;
 1272         struct mfcctl2 *mfccp;
 1273         struct mfc *rt;
 1274         int s;
 1275         int mfcctl_size = sizeof(struct mfcctl);
 1276         struct mfcctl *mp = mtod(m, struct mfcctl *);
 1277 
 1278         /*
 1279          * XXX: for deleting MFC entries the information in entries
 1280          * of size "struct mfcctl" is sufficient.
 1281          */
 1282 
 1283         if (m == NULL || m->m_len < mfcctl_size)
 1284                 return (EINVAL);
 1285 
 1286         bcopy(mp, (caddr_t)&mfcctl2, sizeof(*mp));
 1287         bzero((caddr_t)&mfcctl2 + sizeof(struct mfcctl),
 1288             sizeof(mfcctl2) - sizeof(struct mfcctl));
 1289 
 1290         mfccp = &mfcctl2;
 1291 
 1292         if (mrtdebug & DEBUG_MFC)
 1293                 log(LOG_DEBUG, "del_mfc origin %x mcastgrp %x\n",
 1294                     ntohl(mfccp->mfcc_origin.s_addr),
 1295                     ntohl(mfccp->mfcc_mcastgrp.s_addr));
 1296 
 1297         s = splsoftnet();
 1298 
 1299         rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp);
 1300         if (rt == NULL) {
 1301                 splx(s);
 1302                 return (EADDRNOTAVAIL);
 1303         }
 1304 
 1305         /*
 1306          * free the bw_meter entries
 1307          */
 1308         free_bw_list(rt->mfc_bw_meter);
 1309         rt->mfc_bw_meter = NULL;
 1310 
 1311         LIST_REMOVE(rt, mfc_hash);
 1312         free(rt, M_MRTABLE);
 1313 
 1314         splx(s);
 1315         return (0);
 1316 }
 1317 
 1318 static int
 1319 socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src)
 1320 {
 1321         if (s != NULL) {
 1322                 if (sbappendaddr(&s->so_rcv, sintosa(src), mm,
 1323                     (struct mbuf *)NULL) != 0) {
 1324                         sorwakeup(s);
 1325                         return (0);
 1326                 }
 1327         }
 1328         m_freem(mm);
 1329         return (-1);
 1330 }
 1331 
 1332 /*
 1333  * IP multicast forwarding function. This function assumes that the packet
 1334  * pointed to by "ip" has arrived on (or is about to be sent to) the interface
 1335  * pointed to by "ifp", and the packet is to be relayed to other networks
 1336  * that have members of the packet's destination IP multicast group.
 1337  *
 1338  * The packet is returned unscathed to the caller, unless it is
 1339  * erroneous, in which case a non-zero return value tells the caller to
 1340  * discard it.
 1341  */
 1342 
 1343 #define IP_HDR_LEN  20  /* # bytes of fixed IP header (excluding options) */
 1344 #define TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */
 1345 
 1346 int
 1347 #ifdef RSVP_ISI
 1348 ip_mforward(struct mbuf *m, struct ifnet *ifp, struct ip_moptions *imo)
 1349 #else
 1350 ip_mforward(struct mbuf *m, struct ifnet *ifp)
 1351 #endif /* RSVP_ISI */
 1352 {
 1353         struct ip *ip = mtod(m, struct ip *);
 1354         struct mfc *rt;
 1355         static int srctun = 0;
 1356         struct mbuf *mm;
 1357         int s;
 1358         vifi_t vifi;
 1359 
 1360         if (mrtdebug & DEBUG_FORWARD)
 1361                 log(LOG_DEBUG, "ip_mforward: src %x, dst %x, ifp %p\n",
 1362                     ntohl(ip->ip_src.s_addr), ntohl(ip->ip_dst.s_addr), ifp);
 1363 
 1364         if (ip->ip_hl < (IP_HDR_LEN + TUNNEL_LEN) >> 2 ||
 1365             ((u_char *)(ip + 1))[1] != IPOPT_LSRR) {
 1366                 /*
 1367                  * Packet arrived via a physical interface or
 1368                  * an encapsulated tunnel or a register_vif.
 1369                  */
 1370         } else {
 1371                 /*
 1372                  * Packet arrived through a source-route tunnel.
 1373                  * Source-route tunnels are no longer supported.
 1374                  */
 1375                 if ((srctun++ % 1000) == 0)
 1376                         log(LOG_ERR, "ip_mforward: received source-routed "
 1377                             "packet from %x\n", ntohl(ip->ip_src.s_addr));
 1378 
 1379                 return (1);
 1380         }
 1381 
 1382 #ifdef RSVP_ISI
 1383         if (imo && ((vifi = imo->imo_multicast_vif) < numvifs)) {
 1384                 if (ip->ip_ttl < 255) {
 1385                         /* compensate for -1 in *_send routines */
 1386                         ip->ip_ttl++;
 1387                 }
 1388                 if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
 1389                         struct vif *vifp = viftable + vifi;
 1390                         printf("Sending IPPROTO_RSVP from %x to %x on "
 1391                             "vif %d (%s%s)\n",
 1392                             ntohl(ip->ip_src), ntohl(ip->ip_dst), vifi,
 1393                             (vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "",
 1394                             vifp->v_ifp->if_xname);
 1395                 }
 1396                 return (ip_mdq(m, ifp, (struct mfc *)NULL, vifi));
 1397         }
 1398         if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
 1399                 printf("Warning: IPPROTO_RSVP from %x to %x without "
 1400                     "vif option\n", ntohl(ip->ip_src), ntohl(ip->ip_dst));
 1401         }
 1402 #endif /* RSVP_ISI */
 1403 
 1404         /*
 1405          * Don't forward a packet with time-to-live of zero or one,
 1406          * or a packet destined to a local-only group.
 1407          */
 1408         if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ip->ip_dst.s_addr))
 1409                 return (0);
 1410 
 1411         /*
 1412          * Determine forwarding vifs from the forwarding cache table
 1413          */
 1414         s = splsoftnet();
 1415         ++mrtstat.mrts_mfc_lookups;
 1416         rt = mfc_find(&ip->ip_src, &ip->ip_dst);
 1417 
 1418         /* Entry exists, so forward if necessary */
 1419         if (rt != NULL) {
 1420                 splx(s);
 1421 #ifdef RSVP_ISI
 1422                 return (ip_mdq(m, ifp, rt, -1));
 1423 #else
 1424                 return (ip_mdq(m, ifp, rt));
 1425 #endif /* RSVP_ISI */
 1426         } else {
 1427                 /*
 1428                  * If we don't have a route for packet's origin,
 1429                  * Make a copy of the packet & send message to routing daemon
 1430                  */
 1431 
 1432                 struct mbuf *mb0;
 1433                 struct rtdetq *rte;
 1434                 u_int32_t hash;
 1435                 int hlen = ip->ip_hl << 2;
 1436 #ifdef UPCALL_TIMING
 1437                 struct timeval tp;
 1438 
 1439                 microtime(&tp);
 1440 #endif /* UPCALL_TIMING */
 1441 
 1442                 ++mrtstat.mrts_mfc_misses;
 1443 
 1444                 mrtstat.mrts_no_route++;
 1445                 if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC))
 1446                         log(LOG_DEBUG, "ip_mforward: no rte s %x g %x\n",
 1447                             ntohl(ip->ip_src.s_addr),
 1448                             ntohl(ip->ip_dst.s_addr));
 1449 
 1450                 /*
 1451                  * Allocate mbufs early so that we don't do extra work if we are
 1452                  * just going to fail anyway.  Make sure to pullup the header so
 1453                  * that other people can't step on it.
 1454                  */
 1455                 rte = (struct rtdetq *)malloc(sizeof(*rte),
 1456                     M_MRTABLE, M_NOWAIT);
 1457                 if (rte == NULL) {
 1458                         splx(s);
 1459                         return (ENOBUFS);
 1460                 }
 1461                 mb0 = m_copy(m, 0, M_COPYALL);
 1462                 M_PULLUP(mb0, hlen);
 1463                 if (mb0 == NULL) {
 1464                         free(rte, M_MRTABLE);
 1465                         splx(s);
 1466                         return (ENOBUFS);
 1467                 }
 1468 
 1469                 /* is there an upcall waiting for this flow? */
 1470                 hash = MFCHASH(ip->ip_src, ip->ip_dst);
 1471                 LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
 1472                         if (in_hosteq(ip->ip_src, rt->mfc_origin) &&
 1473                             in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) &&
 1474                             rt->mfc_stall != NULL)
 1475                                 break;
 1476                 }
 1477 
 1478                 if (rt == NULL) {
 1479                         int i;
 1480                         struct igmpmsg *im;
 1481 
 1482                         /*
 1483                          * Locate the vifi for the incoming interface for
 1484                          * this packet.
 1485                          * If none found, drop packet.
 1486                          */
 1487                         for (vifi = 0; vifi < numvifs &&
 1488                                  viftable[vifi].v_ifp != ifp; vifi++)
 1489                                 ;
 1490                         if (vifi >= numvifs) /* vif not found, drop packet */
 1491                                 goto non_fatal;
 1492 
 1493                         /* no upcall, so make a new entry */
 1494                         rt = (struct mfc *)malloc(sizeof(*rt),
 1495                             M_MRTABLE, M_NOWAIT);
 1496                         if (rt == NULL)
 1497                                 goto fail;
 1498                         /*
 1499                          * Make a copy of the header to send to the user level
 1500                          * process
 1501                          */
 1502                         mm = m_copy(m, 0, hlen);
 1503                         M_PULLUP(mm, hlen);
 1504                         if (mm == NULL)
 1505                                 goto fail1;
 1506 
 1507                         /*
 1508                          * Send message to routing daemon to install
 1509                          * a route into the kernel table
 1510                          */
 1511 
 1512                         im = mtod(mm, struct igmpmsg *);
 1513                         im->im_msgtype = IGMPMSG_NOCACHE;
 1514                         im->im_mbz = 0;
 1515                         im->im_vif = vifi;
 1516 
 1517                         mrtstat.mrts_upcalls++;
 1518 
 1519                         sin.sin_addr = ip->ip_src;
 1520                         if (socket_send(ip_mrouter, mm, &sin) < 0) {
 1521                                 log(LOG_WARNING, "ip_mforward: ip_mrouter "
 1522                                     "socket queue full\n");
 1523                                 ++mrtstat.mrts_upq_sockfull;
 1524                         fail1:
 1525                                 free(rt, M_MRTABLE);
 1526                         fail:
 1527                                 free(rte, M_MRTABLE);
 1528                                 m_freem(mb0);
 1529                                 splx(s);
 1530                                 return (ENOBUFS);
 1531                         }
 1532 
 1533                         /* insert new entry at head of hash chain */
 1534                         rt->mfc_origin = ip->ip_src;
 1535                         rt->mfc_mcastgrp = ip->ip_dst;
 1536                         rt->mfc_pkt_cnt = 0;
 1537                         rt->mfc_byte_cnt = 0;
 1538                         rt->mfc_wrong_if = 0;
 1539                         rt->mfc_expire = UPCALL_EXPIRE;
 1540                         nexpire[hash]++;
 1541                         for (i = 0; i < numvifs; i++) {
 1542                                 rt->mfc_ttls[i] = 0;
 1543                                 rt->mfc_flags[i] = 0;
 1544                         }
 1545                         rt->mfc_parent = -1;
 1546 
 1547                         /* clear the RP address */
 1548                         rt->mfc_rp = zeroin_addr;
 1549 
 1550                         rt->mfc_bw_meter = NULL;
 1551 
 1552                         /* link into table */
 1553                         LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash);
 1554                         /* Add this entry to the end of the queue */
 1555                         rt->mfc_stall = rte;
 1556                 } else {
 1557                         /* determine if q has overflowed */
 1558                         struct rtdetq **p;
 1559                         int npkts = 0;
 1560 
 1561                         /*
 1562                          * XXX ouch! we need to append to the list, but we
 1563                          * only have a pointer to the front, so we have to
 1564                          * scan the entire list every time.
 1565                          */
 1566                         for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next)
 1567                                 if (++npkts > MAX_UPQ) {
 1568                                         mrtstat.mrts_upq_ovflw++;
 1569                                 non_fatal:
 1570                                         free(rte, M_MRTABLE);
 1571                                         m_freem(mb0);
 1572                                         splx(s);
 1573                                         return (0);
 1574                                 }
 1575 
 1576                         /* Add this entry to the end of the queue */
 1577                         *p = rte;
 1578                 }
 1579 
 1580                 rte->next = NULL;
 1581                 rte->m = mb0;
 1582                 rte->ifp = ifp;
 1583         #ifdef UPCALL_TIMING
 1584                 rte->t = tp;
 1585         #endif /* UPCALL_TIMING */
 1586 
 1587                 splx(s);
 1588 
 1589                 return (0);
 1590         }
 1591 }
 1592 
 1593 
 1594 /*ARGSUSED*/
 1595 static void
 1596 expire_upcalls(void *v)
 1597 {
 1598         int i;
 1599         int s;
 1600 
 1601         s = splsoftnet();
 1602 
 1603         for (i = 0; i < MFCTBLSIZ; i++) {
 1604                 struct mfc *rt, *nrt;
 1605 
 1606                 if (nexpire[i] == 0)
 1607                         continue;
 1608 
 1609                 for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) {
 1610                         nrt = LIST_NEXT(rt, mfc_hash);
 1611 
 1612                         if (rt->mfc_expire == 0 || --rt->mfc_expire > 0)
 1613                                 continue;
 1614                         nexpire[i]--;
 1615 
 1616                         /*
 1617                          * free the bw_meter entries
 1618                          */
 1619                         while (rt->mfc_bw_meter != NULL) {
 1620                                 struct bw_meter *x = rt->mfc_bw_meter;
 1621 
 1622                                 rt->mfc_bw_meter = x->bm_mfc_next;
 1623                                 free(x, M_BWMETER);
 1624                         }
 1625 
 1626                         ++mrtstat.mrts_cache_cleanups;
 1627                         if (mrtdebug & DEBUG_EXPIRE)
 1628                                 log(LOG_DEBUG,
 1629                                     "expire_upcalls: expiring (%x %x)\n",
 1630                                     ntohl(rt->mfc_origin.s_addr),
 1631                                     ntohl(rt->mfc_mcastgrp.s_addr));
 1632 
 1633                         expire_mfc(rt);
 1634                 }
 1635         }
 1636 
 1637         splx(s);
 1638         timeout_add(&expire_upcalls_ch, EXPIRE_TIMEOUT);
 1639 }
 1640 
 1641 /*
 1642  * Packet forwarding routine once entry in the cache is made
       *
       * Arguments:
       *   m       - the packet; the start of its data is read as a struct ip.
       *   ifp     - interface that is compared against the parent vif's v_ifp
       *             to decide whether the packet came in where it should.
       *   rt      - the multicast forwarding cache entry used for the packet.
       *   xmt_vif - (RSVP_ISI builds only) when smaller than numvifs, the
       *             packet is sent on that single vif and 1 is returned.
       *
       * Return value:
       *   0       - normal completion, including the cases where the packet
       *             is ignored because it arrived on the wrong interface.
       *   ENOBUFS - the IGMPMSG_WRONGVIF upcall could not be built (no mbuf)
       *             or socket_send() reported failure.
       *   1       - RSVP_ISI directed send was performed.
       *
       * ip_mdq itself never frees m.
       * NOTE(review): phyint_send() and encap_send() work on copies of m;
       * whether pim_register_send() does too is not visible here -- confirm.
 1643  */
 1644 static int
 1645 #ifdef RSVP_ISI
 1646 ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif)
 1647 #else
 1648 ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt)
 1649 #endif /* RSVP_ISI */
 1650 {
 1651         struct ip  *ip = mtod(m, struct ip *);
 1652         vifi_t vifi;
 1653         struct vif *vifp;
              /* payload length: total IP length minus the IP header length */
 1654         int plen = ntohs(ip->ip_len) - (ip->ip_hl << 2);
 1655 
 1656 /*
 1657  * Macro to send packet on vif.  Since RSVP packets don't get counted on
 1658  * input, they shouldn't get counted on output, so statistics keeping is
 1659  * separate.
       *
       * The macro is not #undef'd inside this function.
 1660  */
 1661 #define MC_SEND(ip, vifp, m) do {                                       \
 1662         if ((vifp)->v_flags & VIFF_TUNNEL)                              \
 1663                 encap_send((ip), (vifp), (m));                          \
 1664         else                                                            \
 1665                 phyint_send((ip), (vifp), (m));                         \
 1666 } while (/*CONSTCOND*/ 0)
 1667 
 1668 #ifdef RSVP_ISI
 1669         /*
 1670          * If xmt_vif is not -1, send on only the requested vif.
 1671          *
 1672          * (since vifi_t is u_short, -1 becomes MAXUSHORT, which is > numvifs.)
 1673          */
 1674         if (xmt_vif < numvifs) {
 1675 #ifdef PIM
 1676                 if (viftable[xmt_vif].v_flags & VIFF_REGISTER)
 1677                         pim_register_send(ip, viftable + xmt_vif, m, rt);
 1678                 else
 1679 #endif
 1680                 MC_SEND(ip, viftable + xmt_vif, m);
 1681                 return (1);
 1682         }
 1683 #endif /* RSVP_ISI */
 1684 
 1685         /*
 1686          * Don't forward if it didn't arrive from the parent vif for its origin.
 1687          */
 1688         vifi = rt->mfc_parent;
 1689         if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) {
 1690                 /* came in the wrong interface */
 1691                 if (mrtdebug & DEBUG_FORWARD)
 1692                         log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n",
 1693                             ifp, vifi,
 1694                             vifi >= numvifs ? 0 : viftable[vifi].v_ifp);
 1695                 ++mrtstat.mrts_wrong_if;
 1696                 ++rt->mfc_wrong_if;
 1697                 /*
 1698                  * If we are doing PIM assert processing, send a message
 1699                  * to the routing daemon.
 1700                  *
 1701                  * XXX: A PIM-SM router needs the WRONGVIF detection so it
 1702                  * can complete the SPT switch, regardless of the type
 1703                  * of interface (broadcast media, GRE tunnel, etc).
 1704                  */
 1705                 if (pim_assert && (vifi < numvifs) && viftable[vifi].v_ifp) {
 1706                         struct timeval now;
 1707                         u_int32_t delta;
 1708 
 1709 #ifdef PIM
 1710                         if (ifp == &multicast_register_if)
 1711                                 pimstat.pims_rcv_registers_wrongiif++;
 1712 #endif
 1713 
 1714                         /* Get vifi for the incoming packet */
                              /*
                               * vifi is reused here: after this loop it no
                               * longer names the parent vif but the vif whose
                               * v_ifp equals ifp.
                               */
 1715                         for (vifi = 0;
 1716                              vifi < numvifs && viftable[vifi].v_ifp != ifp;
 1717                              vifi++)
 1718                             ;
 1719                         if (vifi >= numvifs) {
 1720                                 /* The iif is not found: ignore the packet. */
 1721                                 return (0);
 1722                         }
 1723 
 1724                         if (rt->mfc_flags[vifi] &
 1725                             MRT_MFC_FLAGS_DISABLE_WRONGVIF) {
 1726                                 /* WRONGVIF disabled: ignore the packet */
 1727                                 return (0);
 1728                         }
 1729 
 1730                         microtime(&now);
 1731 
 1732                         TV_DELTA(rt->mfc_last_assert, now, delta);
 1733 
                              /*
                               * Rate-limit the upcall: one is only sent when
                               * delta exceeds ASSERT_MSG_TIME.
                               */
 1734                         if (delta > ASSERT_MSG_TIME) {
 1735                                 struct igmpmsg *im;
 1736                                 int hlen = ip->ip_hl << 2;
                                      /* only the IP header is copied for the upcall */
 1737                                 struct mbuf *mm = m_copy(m, 0, hlen);
 1738 
                                      /*
                                       * The NULL test follows M_PULLUP, so it
                                       * covers both steps.
                                       * NOTE(review): this relies on M_PULLUP
                                       * accepting a NULL mbuf -- confirm
                                       * against the macro definition.
                                       */
 1739                                 M_PULLUP(mm, hlen);
 1740                                 if (mm == NULL)
 1741                                         return (ENOBUFS);
 1742 
 1743                                 rt->mfc_last_assert = now;
 1744 
                                      /* the copied IP header is overlaid with an igmpmsg */
 1745                                 im = mtod(mm, struct igmpmsg *);
 1746                                 im->im_msgtype  = IGMPMSG_WRONGVIF;
 1747                                 im->im_mbz      = 0;
 1748                                 im->im_vif      = vifi;
 1749 
 1750                                 mrtstat.mrts_upcalls++;
 1751 
                                      /*
                                       * sin is not declared in this function;
                                       * presumably a file-scope sockaddr_in --
                                       * confirm.
                                       */
 1752                                 sin.sin_addr = im->im_src;
 1753                                 if (socket_send(ip_mrouter, mm, &sin) < 0) {
 1754                                         log(LOG_WARNING, "ip_mforward: "
 1755                                             "ip_mrouter socket queue full\n");
 1756                                         ++mrtstat.mrts_upq_sockfull;
 1757                                         return (ENOBUFS);
 1758                                 }
 1759                         }
 1760                 }
 1761                 return (0);
 1762         }
 1763 
 1764         /* If I sourced this packet, it counts as output, else it was input. */
 1765         if (in_hosteq(ip->ip_src, viftable[vifi].v_lcl_addr)) {
 1766                 viftable[vifi].v_pkt_out++;
 1767                 viftable[vifi].v_bytes_out += plen;
 1768         } else {
 1769                 viftable[vifi].v_pkt_in++;
 1770                 viftable[vifi].v_bytes_in += plen;
 1771         }
 1772         rt->mfc_pkt_cnt++;
 1773         rt->mfc_byte_cnt += plen;
 1774 
 1775         /*
 1776          * For each vif, decide if a copy of the packet should be forwarded.
 1777          * Forward if:
 1778          *              - the ttl exceeds the vif's threshold
 1779          *              - there are group members downstream on interface
               *
               * In code terms: the entry's mfc_ttls[vifi] must be non-zero and
               * strictly smaller than the packet's ip_ttl.
 1780          */
 1781         for (vifp = viftable, vifi = 0; vifi < numvifs; vifp++, vifi++)
 1782                 if ((rt->mfc_ttls[vifi] > 0) &&
 1783                         (ip->ip_ttl > rt->mfc_ttls[vifi])) {
 1784                         vifp->v_pkt_out++;
 1785                         vifp->v_bytes_out += plen;
 1786 #ifdef PIM
 1787                         if (vifp->v_flags & VIFF_REGISTER)
 1788                                 pim_register_send(ip, vifp, m, rt);
 1789                         else
 1790 #endif
 1791                         MC_SEND(ip, vifp, m);
 1792                 }
 1793 
 1794         /*
 1795          * Perform upcall-related bw measuring.
               * Every bw_meter attached to this entry is fed the payload
               * length and the same timestamp.
 1796          */
 1797         if (rt->mfc_bw_meter != NULL) {
 1798                 struct bw_meter *x;
 1799                 struct timeval now;
 1800 
 1801                 microtime(&now);
 1802                 for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next)
 1803                         bw_meter_receive_packet(x, plen, &now);
 1804         }
 1805 
 1806         return (0);
 1807 }
 1808 
 1809 #ifdef RSVP_ISI
 1810 /*
 1811  * check if a vif number is legal/ok. This is used by ip_output.
 1812  */
 1813 int
 1814 legal_vif_num(int vif)
 1815 {
 1816         if (vif >= 0 && vif < numvifs)
 1817                 return (1);
 1818         else
 1819                 return (0);
 1820 }
 1821 #endif /* RSVP_ISI */
 1822 
 1823 static void
 1824 phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
 1825 {
 1826         struct mbuf *mb_copy;
 1827         int hlen = ip->ip_hl << 2;
 1828 
 1829         /*
 1830          * Make a new reference to the packet; make sure that
 1831          * the IP header is actually copied, not just referenced,
 1832          * so that ip_output() only scribbles on the copy.
 1833          */
 1834         mb_copy = m_copy(m, 0, M_COPYALL);
 1835         M_PULLUP(mb_copy, hlen);
 1836         if (mb_copy == NULL)
 1837                 return;
 1838 
 1839         if (vifp->v_rate_limit <= 0)
 1840                 tbf_send_packet(vifp, mb_copy);
 1841         else
 1842                 tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *),
 1843                     ntohs(ip->ip_len));
 1844 }
 1845 
 1846 static void
 1847 encap_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
 1848 {
 1849         struct mbuf *mb_copy;
 1850         struct ip *ip_copy;
 1851         int i, len = ntohs(ip->ip_len) + sizeof(multicast_encap_iphdr);
 1852 
 1853         /* Take care of delayed checksums */
 1854         if (m->m_pkthdr.csum_flags & (M_TCPV4_CSUM_OUT | M_UDPV4_CSUM_OUT)) {
 1855                 in_delayed_cksum(m);
 1856                 m->m_pkthdr.csum_flags &=
 1857                     ~(M_UDPV4_CSUM_OUT | M_TCPV4_CSUM_OUT);
 1858         }
 1859 
 1860         /*
 1861          * copy the old packet & pullup its IP header into the
 1862          * new mbuf so we can modify it.  Try to fill the new
 1863          * mbuf since if we don't the ethernet driver will.
 1864          */
 1865         MGETHDR(mb_copy, M_DONTWAIT, MT_DATA);
 1866         if (mb_copy == NULL)
 1867                 return;
 1868         mb_copy->m_data += max_linkhdr;
 1869         mb_copy->m_pkthdr.len = len;
 1870         mb_copy->m_len = sizeof(multicast_encap_iphdr);
 1871 
 1872         if ((mb_copy->m_next = m_copy(m, 0, M_COPYALL)) == NULL) {
 1873                 m_freem(mb_copy);
 1874                 return;
 1875         }
 1876         i = MHLEN - max_linkhdr;
 1877         if (i > len)
 1878                 i = len;
 1879         mb_copy = m_pullup(mb_copy, i);
 1880         if (mb_copy == NULL)
 1881                 return;
 1882 
 1883         /*
 1884          * fill in the encapsulating IP header.
 1885          */
 1886         ip_copy = mtod(mb_copy, struct ip *);
 1887         *ip_copy = multicast_encap_iphdr;
 1888         ip_copy->ip_id = htons(ip_randomid());
 1889         ip_copy->ip_len = htons(len);
 1890         ip_copy->ip_src = vifp->v_lcl_addr;
 1891         ip_copy->ip_dst = vifp->v_rmt_addr;
 1892 
 1893         /*
 1894          * turn the encapsulated IP header back into a valid one.
 1895          */
 1896         ip = (struct ip *)((caddr_t)ip_copy + sizeof(multicast_encap_iphdr));
 1897         --ip->ip_ttl;
 1898         ip->ip_sum = 0;
 1899         mb_copy->m_data += sizeof(multicast_encap_iphdr);
 1900         ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
 1901         mb_copy->m_data -= sizeof(multicast_encap_iphdr);
 1902 
 1903         if (vifp->v_rate_limit <= 0)
 1904                 tbf_send_packet(vifp, mb_copy);
 1905         else
 1906                 tbf_control(vifp, mb_copy, ip, ntohs(ip_copy->ip_len));
 1907 }
 1908 
 1909 /*
 1910  * Token bucket filter module
 1911  */
 1912 static void
 1913 tbf_control(struct vif *vifp, struct mbuf *m, struct ip *ip, u_int32_t len)
 1914 {
 1915 
 1916         if (len > MAX_BKT_SIZE) {
 1917                 /* drop if packet is too large */
 1918                 mrtstat.mrts_pkt2large++;
 1919                 m_freem(m);
 1920                 return;
 1921         }
 1922 
 1923         tbf_update_tokens(vifp);
 1924 
 1925         /*
 1926          * If there are enough tokens, and the queue is empty, send this packet
 1927          * out immediately.  Otherwise, try to insert it on this vif's queue.
 1928          */
 1929         if (vifp->tbf_q_len == 0) {
 1930                 if (len <= vifp->tbf_n_tok) {
 1931                         vifp->tbf_n_tok -= len;
 1932                         tbf_send_packet(vifp, m);
 1933                 } else {
 1934                         /* queue packet and timeout till later */
 1935                         tbf_queue(vifp, m);
 1936                         timeout_add(&vifp->v_repq_ch, TBF_REPROCESS);
 1937                 }
 1938         } else {
 1939                 if (vifp->tbf_q_len >= vifp->tbf_max_q_len &&
 1940                     !tbf_dq_sel(vifp, ip)) {
 1941                         /* queue full, and couldn't make room */
 1942                         mrtstat.mrts_q_overflow++;
 1943                         m_freem(m);
 1944                 } else {
 1945                         /* queue length low enough, or made room */
 1946                         tbf_queue(vifp, m);
 1947                         tbf_process_q(vifp);
 1948                 }
 1949         }
 1950 }
 1951 
 1952 /*
 1953  * adds a packet to the queue at the interface
 1954  */
 1955 static void
 1956 tbf_queue(struct vif *vifp, struct mbuf *m)
 1957 {
 1958         int s = splsoftnet();
 1959 
 1960         /* insert at tail */
 1961         *vifp->tbf_t = m;
 1962         vifp->tbf_t = &m->m_nextpkt;
 1963         vifp->tbf_q_len++;
 1964 
 1965         splx(s);
 1966 }
 1967 
 1968 
 1969 /*
 1970  * processes the queue at the interface
 1971  */
 1972 static void
 1973 tbf_process_q(struct vif *vifp)
 1974 {
 1975         struct mbuf *m;
 1976         int len;
 1977         int s = splsoftnet();
 1978 
 1979         /*
 1980          * Loop through the queue at the interface and send as many packets
 1981          * as possible.
 1982          */
 1983         for (m = vifp->tbf_q; m != NULL; m = vifp->tbf_q) {
 1984                 len = ntohs(mtod(m, struct ip *)->ip_len);
 1985 
 1986                 /* determine if the packet can be sent */
 1987                 if (len <= vifp->tbf_n_tok) {
 1988                         /* if so,
 1989                          * reduce no of tokens, dequeue the packet,
 1990                          * send the packet.
 1991                          */
 1992                         if ((vifp->tbf_q = m->m_nextpkt) == NULL)
 1993                                 vifp->tbf_t = &vifp->tbf_q;
 1994                         --vifp->tbf_q_len;
 1995 
 1996                         m->m_nextpkt = NULL;
 1997                         vifp->tbf_n_tok -= len;
 1998                         tbf_send_packet(vifp, m);
 1999                 } else
 2000                         break;
 2001         }
 2002         splx(s);
 2003 }
 2004 
 2005 static void
 2006 tbf_reprocess_q(void *arg)
 2007 {
 2008         struct vif *vifp = arg;
 2009 
 2010         if (ip_mrouter == NULL)
 2011                 return;
 2012 
 2013         tbf_update_tokens(vifp);
 2014         tbf_process_q(vifp);
 2015 
 2016         if (vifp->tbf_q_len != 0)
 2017                 timeout_add(&vifp->v_repq_ch, TBF_REPROCESS);
 2018 }
 2019 
 2020 /* function that will selectively discard a member of the queue
 2021  * based on the precedence value and the priority
 2022  */
 2023 static int
 2024 tbf_dq_sel(struct vif *vifp, struct ip *ip)
 2025 {
 2026         u_int p;
 2027         struct mbuf **mp, *m;
 2028         int s = splsoftnet();
 2029 
 2030         p = priority(vifp, ip);
 2031 
 2032         for (mp = &vifp->tbf_q, m = *mp;
 2033             m != NULL;
 2034             mp = &m->m_nextpkt, m = *mp) {
 2035                 if (p > priority(vifp, mtod(m, struct ip *))) {
 2036                         if ((*mp = m->m_nextpkt) == NULL)
 2037                                 vifp->tbf_t = mp;
 2038                         --vifp->tbf_q_len;
 2039 
 2040                         m_freem(m);
 2041                         mrtstat.mrts_drop_sel++;
 2042                         splx(s);
 2043                         return (1);
 2044                 }
 2045         }
 2046         splx(s);
 2047         return (0);
 2048 }
 2049 
 2050 static void
 2051 tbf_send_packet(struct vif *vifp, struct mbuf *m)
 2052 {
 2053         int error;
 2054         int s = splsoftnet();
 2055 
 2056         if (vifp->v_flags & VIFF_TUNNEL) {
 2057                 /* If tunnel options */
 2058                 ip_output(m, (struct mbuf *)NULL, &vifp->v_route,
 2059                     IP_FORWARDING, (struct ip_moptions *)NULL,
 2060                     (struct inpcb *)NULL);
 2061         } else {
 2062                 /*
 2063                  * if physical interface option, extract the options
 2064                  * and then send
 2065                  */
 2066                 struct ip_moptions imo;
 2067 
 2068                 imo.imo_multicast_ifp = vifp->v_ifp;
 2069                 imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1;
 2070                 imo.imo_multicast_loop = 1;
 2071 #ifdef RSVP_ISI
 2072                 imo.imo_multicast_vif = -1;
 2073 #endif
 2074 
 2075                 error = ip_output(m, (struct mbuf *)NULL, (struct route *)NULL,
 2076                     IP_FORWARDING|IP_MULTICASTOPTS, &imo,
 2077                     (struct inpcb *)NULL);
 2078 
 2079                 if (mrtdebug & DEBUG_XMIT)
 2080                         log(LOG_DEBUG, "phyint_send on vif %ld err %d\n",
 2081                             (long)(vifp - viftable), error);
 2082         }
 2083         splx(s);
 2084 }
 2085 
 2086 /* determine the current time and then
 2087  * the elapsed time (between the last time and time now)
 2088  * in milliseconds & update the no. of tokens in the bucket
 2089  */
 2090 static void
 2091 tbf_update_tokens(struct vif *vifp)
 2092 {
 2093         struct timeval tp;
 2094         u_int32_t tm;
 2095         int s = splsoftnet();
 2096 
 2097         microtime(&tp);
 2098 
 2099         TV_DELTA(tp, vifp->tbf_last_pkt_t, tm);
 2100 
 2101         /*
 2102          * This formula is actually
 2103          * "time in seconds" * "bytes/second".
 2104          *
 2105          * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8)
 2106          *
 2107          * The (1000/1024) was introduced in add_vif to optimize
 2108          * this divide into a shift.
 2109          */
 2110         vifp->tbf_n_tok += tm * vifp->v_rate_limit / 8192;
 2111         vifp->tbf_last_pkt_t = tp;
 2112 
 2113         if (vifp->tbf_n_tok > MAX_BKT_SIZE)
 2114                 vifp->tbf_n_tok = MAX_BKT_SIZE;
 2115 
 2116         splx(s);
 2117 }
 2118 
 2119 static int
 2120 priority(struct vif *vifp, struct ip *ip)
 2121 {
 2122         int prio = 50;  /* the lowest priority -- default case */
 2123 
 2124         /* temporary hack; may add general packet classifier some day */
 2125 
 2126         /*
 2127          * The UDP port space is divided up into four priority ranges:
 2128          * [0, 16384)     : unclassified - lowest priority
 2129          * [16384, 32768) : audio - highest priority
 2130          * [32768, 49152) : whiteboard - medium priority
 2131          * [49152, 65536) : video - low priority
 2132          */
 2133         if (ip->ip_p == IPPROTO_UDP) {
 2134                 struct udphdr *udp =
 2135                     (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2));
 2136 
 2137                 switch (ntohs(udp->uh_dport) & 0xc000) {
 2138                 case 0x4000:
 2139                         prio = 70;
 2140                         break;
 2141                 case 0x8000:
 2142                         prio = 60;
 2143                         break;
 2144                 case 0xc000:
 2145                         prio = 55;
 2146                         break;
 2147                 }
 2148 
 2149                 if (tbfdebug > 1)
 2150                         log(LOG_DEBUG, "port %x prio %d\n",
 2151                             ntohs(udp->uh_dport), prio);
 2152         }
 2153 
 2154         return (prio);
 2155 }
 2156 
 2157 /*
 2158  * End of token bucket filter modifications
 2159  */
 2160 #ifdef RSVP_ISI
 2161 int
 2162 ip_rsvp_vif_init(struct socket *so, struct mbuf *m)
 2163 {
 2164         int vifi, s;
 2165 
 2166         if (rsvpdebug)
 2167                 printf("ip_rsvp_vif_init: so_type = %d, pr_protocol = %d\n",
 2168                     so->so_type, so->so_proto->pr_protocol);
 2169 
 2170         if (so->so_type != SOCK_RAW ||
 2171             so->so_proto->pr_protocol != IPPROTO_RSVP)
 2172                 return (EOPNOTSUPP);
 2173 
 2174         /* Check mbuf. */
 2175         if (m == NULL || m->m_len != sizeof(int)) {
 2176                 return (EINVAL);
 2177         }
 2178         vifi = *(mtod(m, int *));
 2179 
 2180         if (rsvpdebug)
 2181                 printf("ip_rsvp_vif_init: vif = %d rsvp_on = %d\n",
 2182                     vifi, rsvp_on);
 2183 
 2184         s = splsoftnet();
 2185 
 2186         /* Check vif. */
 2187         if (!legal_vif_num(vifi)) {
 2188                 splx(s);
 2189                 return (EADDRNOTAVAIL);
 2190         }
 2191 
 2192         /* Check if socket is available. */
 2193         if (viftable[vifi].v_rsvpd != NULL) {
 2194                 splx(s);
 2195                 return (EADDRINUSE);
 2196         }
 2197 
 2198         viftable[vifi].v_rsvpd = so;
 2199         /* This may seem silly, but we need to be sure we don't over-increment
 2200          * the RSVP counter, in case something slips up.
 2201          */
 2202         if (!viftable[vifi].v_rsvp_on) {
 2203                 viftable[vifi].v_rsvp_on = 1;
 2204                 rsvp_on++;
 2205         }
 2206 
 2207         splx(s);
 2208         return (0);
 2209 }
 2210 
 2211 int
 2212 ip_rsvp_vif_done(struct socket *so, struct mbuf *m)
 2213 {
 2214         int vifi, s;
 2215 
 2216         if (rsvpdebug)
 2217                 printf("ip_rsvp_vif_done: so_type = %d, pr_protocol = %d\n",
 2218                     so->so_type, so->so_proto->pr_protocol);
 2219 
 2220         if (so->so_type != SOCK_RAW ||
 2221             so->so_proto->pr_protocol != IPPROTO_RSVP)
 2222                 return (EOPNOTSUPP);
 2223 
 2224         /* Check mbuf. */
 2225         if (m == NULL || m->m_len != sizeof(int)) {
 2226                 return (EINVAL);
 2227         }
 2228         vifi = *(mtod(m, int *));
 2229 
 2230         s = splsoftnet();
 2231 
 2232         /* Check vif. */
 2233         if (!legal_vif_num(vifi)) {
 2234                 splx(s);
 2235                 return (EADDRNOTAVAIL);
 2236         }
 2237 
 2238         if (rsvpdebug)
 2239                 printf("ip_rsvp_vif_done: v_rsvpd = %x so = %x\n",
 2240                     viftable[vifi].v_rsvpd, so);
 2241 
 2242         viftable[vifi].v_rsvpd = NULL;
 2243         /*
 2244          * This may seem silly, but we need to be sure we don't over-decrement
 2245          * the RSVP counter, in case something slips up.
 2246          */
 2247         if (viftable[vifi].v_rsvp_on) {
 2248                 viftable[vifi].v_rsvp_on = 0;
 2249                 rsvp_on--;
 2250         }
 2251 
 2252         splx(s);
 2253         return (0);
 2254 }
 2255 
 2256 void
 2257 ip_rsvp_force_done(struct socket *so)
 2258 {
 2259         int vifi, s;
 2260 
 2261         /* Don't bother if it is not the right type of socket. */
 2262         if (so->so_type != SOCK_RAW ||
 2263             so->so_proto->pr_protocol != IPPROTO_RSVP)
 2264                 return;
 2265 
 2266         s = splsoftnet();
 2267 
 2268         /*
 2269          * The socket may be attached to more than one vif...this
 2270          * is perfectly legal.
 2271          */
 2272         for (vifi = 0; vifi < numvifs; vifi++) {
 2273                 if (viftable[vifi].v_rsvpd == so) {
 2274                         viftable[vifi].v_rsvpd = NULL;
 2275                         /*
 2276                          * This may seem silly, but we need to be sure we don't
 2277                          * over-decrement the RSVP counter, in case something
 2278                          * slips up.
 2279                          */
 2280                         if (viftable[vifi].v_rsvp_on) {
 2281                                 viftable[vifi].v_rsvp_on = 0;
 2282                                 rsvp_on--;
 2283                         }
 2284                 }
 2285         }
 2286 
 2287         splx(s);
 2288         return;
 2289 }
 2290 
 2291 void
 2292 rsvp_input(struct mbuf *m, struct ifnet *ifp)
 2293 {
 2294         int vifi, s;
 2295         struct ip *ip = mtod(m, struct ip *);
 2296         static struct sockaddr_in rsvp_src = { sizeof(sin), AF_INET };
 2297 
 2298         if (rsvpdebug)
 2299                 printf("rsvp_input: rsvp_on %d\n", rsvp_on);
 2300 
 2301         /*
 2302          * Can still get packets with rsvp_on = 0 if there is a local member
 2303          * of the group to which the RSVP packet is addressed.  But in this
 2304          * case we want to throw the packet away.
 2305          */
 2306         if (!rsvp_on) {
 2307                 m_freem(m);
 2308                 return;
 2309         }
 2310 
 2311         /*
 2312          * If the old-style non-vif-associated socket is set, then use
 2313          * it and ignore the new ones.
 2314          */
 2315         if (ip_rsvpd != NULL) {
 2316                 if (rsvpdebug)
 2317                         printf("rsvp_input: "
 2318                             "Sending packet up old-style socket\n");
 2319                 rip_input(m, 0);        /*XXX*/
 2320                 return;
 2321         }
 2322 
 2323         s = splsoftnet();
 2324 
 2325         if (rsvpdebug)
 2326                 printf("rsvp_input: check vifs\n");
 2327 
 2328         /* Find which vif the packet arrived on. */
 2329         for (vifi = 0; vifi < numvifs; vifi++) {
 2330                 if (viftable[vifi].v_ifp == ifp)
 2331                         break;
 2332         }
 2333 
 2334         if (vifi == numvifs) {
 2335                 /* Can't find vif packet arrived on. Drop packet. */
 2336                 if (rsvpdebug)
 2337                         printf("rsvp_input: "
 2338                             "Can't find vif for packet...dropping it.\n");
 2339                 m_freem(m);
 2340                 splx(s);
 2341                 return;
 2342         }
 2343 
 2344         if (rsvpdebug)
 2345                 printf("rsvp_input: check socket\n");
 2346 
 2347         if (viftable[vifi].v_rsvpd == NULL) {
 2348                 /*
 2349                  * drop packet, since there is no specific socket for this
 2350                  * interface
 2351                  */
 2352                 if (rsvpdebug)
 2353                         printf("rsvp_input: No socket defined for vif %d\n",
 2354                             vifi);
 2355                 m_freem(m);
 2356                 splx(s);
 2357                 return;
 2358         }
 2359 
 2360         rsvp_src.sin_addr = ip->ip_src;
 2361 
 2362         if (rsvpdebug && m)
 2363                 printf("rsvp_input: m->m_len = %d, sbspace() = %d\n",
 2364                     m->m_len, sbspace(&viftable[vifi].v_rsvpd->so_rcv));
 2365 
 2366         if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0)
 2367                 if (rsvpdebug)
 2368                         printf("rsvp_input: Failed to append to socket\n");
 2369         else
 2370                 if (rsvpdebug)
 2371                         printf("rsvp_input: send packet up\n");
 2372 
 2373         splx(s);
 2374 }
 2375 #endif /* RSVP_ISI */
 2376 
 2377 /*
 2378  * Code for bandwidth monitors
 2379  */
 2380 
 2381 /*
 2382  * Define common interface for timeval-related methods
 2383  */
 2384 #define BW_TIMEVALCMP(tvp, uvp, cmp) timercmp((tvp), (uvp), cmp)
 2385 #define BW_TIMEVALDECR(vvp, uvp) timersub((vvp), (uvp), (vvp))
 2386 #define BW_TIMEVALADD(vvp, uvp) timeradd((vvp), (uvp), (vvp))
 2387 
 2388 static uint32_t
 2389 compute_bw_meter_flags(struct bw_upcall *req)
 2390 {
 2391         uint32_t flags = 0;
 2392 
 2393         if (req->bu_flags & BW_UPCALL_UNIT_PACKETS)
 2394                 flags |= BW_METER_UNIT_PACKETS;
 2395         if (req->bu_flags & BW_UPCALL_UNIT_BYTES)
 2396                 flags |= BW_METER_UNIT_BYTES;
 2397         if (req->bu_flags & BW_UPCALL_GEQ)
 2398                 flags |= BW_METER_GEQ;
 2399         if (req->bu_flags & BW_UPCALL_LEQ)
 2400                 flags |= BW_METER_LEQ;
 2401 
 2402         return (flags);
 2403 }
 2404 
 2405 /*
 2406  * Add a bw_meter entry
 2407  */
 2408 static int
 2409 add_bw_upcall(struct mbuf *m)
 2410 {
 2411         int s;
 2412         struct mfc *mfc;
 2413         struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC,
 2414             BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC };
 2415         struct timeval now;
 2416         struct bw_meter *x;
 2417         uint32_t flags;
 2418         struct bw_upcall *req;
 2419 
 2420         if (m == NULL || m->m_len < sizeof(struct bw_upcall))
 2421                 return (EINVAL);
 2422 
 2423         req = mtod(m, struct bw_upcall *);
 2424 
 2425         if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
 2426                 return (EOPNOTSUPP);
 2427 
 2428         /* Test if the flags are valid */
 2429         if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES)))
 2430                 return (EINVAL);
 2431         if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)))
 2432                 return (EINVAL);
 2433         if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
 2434             == (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
 2435                 return (EINVAL);
 2436 
 2437         /* Test if the threshold time interval is valid */
 2438         if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <))
 2439                 return (EINVAL);
 2440 
 2441         flags = compute_bw_meter_flags(req);
 2442 
 2443         /* Find if we have already same bw_meter entry */
 2444         s = splsoftnet();
 2445         mfc = mfc_find(&req->bu_src, &req->bu_dst);
 2446         if (mfc == NULL) {
 2447                 splx(s);
 2448                 return (EADDRNOTAVAIL);
 2449         }
 2450         for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) {
 2451                 if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
 2452                     &req->bu_threshold.b_time, ==)) &&
 2453                     (x->bm_threshold.b_packets ==
 2454                     req->bu_threshold.b_packets) &&
 2455                     (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
 2456                     (x->bm_flags & BW_METER_USER_FLAGS) == flags)  {
 2457                         splx(s);
 2458                         return (0);     /* XXX Already installed */
 2459                 }
 2460         }
 2461 
 2462         /* Allocate the new bw_meter entry */
 2463         x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT);
 2464         if (x == NULL) {
 2465                 splx(s);
 2466                 return (ENOBUFS);
 2467         }
 2468 
 2469         /* Set the new bw_meter entry */
 2470         x->bm_threshold.b_time = req->bu_threshold.b_time;
 2471         microtime(&now);
 2472         x->bm_start_time = now;
 2473         x->bm_threshold.b_packets = req->bu_threshold.b_packets;
 2474         x->bm_threshold.b_bytes = req->bu_threshold.b_bytes;
 2475         x->bm_measured.b_packets = 0;
 2476         x->bm_measured.b_bytes = 0;
 2477         x->bm_flags = flags;
 2478         x->bm_time_next = NULL;
 2479         x->bm_time_hash = BW_METER_BUCKETS;
 2480 
 2481         /* Add the new bw_meter entry to the front of entries for this MFC */
 2482         x->bm_mfc = mfc;
 2483         x->bm_mfc_next = mfc->mfc_bw_meter;
 2484         mfc->mfc_bw_meter = x;
 2485         schedule_bw_meter(x, &now);
 2486         splx(s);
 2487 
 2488         return (0);
 2489 }
 2490 
 2491 static void
 2492 free_bw_list(struct bw_meter *list)
 2493 {
 2494         while (list != NULL) {
 2495                 struct bw_meter *x = list;
 2496 
 2497                 list = list->bm_mfc_next;
 2498                 unschedule_bw_meter(x);
 2499                 free(x, M_BWMETER);
 2500         }
 2501 }
 2502 
 2503 /*
 2504  * Delete one or multiple bw_meter entries
 2505  */
 2506 static int
 2507 del_bw_upcall(struct mbuf *m)
 2508 {
 2509         int s;
 2510         struct mfc *mfc;
 2511         struct bw_meter *x;
 2512         struct bw_upcall *req;
 2513 
 2514         if (m == NULL || m->m_len < sizeof(struct bw_upcall))
 2515                 return (EINVAL);
 2516 
 2517         req = mtod(m, struct bw_upcall *);
 2518 
 2519         if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
 2520                 return (EOPNOTSUPP);
 2521 
 2522         s = splsoftnet();
 2523         /* Find the corresponding MFC entry */
 2524         mfc = mfc_find(&req->bu_src, &req->bu_dst);
 2525         if (mfc == NULL) {
 2526                 splx(s);
 2527                 return (EADDRNOTAVAIL);
 2528         } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) {
 2529                 /* Delete all bw_meter entries for this mfc */
 2530                 struct bw_meter *list;
 2531 
 2532                 list = mfc->mfc_bw_meter;
 2533                 mfc->mfc_bw_meter = NULL;
 2534                 free_bw_list(list);
 2535                 splx(s);
 2536                 return (0);
 2537         } else {        /* Delete a single bw_meter entry */
 2538                 struct bw_meter *prev;
 2539                 uint32_t flags = 0;
 2540 
 2541                 flags = compute_bw_meter_flags(req);
 2542 
 2543                 /* Find the bw_meter entry to delete */
 2544                 for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL;
 2545                     prev = x, x = x->bm_mfc_next) {
 2546                         if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
 2547                             &req->bu_threshold.b_time, ==)) &&
 2548                             (x->bm_threshold.b_packets ==
 2549                             req->bu_threshold.b_packets) &&
 2550                             (x->bm_threshold.b_bytes ==
 2551                             req->bu_threshold.b_bytes) &&
 2552                             (x->bm_flags & BW_METER_USER_FLAGS) == flags)
 2553                                 break;
 2554                 }
 2555                 if (x != NULL) { /* Delete entry from the list for this MFC */
 2556                         if (prev != NULL) {
 2557                                 /* remove from middle */
 2558                                 prev->bm_mfc_next = x->bm_mfc_next;
 2559                         } else {
 2560                                 /* new head of list */
 2561                                 x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;
 2562                         }
 2563 
 2564                         unschedule_bw_meter(x);
 2565                         splx(s);
 2566                         /* Free the bw_meter entry */
 2567                         free(x, M_BWMETER);
 2568                         return (0);
 2569                 } else {
 2570                         splx(s);
 2571                         return (EINVAL);
 2572                 }
 2573         }
 2574         /* NOTREACHED */
 2575 }
 2576 
 2577 /*
 2578  * Perform bandwidth measurement processing that may result in an upcall
 2579  */
 2580 static void
 2581 bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
 2582 {
 2583         struct timeval delta;
 2584 
 2585         delta = *nowp;
 2586         BW_TIMEVALDECR(&delta, &x->bm_start_time);
 2587 
 2588         if (x->bm_flags & BW_METER_GEQ) {
 2589                 /* Processing for ">=" type of bw_meter entry */
 2590                 if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
 2591                         /* Reset the bw_meter entry */
 2592                         x->bm_start_time = *nowp;
 2593                         x->bm_measured.b_packets = 0;
 2594                         x->bm_measured.b_bytes = 0;
 2595                         x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 2596                 }
 2597 
 2598                 /* Record that a packet is received */
 2599                 x->bm_measured.b_packets++;
 2600                 x->bm_measured.b_bytes += plen;
 2601 
 2602                 /* Test if we should deliver an upcall */
 2603                 if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) {
 2604                         if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 2605                             (x->bm_measured.b_packets >=
 2606                             x->bm_threshold.b_packets)) ||
 2607                             ((x->bm_flags & BW_METER_UNIT_BYTES) &&
 2608                             (x->bm_measured.b_bytes >=
 2609                             x->bm_threshold.b_bytes))) {
 2610                                 /* Prepare an upcall for delivery */
 2611                                 bw_meter_prepare_upcall(x, nowp);
 2612                                 x->bm_flags |= BW_METER_UPCALL_DELIVERED;
 2613                         }
 2614                 }
 2615         } else if (x->bm_flags & BW_METER_LEQ) {
 2616                 /* Processing for "<=" type of bw_meter entry */
 2617                 if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
 2618                         /*
 2619                          * We are behind time with the multicast forwarding
 2620                          * table scanning for "<=" type of bw_meter entries,
 2621                          * so test now if we should deliver an upcall.
 2622                          */
 2623                         if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 2624                             (x->bm_measured.b_packets <=
 2625                             x->bm_threshold.b_packets)) ||
 2626                             ((x->bm_flags & BW_METER_UNIT_BYTES) &&
 2627                             (x->bm_measured.b_bytes <=
 2628                             x->bm_threshold.b_bytes))) {
 2629                                 /* Prepare an upcall for delivery */
 2630                                 bw_meter_prepare_upcall(x, nowp);
 2631                         }
 2632                         /* Reschedule the bw_meter entry */
 2633                         unschedule_bw_meter(x);
 2634                         schedule_bw_meter(x, nowp);
 2635                 }
 2636 
 2637                 /* Record that a packet is received */
 2638                 x->bm_measured.b_packets++;
 2639                 x->bm_measured.b_bytes += plen;
 2640 
 2641                 /* Test if we should restart the measuring interval */
 2642                 if ((x->bm_flags & BW_METER_UNIT_PACKETS &&
 2643                     x->bm_measured.b_packets <= x->bm_threshold.b_packets) ||
 2644                     (x->bm_flags & BW_METER_UNIT_BYTES &&
 2645                     x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) {
 2646                         /* Don't restart the measuring interval */
 2647                 } else {
 2648                         /* Do restart the measuring interval */
 2649                         /*
 2650                          * XXX: note that we don't unschedule and schedule,
 2651                          * because this might be too much overhead per packet.
 2652                          * Instead, when we process all entries for a given
 2653                          * timer hash bin, we check whether it is really a
 2654                          * timeout. If not, we reschedule at that time.
 2655                          */
 2656                         x->bm_start_time = *nowp;
 2657                         x->bm_measured.b_packets = 0;
 2658                         x->bm_measured.b_bytes = 0;
 2659                         x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 2660                 }
 2661         }
 2662 }
 2663 
 2664 /*
 2665  * Prepare a bandwidth-related upcall
 2666  */
 2667 static void
 2668 bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp)
 2669 {
 2670         struct timeval delta;
 2671         struct bw_upcall *u;
 2672 
 2673         /* Compute the measured time interval */
 2674         delta = *nowp;
 2675         BW_TIMEVALDECR(&delta, &x->bm_start_time);
 2676 
 2677         /* If there are too many pending upcalls, deliver them now */
 2678         if (bw_upcalls_n >= BW_UPCALLS_MAX)
 2679                 bw_upcalls_send();
 2680 
 2681         /* Set the bw_upcall entry */
 2682         u = &bw_upcalls[bw_upcalls_n++];
 2683         u->bu_src = x->bm_mfc->mfc_origin;
 2684         u->bu_dst = x->bm_mfc->mfc_mcastgrp;
 2685         u->bu_threshold.b_time = x->bm_threshold.b_time;
 2686         u->bu_threshold.b_packets = x->bm_threshold.b_packets;
 2687         u->bu_threshold.b_bytes = x->bm_threshold.b_bytes;
 2688         u->bu_measured.b_time = delta;
 2689         u->bu_measured.b_packets = x->bm_measured.b_packets;
 2690         u->bu_measured.b_bytes = x->bm_measured.b_bytes;
 2691         u->bu_flags = 0;
 2692         if (x->bm_flags & BW_METER_UNIT_PACKETS)
 2693                 u->bu_flags |= BW_UPCALL_UNIT_PACKETS;
 2694         if (x->bm_flags & BW_METER_UNIT_BYTES)
 2695                 u->bu_flags |= BW_UPCALL_UNIT_BYTES;
 2696         if (x->bm_flags & BW_METER_GEQ)
 2697                 u->bu_flags |= BW_UPCALL_GEQ;
 2698         if (x->bm_flags & BW_METER_LEQ)
 2699                 u->bu_flags |= BW_UPCALL_LEQ;
 2700 }
 2701 
 2702 /*
 2703  * Send the pending bandwidth-related upcalls
 2704  */
 2705 static void
 2706 bw_upcalls_send(void)
 2707 {
 2708         struct mbuf *m;
 2709         int len = bw_upcalls_n * sizeof(bw_upcalls[0]);
 2710         struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
 2711         static struct igmpmsg igmpmsg = {
 2712             0,                  /* unused1 */
 2713             0,                  /* unused2 */
 2714             IGMPMSG_BW_UPCALL,  /* im_msgtype */
 2715             0,                  /* im_mbz  */
 2716             0,                  /* im_vif  */
 2717             0,                  /* unused3 */
 2718             { 0 },              /* im_src  */
 2719             { 0 } };            /* im_dst  */
 2720 
 2721         if (bw_upcalls_n == 0)
 2722                 return;         /* No pending upcalls */
 2723 
 2724         bw_upcalls_n = 0;
 2725 
 2726         /*
 2727          * Allocate a new mbuf, initialize it with the header and
 2728          * the payload for the pending calls.
 2729          */
 2730         MGETHDR(m, M_DONTWAIT, MT_HEADER);
 2731         if (m == NULL) {
 2732                 log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n");
 2733                 return;
 2734         }
 2735 
 2736         m->m_len = m->m_pkthdr.len = 0;
 2737         m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg);
 2738         m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&bw_upcalls[0]);
 2739 
 2740         /*
 2741          * Send the upcalls
 2742          * XXX do we need to set the address in k_igmpsrc ?
 2743          */
 2744         mrtstat.mrts_upcalls++;
 2745         if (socket_send(ip_mrouter, m, &k_igmpsrc) < 0) {
 2746                 log(LOG_WARNING,
 2747                     "bw_upcalls_send: ip_mrouter socket queue full\n");
 2748                 ++mrtstat.mrts_upq_sockfull;
 2749         }
 2750 }
 2751 
 2752 /*
 2753  * Compute the timeout hash value for the bw_meter entries
 2754  */
 2755 #define BW_METER_TIMEHASH(bw_meter, hash) do {                          \
 2756         struct timeval next_timeval = (bw_meter)->bm_start_time;        \
 2757                                                                         \
 2758         BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \
 2759         (hash) = next_timeval.tv_sec;                                   \
 2760         if (next_timeval.tv_usec)                                       \
 2761                 (hash)++; /* XXX: make sure we don't timeout early */   \
 2762         (hash) %= BW_METER_BUCKETS;                                     \
 2763 } while (/*CONSTCOND*/ 0)
 2764 
 2765 /*
 2766  * Schedule a timer to process periodically bw_meter entry of type "<="
 2767  * by linking the entry in the proper hash bucket.
 2768  */
 2769 static void
 2770 schedule_bw_meter(struct bw_meter *x, struct timeval *nowp)
 2771 {
 2772         int time_hash;
 2773 
 2774         if (!(x->bm_flags & BW_METER_LEQ))
 2775                 return; /* XXX: we schedule timers only for "<=" entries */
 2776 
 2777         /* Reset the bw_meter entry */
 2778         x->bm_start_time = *nowp;
 2779         x->bm_measured.b_packets = 0;
 2780         x->bm_measured.b_bytes = 0;
 2781         x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 2782 
 2783         /* Compute the timeout hash value and insert the entry */
 2784         BW_METER_TIMEHASH(x, time_hash);
 2785         x->bm_time_next = bw_meter_timers[time_hash];
 2786         bw_meter_timers[time_hash] = x;
 2787         x->bm_time_hash = time_hash;
 2788 }
 2789 
 2790 /*
 2791  * Unschedule the periodic timer that processes bw_meter entry of type "<="
 2792  * by removing the entry from the proper hash bucket.
 2793  */
 2794 static void
 2795 unschedule_bw_meter(struct bw_meter *x)
 2796 {
 2797         int time_hash;
 2798         struct bw_meter *prev, *tmp;
 2799 
 2800         if (!(x->bm_flags & BW_METER_LEQ))
 2801                 return; /* XXX: we schedule timers only for "<=" entries */
 2802 
 2803         /* Compute the timeout hash value and delete the entry */
 2804         time_hash = x->bm_time_hash;
 2805         if (time_hash >= BW_METER_BUCKETS)
 2806                 return;         /* Entry was not scheduled */
 2807 
 2808         for (prev = NULL, tmp = bw_meter_timers[time_hash];
 2809             tmp != NULL; prev = tmp, tmp = tmp->bm_time_next)
 2810                 if (tmp == x)
 2811                         break;
 2812 
 2813         if (tmp == NULL)
 2814                 panic("unschedule_bw_meter: bw_meter entry not found");
 2815 
 2816         if (prev != NULL)
 2817                 prev->bm_time_next = x->bm_time_next;
 2818         else
 2819                 bw_meter_timers[time_hash] = x->bm_time_next;
 2820 
 2821         x->bm_time_next = NULL;
 2822         x->bm_time_hash = BW_METER_BUCKETS;
 2823 }
 2824 
 2825 /*
 2826  * Process all "<=" type of bw_meter that should be processed now,
 2827  * and for each entry prepare an upcall if necessary. Each processed
 2828  * entry is rescheduled again for the (periodic) processing.
 2829  *
 2830  * This is run periodically (once per second normally). On each round,
 2831  * all the potentially matching entries are in the hash slot that we are
 2832  * looking at.
 2833  */
 2834 static void
 2835 bw_meter_process()
 2836 {
 2837         int s;
 2838         static uint32_t last_tv_sec;    /* last time we processed this */
 2839 
 2840         uint32_t loops;
 2841         int i;
 2842         struct timeval now, process_endtime;
 2843 
 2844         microtime(&now);
 2845         if (last_tv_sec == now.tv_sec)
 2846                 return;         /* nothing to do */
 2847 
 2848         loops = now.tv_sec - last_tv_sec;
 2849         last_tv_sec = now.tv_sec;
 2850         if (loops > BW_METER_BUCKETS)
 2851                 loops = BW_METER_BUCKETS;
 2852 
 2853         s = splsoftnet();
 2854         /*
 2855          * Process all bins of bw_meter entries from the one after the last
 2856          * processed to the current one. On entry, i points to the last bucket
 2857          * visited, so we need to increment i at the beginning of the loop.
 2858          */
 2859         for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) {
 2860                 struct bw_meter *x, *tmp_list;
 2861 
 2862                 if (++i >= BW_METER_BUCKETS)
 2863                         i = 0;
 2864 
 2865                 /* Disconnect the list of bw_meter entries from the bin */
 2866                 tmp_list = bw_meter_timers[i];
 2867                 bw_meter_timers[i] = NULL;
 2868 
 2869                 /* Process the list of bw_meter entries */
 2870                 while (tmp_list != NULL) {
 2871                         x = tmp_list;
 2872                         tmp_list = tmp_list->bm_time_next;
 2873 
 2874                         /* Test if the time interval is over */
 2875                         process_endtime = x->bm_start_time;
 2876                         BW_TIMEVALADD(&process_endtime,
 2877                             &x->bm_threshold.b_time);
 2878                         if (BW_TIMEVALCMP(&process_endtime, &now, >)) {
 2879                                 /* Not yet: reschedule, but don't reset */
 2880                                 int time_hash;
 2881 
 2882                                 BW_METER_TIMEHASH(x, time_hash);
 2883                                 if (time_hash == i &&
 2884                                     process_endtime.tv_sec == now.tv_sec) {
 2885                                         /*
 2886                                          * XXX: somehow the bin processing is
 2887                                          * a bit ahead of time. Put the entry
 2888                                          * in the next bin.
 2889                                          */
 2890                                         if (++time_hash >= BW_METER_BUCKETS)
 2891                                                 time_hash = 0;
 2892                                 }
 2893                                 x->bm_time_next = bw_meter_timers[time_hash];
 2894                                 bw_meter_timers[time_hash] = x;
 2895                                 x->bm_time_hash = time_hash;
 2896 
 2897                                 continue;
 2898                         }
 2899 
 2900                         /* Test if we should deliver an upcall */
 2901                         if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 2902                             (x->bm_measured.b_packets <=
 2903                             x->bm_threshold.b_packets)) ||
 2904                             ((x->bm_flags & BW_METER_UNIT_BYTES) &&
 2905                             (x->bm_measured.b_bytes <=
 2906                             x->bm_threshold.b_bytes))) {
 2907                                 /* Prepare an upcall for delivery */
 2908                                 bw_meter_prepare_upcall(x, &now);
 2909                         }
 2910 
 2911                         /* Reschedule for next processing */
 2912                         schedule_bw_meter(x, &now);
 2913                 }
 2914         }
 2915 
 2916         /* Send all upcalls that are pending delivery */
 2917         bw_upcalls_send();
 2918 
 2919         splx(s);
 2920 }
 2921 
 2922 /*
 2923  * A periodic function for sending all upcalls that are pending delivery
 2924  */
 2925 static void
 2926 expire_bw_upcalls_send(void *unused)
 2927 {
 2928         int s;
 2929 
 2930         s = splsoftnet();
 2931         bw_upcalls_send();
 2932         splx(s);
 2933 
 2934         timeout_add(&bw_upcalls_ch, BW_UPCALLS_PERIOD);
 2935 }
 2936 
 2937 /*
 2938  * A periodic function for periodic scanning of the multicast forwarding
 2939  * table for processing all "<=" bw_meter entries.
 2940  */
 2941 static void
 2942 expire_bw_meter_process(void *unused)
 2943 {
 2944         if (mrt_api_config & MRT_MFC_BW_UPCALL)
 2945                 bw_meter_process();
 2946 
 2947         timeout_add(&bw_meter_ch, BW_METER_PERIOD);
 2948 }
 2949 
 2950 /*
 2951  * End of bandwidth monitoring code
 2952  */
 2953 
 2954 #ifdef PIM
 2955 /*
 2956  * Send the packet up to the user daemon, or eventually do kernel encapsulation
 2957  */
 2958 static int
 2959 pim_register_send(struct ip *ip, struct vif *vifp,
 2960         struct mbuf *m, struct mfc *rt)
 2961 {
 2962         struct mbuf *mb_copy, *mm;
 2963 
 2964         if (mrtdebug & DEBUG_PIM)
 2965                 log(LOG_DEBUG, "pim_register_send: ");
 2966 
 2967         mb_copy = pim_register_prepare(ip, m);
 2968         if (mb_copy == NULL)
 2969                 return (ENOBUFS);
 2970 
 2971         /*
 2972          * Send all the fragments. Note that the mbuf for each fragment
 2973          * is freed by the sending machinery.
 2974          */
 2975         for (mm = mb_copy; mm; mm = mb_copy) {
 2976                 mb_copy = mm->m_nextpkt;
 2977                 mm->m_nextpkt = NULL;
 2978                 mm = m_pullup(mm, sizeof(struct ip));
 2979                 if (mm != NULL) {
 2980                         ip = mtod(mm, struct ip *);
 2981                         if ((mrt_api_config & MRT_MFC_RP) &&
 2982                             !in_nullhost(rt->mfc_rp)) {
 2983                                 pim_register_send_rp(ip, vifp, mm, rt);
 2984                         } else {
 2985                                 pim_register_send_upcall(ip, vifp, mm, rt);
 2986                         }
 2987                 }
 2988         }
 2989 
 2990         return (0);
 2991 }
 2992 
 2993 /*
 2994  * Return a copy of the data packet that is ready for PIM Register
 2995  * encapsulation.
 2996  * XXX: Note that in the returned copy the IP header is a valid one.
 2997  */
 2998 static struct mbuf *
 2999 pim_register_prepare(struct ip *ip, struct mbuf *m)
 3000 {
 3001         struct mbuf *mb_copy = NULL;
 3002         int mtu;
 3003 
 3004         /* Take care of delayed checksums */
 3005         if (m->m_pkthdr.csum_flags & (M_TCPV4_CSUM_OUT | M_UDPV4_CSUM_OUT)) {
 3006                 in_delayed_cksum(m);
 3007                 m->m_pkthdr.csum_flags &=
 3008                     ~(M_UDPV4_CSUM_OUT | M_TCPV4_CSUM_OUT);
 3009         }
 3010 
 3011         /*
 3012          * Copy the old packet & pullup its IP header into the
 3013          * new mbuf so we can modify it.
 3014          */
 3015         mb_copy = m_copy(m, 0, M_COPYALL);
 3016         if (mb_copy == NULL)
 3017                 return (NULL);
 3018         mb_copy = m_pullup(mb_copy, ip->ip_hl << 2);
 3019         if (mb_copy == NULL)
 3020                 return (NULL);
 3021 
 3022         /* take care of the TTL */
 3023         ip = mtod(mb_copy, struct ip *);
 3024         --ip->ip_ttl;
 3025 
 3026         /* Compute the MTU after the PIM Register encapsulation */
 3027         mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr);
 3028 
 3029         if (ntohs(ip->ip_len) <= mtu) {
 3030                 /* Turn the IP header into a valid one */
 3031                 ip->ip_sum = 0;
 3032                 ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
 3033         } else {
 3034                 /* Fragment the packet */
 3035                 if (ip_fragment(mb_copy, NULL, mtu) != 0) {
 3036                         /* XXX: mb_copy was freed by ip_fragment() */
 3037                         return (NULL);
 3038                 }
 3039         }
 3040         return (mb_copy);
 3041 }
 3042 
 3043 /*
 3044  * Send an upcall with the data packet to the user-level process.
 3045  */
 3046 static int
 3047 pim_register_send_upcall(struct ip *ip, struct vif *vifp,
 3048         struct mbuf *mb_copy, struct mfc *rt)
 3049 {
 3050         struct mbuf *mb_first;
 3051         int len = ntohs(ip->ip_len);
 3052         struct igmpmsg *im;
 3053         struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
 3054 
 3055         /* Add a new mbuf with an upcall header */
 3056         MGETHDR(mb_first, M_DONTWAIT, MT_HEADER);
 3057         if (mb_first == NULL) {
 3058                 m_freem(mb_copy);
 3059                 return (ENOBUFS);
 3060         }
 3061         mb_first->m_data += max_linkhdr;
 3062         mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg);
 3063         mb_first->m_len = sizeof(struct igmpmsg);
 3064         mb_first->m_next = mb_copy;
 3065 
 3066         /* Send message to routing daemon */
 3067         im = mtod(mb_first, struct igmpmsg *);
 3068         im->im_msgtype = IGMPMSG_WHOLEPKT;
 3069         im->im_mbz = 0;
 3070         im->im_vif = vifp - viftable;
 3071         im->im_src = ip->ip_src;
 3072         im->im_dst = ip->ip_dst;
 3073 
 3074         k_igmpsrc.sin_addr = ip->ip_src;
 3075 
 3076         mrtstat.mrts_upcalls++;
 3077 
 3078         if (socket_send(ip_mrouter, mb_first, &k_igmpsrc) < 0) {
 3079                 if (mrtdebug & DEBUG_PIM)
 3080                         log(LOG_WARNING, "mcast: pim_register_send_upcall: "
 3081                             "ip_mrouter socket queue full");
 3082                 ++mrtstat.mrts_upq_sockfull;
 3083                 return (ENOBUFS);
 3084         }
 3085 
 3086         /* Keep statistics */
 3087         pimstat.pims_snd_registers_msgs++;
 3088         pimstat.pims_snd_registers_bytes += len;
 3089 
 3090         return (0);
 3091 }
 3092 
 3093 /*
 3094  * Encapsulate the data packet in PIM Register message and send it to the RP.
 3095  */
 3096 static int
 3097 pim_register_send_rp(struct ip *ip, struct vif *vifp,
 3098         struct mbuf *mb_copy, struct mfc *rt)
 3099 {
 3100         struct mbuf *mb_first;
 3101         struct ip *ip_outer;
 3102         struct pim_encap_pimhdr *pimhdr;
 3103         int len = ntohs(ip->ip_len);
 3104         vifi_t vifi = rt->mfc_parent;
 3105 
 3106         if ((vifi >= numvifs) || in_nullhost(viftable[vifi].v_lcl_addr)) {
 3107                 m_freem(mb_copy);
 3108                 return (EADDRNOTAVAIL);         /* The iif vif is invalid */
 3109         }
 3110 
 3111         /* Add a new mbuf with the encapsulating header */
 3112         MGETHDR(mb_first, M_DONTWAIT, MT_HEADER);
 3113         if (mb_first == NULL) {
 3114                 m_freem(mb_copy);
 3115                 return (ENOBUFS);
 3116         }
 3117         mb_first->m_data += max_linkhdr;
 3118         mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
 3119         mb_first->m_next = mb_copy;
 3120 
 3121         mb_first->m_pkthdr.len = len + mb_first->m_len;
 3122 
 3123         /* Fill in the encapsulating IP and PIM header */
 3124         ip_outer = mtod(mb_first, struct ip *);
 3125         *ip_outer = pim_encap_iphdr;
 3126         ip_outer->ip_id = htons(ip_randomid());
 3127         ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) +
 3128             sizeof(pim_encap_pimhdr));
 3129         ip_outer->ip_src = viftable[vifi].v_lcl_addr;
 3130         ip_outer->ip_dst = rt->mfc_rp;
 3131         /*
 3132          * Copy the inner header TOS to the outer header, and take care of the
 3133          * IP_DF bit.
 3134          */
 3135         ip_outer->ip_tos = ip->ip_tos;
 3136         if (ntohs(ip->ip_off) & IP_DF)
 3137                 ip_outer->ip_off |= htons(IP_DF);
 3138         pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer
 3139             + sizeof(pim_encap_iphdr));
 3140         *pimhdr = pim_encap_pimhdr;
 3141         /* If the iif crosses a border, set the Border-bit */
 3142         if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & mrt_api_config)
 3143                 pimhdr->flags |= htonl(PIM_BORDER_REGISTER);
 3144 
 3145         mb_first->m_data += sizeof(pim_encap_iphdr);
 3146         pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr));
 3147         mb_first->m_data -= sizeof(pim_encap_iphdr);
 3148 
 3149         if (vifp->v_rate_limit == 0)
 3150                 tbf_send_packet(vifp, mb_first);
 3151         else
 3152                 tbf_control(vifp, mb_first, ip, ntohs(ip_outer->ip_len));
 3153 
 3154         /* Keep statistics */
 3155         pimstat.pims_snd_registers_msgs++;
 3156         pimstat.pims_snd_registers_bytes += len;
 3157 
 3158         return (0);
 3159 }
 3160 
 3161 /*
 3162  * PIM-SMv2 and PIM-DM messages processing.
 3163  * Receives and verifies the PIM control messages, and passes them
 3164  * up to the listening socket, using rip_input().
 3165  * The only message with special processing is the PIM_REGISTER message
 3166  * (used by PIM-SM): the PIM header is stripped off, and the inner packet
 3167  * is passed to if_simloop().
 3168  */
 3169 void
 3170 pim_input(struct mbuf *m, ...)
 3171 {
 3172         struct ip *ip = mtod(m, struct ip *);
 3173         struct pim *pim;
 3174         int minlen;
 3175         int datalen;
 3176         int ip_tos;
 3177         int iphlen;
 3178         va_list ap;
 3179 
 3180         va_start(ap, m);
 3181         iphlen = va_arg(ap, int);
 3182         va_end(ap);
 3183 
 3184         datalen = ntohs(ip->ip_len) - iphlen;
 3185 
 3186         /* Keep statistics */
 3187         pimstat.pims_rcv_total_msgs++;
 3188         pimstat.pims_rcv_total_bytes += datalen;
 3189 
 3190         /* Validate lengths */
 3191         if (datalen < PIM_MINLEN) {
 3192                 pimstat.pims_rcv_tooshort++;
 3193                 log(LOG_ERR, "pim_input: packet size too small %d from %lx\n",
 3194                     datalen, (u_long)ip->ip_src.s_addr);
 3195                 m_freem(m);
 3196                 return;
 3197         }
 3198 
 3199         /*
 3200          * If the packet is at least as big as a REGISTER, go agead
 3201          * and grab the PIM REGISTER header size, to avoid another
 3202          * possible m_pullup() later.
 3203          * 
 3204          * PIM_MINLEN       == pimhdr + u_int32_t == 4 + 4 = 8
 3205          * PIM_REG_MINLEN   == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28
 3206          */
 3207         minlen = iphlen + (datalen >= PIM_REG_MINLEN ?
 3208             PIM_REG_MINLEN : PIM_MINLEN);
 3209         /*
 3210          * Get the IP and PIM headers in contiguous memory, and
 3211          * possibly the PIM REGISTER header.
 3212          */
 3213         if ((m->m_flags & M_EXT || m->m_len < minlen) &&
 3214             (m = m_pullup(m, minlen)) == NULL) {
 3215                 log(LOG_ERR, "pim_input: m_pullup failure\n");
 3216                 return;
 3217         }
 3218         /* m_pullup() may have given us a new mbuf so reset ip. */
 3219         ip = mtod(m, struct ip *);
 3220         ip_tos = ip->ip_tos;
 3221 
 3222         /* adjust mbuf to point to the PIM header */
 3223         m->m_data += iphlen;
 3224         m->m_len  -= iphlen;
 3225         pim = mtod(m, struct pim *);
 3226 
 3227         /*
 3228          * Validate checksum. If PIM REGISTER, exclude the data packet.
 3229          *
 3230          * XXX: some older PIMv2 implementations don't make this distinction,
 3231          * so for compatibility reason perform the checksum over part of the
 3232          * message, and if error, then over the whole message.
 3233          */
 3234         if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER &&
 3235             in_cksum(m, PIM_MINLEN) == 0) {
 3236                 /* do nothing, checksum okay */
 3237         } else if (in_cksum(m, datalen)) {
 3238                 pimstat.pims_rcv_badsum++;
 3239                 if (mrtdebug & DEBUG_PIM)
 3240                         log(LOG_DEBUG, "pim_input: invalid checksum");
 3241                 m_freem(m);
 3242                 return;
 3243         }
 3244 
 3245         /* PIM version check */
 3246         if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) {
 3247                 pimstat.pims_rcv_badversion++;
 3248                 log(LOG_ERR, "pim_input: incorrect version %d, expecting %d\n",
 3249                     PIM_VT_V(pim->pim_vt), PIM_VERSION);
 3250                 m_freem(m);
 3251                 return;
 3252         }
 3253 
 3254         /* restore mbuf back to the outer IP */
 3255         m->m_data -= iphlen;
 3256         m->m_len  += iphlen;
 3257 
 3258         if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) {
 3259                 /*
 3260                  * Since this is a REGISTER, we'll make a copy of the register
 3261                  * headers ip + pim + u_int32 + encap_ip, to be passed up to the
 3262                  * routing daemon.
 3263                  */
 3264                 int s;
 3265                 struct sockaddr_in dst = { sizeof(dst), AF_INET };
 3266                 struct mbuf *mcp;
 3267                 struct ip *encap_ip;
 3268                 u_int32_t *reghdr;
 3269                 struct ifnet *vifp;
 3270 
 3271                 s = splsoftnet();
 3272                 if ((reg_vif_num >= numvifs) || (reg_vif_num == VIFI_INVALID)) {
 3273                         splx(s);
 3274                         if (mrtdebug & DEBUG_PIM)
 3275                                 log(LOG_DEBUG, "pim_input: register vif "
 3276                                     "not set: %d\n", reg_vif_num);
 3277                         m_freem(m);
 3278                         return;
 3279                 }
 3280                 /* XXX need refcnt? */
 3281                 vifp = viftable[reg_vif_num].v_ifp;
 3282                 splx(s);
 3283 
 3284                 /* Validate length */
 3285                 if (datalen < PIM_REG_MINLEN) {
 3286                         pimstat.pims_rcv_tooshort++;
 3287                         pimstat.pims_rcv_badregisters++;
 3288                         log(LOG_ERR, "pim_input: register packet size "
 3289                             "too small %d from %lx\n",
 3290                             datalen, (u_long)ip->ip_src.s_addr);
 3291                         m_freem(m);
 3292                         return;
 3293                 }
 3294 
 3295                 reghdr = (u_int32_t *)(pim + 1);
 3296                 encap_ip = (struct ip *)(reghdr + 1);
 3297 
 3298                 if (mrtdebug & DEBUG_PIM) {
 3299                         log(LOG_DEBUG, "pim_input[register], encap_ip: "
 3300                             "%lx -> %lx, encap_ip len %d\n",
 3301                             (u_long)ntohl(encap_ip->ip_src.s_addr),
 3302                             (u_long)ntohl(encap_ip->ip_dst.s_addr),
 3303                             ntohs(encap_ip->ip_len));
 3304                 }
 3305 
 3306                 /* verify the version number of the inner packet */
 3307                 if (encap_ip->ip_v != IPVERSION) {
 3308                         pimstat.pims_rcv_badregisters++;
 3309                         if (mrtdebug & DEBUG_PIM) {
 3310                                 log(LOG_DEBUG, "pim_input: invalid IP version"
 3311                                     " (%d) of the inner packet\n",
 3312                                     encap_ip->ip_v);
 3313                         }
 3314                         m_freem(m);
 3315                         return;
 3316                 }
 3317 
 3318                 /* verify the inner packet is destined to a mcast group */
 3319                 if (!IN_MULTICAST(encap_ip->ip_dst.s_addr)) {
 3320                         pimstat.pims_rcv_badregisters++;
 3321                         if (mrtdebug & DEBUG_PIM)
 3322                                 log(LOG_DEBUG,
 3323                                     "pim_input: inner packet of register is"
 3324                                     " not multicast %lx\n",
 3325                                     (u_long)ntohl(encap_ip->ip_dst.s_addr));
 3326                         m_freem(m);
 3327                         return;
 3328                 }
 3329 
 3330                 /* If a NULL_REGISTER, pass it to the daemon */
 3331                 if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
 3332                         goto pim_input_to_daemon;
 3333 
 3334                 /*
 3335                  * Copy the TOS from the outer IP header to the inner
 3336                  * IP header.
 3337                  */
 3338                 if (encap_ip->ip_tos != ip_tos) {
 3339                         /* Outer TOS -> inner TOS */
 3340                         encap_ip->ip_tos = ip_tos;
 3341                         /* Recompute the inner header checksum. Sigh... */
 3342 
 3343                         /* adjust mbuf to point to the inner IP header */
 3344                         m->m_data += (iphlen + PIM_MINLEN);
 3345                         m->m_len  -= (iphlen + PIM_MINLEN);
 3346 
 3347                         encap_ip->ip_sum = 0;
 3348                         encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2);
 3349 
 3350                         /* restore mbuf to point back to the outer IP header */
 3351                         m->m_data -= (iphlen + PIM_MINLEN);
 3352                         m->m_len  += (iphlen + PIM_MINLEN);
 3353                 }
 3354 
 3355                 /*
 3356                  * Decapsulate the inner IP packet and loopback to forward it
 3357                  * as a normal multicast packet. Also, make a copy of the 
 3358                  *     outer_iphdr + pimhdr + reghdr + encap_iphdr
 3359                  * to pass to the daemon later, so it can take the appropriate
 3360                  * actions (e.g., send back PIM_REGISTER_STOP).
 3361                  * XXX: here m->m_data points to the outer IP header.
 3362                  */
 3363                 mcp = m_copy(m, 0, iphlen + PIM_REG_MINLEN);
 3364                 if (mcp == NULL) {
 3365                         log(LOG_ERR, "pim_input: pim register: could not "
 3366                             "copy register head\n");
 3367                         m_freem(m);
 3368                         return;
 3369                 }
 3370 
 3371                 /* Keep statistics */
 3372                 /* XXX: registers_bytes include only the encap. mcast pkt */
 3373                 pimstat.pims_rcv_registers_msgs++;
 3374                 pimstat.pims_rcv_registers_bytes += ntohs(encap_ip->ip_len);
 3375 
 3376                 /* forward the inner ip packet; point m_data at the inner ip. */
 3377                 m_adj(m, iphlen + PIM_MINLEN);
 3378 
 3379                 if (mrtdebug & DEBUG_PIM) {
 3380                         log(LOG_DEBUG,
 3381                             "pim_input: forwarding decapsulated register: "
 3382                             "src %lx, dst %lx, vif %d\n",
 3383                             (u_long)ntohl(encap_ip->ip_src.s_addr),
 3384                             (u_long)ntohl(encap_ip->ip_dst.s_addr),
 3385                             reg_vif_num);
 3386                 }
 3387                 /* NB: vifp was collected above; can it change on us? */
 3388                 looutput(vifp, m, (struct sockaddr *)&dst,
 3389                     (struct rtentry *)NULL);
 3390 
 3391                 /* prepare the register head to send to the mrouting daemon */
 3392                 m = mcp;
 3393         }
 3394 
 3395 pim_input_to_daemon:
 3396         /*
 3397          * Pass the PIM message up to the daemon; if it is a Register message,
 3398          * pass the 'head' only up to the daemon. This includes the
 3399          * outer IP header, PIM header, PIM-Register header and the
 3400          * inner IP header.
 3401          * XXX: the outer IP header pkt size of a Register is not adjusted to
 3402          * reflect the fact that the inner multicast data is truncated.
 3403          */
 3404         rip_input(m);
 3405 
 3406         return;
 3407 }
 3408 #endif /* PIM */

/* [<][>][^][v][top][bottom][index][help] */