File: [ELWIX - Embedded LightWeight unIX] / embedaddon / bird / sysdep / unix / io.c
Revision 1.1.1.2 (vendor branch)
Wed Mar 17 19:50:23 2021 UTC by misho
Branches: bird, MAIN
CVS tags: v1_6_8p3, HEAD
bird 1.6.8

    1: /*
    2:  *	BIRD Internet Routing Daemon -- Unix I/O
    3:  *
    4:  *	(c) 1998--2004 Martin Mares <mj@ucw.cz>
    5:  *      (c) 2004       Ondrej Filip <feela@network.cz>
    6:  *
    7:  *	Can be freely distributed and used under the terms of the GNU GPL.
    8:  */
    9: 
   10: /* Unfortunately, some glibc versions hide parts of RFC 3542 API
   11:    if _GNU_SOURCE is not defined. */
   12: #ifndef _GNU_SOURCE
   13: #define _GNU_SOURCE
   14: #endif
   15: 
   16: #include <stdio.h>
   17: #include <stdlib.h>
   18: #include <time.h>
   19: #include <sys/time.h>
   20: #include <sys/types.h>
   21: #include <sys/socket.h>
   22: #include <sys/uio.h>
   23: #include <sys/un.h>
   24: #include <poll.h>
   25: #include <unistd.h>
   26: #include <fcntl.h>
   27: #include <errno.h>
   28: #include <net/if.h>
   29: #include <netinet/in.h>
   30: #include <netinet/tcp.h>
   31: #include <netinet/udp.h>
   32: #include <netinet/icmp6.h>
   33: 
   34: #include "nest/bird.h"
   35: #include "lib/lists.h"
   36: #include "lib/resource.h"
   37: #include "lib/timer.h"
   38: #include "lib/socket.h"
   39: #include "lib/event.h"
   40: #include "lib/string.h"
   41: #include "nest/iface.h"
   42: 
   43: #include "lib/unix.h"
   44: #include "lib/sysio.h"
   45: 
    46: /* Maximum number of calls of the tx handler for one socket in one
    47:  * poll iteration. Should be small enough that a single protocol
    48:  * instance cannot monopolize the CPU.
    49:  */
   50: #define MAX_STEPS 4
   51: 
    52: /* Maximum number of calls of the rx handler for all sockets in one poll
    53:    iteration. RX callbacks are often much more costly, so we limit
    54:    this to keep latencies small */
   55: #define MAX_RX_STEPS 4
   56: 
   57: 
   58: /*
   59:  *	Tracked Files
   60:  */
   61: 
   62: struct rfile {
   63:   resource r;
   64:   FILE *f;
   65: };
   66: 
   67: static void
   68: rf_free(resource *r)
   69: {
   70:   struct rfile *a = (struct rfile *) r;
   71: 
   72:   fclose(a->f);
   73: }
   74: 
   75: static void
   76: rf_dump(resource *r)
   77: {
   78:   struct rfile *a = (struct rfile *) r;
   79: 
   80:   debug("(FILE *%p)\n", a->f);
   81: }
   82: 
   83: static struct resclass rf_class = {
   84:   "FILE",
   85:   sizeof(struct rfile),
   86:   rf_free,
   87:   rf_dump,
   88:   NULL,
   89:   NULL
   90: };
   91: 
   92: struct rfile *
   93: rf_open(pool *p, char *name, char *mode)
   94: {
   95:   FILE *f = fopen(name, mode);
   96: 
   97:   if (!f)
   98:     return NULL;
   99: 
  100:   struct rfile *r = ralloc(p, &rf_class);
  101:   r->f = f;
  102:   return r;
  103: }
  104: 
  105: void *
  106: rf_file(struct rfile *f)
  107: {
  108:   return f->f;
  109: }
  110: 
  111: int
  112: rf_fileno(struct rfile *f)
  113: {
  114:   return fileno(f->f);
  115: }
  116: 
  117: 
  118: /**
  119:  * DOC: Timers
  120:  *
  121:  * Timers are resources which represent a wish of a module to call
  122:  * a function at the specified time. The platform dependent code
  123:  * doesn't guarantee exact timing, only that a timer function
  124:  * won't be called before the requested time.
  125:  *
   126:  * In BIRD, time is represented by values of the &bird_clock_t type,
   127:  * which are integers interpreted as a relative number of seconds since
   128:  * some fixed point in the past. The current time can be read
   129:  * from the variable @now with reasonable accuracy and is monotonic. There is
   130:  * also a current 'absolute' time in the variable @now_real, reported by the OS.
   131:  *
   132:  * Each timer is described by a &timer structure containing a pointer
   133:  * to the handler function (@hook), data private to this function (@data)
   134:  * and the time the function should be called at (@expires, 0 for inactive
   135:  * timers); for the other fields, see |timer.h|.
  136:  */
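/*
 * A minimal usage sketch (editorial addition, not part of the original
 * source): creating and arming a timer from protocol code. The hook, pool
 * and data names below are hypothetical.
 *
 *   static void hello_hook(timer *t)
 *   {
 *     struct my_proto *p = t->data;   // private data set below
 *     ...                             // periodic work goes here
 *   }
 *
 *   timer *t = tm_new(pool);          // 'pool' is an assumed resource pool
 *   t->hook = hello_hook;
 *   t->data = p;
 *   tm_start(t, 10);                  // hook fires no sooner than 10 s from now
 */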
  137: 
  138: #define NEAR_TIMER_LIMIT 4
  139: 
  140: static list near_timers, far_timers;
  141: static bird_clock_t first_far_timer = TIME_INFINITY;
  142: 
  143: /* now must be different from 0, because 0 is a special value in timer->expires */
  144: bird_clock_t now = 1, now_real, boot_time;
  145: 
  146: static void
  147: update_times_plain(void)
  148: {
  149:   bird_clock_t new_time = time(NULL);
  150:   int delta = new_time - now_real;
  151: 
  152:   if ((delta >= 0) && (delta < 60))
  153:     now += delta;
  154:   else if (now_real != 0)
  155:    log(L_WARN "Time jump, delta %d s", delta);
  156: 
  157:   now_real = new_time;
  158: }
  159: 
  160: static void
  161: update_times_gettime(void)
  162: {
  163:   struct timespec ts;
  164:   int rv;
  165: 
  166:   rv = clock_gettime(CLOCK_MONOTONIC, &ts);
  167:   if (rv != 0)
  168:     die("clock_gettime: %m");
  169: 
  170:   if (ts.tv_sec != now) {
  171:     if (ts.tv_sec < now)
  172:       log(L_ERR "Monotonic timer is broken");
  173: 
  174:     now = ts.tv_sec;
  175:     now_real = time(NULL);
  176:   }
  177: }
  178: 
  179: static int clock_monotonic_available;
  180: 
  181: static inline void
  182: update_times(void)
  183: {
  184:   if (clock_monotonic_available)
  185:     update_times_gettime();
  186:   else
  187:     update_times_plain();
  188: }
  189: 
  190: static inline void
  191: init_times(void)
  192: {
   193:   struct timespec ts;
   194:   clock_monotonic_available = (clock_gettime(CLOCK_MONOTONIC, &ts) == 0);
   195:   if (!clock_monotonic_available)
   196:     log(L_WARN "Monotonic timer is missing");
  197: }
  198: 
  199: 
  200: static void
  201: tm_free(resource *r)
  202: {
  203:   timer *t = (timer *) r;
  204: 
  205:   tm_stop(t);
  206: }
  207: 
  208: static void
  209: tm_dump(resource *r)
  210: {
  211:   timer *t = (timer *) r;
  212: 
  213:   debug("(code %p, data %p, ", t->hook, t->data);
  214:   if (t->randomize)
  215:     debug("rand %d, ", t->randomize);
  216:   if (t->recurrent)
  217:     debug("recur %d, ", t->recurrent);
  218:   if (t->expires)
  219:     debug("expires in %d sec)\n", t->expires - now);
  220:   else
  221:     debug("inactive)\n");
  222: }
  223: 
  224: static struct resclass tm_class = {
  225:   "Timer",
  226:   sizeof(timer),
  227:   tm_free,
  228:   tm_dump,
  229:   NULL,
  230:   NULL
  231: };
  232: 
  233: /**
  234:  * tm_new - create a timer
  235:  * @p: pool
  236:  *
  237:  * This function creates a new timer resource and returns
  238:  * a pointer to it. To use the timer, you need to fill in
  239:  * the structure fields and call tm_start() to start timing.
  240:  */
  241: timer *
  242: tm_new(pool *p)
  243: {
  244:   timer *t = ralloc(p, &tm_class);
  245:   return t;
  246: }
  247: 
  248: static inline void
  249: tm_insert_near(timer *t)
  250: {
  251:   node *n = HEAD(near_timers);
  252: 
  253:   while (n->next && (SKIP_BACK(timer, n, n)->expires < t->expires))
  254:     n = n->next;
  255:   insert_node(&t->n, n->prev);
  256: }
  257: 
  258: /**
  259:  * tm_start - start a timer
  260:  * @t: timer
  261:  * @after: number of seconds the timer should be run after
  262:  *
  263:  * This function schedules the hook function of the timer to
   264:  * be called after @after seconds. If the timer has already been
   265:  * started, its @expires time is replaced by the new value.
   266:  *
   267:  * If you have set the @randomize field of @t, the timeout
   268:  * will be increased by a random number of seconds chosen
   269:  * uniformly from the range 0 .. @randomize.
  270:  *
  271:  * You can call tm_start() from the handler function of the timer
  272:  * to request another run of the timer. Also, you can set the @recurrent
  273:  * field to have the timer re-added automatically with the same timeout.
  274:  */
  275: void
  276: tm_start(timer *t, unsigned after)
  277: {
  278:   bird_clock_t when;
  279: 
  280:   if (t->randomize)
  281:     after += random() % (t->randomize + 1);
  282:   when = now + after;
  283:   if (t->expires == when)
  284:     return;
  285:   if (t->expires)
  286:     rem_node(&t->n);
  287:   t->expires = when;
  288:   if (after <= NEAR_TIMER_LIMIT)
  289:     tm_insert_near(t);
  290:   else
  291:     {
  292:       if (!first_far_timer || first_far_timer > when)
  293: 	first_far_timer = when;
  294:       add_tail(&far_timers, &t->n);
  295:     }
  296: }
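/*
 * Illustrative sketch (editorial addition): a recurrent, jittered timer as
 * described above. The values are arbitrary examples.
 *
 *   t->recurrent = 30;      // re-added automatically with a ~30 s period
 *   t->randomize = 5;       // each start gets 0..5 s of extra random delay
 *   tm_start(t, 30);        // first shot in 30..35 s
 */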
  297: 
  298: /**
  299:  * tm_stop - stop a timer
  300:  * @t: timer
  301:  *
  302:  * This function stops a timer. If the timer is already stopped,
  303:  * nothing happens.
  304:  */
  305: void
  306: tm_stop(timer *t)
  307: {
  308:   if (t->expires)
  309:     {
  310:       rem_node(&t->n);
  311:       t->expires = 0;
  312:     }
  313: }
  314: 
  315: static void
  316: tm_dump_them(char *name, list *l)
  317: {
  318:   node *n;
  319:   timer *t;
  320: 
  321:   debug("%s timers:\n", name);
  322:   WALK_LIST(n, *l)
  323:     {
  324:       t = SKIP_BACK(timer, n, n);
  325:       debug("%p ", t);
  326:       tm_dump(&t->r);
  327:     }
  328:   debug("\n");
  329: }
  330: 
  331: void
  332: tm_dump_all(void)
  333: {
  334:   tm_dump_them("Near", &near_timers);
  335:   tm_dump_them("Far", &far_timers);
  336: }
  337: 
  338: static inline time_t
  339: tm_first_shot(void)
  340: {
  341:   time_t x = first_far_timer;
  342: 
  343:   if (!EMPTY_LIST(near_timers))
  344:     {
  345:       timer *t = SKIP_BACK(timer, n, HEAD(near_timers));
  346:       if (t->expires < x)
  347: 	x = t->expires;
  348:     }
  349:   return x;
  350: }
  351: 
  352: void io_log_event(void *hook, void *data);
  353: 
  354: static void
  355: tm_shot(void)
  356: {
  357:   timer *t;
  358:   node *n, *m;
  359: 
  360:   if (first_far_timer <= now)
  361:     {
  362:       bird_clock_t limit = now + NEAR_TIMER_LIMIT;
  363:       first_far_timer = TIME_INFINITY;
  364:       n = HEAD(far_timers);
  365:       while (m = n->next)
  366: 	{
  367: 	  t = SKIP_BACK(timer, n, n);
  368: 	  if (t->expires <= limit)
  369: 	    {
  370: 	      rem_node(n);
  371: 	      tm_insert_near(t);
  372: 	    }
  373: 	  else if (t->expires < first_far_timer)
  374: 	    first_far_timer = t->expires;
  375: 	  n = m;
  376: 	}
  377:     }
  378:   while ((n = HEAD(near_timers)) -> next)
  379:     {
  380:       int delay;
  381:       t = SKIP_BACK(timer, n, n);
  382:       if (t->expires > now)
  383: 	break;
  384:       rem_node(n);
  385:       delay = t->expires - now;
  386:       t->expires = 0;
  387:       if (t->recurrent)
  388: 	{
  389: 	  int i = t->recurrent - delay;
  390: 	  if (i < 0)
  391: 	    i = 0;
  392: 	  tm_start(t, i);
  393: 	}
  394:       io_log_event(t->hook, t->data);
  395:       t->hook(t);
  396:     }
  397: }
  398: 
  399: /**
  400:  * tm_parse_datetime - parse a date and time
  401:  * @x: datetime string
  402:  *
  403:  * tm_parse_datetime() takes a textual representation of
  404:  * a date and time (dd-mm-yyyy hh:mm:ss)
  405:  * and converts it to the corresponding value of type &bird_clock_t.
  406:  */
  407: bird_clock_t
  408: tm_parse_datetime(char *x)
  409: {
  410:   struct tm tm;
  411:   int n;
  412:   time_t t;
  413: 
  414:   if (sscanf(x, "%d-%d-%d %d:%d:%d%n", &tm.tm_mday, &tm.tm_mon, &tm.tm_year, &tm.tm_hour, &tm.tm_min, &tm.tm_sec, &n) != 6 || x[n])
  415:     return tm_parse_date(x);
  416:   tm.tm_mon--;
  417:   tm.tm_year -= 1900;
  418:   t = mktime(&tm);
  419:   if (t == (time_t) -1)
  420:     return 0;
  421:   return t;
  422: }
  423: /**
  424:  * tm_parse_date - parse a date
  425:  * @x: date string
  426:  *
  427:  * tm_parse_date() takes a textual representation of a date (dd-mm-yyyy)
  428:  * and converts it to the corresponding value of type &bird_clock_t.
  429:  */
  430: bird_clock_t
  431: tm_parse_date(char *x)
  432: {
  433:   struct tm tm;
  434:   int n;
  435:   time_t t;
  436: 
  437:   if (sscanf(x, "%d-%d-%d%n", &tm.tm_mday, &tm.tm_mon, &tm.tm_year, &n) != 3 || x[n])
  438:     return 0;
  439:   tm.tm_mon--;
  440:   tm.tm_year -= 1900;
  441:   tm.tm_hour = tm.tm_min = tm.tm_sec = 0;
  442:   t = mktime(&tm);
  443:   if (t == (time_t) -1)
  444:     return 0;
  445:   return t;
  446: }
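/*
 * Example inputs accepted by the two parsers above (editorial note; the
 * variable names are illustrative):
 *
 *   bird_clock_t t1 = tm_parse_datetime("17-03-2021 19:50:23");
 *   bird_clock_t t2 = tm_parse_date("17-03-2021");   // time defaults to 00:00:00
 *
 * Both return 0 when the string cannot be parsed (tm_parse_datetime() first
 * falls back to tm_parse_date()).
 */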
  447: 
  448: static void
  449: tm_format_reltime(char *x, struct tm *tm, bird_clock_t delta)
  450: {
  451:   static char *month_names[12] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun",
  452: 				   "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" };
  453: 
  454:   if (delta < 20*3600)
  455:     bsprintf(x, "%02d:%02d", tm->tm_hour, tm->tm_min);
  456:   else if (delta < 360*86400)
  457:     bsprintf(x, "%s%02d", month_names[tm->tm_mon], tm->tm_mday);
  458:   else
  459:     bsprintf(x, "%d", tm->tm_year+1900);
  460: }
  461: 
  462: #include "conf/conf.h"
  463: 
  464: /**
  465:  * tm_format_datetime - convert date and time to textual representation
  466:  * @x: destination buffer of size %TM_DATETIME_BUFFER_SIZE
  467:  * @fmt_spec: specification of resulting textual representation of the time
  468:  * @t: time
  469:  *
   470:  * This function converts the given relative time value @t to real
   471:  * (wall-clock) time and formats it according to @fmt_spec.
  472:  */
  473: void
  474: tm_format_datetime(char *x, struct timeformat *fmt_spec, bird_clock_t t)
  475: {
  476:   const char *fmt_used;
  477:   struct tm *tm;
  478:   bird_clock_t delta = now - t;
  479:   t = now_real - delta;
  480:   tm = localtime(&t);
  481: 
  482:   if (fmt_spec->fmt1 == NULL)
  483:     return tm_format_reltime(x, tm, delta);
  484: 
  485:   if ((fmt_spec->limit == 0) || (delta < fmt_spec->limit))
  486:     fmt_used = fmt_spec->fmt1;
  487:   else
  488:     fmt_used = fmt_spec->fmt2;
  489: 
  490:   int rv = strftime(x, TM_DATETIME_BUFFER_SIZE, fmt_used, tm);
  491:   if (((rv == 0) && fmt_used[0]) || (rv == TM_DATETIME_BUFFER_SIZE))
  492:     strcpy(x, "<too-long>");
  493: }
  494: 
  495: int
  496: tm_format_real_time(char *x, size_t max, const char *fmt, bird_clock_t t)
  497: {
  498:   struct tm tm;
  499: 
  500:   if (!localtime_r(&t, &tm))
  501:     return 0;
  502: 
  503:   if (!strftime(x, max, fmt, &tm))
  504:     return 0;
  505: 
  506:   return 1;
  507: }
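/*
 * Formatting sketch (editorial addition): how a caller might use the two
 * formatters above. The format strings and buffer sizes are illustrative
 * only; only the fmt1/fmt2/limit fields of &timeformat are assumed here.
 *
 *   char buf[TM_DATETIME_BUFFER_SIZE];
 *   struct timeformat tf = { .fmt1 = "%d-%m-%Y %H:%M:%S", .fmt2 = NULL, .limit = 0 };
 *   tm_format_datetime(buf, &tf, now);               // limit == 0 -> always fmt1
 *
 *   char rt[32];
 *   tm_format_real_time(rt, sizeof(rt), "%F %T", now_real);
 */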
  508: 
  509: 
  510: /**
  511:  * DOC: Sockets
  512:  *
  513:  * Socket resources represent network connections. Their data structure (&socket)
  514:  * contains a lot of fields defining the exact type of the socket, the local and
  515:  * remote addresses and ports, pointers to socket buffers and finally pointers to
   516:  * hook functions to be called when new data have arrived in the receive buffer
  517:  * (@rx_hook), when the contents of the transmit buffer have been transmitted
  518:  * (@tx_hook) and when an error or connection close occurs (@err_hook).
  519:  *
  520:  * Freeing of sockets from inside socket hooks is perfectly safe.
  521:  */
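/*
 * Socket life-cycle sketch (editorial addition; the hook names are
 * hypothetical, everything else follows the API used in this file):
 *
 *   sock *s = sk_new(pool);        // see sock_new() below
 *   s->type = SK_UDP;
 *   s->sport = 1234;               // arbitrary example port
 *   s->rbsize = s->tbsize = 1024;
 *   s->rx_hook = my_rx;            // called when data arrive in rbuf
 *   s->tx_hook = my_tx;            // called when the tx buffer drains
 *   s->err_hook = my_err;          // called on error or connection close
 *   if (sk_open(s) < 0)
 *     log(L_ERR "Cannot open socket: %s%#m", s->err);
 */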
  522: 
  523: #ifndef SOL_IP
  524: #define SOL_IP IPPROTO_IP
  525: #endif
  526: 
  527: #ifndef SOL_IPV6
  528: #define SOL_IPV6 IPPROTO_IPV6
  529: #endif
  530: 
  531: #ifndef SOL_ICMPV6
  532: #define SOL_ICMPV6 IPPROTO_ICMPV6
  533: #endif
  534: 
  535: 
  536: /*
  537:  *	Sockaddr helper functions
  538:  */
  539: 
  540: static inline int UNUSED sockaddr_length(int af)
  541: { return (af == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); }
  542: 
  543: static inline void
  544: sockaddr_fill4(struct sockaddr_in *sa, ip_addr a, uint port)
  545: {
  546:   memset(sa, 0, sizeof(struct sockaddr_in));
  547: #ifdef HAVE_STRUCT_SOCKADDR_SA_LEN
  548:   sa->sin_len = sizeof(struct sockaddr_in);
  549: #endif
  550:   sa->sin_family = AF_INET;
  551:   sa->sin_port = htons(port);
  552:   sa->sin_addr = ipa_to_in4(a);
  553: }
  554: 
  555: static inline void
  556: sockaddr_fill6(struct sockaddr_in6 *sa, ip_addr a, struct iface *ifa, uint port)
  557: {
  558:   memset(sa, 0, sizeof(struct sockaddr_in6));
  559: #ifdef SIN6_LEN
  560:   sa->sin6_len = sizeof(struct sockaddr_in6);
  561: #endif
  562:   sa->sin6_family = AF_INET6;
  563:   sa->sin6_port = htons(port);
  564:   sa->sin6_flowinfo = 0;
  565:   sa->sin6_addr = ipa_to_in6(a);
  566: 
  567:   if (ifa && ipa_is_link_local(a))
  568:     sa->sin6_scope_id = ifa->index;
  569: }
  570: 
  571: void
  572: sockaddr_fill(sockaddr *sa, int af, ip_addr a, struct iface *ifa, uint port)
  573: {
  574:   if (af == AF_INET)
  575:     sockaddr_fill4((struct sockaddr_in *) sa, a, port);
  576:   else if (af == AF_INET6)
  577:     sockaddr_fill6((struct sockaddr_in6 *) sa, a, ifa, port);
  578:   else
  579:     bug("Unknown AF");
  580: }
  581: 
  582: static inline void
  583: sockaddr_read4(struct sockaddr_in *sa, ip_addr *a, uint *port)
  584: {
  585:   *port = ntohs(sa->sin_port);
  586:   *a = ipa_from_in4(sa->sin_addr);
  587: }
  588: 
  589: static inline void
  590: sockaddr_read6(struct sockaddr_in6 *sa, ip_addr *a, struct iface **ifa, uint *port)
  591: {
  592:   *port = ntohs(sa->sin6_port);
  593:   *a = ipa_from_in6(sa->sin6_addr);
  594: 
  595:   if (ifa && ipa_is_link_local(*a))
  596:     *ifa = if_find_by_index(sa->sin6_scope_id);
  597: }
  598: 
  599: int
  600: sockaddr_read(sockaddr *sa, int af, ip_addr *a, struct iface **ifa, uint *port)
  601: {
  602:   if (sa->sa.sa_family != af)
  603:     goto fail;
  604: 
  605:   if (af == AF_INET)
  606:     sockaddr_read4((struct sockaddr_in *) sa, a, port);
  607:   else if (af == AF_INET6)
  608:     sockaddr_read6((struct sockaddr_in6 *) sa, a, ifa, port);
  609:   else
  610:     goto fail;
  611: 
  612:   return 0;
  613: 
  614:  fail:
  615:   *a = IPA_NONE;
  616:   *port = 0;
  617:   return -1;
  618: }
  619: 
  620: 
  621: /*
  622:  *	IPv6 multicast syscalls
  623:  */
  624: 
  625: /* Fortunately standardized in RFC 3493 */
  626: 
  627: #define INIT_MREQ6(maddr,ifa) \
  628:   { .ipv6mr_multiaddr = ipa_to_in6(maddr), .ipv6mr_interface = ifa->index }
  629: 
  630: static inline int
  631: sk_setup_multicast6(sock *s)
  632: {
  633:   int index = s->iface->index;
  634:   int ttl = s->ttl;
  635:   int n = 0;
  636: 
  637:   if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_IF, &index, sizeof(index)) < 0)
  638:     ERR("IPV6_MULTICAST_IF");
  639: 
  640:   if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_HOPS, &ttl, sizeof(ttl)) < 0)
  641:     ERR("IPV6_MULTICAST_HOPS");
  642: 
  643:   if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_LOOP, &n, sizeof(n)) < 0)
  644:     ERR("IPV6_MULTICAST_LOOP");
  645: 
  646:   return 0;
  647: }
  648: 
  649: static inline int
  650: sk_join_group6(sock *s, ip_addr maddr)
  651: {
  652:   struct ipv6_mreq mr = INIT_MREQ6(maddr, s->iface);
  653: 
  654:   if (setsockopt(s->fd, SOL_IPV6, IPV6_JOIN_GROUP, &mr, sizeof(mr)) < 0)
  655:     ERR("IPV6_JOIN_GROUP");
  656: 
  657:   return 0;
  658: }
  659: 
  660: static inline int
  661: sk_leave_group6(sock *s, ip_addr maddr)
  662: {
  663:   struct ipv6_mreq mr = INIT_MREQ6(maddr, s->iface);
  664: 
  665:   if (setsockopt(s->fd, SOL_IPV6, IPV6_LEAVE_GROUP, &mr, sizeof(mr)) < 0)
  666:     ERR("IPV6_LEAVE_GROUP");
  667: 
  668:   return 0;
  669: }
  670: 
  671: 
  672: /*
  673:  *	IPv6 packet control messages
  674:  */
  675: 
  676: /* Also standardized, in RFC 3542 */
  677: 
  678: /*
  679:  * RFC 2292 uses IPV6_PKTINFO for both the socket option and the cmsg
  680:  * type, RFC 3542 changed the socket option to IPV6_RECVPKTINFO. If we
  681:  * don't have IPV6_RECVPKTINFO we suppose the OS implements the older
  682:  * RFC and we use IPV6_PKTINFO.
  683:  */
  684: #ifndef IPV6_RECVPKTINFO
  685: #define IPV6_RECVPKTINFO IPV6_PKTINFO
  686: #endif
  687: /*
  688:  * Same goes for IPV6_HOPLIMIT -> IPV6_RECVHOPLIMIT.
  689:  */
  690: #ifndef IPV6_RECVHOPLIMIT
  691: #define IPV6_RECVHOPLIMIT IPV6_HOPLIMIT
  692: #endif
  693: 
  694: 
  695: #define CMSG6_SPACE_PKTINFO CMSG_SPACE(sizeof(struct in6_pktinfo))
  696: #define CMSG6_SPACE_TTL CMSG_SPACE(sizeof(int))
  697: 
  698: static inline int
  699: sk_request_cmsg6_pktinfo(sock *s)
  700: {
  701:   int y = 1;
  702: 
  703:   if (setsockopt(s->fd, SOL_IPV6, IPV6_RECVPKTINFO, &y, sizeof(y)) < 0)
  704:     ERR("IPV6_RECVPKTINFO");
  705: 
  706:   return 0;
  707: }
  708: 
  709: static inline int
  710: sk_request_cmsg6_ttl(sock *s)
  711: {
  712:   int y = 1;
  713: 
  714:   if (setsockopt(s->fd, SOL_IPV6, IPV6_RECVHOPLIMIT, &y, sizeof(y)) < 0)
  715:     ERR("IPV6_RECVHOPLIMIT");
  716: 
  717:   return 0;
  718: }
  719: 
  720: static inline void
  721: sk_process_cmsg6_pktinfo(sock *s, struct cmsghdr *cm)
  722: {
  723:   if (cm->cmsg_type == IPV6_PKTINFO)
  724:   {
  725:     struct in6_pktinfo *pi = (struct in6_pktinfo *) CMSG_DATA(cm);
  726:     s->laddr = ipa_from_in6(pi->ipi6_addr);
  727:     s->lifindex = pi->ipi6_ifindex;
  728:   }
  729: }
  730: 
  731: static inline void
  732: sk_process_cmsg6_ttl(sock *s, struct cmsghdr *cm)
  733: {
  734:   if (cm->cmsg_type == IPV6_HOPLIMIT)
  735:     s->rcv_ttl = * (int *) CMSG_DATA(cm);
  736: }
  737: 
  738: static inline void
  739: sk_prepare_cmsgs6(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
  740: {
  741:   struct cmsghdr *cm;
  742:   struct in6_pktinfo *pi;
  743:   int controllen = 0;
  744: 
  745:   msg->msg_control = cbuf;
  746:   msg->msg_controllen = cbuflen;
  747: 
  748:   cm = CMSG_FIRSTHDR(msg);
  749:   cm->cmsg_level = SOL_IPV6;
  750:   cm->cmsg_type = IPV6_PKTINFO;
  751:   cm->cmsg_len = CMSG_LEN(sizeof(*pi));
  752:   controllen += CMSG_SPACE(sizeof(*pi));
  753: 
  754:   pi = (struct in6_pktinfo *) CMSG_DATA(cm);
  755:   pi->ipi6_ifindex = s->iface ? s->iface->index : 0;
  756:   pi->ipi6_addr = ipa_to_in6(s->saddr);
  757: 
  758:   msg->msg_controllen = controllen;
  759: }
  760: 
  761: 
  762: /*
  763:  *	Miscellaneous socket syscalls
  764:  */
  765: 
  766: static inline int
  767: sk_set_ttl4(sock *s, int ttl)
  768: {
  769:   if (setsockopt(s->fd, SOL_IP, IP_TTL, &ttl, sizeof(ttl)) < 0)
  770:     ERR("IP_TTL");
  771: 
  772:   return 0;
  773: }
  774: 
  775: static inline int
  776: sk_set_ttl6(sock *s, int ttl)
  777: {
  778:   if (setsockopt(s->fd, SOL_IPV6, IPV6_UNICAST_HOPS, &ttl, sizeof(ttl)) < 0)
  779:     ERR("IPV6_UNICAST_HOPS");
  780: 
  781:   return 0;
  782: }
  783: 
  784: static inline int
  785: sk_set_tos4(sock *s, int tos)
  786: {
  787:   if (setsockopt(s->fd, SOL_IP, IP_TOS, &tos, sizeof(tos)) < 0)
  788:     ERR("IP_TOS");
  789: 
  790:   return 0;
  791: }
  792: 
  793: static inline int
  794: sk_set_tos6(sock *s, int tos)
  795: {
  796:   if (setsockopt(s->fd, SOL_IPV6, IPV6_TCLASS, &tos, sizeof(tos)) < 0)
  797:     ERR("IPV6_TCLASS");
  798: 
  799:   return 0;
  800: }
  801: 
  802: static inline int
  803: sk_set_high_port(sock *s UNUSED)
  804: {
  805:   /* Port range setting is optional, ignore it if not supported */
  806: 
  807: #ifdef IP_PORTRANGE
  808:   if (sk_is_ipv4(s))
  809:   {
  810:     int range = IP_PORTRANGE_HIGH;
  811:     if (setsockopt(s->fd, SOL_IP, IP_PORTRANGE, &range, sizeof(range)) < 0)
  812:       ERR("IP_PORTRANGE");
  813:   }
  814: #endif
  815: 
  816: #ifdef IPV6_PORTRANGE
  817:   if (sk_is_ipv6(s))
  818:   {
  819:     int range = IPV6_PORTRANGE_HIGH;
  820:     if (setsockopt(s->fd, SOL_IPV6, IPV6_PORTRANGE, &range, sizeof(range)) < 0)
  821:       ERR("IPV6_PORTRANGE");
  822:   }
  823: #endif
  824: 
  825:   return 0;
  826: }
  827: 
  828: static inline byte *
  829: sk_skip_ip_header(byte *pkt, int *len)
  830: {
  831:   if ((*len < 20) || ((*pkt & 0xf0) != 0x40))
  832:     return NULL;
  833: 
  834:   int hlen = (*pkt & 0x0f) * 4;
  835:   if ((hlen < 20) || (hlen > *len))
  836:     return NULL;
  837: 
  838:   *len -= hlen;
  839:   return pkt + hlen;
  840: }
  841: 
  842: byte *
  843: sk_rx_buffer(sock *s, int *len)
  844: {
  845:   if (sk_is_ipv4(s) && (s->type == SK_IP))
  846:     return sk_skip_ip_header(s->rbuf, len);
  847:   else
  848:     return s->rbuf;
  849: }
  850: 
  851: 
  852: /*
  853:  *	Public socket functions
  854:  */
  855: 
  856: /**
  857:  * sk_setup_multicast - enable multicast for given socket
  858:  * @s: socket
  859:  *
   860:  * Prepare transmission of multicast packets for the given datagram socket.
   861:  * The socket must have @iface defined.
  862:  *
  863:  * Result: 0 for success, -1 for an error.
  864:  */
  865: 
  866: int
  867: sk_setup_multicast(sock *s)
  868: {
  869:   ASSERT(s->iface);
  870: 
  871:   if (sk_is_ipv4(s))
  872:     return sk_setup_multicast4(s);
  873:   else
  874:     return sk_setup_multicast6(s);
  875: }
  876: 
  877: /**
  878:  * sk_join_group - join multicast group for given socket
  879:  * @s: socket
  880:  * @maddr: multicast address
  881:  *
   882:  * Join a multicast group for the given datagram socket and its associated
   883:  * interface. The socket must have @iface defined.
  884:  *
  885:  * Result: 0 for success, -1 for an error.
  886:  */
  887: 
  888: int
  889: sk_join_group(sock *s, ip_addr maddr)
  890: {
  891:   if (sk_is_ipv4(s))
  892:     return sk_join_group4(s, maddr);
  893:   else
  894:     return sk_join_group6(s, maddr);
  895: }
  896: 
  897: /**
  898:  * sk_leave_group - leave multicast group for given socket
  899:  * @s: socket
  900:  * @maddr: multicast address
  901:  *
   902:  * Leave a multicast group for the given datagram socket and its associated
   903:  * interface. The socket must have @iface defined.
  904:  *
  905:  * Result: 0 for success, -1 for an error.
  906:  */
  907: 
  908: int
  909: sk_leave_group(sock *s, ip_addr maddr)
  910: {
  911:   if (sk_is_ipv4(s))
  912:     return sk_leave_group4(s, maddr);
  913:   else
  914:     return sk_leave_group6(s, maddr);
  915: }
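/*
 * Multicast sketch (editorial addition): typical order of calls for a
 * datagram socket bound to an interface. 'all_routers' stands for some
 * multicast group address and is hypothetical.
 *
 *   s->iface = ifa;                          // @iface must be set
 *   if (sk_setup_multicast(s) < 0 || sk_join_group(s, all_routers) < 0)
 *     log(L_ERR "Multicast setup failed: %s%#m", s->err);
 *   ...
 *   sk_leave_group(s, all_routers);          // when the group is no longer needed
 */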
  916: 
  917: /**
  918:  * sk_setup_broadcast - enable broadcast for given socket
  919:  * @s: socket
  920:  *
   921:  * Allow reception and transmission of broadcast packets for the given datagram
   922:  * socket. The socket must have @iface defined. For transmission, packets should
   923:  * be sent to the @brd address of @iface.
  924:  *
  925:  * Result: 0 for success, -1 for an error.
  926:  */
  927: 
  928: int
  929: sk_setup_broadcast(sock *s)
  930: {
  931:   int y = 1;
  932: 
  933:   if (setsockopt(s->fd, SOL_SOCKET, SO_BROADCAST, &y, sizeof(y)) < 0)
  934:     ERR("SO_BROADCAST");
  935: 
  936:   return 0;
  937: }
  938: 
  939: /**
  940:  * sk_set_ttl - set transmit TTL for given socket
  941:  * @s: socket
  942:  * @ttl: TTL value
  943:  *
   944:  * Set the TTL for an already opened connection when it was not set before.
   945:  * Useful for accepted connections when different ones should have different TTLs.
  946:  *
  947:  * Result: 0 for success, -1 for an error.
  948:  */
  949: 
  950: int
  951: sk_set_ttl(sock *s, int ttl)
  952: {
  953:   s->ttl = ttl;
  954: 
  955:   if (sk_is_ipv4(s))
  956:     return sk_set_ttl4(s, ttl);
  957:   else
  958:     return sk_set_ttl6(s, ttl);
  959: }
  960: 
  961: /**
  962:  * sk_set_min_ttl - set minimal accepted TTL for given socket
  963:  * @s: socket
  964:  * @ttl: TTL value
  965:  *
   966:  * Set the minimal accepted TTL for the given socket. Can be used for TTL
   967:  * security implementations.
  968:  *
  969:  * Result: 0 for success, -1 for an error.
  970:  */
  971: 
  972: int
  973: sk_set_min_ttl(sock *s, int ttl)
  974: {
  975:   if (sk_is_ipv4(s))
  976:     return sk_set_min_ttl4(s, ttl);
  977:   else
  978:     return sk_set_min_ttl6(s, ttl);
  979: }
  980: 
  981: #if 0
  982: /**
  983:  * sk_set_md5_auth - add / remove MD5 security association for given socket
  984:  * @s: socket
  985:  * @local: IP address of local side
  986:  * @remote: IP address of remote side
  987:  * @ifa: Interface for link-local IP address
  988:  * @passwd: Password used for MD5 authentication
  989:  * @setkey: Update also system SA/SP database
  990:  *
  991:  * In TCP MD5 handling code in kernel, there is a set of security associations
  992:  * used for choosing password and other authentication parameters according to
   993:  * the local and remote address. This function is useful for listening sockets;
   994:  * for active sockets it may be enough to set the s->password field.
   995:  *
   996:  * When called with passwd != NULL, the new pair is added.
   997:  * When called with passwd == NULL, the existing pair is removed.
   998:  *
   999:  * Note that while in Linux the MD5 SAs are specific to a socket, in BSD they are
  1000:  * stored in a global SA/SP database (but the behavior must also be enabled on a
  1001:  * per-socket basis). In case of multiple sockets to the same neighbor, the
  1002:  * socket-specific state must be configured for each socket, while the global
  1003:  * state is configured just once per src-dst pair. The @setkey argument controls
  1004:  * whether the global state (SA/SP database) is also updated.
 1005:  *
 1006:  * Result: 0 for success, -1 for an error.
 1007:  */
 1008: 
 1009: int
 1010: sk_set_md5_auth(sock *s, ip_addr local, ip_addr remote, struct iface *ifa, char *passwd, int setkey)
 1011: { DUMMY; }
 1012: #endif
 1013: 
 1014: /**
 1015:  * sk_set_ipv6_checksum - specify IPv6 checksum offset for given socket
 1016:  * @s: socket
 1017:  * @offset: offset
 1018:  *
  1019:  * Specify the IPv6 checksum field offset for the given raw IPv6 socket. After
  1020:  * that, the kernel will automatically fill it in for outgoing packets and check
  1021:  * it for incoming packets. Should not be used on ICMPv6 sockets, where the
  1022:  * position is known to the kernel.
 1023:  *
 1024:  * Result: 0 for success, -1 for an error.
 1025:  */
 1026: 
 1027: int
 1028: sk_set_ipv6_checksum(sock *s, int offset)
 1029: {
 1030:   if (setsockopt(s->fd, SOL_IPV6, IPV6_CHECKSUM, &offset, sizeof(offset)) < 0)
 1031:     ERR("IPV6_CHECKSUM");
 1032: 
 1033:   return 0;
 1034: }
 1035: 
 1036: int
 1037: sk_set_icmp6_filter(sock *s, int p1, int p2)
 1038: {
  1039:   /* A bit of a lame interface, but it is here only for Radv */
 1040:   struct icmp6_filter f;
 1041: 
 1042:   ICMP6_FILTER_SETBLOCKALL(&f);
 1043:   ICMP6_FILTER_SETPASS(p1, &f);
 1044:   ICMP6_FILTER_SETPASS(p2, &f);
 1045: 
 1046:   if (setsockopt(s->fd, SOL_ICMPV6, ICMP6_FILTER, &f, sizeof(f)) < 0)
 1047:     ERR("ICMP6_FILTER");
 1048: 
 1049:   return 0;
 1050: }
 1051: 
 1052: void
 1053: sk_log_error(sock *s, const char *p)
 1054: {
 1055:   log(L_ERR "%s: Socket error: %s%#m", p, s->err);
 1056: }
 1057: 
 1058: 
 1059: /*
 1060:  *	Actual struct birdsock code
 1061:  */
 1062: 
 1063: static list sock_list;
 1064: static struct birdsock *current_sock;
 1065: static struct birdsock *stored_sock;
 1066: 
 1067: static inline sock *
 1068: sk_next(sock *s)
 1069: {
 1070:   if (!s->n.next->next)
 1071:     return NULL;
 1072:   else
 1073:     return SKIP_BACK(sock, n, s->n.next);
 1074: }
 1075: 
 1076: static void
 1077: sk_alloc_bufs(sock *s)
 1078: {
 1079:   if (!s->rbuf && s->rbsize)
 1080:     s->rbuf = s->rbuf_alloc = xmalloc(s->rbsize);
 1081:   s->rpos = s->rbuf;
 1082:   if (!s->tbuf && s->tbsize)
 1083:     s->tbuf = s->tbuf_alloc = xmalloc(s->tbsize);
 1084:   s->tpos = s->ttx = s->tbuf;
 1085: }
 1086: 
 1087: static void
 1088: sk_free_bufs(sock *s)
 1089: {
 1090:   if (s->rbuf_alloc)
 1091:   {
 1092:     xfree(s->rbuf_alloc);
 1093:     s->rbuf = s->rbuf_alloc = NULL;
 1094:   }
 1095:   if (s->tbuf_alloc)
 1096:   {
 1097:     xfree(s->tbuf_alloc);
 1098:     s->tbuf = s->tbuf_alloc = NULL;
 1099:   }
 1100: }
 1101: 
 1102: static void
 1103: sk_free(resource *r)
 1104: {
 1105:   sock *s = (sock *) r;
 1106: 
 1107:   sk_free_bufs(s);
 1108:   if (s->fd >= 0)
 1109:   {
 1110:     close(s->fd);
 1111: 
 1112:     /* FIXME: we should call sk_stop() for SKF_THREAD sockets */
 1113:     if (s->flags & SKF_THREAD)
 1114:       return;
 1115: 
 1116:     if (s == current_sock)
 1117:       current_sock = sk_next(s);
 1118:     if (s == stored_sock)
 1119:       stored_sock = sk_next(s);
 1120:     rem_node(&s->n);
 1121:   }
 1122: }
 1123: 
 1124: void
 1125: sk_set_rbsize(sock *s, uint val)
 1126: {
 1127:   ASSERT(s->rbuf_alloc == s->rbuf);
 1128: 
 1129:   if (s->rbsize == val)
 1130:     return;
 1131: 
 1132:   s->rbsize = val;
 1133:   xfree(s->rbuf_alloc);
 1134:   s->rbuf_alloc = xmalloc(val);
 1135:   s->rpos = s->rbuf = s->rbuf_alloc;
 1136: }
 1137: 
 1138: void
 1139: sk_set_tbsize(sock *s, uint val)
 1140: {
 1141:   ASSERT(s->tbuf_alloc == s->tbuf);
 1142: 
 1143:   if (s->tbsize == val)
 1144:     return;
 1145: 
 1146:   byte *old_tbuf = s->tbuf;
 1147: 
 1148:   s->tbsize = val;
 1149:   s->tbuf = s->tbuf_alloc = xrealloc(s->tbuf_alloc, val);
 1150:   s->tpos = s->tbuf + (s->tpos - old_tbuf);
 1151:   s->ttx  = s->tbuf + (s->ttx  - old_tbuf);
 1152: }
 1153: 
 1154: void
 1155: sk_set_tbuf(sock *s, void *tbuf)
 1156: {
 1157:   s->tbuf = tbuf ?: s->tbuf_alloc;
 1158:   s->ttx = s->tpos = s->tbuf;
 1159: }
 1160: 
 1161: void
 1162: sk_reallocate(sock *s)
 1163: {
 1164:   sk_free_bufs(s);
 1165:   sk_alloc_bufs(s);
 1166: }
 1167: 
 1168: static void
 1169: sk_dump(resource *r)
 1170: {
 1171:   sock *s = (sock *) r;
 1172:   static char *sk_type_names[] = { "TCP<", "TCP>", "TCP", "UDP", NULL, "IP", NULL, "MAGIC", "UNIX<", "UNIX", "DEL!" };
 1173: 
 1174:   debug("(%s, ud=%p, sa=%I, sp=%d, da=%I, dp=%d, tos=%d, ttl=%d, if=%s)\n",
 1175: 	sk_type_names[s->type],
 1176: 	s->data,
 1177: 	s->saddr,
 1178: 	s->sport,
 1179: 	s->daddr,
 1180: 	s->dport,
 1181: 	s->tos,
 1182: 	s->ttl,
 1183: 	s->iface ? s->iface->name : "none");
 1184: }
 1185: 
 1186: static struct resclass sk_class = {
 1187:   "Socket",
 1188:   sizeof(sock),
 1189:   sk_free,
 1190:   sk_dump,
 1191:   NULL,
 1192:   NULL
 1193: };
 1194: 
 1195: /**
 1196:  * sk_new - create a socket
 1197:  * @p: pool
 1198:  *
 1199:  * This function creates a new socket resource. If you want to use it,
 1200:  * you need to fill in all the required fields of the structure and
 1201:  * call sk_open() to do the actual opening of the socket.
 1202:  *
 1203:  * The real function name is sock_new(), sk_new() is a macro wrapper
 1204:  * to avoid collision with OpenSSL.
 1205:  */
 1206: sock *
 1207: sock_new(pool *p)
 1208: {
 1209:   sock *s = ralloc(p, &sk_class);
 1210:   s->pool = p;
 1211:   // s->saddr = s->daddr = IPA_NONE;
 1212:   s->tos = s->priority = s->ttl = -1;
 1213:   s->fd = -1;
 1214:   return s;
 1215: }
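/*
 * Sketch of typical use (editorial addition): the fields below are the ones
 * this file itself reads in sk_setup() / sk_open(); the concrete values and
 * hook names are arbitrary examples.
 *
 *   sock *s = sk_new(p);
 *   s->type = SK_TCP_ACTIVE;
 *   s->daddr = remote_addr;        // assumed ip_addr variable
 *   s->dport = 179;                // e.g. BGP
 *   s->ttl = 1;
 *   s->rbsize = s->tbsize = 4096;
 *   s->tx_hook = connected_hook;   // called once the connection is established
 *   s->err_hook = error_hook;
 *   if (sk_open(s) < 0)
 *     ...                          // s->err describes the failure
 */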
 1216: 
 1217: static int
 1218: sk_setup(sock *s)
 1219: {
 1220:   int y = 1;
 1221:   int fd = s->fd;
 1222: 
 1223:   if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
 1224:     ERR("O_NONBLOCK");
 1225: 
 1226:   if (!s->af)
 1227:     return 0;
 1228: 
 1229:   if (ipa_nonzero(s->saddr) && !(s->flags & SKF_BIND))
 1230:     s->flags |= SKF_PKTINFO;
 1231: 
 1232: #ifdef CONFIG_USE_HDRINCL
 1233:   if (sk_is_ipv4(s) && (s->type == SK_IP) && (s->flags & SKF_PKTINFO))
 1234:   {
 1235:     s->flags &= ~SKF_PKTINFO;
 1236:     s->flags |= SKF_HDRINCL;
 1237:     if (setsockopt(fd, SOL_IP, IP_HDRINCL, &y, sizeof(y)) < 0)
 1238:       ERR("IP_HDRINCL");
 1239:   }
 1240: #endif
 1241: 
 1242:   if (s->vrf && !s->iface)
 1243:   {
 1244:     /* Bind socket to associated VRF interface.
 1245:        This is Linux-specific, but so is SO_BINDTODEVICE. */
 1246: #ifdef SO_BINDTODEVICE
 1247:     struct ifreq ifr = {};
 1248:     strcpy(ifr.ifr_name, s->vrf->name);
 1249:     if (setsockopt(s->fd, SOL_SOCKET, SO_BINDTODEVICE, &ifr, sizeof(ifr)) < 0)
 1250:       ERR("SO_BINDTODEVICE");
 1251: #endif
 1252:   }
 1253: 
 1254:   if (s->iface)
 1255:   {
 1256: #ifdef SO_BINDTODEVICE
 1257:     struct ifreq ifr = {};
 1258:     strcpy(ifr.ifr_name, s->iface->name);
 1259:     if (setsockopt(s->fd, SOL_SOCKET, SO_BINDTODEVICE, &ifr, sizeof(ifr)) < 0)
 1260:       ERR("SO_BINDTODEVICE");
 1261: #endif
 1262: 
 1263: #ifdef CONFIG_UNIX_DONTROUTE
 1264:     if (setsockopt(s->fd, SOL_SOCKET, SO_DONTROUTE, &y, sizeof(y)) < 0)
 1265:       ERR("SO_DONTROUTE");
 1266: #endif
 1267:   }
 1268: 
 1269:   if (sk_is_ipv4(s))
 1270:   {
 1271:     if (s->flags & SKF_LADDR_RX)
 1272:       if (sk_request_cmsg4_pktinfo(s) < 0)
 1273: 	return -1;
 1274: 
 1275:     if (s->flags & SKF_TTL_RX)
 1276:       if (sk_request_cmsg4_ttl(s) < 0)
 1277: 	return -1;
 1278: 
 1279:     if ((s->type == SK_UDP) || (s->type == SK_IP))
 1280:       if (sk_disable_mtu_disc4(s) < 0)
 1281: 	return -1;
 1282: 
 1283:     if (s->ttl >= 0)
 1284:       if (sk_set_ttl4(s, s->ttl) < 0)
 1285: 	return -1;
 1286: 
 1287:     if (s->tos >= 0)
 1288:       if (sk_set_tos4(s, s->tos) < 0)
 1289: 	return -1;
 1290:   }
 1291: 
 1292:   if (sk_is_ipv6(s))
 1293:   {
 1294:     if (s->flags & SKF_V6ONLY)
 1295:       if (setsockopt(fd, SOL_IPV6, IPV6_V6ONLY, &y, sizeof(y)) < 0)
 1296: 	ERR("IPV6_V6ONLY");
 1297: 
 1298:     if (s->flags & SKF_LADDR_RX)
 1299:       if (sk_request_cmsg6_pktinfo(s) < 0)
 1300: 	return -1;
 1301: 
 1302:     if (s->flags & SKF_TTL_RX)
 1303:       if (sk_request_cmsg6_ttl(s) < 0)
 1304: 	return -1;
 1305: 
 1306:     if ((s->type == SK_UDP) || (s->type == SK_IP))
 1307:       if (sk_disable_mtu_disc6(s) < 0)
 1308: 	return -1;
 1309: 
 1310:     if (s->ttl >= 0)
 1311:       if (sk_set_ttl6(s, s->ttl) < 0)
 1312: 	return -1;
 1313: 
 1314:     if (s->tos >= 0)
 1315:       if (sk_set_tos6(s, s->tos) < 0)
 1316: 	return -1;
 1317:   }
 1318: 
 1319:   /* Must be after sk_set_tos4() as setting ToS on Linux also mangles priority */
 1320:   if (s->priority >= 0)
 1321:     if (sk_set_priority(s, s->priority) < 0)
 1322:       return -1;
 1323: 
 1324:   return 0;
 1325: }
 1326: 
 1327: static void
 1328: sk_insert(sock *s)
 1329: {
 1330:   add_tail(&sock_list, &s->n);
 1331: }
 1332: 
 1333: static void
 1334: sk_tcp_connected(sock *s)
 1335: {
 1336:   sockaddr sa;
 1337:   int sa_len = sizeof(sa);
 1338: 
 1339:   if ((getsockname(s->fd, &sa.sa, &sa_len) < 0) ||
 1340:       (sockaddr_read(&sa, s->af, &s->saddr, &s->iface, &s->sport) < 0))
 1341:     log(L_WARN "SOCK: Cannot get local IP address for TCP>");
 1342: 
 1343:   s->type = SK_TCP;
 1344:   sk_alloc_bufs(s);
 1345:   s->tx_hook(s);
 1346: }
 1347: 
 1348: static int
 1349: sk_passive_connected(sock *s, int type)
 1350: {
 1351:   sockaddr loc_sa, rem_sa;
 1352:   int loc_sa_len = sizeof(loc_sa);
 1353:   int rem_sa_len = sizeof(rem_sa);
 1354: 
 1355:   int fd = accept(s->fd, ((type == SK_TCP) ? &rem_sa.sa : NULL), &rem_sa_len);
 1356:   if (fd < 0)
 1357:   {
 1358:     if ((errno != EINTR) && (errno != EAGAIN))
 1359:       s->err_hook(s, errno);
 1360:     return 0;
 1361:   }
 1362: 
 1363:   sock *t = sk_new(s->pool);
 1364:   t->type = type;
 1365:   t->fd = fd;
 1366:   t->af = s->af;
 1367:   t->ttl = s->ttl;
 1368:   t->tos = s->tos;
 1369:   t->rbsize = s->rbsize;
 1370:   t->tbsize = s->tbsize;
 1371: 
 1372:   if (type == SK_TCP)
 1373:   {
 1374:     if ((getsockname(fd, &loc_sa.sa, &loc_sa_len) < 0) ||
 1375: 	(sockaddr_read(&loc_sa, s->af, &t->saddr, &t->iface, &t->sport) < 0))
 1376:       log(L_WARN "SOCK: Cannot get local IP address for TCP<");
 1377: 
 1378:     if (sockaddr_read(&rem_sa, s->af, &t->daddr, &t->iface, &t->dport) < 0)
 1379:       log(L_WARN "SOCK: Cannot get remote IP address for TCP<");
 1380:   }
 1381: 
 1382:   if (sk_setup(t) < 0)
 1383:   {
 1384:     /* FIXME: Call err_hook instead ? */
 1385:     log(L_ERR "SOCK: Incoming connection: %s%#m", t->err);
 1386: 
 1387:     /* FIXME: handle it better in rfree() */
 1388:     close(t->fd);
 1389:     t->fd = -1;
 1390:     rfree(t);
 1391:     return 1;
 1392:   }
 1393: 
 1394:   sk_insert(t);
 1395:   sk_alloc_bufs(t);
 1396:   s->rx_hook(t, 0);
 1397:   return 1;
 1398: }
 1399: 
 1400: /**
 1401:  * sk_open - open a socket
 1402:  * @s: socket
 1403:  *
 1404:  * This function takes a socket resource created by sk_new() and
 1405:  * initialized by the user and binds a corresponding network connection
 1406:  * to it.
 1407:  *
 1408:  * Result: 0 for success, -1 for an error.
 1409:  */
 1410: int
 1411: sk_open(sock *s)
 1412: {
 1413:   int af = BIRD_AF;
 1414:   int fd = -1;
 1415:   int do_bind = 0;
 1416:   int bind_port = 0;
 1417:   ip_addr bind_addr = IPA_NONE;
 1418:   sockaddr sa;
 1419: 
 1420:   switch (s->type)
 1421:   {
 1422:   case SK_TCP_ACTIVE:
 1423:     s->ttx = "";			/* Force s->ttx != s->tpos */
 1424:     /* Fall thru */
 1425:   case SK_TCP_PASSIVE:
 1426:     fd = socket(af, SOCK_STREAM, IPPROTO_TCP);
 1427:     bind_port = s->sport;
 1428:     bind_addr = s->saddr;
 1429:     do_bind = bind_port || ipa_nonzero(bind_addr);
 1430:     break;
 1431: 
 1432:   case SK_UDP:
 1433:     fd = socket(af, SOCK_DGRAM, IPPROTO_UDP);
 1434:     bind_port = s->sport;
 1435:     bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
 1436:     do_bind = 1;
 1437:     break;
 1438: 
 1439:   case SK_IP:
 1440:     fd = socket(af, SOCK_RAW, s->dport);
 1441:     bind_port = 0;
 1442:     bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
 1443:     do_bind = ipa_nonzero(bind_addr);
 1444:     break;
 1445: 
 1446:   case SK_MAGIC:
 1447:     af = 0;
 1448:     fd = s->fd;
 1449:     break;
 1450: 
 1451:   default:
 1452:     bug("sk_open() called for invalid sock type %d", s->type);
 1453:   }
 1454: 
 1455:   if (fd < 0)
 1456:     ERR("socket");
 1457: 
 1458:   s->af = af;
 1459:   s->fd = fd;
 1460: 
 1461:   if (sk_setup(s) < 0)
 1462:     goto err;
 1463: 
 1464:   if (do_bind)
 1465:   {
 1466:     if (bind_port)
 1467:     {
 1468:       int y = 1;
 1469: 
 1470:       if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y)) < 0)
 1471: 	ERR2("SO_REUSEADDR");
 1472: 
 1473: #ifdef CONFIG_NO_IFACE_BIND
 1474:       /* Workaround missing ability to bind to an iface */
 1475:       if ((s->type == SK_UDP) && s->iface && ipa_zero(bind_addr))
 1476:       {
 1477: 	if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &y, sizeof(y)) < 0)
 1478: 	  ERR2("SO_REUSEPORT");
 1479:       }
 1480: #endif
 1481:     }
 1482:     else
 1483:       if (s->flags & SKF_HIGH_PORT)
 1484: 	if (sk_set_high_port(s) < 0)
 1485: 	  log(L_WARN "Socket error: %s%#m", s->err);
 1486: 
 1487:     sockaddr_fill(&sa, af, bind_addr, s->iface, bind_port);
 1488:     if (bind(fd, &sa.sa, SA_LEN(sa)) < 0)
 1489:       ERR2("bind");
 1490:   }
 1491: 
 1492:   if (s->password)
 1493:     if (sk_set_md5_auth(s, s->saddr, s->daddr, s->iface, s->password, 0) < 0)
 1494:       goto err;
 1495: 
 1496:   switch (s->type)
 1497:   {
 1498:   case SK_TCP_ACTIVE:
 1499:     sockaddr_fill(&sa, af, s->daddr, s->iface, s->dport);
 1500:     if (connect(fd, &sa.sa, SA_LEN(sa)) >= 0)
 1501:       sk_tcp_connected(s);
 1502:     else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS &&
 1503: 	     errno != ECONNREFUSED && errno != EHOSTUNREACH && errno != ENETUNREACH)
 1504:       ERR2("connect");
 1505:     break;
 1506: 
 1507:   case SK_TCP_PASSIVE:
 1508:     if (listen(fd, 8) < 0)
 1509:       ERR2("listen");
 1510:     break;
 1511: 
 1512:   case SK_MAGIC:
 1513:     break;
 1514: 
 1515:   default:
 1516:     sk_alloc_bufs(s);
 1517:   }
 1518: 
 1519:   if (!(s->flags & SKF_THREAD))
 1520:     sk_insert(s);
 1521:   return 0;
 1522: 
 1523: err:
 1524:   close(fd);
 1525:   s->fd = -1;
 1526:   return -1;
 1527: }
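/*
 * Editorial note on the bind logic above: for SK_UDP the socket is always
 * bound (to s->sport, and to s->saddr only when SKF_BIND is set), while for
 * SK_IP it is bound only when SKF_BIND supplies a source address. A
 * hypothetical UDP-based protocol might therefore set up:
 *
 *   s->type = SK_UDP;
 *   s->sport = s->dport = 521;     // arbitrary example port
 *   s->iface = ifa;
 *   s->flags = SKF_BIND;           // bind to s->saddr as well, not just the port
 *   s->saddr = local_addr;         // assumed local address
 */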
 1528: 
 1529: int
 1530: sk_open_unix(sock *s, char *name)
 1531: {
 1532:   struct sockaddr_un sa;
 1533:   int fd;
 1534: 
  1535:   /* We are sloppy on errors (we leak the fd and do not set s->err), but we die anyway */
 1536: 
 1537:   fd = socket(AF_UNIX, SOCK_STREAM, 0);
 1538:   if (fd < 0)
 1539:     return -1;
 1540: 
 1541:   if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
 1542:     return -1;
 1543: 
 1544:   /* Path length checked in test_old_bird() */
 1545:   sa.sun_family = AF_UNIX;
 1546:   strcpy(sa.sun_path, name);
 1547: 
 1548:   if (bind(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) < 0)
 1549:     return -1;
 1550: 
 1551:   if (listen(fd, 8) < 0)
 1552:     return -1;
 1553: 
 1554:   s->fd = fd;
 1555:   sk_insert(s);
 1556:   return 0;
 1557: }
 1558: 
 1559: 
 1560: #define CMSG_RX_SPACE MAX(CMSG4_SPACE_PKTINFO+CMSG4_SPACE_TTL, \
 1561: 			  CMSG6_SPACE_PKTINFO+CMSG6_SPACE_TTL)
 1562: #define CMSG_TX_SPACE MAX(CMSG4_SPACE_PKTINFO,CMSG6_SPACE_PKTINFO)
 1563: 
 1564: static void
 1565: sk_prepare_cmsgs(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
 1566: {
 1567:   if (sk_is_ipv4(s))
 1568:     sk_prepare_cmsgs4(s, msg, cbuf, cbuflen);
 1569:   else
 1570:     sk_prepare_cmsgs6(s, msg, cbuf, cbuflen);
 1571: }
 1572: 
 1573: static void
 1574: sk_process_cmsgs(sock *s, struct msghdr *msg)
 1575: {
 1576:   struct cmsghdr *cm;
 1577: 
 1578:   s->laddr = IPA_NONE;
 1579:   s->lifindex = 0;
 1580:   s->rcv_ttl = -1;
 1581: 
 1582:   for (cm = CMSG_FIRSTHDR(msg); cm != NULL; cm = CMSG_NXTHDR(msg, cm))
 1583:   {
 1584:     if ((cm->cmsg_level == SOL_IP) && sk_is_ipv4(s))
 1585:     {
 1586:       sk_process_cmsg4_pktinfo(s, cm);
 1587:       sk_process_cmsg4_ttl(s, cm);
 1588:     }
 1589: 
 1590:     if ((cm->cmsg_level == SOL_IPV6) && sk_is_ipv6(s))
 1591:     {
 1592:       sk_process_cmsg6_pktinfo(s, cm);
 1593:       sk_process_cmsg6_ttl(s, cm);
 1594:     }
 1595:   }
 1596: }
 1597: 
 1598: 
 1599: static inline int
 1600: sk_sendmsg(sock *s)
 1601: {
 1602:   struct iovec iov = {s->tbuf, s->tpos - s->tbuf};
 1603:   byte cmsg_buf[CMSG_TX_SPACE];
 1604:   sockaddr dst;
 1605:   int flags = 0;
 1606: 
 1607:   sockaddr_fill(&dst, s->af, s->daddr, s->iface, s->dport);
 1608: 
 1609:   struct msghdr msg = {
 1610:     .msg_name = &dst.sa,
 1611:     .msg_namelen = SA_LEN(dst),
 1612:     .msg_iov = &iov,
 1613:     .msg_iovlen = 1
 1614:   };
 1615: 
 1616: #ifdef CONFIG_DONTROUTE_UNICAST
 1617:   /* FreeBSD silently changes TTL to 1 when MSG_DONTROUTE is used, therefore we
 1618:      cannot use it for other cases (e.g. when TTL security is used). */
 1619:   if (ipa_is_ip4(s->daddr) && ip4_is_unicast(ipa_to_ip4(s->daddr)) && (s->ttl == 1))
 1620:     flags = MSG_DONTROUTE;
 1621: #endif
 1622: 
 1623: #ifdef CONFIG_USE_HDRINCL
 1624:   byte hdr[20];
 1625:   struct iovec iov2[2] = { {hdr, 20}, iov };
 1626: 
 1627:   if (s->flags & SKF_HDRINCL)
 1628:   {
 1629:     sk_prepare_ip_header(s, hdr, iov.iov_len);
 1630:     msg.msg_iov = iov2;
 1631:     msg.msg_iovlen = 2;
 1632:   }
 1633: #endif
 1634: 
 1635:   if (s->flags & SKF_PKTINFO)
 1636:     sk_prepare_cmsgs(s, &msg, cmsg_buf, sizeof(cmsg_buf));
 1637: 
 1638:   return sendmsg(s->fd, &msg, flags);
 1639: }
 1640: 
 1641: static inline int
 1642: sk_recvmsg(sock *s)
 1643: {
 1644:   struct iovec iov = {s->rbuf, s->rbsize};
 1645:   byte cmsg_buf[CMSG_RX_SPACE];
 1646:   sockaddr src;
 1647: 
 1648:   struct msghdr msg = {
 1649:     .msg_name = &src.sa,
 1650:     .msg_namelen = sizeof(src), // XXXX ??
 1651:     .msg_iov = &iov,
 1652:     .msg_iovlen = 1,
 1653:     .msg_control = cmsg_buf,
 1654:     .msg_controllen = sizeof(cmsg_buf),
 1655:     .msg_flags = 0
 1656:   };
 1657: 
 1658:   int rv = recvmsg(s->fd, &msg, 0);
 1659:   if (rv < 0)
 1660:     return rv;
 1661: 
 1662:   //ifdef IPV4
 1663:   //  if (cf_type == SK_IP)
 1664:   //    rv = ipv4_skip_header(pbuf, rv);
 1665:   //endif
 1666: 
 1667:   sockaddr_read(&src, s->af, &s->faddr, NULL, &s->fport);
 1668:   sk_process_cmsgs(s, &msg);
 1669: 
 1670:   if (msg.msg_flags & MSG_TRUNC)
 1671:     s->flags |= SKF_TRUNCATED;
 1672:   else
 1673:     s->flags &= ~SKF_TRUNCATED;
 1674: 
 1675:   return rv;
 1676: }
 1677: 
 1678: 
 1679: static inline void reset_tx_buffer(sock *s) { s->ttx = s->tpos = s->tbuf; }
 1680: 
 1681: static int
 1682: sk_maybe_write(sock *s)
 1683: {
 1684:   int e;
 1685: 
 1686:   switch (s->type)
 1687:   {
 1688:   case SK_TCP:
 1689:   case SK_MAGIC:
 1690:   case SK_UNIX:
 1691:     while (s->ttx != s->tpos)
 1692:     {
 1693:       e = write(s->fd, s->ttx, s->tpos - s->ttx);
 1694: 
 1695:       if (e < 0)
 1696:       {
 1697: 	if (errno != EINTR && errno != EAGAIN)
 1698: 	{
 1699: 	  reset_tx_buffer(s);
 1700: 	  /* EPIPE is just a connection close notification during TX */
 1701: 	  s->err_hook(s, (errno != EPIPE) ? errno : 0);
 1702: 	  return -1;
 1703: 	}
 1704: 	return 0;
 1705:       }
 1706:       s->ttx += e;
 1707:     }
 1708:     reset_tx_buffer(s);
 1709:     return 1;
 1710: 
 1711:   case SK_UDP:
 1712:   case SK_IP:
 1713:     {
 1714:       if (s->tbuf == s->tpos)
 1715: 	return 1;
 1716: 
 1717:       e = sk_sendmsg(s);
 1718: 
 1719:       if (e < 0)
 1720:       {
 1721: 	if (errno != EINTR && errno != EAGAIN)
 1722: 	{
 1723: 	  reset_tx_buffer(s);
 1724: 	  s->err_hook(s, errno);
 1725: 	  return -1;
 1726: 	}
 1727: 
 1728: 	if (!s->tx_hook)
 1729: 	  reset_tx_buffer(s);
 1730: 	return 0;
 1731:       }
 1732:       reset_tx_buffer(s);
 1733:       return 1;
 1734:     }
 1735:   default:
 1736:     bug("sk_maybe_write: unknown socket type %d", s->type);
 1737:   }
 1738: }
 1739: 
 1740: int
 1741: sk_rx_ready(sock *s)
 1742: {
 1743:   int rv;
 1744:   struct pollfd pfd = { .fd = s->fd };
 1745:   pfd.events |= POLLIN;
 1746: 
 1747:  redo:
 1748:   rv = poll(&pfd, 1, 0);
 1749: 
 1750:   if ((rv < 0) && (errno == EINTR || errno == EAGAIN))
 1751:     goto redo;
 1752: 
 1753:   return rv;
 1754: }
 1755: 
 1756: /**
 1757:  * sk_send - send data to a socket
 1758:  * @s: socket
 1759:  * @len: number of bytes to send
 1760:  *
 1761:  * This function sends @len bytes of data prepared in the
 1762:  * transmit buffer of the socket @s to the network connection.
  1763:  * If the packet can be sent immediately, it does so and returns
  1764:  * 1; otherwise it queues the packet for later processing, returns 0
  1765:  * and calls the @tx_hook of the socket when the transmission
  1766:  * takes place.
 1767:  */
 1768: int
 1769: sk_send(sock *s, unsigned len)
 1770: {
 1771:   s->ttx = s->tbuf;
 1772:   s->tpos = s->tbuf + len;
 1773:   return sk_maybe_write(s);
 1774: }
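/*
 * Editorial sketch of the sk_send() contract described above:
 *
 *   memcpy(s->tbuf, pkt, len);     // data prepared in the transmit buffer
 *   if (sk_send(s, len) > 0)
 *     ;                            // sent right away
 *   else
 *     ;                            // queued; s->tx_hook fires once it goes out
 *
 * A negative return value means an error occurred and the @err_hook has
 * already been called.
 */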
 1775: 
 1776: /**
 1777:  * sk_send_to - send data to a specific destination
 1778:  * @s: socket
 1779:  * @len: number of bytes to send
 1780:  * @addr: IP address to send the packet to
 1781:  * @port: port to send the packet to
 1782:  *
  1783:  * This is a sk_send() replacement for connectionless packet sockets
  1784:  * which allows the destination of the packet to be chosen dynamically.
 1785:  * Raw IP sockets should use 0 for @port.
 1786:  */
 1787: int
 1788: sk_send_to(sock *s, unsigned len, ip_addr addr, unsigned port)
 1789: {
 1790:   s->daddr = addr;
 1791:   if (port)
 1792:     s->dport = port;
 1793: 
 1794:   s->ttx = s->tbuf;
 1795:   s->tpos = s->tbuf + len;
 1796:   return sk_maybe_write(s);
 1797: }
 1798: 
 1799: /*
 1800: int
 1801: sk_send_full(sock *s, unsigned len, struct iface *ifa,
 1802: 	     ip_addr saddr, ip_addr daddr, unsigned dport)
 1803: {
 1804:   s->iface = ifa;
 1805:   s->saddr = saddr;
 1806:   s->daddr = daddr;
 1807:   s->dport = dport;
 1808:   s->ttx = s->tbuf;
 1809:   s->tpos = s->tbuf + len;
 1810:   return sk_maybe_write(s);
 1811: }
 1812: */
 1813: 
 1814:  /* sk_read() and sk_write() are called from BFD's event loop */
 1815: 
 1816: int
 1817: sk_read(sock *s, int revents)
 1818: {
 1819:   switch (s->type)
 1820:   {
 1821:   case SK_TCP_PASSIVE:
 1822:     return sk_passive_connected(s, SK_TCP);
 1823: 
 1824:   case SK_UNIX_PASSIVE:
 1825:     return sk_passive_connected(s, SK_UNIX);
 1826: 
 1827:   case SK_TCP:
 1828:   case SK_UNIX:
 1829:     {
 1830:       int c = read(s->fd, s->rpos, s->rbuf + s->rbsize - s->rpos);
 1831: 
 1832:       if (c < 0)
 1833:       {
 1834: 	if (errno != EINTR && errno != EAGAIN)
 1835: 	  s->err_hook(s, errno);
 1836: 	else if (errno == EAGAIN && !(revents & POLLIN))
 1837: 	{
 1838: 	  log(L_ERR "Got EAGAIN from read when revents=%x (without POLLIN)", revents);
 1839: 	  s->err_hook(s, 0);
 1840: 	}
 1841:       }
 1842:       else if (!c)
 1843: 	s->err_hook(s, 0);
 1844:       else
 1845:       {
 1846: 	s->rpos += c;
 1847: 	if (s->rx_hook(s, s->rpos - s->rbuf))
 1848: 	{
 1849: 	  /* We need to be careful since the socket could have been deleted by the hook */
 1850: 	  if (current_sock == s)
 1851: 	    s->rpos = s->rbuf;
 1852: 	}
 1853: 	return 1;
 1854:       }
 1855:       return 0;
 1856:     }
 1857: 
 1858:   case SK_MAGIC:
 1859:     return s->rx_hook(s, 0);
 1860: 
 1861:   default:
 1862:     {
 1863:       int e = sk_recvmsg(s);
 1864: 
 1865:       if (e < 0)
 1866:       {
 1867: 	if (errno != EINTR && errno != EAGAIN)
 1868: 	  s->err_hook(s, errno);
 1869: 	return 0;
 1870:       }
 1871: 
 1872:       s->rpos = s->rbuf + e;
 1873:       s->rx_hook(s, e);
 1874:       return 1;
 1875:     }
 1876:   }
 1877: }
 1878: 
 1879: int
 1880: sk_write(sock *s)
 1881: {
 1882:   switch (s->type)
 1883:   {
 1884:   case SK_TCP_ACTIVE:
 1885:     {
 1886:       sockaddr sa;
 1887:       sockaddr_fill(&sa, s->af, s->daddr, s->iface, s->dport);
 1888: 
 1889:       if (connect(s->fd, &sa.sa, SA_LEN(sa)) >= 0 || errno == EISCONN)
 1890: 	sk_tcp_connected(s);
 1891:       else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS)
 1892: 	s->err_hook(s, errno);
 1893:       return 0;
 1894:     }
 1895: 
 1896:   default:
 1897:     if (s->ttx != s->tpos && sk_maybe_write(s) > 0)
 1898:     {
 1899:       if (s->tx_hook)
 1900: 	s->tx_hook(s);
 1901:       return 1;
 1902:     }
 1903:     return 0;
 1904:   }
 1905: }
 1906: 
 1907: void
 1908: sk_err(sock *s, int revents)
 1909: {
 1910:   int se = 0, sse = sizeof(se);
 1911:   if ((s->type != SK_MAGIC) && (revents & POLLERR))
 1912:     if (getsockopt(s->fd, SOL_SOCKET, SO_ERROR, &se, &sse) < 0)
 1913:     {
 1914:       log(L_ERR "IO: Socket error: SO_ERROR: %m");
 1915:       se = 0;
 1916:     }
 1917: 
 1918:   s->err_hook(s, se);
 1919: }
 1920: 
 1921: void
 1922: sk_dump_all(void)
 1923: {
 1924:   node *n;
 1925:   sock *s;
 1926: 
 1927:   debug("Open sockets:\n");
 1928:   WALK_LIST(n, sock_list)
 1929:   {
 1930:     s = SKIP_BACK(sock, n, n);
 1931:     debug("%p ", s);
 1932:     sk_dump(&s->r);
 1933:   }
 1934:   debug("\n");
 1935: }
 1936: 
 1937: 
 1938: /*
 1939:  *	Internal event log and watchdog
 1940:  */
 1941: 
 1942: #define EVENT_LOG_LENGTH 32
 1943: 
 1944: struct event_log_entry
 1945: {
 1946:   void *hook;
 1947:   void *data;
 1948:   btime timestamp;
 1949:   btime duration;
 1950: };
 1951: 
 1952: static struct event_log_entry event_log[EVENT_LOG_LENGTH];
 1953: static struct event_log_entry *event_open;
 1954: static int event_log_pos, event_log_num, watchdog_active;
 1955: static btime last_time;
 1956: static btime loop_time;
 1957: 
 1958: static void
 1959: io_update_time(void)
 1960: {
 1961:   struct timespec ts;
 1962:   int rv;
 1963: 
 1964:   if (!clock_monotonic_available)
 1965:     return;
 1966: 
 1967:   /*
  1968:    * This is the third time-tracking procedure (after update_times() above and
  1969:    * times_update() in BFD), dedicated to the internal event log and latency
  1970:    * tracking. Hopefully, we will consolidate these some day.
 1971:    */
 1972: 
 1973:   rv = clock_gettime(CLOCK_MONOTONIC, &ts);
 1974:   if (rv < 0)
 1975:     die("clock_gettime: %m");
 1976: 
 1977:   last_time = ((s64) ts.tv_sec S) + (ts.tv_nsec / 1000);
 1978: 
 1979:   if (event_open)
 1980:   {
 1981:     event_open->duration = last_time - event_open->timestamp;
 1982: 
 1983:     if (event_open->duration > config->latency_limit)
 1984:       log(L_WARN "Event 0x%p 0x%p took %d ms",
 1985: 	  event_open->hook, event_open->data, (int) (event_open->duration TO_MS));
 1986: 
 1987:     event_open = NULL;
 1988:   }
 1989: }
 1990: 
 1991: /**
  1992:  * io_log_event - record an approaching event in the event log
 1993:  * @hook: event hook address
 1994:  * @data: event data address
 1995:  *
 1996:  * Store info (hook, data, timestamp) about the following internal event into
 1997:  * a circular event log (@event_log). When latency tracking is enabled, the log
 1998:  * entry is kept open (in @event_open) so the duration can be filled later.
 1999:  */
 2000: void
 2001: io_log_event(void *hook, void *data)
 2002: {
 2003:   if (config->latency_debug)
 2004:     io_update_time();
 2005: 
 2006:   struct event_log_entry *en = event_log + event_log_pos;
 2007: 
 2008:   en->hook = hook;
 2009:   en->data = data;
 2010:   en->timestamp = last_time;
 2011:   en->duration = 0;
 2012: 
 2013:   event_log_num++;
 2014:   event_log_pos++;
 2015:   event_log_pos %= EVENT_LOG_LENGTH;
 2016: 
 2017:   event_open = config->latency_debug ? en : NULL;
 2018: }
 2019: 
 2020: static inline void
 2021: io_close_event(void)
 2022: {
 2023:   if (event_open)
 2024:     io_update_time();
 2025: }
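
/*
 * Editor's sketch of the intended calling pattern, mirroring what io_loop()
 * does below for socket hooks: log the event just before invoking the
 * callback. The open entry is closed (and its duration measured) by the
 * next io_update_time(), reached either via the next io_log_event() or via
 * io_close_event() at the top of the main loop:
 *
 *	io_log_event(s->rx_hook, s->data);
 *	sk_read(s, pfd[s->index].revents);
 *	...
 *	io_close_event();
 */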
 2026: 
 2027: void
 2028: io_log_dump(void)
 2029: {
 2030:   int i;
 2031: 
 2032:   log(L_DEBUG "Event log:");
 2033:   for (i = 0; i < EVENT_LOG_LENGTH; i++)
 2034:   {
 2035:     struct event_log_entry *en = event_log + (event_log_pos + i) % EVENT_LOG_LENGTH;
 2036:     if (en->hook)
 2037:       log(L_DEBUG "  Event 0x%p 0x%p at %8d for %d ms", en->hook, en->data,
 2038: 	  (int) ((last_time - en->timestamp) TO_MS), (int) (en->duration TO_MS));
 2039:   }
 2040: }
 2041: 
 2042: void
 2043: watchdog_sigalrm(int sig UNUSED)
 2044: {
 2045:   /* Update last_time and duration, but skip latency check */
 2046:   config->latency_limit = 0xffffffff;
 2047:   io_update_time();
 2048: 
 2049:   /* We want a core dump */
 2050:   abort();
 2051: }
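
/*
 * Editor's sketch (assumption -- the SIGALRM handler registration is not
 * shown in this section and belongs to the daemon's signal setup): wiring
 * the handler up so that the alarm() armed in watchdog_start() below can
 * deliver SIGALRM to it. Needs <signal.h>.
 *
 *	struct sigaction sa;
 *	bzero(&sa, sizeof(sa));
 *	sa.sa_handler = watchdog_sigalrm;
 *	sigemptyset(&sa.sa_mask);
 *	sa.sa_flags = 0;
 *	if (sigaction(SIGALRM, &sa, NULL) < 0)
 *	  die("sigaction: %m");
 */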
 2052: 
 2053: static inline void
 2054: watchdog_start1(void)
 2055: {
 2056:   io_update_time();
 2057: 
 2058:   loop_time = last_time;
 2059: }
 2060: 
 2061: static inline void
 2062: watchdog_start(void)
 2063: {
 2064:   io_update_time();
 2065: 
 2066:   loop_time = last_time;
 2067:   event_log_num = 0;
 2068: 
 2069:   if (config->watchdog_timeout)
 2070:   {
 2071:     alarm(config->watchdog_timeout);
 2072:     watchdog_active = 1;
 2073:   }
 2074: }
 2075: 
 2076: static inline void
 2077: watchdog_stop(void)
 2078: {
 2079:   io_update_time();
 2080: 
 2081:   if (watchdog_active)
 2082:   {
 2083:     alarm(0);
 2084:     watchdog_active = 0;
 2085:   }
 2086: 
 2087:   btime duration = last_time - loop_time;
 2088:   if (duration > config->watchdog_warning)
 2089:     log(L_WARN "I/O loop cycle took %d ms for %d events",
 2090: 	(int) (duration TO_MS), event_log_num);
 2091: }
 2092: 
 2093: 
 2094: /*
 2095:  *	Main I/O Loop
 2096:  */
 2097: 
 2098: volatile int async_config_flag;		/* Asynchronous reconfiguration/dump scheduled */
 2099: volatile int async_dump_flag;
 2100: volatile int async_shutdown_flag;
 2101: 
 2102: void
 2103: io_init(void)
 2104: {
 2105:   init_list(&near_timers);
 2106:   init_list(&far_timers);
 2107:   init_list(&sock_list);
 2108:   init_list(&global_event_list);
 2109:   krt_io_init();
 2110:   init_times();
 2111:   update_times();
 2112:   boot_time = now;
 2113:   srandom((int) now_real);
 2114: }
 2115: 
 2116: static int short_loops = 0;
 2117: #define SHORT_LOOP_MAX 10
 2118: 
 2119: void
 2120: io_loop(void)
 2121: {
 2122:   int poll_tout;
 2123:   time_t tout;
 2124:   int nfds, events, pout;
 2125:   sock *s;
 2126:   node *n;
 2127:   int fdmax = 256;
 2128:   struct pollfd *pfd = xmalloc(fdmax * sizeof(struct pollfd));
 2129: 
 2130:   watchdog_start1();
 2131:   for(;;)
 2132:     {
 2133:       events = ev_run_list(&global_event_list);
 2134:     timers:
 2135:       update_times();
 2136:       tout = tm_first_shot();
 2137:       if (tout <= now)
 2138: 	{
 2139: 	  tm_shot();
 2140: 	  goto timers;
 2141: 	}
 2142:       poll_tout = (events ? 0 : MIN(tout - now, 3)) * 1000; /* Time in milliseconds */
 2143: 
 2144:       io_close_event();
 2145: 
 2146:       nfds = 0;
 2147:       WALK_LIST(n, sock_list)
 2148: 	{
 2149: 	  pfd[nfds] = (struct pollfd) { .fd = -1 }; /* everything else is zeroed by this */
 2150: 	  s = SKIP_BACK(sock, n, n);
 2151: 	  if (s->rx_hook)
 2152: 	    {
 2153: 	      pfd[nfds].fd = s->fd;
 2154: 	      pfd[nfds].events |= POLLIN;
 2155: 	    }
 2156: 	  if (s->tx_hook && s->ttx != s->tpos)
 2157: 	    {
 2158: 	      pfd[nfds].fd = s->fd;
 2159: 	      pfd[nfds].events |= POLLOUT;
 2160: 	    }
 2161: 	  if (pfd[nfds].fd != -1)
 2162: 	    {
 2163: 	      s->index = nfds;
 2164: 	      nfds++;
 2165: 	    }
 2166: 	  else
 2167: 	    s->index = -1;
 2168: 
 2169: 	  if (nfds >= fdmax)
 2170: 	    {
 2171: 	      fdmax *= 2;
 2172: 	      pfd = xrealloc(pfd, fdmax * sizeof(struct pollfd));
 2173: 	    }
 2174: 	}
 2175: 
 2176:       /*
 2177:        * Yes, this is racy. But even if the signal arrives between this test
 2178:        * and the call to poll(), it gets caught on the next timer tick.
 2179:        */
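
      /*
       * Editor's note: a generally race-free alternative is to block the
       * relevant signals, re-check the flags, and enter ppoll() with the
       * original mask restored atomically. BIRD does not do that here; the
       * sketch below is illustrative only (SIGHUP stands in for whatever
       * signal sets these flags, timeout_ts for a struct timespec version
       * of poll_tout):
       *
       *	sigset_t block, orig;
       *	sigemptyset(&block);
       *	sigaddset(&block, SIGHUP);
       *	sigprocmask(SIG_BLOCK, &block, &orig);
       *	if (!async_config_flag && !async_dump_flag && !async_shutdown_flag)
       *	  ppoll(pfd, nfds, &timeout_ts, &orig);
       *	sigprocmask(SIG_SETMASK, &orig, NULL);
       */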
 2180: 
 2181:       if (async_config_flag)
 2182: 	{
 2183: 	  io_log_event(async_config, NULL);
 2184: 	  async_config();
 2185: 	  async_config_flag = 0;
 2186: 	  continue;
 2187: 	}
 2188:       if (async_dump_flag)
 2189: 	{
 2190: 	  io_log_event(async_dump, NULL);
 2191: 	  async_dump();
 2192: 	  async_dump_flag = 0;
 2193: 	  continue;
 2194: 	}
 2195:       if (async_shutdown_flag)
 2196: 	{
 2197: 	  io_log_event(async_shutdown, NULL);
 2198: 	  async_shutdown();
 2199: 	  async_shutdown_flag = 0;
 2200: 	  continue;
 2201: 	}
 2202: 
 2203:       /* And finally enter poll() to find active sockets */
 2204:       watchdog_stop();
 2205:       pout = poll(pfd, nfds, poll_tout);
 2206:       watchdog_start();
 2207: 
 2208:       if (pout < 0)
 2209: 	{
 2210: 	  if (errno == EINTR || errno == EAGAIN)
 2211: 	    continue;
 2212: 	  die("poll: %m");
 2213: 	}
 2214:       if (pout)
 2215: 	{
 2216: 	  /* guaranteed to be non-empty */
 2217: 	  current_sock = SKIP_BACK(sock, n, HEAD(sock_list));
 2218: 
 2219: 	  while (current_sock)
 2220: 	    {
 2221: 	      sock *s = current_sock;
 2222: 	      if (s->index == -1)
 2223: 		{
 2224: 		  current_sock = sk_next(s);
 2225: 		  goto next;
 2226: 		}
 2227: 
 2228: 	      int e;
 2229: 	      int steps;
 2230: 
 2231: 	      steps = MAX_STEPS;
 2232: 	      if (s->fast_rx && (pfd[s->index].revents & POLLIN) && s->rx_hook)
 2233: 		do
 2234: 		  {
 2235: 		    steps--;
 2236: 		    io_log_event(s->rx_hook, s->data);
 2237: 		    e = sk_read(s, pfd[s->index].revents);
 2238: 		    if (s != current_sock)
 2239: 		      goto next;
 2240: 		  }
 2241: 		while (e && s->rx_hook && steps);
 2242: 
 2243: 	      steps = MAX_STEPS;
 2244: 	      if (pfd[s->index].revents & POLLOUT)
 2245: 		do
 2246: 		  {
 2247: 		    steps--;
 2248: 		    io_log_event(s->tx_hook, s->data);
 2249: 		    e = sk_write(s);
 2250: 		    if (s != current_sock)
 2251: 		      goto next;
 2252: 		  }
 2253: 		while (e && steps);
 2254: 
 2255: 	      current_sock = sk_next(s);
 2256: 	    next: ;
 2257: 	    }
 2258: 
 2259: 	  short_loops++;
 2260: 	  if (events && (short_loops < SHORT_LOOP_MAX))
 2261: 	    continue;
 2262: 	  short_loops = 0;
 2263: 
 2264: 	  int count = 0;
 2265: 	  current_sock = stored_sock;
 2266: 	  if (current_sock == NULL)
 2267: 	    current_sock = SKIP_BACK(sock, n, HEAD(sock_list));
 2268: 
 2269: 	  while (current_sock && count < MAX_RX_STEPS)
 2270: 	    {
 2271: 	      sock *s = current_sock;
 2272: 	      if (s->index == -1)
 2273: 		{
 2274: 		  current_sock = sk_next(s);
 2275: 		  goto next2;
 2276: 		}
 2277: 
 2278: 	      if (!s->fast_rx && (pfd[s->index].revents & POLLIN) && s->rx_hook)
 2279: 		{
 2280: 		  count++;
 2281: 		  io_log_event(s->rx_hook, s->data);
 2282: 		  sk_read(s, pfd[s->index].revents);
 2283: 		  if (s != current_sock)
 2284: 		    goto next2;
 2285: 		}
 2286: 
 2287: 	      if (pfd[s->index].revents & (POLLHUP | POLLERR))
 2288: 		{
 2289: 		  sk_err(s, pfd[s->index].revents);
 2290: 		  if (s != current_sock)
 2291: 		    goto next2;
 2292: 		}
 2293: 
 2294: 	      current_sock = sk_next(s);
 2295: 	    next2: ;
 2296: 	    }
 2297: 
 2298: 
 2299: 	  stored_sock = current_sock;
 2300: 	}
 2301:     }
 2302: }
 2303: 
 2304: void
 2305: test_old_bird(char *path)
 2306: {
 2307:   int fd;
 2308:   struct sockaddr_un sa;
 2309: 
 2310:   fd = socket(AF_UNIX, SOCK_STREAM, 0);
 2311:   if (fd < 0)
 2312:     die("Cannot create socket: %m");
 2313:   if (strlen(path) >= sizeof(sa.sun_path))
 2314:     die("Socket path too long");
 2315:   bzero(&sa, sizeof(sa));
 2316:   sa.sun_family = AF_UNIX;
 2317:   strcpy(sa.sun_path, path);
 2318:   if (connect(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) == 0)
 2319:     die("I found another BIRD running.");
 2320:   close(fd);
 2321: }
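
/*
 * Editor's note: this check is intended to run once during startup, before
 * the daemon creates its own control socket, with that socket's path as the
 * argument. A hypothetical call site (the variable name is illustrative):
 *
 *	test_old_bird(path_control_socket);
 */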
