Annotation of embedaddon/bird/sysdep/unix/io.c, revision 1.1.1.2

1.1       misho       1: /*
                      2:  *     BIRD Internet Routing Daemon -- Unix I/O
                      3:  *
                      4:  *     (c) 1998--2004 Martin Mares <mj@ucw.cz>
                      5:  *      (c) 2004       Ondrej Filip <feela@network.cz>
                      6:  *
                      7:  *     Can be freely distributed and used under the terms of the GNU GPL.
                      8:  */
                      9: 
                     10: /* Unfortunately, some glibc versions hide parts of RFC 3542 API
                     11:    if _GNU_SOURCE is not defined. */
                     12: #ifndef _GNU_SOURCE
                     13: #define _GNU_SOURCE
                     14: #endif
                     15: 
                     16: #include <stdio.h>
                     17: #include <stdlib.h>
                     18: #include <time.h>
                     19: #include <sys/time.h>
                     20: #include <sys/types.h>
                     21: #include <sys/socket.h>
                     22: #include <sys/uio.h>
                     23: #include <sys/un.h>
                     24: #include <poll.h>
                     25: #include <unistd.h>
                     26: #include <fcntl.h>
                     27: #include <errno.h>
                     28: #include <net/if.h>
                     29: #include <netinet/in.h>
                     30: #include <netinet/tcp.h>
                     31: #include <netinet/udp.h>
                     32: #include <netinet/icmp6.h>
                     33: 
                     34: #include "nest/bird.h"
                     35: #include "lib/lists.h"
                     36: #include "lib/resource.h"
                     37: #include "lib/timer.h"
                     38: #include "lib/socket.h"
                     39: #include "lib/event.h"
                     40: #include "lib/string.h"
                     41: #include "nest/iface.h"
                     42: 
                     43: #include "lib/unix.h"
                     44: #include "lib/sysio.h"
                     45: 
                     46: /* Maximum number of calls of tx handler for one socket in one
                     47:  * poll iteration. Should be small enough to not monopolize CPU by
                     48:  * one protocol instance.
                     49:  */
                     50: #define MAX_STEPS 4
                     51: 
                     52: /* Maximum number of calls of rx handler for all sockets in one poll
                     53:    iteration. RX callbacks are often much more costly so we limit
                     54:    this to get small latencies */
                     55: #define MAX_RX_STEPS 4
                     56: 
1.1.1.2 ! misho      57: 
1.1       misho      58: /*
                     59:  *     Tracked Files
                     60:  */
                     61: 
                     62: struct rfile {
                     63:   resource r;
                     64:   FILE *f;
                     65: };
                     66: 
                     67: static void
                     68: rf_free(resource *r)
                     69: {
                     70:   struct rfile *a = (struct rfile *) r;
                     71: 
                     72:   fclose(a->f);
                     73: }
                     74: 
                     75: static void
                     76: rf_dump(resource *r)
                     77: {
                     78:   struct rfile *a = (struct rfile *) r;
                     79: 
                     80:   debug("(FILE *%p)\n", a->f);
                     81: }
                     82: 
                     83: static struct resclass rf_class = {
                     84:   "FILE",
                     85:   sizeof(struct rfile),
                     86:   rf_free,
                     87:   rf_dump,
                     88:   NULL,
                     89:   NULL
                     90: };
                     91: 
1.1.1.2 ! misho      92: struct rfile *
        !            93: rf_open(pool *p, char *name, char *mode)
1.1       misho      94: {
                     95:   FILE *f = fopen(name, mode);
                     96: 
1.1.1.2 ! misho      97:   if (!f)
        !            98:     return NULL;
        !            99: 
        !           100:   struct rfile *r = ralloc(p, &rf_class);
        !           101:   r->f = f;
        !           102:   return r;
        !           103: }
        !           104: 
        !           105: void *
        !           106: rf_file(struct rfile *f)
        !           107: {
        !           108:   return f->f;
        !           109: }
        !           110: 
        !           111: int
        !           112: rf_fileno(struct rfile *f)
        !           113: {
        !           114:   return fileno(f->f);
1.1       misho     115: }
                    116: 
1.1.1.2 ! misho     117: 
1.1       misho     118: /**
                    119:  * DOC: Timers
                    120:  *
                    121:  * Timers are resources which represent a wish of a module to call
                    122:  * a function at the specified time. The platform dependent code
                    123:  * doesn't guarantee exact timing, only that a timer function
                    124:  * won't be called before the requested time.
                    125:  *
                    126:  * In BIRD, time is represented by values of the &bird_clock_t type
                    127:  * which are integral numbers interpreted as a relative number of seconds since
                    128:  * some fixed time point in past. The current time can be read
                    129:  * from variable @now with reasonable accuracy and is monotonic. There is also
                    130:  * a current 'absolute' time in variable @now_real reported by OS.
                    131:  *
                    132:  * Each timer is described by a &timer structure containing a pointer
                    133:  * to the handler function (@hook), data private to this function (@data),
                    134:  * time the function should be called at (@expires, 0 for inactive timers),
                    135:  * for the other fields see |timer.h|.
                    136:  */
                    137: 
                    138: #define NEAR_TIMER_LIMIT 4
                    139: 
                    140: static list near_timers, far_timers;
                    141: static bird_clock_t first_far_timer = TIME_INFINITY;
                    142: 
                    143: /* now must be different from 0, because 0 is a special value in timer->expires */
                    144: bird_clock_t now = 1, now_real, boot_time;
                    145: 
                    146: static void
                    147: update_times_plain(void)
                    148: {
                    149:   bird_clock_t new_time = time(NULL);
                    150:   int delta = new_time - now_real;
                    151: 
                    152:   if ((delta >= 0) && (delta < 60))
                    153:     now += delta;
                    154:   else if (now_real != 0)
                    155:    log(L_WARN "Time jump, delta %d s", delta);
                    156: 
                    157:   now_real = new_time;
                    158: }
                    159: 
                    160: static void
                    161: update_times_gettime(void)
                    162: {
                    163:   struct timespec ts;
                    164:   int rv;
                    165: 
                    166:   rv = clock_gettime(CLOCK_MONOTONIC, &ts);
                    167:   if (rv != 0)
                    168:     die("clock_gettime: %m");
                    169: 
                    170:   if (ts.tv_sec != now) {
                    171:     if (ts.tv_sec < now)
                    172:       log(L_ERR "Monotonic timer is broken");
                    173: 
                    174:     now = ts.tv_sec;
                    175:     now_real = time(NULL);
                    176:   }
                    177: }
                    178: 
                    179: static int clock_monotonic_available;
                    180: 
                    181: static inline void
                    182: update_times(void)
                    183: {
                    184:   if (clock_monotonic_available)
                    185:     update_times_gettime();
                    186:   else
                    187:     update_times_plain();
                    188: }
                    189: 
                    190: static inline void
                    191: init_times(void)
                    192: {
                    193:  struct timespec ts;
                    194:  clock_monotonic_available = (clock_gettime(CLOCK_MONOTONIC, &ts) == 0);
                    195:  if (!clock_monotonic_available)
                    196:    log(L_WARN "Monotonic timer is missing");
                    197: }
                    198: 
                    199: 
                    200: static void
                    201: tm_free(resource *r)
                    202: {
                    203:   timer *t = (timer *) r;
                    204: 
                    205:   tm_stop(t);
                    206: }
                    207: 
                    208: static void
                    209: tm_dump(resource *r)
                    210: {
                    211:   timer *t = (timer *) r;
                    212: 
                    213:   debug("(code %p, data %p, ", t->hook, t->data);
                    214:   if (t->randomize)
                    215:     debug("rand %d, ", t->randomize);
                    216:   if (t->recurrent)
                    217:     debug("recur %d, ", t->recurrent);
                    218:   if (t->expires)
                    219:     debug("expires in %d sec)\n", t->expires - now);
                    220:   else
                    221:     debug("inactive)\n");
                    222: }
                    223: 
                    224: static struct resclass tm_class = {
                    225:   "Timer",
                    226:   sizeof(timer),
                    227:   tm_free,
                    228:   tm_dump,
                    229:   NULL,
                    230:   NULL
                    231: };
                    232: 
                    233: /**
                    234:  * tm_new - create a timer
                    235:  * @p: pool
                    236:  *
                    237:  * This function creates a new timer resource and returns
                    238:  * a pointer to it. To use the timer, you need to fill in
                    239:  * the structure fields and call tm_start() to start timing.
                    240:  */
                    241: timer *
                    242: tm_new(pool *p)
                    243: {
                    244:   timer *t = ralloc(p, &tm_class);
                    245:   return t;
                    246: }
                    247: 
                    248: static inline void
                    249: tm_insert_near(timer *t)
                    250: {
                    251:   node *n = HEAD(near_timers);
                    252: 
                    253:   while (n->next && (SKIP_BACK(timer, n, n)->expires < t->expires))
                    254:     n = n->next;
                    255:   insert_node(&t->n, n->prev);
                    256: }
                    257: 
                    258: /**
                    259:  * tm_start - start a timer
                    260:  * @t: timer
                    261:  * @after: number of seconds the timer should be run after
                    262:  *
                    263:  * This function schedules the hook function of the timer to
                    264:  * be called after @after seconds. If the timer has been already
                    265:  * started, it's @expire time is replaced by the new value.
                    266:  *
                    267:  * You can have set the @randomize field of @t, the timeout
                    268:  * will be increased by a random number of seconds chosen
                    269:  * uniformly from range 0 .. @randomize.
                    270:  *
                    271:  * You can call tm_start() from the handler function of the timer
                    272:  * to request another run of the timer. Also, you can set the @recurrent
                    273:  * field to have the timer re-added automatically with the same timeout.
                    274:  */
                    275: void
                    276: tm_start(timer *t, unsigned after)
                    277: {
                    278:   bird_clock_t when;
                    279: 
                    280:   if (t->randomize)
                    281:     after += random() % (t->randomize + 1);
                    282:   when = now + after;
                    283:   if (t->expires == when)
                    284:     return;
                    285:   if (t->expires)
                    286:     rem_node(&t->n);
                    287:   t->expires = when;
                    288:   if (after <= NEAR_TIMER_LIMIT)
                    289:     tm_insert_near(t);
                    290:   else
                    291:     {
                    292:       if (!first_far_timer || first_far_timer > when)
                    293:        first_far_timer = when;
                    294:       add_tail(&far_timers, &t->n);
                    295:     }
                    296: }
                    297: 
                    298: /**
                    299:  * tm_stop - stop a timer
                    300:  * @t: timer
                    301:  *
                    302:  * This function stops a timer. If the timer is already stopped,
                    303:  * nothing happens.
                    304:  */
                    305: void
                    306: tm_stop(timer *t)
                    307: {
                    308:   if (t->expires)
                    309:     {
                    310:       rem_node(&t->n);
                    311:       t->expires = 0;
                    312:     }
                    313: }
                    314: 
                    315: static void
                    316: tm_dump_them(char *name, list *l)
                    317: {
                    318:   node *n;
                    319:   timer *t;
                    320: 
                    321:   debug("%s timers:\n", name);
                    322:   WALK_LIST(n, *l)
                    323:     {
                    324:       t = SKIP_BACK(timer, n, n);
                    325:       debug("%p ", t);
                    326:       tm_dump(&t->r);
                    327:     }
                    328:   debug("\n");
                    329: }
                    330: 
                    331: void
                    332: tm_dump_all(void)
                    333: {
                    334:   tm_dump_them("Near", &near_timers);
                    335:   tm_dump_them("Far", &far_timers);
                    336: }
                    337: 
                    338: static inline time_t
                    339: tm_first_shot(void)
                    340: {
                    341:   time_t x = first_far_timer;
                    342: 
                    343:   if (!EMPTY_LIST(near_timers))
                    344:     {
                    345:       timer *t = SKIP_BACK(timer, n, HEAD(near_timers));
                    346:       if (t->expires < x)
                    347:        x = t->expires;
                    348:     }
                    349:   return x;
                    350: }
                    351: 
                    352: void io_log_event(void *hook, void *data);
                    353: 
                    354: static void
                    355: tm_shot(void)
                    356: {
                    357:   timer *t;
                    358:   node *n, *m;
                    359: 
                    360:   if (first_far_timer <= now)
                    361:     {
                    362:       bird_clock_t limit = now + NEAR_TIMER_LIMIT;
                    363:       first_far_timer = TIME_INFINITY;
                    364:       n = HEAD(far_timers);
                    365:       while (m = n->next)
                    366:        {
                    367:          t = SKIP_BACK(timer, n, n);
                    368:          if (t->expires <= limit)
                    369:            {
                    370:              rem_node(n);
                    371:              tm_insert_near(t);
                    372:            }
                    373:          else if (t->expires < first_far_timer)
                    374:            first_far_timer = t->expires;
                    375:          n = m;
                    376:        }
                    377:     }
                    378:   while ((n = HEAD(near_timers)) -> next)
                    379:     {
                    380:       int delay;
                    381:       t = SKIP_BACK(timer, n, n);
                    382:       if (t->expires > now)
                    383:        break;
                    384:       rem_node(n);
                    385:       delay = t->expires - now;
                    386:       t->expires = 0;
                    387:       if (t->recurrent)
                    388:        {
                    389:          int i = t->recurrent - delay;
                    390:          if (i < 0)
                    391:            i = 0;
                    392:          tm_start(t, i);
                    393:        }
                    394:       io_log_event(t->hook, t->data);
                    395:       t->hook(t);
                    396:     }
                    397: }
                    398: 
                    399: /**
                    400:  * tm_parse_datetime - parse a date and time
                    401:  * @x: datetime string
                    402:  *
                    403:  * tm_parse_datetime() takes a textual representation of
                    404:  * a date and time (dd-mm-yyyy hh:mm:ss)
                    405:  * and converts it to the corresponding value of type &bird_clock_t.
                    406:  */
                    407: bird_clock_t
                    408: tm_parse_datetime(char *x)
                    409: {
                    410:   struct tm tm;
                    411:   int n;
                    412:   time_t t;
                    413: 
                    414:   if (sscanf(x, "%d-%d-%d %d:%d:%d%n", &tm.tm_mday, &tm.tm_mon, &tm.tm_year, &tm.tm_hour, &tm.tm_min, &tm.tm_sec, &n) != 6 || x[n])
                    415:     return tm_parse_date(x);
                    416:   tm.tm_mon--;
                    417:   tm.tm_year -= 1900;
                    418:   t = mktime(&tm);
                    419:   if (t == (time_t) -1)
                    420:     return 0;
                    421:   return t;
                    422: }
                    423: /**
                    424:  * tm_parse_date - parse a date
                    425:  * @x: date string
                    426:  *
                    427:  * tm_parse_date() takes a textual representation of a date (dd-mm-yyyy)
                    428:  * and converts it to the corresponding value of type &bird_clock_t.
                    429:  */
                    430: bird_clock_t
                    431: tm_parse_date(char *x)
                    432: {
                    433:   struct tm tm;
                    434:   int n;
                    435:   time_t t;
                    436: 
                    437:   if (sscanf(x, "%d-%d-%d%n", &tm.tm_mday, &tm.tm_mon, &tm.tm_year, &n) != 3 || x[n])
                    438:     return 0;
                    439:   tm.tm_mon--;
                    440:   tm.tm_year -= 1900;
                    441:   tm.tm_hour = tm.tm_min = tm.tm_sec = 0;
                    442:   t = mktime(&tm);
                    443:   if (t == (time_t) -1)
                    444:     return 0;
                    445:   return t;
                    446: }
                    447: 
                    448: static void
                    449: tm_format_reltime(char *x, struct tm *tm, bird_clock_t delta)
                    450: {
                    451:   static char *month_names[12] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun",
                    452:                                   "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" };
                    453: 
                    454:   if (delta < 20*3600)
                    455:     bsprintf(x, "%02d:%02d", tm->tm_hour, tm->tm_min);
                    456:   else if (delta < 360*86400)
                    457:     bsprintf(x, "%s%02d", month_names[tm->tm_mon], tm->tm_mday);
                    458:   else
                    459:     bsprintf(x, "%d", tm->tm_year+1900);
                    460: }
                    461: 
                    462: #include "conf/conf.h"
                    463: 
                    464: /**
                    465:  * tm_format_datetime - convert date and time to textual representation
                    466:  * @x: destination buffer of size %TM_DATETIME_BUFFER_SIZE
                    467:  * @fmt_spec: specification of resulting textual representation of the time
                    468:  * @t: time
                    469:  *
                    470:  * This function formats the given relative time value @t to a textual
                    471:  * date/time representation (dd-mm-yyyy hh:mm:ss) in real time.
                    472:  */
                    473: void
                    474: tm_format_datetime(char *x, struct timeformat *fmt_spec, bird_clock_t t)
                    475: {
                    476:   const char *fmt_used;
                    477:   struct tm *tm;
                    478:   bird_clock_t delta = now - t;
                    479:   t = now_real - delta;
                    480:   tm = localtime(&t);
                    481: 
                    482:   if (fmt_spec->fmt1 == NULL)
                    483:     return tm_format_reltime(x, tm, delta);
                    484: 
                    485:   if ((fmt_spec->limit == 0) || (delta < fmt_spec->limit))
                    486:     fmt_used = fmt_spec->fmt1;
                    487:   else
                    488:     fmt_used = fmt_spec->fmt2;
                    489: 
                    490:   int rv = strftime(x, TM_DATETIME_BUFFER_SIZE, fmt_used, tm);
                    491:   if (((rv == 0) && fmt_used[0]) || (rv == TM_DATETIME_BUFFER_SIZE))
                    492:     strcpy(x, "<too-long>");
                    493: }
                    494: 
1.1.1.2 ! misho     495: int
        !           496: tm_format_real_time(char *x, size_t max, const char *fmt, bird_clock_t t)
        !           497: {
        !           498:   struct tm tm;
        !           499: 
        !           500:   if (!localtime_r(&t, &tm))
        !           501:     return 0;
        !           502: 
        !           503:   if (!strftime(x, max, fmt, &tm))
        !           504:     return 0;
        !           505: 
        !           506:   return 1;
        !           507: }
        !           508: 
1.1       misho     509: 
                    510: /**
                    511:  * DOC: Sockets
                    512:  *
                    513:  * Socket resources represent network connections. Their data structure (&socket)
                    514:  * contains a lot of fields defining the exact type of the socket, the local and
                    515:  * remote addresses and ports, pointers to socket buffers and finally pointers to
                    516:  * hook functions to be called when new data have arrived to the receive buffer
                    517:  * (@rx_hook), when the contents of the transmit buffer have been transmitted
                    518:  * (@tx_hook) and when an error or connection close occurs (@err_hook).
                    519:  *
                    520:  * Freeing of sockets from inside socket hooks is perfectly safe.
                    521:  */
                    522: 
                    523: #ifndef SOL_IP
                    524: #define SOL_IP IPPROTO_IP
                    525: #endif
                    526: 
                    527: #ifndef SOL_IPV6
                    528: #define SOL_IPV6 IPPROTO_IPV6
                    529: #endif
                    530: 
                    531: #ifndef SOL_ICMPV6
                    532: #define SOL_ICMPV6 IPPROTO_ICMPV6
                    533: #endif
                    534: 
                    535: 
                    536: /*
                    537:  *     Sockaddr helper functions
                    538:  */
                    539: 
                    540: static inline int UNUSED sockaddr_length(int af)
                    541: { return (af == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); }
                    542: 
                    543: static inline void
                    544: sockaddr_fill4(struct sockaddr_in *sa, ip_addr a, uint port)
                    545: {
                    546:   memset(sa, 0, sizeof(struct sockaddr_in));
1.1.1.2 ! misho     547: #ifdef HAVE_STRUCT_SOCKADDR_SA_LEN
1.1       misho     548:   sa->sin_len = sizeof(struct sockaddr_in);
                    549: #endif
                    550:   sa->sin_family = AF_INET;
                    551:   sa->sin_port = htons(port);
                    552:   sa->sin_addr = ipa_to_in4(a);
                    553: }
                    554: 
                    555: static inline void
                    556: sockaddr_fill6(struct sockaddr_in6 *sa, ip_addr a, struct iface *ifa, uint port)
                    557: {
                    558:   memset(sa, 0, sizeof(struct sockaddr_in6));
                    559: #ifdef SIN6_LEN
                    560:   sa->sin6_len = sizeof(struct sockaddr_in6);
                    561: #endif
                    562:   sa->sin6_family = AF_INET6;
                    563:   sa->sin6_port = htons(port);
                    564:   sa->sin6_flowinfo = 0;
                    565:   sa->sin6_addr = ipa_to_in6(a);
                    566: 
                    567:   if (ifa && ipa_is_link_local(a))
                    568:     sa->sin6_scope_id = ifa->index;
                    569: }
                    570: 
                    571: void
                    572: sockaddr_fill(sockaddr *sa, int af, ip_addr a, struct iface *ifa, uint port)
                    573: {
                    574:   if (af == AF_INET)
                    575:     sockaddr_fill4((struct sockaddr_in *) sa, a, port);
                    576:   else if (af == AF_INET6)
                    577:     sockaddr_fill6((struct sockaddr_in6 *) sa, a, ifa, port);
                    578:   else
                    579:     bug("Unknown AF");
                    580: }
                    581: 
                    582: static inline void
                    583: sockaddr_read4(struct sockaddr_in *sa, ip_addr *a, uint *port)
                    584: {
                    585:   *port = ntohs(sa->sin_port);
                    586:   *a = ipa_from_in4(sa->sin_addr);
                    587: }
                    588: 
                    589: static inline void
                    590: sockaddr_read6(struct sockaddr_in6 *sa, ip_addr *a, struct iface **ifa, uint *port)
                    591: {
                    592:   *port = ntohs(sa->sin6_port);
                    593:   *a = ipa_from_in6(sa->sin6_addr);
                    594: 
                    595:   if (ifa && ipa_is_link_local(*a))
                    596:     *ifa = if_find_by_index(sa->sin6_scope_id);
                    597: }
                    598: 
                    599: int
                    600: sockaddr_read(sockaddr *sa, int af, ip_addr *a, struct iface **ifa, uint *port)
                    601: {
                    602:   if (sa->sa.sa_family != af)
                    603:     goto fail;
                    604: 
                    605:   if (af == AF_INET)
                    606:     sockaddr_read4((struct sockaddr_in *) sa, a, port);
                    607:   else if (af == AF_INET6)
                    608:     sockaddr_read6((struct sockaddr_in6 *) sa, a, ifa, port);
                    609:   else
                    610:     goto fail;
                    611: 
                    612:   return 0;
                    613: 
                    614:  fail:
                    615:   *a = IPA_NONE;
                    616:   *port = 0;
                    617:   return -1;
                    618: }
                    619: 
                    620: 
                    621: /*
                    622:  *     IPv6 multicast syscalls
                    623:  */
                    624: 
                    625: /* Fortunately standardized in RFC 3493 */
                    626: 
                    627: #define INIT_MREQ6(maddr,ifa) \
                    628:   { .ipv6mr_multiaddr = ipa_to_in6(maddr), .ipv6mr_interface = ifa->index }
                    629: 
                    630: static inline int
                    631: sk_setup_multicast6(sock *s)
                    632: {
                    633:   int index = s->iface->index;
                    634:   int ttl = s->ttl;
                    635:   int n = 0;
                    636: 
                    637:   if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_IF, &index, sizeof(index)) < 0)
                    638:     ERR("IPV6_MULTICAST_IF");
                    639: 
                    640:   if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_HOPS, &ttl, sizeof(ttl)) < 0)
                    641:     ERR("IPV6_MULTICAST_HOPS");
                    642: 
                    643:   if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_LOOP, &n, sizeof(n)) < 0)
                    644:     ERR("IPV6_MULTICAST_LOOP");
                    645: 
                    646:   return 0;
                    647: }
                    648: 
                    649: static inline int
                    650: sk_join_group6(sock *s, ip_addr maddr)
                    651: {
                    652:   struct ipv6_mreq mr = INIT_MREQ6(maddr, s->iface);
                    653: 
                    654:   if (setsockopt(s->fd, SOL_IPV6, IPV6_JOIN_GROUP, &mr, sizeof(mr)) < 0)
                    655:     ERR("IPV6_JOIN_GROUP");
                    656: 
                    657:   return 0;
                    658: }
                    659: 
                    660: static inline int
                    661: sk_leave_group6(sock *s, ip_addr maddr)
                    662: {
                    663:   struct ipv6_mreq mr = INIT_MREQ6(maddr, s->iface);
                    664: 
                    665:   if (setsockopt(s->fd, SOL_IPV6, IPV6_LEAVE_GROUP, &mr, sizeof(mr)) < 0)
                    666:     ERR("IPV6_LEAVE_GROUP");
                    667: 
                    668:   return 0;
                    669: }
                    670: 
                    671: 
                    672: /*
                    673:  *     IPv6 packet control messages
                    674:  */
                    675: 
                    676: /* Also standardized, in RFC 3542 */
                    677: 
                    678: /*
                    679:  * RFC 2292 uses IPV6_PKTINFO for both the socket option and the cmsg
                    680:  * type, RFC 3542 changed the socket option to IPV6_RECVPKTINFO. If we
                    681:  * don't have IPV6_RECVPKTINFO we suppose the OS implements the older
                    682:  * RFC and we use IPV6_PKTINFO.
                    683:  */
                    684: #ifndef IPV6_RECVPKTINFO
                    685: #define IPV6_RECVPKTINFO IPV6_PKTINFO
                    686: #endif
                    687: /*
                    688:  * Same goes for IPV6_HOPLIMIT -> IPV6_RECVHOPLIMIT.
                    689:  */
                    690: #ifndef IPV6_RECVHOPLIMIT
                    691: #define IPV6_RECVHOPLIMIT IPV6_HOPLIMIT
                    692: #endif
                    693: 
                    694: 
                    /*
                     * Control-buffer space (CMSG_SPACE, i.e. header plus padding) taken by one
                     * struct in6_pktinfo message and by one int-sized message (the hop limit
                     * is read as an int by sk_process_cmsg6_ttl()).
                     */
                    695: #define CMSG6_SPACE_PKTINFO CMSG_SPACE(sizeof(struct in6_pktinfo))
                    696: #define CMSG6_SPACE_TTL CMSG_SPACE(sizeof(int))
                    697: 
                    698: static inline int
                    699: sk_request_cmsg6_pktinfo(sock *s)
                    700: {
                    701:   int y = 1;
                    702: 
                    703:   if (setsockopt(s->fd, SOL_IPV6, IPV6_RECVPKTINFO, &y, sizeof(y)) < 0)
                    704:     ERR("IPV6_RECVPKTINFO");
                    705: 
                    706:   return 0;
                    707: }
                    708: 
                    709: static inline int
                    710: sk_request_cmsg6_ttl(sock *s)
                    711: {
                    712:   int y = 1;
                    713: 
                    714:   if (setsockopt(s->fd, SOL_IPV6, IPV6_RECVHOPLIMIT, &y, sizeof(y)) < 0)
                    715:     ERR("IPV6_RECVHOPLIMIT");
                    716: 
                    717:   return 0;
                    718: }
                    719: 
                    720: static inline void
                    721: sk_process_cmsg6_pktinfo(sock *s, struct cmsghdr *cm)
                    722: {
                    723:   if (cm->cmsg_type == IPV6_PKTINFO)
                    724:   {
                    725:     struct in6_pktinfo *pi = (struct in6_pktinfo *) CMSG_DATA(cm);
                    726:     s->laddr = ipa_from_in6(pi->ipi6_addr);
                    727:     s->lifindex = pi->ipi6_ifindex;
                    728:   }
                    729: }
                    730: 
                    731: static inline void
                    732: sk_process_cmsg6_ttl(sock *s, struct cmsghdr *cm)
                    733: {
                    734:   if (cm->cmsg_type == IPV6_HOPLIMIT)
                    735:     s->rcv_ttl = * (int *) CMSG_DATA(cm);
                    736: }
                    737: 
                    738: static inline void
                    739: sk_prepare_cmsgs6(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
                    740: {
                    741:   struct cmsghdr *cm;
                    742:   struct in6_pktinfo *pi;
                    743:   int controllen = 0;
                    744: 
                    745:   msg->msg_control = cbuf;
                    746:   msg->msg_controllen = cbuflen;
                    747: 
                    748:   cm = CMSG_FIRSTHDR(msg);
                    749:   cm->cmsg_level = SOL_IPV6;
                    750:   cm->cmsg_type = IPV6_PKTINFO;
                    751:   cm->cmsg_len = CMSG_LEN(sizeof(*pi));
                    752:   controllen += CMSG_SPACE(sizeof(*pi));
                    753: 
                    754:   pi = (struct in6_pktinfo *) CMSG_DATA(cm);
                    755:   pi->ipi6_ifindex = s->iface ? s->iface->index : 0;
                    756:   pi->ipi6_addr = ipa_to_in6(s->saddr);
                    757: 
                    758:   msg->msg_controllen = controllen;
                    759: }
                    760: 
                    761: 
                    762: /*
                    763:  *     Miscellaneous socket syscalls
                    764:  */
                    765: 
                    766: static inline int
                    767: sk_set_ttl4(sock *s, int ttl)
                    768: {
                    769:   if (setsockopt(s->fd, SOL_IP, IP_TTL, &ttl, sizeof(ttl)) < 0)
                    770:     ERR("IP_TTL");
                    771: 
                    772:   return 0;
                    773: }
                    774: 
                    775: static inline int
                    776: sk_set_ttl6(sock *s, int ttl)
                    777: {
                    778:   if (setsockopt(s->fd, SOL_IPV6, IPV6_UNICAST_HOPS, &ttl, sizeof(ttl)) < 0)
                    779:     ERR("IPV6_UNICAST_HOPS");
                    780: 
                    781:   return 0;
                    782: }
                    783: 
                    784: static inline int
                    785: sk_set_tos4(sock *s, int tos)
                    786: {
                    787:   if (setsockopt(s->fd, SOL_IP, IP_TOS, &tos, sizeof(tos)) < 0)
                    788:     ERR("IP_TOS");
                    789: 
                    790:   return 0;
                    791: }
                    792: 
                    793: static inline int
                    794: sk_set_tos6(sock *s, int tos)
                    795: {
                    796:   if (setsockopt(s->fd, SOL_IPV6, IPV6_TCLASS, &tos, sizeof(tos)) < 0)
                    797:     ERR("IPV6_TCLASS");
                    798: 
                    799:   return 0;
                    800: }
                    801: 
                    802: static inline int
                    803: sk_set_high_port(sock *s UNUSED)
                    804: {
                    805:   /* Port range setting is optional, ignore it if not supported */
                    806: 
                    807: #ifdef IP_PORTRANGE
                    808:   if (sk_is_ipv4(s))
                    809:   {
                    810:     int range = IP_PORTRANGE_HIGH;
                    811:     if (setsockopt(s->fd, SOL_IP, IP_PORTRANGE, &range, sizeof(range)) < 0)
                    812:       ERR("IP_PORTRANGE");
                    813:   }
                    814: #endif
                    815: 
                    816: #ifdef IPV6_PORTRANGE
                    817:   if (sk_is_ipv6(s))
                    818:   {
                    819:     int range = IPV6_PORTRANGE_HIGH;
                    820:     if (setsockopt(s->fd, SOL_IPV6, IPV6_PORTRANGE, &range, sizeof(range)) < 0)
                    821:       ERR("IPV6_PORTRANGE");
                    822:   }
                    823: #endif
                    824: 
                    825:   return 0;
                    826: }
                    827: 
                    828: static inline byte *
                    829: sk_skip_ip_header(byte *pkt, int *len)
                    830: {
                    831:   if ((*len < 20) || ((*pkt & 0xf0) != 0x40))
                    832:     return NULL;
                    833: 
                    834:   int hlen = (*pkt & 0x0f) * 4;
                    835:   if ((hlen < 20) || (hlen > *len))
                    836:     return NULL;
                    837: 
                    838:   *len -= hlen;
                    839:   return pkt + hlen;
                    840: }
                    841: 
                    842: byte *
                    843: sk_rx_buffer(sock *s, int *len)
                    844: {
                    845:   if (sk_is_ipv4(s) && (s->type == SK_IP))
                    846:     return sk_skip_ip_header(s->rbuf, len);
                    847:   else
                    848:     return s->rbuf;
                    849: }
                    850: 
                    851: 
                    852: /*
                    853:  *     Public socket functions
                    854:  */
                    855: 
                    856: /**
                    857:  * sk_setup_multicast - enable multicast for given socket
                    858:  * @s: socket
                    859:  *
                    860:  * Prepare transmission of multicast packets for given datagram socket.
                    861:  * The socket must have defined @iface.
                    862:  *
                    863:  * Result: 0 for success, -1 for an error.
                    864:  */
                    865: 
                    866: int
                    867: sk_setup_multicast(sock *s)
                    868: {
                    869:   ASSERT(s->iface);
                    870: 
                    871:   if (sk_is_ipv4(s))
                    872:     return sk_setup_multicast4(s);
                    873:   else
                    874:     return sk_setup_multicast6(s);
                    875: }
                    876: 
                    877: /**
                    878:  * sk_join_group - join multicast group for given socket
                    879:  * @s: socket
                    880:  * @maddr: multicast address
                    881:  *
                    882:  * Join multicast group for given datagram socket and associated interface.
                    883:  * The socket must have defined @iface.
                    884:  *
                    885:  * Result: 0 for success, -1 for an error.
                    886:  */
                    887: 
                    888: int
                    889: sk_join_group(sock *s, ip_addr maddr)
                    890: {
                    891:   if (sk_is_ipv4(s))
                    892:     return sk_join_group4(s, maddr);
                    893:   else
                    894:     return sk_join_group6(s, maddr);
                    895: }
                    896: 
                    897: /**
                    898:  * sk_leave_group - leave multicast group for given socket
                    899:  * @s: socket
                    900:  * @maddr: multicast address
                    901:  *
                    902:  * Leave multicast group for given datagram socket and associated interface.
                    903:  * The socket must have defined @iface.
                    904:  *
                    905:  * Result: 0 for success, -1 for an error.
                    906:  */
                    907: 
                    908: int
                    909: sk_leave_group(sock *s, ip_addr maddr)
                    910: {
                    911:   if (sk_is_ipv4(s))
                    912:     return sk_leave_group4(s, maddr);
                    913:   else
                    914:     return sk_leave_group6(s, maddr);
                    915: }
                    916: 
                    917: /**
                    918:  * sk_setup_broadcast - enable broadcast for given socket
                    919:  * @s: socket
                    920:  *
                    921:  * Allow reception and transmission of broadcast packets for given datagram
                    922:  * socket. The socket must have defined @iface. For transmission, packets should
                    923:  * be send to @brd address of @iface.
                    924:  *
                    925:  * Result: 0 for success, -1 for an error.
                    926:  */
                    927: 
                    928: int
                    929: sk_setup_broadcast(sock *s)
                    930: {
                    931:   int y = 1;
                    932: 
                    933:   if (setsockopt(s->fd, SOL_SOCKET, SO_BROADCAST, &y, sizeof(y)) < 0)
                    934:     ERR("SO_BROADCAST");
                    935: 
                    936:   return 0;
                    937: }
                    938: 
                    939: /**
                    940:  * sk_set_ttl - set transmit TTL for given socket
                    941:  * @s: socket
                    942:  * @ttl: TTL value
                    943:  *
                    944:  * Set TTL for already opened connections when TTL was not set before. Useful
                    945:  * for accepted connections when different ones should have different TTL.
                    946:  *
                    947:  * Result: 0 for success, -1 for an error.
                    948:  */
                    949: 
                    950: int
                    951: sk_set_ttl(sock *s, int ttl)
                    952: {
                    953:   s->ttl = ttl;
                    954: 
                    955:   if (sk_is_ipv4(s))
                    956:     return sk_set_ttl4(s, ttl);
                    957:   else
                    958:     return sk_set_ttl6(s, ttl);
                    959: }
                    960: 
                    961: /**
                    962:  * sk_set_min_ttl - set minimal accepted TTL for given socket
                    963:  * @s: socket
                    964:  * @ttl: TTL value
                    965:  *
                    966:  * Set minimal accepted TTL for given socket. Can be used for TTL security.
                    967:  * implementations.
                    968:  *
                    969:  * Result: 0 for success, -1 for an error.
                    970:  */
                    971: 
                    972: int
                    973: sk_set_min_ttl(sock *s, int ttl)
                    974: {
                    975:   if (sk_is_ipv4(s))
                    976:     return sk_set_min_ttl4(s, ttl);
                    977:   else
                    978:     return sk_set_min_ttl6(s, ttl);
                    979: }
                    980: 
                    981: #if 0
                    982: /**
                    983:  * sk_set_md5_auth - add / remove MD5 security association for given socket
                    984:  * @s: socket
                    985:  * @local: IP address of local side
                    986:  * @remote: IP address of remote side
                    987:  * @ifa: Interface for link-local IP address
                    988:  * @passwd: Password used for MD5 authentication
                    989:  * @setkey: Update also system SA/SP database
                    990:  *
                    991:  * In TCP MD5 handling code in kernel, there is a set of security associations
                    992:  * used for choosing password and other authentication parameters according to
                    993:  * the local and remote address. This function is useful for listening socket,
                    994:  * for active sockets it may be enough to set s->password field.
                    995:  *
                    996:  * When called with passwd != NULL, the new pair is added,
                    997:  * When called with passwd == NULL, the existing pair is removed.
                    998:  *
                    999:  * Note that while in Linux, the MD5 SAs are specific to socket, in BSD they are
                   1000:  * stored in global SA/SP database (but the behavior also must be enabled on
                   1001:  * per-socket basis). In case of multiple sockets to the same neighbor, the
                   1002:  * socket-specific state must be configured for each socket while global state
                   1003:  * just once per src-dst pair. The @setkey argument controls whether the global
                   1004:  * state (SA/SP database) is also updated.
                   1005:  *
                   1006:  * Result: 0 for success, -1 for an error.
                   1007:  */
                   1008: 
                   /*
                    * NOTE: this definition sits inside the enclosing `#if 0` region and is
                    * therefore never compiled here; it only carries the documentation
                    * comment above. NOTE(review): DUMMY is defined elsewhere -- presumably a
                    * placeholder body; confirm before enabling this code.
                    */
                   1009: int
                   1010: sk_set_md5_auth(sock *s, ip_addr local, ip_addr remote, struct iface *ifa, char *passwd, int setkey)
                   1011: { DUMMY; }
                   1012: #endif
                   1013: 
                   1014: /**
                   1015:  * sk_set_ipv6_checksum - specify IPv6 checksum offset for given socket
                   1016:  * @s: socket
                   1017:  * @offset: offset
                   1018:  *
                   1019:  * Specify IPv6 checksum field offset for given raw IPv6 socket. After that, the
                   1020:  * kernel will automatically fill it for outgoing packets and check it for
                   1021:  * incoming packets. Should not be used on ICMPv6 sockets, where the position is
                   1022:  * known to the kernel.
                   1023:  *
                   1024:  * Result: 0 for success, -1 for an error.
                   1025:  */
                   1026: 
                   1027: int
                   1028: sk_set_ipv6_checksum(sock *s, int offset)
                   1029: {
                   1030:   if (setsockopt(s->fd, SOL_IPV6, IPV6_CHECKSUM, &offset, sizeof(offset)) < 0)
                   1031:     ERR("IPV6_CHECKSUM");
                   1032: 
                   1033:   return 0;
                   1034: }
                   1035: 
                   1036: int
                   1037: sk_set_icmp6_filter(sock *s, int p1, int p2)
                   1038: {
                   1039:   /* a bit of lame interface, but it is here only for Radv */
                   1040:   struct icmp6_filter f;
                   1041: 
                   1042:   ICMP6_FILTER_SETBLOCKALL(&f);
                   1043:   ICMP6_FILTER_SETPASS(p1, &f);
                   1044:   ICMP6_FILTER_SETPASS(p2, &f);
                   1045: 
                   1046:   if (setsockopt(s->fd, SOL_ICMPV6, ICMP6_FILTER, &f, sizeof(f)) < 0)
                   1047:     ERR("ICMP6_FILTER");
                   1048: 
                   1049:   return 0;
                   1050: }
                   1051: 
                   1052: void
                   1053: sk_log_error(sock *s, const char *p)
                   1054: {
                   1055:   log(L_ERR "%s: Socket error: %s%#m", p, s->err);
                   1056: }
                   1057: 
                   1058: 
                   1059: /*
                   1060:  *     Actual struct birdsock code
                   1061:  */
                   1062: 
                   /* List that sk_insert() appends sockets to and sk_free() unlinks them from */
                   1063: static list sock_list;
                   /*
                    * Socket cursors into sock_list. sk_free() advances either one to the
                    * following socket (or NULL) when the socket it points at is being freed,
                    * so neither is left dangling.
                    */
                   1064: static struct birdsock *current_sock;
                   1065: static struct birdsock *stored_sock;
                   1066: 
                   1067: static inline sock *
                   1068: sk_next(sock *s)
                   1069: {
                   1070:   if (!s->n.next->next)
                   1071:     return NULL;
                   1072:   else
                   1073:     return SKIP_BACK(sock, n, s->n.next);
                   1074: }
                   1075: 
                   1076: static void
                   1077: sk_alloc_bufs(sock *s)
                   1078: {
                   1079:   if (!s->rbuf && s->rbsize)
                   1080:     s->rbuf = s->rbuf_alloc = xmalloc(s->rbsize);
                   1081:   s->rpos = s->rbuf;
                   1082:   if (!s->tbuf && s->tbsize)
                   1083:     s->tbuf = s->tbuf_alloc = xmalloc(s->tbsize);
                   1084:   s->tpos = s->ttx = s->tbuf;
                   1085: }
                   1086: 
                   1087: static void
                   1088: sk_free_bufs(sock *s)
                   1089: {
                   1090:   if (s->rbuf_alloc)
                   1091:   {
                   1092:     xfree(s->rbuf_alloc);
                   1093:     s->rbuf = s->rbuf_alloc = NULL;
                   1094:   }
                   1095:   if (s->tbuf_alloc)
                   1096:   {
                   1097:     xfree(s->tbuf_alloc);
                   1098:     s->tbuf = s->tbuf_alloc = NULL;
                   1099:   }
                   1100: }
                   1101: 
                   1102: static void
                   1103: sk_free(resource *r)
                   1104: {
                   1105:   sock *s = (sock *) r;
                   1106: 
                   1107:   sk_free_bufs(s);
                   1108:   if (s->fd >= 0)
                   1109:   {
                   1110:     close(s->fd);
                   1111: 
                   1112:     /* FIXME: we should call sk_stop() for SKF_THREAD sockets */
                   1113:     if (s->flags & SKF_THREAD)
                   1114:       return;
                   1115: 
                   1116:     if (s == current_sock)
                   1117:       current_sock = sk_next(s);
                   1118:     if (s == stored_sock)
                   1119:       stored_sock = sk_next(s);
                   1120:     rem_node(&s->n);
                   1121:   }
                   1122: }
                   1123: 
                   1124: void
                   1125: sk_set_rbsize(sock *s, uint val)
                   1126: {
                   1127:   ASSERT(s->rbuf_alloc == s->rbuf);
                   1128: 
                   1129:   if (s->rbsize == val)
                   1130:     return;
                   1131: 
                   1132:   s->rbsize = val;
                   1133:   xfree(s->rbuf_alloc);
                   1134:   s->rbuf_alloc = xmalloc(val);
                   1135:   s->rpos = s->rbuf = s->rbuf_alloc;
                   1136: }
                   1137: 
                   1138: void
                   1139: sk_set_tbsize(sock *s, uint val)
                   1140: {
                   1141:   ASSERT(s->tbuf_alloc == s->tbuf);
                   1142: 
                   1143:   if (s->tbsize == val)
                   1144:     return;
                   1145: 
                   1146:   byte *old_tbuf = s->tbuf;
                   1147: 
                   1148:   s->tbsize = val;
                   1149:   s->tbuf = s->tbuf_alloc = xrealloc(s->tbuf_alloc, val);
                   1150:   s->tpos = s->tbuf + (s->tpos - old_tbuf);
                   1151:   s->ttx  = s->tbuf + (s->ttx  - old_tbuf);
                   1152: }
                   1153: 
                   1154: void
                   1155: sk_set_tbuf(sock *s, void *tbuf)
                   1156: {
                   1157:   s->tbuf = tbuf ?: s->tbuf_alloc;
                   1158:   s->ttx = s->tpos = s->tbuf;
                   1159: }
                   1160: 
                   1161: void
                   1162: sk_reallocate(sock *s)
                   1163: {
                   1164:   sk_free_bufs(s);
                   1165:   sk_alloc_bufs(s);
                   1166: }
                   1167: 
                   1168: static void
                   1169: sk_dump(resource *r)
                   1170: {
                   1171:   sock *s = (sock *) r;
                   1172:   static char *sk_type_names[] = { "TCP<", "TCP>", "TCP", "UDP", NULL, "IP", NULL, "MAGIC", "UNIX<", "UNIX", "DEL!" };
                   1173: 
                   1174:   debug("(%s, ud=%p, sa=%I, sp=%d, da=%I, dp=%d, tos=%d, ttl=%d, if=%s)\n",
                   1175:        sk_type_names[s->type],
                   1176:        s->data,
                   1177:        s->saddr,
                   1178:        s->sport,
                   1179:        s->daddr,
                   1180:        s->dport,
                   1181:        s->tos,
                   1182:        s->ttl,
                   1183:        s->iface ? s->iface->name : "none");
                   1184: }
                   1185: 
                   /*
                    * Resource class descriptor for sockets; sock_new() passes it to ralloc().
                    * Fields are given positionally (struct resclass is declared elsewhere):
                    * class name, object size, the free hook sk_free() and the dump hook
                    * sk_dump(). NOTE(review): the meaning of the two trailing NULL slots is
                    * not visible here -- presumably optional hooks; confirm in lib/resource.h.
                    */
                   1186: static struct resclass sk_class = {
                   1187:   "Socket",
                   1188:   sizeof(sock),
                   1189:   sk_free,
                   1190:   sk_dump,
                   1191:   NULL,
                   1192:   NULL
                   1193: };
                   1194: 
                   1195: /**
                   1196:  * sk_new - create a socket
                   1197:  * @p: pool
                   1198:  *
                   1199:  * This function creates a new socket resource. If you want to use it,
                   1200:  * you need to fill in all the required fields of the structure and
                   1201:  * call sk_open() to do the actual opening of the socket.
                   1202:  *
                   1203:  * The real function name is sock_new(), sk_new() is a macro wrapper
                   1204:  * to avoid collision with OpenSSL.
                   1205:  */
                   1206: sock *
                   1207: sock_new(pool *p)
                   1208: {
                   1209:   sock *s = ralloc(p, &sk_class);
                   1210:   s->pool = p;
                   1211:   // s->saddr = s->daddr = IPA_NONE;
                   1212:   s->tos = s->priority = s->ttl = -1;
                   1213:   s->fd = -1;
                   1214:   return s;
                   1215: }
                   1216: 
                   1217: static int
                   1218: sk_setup(sock *s)
                   1219: {
                   1220:   int y = 1;
                   1221:   int fd = s->fd;
                   1222: 
                   1223:   if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
                   1224:     ERR("O_NONBLOCK");
                   1225: 
                   1226:   if (!s->af)
                   1227:     return 0;
                   1228: 
                   1229:   if (ipa_nonzero(s->saddr) && !(s->flags & SKF_BIND))
                   1230:     s->flags |= SKF_PKTINFO;
                   1231: 
                   1232: #ifdef CONFIG_USE_HDRINCL
                   1233:   if (sk_is_ipv4(s) && (s->type == SK_IP) && (s->flags & SKF_PKTINFO))
                   1234:   {
                   1235:     s->flags &= ~SKF_PKTINFO;
                   1236:     s->flags |= SKF_HDRINCL;
                   1237:     if (setsockopt(fd, SOL_IP, IP_HDRINCL, &y, sizeof(y)) < 0)
                   1238:       ERR("IP_HDRINCL");
                   1239:   }
                   1240: #endif
                   1241: 
1.1.1.2 ! misho    1242:   if (s->vrf && !s->iface)
        !          1243:   {
        !          1244:     /* Bind socket to associated VRF interface.
        !          1245:        This is Linux-specific, but so is SO_BINDTODEVICE. */
        !          1246: #ifdef SO_BINDTODEVICE
        !          1247:     struct ifreq ifr = {};
        !          1248:     strcpy(ifr.ifr_name, s->vrf->name);
        !          1249:     if (setsockopt(s->fd, SOL_SOCKET, SO_BINDTODEVICE, &ifr, sizeof(ifr)) < 0)
        !          1250:       ERR("SO_BINDTODEVICE");
        !          1251: #endif
        !          1252:   }
        !          1253: 
1.1       misho    1254:   if (s->iface)
                   1255:   {
                   1256: #ifdef SO_BINDTODEVICE
                   1257:     struct ifreq ifr = {};
                   1258:     strcpy(ifr.ifr_name, s->iface->name);
                   1259:     if (setsockopt(s->fd, SOL_SOCKET, SO_BINDTODEVICE, &ifr, sizeof(ifr)) < 0)
                   1260:       ERR("SO_BINDTODEVICE");
                   1261: #endif
                   1262: 
                   1263: #ifdef CONFIG_UNIX_DONTROUTE
                   1264:     if (setsockopt(s->fd, SOL_SOCKET, SO_DONTROUTE, &y, sizeof(y)) < 0)
                   1265:       ERR("SO_DONTROUTE");
                   1266: #endif
                   1267:   }
                   1268: 
                   1269:   if (sk_is_ipv4(s))
                   1270:   {
                   1271:     if (s->flags & SKF_LADDR_RX)
                   1272:       if (sk_request_cmsg4_pktinfo(s) < 0)
                   1273:        return -1;
                   1274: 
                   1275:     if (s->flags & SKF_TTL_RX)
                   1276:       if (sk_request_cmsg4_ttl(s) < 0)
                   1277:        return -1;
                   1278: 
                   1279:     if ((s->type == SK_UDP) || (s->type == SK_IP))
                   1280:       if (sk_disable_mtu_disc4(s) < 0)
                   1281:        return -1;
                   1282: 
                   1283:     if (s->ttl >= 0)
                   1284:       if (sk_set_ttl4(s, s->ttl) < 0)
                   1285:        return -1;
                   1286: 
                   1287:     if (s->tos >= 0)
                   1288:       if (sk_set_tos4(s, s->tos) < 0)
                   1289:        return -1;
                   1290:   }
                   1291: 
                   1292:   if (sk_is_ipv6(s))
                   1293:   {
                   1294:     if (s->flags & SKF_V6ONLY)
                   1295:       if (setsockopt(fd, SOL_IPV6, IPV6_V6ONLY, &y, sizeof(y)) < 0)
                   1296:        ERR("IPV6_V6ONLY");
                   1297: 
                   1298:     if (s->flags & SKF_LADDR_RX)
                   1299:       if (sk_request_cmsg6_pktinfo(s) < 0)
                   1300:        return -1;
                   1301: 
                   1302:     if (s->flags & SKF_TTL_RX)
                   1303:       if (sk_request_cmsg6_ttl(s) < 0)
                   1304:        return -1;
                   1305: 
                   1306:     if ((s->type == SK_UDP) || (s->type == SK_IP))
                   1307:       if (sk_disable_mtu_disc6(s) < 0)
                   1308:        return -1;
                   1309: 
                   1310:     if (s->ttl >= 0)
                   1311:       if (sk_set_ttl6(s, s->ttl) < 0)
                   1312:        return -1;
                   1313: 
                   1314:     if (s->tos >= 0)
                   1315:       if (sk_set_tos6(s, s->tos) < 0)
                   1316:        return -1;
                   1317:   }
                   1318: 
1.1.1.2 ! misho    1319:   /* Must be after sk_set_tos4() as setting ToS on Linux also mangles priority */
        !          1320:   if (s->priority >= 0)
        !          1321:     if (sk_set_priority(s, s->priority) < 0)
        !          1322:       return -1;
        !          1323: 
1.1       misho    1324:   return 0;
                   1325: }
                   1326: 
                   1327: static void
                   1328: sk_insert(sock *s)
                   1329: {
                   1330:   add_tail(&sock_list, &s->n);
                   1331: }
                   1332: 
                   1333: static void
                   1334: sk_tcp_connected(sock *s)
                   1335: {
                   1336:   sockaddr sa;
                   1337:   int sa_len = sizeof(sa);
                   1338: 
                   1339:   if ((getsockname(s->fd, &sa.sa, &sa_len) < 0) ||
                   1340:       (sockaddr_read(&sa, s->af, &s->saddr, &s->iface, &s->sport) < 0))
                   1341:     log(L_WARN "SOCK: Cannot get local IP address for TCP>");
                   1342: 
                   1343:   s->type = SK_TCP;
                   1344:   sk_alloc_bufs(s);
                   1345:   s->tx_hook(s);
                   1346: }
                   1347: 
                   1348: static int
                   1349: sk_passive_connected(sock *s, int type)
                   1350: {
                   1351:   sockaddr loc_sa, rem_sa;
                   1352:   int loc_sa_len = sizeof(loc_sa);
                   1353:   int rem_sa_len = sizeof(rem_sa);
                   1354: 
                   1355:   int fd = accept(s->fd, ((type == SK_TCP) ? &rem_sa.sa : NULL), &rem_sa_len);
                   1356:   if (fd < 0)
                   1357:   {
                   1358:     if ((errno != EINTR) && (errno != EAGAIN))
                   1359:       s->err_hook(s, errno);
                   1360:     return 0;
                   1361:   }
                   1362: 
                   1363:   sock *t = sk_new(s->pool);
                   1364:   t->type = type;
                   1365:   t->fd = fd;
                   1366:   t->af = s->af;
                   1367:   t->ttl = s->ttl;
                   1368:   t->tos = s->tos;
                   1369:   t->rbsize = s->rbsize;
                   1370:   t->tbsize = s->tbsize;
                   1371: 
                   1372:   if (type == SK_TCP)
                   1373:   {
                   1374:     if ((getsockname(fd, &loc_sa.sa, &loc_sa_len) < 0) ||
                   1375:        (sockaddr_read(&loc_sa, s->af, &t->saddr, &t->iface, &t->sport) < 0))
                   1376:       log(L_WARN "SOCK: Cannot get local IP address for TCP<");
                   1377: 
                   1378:     if (sockaddr_read(&rem_sa, s->af, &t->daddr, &t->iface, &t->dport) < 0)
                   1379:       log(L_WARN "SOCK: Cannot get remote IP address for TCP<");
                   1380:   }
                   1381: 
                   1382:   if (sk_setup(t) < 0)
                   1383:   {
                   1384:     /* FIXME: Call err_hook instead ? */
                   1385:     log(L_ERR "SOCK: Incoming connection: %s%#m", t->err);
                   1386: 
                   1387:     /* FIXME: handle it better in rfree() */
                   1388:     close(t->fd);
                   1389:     t->fd = -1;
                   1390:     rfree(t);
                   1391:     return 1;
                   1392:   }
                   1393: 
                   1394:   sk_insert(t);
                   1395:   sk_alloc_bufs(t);
                   1396:   s->rx_hook(t, 0);
                   1397:   return 1;
                   1398: }
                   1399: 
/**
 * sk_open - open a socket
 * @s: socket
 *
 * This function takes a socket resource created by sk_new() and
 * initialized by the user and binds a corresponding network connection
 * to it.
 *
 * The socket type selects what is done: SK_TCP_ACTIVE starts a connect,
 * SK_TCP_PASSIVE starts listening, SK_UDP and SK_IP get their buffers
 * allocated, and SK_MAGIC just adopts the file descriptor already stored
 * in @s. Any other type is reported through bug().
 *
 * Result: 0 for success, -1 for an error.
 */
int
sk_open(sock *s)
{
  int af = BIRD_AF;
  int fd = -1;
  int do_bind = 0;
  int bind_port = 0;
  ip_addr bind_addr = IPA_NONE;
  sockaddr sa;

  /* Step 1: create the descriptor and decide whether/where to bind */
  switch (s->type)
  {
  case SK_TCP_ACTIVE:
    s->ttx = "";			/* Force s->ttx != s->tpos */
    /* Fall thru */
  case SK_TCP_PASSIVE:
    fd = socket(af, SOCK_STREAM, IPPROTO_TCP);
    bind_port = s->sport;
    bind_addr = s->saddr;
    /* TCP binds only when a local port or address was requested */
    do_bind = bind_port || ipa_nonzero(bind_addr);
    break;

  case SK_UDP:
    fd = socket(af, SOCK_DGRAM, IPPROTO_UDP);
    bind_port = s->sport;
    bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
    /* UDP sockets are always bound */
    do_bind = 1;
    break;

  case SK_IP:
    /* For raw sockets s->dport is passed as the protocol argument */
    fd = socket(af, SOCK_RAW, s->dport);
    bind_port = 0;
    bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
    do_bind = ipa_nonzero(bind_addr);
    break;

  case SK_MAGIC:
    /* Caller-supplied descriptor; no address family, no bind */
    af = 0;
    fd = s->fd;
    break;

  default:
    bug("sk_open() called for invalid sock type %d", s->type);
  }

  /* NOTE(review): ERR()/ERR2() are defined outside this chunk; presumably they
     store the given string in s->err and return -1 / jump to err -- confirm */
  if (fd < 0)
    ERR("socket");

  s->af = af;
  s->fd = fd;

  if (sk_setup(s) < 0)
    goto err;

  /* Step 2: bind to the local port/address chosen above */
  if (do_bind)
  {
    if (bind_port)
    {
      int y = 1;

      if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y)) < 0)
	ERR2("SO_REUSEADDR");

#ifdef CONFIG_NO_IFACE_BIND
      /* Workaround missing ability to bind to an iface */
      if ((s->type == SK_UDP) && s->iface && ipa_zero(bind_addr))
      {
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &y, sizeof(y)) < 0)
	  ERR2("SO_REUSEPORT");
      }
#endif
    }
    else
      /* No explicit port: a failed high-port request is only warned about */
      if (s->flags & SKF_HIGH_PORT)
	if (sk_set_high_port(s) < 0)
	  log(L_WARN "Socket error: %s%#m", s->err);

    sockaddr_fill(&sa, af, bind_addr, s->iface, bind_port);
    if (bind(fd, &sa.sa, SA_LEN(sa)) < 0)
      ERR2("bind");
  }

  /* Step 3: optional MD5 authentication when a password is configured */
  if (s->password)
    if (sk_set_md5_auth(s, s->saddr, s->daddr, s->iface, s->password, 0) < 0)
      goto err;

  /* Step 4: type-specific activation */
  switch (s->type)
  {
  case SK_TCP_ACTIVE:
    sockaddr_fill(&sa, af, s->daddr, s->iface, s->dport);
    if (connect(fd, &sa.sa, SA_LEN(sa)) >= 0)
      sk_tcp_connected(s);
    /* The errno values listed here do not make sk_open() fail */
    else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS &&
	     errno != ECONNREFUSED && errno != EHOSTUNREACH && errno != ENETUNREACH)
      ERR2("connect");
    break;

  case SK_TCP_PASSIVE:
    if (listen(fd, 8) < 0)
      ERR2("listen");
    break;

  case SK_MAGIC:
    break;

  default:
    sk_alloc_bufs(s);
  }

  /* Sockets flagged SKF_THREAD are not inserted via sk_insert() */
  if (!(s->flags & SKF_THREAD))
    sk_insert(s);
  return 0;

err:
  /* Common failure exit: drop the descriptor and mark the socket closed */
  close(fd);
  s->fd = -1;
  return -1;
}
                   1528: 
                   1529: int
                   1530: sk_open_unix(sock *s, char *name)
                   1531: {
                   1532:   struct sockaddr_un sa;
                   1533:   int fd;
                   1534: 
                   1535:   /* We are sloppy during error (leak fd and not set s->err), but we die anyway */
                   1536: 
                   1537:   fd = socket(AF_UNIX, SOCK_STREAM, 0);
                   1538:   if (fd < 0)
                   1539:     return -1;
                   1540: 
                   1541:   if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
                   1542:     return -1;
                   1543: 
                   1544:   /* Path length checked in test_old_bird() */
                   1545:   sa.sun_family = AF_UNIX;
                   1546:   strcpy(sa.sun_path, name);
                   1547: 
                   1548:   if (bind(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) < 0)
                   1549:     return -1;
                   1550: 
                   1551:   if (listen(fd, 8) < 0)
                   1552:     return -1;
                   1553: 
                   1554:   s->fd = fd;
                   1555:   sk_insert(s);
                   1556:   return 0;
                   1557: }
                   1558: 
                   1559: 
/*
 * Sizes of the control-message buffers used by sk_recvmsg() and sk_sendmsg().
 * The RX size covers packet info plus TTL for whichever of the IPv4/IPv6
 * variants is larger; the TX size covers packet info only.
 */
#define CMSG_RX_SPACE MAX(CMSG4_SPACE_PKTINFO+CMSG4_SPACE_TTL, \
			  CMSG6_SPACE_PKTINFO+CMSG6_SPACE_TTL)
#define CMSG_TX_SPACE MAX(CMSG4_SPACE_PKTINFO,CMSG6_SPACE_PKTINFO)
                   1563: 
                   1564: static void
                   1565: sk_prepare_cmsgs(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
                   1566: {
                   1567:   if (sk_is_ipv4(s))
                   1568:     sk_prepare_cmsgs4(s, msg, cbuf, cbuflen);
                   1569:   else
                   1570:     sk_prepare_cmsgs6(s, msg, cbuf, cbuflen);
                   1571: }
                   1572: 
                   1573: static void
                   1574: sk_process_cmsgs(sock *s, struct msghdr *msg)
                   1575: {
                   1576:   struct cmsghdr *cm;
                   1577: 
                   1578:   s->laddr = IPA_NONE;
                   1579:   s->lifindex = 0;
                   1580:   s->rcv_ttl = -1;
                   1581: 
                   1582:   for (cm = CMSG_FIRSTHDR(msg); cm != NULL; cm = CMSG_NXTHDR(msg, cm))
                   1583:   {
                   1584:     if ((cm->cmsg_level == SOL_IP) && sk_is_ipv4(s))
                   1585:     {
                   1586:       sk_process_cmsg4_pktinfo(s, cm);
                   1587:       sk_process_cmsg4_ttl(s, cm);
                   1588:     }
                   1589: 
                   1590:     if ((cm->cmsg_level == SOL_IPV6) && sk_is_ipv6(s))
                   1591:     {
                   1592:       sk_process_cmsg6_pktinfo(s, cm);
                   1593:       sk_process_cmsg6_ttl(s, cm);
                   1594:     }
                   1595:   }
                   1596: }
                   1597: 
                   1598: 
                   1599: static inline int
                   1600: sk_sendmsg(sock *s)
                   1601: {
                   1602:   struct iovec iov = {s->tbuf, s->tpos - s->tbuf};
                   1603:   byte cmsg_buf[CMSG_TX_SPACE];
                   1604:   sockaddr dst;
1.1.1.2 ! misho    1605:   int flags = 0;
1.1       misho    1606: 
                   1607:   sockaddr_fill(&dst, s->af, s->daddr, s->iface, s->dport);
                   1608: 
                   1609:   struct msghdr msg = {
                   1610:     .msg_name = &dst.sa,
                   1611:     .msg_namelen = SA_LEN(dst),
                   1612:     .msg_iov = &iov,
                   1613:     .msg_iovlen = 1
                   1614:   };
                   1615: 
1.1.1.2 ! misho    1616: #ifdef CONFIG_DONTROUTE_UNICAST
        !          1617:   /* FreeBSD silently changes TTL to 1 when MSG_DONTROUTE is used, therefore we
        !          1618:      cannot use it for other cases (e.g. when TTL security is used). */
        !          1619:   if (ipa_is_ip4(s->daddr) && ip4_is_unicast(ipa_to_ip4(s->daddr)) && (s->ttl == 1))
        !          1620:     flags = MSG_DONTROUTE;
        !          1621: #endif
        !          1622: 
1.1       misho    1623: #ifdef CONFIG_USE_HDRINCL
                   1624:   byte hdr[20];
                   1625:   struct iovec iov2[2] = { {hdr, 20}, iov };
                   1626: 
                   1627:   if (s->flags & SKF_HDRINCL)
                   1628:   {
                   1629:     sk_prepare_ip_header(s, hdr, iov.iov_len);
                   1630:     msg.msg_iov = iov2;
                   1631:     msg.msg_iovlen = 2;
                   1632:   }
                   1633: #endif
                   1634: 
                   1635:   if (s->flags & SKF_PKTINFO)
                   1636:     sk_prepare_cmsgs(s, &msg, cmsg_buf, sizeof(cmsg_buf));
                   1637: 
1.1.1.2 ! misho    1638:   return sendmsg(s->fd, &msg, flags);
1.1       misho    1639: }
                   1640: 
                   1641: static inline int
                   1642: sk_recvmsg(sock *s)
                   1643: {
                   1644:   struct iovec iov = {s->rbuf, s->rbsize};
                   1645:   byte cmsg_buf[CMSG_RX_SPACE];
                   1646:   sockaddr src;
                   1647: 
                   1648:   struct msghdr msg = {
                   1649:     .msg_name = &src.sa,
                   1650:     .msg_namelen = sizeof(src), // XXXX ??
                   1651:     .msg_iov = &iov,
                   1652:     .msg_iovlen = 1,
                   1653:     .msg_control = cmsg_buf,
                   1654:     .msg_controllen = sizeof(cmsg_buf),
                   1655:     .msg_flags = 0
                   1656:   };
                   1657: 
                   1658:   int rv = recvmsg(s->fd, &msg, 0);
                   1659:   if (rv < 0)
                   1660:     return rv;
                   1661: 
                   1662:   //ifdef IPV4
                   1663:   //  if (cf_type == SK_IP)
                   1664:   //    rv = ipv4_skip_header(pbuf, rv);
                   1665:   //endif
                   1666: 
                   1667:   sockaddr_read(&src, s->af, &s->faddr, NULL, &s->fport);
                   1668:   sk_process_cmsgs(s, &msg);
                   1669: 
                   1670:   if (msg.msg_flags & MSG_TRUNC)
                   1671:     s->flags |= SKF_TRUNCATED;
                   1672:   else
                   1673:     s->flags &= ~SKF_TRUNCATED;
                   1674: 
                   1675:   return rv;
                   1676: }
                   1677: 
                   1678: 
                   1679: static inline void reset_tx_buffer(sock *s) { s->ttx = s->tpos = s->tbuf; }
                   1680: 
                   1681: static int
                   1682: sk_maybe_write(sock *s)
                   1683: {
                   1684:   int e;
                   1685: 
                   1686:   switch (s->type)
                   1687:   {
                   1688:   case SK_TCP:
                   1689:   case SK_MAGIC:
                   1690:   case SK_UNIX:
                   1691:     while (s->ttx != s->tpos)
                   1692:     {
                   1693:       e = write(s->fd, s->ttx, s->tpos - s->ttx);
                   1694: 
                   1695:       if (e < 0)
                   1696:       {
                   1697:        if (errno != EINTR && errno != EAGAIN)
                   1698:        {
                   1699:          reset_tx_buffer(s);
                   1700:          /* EPIPE is just a connection close notification during TX */
                   1701:          s->err_hook(s, (errno != EPIPE) ? errno : 0);
                   1702:          return -1;
                   1703:        }
                   1704:        return 0;
                   1705:       }
                   1706:       s->ttx += e;
                   1707:     }
                   1708:     reset_tx_buffer(s);
                   1709:     return 1;
                   1710: 
                   1711:   case SK_UDP:
                   1712:   case SK_IP:
                   1713:     {
                   1714:       if (s->tbuf == s->tpos)
                   1715:        return 1;
                   1716: 
                   1717:       e = sk_sendmsg(s);
                   1718: 
                   1719:       if (e < 0)
                   1720:       {
                   1721:        if (errno != EINTR && errno != EAGAIN)
                   1722:        {
                   1723:          reset_tx_buffer(s);
                   1724:          s->err_hook(s, errno);
                   1725:          return -1;
                   1726:        }
                   1727: 
                   1728:        if (!s->tx_hook)
                   1729:          reset_tx_buffer(s);
                   1730:        return 0;
                   1731:       }
                   1732:       reset_tx_buffer(s);
                   1733:       return 1;
                   1734:     }
                   1735:   default:
                   1736:     bug("sk_maybe_write: unknown socket type %d", s->type);
                   1737:   }
                   1738: }
                   1739: 
                   1740: int
                   1741: sk_rx_ready(sock *s)
                   1742: {
                   1743:   int rv;
                   1744:   struct pollfd pfd = { .fd = s->fd };
                   1745:   pfd.events |= POLLIN;
                   1746: 
                   1747:  redo:
                   1748:   rv = poll(&pfd, 1, 0);
                   1749: 
                   1750:   if ((rv < 0) && (errno == EINTR || errno == EAGAIN))
                   1751:     goto redo;
                   1752: 
                   1753:   return rv;
                   1754: }
                   1755: 
                   1756: /**
                   1757:  * sk_send - send data to a socket
                   1758:  * @s: socket
                   1759:  * @len: number of bytes to send
                   1760:  *
                   1761:  * This function sends @len bytes of data prepared in the
                   1762:  * transmit buffer of the socket @s to the network connection.
                   1763:  * If the packet can be sent immediately, it does so and returns
                   1764:  * 1, else it queues the packet for later processing, returns 0
                   1765:  * and calls the @tx_hook of the socket when the tranmission
                   1766:  * takes place.
                   1767:  */
                   1768: int
                   1769: sk_send(sock *s, unsigned len)
                   1770: {
                   1771:   s->ttx = s->tbuf;
                   1772:   s->tpos = s->tbuf + len;
                   1773:   return sk_maybe_write(s);
                   1774: }
                   1775: 
                   1776: /**
                   1777:  * sk_send_to - send data to a specific destination
                   1778:  * @s: socket
                   1779:  * @len: number of bytes to send
                   1780:  * @addr: IP address to send the packet to
                   1781:  * @port: port to send the packet to
                   1782:  *
                   1783:  * This is a sk_send() replacement for connection-less packet sockets
                   1784:  * which allows destination of the packet to be chosen dynamically.
                   1785:  * Raw IP sockets should use 0 for @port.
                   1786:  */
                   1787: int
                   1788: sk_send_to(sock *s, unsigned len, ip_addr addr, unsigned port)
                   1789: {
                   1790:   s->daddr = addr;
                   1791:   if (port)
                   1792:     s->dport = port;
                   1793: 
                   1794:   s->ttx = s->tbuf;
                   1795:   s->tpos = s->tbuf + len;
                   1796:   return sk_maybe_write(s);
                   1797: }
                   1798: 
                   1799: /*
                   1800: int
                   1801: sk_send_full(sock *s, unsigned len, struct iface *ifa,
                   1802:             ip_addr saddr, ip_addr daddr, unsigned dport)
                   1803: {
                   1804:   s->iface = ifa;
                   1805:   s->saddr = saddr;
                   1806:   s->daddr = daddr;
                   1807:   s->dport = dport;
                   1808:   s->ttx = s->tbuf;
                   1809:   s->tpos = s->tbuf + len;
                   1810:   return sk_maybe_write(s);
                   1811: }
                   1812: */
                   1813: 
                   1814:  /* sk_read() and sk_write() are called from BFD's event loop */
                   1815: 
                   1816: int
                   1817: sk_read(sock *s, int revents)
                   1818: {
                   1819:   switch (s->type)
                   1820:   {
                   1821:   case SK_TCP_PASSIVE:
                   1822:     return sk_passive_connected(s, SK_TCP);
                   1823: 
                   1824:   case SK_UNIX_PASSIVE:
                   1825:     return sk_passive_connected(s, SK_UNIX);
                   1826: 
                   1827:   case SK_TCP:
                   1828:   case SK_UNIX:
                   1829:     {
                   1830:       int c = read(s->fd, s->rpos, s->rbuf + s->rbsize - s->rpos);
                   1831: 
                   1832:       if (c < 0)
                   1833:       {
                   1834:        if (errno != EINTR && errno != EAGAIN)
                   1835:          s->err_hook(s, errno);
                   1836:        else if (errno == EAGAIN && !(revents & POLLIN))
                   1837:        {
                   1838:          log(L_ERR "Got EAGAIN from read when revents=%x (without POLLIN)", revents);
                   1839:          s->err_hook(s, 0);
                   1840:        }
                   1841:       }
                   1842:       else if (!c)
                   1843:        s->err_hook(s, 0);
                   1844:       else
                   1845:       {
                   1846:        s->rpos += c;
                   1847:        if (s->rx_hook(s, s->rpos - s->rbuf))
                   1848:        {
                   1849:          /* We need to be careful since the socket could have been deleted by the hook */
                   1850:          if (current_sock == s)
                   1851:            s->rpos = s->rbuf;
                   1852:        }
                   1853:        return 1;
                   1854:       }
                   1855:       return 0;
                   1856:     }
                   1857: 
                   1858:   case SK_MAGIC:
                   1859:     return s->rx_hook(s, 0);
                   1860: 
                   1861:   default:
                   1862:     {
                   1863:       int e = sk_recvmsg(s);
                   1864: 
                   1865:       if (e < 0)
                   1866:       {
                   1867:        if (errno != EINTR && errno != EAGAIN)
                   1868:          s->err_hook(s, errno);
                   1869:        return 0;
                   1870:       }
                   1871: 
                   1872:       s->rpos = s->rbuf + e;
                   1873:       s->rx_hook(s, e);
                   1874:       return 1;
                   1875:     }
                   1876:   }
                   1877: }
                   1878: 
                   1879: int
                   1880: sk_write(sock *s)
                   1881: {
                   1882:   switch (s->type)
                   1883:   {
                   1884:   case SK_TCP_ACTIVE:
                   1885:     {
                   1886:       sockaddr sa;
                   1887:       sockaddr_fill(&sa, s->af, s->daddr, s->iface, s->dport);
                   1888: 
                   1889:       if (connect(s->fd, &sa.sa, SA_LEN(sa)) >= 0 || errno == EISCONN)
                   1890:        sk_tcp_connected(s);
                   1891:       else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS)
                   1892:        s->err_hook(s, errno);
                   1893:       return 0;
                   1894:     }
                   1895: 
                   1896:   default:
                   1897:     if (s->ttx != s->tpos && sk_maybe_write(s) > 0)
                   1898:     {
                   1899:       if (s->tx_hook)
                   1900:        s->tx_hook(s);
                   1901:       return 1;
                   1902:     }
                   1903:     return 0;
                   1904:   }
                   1905: }
                   1906: 
                   1907: void
                   1908: sk_err(sock *s, int revents)
                   1909: {
                   1910:   int se = 0, sse = sizeof(se);
                   1911:   if ((s->type != SK_MAGIC) && (revents & POLLERR))
                   1912:     if (getsockopt(s->fd, SOL_SOCKET, SO_ERROR, &se, &sse) < 0)
                   1913:     {
                   1914:       log(L_ERR "IO: Socket error: SO_ERROR: %m");
                   1915:       se = 0;
                   1916:     }
                   1917: 
                   1918:   s->err_hook(s, se);
                   1919: }
                   1920: 
                   1921: void
                   1922: sk_dump_all(void)
                   1923: {
                   1924:   node *n;
                   1925:   sock *s;
                   1926: 
                   1927:   debug("Open sockets:\n");
                   1928:   WALK_LIST(n, sock_list)
                   1929:   {
                   1930:     s = SKIP_BACK(sock, n, n);
                   1931:     debug("%p ", s);
                   1932:     sk_dump(&s->r);
                   1933:   }
                   1934:   debug("\n");
                   1935: }
                   1936: 
                   1937: 
                   1938: /*
                   1939:  *     Internal event log and watchdog
                   1940:  */
                   1941: 
/* Number of slots in the circular event log */
#define EVENT_LOG_LENGTH 32

/* One record of the internal event log */
struct event_log_entry
{
  void *hook;			/* Event hook address */
  void *data;			/* Event data address */
  btime timestamp;		/* Value of last_time when the event was logged */
  btime duration;		/* Filled in by io_update_time() while the entry is open */
};

/* The circular log itself; io_log_event() writes at event_log_pos */
static struct event_log_entry event_log[EVENT_LOG_LENGTH];
/* Entry whose duration is still to be filled, or NULL */
static struct event_log_entry *event_open;
/* Next write position, events logged since watchdog_start(), alarm armed flag */
static int event_log_pos, event_log_num, watchdog_active;
/* Monotonic time as of the last io_update_time() call */
static btime last_time;
/* Value of last_time recorded by watchdog_start() / watchdog_start1() */
static btime loop_time;
                   1957: 
                   1958: static void
                   1959: io_update_time(void)
                   1960: {
                   1961:   struct timespec ts;
                   1962:   int rv;
                   1963: 
                   1964:   if (!clock_monotonic_available)
                   1965:     return;
                   1966: 
                   1967:   /*
                   1968:    * This is third time-tracking procedure (after update_times() above and
                   1969:    * times_update() in BFD), dedicated to internal event log and latency
                   1970:    * tracking. Hopefully, we consolidate these sometimes.
                   1971:    */
                   1972: 
                   1973:   rv = clock_gettime(CLOCK_MONOTONIC, &ts);
                   1974:   if (rv < 0)
                   1975:     die("clock_gettime: %m");
                   1976: 
                   1977:   last_time = ((s64) ts.tv_sec S) + (ts.tv_nsec / 1000);
                   1978: 
                   1979:   if (event_open)
                   1980:   {
                   1981:     event_open->duration = last_time - event_open->timestamp;
                   1982: 
                   1983:     if (event_open->duration > config->latency_limit)
                   1984:       log(L_WARN "Event 0x%p 0x%p took %d ms",
                   1985:          event_open->hook, event_open->data, (int) (event_open->duration TO_MS));
                   1986: 
                   1987:     event_open = NULL;
                   1988:   }
                   1989: }
                   1990: 
                   1991: /**
                   1992:  * io_log_event - mark approaching event into event log
                   1993:  * @hook: event hook address
                   1994:  * @data: event data address
                   1995:  *
                   1996:  * Store info (hook, data, timestamp) about the following internal event into
                   1997:  * a circular event log (@event_log). When latency tracking is enabled, the log
                   1998:  * entry is kept open (in @event_open) so the duration can be filled later.
                   1999:  */
                   2000: void
                   2001: io_log_event(void *hook, void *data)
                   2002: {
                   2003:   if (config->latency_debug)
                   2004:     io_update_time();
                   2005: 
                   2006:   struct event_log_entry *en = event_log + event_log_pos;
                   2007: 
                   2008:   en->hook = hook;
                   2009:   en->data = data;
                   2010:   en->timestamp = last_time;
                   2011:   en->duration = 0;
                   2012: 
                   2013:   event_log_num++;
                   2014:   event_log_pos++;
                   2015:   event_log_pos %= EVENT_LOG_LENGTH;
                   2016: 
                   2017:   event_open = config->latency_debug ? en : NULL;
                   2018: }
                   2019: 
                   2020: static inline void
                   2021: io_close_event(void)
                   2022: {
                   2023:   if (event_open)
                   2024:     io_update_time();
                   2025: }
                   2026: 
                   2027: void
                   2028: io_log_dump(void)
                   2029: {
                   2030:   int i;
                   2031: 
                   2032:   log(L_DEBUG "Event log:");
                   2033:   for (i = 0; i < EVENT_LOG_LENGTH; i++)
                   2034:   {
                   2035:     struct event_log_entry *en = event_log + (event_log_pos + i) % EVENT_LOG_LENGTH;
                   2036:     if (en->hook)
                   2037:       log(L_DEBUG "  Event 0x%p 0x%p at %8d for %d ms", en->hook, en->data,
                   2038:          (int) ((last_time - en->timestamp) TO_MS), (int) (en->duration TO_MS));
                   2039:   }
                   2040: }
                   2041: 
/*
 * Watchdog expiry handler: the alarm armed by watchdog_start() was not
 * cancelled in time. Brings the event log timing up to date and aborts.
 * NOTE(review): presumably installed as the SIGALRM handler elsewhere -- confirm.
 */
void
watchdog_sigalrm(int sig UNUSED)
{
  /* Update last_time and duration, but skip latency check */
  /* (raising the limit keeps io_update_time() from logging a warning) */
  config->latency_limit = 0xffffffff;
  io_update_time();

  /* We want core dump */
  abort();
}
                   2052: 
/*
 * Record the current time as the start of the loop cycle. Unlike
 * watchdog_start(), this neither resets the event counter nor arms the alarm.
 */
static inline void
watchdog_start1(void)
{
  io_update_time();

  loop_time = last_time;
}
                   2060: 
                   2061: static inline void
                   2062: watchdog_start(void)
                   2063: {
                           /*
                            * Begin a watched loop cycle (called by io_loop() right after
                            * poll() returns): refresh the time, remember it in loop_time as
                            * the start of the cycle and reset the per-cycle event counter.
                            */
                   2064:   io_update_time();
                   2065: 
                   2066:   loop_time = last_time;
                   2067:   event_log_num = 0;
                   2068: 
                           /*
                            * A zero watchdog_timeout leaves the alarm unarmed. The value is
                            * handed to alarm() unchanged, so it is interpreted in seconds.
                            * watchdog_active tells watchdog_stop() that there is an alarm
                            * to cancel.
                            */
                   2069:   if (config->watchdog_timeout)
                   2070:   {
                   2071:     alarm(config->watchdog_timeout);
                   2072:     watchdog_active = 1;
                   2073:   }
                   2074: }
                   2075: 
                   2076: static inline void
                   2077: watchdog_stop(void)
                   2078: {
                           /*
                            * End a watched loop cycle (called by io_loop() right before it
                            * enters poll()): refresh the time, cancel the alarm if one was
                            * armed, and log a warning when the cycle took longer than
                            * config->watchdog_warning.
                            */
                   2079:   io_update_time();
                   2080: 
                   2081:   if (watchdog_active)
                   2082:   {
                             /* alarm(0) cancels the pending alarm */
                   2083:     alarm(0);
                   2084:     watchdog_active = 0;
                   2085:   }
                   2086: 
                           /* Time elapsed since the start recorded by watchdog_start()/watchdog_start1() */
                   2087:   btime duration = last_time - loop_time;
                   2088:   if (duration > config->watchdog_warning)
                   2089:     log(L_WARN "I/O loop cycle took %d ms for %d events",
                   2090:        (int) (duration TO_MS), event_log_num);
                   2091: }
                   2092: 
                   2093: 
                   2094: /*
                   2095:  *     Main I/O Loop
                   2096:  */
                   2097: 
                   2098: volatile int async_config_flag;                /* Asynchronous reconfiguration scheduled; io_loop() calls async_config() and clears the flag */
                   2099: volatile int async_dump_flag; /* Asynchronous dump scheduled; io_loop() calls async_dump() and clears the flag */
                   2100: volatile int async_shutdown_flag; /* Asynchronous shutdown scheduled; io_loop() calls async_shutdown() and clears the flag */
                   2101: 
                   2102: void
                   2103: io_init(void)
                   2104: {
                           /*
                            * One-time setup of the I/O layer state: initializes the two timer
                            * lists, the socket list and the global event list, calls
                            * krt_io_init(), sets up and reads the clocks, records the boot
                            * time and seeds the random() generator.
                            */
                   2105:   init_list(&near_timers);
                   2106:   init_list(&far_timers);
                   2107:   init_list(&sock_list);
                   2108:   init_list(&global_event_list);
                   2109:   krt_io_init();
                   2110:   init_times();
                   2111:   update_times();
                           /* boot_time is the value of now right after the first update_times() */
                   2112:   boot_time = now;
                           /* The seed is now_real converted to int */
                   2113:   srandom((int) now_real);
                   2114: }
                   2115: 
                   2116: static int short_loops = 0; /* Counts io_loop() iterations with poll() activity since the slow-socket pass last ran */
                   2117: #define SHORT_LOOP_MAX 10 /* Once short_loops reaches this value the slow-socket pass runs even if events are pending */
                   2118: 
                   2119: void
                   2120: io_loop(void)
                   2121: {
                           /*
                            * Main loop of the daemon. Each iteration:
                            *   1. runs the global event list,
                            *   2. fires timers that are due,
                            *   3. rebuilds the pollfd array from sock_list,
                            *   4. serves pending asynchronous requests (config/dump/shutdown),
                            *   5. waits in poll(),
                            *   6. dispatches socket activity in two passes: a first pass for
                            *      fast_rx input and all output, and a rate-limited second pass
                            *      for the remaining input and for error/hangup conditions.
                            * The for(;;) loop has no break or return, so the function does not
                            * return through normal control flow and pfd is never freed.
                            */
                   2122:   int poll_tout;
                   2123:   time_t tout;
                   2124:   int nfds, events, pout;
                   2125:   sock *s;
                   2126:   node *n;
                           /* Initial capacity of the pollfd array; doubled on demand below */
                   2127:   int fdmax = 256;
                   2128:   struct pollfd *pfd = xmalloc(fdmax * sizeof(struct pollfd));
                   2129: 
                   2130:   watchdog_start1();
                   2131:   for(;;)
                   2132:     {
                               /* The result decides below whether poll() is allowed to block */
                   2133:       events = ev_run_list(&global_event_list);
                             /* Keep firing timers until tm_first_shot() reports a time later than now */
                   2134:     timers:
                   2135:       update_times();
                   2136:       tout = tm_first_shot();
                   2137:       if (tout <= now)
                   2138:        {
                   2139:          tm_shot();
                   2140:          goto timers;
                   2141:        }
                               /*
                                * Do not block at all when ev_run_list() returned nonzero;
                                * otherwise wait until the next timer, but never longer than 3
                                * (seconds, given the multiplication by 1000 below).
                                */
                   2142:       poll_tout = (events ? 0 : MIN(tout - now, 3)) * 1000; /* Time in milliseconds */
                   2143: 
                   2144:       io_close_event();
                   2145: 
                               /*
                                * Rebuild the pollfd array from scratch. Each socket remembers its
                                * slot in s->index, or -1 when it is not polled in this iteration.
                                */
                   2146:       nfds = 0;
                   2147:       WALK_LIST(n, sock_list)
                   2148:        {
                   2149:          pfd[nfds] = (struct pollfd) { .fd = -1 }; /* all other fields are zeroed by this */
                   2150:          s = SKIP_BACK(sock, n, n);
                                  /* Poll for input whenever a receive hook is set */
                   2151:          if (s->rx_hook)
                   2152:            {
                   2153:              pfd[nfds].fd = s->fd;
                   2154:              pfd[nfds].events |= POLLIN;
                   2155:            }
                                  /* Poll for output only when a transmit hook is set and ttx differs from tpos */
                   2156:          if (s->tx_hook && s->ttx != s->tpos)
                   2157:            {
                   2158:              pfd[nfds].fd = s->fd;
                   2159:              pfd[nfds].events |= POLLOUT;
                   2160:            }
                                  /* The slot is consumed only if one of the branches above claimed it */
                   2161:          if (pfd[nfds].fd != -1)
                   2162:            {
                   2163:              s->index = nfds;
                   2164:              nfds++;
                   2165:            }
                   2166:          else
                   2167:            s->index = -1;
                   2168: 
                                  /* Keep room for the entry written at the top of the next pass */
                   2169:          if (nfds >= fdmax)
                   2170:            {
                   2171:              fdmax *= 2;
                   2172:              pfd = xrealloc(pfd, fdmax * sizeof(struct pollfd));
                   2173:            }
                   2174:        }
                   2175: 
                   2176:       /*
                   2177:        * Yes, this is racy. But even if the signal comes before this test
                   2178:        * and entering poll(), it gets caught on the next timer tick.
                   2179:        */
                   2180: 
                               /*
                                * Each request is logged, handled and its flag cleared; the
                                * `continue` then restarts the iteration without calling poll().
                                */
                   2181:       if (async_config_flag)
                   2182:        {
                   2183:          io_log_event(async_config, NULL);
                   2184:          async_config();
                   2185:          async_config_flag = 0;
                   2186:          continue;
                   2187:        }
                   2188:       if (async_dump_flag)
                   2189:        {
                   2190:          io_log_event(async_dump, NULL);
                   2191:          async_dump();
                   2192:          async_dump_flag = 0;
                   2193:          continue;
                   2194:        }
                   2195:       if (async_shutdown_flag)
                   2196:        {
                   2197:          io_log_event(async_shutdown, NULL);
                   2198:          async_shutdown();
                   2199:          async_shutdown_flag = 0;
                   2200:          continue;
                   2201:        }
                   2202: 
                   2203:       /* And finally enter poll() to find active sockets */
                               /* The watchdog alarm is cancelled while waiting and re-armed afterwards */
                   2204:       watchdog_stop();
                   2205:       pout = poll(pfd, nfds, poll_tout);
                   2206:       watchdog_start();
                   2207: 
                   2208:       if (pout < 0)
                   2209:        {
                                  /* EINTR and EAGAIN just start a new iteration; any other error is fatal */
                   2210:          if (errno == EINTR || errno == EAGAIN)
                   2211:            continue;
                   2212:          die("poll: %m");
                   2213:        }
                   2214:       if (pout)
                   2215:        {
                                  /*
                                   * First pass: input on sockets flagged fast_rx and output on
                                   * every polled socket. current_sock is the iteration cursor.
                                   */
                   2216:          /* guaranteed to be non-empty */
                   2217:          current_sock = SKIP_BACK(sock, n, HEAD(sock_list));
                   2218: 
                   2219:          while (current_sock)
                   2220:            {
                                      /* This s shadows the function-level s */
                   2221:              sock *s = current_sock;
                                      /* Sockets without a pfd slot were not polled; just advance */
                   2222:              if (s->index == -1)
                   2223:                {
                   2224:                  current_sock = sk_next(s);
                   2225:                  goto next;
                   2226:                }
                   2227: 
                   2228:              int e;
                   2229:              int steps;
                   2230: 
                                      /* At most MAX_STEPS reads per socket, while sk_read() returns nonzero and a receive hook is still set */
                   2231:              steps = MAX_STEPS;
                   2232:              if (s->fast_rx && (pfd[s->index].revents & POLLIN) && s->rx_hook)
                   2233:                do
                   2234:                  {
                   2235:                    steps--;
                   2236:                    io_log_event(s->rx_hook, s->data);
                   2237:                    e = sk_read(s, pfd[s->index].revents);
                                            /*
                                             * current_sock was changed during the call, so s is not
                                             * touched again and the cursor is left as it is.
                                             * NOTE(review): presumably the socket was closed inside the
                                             * hook and the cursor moved on -- confirm in the socket code.
                                             */
                   2238:                    if (s != current_sock)
                   2239:                      goto next;
                   2240:                  }
                   2241:                while (e && s->rx_hook && steps);
                   2242: 
                                      /* At most MAX_STEPS writes per socket, while sk_write() returns nonzero */
                   2243:              steps = MAX_STEPS;
                   2244:              if (pfd[s->index].revents & POLLOUT)
                   2245:                do
                   2246:                  {
                   2247:                    steps--;
                   2248:                    io_log_event(s->tx_hook, s->data);
                   2249:                    e = sk_write(s);
                                            /* Same cursor check as in the read loop above */
                   2250:                    if (s != current_sock)
                   2251:                      goto next;
                   2252:                  }
                   2253:                while (e && steps);
                   2254: 
                   2255:              current_sock = sk_next(s);
                   2256:            next: ;
                   2257:            }
                   2258: 
                                  /*
                                   * While ev_run_list() keeps returning nonzero, skip the second
                                   * pass, but for fewer than SHORT_LOOP_MAX consecutive iterations
                                   * (only iterations in which poll() reported activity are counted).
                                   */
                   2259:          short_loops++;
                   2260:          if (events && (short_loops < SHORT_LOOP_MAX))
                   2261:            continue;
                   2262:          short_loops = 0;
                   2263: 
                                  /*
                                   * Second pass: input on sockets without fast_rx, and error or
                                   * hangup reporting. It resumes at stored_sock (or at the list
                                   * head when that is NULL) and stops after MAX_RX_STEPS reads.
                                   */
                   2264:          int count = 0;
                   2265:          current_sock = stored_sock;
                   2266:          if (current_sock == NULL)
                   2267:            current_sock = SKIP_BACK(sock, n, HEAD(sock_list));
                   2268: 
                   2269:          while (current_sock && count < MAX_RX_STEPS)
                   2270:            {
                                      /* This s shadows the function-level s */
                   2271:              sock *s = current_sock;
                   2272:              if (s->index == -1)
                   2273:                {
                   2274:                  current_sock = sk_next(s);
                   2275:                  goto next2;
                   2276:                }
                   2277: 
                                      /* Only reads count towards the MAX_RX_STEPS budget */
                   2278:              if (!s->fast_rx && (pfd[s->index].revents & POLLIN) && s->rx_hook)
                   2279:                {
                   2280:                  count++;
                   2281:                  io_log_event(s->rx_hook, s->data);
                   2282:                  sk_read(s, pfd[s->index].revents);
                                          /* Cursor changed during the call: leave s alone */
                   2283:                  if (s != current_sock)
                   2284:                    goto next2;
                   2285:                }
                   2286: 
                   2287:              if (pfd[s->index].revents & (POLLHUP | POLLERR))
                   2288:                {
                   2289:                  sk_err(s, pfd[s->index].revents);
                   2290:                  if (s != current_sock)
                   2291:                    goto next2;
                   2292:                }
                   2293: 
                   2294:              current_sock = sk_next(s);
                   2295:            next2: ;
                   2296:            }
                   2297: 
                   2298: 
                                  /* Remember where this pass stopped so that the next one continues from there */
                   2299:          stored_sock = current_sock;
                   2300:        }
                   2301:     }
                   2302: }
                   2303: 
                   2304: void
                   2305: test_old_bird(char *path)
                   2306: {
                           /*
                            * Check whether something is already listening on the UNIX-domain
                            * stream socket at @path by trying to connect to it.
                            * Calls die() when the probe socket cannot be created, when @path
                            * does not fit into sun_path, or when the connect succeeds.
                            * Otherwise the probe socket is closed and the function returns.
                            */
                   2307:   int fd;
                   2308:   struct sockaddr_un sa;
                   2309: 
                   2310:   fd = socket(AF_UNIX, SOCK_STREAM, 0);
                   2311:   if (fd < 0)
                   2312:     die("Cannot create socket: %m");
                           /* Rejecting lengths >= sizeof(sun_path) leaves room for the terminating NUL copied by strcpy() below */
                   2313:   if (strlen(path) >= sizeof(sa.sun_path))
                   2314:     die("Socket path too long");
                   2315:   bzero(&sa, sizeof(sa));
                   2316:   sa.sun_family = AF_UNIX;
                   2317:   strcpy(sa.sun_path, path);
                           /* A successful connect means a listener owns the socket; any connect failure is ignored */
                   2318:   if (connect(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) == 0)
                   2319:     die("I found another BIRD running.");
                   2320:   close(fd);
                   2321: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>