Annotation of embedaddon/bird/sysdep/unix/io.c, revision 1.1.1.1
1.1 misho 1: /*
2: * BIRD Internet Routing Daemon -- Unix I/O
3: *
4: * (c) 1998--2004 Martin Mares <mj@ucw.cz>
5: * (c) 2004 Ondrej Filip <feela@network.cz>
6: *
7: * Can be freely distributed and used under the terms of the GNU GPL.
8: */
9:
10: /* Unfortunately, some glibc versions hide parts of RFC 3542 API
11: if _GNU_SOURCE is not defined. */
12: #ifndef _GNU_SOURCE
13: #define _GNU_SOURCE
14: #endif
15:
16: #include <stdio.h>
17: #include <stdlib.h>
18: #include <time.h>
19: #include <sys/time.h>
20: #include <sys/types.h>
21: #include <sys/socket.h>
22: #include <sys/uio.h>
23: #include <sys/un.h>
24: #include <poll.h>
25: #include <unistd.h>
26: #include <fcntl.h>
27: #include <errno.h>
28: #include <net/if.h>
29: #include <netinet/in.h>
30: #include <netinet/tcp.h>
31: #include <netinet/udp.h>
32: #include <netinet/icmp6.h>
33:
34: #include "nest/bird.h"
35: #include "lib/lists.h"
36: #include "lib/resource.h"
37: #include "lib/timer.h"
38: #include "lib/socket.h"
39: #include "lib/event.h"
40: #include "lib/string.h"
41: #include "nest/iface.h"
42:
43: #include "lib/unix.h"
44: #include "lib/sysio.h"
45:
46: /* Maximum number of calls of tx handler for one socket in one
47: * poll iteration. Should be small enough to not monopolize CPU by
48: * one protocol instance.
49: */
50: #define MAX_STEPS 4
51:
/* Maximum number of calls of rx handler for all sockets in one poll
   iteration. RX callbacks are often much more costly, so we limit
   this to get small latencies. */
55: #define MAX_RX_STEPS 4
56:
57: /*
58: * Tracked Files
59: */
60:
61: struct rfile {
62: resource r;
63: FILE *f;
64: };
65:
66: static void
67: rf_free(resource *r)
68: {
69: struct rfile *a = (struct rfile *) r;
70:
71: fclose(a->f);
72: }
73:
74: static void
75: rf_dump(resource *r)
76: {
77: struct rfile *a = (struct rfile *) r;
78:
79: debug("(FILE *%p)\n", a->f);
80: }
81:
82: static struct resclass rf_class = {
83: "FILE",
84: sizeof(struct rfile),
85: rf_free,
86: rf_dump,
87: NULL,
88: NULL
89: };
90:
91: void *
92: tracked_fopen(pool *p, char *name, char *mode)
93: {
94: FILE *f = fopen(name, mode);
95:
96: if (f)
97: {
98: struct rfile *r = ralloc(p, &rf_class);
99: r->f = f;
100: }
101: return f;
102: }
103:
104: /**
105: * DOC: Timers
106: *
107: * Timers are resources which represent a wish of a module to call
108: * a function at the specified time. The platform dependent code
109: * doesn't guarantee exact timing, only that a timer function
110: * won't be called before the requested time.
111: *
112: * In BIRD, time is represented by values of the &bird_clock_t type
113: * which are integral numbers interpreted as a relative number of seconds since
114: * some fixed time point in past. The current time can be read
115: * from variable @now with reasonable accuracy and is monotonic. There is also
116: * a current 'absolute' time in variable @now_real reported by OS.
117: *
118: * Each timer is described by a &timer structure containing a pointer
119: * to the handler function (@hook), data private to this function (@data),
120: * time the function should be called at (@expires, 0 for inactive timers),
121: * for the other fields see |timer.h|.
122: */
123:
124: #define NEAR_TIMER_LIMIT 4
125:
126: static list near_timers, far_timers;
127: static bird_clock_t first_far_timer = TIME_INFINITY;
128:
129: /* now must be different from 0, because 0 is a special value in timer->expires */
130: bird_clock_t now = 1, now_real, boot_time;
131:
132: static void
133: update_times_plain(void)
134: {
135: bird_clock_t new_time = time(NULL);
136: int delta = new_time - now_real;
137:
138: if ((delta >= 0) && (delta < 60))
139: now += delta;
140: else if (now_real != 0)
141: log(L_WARN "Time jump, delta %d s", delta);
142:
143: now_real = new_time;
144: }
145:
146: static void
147: update_times_gettime(void)
148: {
149: struct timespec ts;
150: int rv;
151:
152: rv = clock_gettime(CLOCK_MONOTONIC, &ts);
153: if (rv != 0)
154: die("clock_gettime: %m");
155:
156: if (ts.tv_sec != now) {
157: if (ts.tv_sec < now)
158: log(L_ERR "Monotonic timer is broken");
159:
160: now = ts.tv_sec;
161: now_real = time(NULL);
162: }
163: }
164:
/* Nonzero when clock_gettime(CLOCK_MONOTONIC) works; set by init_times() */
static int clock_monotonic_available;

/* Refresh @now and @now_real using the best available clock source */
static inline void
update_times(void)
{
  if (!clock_monotonic_available)
  {
    update_times_plain();
    return;
  }

  update_times_gettime();
}
175:
176: static inline void
177: init_times(void)
178: {
179: struct timespec ts;
180: clock_monotonic_available = (clock_gettime(CLOCK_MONOTONIC, &ts) == 0);
181: if (!clock_monotonic_available)
182: log(L_WARN "Monotonic timer is missing");
183: }
184:
185:
186: static void
187: tm_free(resource *r)
188: {
189: timer *t = (timer *) r;
190:
191: tm_stop(t);
192: }
193:
194: static void
195: tm_dump(resource *r)
196: {
197: timer *t = (timer *) r;
198:
199: debug("(code %p, data %p, ", t->hook, t->data);
200: if (t->randomize)
201: debug("rand %d, ", t->randomize);
202: if (t->recurrent)
203: debug("recur %d, ", t->recurrent);
204: if (t->expires)
205: debug("expires in %d sec)\n", t->expires - now);
206: else
207: debug("inactive)\n");
208: }
209:
210: static struct resclass tm_class = {
211: "Timer",
212: sizeof(timer),
213: tm_free,
214: tm_dump,
215: NULL,
216: NULL
217: };
218:
219: /**
220: * tm_new - create a timer
221: * @p: pool
222: *
223: * This function creates a new timer resource and returns
224: * a pointer to it. To use the timer, you need to fill in
225: * the structure fields and call tm_start() to start timing.
226: */
227: timer *
228: tm_new(pool *p)
229: {
230: timer *t = ralloc(p, &tm_class);
231: return t;
232: }
233:
234: static inline void
235: tm_insert_near(timer *t)
236: {
237: node *n = HEAD(near_timers);
238:
239: while (n->next && (SKIP_BACK(timer, n, n)->expires < t->expires))
240: n = n->next;
241: insert_node(&t->n, n->prev);
242: }
243:
244: /**
245: * tm_start - start a timer
246: * @t: timer
247: * @after: number of seconds the timer should be run after
248: *
249: * This function schedules the hook function of the timer to
250: * be called after @after seconds. If the timer has been already
251: * started, it's @expire time is replaced by the new value.
252: *
253: * You can have set the @randomize field of @t, the timeout
254: * will be increased by a random number of seconds chosen
255: * uniformly from range 0 .. @randomize.
256: *
257: * You can call tm_start() from the handler function of the timer
258: * to request another run of the timer. Also, you can set the @recurrent
259: * field to have the timer re-added automatically with the same timeout.
260: */
261: void
262: tm_start(timer *t, unsigned after)
263: {
264: bird_clock_t when;
265:
266: if (t->randomize)
267: after += random() % (t->randomize + 1);
268: when = now + after;
269: if (t->expires == when)
270: return;
271: if (t->expires)
272: rem_node(&t->n);
273: t->expires = when;
274: if (after <= NEAR_TIMER_LIMIT)
275: tm_insert_near(t);
276: else
277: {
278: if (!first_far_timer || first_far_timer > when)
279: first_far_timer = when;
280: add_tail(&far_timers, &t->n);
281: }
282: }
283:
284: /**
285: * tm_stop - stop a timer
286: * @t: timer
287: *
288: * This function stops a timer. If the timer is already stopped,
289: * nothing happens.
290: */
291: void
292: tm_stop(timer *t)
293: {
294: if (t->expires)
295: {
296: rem_node(&t->n);
297: t->expires = 0;
298: }
299: }
300:
301: static void
302: tm_dump_them(char *name, list *l)
303: {
304: node *n;
305: timer *t;
306:
307: debug("%s timers:\n", name);
308: WALK_LIST(n, *l)
309: {
310: t = SKIP_BACK(timer, n, n);
311: debug("%p ", t);
312: tm_dump(&t->r);
313: }
314: debug("\n");
315: }
316:
317: void
318: tm_dump_all(void)
319: {
320: tm_dump_them("Near", &near_timers);
321: tm_dump_them("Far", &far_timers);
322: }
323:
324: static inline time_t
325: tm_first_shot(void)
326: {
327: time_t x = first_far_timer;
328:
329: if (!EMPTY_LIST(near_timers))
330: {
331: timer *t = SKIP_BACK(timer, n, HEAD(near_timers));
332: if (t->expires < x)
333: x = t->expires;
334: }
335: return x;
336: }
337:
338: void io_log_event(void *hook, void *data);
339:
340: static void
341: tm_shot(void)
342: {
343: timer *t;
344: node *n, *m;
345:
346: if (first_far_timer <= now)
347: {
348: bird_clock_t limit = now + NEAR_TIMER_LIMIT;
349: first_far_timer = TIME_INFINITY;
350: n = HEAD(far_timers);
351: while (m = n->next)
352: {
353: t = SKIP_BACK(timer, n, n);
354: if (t->expires <= limit)
355: {
356: rem_node(n);
357: tm_insert_near(t);
358: }
359: else if (t->expires < first_far_timer)
360: first_far_timer = t->expires;
361: n = m;
362: }
363: }
364: while ((n = HEAD(near_timers)) -> next)
365: {
366: int delay;
367: t = SKIP_BACK(timer, n, n);
368: if (t->expires > now)
369: break;
370: rem_node(n);
371: delay = t->expires - now;
372: t->expires = 0;
373: if (t->recurrent)
374: {
375: int i = t->recurrent - delay;
376: if (i < 0)
377: i = 0;
378: tm_start(t, i);
379: }
380: io_log_event(t->hook, t->data);
381: t->hook(t);
382: }
383: }
384:
385: /**
386: * tm_parse_datetime - parse a date and time
387: * @x: datetime string
388: *
389: * tm_parse_datetime() takes a textual representation of
390: * a date and time (dd-mm-yyyy hh:mm:ss)
391: * and converts it to the corresponding value of type &bird_clock_t.
392: */
393: bird_clock_t
394: tm_parse_datetime(char *x)
395: {
396: struct tm tm;
397: int n;
398: time_t t;
399:
400: if (sscanf(x, "%d-%d-%d %d:%d:%d%n", &tm.tm_mday, &tm.tm_mon, &tm.tm_year, &tm.tm_hour, &tm.tm_min, &tm.tm_sec, &n) != 6 || x[n])
401: return tm_parse_date(x);
402: tm.tm_mon--;
403: tm.tm_year -= 1900;
404: t = mktime(&tm);
405: if (t == (time_t) -1)
406: return 0;
407: return t;
408: }
409: /**
410: * tm_parse_date - parse a date
411: * @x: date string
412: *
413: * tm_parse_date() takes a textual representation of a date (dd-mm-yyyy)
414: * and converts it to the corresponding value of type &bird_clock_t.
415: */
416: bird_clock_t
417: tm_parse_date(char *x)
418: {
419: struct tm tm;
420: int n;
421: time_t t;
422:
423: if (sscanf(x, "%d-%d-%d%n", &tm.tm_mday, &tm.tm_mon, &tm.tm_year, &n) != 3 || x[n])
424: return 0;
425: tm.tm_mon--;
426: tm.tm_year -= 1900;
427: tm.tm_hour = tm.tm_min = tm.tm_sec = 0;
428: t = mktime(&tm);
429: if (t == (time_t) -1)
430: return 0;
431: return t;
432: }
433:
434: static void
435: tm_format_reltime(char *x, struct tm *tm, bird_clock_t delta)
436: {
437: static char *month_names[12] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun",
438: "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" };
439:
440: if (delta < 20*3600)
441: bsprintf(x, "%02d:%02d", tm->tm_hour, tm->tm_min);
442: else if (delta < 360*86400)
443: bsprintf(x, "%s%02d", month_names[tm->tm_mon], tm->tm_mday);
444: else
445: bsprintf(x, "%d", tm->tm_year+1900);
446: }
447:
448: #include "conf/conf.h"
449:
450: /**
451: * tm_format_datetime - convert date and time to textual representation
452: * @x: destination buffer of size %TM_DATETIME_BUFFER_SIZE
453: * @fmt_spec: specification of resulting textual representation of the time
454: * @t: time
455: *
456: * This function formats the given relative time value @t to a textual
457: * date/time representation (dd-mm-yyyy hh:mm:ss) in real time.
458: */
459: void
460: tm_format_datetime(char *x, struct timeformat *fmt_spec, bird_clock_t t)
461: {
462: const char *fmt_used;
463: struct tm *tm;
464: bird_clock_t delta = now - t;
465: t = now_real - delta;
466: tm = localtime(&t);
467:
468: if (fmt_spec->fmt1 == NULL)
469: return tm_format_reltime(x, tm, delta);
470:
471: if ((fmt_spec->limit == 0) || (delta < fmt_spec->limit))
472: fmt_used = fmt_spec->fmt1;
473: else
474: fmt_used = fmt_spec->fmt2;
475:
476: int rv = strftime(x, TM_DATETIME_BUFFER_SIZE, fmt_used, tm);
477: if (((rv == 0) && fmt_used[0]) || (rv == TM_DATETIME_BUFFER_SIZE))
478: strcpy(x, "<too-long>");
479: }
480:
481:
482: /**
483: * DOC: Sockets
484: *
485: * Socket resources represent network connections. Their data structure (&socket)
486: * contains a lot of fields defining the exact type of the socket, the local and
487: * remote addresses and ports, pointers to socket buffers and finally pointers to
488: * hook functions to be called when new data have arrived to the receive buffer
489: * (@rx_hook), when the contents of the transmit buffer have been transmitted
490: * (@tx_hook) and when an error or connection close occurs (@err_hook).
491: *
492: * Freeing of sockets from inside socket hooks is perfectly safe.
493: */
494:
495: #ifndef SOL_IP
496: #define SOL_IP IPPROTO_IP
497: #endif
498:
499: #ifndef SOL_IPV6
500: #define SOL_IPV6 IPPROTO_IPV6
501: #endif
502:
503: #ifndef SOL_ICMPV6
504: #define SOL_ICMPV6 IPPROTO_ICMPV6
505: #endif
506:
507:
508: /*
509: * Sockaddr helper functions
510: */
511:
512: static inline int UNUSED sockaddr_length(int af)
513: { return (af == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); }
514:
515: static inline void
516: sockaddr_fill4(struct sockaddr_in *sa, ip_addr a, uint port)
517: {
518: memset(sa, 0, sizeof(struct sockaddr_in));
519: #ifdef HAVE_SIN_LEN
520: sa->sin_len = sizeof(struct sockaddr_in);
521: #endif
522: sa->sin_family = AF_INET;
523: sa->sin_port = htons(port);
524: sa->sin_addr = ipa_to_in4(a);
525: }
526:
527: static inline void
528: sockaddr_fill6(struct sockaddr_in6 *sa, ip_addr a, struct iface *ifa, uint port)
529: {
530: memset(sa, 0, sizeof(struct sockaddr_in6));
531: #ifdef SIN6_LEN
532: sa->sin6_len = sizeof(struct sockaddr_in6);
533: #endif
534: sa->sin6_family = AF_INET6;
535: sa->sin6_port = htons(port);
536: sa->sin6_flowinfo = 0;
537: sa->sin6_addr = ipa_to_in6(a);
538:
539: if (ifa && ipa_is_link_local(a))
540: sa->sin6_scope_id = ifa->index;
541: }
542:
543: void
544: sockaddr_fill(sockaddr *sa, int af, ip_addr a, struct iface *ifa, uint port)
545: {
546: if (af == AF_INET)
547: sockaddr_fill4((struct sockaddr_in *) sa, a, port);
548: else if (af == AF_INET6)
549: sockaddr_fill6((struct sockaddr_in6 *) sa, a, ifa, port);
550: else
551: bug("Unknown AF");
552: }
553:
554: static inline void
555: sockaddr_read4(struct sockaddr_in *sa, ip_addr *a, uint *port)
556: {
557: *port = ntohs(sa->sin_port);
558: *a = ipa_from_in4(sa->sin_addr);
559: }
560:
561: static inline void
562: sockaddr_read6(struct sockaddr_in6 *sa, ip_addr *a, struct iface **ifa, uint *port)
563: {
564: *port = ntohs(sa->sin6_port);
565: *a = ipa_from_in6(sa->sin6_addr);
566:
567: if (ifa && ipa_is_link_local(*a))
568: *ifa = if_find_by_index(sa->sin6_scope_id);
569: }
570:
571: int
572: sockaddr_read(sockaddr *sa, int af, ip_addr *a, struct iface **ifa, uint *port)
573: {
574: if (sa->sa.sa_family != af)
575: goto fail;
576:
577: if (af == AF_INET)
578: sockaddr_read4((struct sockaddr_in *) sa, a, port);
579: else if (af == AF_INET6)
580: sockaddr_read6((struct sockaddr_in6 *) sa, a, ifa, port);
581: else
582: goto fail;
583:
584: return 0;
585:
586: fail:
587: *a = IPA_NONE;
588: *port = 0;
589: return -1;
590: }
591:
592:
593: /*
594: * IPv6 multicast syscalls
595: */
596:
597: /* Fortunately standardized in RFC 3493 */
598:
/* Initializer for struct ipv6_mreq: group @maddr on the interface @ifa (a struct iface pointer) */
#define INIT_MREQ6(maddr,ifa) \
  { .ipv6mr_multiaddr = ipa_to_in6(maddr), .ipv6mr_interface = (ifa)->index }
601:
602: static inline int
603: sk_setup_multicast6(sock *s)
604: {
605: int index = s->iface->index;
606: int ttl = s->ttl;
607: int n = 0;
608:
609: if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_IF, &index, sizeof(index)) < 0)
610: ERR("IPV6_MULTICAST_IF");
611:
612: if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_HOPS, &ttl, sizeof(ttl)) < 0)
613: ERR("IPV6_MULTICAST_HOPS");
614:
615: if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_LOOP, &n, sizeof(n)) < 0)
616: ERR("IPV6_MULTICAST_LOOP");
617:
618: return 0;
619: }
620:
621: static inline int
622: sk_join_group6(sock *s, ip_addr maddr)
623: {
624: struct ipv6_mreq mr = INIT_MREQ6(maddr, s->iface);
625:
626: if (setsockopt(s->fd, SOL_IPV6, IPV6_JOIN_GROUP, &mr, sizeof(mr)) < 0)
627: ERR("IPV6_JOIN_GROUP");
628:
629: return 0;
630: }
631:
632: static inline int
633: sk_leave_group6(sock *s, ip_addr maddr)
634: {
635: struct ipv6_mreq mr = INIT_MREQ6(maddr, s->iface);
636:
637: if (setsockopt(s->fd, SOL_IPV6, IPV6_LEAVE_GROUP, &mr, sizeof(mr)) < 0)
638: ERR("IPV6_LEAVE_GROUP");
639:
640: return 0;
641: }
642:
643:
644: /*
645: * IPv6 packet control messages
646: */
647:
648: /* Also standardized, in RFC 3542 */
649:
650: /*
651: * RFC 2292 uses IPV6_PKTINFO for both the socket option and the cmsg
652: * type, RFC 3542 changed the socket option to IPV6_RECVPKTINFO. If we
653: * don't have IPV6_RECVPKTINFO we suppose the OS implements the older
654: * RFC and we use IPV6_PKTINFO.
655: */
656: #ifndef IPV6_RECVPKTINFO
657: #define IPV6_RECVPKTINFO IPV6_PKTINFO
658: #endif
659: /*
660: * Same goes for IPV6_HOPLIMIT -> IPV6_RECVHOPLIMIT.
661: */
662: #ifndef IPV6_RECVHOPLIMIT
663: #define IPV6_RECVHOPLIMIT IPV6_HOPLIMIT
664: #endif
665:
666:
667: #define CMSG6_SPACE_PKTINFO CMSG_SPACE(sizeof(struct in6_pktinfo))
668: #define CMSG6_SPACE_TTL CMSG_SPACE(sizeof(int))
669:
670: static inline int
671: sk_request_cmsg6_pktinfo(sock *s)
672: {
673: int y = 1;
674:
675: if (setsockopt(s->fd, SOL_IPV6, IPV6_RECVPKTINFO, &y, sizeof(y)) < 0)
676: ERR("IPV6_RECVPKTINFO");
677:
678: return 0;
679: }
680:
681: static inline int
682: sk_request_cmsg6_ttl(sock *s)
683: {
684: int y = 1;
685:
686: if (setsockopt(s->fd, SOL_IPV6, IPV6_RECVHOPLIMIT, &y, sizeof(y)) < 0)
687: ERR("IPV6_RECVHOPLIMIT");
688:
689: return 0;
690: }
691:
692: static inline void
693: sk_process_cmsg6_pktinfo(sock *s, struct cmsghdr *cm)
694: {
695: if (cm->cmsg_type == IPV6_PKTINFO)
696: {
697: struct in6_pktinfo *pi = (struct in6_pktinfo *) CMSG_DATA(cm);
698: s->laddr = ipa_from_in6(pi->ipi6_addr);
699: s->lifindex = pi->ipi6_ifindex;
700: }
701: }
702:
703: static inline void
704: sk_process_cmsg6_ttl(sock *s, struct cmsghdr *cm)
705: {
706: if (cm->cmsg_type == IPV6_HOPLIMIT)
707: s->rcv_ttl = * (int *) CMSG_DATA(cm);
708: }
709:
710: static inline void
711: sk_prepare_cmsgs6(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
712: {
713: struct cmsghdr *cm;
714: struct in6_pktinfo *pi;
715: int controllen = 0;
716:
717: msg->msg_control = cbuf;
718: msg->msg_controllen = cbuflen;
719:
720: cm = CMSG_FIRSTHDR(msg);
721: cm->cmsg_level = SOL_IPV6;
722: cm->cmsg_type = IPV6_PKTINFO;
723: cm->cmsg_len = CMSG_LEN(sizeof(*pi));
724: controllen += CMSG_SPACE(sizeof(*pi));
725:
726: pi = (struct in6_pktinfo *) CMSG_DATA(cm);
727: pi->ipi6_ifindex = s->iface ? s->iface->index : 0;
728: pi->ipi6_addr = ipa_to_in6(s->saddr);
729:
730: msg->msg_controllen = controllen;
731: }
732:
733:
734: /*
735: * Miscellaneous socket syscalls
736: */
737:
738: static inline int
739: sk_set_ttl4(sock *s, int ttl)
740: {
741: if (setsockopt(s->fd, SOL_IP, IP_TTL, &ttl, sizeof(ttl)) < 0)
742: ERR("IP_TTL");
743:
744: return 0;
745: }
746:
747: static inline int
748: sk_set_ttl6(sock *s, int ttl)
749: {
750: if (setsockopt(s->fd, SOL_IPV6, IPV6_UNICAST_HOPS, &ttl, sizeof(ttl)) < 0)
751: ERR("IPV6_UNICAST_HOPS");
752:
753: return 0;
754: }
755:
756: static inline int
757: sk_set_tos4(sock *s, int tos)
758: {
759: if (setsockopt(s->fd, SOL_IP, IP_TOS, &tos, sizeof(tos)) < 0)
760: ERR("IP_TOS");
761:
762: return 0;
763: }
764:
765: static inline int
766: sk_set_tos6(sock *s, int tos)
767: {
768: if (setsockopt(s->fd, SOL_IPV6, IPV6_TCLASS, &tos, sizeof(tos)) < 0)
769: ERR("IPV6_TCLASS");
770:
771: return 0;
772: }
773:
774: static inline int
775: sk_set_high_port(sock *s UNUSED)
776: {
777: /* Port range setting is optional, ignore it if not supported */
778:
779: #ifdef IP_PORTRANGE
780: if (sk_is_ipv4(s))
781: {
782: int range = IP_PORTRANGE_HIGH;
783: if (setsockopt(s->fd, SOL_IP, IP_PORTRANGE, &range, sizeof(range)) < 0)
784: ERR("IP_PORTRANGE");
785: }
786: #endif
787:
788: #ifdef IPV6_PORTRANGE
789: if (sk_is_ipv6(s))
790: {
791: int range = IPV6_PORTRANGE_HIGH;
792: if (setsockopt(s->fd, SOL_IPV6, IPV6_PORTRANGE, &range, sizeof(range)) < 0)
793: ERR("IPV6_PORTRANGE");
794: }
795: #endif
796:
797: return 0;
798: }
799:
800: static inline byte *
801: sk_skip_ip_header(byte *pkt, int *len)
802: {
803: if ((*len < 20) || ((*pkt & 0xf0) != 0x40))
804: return NULL;
805:
806: int hlen = (*pkt & 0x0f) * 4;
807: if ((hlen < 20) || (hlen > *len))
808: return NULL;
809:
810: *len -= hlen;
811: return pkt + hlen;
812: }
813:
814: byte *
815: sk_rx_buffer(sock *s, int *len)
816: {
817: if (sk_is_ipv4(s) && (s->type == SK_IP))
818: return sk_skip_ip_header(s->rbuf, len);
819: else
820: return s->rbuf;
821: }
822:
823:
824: /*
825: * Public socket functions
826: */
827:
828: /**
829: * sk_setup_multicast - enable multicast for given socket
830: * @s: socket
831: *
832: * Prepare transmission of multicast packets for given datagram socket.
833: * The socket must have defined @iface.
834: *
835: * Result: 0 for success, -1 for an error.
836: */
837:
838: int
839: sk_setup_multicast(sock *s)
840: {
841: ASSERT(s->iface);
842:
843: if (sk_is_ipv4(s))
844: return sk_setup_multicast4(s);
845: else
846: return sk_setup_multicast6(s);
847: }
848:
849: /**
850: * sk_join_group - join multicast group for given socket
851: * @s: socket
852: * @maddr: multicast address
853: *
854: * Join multicast group for given datagram socket and associated interface.
855: * The socket must have defined @iface.
856: *
857: * Result: 0 for success, -1 for an error.
858: */
859:
860: int
861: sk_join_group(sock *s, ip_addr maddr)
862: {
863: if (sk_is_ipv4(s))
864: return sk_join_group4(s, maddr);
865: else
866: return sk_join_group6(s, maddr);
867: }
868:
869: /**
870: * sk_leave_group - leave multicast group for given socket
871: * @s: socket
872: * @maddr: multicast address
873: *
874: * Leave multicast group for given datagram socket and associated interface.
875: * The socket must have defined @iface.
876: *
877: * Result: 0 for success, -1 for an error.
878: */
879:
880: int
881: sk_leave_group(sock *s, ip_addr maddr)
882: {
883: if (sk_is_ipv4(s))
884: return sk_leave_group4(s, maddr);
885: else
886: return sk_leave_group6(s, maddr);
887: }
888:
889: /**
890: * sk_setup_broadcast - enable broadcast for given socket
891: * @s: socket
892: *
893: * Allow reception and transmission of broadcast packets for given datagram
894: * socket. The socket must have defined @iface. For transmission, packets should
895: * be send to @brd address of @iface.
896: *
897: * Result: 0 for success, -1 for an error.
898: */
899:
900: int
901: sk_setup_broadcast(sock *s)
902: {
903: int y = 1;
904:
905: if (setsockopt(s->fd, SOL_SOCKET, SO_BROADCAST, &y, sizeof(y)) < 0)
906: ERR("SO_BROADCAST");
907:
908: return 0;
909: }
910:
911: /**
912: * sk_set_ttl - set transmit TTL for given socket
913: * @s: socket
914: * @ttl: TTL value
915: *
916: * Set TTL for already opened connections when TTL was not set before. Useful
917: * for accepted connections when different ones should have different TTL.
918: *
919: * Result: 0 for success, -1 for an error.
920: */
921:
922: int
923: sk_set_ttl(sock *s, int ttl)
924: {
925: s->ttl = ttl;
926:
927: if (sk_is_ipv4(s))
928: return sk_set_ttl4(s, ttl);
929: else
930: return sk_set_ttl6(s, ttl);
931: }
932:
933: /**
934: * sk_set_min_ttl - set minimal accepted TTL for given socket
935: * @s: socket
936: * @ttl: TTL value
937: *
938: * Set minimal accepted TTL for given socket. Can be used for TTL security.
939: * implementations.
940: *
941: * Result: 0 for success, -1 for an error.
942: */
943:
944: int
945: sk_set_min_ttl(sock *s, int ttl)
946: {
947: if (sk_is_ipv4(s))
948: return sk_set_min_ttl4(s, ttl);
949: else
950: return sk_set_min_ttl6(s, ttl);
951: }
952:
953: #if 0
954: /**
955: * sk_set_md5_auth - add / remove MD5 security association for given socket
956: * @s: socket
957: * @local: IP address of local side
958: * @remote: IP address of remote side
959: * @ifa: Interface for link-local IP address
960: * @passwd: Password used for MD5 authentication
961: * @setkey: Update also system SA/SP database
962: *
963: * In TCP MD5 handling code in kernel, there is a set of security associations
964: * used for choosing password and other authentication parameters according to
965: * the local and remote address. This function is useful for listening socket,
966: * for active sockets it may be enough to set s->password field.
967: *
968: * When called with passwd != NULL, the new pair is added,
969: * When called with passwd == NULL, the existing pair is removed.
970: *
971: * Note that while in Linux, the MD5 SAs are specific to socket, in BSD they are
972: * stored in global SA/SP database (but the behavior also must be enabled on
973: * per-socket basis). In case of multiple sockets to the same neighbor, the
974: * socket-specific state must be configured for each socket while global state
975: * just once per src-dst pair. The @setkey argument controls whether the global
976: * state (SA/SP database) is also updated.
977: *
978: * Result: 0 for success, -1 for an error.
979: */
980:
981: int
982: sk_set_md5_auth(sock *s, ip_addr local, ip_addr remote, struct iface *ifa, char *passwd, int setkey)
983: { DUMMY; }
984: #endif
985:
986: /**
987: * sk_set_ipv6_checksum - specify IPv6 checksum offset for given socket
988: * @s: socket
989: * @offset: offset
990: *
991: * Specify IPv6 checksum field offset for given raw IPv6 socket. After that, the
992: * kernel will automatically fill it for outgoing packets and check it for
993: * incoming packets. Should not be used on ICMPv6 sockets, where the position is
994: * known to the kernel.
995: *
996: * Result: 0 for success, -1 for an error.
997: */
998:
999: int
1000: sk_set_ipv6_checksum(sock *s, int offset)
1001: {
1002: if (setsockopt(s->fd, SOL_IPV6, IPV6_CHECKSUM, &offset, sizeof(offset)) < 0)
1003: ERR("IPV6_CHECKSUM");
1004:
1005: return 0;
1006: }
1007:
1008: int
1009: sk_set_icmp6_filter(sock *s, int p1, int p2)
1010: {
1011: /* a bit of lame interface, but it is here only for Radv */
1012: struct icmp6_filter f;
1013:
1014: ICMP6_FILTER_SETBLOCKALL(&f);
1015: ICMP6_FILTER_SETPASS(p1, &f);
1016: ICMP6_FILTER_SETPASS(p2, &f);
1017:
1018: if (setsockopt(s->fd, SOL_ICMPV6, ICMP6_FILTER, &f, sizeof(f)) < 0)
1019: ERR("ICMP6_FILTER");
1020:
1021: return 0;
1022: }
1023:
1024: void
1025: sk_log_error(sock *s, const char *p)
1026: {
1027: log(L_ERR "%s: Socket error: %s%#m", p, s->err);
1028: }
1029:
1030:
1031: /*
1032: * Actual struct birdsock code
1033: */
1034:
1035: static list sock_list;
1036: static struct birdsock *current_sock;
1037: static struct birdsock *stored_sock;
1038:
1039: static inline sock *
1040: sk_next(sock *s)
1041: {
1042: if (!s->n.next->next)
1043: return NULL;
1044: else
1045: return SKIP_BACK(sock, n, s->n.next);
1046: }
1047:
1048: static void
1049: sk_alloc_bufs(sock *s)
1050: {
1051: if (!s->rbuf && s->rbsize)
1052: s->rbuf = s->rbuf_alloc = xmalloc(s->rbsize);
1053: s->rpos = s->rbuf;
1054: if (!s->tbuf && s->tbsize)
1055: s->tbuf = s->tbuf_alloc = xmalloc(s->tbsize);
1056: s->tpos = s->ttx = s->tbuf;
1057: }
1058:
1059: static void
1060: sk_free_bufs(sock *s)
1061: {
1062: if (s->rbuf_alloc)
1063: {
1064: xfree(s->rbuf_alloc);
1065: s->rbuf = s->rbuf_alloc = NULL;
1066: }
1067: if (s->tbuf_alloc)
1068: {
1069: xfree(s->tbuf_alloc);
1070: s->tbuf = s->tbuf_alloc = NULL;
1071: }
1072: }
1073:
1074: static void
1075: sk_free(resource *r)
1076: {
1077: sock *s = (sock *) r;
1078:
1079: sk_free_bufs(s);
1080: if (s->fd >= 0)
1081: {
1082: close(s->fd);
1083:
1084: /* FIXME: we should call sk_stop() for SKF_THREAD sockets */
1085: if (s->flags & SKF_THREAD)
1086: return;
1087:
1088: if (s == current_sock)
1089: current_sock = sk_next(s);
1090: if (s == stored_sock)
1091: stored_sock = sk_next(s);
1092: rem_node(&s->n);
1093: }
1094: }
1095:
1096: void
1097: sk_set_rbsize(sock *s, uint val)
1098: {
1099: ASSERT(s->rbuf_alloc == s->rbuf);
1100:
1101: if (s->rbsize == val)
1102: return;
1103:
1104: s->rbsize = val;
1105: xfree(s->rbuf_alloc);
1106: s->rbuf_alloc = xmalloc(val);
1107: s->rpos = s->rbuf = s->rbuf_alloc;
1108: }
1109:
1110: void
1111: sk_set_tbsize(sock *s, uint val)
1112: {
1113: ASSERT(s->tbuf_alloc == s->tbuf);
1114:
1115: if (s->tbsize == val)
1116: return;
1117:
1118: byte *old_tbuf = s->tbuf;
1119:
1120: s->tbsize = val;
1121: s->tbuf = s->tbuf_alloc = xrealloc(s->tbuf_alloc, val);
1122: s->tpos = s->tbuf + (s->tpos - old_tbuf);
1123: s->ttx = s->tbuf + (s->ttx - old_tbuf);
1124: }
1125:
1126: void
1127: sk_set_tbuf(sock *s, void *tbuf)
1128: {
1129: s->tbuf = tbuf ?: s->tbuf_alloc;
1130: s->ttx = s->tpos = s->tbuf;
1131: }
1132:
1133: void
1134: sk_reallocate(sock *s)
1135: {
1136: sk_free_bufs(s);
1137: sk_alloc_bufs(s);
1138: }
1139:
1140: static void
1141: sk_dump(resource *r)
1142: {
1143: sock *s = (sock *) r;
1144: static char *sk_type_names[] = { "TCP<", "TCP>", "TCP", "UDP", NULL, "IP", NULL, "MAGIC", "UNIX<", "UNIX", "DEL!" };
1145:
1146: debug("(%s, ud=%p, sa=%I, sp=%d, da=%I, dp=%d, tos=%d, ttl=%d, if=%s)\n",
1147: sk_type_names[s->type],
1148: s->data,
1149: s->saddr,
1150: s->sport,
1151: s->daddr,
1152: s->dport,
1153: s->tos,
1154: s->ttl,
1155: s->iface ? s->iface->name : "none");
1156: }
1157:
1158: static struct resclass sk_class = {
1159: "Socket",
1160: sizeof(sock),
1161: sk_free,
1162: sk_dump,
1163: NULL,
1164: NULL
1165: };
1166:
1167: /**
1168: * sk_new - create a socket
1169: * @p: pool
1170: *
1171: * This function creates a new socket resource. If you want to use it,
1172: * you need to fill in all the required fields of the structure and
1173: * call sk_open() to do the actual opening of the socket.
1174: *
1175: * The real function name is sock_new(), sk_new() is a macro wrapper
1176: * to avoid collision with OpenSSL.
1177: */
1178: sock *
1179: sock_new(pool *p)
1180: {
1181: sock *s = ralloc(p, &sk_class);
1182: s->pool = p;
1183: // s->saddr = s->daddr = IPA_NONE;
1184: s->tos = s->priority = s->ttl = -1;
1185: s->fd = -1;
1186: return s;
1187: }
1188:
1189: static int
1190: sk_setup(sock *s)
1191: {
1192: int y = 1;
1193: int fd = s->fd;
1194:
1195: if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
1196: ERR("O_NONBLOCK");
1197:
1198: if (!s->af)
1199: return 0;
1200:
1201: if (ipa_nonzero(s->saddr) && !(s->flags & SKF_BIND))
1202: s->flags |= SKF_PKTINFO;
1203:
1204: #ifdef CONFIG_USE_HDRINCL
1205: if (sk_is_ipv4(s) && (s->type == SK_IP) && (s->flags & SKF_PKTINFO))
1206: {
1207: s->flags &= ~SKF_PKTINFO;
1208: s->flags |= SKF_HDRINCL;
1209: if (setsockopt(fd, SOL_IP, IP_HDRINCL, &y, sizeof(y)) < 0)
1210: ERR("IP_HDRINCL");
1211: }
1212: #endif
1213:
1214: if (s->iface)
1215: {
1216: #ifdef SO_BINDTODEVICE
1217: struct ifreq ifr = {};
1218: strcpy(ifr.ifr_name, s->iface->name);
1219: if (setsockopt(s->fd, SOL_SOCKET, SO_BINDTODEVICE, &ifr, sizeof(ifr)) < 0)
1220: ERR("SO_BINDTODEVICE");
1221: #endif
1222:
1223: #ifdef CONFIG_UNIX_DONTROUTE
1224: if (setsockopt(s->fd, SOL_SOCKET, SO_DONTROUTE, &y, sizeof(y)) < 0)
1225: ERR("SO_DONTROUTE");
1226: #endif
1227: }
1228:
1229: if (s->priority >= 0)
1230: if (sk_set_priority(s, s->priority) < 0)
1231: return -1;
1232:
1233: if (sk_is_ipv4(s))
1234: {
1235: if (s->flags & SKF_LADDR_RX)
1236: if (sk_request_cmsg4_pktinfo(s) < 0)
1237: return -1;
1238:
1239: if (s->flags & SKF_TTL_RX)
1240: if (sk_request_cmsg4_ttl(s) < 0)
1241: return -1;
1242:
1243: if ((s->type == SK_UDP) || (s->type == SK_IP))
1244: if (sk_disable_mtu_disc4(s) < 0)
1245: return -1;
1246:
1247: if (s->ttl >= 0)
1248: if (sk_set_ttl4(s, s->ttl) < 0)
1249: return -1;
1250:
1251: if (s->tos >= 0)
1252: if (sk_set_tos4(s, s->tos) < 0)
1253: return -1;
1254: }
1255:
1256: if (sk_is_ipv6(s))
1257: {
1258: if (s->flags & SKF_V6ONLY)
1259: if (setsockopt(fd, SOL_IPV6, IPV6_V6ONLY, &y, sizeof(y)) < 0)
1260: ERR("IPV6_V6ONLY");
1261:
1262: if (s->flags & SKF_LADDR_RX)
1263: if (sk_request_cmsg6_pktinfo(s) < 0)
1264: return -1;
1265:
1266: if (s->flags & SKF_TTL_RX)
1267: if (sk_request_cmsg6_ttl(s) < 0)
1268: return -1;
1269:
1270: if ((s->type == SK_UDP) || (s->type == SK_IP))
1271: if (sk_disable_mtu_disc6(s) < 0)
1272: return -1;
1273:
1274: if (s->ttl >= 0)
1275: if (sk_set_ttl6(s, s->ttl) < 0)
1276: return -1;
1277:
1278: if (s->tos >= 0)
1279: if (sk_set_tos6(s, s->tos) < 0)
1280: return -1;
1281: }
1282:
1283: return 0;
1284: }
1285:
1286: static void
1287: sk_insert(sock *s)
1288: {
1289: add_tail(&sock_list, &s->n);
1290: }
1291:
1292: static void
1293: sk_tcp_connected(sock *s)
1294: {
1295: sockaddr sa;
1296: int sa_len = sizeof(sa);
1297:
1298: if ((getsockname(s->fd, &sa.sa, &sa_len) < 0) ||
1299: (sockaddr_read(&sa, s->af, &s->saddr, &s->iface, &s->sport) < 0))
1300: log(L_WARN "SOCK: Cannot get local IP address for TCP>");
1301:
1302: s->type = SK_TCP;
1303: sk_alloc_bufs(s);
1304: s->tx_hook(s);
1305: }
1306:
1307: static int
1308: sk_passive_connected(sock *s, int type)
1309: {
1310: sockaddr loc_sa, rem_sa;
1311: int loc_sa_len = sizeof(loc_sa);
1312: int rem_sa_len = sizeof(rem_sa);
1313:
1314: int fd = accept(s->fd, ((type == SK_TCP) ? &rem_sa.sa : NULL), &rem_sa_len);
1315: if (fd < 0)
1316: {
1317: if ((errno != EINTR) && (errno != EAGAIN))
1318: s->err_hook(s, errno);
1319: return 0;
1320: }
1321:
1322: sock *t = sk_new(s->pool);
1323: t->type = type;
1324: t->fd = fd;
1325: t->af = s->af;
1326: t->ttl = s->ttl;
1327: t->tos = s->tos;
1328: t->rbsize = s->rbsize;
1329: t->tbsize = s->tbsize;
1330:
1331: if (type == SK_TCP)
1332: {
1333: if ((getsockname(fd, &loc_sa.sa, &loc_sa_len) < 0) ||
1334: (sockaddr_read(&loc_sa, s->af, &t->saddr, &t->iface, &t->sport) < 0))
1335: log(L_WARN "SOCK: Cannot get local IP address for TCP<");
1336:
1337: if (sockaddr_read(&rem_sa, s->af, &t->daddr, &t->iface, &t->dport) < 0)
1338: log(L_WARN "SOCK: Cannot get remote IP address for TCP<");
1339: }
1340:
1341: if (sk_setup(t) < 0)
1342: {
1343: /* FIXME: Call err_hook instead ? */
1344: log(L_ERR "SOCK: Incoming connection: %s%#m", t->err);
1345:
1346: /* FIXME: handle it better in rfree() */
1347: close(t->fd);
1348: t->fd = -1;
1349: rfree(t);
1350: return 1;
1351: }
1352:
1353: sk_insert(t);
1354: sk_alloc_bufs(t);
1355: s->rx_hook(t, 0);
1356: return 1;
1357: }
1358:
1359: /**
1360: * sk_open - open a socket
1361: * @s: socket
1362: *
1363: * This function takes a socket resource created by sk_new() and
1364: * initialized by the user and binds a corresponding network connection
1365: * to it.
1366: *
1367: * Result: 0 for success, -1 for an error.
1368: */
1369: int
1370: sk_open(sock *s)
1371: {
1372: int af = BIRD_AF;
1373: int fd = -1;
1374: int do_bind = 0;
1375: int bind_port = 0;
1376: ip_addr bind_addr = IPA_NONE;
1377: sockaddr sa;
1378:
1379: switch (s->type)
1380: {
1381: case SK_TCP_ACTIVE:
1382: s->ttx = ""; /* Force s->ttx != s->tpos */
1383: /* Fall thru */
1384: case SK_TCP_PASSIVE:
1385: fd = socket(af, SOCK_STREAM, IPPROTO_TCP);
1386: bind_port = s->sport;
1387: bind_addr = s->saddr;
1388: do_bind = bind_port || ipa_nonzero(bind_addr);
1389: break;
1390:
1391: case SK_UDP:
1392: fd = socket(af, SOCK_DGRAM, IPPROTO_UDP);
1393: bind_port = s->sport;
1394: bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
1395: do_bind = 1;
1396: break;
1397:
1398: case SK_IP:
1399: fd = socket(af, SOCK_RAW, s->dport);
1400: bind_port = 0;
1401: bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
1402: do_bind = ipa_nonzero(bind_addr);
1403: break;
1404:
1405: case SK_MAGIC:
1406: af = 0;
1407: fd = s->fd;
1408: break;
1409:
1410: default:
1411: bug("sk_open() called for invalid sock type %d", s->type);
1412: }
1413:
1414: if (fd < 0)
1415: ERR("socket");
1416:
1417: s->af = af;
1418: s->fd = fd;
1419:
1420: if (sk_setup(s) < 0)
1421: goto err;
1422:
1423: if (do_bind)
1424: {
1425: if (bind_port)
1426: {
1427: int y = 1;
1428:
1429: if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y)) < 0)
1430: ERR2("SO_REUSEADDR");
1431:
1432: #ifdef CONFIG_NO_IFACE_BIND
1433: /* Workaround missing ability to bind to an iface */
1434: if ((s->type == SK_UDP) && s->iface && ipa_zero(bind_addr))
1435: {
1436: if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &y, sizeof(y)) < 0)
1437: ERR2("SO_REUSEPORT");
1438: }
1439: #endif
1440: }
1441: else
1442: if (s->flags & SKF_HIGH_PORT)
1443: if (sk_set_high_port(s) < 0)
1444: log(L_WARN "Socket error: %s%#m", s->err);
1445:
1446: sockaddr_fill(&sa, af, bind_addr, s->iface, bind_port);
1447: if (bind(fd, &sa.sa, SA_LEN(sa)) < 0)
1448: ERR2("bind");
1449: }
1450:
1451: if (s->password)
1452: if (sk_set_md5_auth(s, s->saddr, s->daddr, s->iface, s->password, 0) < 0)
1453: goto err;
1454:
1455: switch (s->type)
1456: {
1457: case SK_TCP_ACTIVE:
1458: sockaddr_fill(&sa, af, s->daddr, s->iface, s->dport);
1459: if (connect(fd, &sa.sa, SA_LEN(sa)) >= 0)
1460: sk_tcp_connected(s);
1461: else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS &&
1462: errno != ECONNREFUSED && errno != EHOSTUNREACH && errno != ENETUNREACH)
1463: ERR2("connect");
1464: break;
1465:
1466: case SK_TCP_PASSIVE:
1467: if (listen(fd, 8) < 0)
1468: ERR2("listen");
1469: break;
1470:
1471: case SK_MAGIC:
1472: break;
1473:
1474: default:
1475: sk_alloc_bufs(s);
1476: }
1477:
1478: if (!(s->flags & SKF_THREAD))
1479: sk_insert(s);
1480: return 0;
1481:
1482: err:
1483: close(fd);
1484: s->fd = -1;
1485: return -1;
1486: }
1487:
1488: int
1489: sk_open_unix(sock *s, char *name)
1490: {
1491: struct sockaddr_un sa;
1492: int fd;
1493:
1494: /* We are sloppy during error (leak fd and not set s->err), but we die anyway */
1495:
1496: fd = socket(AF_UNIX, SOCK_STREAM, 0);
1497: if (fd < 0)
1498: return -1;
1499:
1500: if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
1501: return -1;
1502:
1503: /* Path length checked in test_old_bird() */
1504: sa.sun_family = AF_UNIX;
1505: strcpy(sa.sun_path, name);
1506:
1507: if (bind(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) < 0)
1508: return -1;
1509:
1510: if (listen(fd, 8) < 0)
1511: return -1;
1512:
1513: s->fd = fd;
1514: sk_insert(s);
1515: return 0;
1516: }
1517:
1518:
/*
 * Sizes of the ancillary-data buffers used by sk_recvmsg() and sk_sendmsg():
 * large enough for either address family. RX reserves room for packet info
 * plus TTL, TX for packet info only.
 */
#define CMSG_RX_SPACE MAX(CMSG4_SPACE_PKTINFO+CMSG4_SPACE_TTL, \
                          CMSG6_SPACE_PKTINFO+CMSG6_SPACE_TTL)
#define CMSG_TX_SPACE MAX(CMSG4_SPACE_PKTINFO,CMSG6_SPACE_PKTINFO)
1522:
1523: static void
1524: sk_prepare_cmsgs(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
1525: {
1526: if (sk_is_ipv4(s))
1527: sk_prepare_cmsgs4(s, msg, cbuf, cbuflen);
1528: else
1529: sk_prepare_cmsgs6(s, msg, cbuf, cbuflen);
1530: }
1531:
1532: static void
1533: sk_process_cmsgs(sock *s, struct msghdr *msg)
1534: {
1535: struct cmsghdr *cm;
1536:
1537: s->laddr = IPA_NONE;
1538: s->lifindex = 0;
1539: s->rcv_ttl = -1;
1540:
1541: for (cm = CMSG_FIRSTHDR(msg); cm != NULL; cm = CMSG_NXTHDR(msg, cm))
1542: {
1543: if ((cm->cmsg_level == SOL_IP) && sk_is_ipv4(s))
1544: {
1545: sk_process_cmsg4_pktinfo(s, cm);
1546: sk_process_cmsg4_ttl(s, cm);
1547: }
1548:
1549: if ((cm->cmsg_level == SOL_IPV6) && sk_is_ipv6(s))
1550: {
1551: sk_process_cmsg6_pktinfo(s, cm);
1552: sk_process_cmsg6_ttl(s, cm);
1553: }
1554: }
1555: }
1556:
1557:
1558: static inline int
1559: sk_sendmsg(sock *s)
1560: {
1561: struct iovec iov = {s->tbuf, s->tpos - s->tbuf};
1562: byte cmsg_buf[CMSG_TX_SPACE];
1563: sockaddr dst;
1564:
1565: sockaddr_fill(&dst, s->af, s->daddr, s->iface, s->dport);
1566:
1567: struct msghdr msg = {
1568: .msg_name = &dst.sa,
1569: .msg_namelen = SA_LEN(dst),
1570: .msg_iov = &iov,
1571: .msg_iovlen = 1
1572: };
1573:
1574: #ifdef CONFIG_USE_HDRINCL
1575: byte hdr[20];
1576: struct iovec iov2[2] = { {hdr, 20}, iov };
1577:
1578: if (s->flags & SKF_HDRINCL)
1579: {
1580: sk_prepare_ip_header(s, hdr, iov.iov_len);
1581: msg.msg_iov = iov2;
1582: msg.msg_iovlen = 2;
1583: }
1584: #endif
1585:
1586: if (s->flags & SKF_PKTINFO)
1587: sk_prepare_cmsgs(s, &msg, cmsg_buf, sizeof(cmsg_buf));
1588:
1589: return sendmsg(s->fd, &msg, 0);
1590: }
1591:
1592: static inline int
1593: sk_recvmsg(sock *s)
1594: {
1595: struct iovec iov = {s->rbuf, s->rbsize};
1596: byte cmsg_buf[CMSG_RX_SPACE];
1597: sockaddr src;
1598:
1599: struct msghdr msg = {
1600: .msg_name = &src.sa,
1601: .msg_namelen = sizeof(src), // XXXX ??
1602: .msg_iov = &iov,
1603: .msg_iovlen = 1,
1604: .msg_control = cmsg_buf,
1605: .msg_controllen = sizeof(cmsg_buf),
1606: .msg_flags = 0
1607: };
1608:
1609: int rv = recvmsg(s->fd, &msg, 0);
1610: if (rv < 0)
1611: return rv;
1612:
1613: //ifdef IPV4
1614: // if (cf_type == SK_IP)
1615: // rv = ipv4_skip_header(pbuf, rv);
1616: //endif
1617:
1618: sockaddr_read(&src, s->af, &s->faddr, NULL, &s->fport);
1619: sk_process_cmsgs(s, &msg);
1620:
1621: if (msg.msg_flags & MSG_TRUNC)
1622: s->flags |= SKF_TRUNCATED;
1623: else
1624: s->flags &= ~SKF_TRUNCATED;
1625:
1626: return rv;
1627: }
1628:
1629:
1630: static inline void reset_tx_buffer(sock *s) { s->ttx = s->tpos = s->tbuf; }
1631:
1632: static int
1633: sk_maybe_write(sock *s)
1634: {
1635: int e;
1636:
1637: switch (s->type)
1638: {
1639: case SK_TCP:
1640: case SK_MAGIC:
1641: case SK_UNIX:
1642: while (s->ttx != s->tpos)
1643: {
1644: e = write(s->fd, s->ttx, s->tpos - s->ttx);
1645:
1646: if (e < 0)
1647: {
1648: if (errno != EINTR && errno != EAGAIN)
1649: {
1650: reset_tx_buffer(s);
1651: /* EPIPE is just a connection close notification during TX */
1652: s->err_hook(s, (errno != EPIPE) ? errno : 0);
1653: return -1;
1654: }
1655: return 0;
1656: }
1657: s->ttx += e;
1658: }
1659: reset_tx_buffer(s);
1660: return 1;
1661:
1662: case SK_UDP:
1663: case SK_IP:
1664: {
1665: if (s->tbuf == s->tpos)
1666: return 1;
1667:
1668: e = sk_sendmsg(s);
1669:
1670: if (e < 0)
1671: {
1672: if (errno != EINTR && errno != EAGAIN)
1673: {
1674: reset_tx_buffer(s);
1675: s->err_hook(s, errno);
1676: return -1;
1677: }
1678:
1679: if (!s->tx_hook)
1680: reset_tx_buffer(s);
1681: return 0;
1682: }
1683: reset_tx_buffer(s);
1684: return 1;
1685: }
1686: default:
1687: bug("sk_maybe_write: unknown socket type %d", s->type);
1688: }
1689: }
1690:
1691: int
1692: sk_rx_ready(sock *s)
1693: {
1694: int rv;
1695: struct pollfd pfd = { .fd = s->fd };
1696: pfd.events |= POLLIN;
1697:
1698: redo:
1699: rv = poll(&pfd, 1, 0);
1700:
1701: if ((rv < 0) && (errno == EINTR || errno == EAGAIN))
1702: goto redo;
1703:
1704: return rv;
1705: }
1706:
1707: /**
1708: * sk_send - send data to a socket
1709: * @s: socket
1710: * @len: number of bytes to send
1711: *
1712: * This function sends @len bytes of data prepared in the
1713: * transmit buffer of the socket @s to the network connection.
1714: * If the packet can be sent immediately, it does so and returns
1715: * 1, else it queues the packet for later processing, returns 0
1716: * and calls the @tx_hook of the socket when the tranmission
1717: * takes place.
1718: */
1719: int
1720: sk_send(sock *s, unsigned len)
1721: {
1722: s->ttx = s->tbuf;
1723: s->tpos = s->tbuf + len;
1724: return sk_maybe_write(s);
1725: }
1726:
1727: /**
1728: * sk_send_to - send data to a specific destination
1729: * @s: socket
1730: * @len: number of bytes to send
1731: * @addr: IP address to send the packet to
1732: * @port: port to send the packet to
1733: *
1734: * This is a sk_send() replacement for connection-less packet sockets
1735: * which allows destination of the packet to be chosen dynamically.
1736: * Raw IP sockets should use 0 for @port.
1737: */
1738: int
1739: sk_send_to(sock *s, unsigned len, ip_addr addr, unsigned port)
1740: {
1741: s->daddr = addr;
1742: if (port)
1743: s->dport = port;
1744:
1745: s->ttx = s->tbuf;
1746: s->tpos = s->tbuf + len;
1747: return sk_maybe_write(s);
1748: }
1749:
1750: /*
1751: int
1752: sk_send_full(sock *s, unsigned len, struct iface *ifa,
1753: ip_addr saddr, ip_addr daddr, unsigned dport)
1754: {
1755: s->iface = ifa;
1756: s->saddr = saddr;
1757: s->daddr = daddr;
1758: s->dport = dport;
1759: s->ttx = s->tbuf;
1760: s->tpos = s->tbuf + len;
1761: return sk_maybe_write(s);
1762: }
1763: */
1764:
1765: /* sk_read() and sk_write() are called from BFD's event loop */
1766:
1767: int
1768: sk_read(sock *s, int revents)
1769: {
1770: switch (s->type)
1771: {
1772: case SK_TCP_PASSIVE:
1773: return sk_passive_connected(s, SK_TCP);
1774:
1775: case SK_UNIX_PASSIVE:
1776: return sk_passive_connected(s, SK_UNIX);
1777:
1778: case SK_TCP:
1779: case SK_UNIX:
1780: {
1781: int c = read(s->fd, s->rpos, s->rbuf + s->rbsize - s->rpos);
1782:
1783: if (c < 0)
1784: {
1785: if (errno != EINTR && errno != EAGAIN)
1786: s->err_hook(s, errno);
1787: else if (errno == EAGAIN && !(revents & POLLIN))
1788: {
1789: log(L_ERR "Got EAGAIN from read when revents=%x (without POLLIN)", revents);
1790: s->err_hook(s, 0);
1791: }
1792: }
1793: else if (!c)
1794: s->err_hook(s, 0);
1795: else
1796: {
1797: s->rpos += c;
1798: if (s->rx_hook(s, s->rpos - s->rbuf))
1799: {
1800: /* We need to be careful since the socket could have been deleted by the hook */
1801: if (current_sock == s)
1802: s->rpos = s->rbuf;
1803: }
1804: return 1;
1805: }
1806: return 0;
1807: }
1808:
1809: case SK_MAGIC:
1810: return s->rx_hook(s, 0);
1811:
1812: default:
1813: {
1814: int e = sk_recvmsg(s);
1815:
1816: if (e < 0)
1817: {
1818: if (errno != EINTR && errno != EAGAIN)
1819: s->err_hook(s, errno);
1820: return 0;
1821: }
1822:
1823: s->rpos = s->rbuf + e;
1824: s->rx_hook(s, e);
1825: return 1;
1826: }
1827: }
1828: }
1829:
1830: int
1831: sk_write(sock *s)
1832: {
1833: switch (s->type)
1834: {
1835: case SK_TCP_ACTIVE:
1836: {
1837: sockaddr sa;
1838: sockaddr_fill(&sa, s->af, s->daddr, s->iface, s->dport);
1839:
1840: if (connect(s->fd, &sa.sa, SA_LEN(sa)) >= 0 || errno == EISCONN)
1841: sk_tcp_connected(s);
1842: else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS)
1843: s->err_hook(s, errno);
1844: return 0;
1845: }
1846:
1847: default:
1848: if (s->ttx != s->tpos && sk_maybe_write(s) > 0)
1849: {
1850: if (s->tx_hook)
1851: s->tx_hook(s);
1852: return 1;
1853: }
1854: return 0;
1855: }
1856: }
1857:
1858: void
1859: sk_err(sock *s, int revents)
1860: {
1861: int se = 0, sse = sizeof(se);
1862: if ((s->type != SK_MAGIC) && (revents & POLLERR))
1863: if (getsockopt(s->fd, SOL_SOCKET, SO_ERROR, &se, &sse) < 0)
1864: {
1865: log(L_ERR "IO: Socket error: SO_ERROR: %m");
1866: se = 0;
1867: }
1868:
1869: s->err_hook(s, se);
1870: }
1871:
1872: void
1873: sk_dump_all(void)
1874: {
1875: node *n;
1876: sock *s;
1877:
1878: debug("Open sockets:\n");
1879: WALK_LIST(n, sock_list)
1880: {
1881: s = SKIP_BACK(sock, n, n);
1882: debug("%p ", s);
1883: sk_dump(&s->r);
1884: }
1885: debug("\n");
1886: }
1887:
1888:
1889: /*
1890: * Internal event log and watchdog
1891: */
1892:
1893: #define EVENT_LOG_LENGTH 32
1894:
1895: struct event_log_entry
1896: {
1897: void *hook;
1898: void *data;
1899: btime timestamp;
1900: btime duration;
1901: };
1902:
1903: static struct event_log_entry event_log[EVENT_LOG_LENGTH];
1904: static struct event_log_entry *event_open;
1905: static int event_log_pos, event_log_num, watchdog_active;
1906: static btime last_time;
1907: static btime loop_time;
1908:
1909: static void
1910: io_update_time(void)
1911: {
1912: struct timespec ts;
1913: int rv;
1914:
1915: if (!clock_monotonic_available)
1916: return;
1917:
1918: /*
1919: * This is third time-tracking procedure (after update_times() above and
1920: * times_update() in BFD), dedicated to internal event log and latency
1921: * tracking. Hopefully, we consolidate these sometimes.
1922: */
1923:
1924: rv = clock_gettime(CLOCK_MONOTONIC, &ts);
1925: if (rv < 0)
1926: die("clock_gettime: %m");
1927:
1928: last_time = ((s64) ts.tv_sec S) + (ts.tv_nsec / 1000);
1929:
1930: if (event_open)
1931: {
1932: event_open->duration = last_time - event_open->timestamp;
1933:
1934: if (event_open->duration > config->latency_limit)
1935: log(L_WARN "Event 0x%p 0x%p took %d ms",
1936: event_open->hook, event_open->data, (int) (event_open->duration TO_MS));
1937:
1938: event_open = NULL;
1939: }
1940: }
1941:
1942: /**
1943: * io_log_event - mark approaching event into event log
1944: * @hook: event hook address
1945: * @data: event data address
1946: *
1947: * Store info (hook, data, timestamp) about the following internal event into
1948: * a circular event log (@event_log). When latency tracking is enabled, the log
1949: * entry is kept open (in @event_open) so the duration can be filled later.
1950: */
1951: void
1952: io_log_event(void *hook, void *data)
1953: {
1954: if (config->latency_debug)
1955: io_update_time();
1956:
1957: struct event_log_entry *en = event_log + event_log_pos;
1958:
1959: en->hook = hook;
1960: en->data = data;
1961: en->timestamp = last_time;
1962: en->duration = 0;
1963:
1964: event_log_num++;
1965: event_log_pos++;
1966: event_log_pos %= EVENT_LOG_LENGTH;
1967:
1968: event_open = config->latency_debug ? en : NULL;
1969: }
1970:
1971: static inline void
1972: io_close_event(void)
1973: {
1974: if (event_open)
1975: io_update_time();
1976: }
1977:
1978: void
1979: io_log_dump(void)
1980: {
1981: int i;
1982:
1983: log(L_DEBUG "Event log:");
1984: for (i = 0; i < EVENT_LOG_LENGTH; i++)
1985: {
1986: struct event_log_entry *en = event_log + (event_log_pos + i) % EVENT_LOG_LENGTH;
1987: if (en->hook)
1988: log(L_DEBUG " Event 0x%p 0x%p at %8d for %d ms", en->hook, en->data,
1989: (int) ((last_time - en->timestamp) TO_MS), (int) (en->duration TO_MS));
1990: }
1991: }
1992:
1993: void
1994: watchdog_sigalrm(int sig UNUSED)
1995: {
1996: /* Update last_time and duration, but skip latency check */
1997: config->latency_limit = 0xffffffff;
1998: io_update_time();
1999:
2000: /* We want core dump */
2001: abort();
2002: }
2003:
2004: static inline void
2005: watchdog_start1(void)
2006: {
2007: io_update_time();
2008:
2009: loop_time = last_time;
2010: }
2011:
2012: static inline void
2013: watchdog_start(void)
2014: {
2015: io_update_time();
2016:
2017: loop_time = last_time;
2018: event_log_num = 0;
2019:
2020: if (config->watchdog_timeout)
2021: {
2022: alarm(config->watchdog_timeout);
2023: watchdog_active = 1;
2024: }
2025: }
2026:
2027: static inline void
2028: watchdog_stop(void)
2029: {
2030: io_update_time();
2031:
2032: if (watchdog_active)
2033: {
2034: alarm(0);
2035: watchdog_active = 0;
2036: }
2037:
2038: btime duration = last_time - loop_time;
2039: if (duration > config->watchdog_warning)
2040: log(L_WARN "I/O loop cycle took %d ms for %d events",
2041: (int) (duration TO_MS), event_log_num);
2042: }
2043:
2044:
2045: /*
2046: * Main I/O Loop
2047: */
2048:
/*
 * Requests for asynchronous actions. io_loop() checks these flags before
 * each poll(), runs the matching async_*() handler and clears the flag.
 */
volatile int async_config_flag;		/* Asynchronous reconfiguration scheduled */
volatile int async_dump_flag;		/* Asynchronous dump scheduled */
volatile int async_shutdown_flag;	/* Asynchronous shutdown scheduled */
2052:
2053: void
2054: io_init(void)
2055: {
2056: init_list(&near_timers);
2057: init_list(&far_timers);
2058: init_list(&sock_list);
2059: init_list(&global_event_list);
2060: krt_io_init();
2061: init_times();
2062: update_times();
2063: boot_time = now;
2064: srandom((int) now_real);
2065: }
2066:
2067: static int short_loops = 0;
2068: #define SHORT_LOOP_MAX 10
2069:
2070: void
2071: io_loop(void)
2072: {
2073: int poll_tout;
2074: time_t tout;
2075: int nfds, events, pout;
2076: sock *s;
2077: node *n;
2078: int fdmax = 256;
2079: struct pollfd *pfd = xmalloc(fdmax * sizeof(struct pollfd));
2080:
2081: watchdog_start1();
2082: for(;;)
2083: {
2084: events = ev_run_list(&global_event_list);
2085: timers:
2086: update_times();
2087: tout = tm_first_shot();
2088: if (tout <= now)
2089: {
2090: tm_shot();
2091: goto timers;
2092: }
2093: poll_tout = (events ? 0 : MIN(tout - now, 3)) * 1000; /* Time in milliseconds */
2094:
2095: io_close_event();
2096:
2097: nfds = 0;
2098: WALK_LIST(n, sock_list)
2099: {
2100: pfd[nfds] = (struct pollfd) { .fd = -1 }; /* everything other set to 0 by this */
2101: s = SKIP_BACK(sock, n, n);
2102: if (s->rx_hook)
2103: {
2104: pfd[nfds].fd = s->fd;
2105: pfd[nfds].events |= POLLIN;
2106: }
2107: if (s->tx_hook && s->ttx != s->tpos)
2108: {
2109: pfd[nfds].fd = s->fd;
2110: pfd[nfds].events |= POLLOUT;
2111: }
2112: if (pfd[nfds].fd != -1)
2113: {
2114: s->index = nfds;
2115: nfds++;
2116: }
2117: else
2118: s->index = -1;
2119:
2120: if (nfds >= fdmax)
2121: {
2122: fdmax *= 2;
2123: pfd = xrealloc(pfd, fdmax * sizeof(struct pollfd));
2124: }
2125: }
2126:
2127: /*
2128: * Yes, this is racy. But even if the signal comes before this test
2129: * and entering poll(), it gets caught on the next timer tick.
2130: */
2131:
2132: if (async_config_flag)
2133: {
2134: io_log_event(async_config, NULL);
2135: async_config();
2136: async_config_flag = 0;
2137: continue;
2138: }
2139: if (async_dump_flag)
2140: {
2141: io_log_event(async_dump, NULL);
2142: async_dump();
2143: async_dump_flag = 0;
2144: continue;
2145: }
2146: if (async_shutdown_flag)
2147: {
2148: io_log_event(async_shutdown, NULL);
2149: async_shutdown();
2150: async_shutdown_flag = 0;
2151: continue;
2152: }
2153:
2154: /* And finally enter poll() to find active sockets */
2155: watchdog_stop();
2156: pout = poll(pfd, nfds, poll_tout);
2157: watchdog_start();
2158:
2159: if (pout < 0)
2160: {
2161: if (errno == EINTR || errno == EAGAIN)
2162: continue;
2163: die("poll: %m");
2164: }
2165: if (pout)
2166: {
2167: /* guaranteed to be non-empty */
2168: current_sock = SKIP_BACK(sock, n, HEAD(sock_list));
2169:
2170: while (current_sock)
2171: {
2172: sock *s = current_sock;
2173: if (s->index == -1)
2174: {
2175: current_sock = sk_next(s);
2176: goto next;
2177: }
2178:
2179: int e;
2180: int steps;
2181:
2182: steps = MAX_STEPS;
2183: if (s->fast_rx && (pfd[s->index].revents & POLLIN) && s->rx_hook)
2184: do
2185: {
2186: steps--;
2187: io_log_event(s->rx_hook, s->data);
2188: e = sk_read(s, pfd[s->index].revents);
2189: if (s != current_sock)
2190: goto next;
2191: }
2192: while (e && s->rx_hook && steps);
2193:
2194: steps = MAX_STEPS;
2195: if (pfd[s->index].revents & POLLOUT)
2196: do
2197: {
2198: steps--;
2199: io_log_event(s->tx_hook, s->data);
2200: e = sk_write(s);
2201: if (s != current_sock)
2202: goto next;
2203: }
2204: while (e && steps);
2205:
2206: current_sock = sk_next(s);
2207: next: ;
2208: }
2209:
2210: short_loops++;
2211: if (events && (short_loops < SHORT_LOOP_MAX))
2212: continue;
2213: short_loops = 0;
2214:
2215: int count = 0;
2216: current_sock = stored_sock;
2217: if (current_sock == NULL)
2218: current_sock = SKIP_BACK(sock, n, HEAD(sock_list));
2219:
2220: while (current_sock && count < MAX_RX_STEPS)
2221: {
2222: sock *s = current_sock;
2223: if (s->index == -1)
2224: {
2225: current_sock = sk_next(s);
2226: goto next2;
2227: }
2228:
2229: if (!s->fast_rx && (pfd[s->index].revents & POLLIN) && s->rx_hook)
2230: {
2231: count++;
2232: io_log_event(s->rx_hook, s->data);
2233: sk_read(s, pfd[s->index].revents);
2234: if (s != current_sock)
2235: goto next2;
2236: }
2237:
2238: if (pfd[s->index].revents & (POLLHUP | POLLERR))
2239: {
2240: sk_err(s, pfd[s->index].revents);
2241: if (s != current_sock)
2242: goto next2;
2243: }
2244:
2245: current_sock = sk_next(s);
2246: next2: ;
2247: }
2248:
2249:
2250: stored_sock = current_sock;
2251: }
2252: }
2253: }
2254:
/*
 * test_old_bird - refuse to start when another instance is running
 * @path: path of the control socket
 *
 * Tries to connect to the UNIX-domain socket at @path and dies when the
 * connection succeeds. It also dies when the socket cannot be created or
 * when @path does not fit into a sockaddr_un.
 */
void
test_old_bird(char *path)
{
  struct sockaddr_un sa;
  int fd = socket(AF_UNIX, SOCK_STREAM, 0);

  if (fd < 0)
    die("Cannot create socket: %m");

  if (strlen(path) >= sizeof(sa.sun_path))
    die("Socket path too long");

  bzero(&sa, sizeof(sa));
  sa.sun_family = AF_UNIX;
  strcpy(sa.sun_path, path);

  /* A successful connect means somebody is listening on the socket */
  if (connect(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) == 0)
    die("I found another BIRD running.");

  close(fd);
}
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>