Annotation of embedaddon/quagga/watchquagga/watchquagga.c, revision 1.1
1.1 ! misho 1: /*
! 2: Monitor status of quagga daemons and restart if necessary.
! 3:
! 4: Copyright (C) 2004 Andrew J. Schorr
! 5:
! 6: This program is free software; you can redistribute it and/or modify
! 7: it under the terms of the GNU General Public License as published by
! 8: the Free Software Foundation; either version 2 of the License, or
! 9: (at your option) any later version.
! 10:
! 11: This program is distributed in the hope that it will be useful,
! 12: but WITHOUT ANY WARRANTY; without even the implied warranty of
! 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! 14: GNU General Public License for more details.
! 15:
! 16: You should have received a copy of the GNU General Public License
! 17: along with this program; if not, write to the Free Software
! 18: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
! 19: */
! 20:
! 21: #include <zebra.h>
! 22: #include <thread.h>
! 23: #include <log.h>
! 24: #include <network.h>
! 25: #include <sigevent.h>
! 26: #include <lib/version.h>
! 27: #include <getopt.h>
! 28: #include <sys/un.h>
! 29: #include <sys/wait.h>
! 30:
/* Smaller of two values.  Function-like macro: each argument may be
   evaluated more than once, so avoid arguments with side effects. */
#if !defined(MIN)
#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/* Timer randomization helpers.
   JITTER(X): pseudo-random offset obtained from random(), lying between
              -(X)/2 and (X)-(X)/2 for non-negative X.
   FUZZY(X):  X perturbed by JITTER applied to one twentieth of X. */
#define JITTER(X) ((random() % ((X) + 1)) - ((X) / 2))
#define FUZZY(X)  ((X) + JITTER((X) / 20))
! 38:
/* Built-in defaults for the command-line tunables (the usage text
   reports the timing values in seconds). */
#define DEFAULT_PERIOD          5
#define DEFAULT_TIMEOUT         10
#define DEFAULT_RESTART_TIMEOUT 20
#define DEFAULT_LOGLEVEL        LOG_INFO
#define DEFAULT_MIN_RESTART     60
#define DEFAULT_MAX_RESTART     600

/* Pid file location: a build-time override wins over STATEDIR. */
#if defined(PATH_WATCHQUAGGA_PID)
#define DEFAULT_PIDFILE PATH_WATCHQUAGGA_PID
#else
#define DEFAULT_PIDFILE STATEDIR "/watchquagga.pid"
#endif

/* Directory holding the daemons' vty sockets. */
#if defined(DAEMON_VTY_DIR)
#define VTYDIR DAEMON_VTY_DIR
#else
#define VTYDIR STATEDIR
#endif

/* Token sent with the echo command and expected back in the reply. */
#define PING_TOKEN "PING"
! 57:
/* Thread master used for every timer and I/O callback in this program.
   It must have external linkage: code inside libzebra refers to it. */
struct thread_master *master;
! 60:
/* Operating mode of the watchdog.  The numeric values double as indices
   into mode_str[], so the order here must match that table. */
typedef enum {
  MODE_MONITOR = 0,            /* only report status changes */
  MODE_GLOBAL_RESTART,         /* one command restarts everything */
  MODE_SEPARATE_RESTART,       /* restart just the failing daemon */
  MODE_PHASED_ZEBRA_RESTART,   /* phased restart when zebra fails */
  MODE_PHASED_ALL_RESTART      /* phased restart for any failure */
} watch_mode_t;
! 69:
/* Human-readable mode names, indexed by watch_mode_t value. */
static const char *mode_str[] = {
  "monitor",
  "global restart",
  "individual daemon restart",
  "phased zebra restart",
  "phased global restart for any failure",
};
! 78:
/* Steps of a phased restart, in the order they are traversed.  The
   values index phase_str[]. */
typedef enum {
  PHASE_NONE = 0,               /* no phased restart in progress */
  PHASE_STOPS_PENDING,          /* stop jobs still running */
  PHASE_WAITING_DOWN,           /* waiting for the other daemons to go down */
  PHASE_ZEBRA_RESTART_PENDING,  /* restart job of the special daemon running */
  PHASE_WAITING_ZEBRA_UP        /* waiting for the special daemon to come up */
} restart_phase_t;
! 87:
/* Descriptions of the restart phases, indexed by restart_phase_t value.
   NOTE(review): the last entry has no matching restart_phase_t constant
   in the enum as currently declared. */
static const char *phase_str[] = {
  "None",
  "Stop jobs running",
  "Waiting for other daemons to come down",
  "Zebra restart job running",
  "Waiting for zebra to come up",
  "Start jobs running",
};
! 97:
/* Longest time a single restart phase may last before the phased
   restart is abandoned: three times the configured restart timeout. */
#define PHASE_TIMEOUT (3 * gs.restart_timeout)
! 99:
/* Bookkeeping for one background command (restart, start or stop job). */
struct restart_info
{
  const char *name;       /* target name, substituted into the command */
  const char *what;       /* kind of job last launched, used in log text */
  pid_t pid;              /* child running the job; 0 when none is running */
  struct timeval time;    /* when the job was launched or last completed */
  long interval;          /* minimum seconds to wait before the next job */
  struct thread *t_kill;  /* timer that signals a job running too long */
  int kills;              /* signals already sent to the current job */
};
! 110:
! 111: static struct global_state
! 112: {
! 113: watch_mode_t mode;
! 114: restart_phase_t phase;
! 115: struct thread *t_phase_hanging;
! 116: const char *vtydir;
! 117: long period;
! 118: long timeout;
! 119: long restart_timeout;
! 120: long min_restart_interval;
! 121: long max_restart_interval;
! 122: int do_ping;
! 123: struct daemon *daemons;
! 124: const char *restart_command;
! 125: const char *start_command;
! 126: const char *stop_command;
! 127: struct restart_info restart;
! 128: int unresponsive_restart;
! 129: int loglevel;
! 130: struct daemon *special; /* points to zebra when doing phased restart */
! 131: int numdaemons;
! 132: int numpids;
! 133: int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
! 134: } gs = {
! 135: .mode = MODE_MONITOR,
! 136: .phase = PHASE_NONE,
! 137: .vtydir = VTYDIR,
! 138: .period = 1000*DEFAULT_PERIOD,
! 139: .timeout = DEFAULT_TIMEOUT,
! 140: .restart_timeout = DEFAULT_RESTART_TIMEOUT,
! 141: .loglevel = DEFAULT_LOGLEVEL,
! 142: .min_restart_interval = DEFAULT_MIN_RESTART,
! 143: .max_restart_interval = DEFAULT_MAX_RESTART,
! 144: .do_ping = 1,
! 145: };
! 146:
/* Connection state of a monitored daemon; values index state_str[]. */
typedef enum {
  DAEMON_INIT,          /* no connection attempt has completed yet */
  DAEMON_DOWN,          /* not connected */
  DAEMON_CONNECTING,    /* non-blocking connect in progress */
  DAEMON_UP,            /* connected */
  DAEMON_UNRESPONSIVE   /* connected, but an echo request timed out */
} daemon_state_t;
! 155:
/* Evaluates to 1 when the daemon holds a live connection (UP or
   UNRESPONSIVE) and to 0 otherwise; the result is used arithmetically. */
#define IS_UP(DMN)                                                     \
  (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
! 158:
/* Printable daemon state names, indexed by daemon_state_t value. */
static const char *state_str[] = {
  "Init",
  "Down",
  "Connecting",
  "Up",
  "Unresponsive",
};
! 167:
! 168: struct daemon {
! 169: const char *name;
! 170: daemon_state_t state;
! 171: int fd;
! 172: struct timeval echo_sent;
! 173: u_int connect_tries;
! 174: struct thread *t_wakeup;
! 175: struct thread *t_read;
! 176: struct thread *t_write;
! 177: struct daemon *next;
! 178: struct restart_info restart;
! 179: };
! 180:
/* Long command-line options and the short option letter each maps to. */
static const struct option longopts[] = {
  { "daemon",               no_argument,       NULL, 'd' },
  { "statedir",             required_argument, NULL, 'S' },
  { "no-echo",              no_argument,       NULL, 'e' },
  { "loglevel",             required_argument, NULL, 'l' },
  { "interval",             required_argument, NULL, 'i' },
  { "timeout",              required_argument, NULL, 't' },
  { "restart-timeout",      required_argument, NULL, 'T' },
  { "restart",              required_argument, NULL, 'r' },
  { "start-command",        required_argument, NULL, 's' },
  { "kill-command",         required_argument, NULL, 'k' },
  { "restart-all",          required_argument, NULL, 'R' },
  { "all-restart",          no_argument,       NULL, 'a' },
  { "always-all-restart",   no_argument,       NULL, 'A' },
  { "unresponsive-restart", no_argument,       NULL, 'z' },
  { "min-restart-interval", required_argument, NULL, 'm' },
  { "max-restart-interval", required_argument, NULL, 'M' },
  { "pid-file",             required_argument, NULL, 'p' },
  { "blank-string",         required_argument, NULL, 'b' },
  { "help",                 no_argument,       NULL, 'h' },
  { "version",              no_argument,       NULL, 'v' },
  { NULL, 0, NULL, 0 }      /* end-of-table marker */
};
! 205:
/* Forward declarations for functions that are called before the point
   where they are defined. */
static int  try_connect(struct daemon *dmn);
static int  wakeup_send_echo(struct thread *t_wakeup);
static void try_restart(struct daemon *dmn);
static void phase_check(void);
! 210:
! 211: static int
! 212: usage(const char *progname, int status)
! 213: {
! 214: if (status != 0)
! 215: fprintf(stderr, "Try `%s --help' for more information.\n", progname);
! 216: else
! 217: printf("Usage : %s [OPTION...] <daemon name> ...\n\n\
! 218: Watchdog program to monitor status of quagga daemons and try to restart\n\
! 219: them if they are down or unresponsive. It determines whether a daemon is\n\
! 220: up based on whether it can connect to the daemon's vty unix stream socket.\n\
! 221: It then repeatedly sends echo commands over that socket to determine whether\n\
! 222: the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
! 223: on the socket connection and know immediately that the daemon is down.\n\n\
! 224: The daemons to be monitored should be listed on the command line.\n\n\
! 225: This program can run in one of 5 modes:\n\n\
! 226: 0. Mode: %s.\n\
! 227: Just monitor and report on status changes. Example:\n\
! 228: %s -d zebra ospfd bgpd\n\n\
! 229: 1. Mode: %s.\n\
! 230: Whenever any daemon hangs or crashes, use the given command to restart\n\
! 231: them all. Example:\n\
! 232: %s -dz \\\n\
! 233: -R '/sbin/service zebra restart; /sbin/service ospfd restart' \\\n\
! 234: zebra ospfd\n\n\
! 235: 2. Mode: %s.\n\
! 236: When any single daemon hangs or crashes, restart only the daemon that's\n\
! 237: in trouble using the supplied restart command. Example:\n\
! 238: %s -dz -r '/sbin/service %%s restart' zebra ospfd bgpd\n\n\
! 239: 3. Mode: %s.\n\
! 240: The same as the previous mode, except that there is special treatment when\n\
! 241: the zebra daemon is in trouble. In that case, a phased restart approach\n\
! 242: is used: 1. stop all other daemons; 2. restart zebra; 3. start the other\n\
! 243: daemons. Example:\n\
! 244: %s -adz -r '/sbin/service %%s restart' \\\n\
! 245: -s '/sbin/service %%s start' \\\n\
! 246: -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
! 247: 4. Mode: %s.\n\
! 248: This is the same as the previous mode, except that the phased restart\n\
! 249: procedure is used whenever any of the daemons hangs or crashes. Example:\n\
! 250: %s -Adz -r '/sbin/service %%s restart' \\\n\
! 251: -s '/sbin/service %%s start' \\\n\
! 252: -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
! 253: As of this writing, it is believed that mode 2 [%s]\n\
! 254: is not safe, and mode 3 [%s] may not be safe with some of the\n\
! 255: routing daemons.\n\n\
! 256: In order to avoid attempting to restart the daemons in a fast loop,\n\
! 257: the -m and -M options allow you to control the minimum delay between\n\
! 258: restart commands. The minimum restart delay is recalculated each time\n\
! 259: a restart is attempted: if the time since the last restart attempt exceeds\n\
! 260: twice the -M value, then the restart delay is set to the -m value.\n\
! 261: Otherwise, the interval is doubled (but capped at the -M value).\n\n\
! 262: Options:\n\
! 263: -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
! 264: to syslog instead of stdout.\n\
! 265: -S, --statedir Set the vty socket directory (default is %s)\n\
! 266: -e, --no-echo Do not ping the daemons to test responsiveness (this\n\
! 267: option is necessary if the daemons do not support the\n\
! 268: echo command)\n\
! 269: -l, --loglevel Set the logging level (default is %d).\n\
! 270: The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
! 271: but it can be set higher than %d if extra-verbose debugging\n\
! 272: messages are desired.\n\
! 273: -m, --min-restart-interval\n\
! 274: Set the minimum seconds to wait between invocations of daemon\n\
! 275: restart commands (default is %d).\n\
! 276: -M, --max-restart-interval\n\
! 277: Set the maximum seconds to wait between invocations of daemon\n\
! 278: restart commands (default is %d).\n\
! 279: -i, --interval Set the status polling interval in seconds (default is %d)\n\
! 280: -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
! 281: -T, --restart-timeout\n\
! 282: Set the restart (kill) timeout in seconds (default is %d).\n\
! 283: If any background jobs are still running after this much\n\
! 284: time has elapsed, they will be killed.\n\
! 285: -r, --restart Supply a Bourne shell command to use to restart a single\n\
! 286: daemon. The command string should include '%%s' where the\n\
! 287: name of the daemon should be substituted.\n\
! 288: Note that -r and -R are incompatible.\n\
! 289: -s, --start-command\n\
! 290: Supply a Bourne shell to command to use to start a single\n\
! 291: daemon. The command string should include '%%s' where the\n\
! 292: name of the daemon should be substituted.\n\
! 293: -k, --kill-command\n\
! 294: Supply a Bourne shell to command to use to stop a single\n\
! 295: daemon. The command string should include '%%s' where the\n\
! 296: name of the daemon should be substituted.\n\
! 297: -R, --restart-all\n\
! 298: When one or more daemons is down, try to restart everything\n\
! 299: using the Bourne shell command supplied as the argument.\n\
! 300: Note that -r and -R are incompatible.\n\
! 301: -z, --unresponsive-restart\n\
! 302: When a daemon is unresponsive, treat it as being down for\n\
! 303: restart purposes.\n\
! 304: -a, --all-restart\n\
! 305: When zebra hangs or crashes, restart all daemons using\n\
! 306: this phased approach: 1. stop all other daemons; 2. restart\n\
! 307: zebra; 3. start other daemons. Requires -r, -s, and -k.\n\
! 308: -A, --always-all-restart\n\
! 309: When any daemon (not just zebra) hangs or crashes, use the\n\
! 310: same phased restart mechanism described above for -a.\n\
! 311: Requires -r, -s, and -k.\n\
! 312: -p, --pid-file Set process identifier file name\n\
! 313: (default is %s).\n\
! 314: -b, --blank-string\n\
! 315: When the supplied argument string is found in any of the\n\
! 316: various shell command arguments (-r, -s, -k, or -R), replace\n\
! 317: it with a space. This is an ugly hack to circumvent problems\n\
! 318: passing command-line arguments with embedded spaces.\n\
! 319: -v, --version Print program version\n\
! 320: -h, --help Display this help and exit\n\
! 321: ", progname,mode_str[0],progname,mode_str[1],progname,mode_str[2],
! 322: progname,mode_str[3],progname,mode_str[4],progname,mode_str[2],mode_str[3],
! 323: VTYDIR,DEFAULT_LOGLEVEL,LOG_EMERG,LOG_DEBUG,LOG_DEBUG,
! 324: DEFAULT_MIN_RESTART,DEFAULT_MAX_RESTART,
! 325: DEFAULT_PERIOD,DEFAULT_TIMEOUT,DEFAULT_RESTART_TIMEOUT,DEFAULT_PIDFILE);
! 326:
! 327: return status;
! 328: }
! 329:
/* Launch shell_cmd via "/bin/sh -c" in a forked child.
   Returns the child's pid in the parent, or -1 when fork() fails.
   The child is not waited for here; it is reaped later. */
static pid_t
run_background(const char *shell_cmd)
{
  const char *argv[4] = { "sh", "-c", shell_cmd, NULL};
  pid_t child = fork();

  if (child == -1)
    {
      zlog_err("fork failed, cannot run command [%s]: %s",
               shell_cmd,safe_strerror(errno));
      return -1;
    }

  if (child != 0)
    {
      /* Parent process: we will reap the child later. */
      zlog_err("Forked background command [pid %d]: %s",(int)child,shell_cmd);
      return child;
    }

  /* Child process from here on.  A separate process group lets the
     whole job be signalled at once later. */
  if (setpgid(0,0) < 0)
    zlog_warn("warning: setpgid(0,0) failed: %s",safe_strerror(errno));

  execv("/bin/sh",(char *const *)argv);

  /* Reached only if execv() itself failed. */
  zlog_err("execv(/bin/sh -c '%s') failed: %s",
           shell_cmd,safe_strerror(errno));
  _exit(127);
}
! 359:
! 360: static struct timeval *
! 361: time_elapsed(struct timeval *result, const struct timeval *start_time)
! 362: {
! 363: gettimeofday(result,NULL);
! 364: result->tv_sec -= start_time->tv_sec;
! 365: result->tv_usec -= start_time->tv_usec;
! 366: while (result->tv_usec < 0)
! 367: {
! 368: result->tv_usec += 1000000L;
! 369: result->tv_sec--;
! 370: }
! 371: return result;
! 372: }
! 373:
! 374: static int
! 375: restart_kill(struct thread *t_kill)
! 376: {
! 377: struct restart_info *restart = THREAD_ARG(t_kill);
! 378: struct timeval delay;
! 379:
! 380: time_elapsed(&delay,&restart->time);
! 381: zlog_warn("Warning: %s %s child process %d still running after "
! 382: "%ld seconds, sending signal %d",
! 383: restart->what,restart->name,(int)restart->pid,delay.tv_sec,
! 384: (restart->kills ? SIGKILL : SIGTERM));
! 385: kill(-restart->pid,(restart->kills ? SIGKILL : SIGTERM));
! 386: restart->kills++;
! 387: restart->t_kill = thread_add_timer(master,restart_kill,restart,
! 388: gs.restart_timeout);
! 389: return 0;
! 390: }
! 391:
! 392: static struct restart_info *
! 393: find_child(pid_t child)
! 394: {
! 395: if (gs.mode == MODE_GLOBAL_RESTART)
! 396: {
! 397: if (gs.restart.pid == child)
! 398: return &gs.restart;
! 399: }
! 400: else
! 401: {
! 402: struct daemon *dmn;
! 403: for (dmn = gs.daemons; dmn; dmn = dmn->next)
! 404: {
! 405: if (dmn->restart.pid == child)
! 406: return &dmn->restart;
! 407: }
! 408: }
! 409: return NULL;
! 410: }
! 411:
! 412: static void
! 413: sigchild(void)
! 414: {
! 415: pid_t child;
! 416: int status;
! 417: const char *name;
! 418: const char *what;
! 419: struct restart_info *restart;
! 420:
! 421: switch (child = waitpid(-1,&status,WNOHANG))
! 422: {
! 423: case -1:
! 424: zlog_err("waitpid failed: %s",safe_strerror(errno));
! 425: return;
! 426: case 0:
! 427: zlog_warn("SIGCHLD received, but waitpid did not reap a child");
! 428: return;
! 429: }
! 430:
! 431: if ((restart = find_child(child)) != NULL)
! 432: {
! 433: name = restart->name;
! 434: what = restart->what;
! 435: restart->pid = 0;
! 436: gs.numpids--;
! 437: thread_cancel(restart->t_kill);
! 438: restart->t_kill = NULL;
! 439: /* Update restart time to reflect the time the command completed. */
! 440: gettimeofday(&restart->time,NULL);
! 441: }
! 442: else
! 443: {
! 444: zlog_err("waitpid returned status for an unknown child process %d",
! 445: (int)child);
! 446: name = "(unknown)";
! 447: what = "background";
! 448: }
! 449: if (WIFSTOPPED(status))
! 450: zlog_warn("warning: %s %s process %d is stopped",
! 451: what,name,(int)child);
! 452: else if (WIFSIGNALED(status))
! 453: zlog_warn("%s %s process %d terminated due to signal %d",
! 454: what,name,(int)child,WTERMSIG(status));
! 455: else if (WIFEXITED(status))
! 456: {
! 457: if (WEXITSTATUS(status) != 0)
! 458: zlog_warn("%s %s process %d exited with non-zero status %d",
! 459: what,name,(int)child,WEXITSTATUS(status));
! 460: else
! 461: zlog_debug("%s %s process %d exited normally",what,name,(int)child);
! 462: }
! 463: else
! 464: zlog_err("cannot interpret %s %s process %d wait status 0x%x",
! 465: what,name,(int)child,status);
! 466: phase_check();
! 467: }
! 468:
! 469: static int
! 470: run_job(struct restart_info *restart, const char *cmdtype, const char *command,
! 471: int force, int update_interval)
! 472: {
! 473: struct timeval delay;
! 474:
! 475: if (gs.loglevel > LOG_DEBUG+1)
! 476: zlog_debug("attempting to %s %s",cmdtype,restart->name);
! 477:
! 478: if (restart->pid)
! 479: {
! 480: if (gs.loglevel > LOG_DEBUG+1)
! 481: zlog_debug("cannot %s %s, previous pid %d still running",
! 482: cmdtype,restart->name,(int)restart->pid);
! 483: return -1;
! 484: }
! 485:
! 486: /* Note: time_elapsed test must come before the force test, since we need
! 487: to make sure that delay is initialized for use below in updating the
! 488: restart interval. */
! 489: if ((time_elapsed(&delay,&restart->time)->tv_sec < restart->interval) &&
! 490: !force)
! 491: {
! 492: if (gs.loglevel > LOG_DEBUG+1)
! 493: zlog_debug("postponing %s %s: "
! 494: "elapsed time %ld < retry interval %ld",
! 495: cmdtype,restart->name,(long)delay.tv_sec,restart->interval);
! 496: return -1;
! 497: }
! 498:
! 499: gettimeofday(&restart->time,NULL);
! 500: restart->kills = 0;
! 501: {
! 502: char cmd[strlen(command)+strlen(restart->name)+1];
! 503: snprintf(cmd,sizeof(cmd),command,restart->name);
! 504: if ((restart->pid = run_background(cmd)) > 0)
! 505: {
! 506: restart->t_kill = thread_add_timer(master,restart_kill,restart,
! 507: gs.restart_timeout);
! 508: restart->what = cmdtype;
! 509: gs.numpids++;
! 510: }
! 511: else
! 512: restart->pid = 0;
! 513: }
! 514:
! 515: /* Calculate the new restart interval. */
! 516: if (update_interval)
! 517: {
! 518: if (delay.tv_sec > 2*gs.max_restart_interval)
! 519: restart->interval = gs.min_restart_interval;
! 520: else if ((restart->interval *= 2) > gs.max_restart_interval)
! 521: restart->interval = gs.max_restart_interval;
! 522: if (gs.loglevel > LOG_DEBUG+1)
! 523: zlog_debug("restart %s interval is now %ld",
! 524: restart->name,restart->interval);
! 525: }
! 526: return restart->pid;
! 527: }
! 528:
/* Scheduling shorthands.  Each one registers an event for DMN with the
   thread master and stores the resulting handle in the daemon record. */

/* Call handle_read when DMN's socket becomes readable. */
#define SET_READ_HANDLER(DMN)                                           \
  (DMN)->t_read = thread_add_read(master,handle_read,(DMN),(DMN)->fd)

/* Retry the connection after roughly one polling period. */
#define SET_WAKEUP_DOWN(DMN)                                            \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_down,(DMN),     \
                                          FUZZY(gs.period))

/* Re-examine an unresponsive daemon after roughly one polling period. */
#define SET_WAKEUP_UNRESPONSIVE(DMN)                                    \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_unresponsive,(DMN), \
                                          FUZZY(gs.period))

/* Send the next echo request after roughly one polling period. */
#define SET_WAKEUP_ECHO(DMN)                                            \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_send_echo,(DMN), \
                                          FUZZY(gs.period))
! 543:
! 544: static int
! 545: wakeup_down(struct thread *t_wakeup)
! 546: {
! 547: struct daemon *dmn = THREAD_ARG(t_wakeup);
! 548:
! 549: dmn->t_wakeup = NULL;
! 550: if (try_connect(dmn) < 0)
! 551: SET_WAKEUP_DOWN(dmn);
! 552: if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
! 553: try_restart(dmn);
! 554: return 0;
! 555: }
! 556:
! 557: static int
! 558: wakeup_init(struct thread *t_wakeup)
! 559: {
! 560: struct daemon *dmn = THREAD_ARG(t_wakeup);
! 561:
! 562: dmn->t_wakeup = NULL;
! 563: if (try_connect(dmn) < 0)
! 564: {
! 565: SET_WAKEUP_DOWN(dmn);
! 566: zlog_err("%s state -> down : initial connection attempt failed",
! 567: dmn->name);
! 568: dmn->state = DAEMON_DOWN;
! 569: }
! 570: return 0;
! 571: }
! 572:
! 573: static void
! 574: daemon_down(struct daemon *dmn, const char *why)
! 575: {
! 576: if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
! 577: zlog_err("%s state -> down : %s",dmn->name,why);
! 578: else if (gs.loglevel > LOG_DEBUG)
! 579: zlog_debug("%s still down : %s",dmn->name,why);
! 580: if (IS_UP(dmn))
! 581: gs.numdown++;
! 582: dmn->state = DAEMON_DOWN;
! 583: if (dmn->fd >= 0)
! 584: {
! 585: close(dmn->fd);
! 586: dmn->fd = -1;
! 587: }
! 588: THREAD_OFF(dmn->t_read);
! 589: THREAD_OFF(dmn->t_write);
! 590: THREAD_OFF(dmn->t_wakeup);
! 591: if (try_connect(dmn) < 0)
! 592: SET_WAKEUP_DOWN(dmn);
! 593: phase_check();
! 594: }
! 595:
! 596: static int
! 597: handle_read(struct thread *t_read)
! 598: {
! 599: struct daemon *dmn = THREAD_ARG(t_read);
! 600: static const char resp[sizeof(PING_TOKEN)+4] = PING_TOKEN "\n";
! 601: char buf[sizeof(resp)+100];
! 602: ssize_t rc;
! 603: struct timeval delay;
! 604:
! 605: dmn->t_read = NULL;
! 606: if ((rc = read(dmn->fd,buf,sizeof(buf))) < 0)
! 607: {
! 608: char why[100];
! 609:
! 610: if (ERRNO_IO_RETRY(errno))
! 611: {
! 612: /* Pretend it never happened. */
! 613: SET_READ_HANDLER(dmn);
! 614: return 0;
! 615: }
! 616: snprintf(why,sizeof(why),"unexpected read error: %s",
! 617: safe_strerror(errno));
! 618: daemon_down(dmn,why);
! 619: return 0;
! 620: }
! 621: if (rc == 0)
! 622: {
! 623: daemon_down(dmn,"read returned EOF");
! 624: return 0;
! 625: }
! 626: if (!dmn->echo_sent.tv_sec)
! 627: {
! 628: char why[sizeof(buf)+100];
! 629: snprintf(why,sizeof(why),"unexpected read returns %d bytes: %.*s",
! 630: (int)rc,(int)rc,buf);
! 631: daemon_down(dmn,why);
! 632: return 0;
! 633: }
! 634:
! 635: /* We are expecting an echo response: is there any chance that the
! 636: response would not be returned entirely in the first read? That
! 637: seems inconceivable... */
! 638: if ((rc != sizeof(resp)) || memcmp(buf,resp,sizeof(resp)))
! 639: {
! 640: char why[100+sizeof(buf)];
! 641: snprintf(why,sizeof(why),"read returned bad echo response of %d bytes "
! 642: "(expecting %u): %.*s",
! 643: (int)rc,(u_int)sizeof(resp),(int)rc,buf);
! 644: daemon_down(dmn,why);
! 645: return 0;
! 646: }
! 647:
! 648: time_elapsed(&delay,&dmn->echo_sent);
! 649: dmn->echo_sent.tv_sec = 0;
! 650: if (dmn->state == DAEMON_UNRESPONSIVE)
! 651: {
! 652: if (delay.tv_sec < gs.timeout)
! 653: {
! 654: dmn->state = DAEMON_UP;
! 655: zlog_warn("%s state -> up : echo response received after %ld.%06ld "
! 656: "seconds", dmn->name,delay.tv_sec,delay.tv_usec);
! 657: }
! 658: else
! 659: zlog_warn("%s: slow echo response finally received after %ld.%06ld "
! 660: "seconds", dmn->name,delay.tv_sec,delay.tv_usec);
! 661: }
! 662: else if (gs.loglevel > LOG_DEBUG+1)
! 663: zlog_debug("%s: echo response received after %ld.%06ld seconds",
! 664: dmn->name,delay.tv_sec,delay.tv_usec);
! 665:
! 666: SET_READ_HANDLER(dmn);
! 667: if (dmn->t_wakeup)
! 668: thread_cancel(dmn->t_wakeup);
! 669: SET_WAKEUP_ECHO(dmn);
! 670:
! 671: return 0;
! 672: }
! 673:
! 674: static void
! 675: daemon_up(struct daemon *dmn, const char *why)
! 676: {
! 677: dmn->state = DAEMON_UP;
! 678: gs.numdown--;
! 679: dmn->connect_tries = 0;
! 680: zlog_notice("%s state -> up : %s",dmn->name,why);
! 681: if (gs.do_ping)
! 682: SET_WAKEUP_ECHO(dmn);
! 683: phase_check();
! 684: }
! 685:
! 686: static int
! 687: check_connect(struct thread *t_write)
! 688: {
! 689: struct daemon *dmn = THREAD_ARG(t_write);
! 690: int sockerr;
! 691: socklen_t reslen = sizeof(sockerr);
! 692:
! 693: dmn->t_write = NULL;
! 694: if (getsockopt(dmn->fd,SOL_SOCKET,SO_ERROR,(char *)&sockerr,&reslen) < 0)
! 695: {
! 696: zlog_warn("%s: check_connect: getsockopt failed: %s",
! 697: dmn->name,safe_strerror(errno));
! 698: daemon_down(dmn,"getsockopt failed checking connection success");
! 699: return 0;
! 700: }
! 701: if ((reslen == sizeof(sockerr)) && sockerr)
! 702: {
! 703: char why[100];
! 704: snprintf(why,sizeof(why),
! 705: "getsockopt reports that connection attempt failed: %s",
! 706: safe_strerror(sockerr));
! 707: daemon_down(dmn,why);
! 708: return 0;
! 709: }
! 710:
! 711: daemon_up(dmn,"delayed connect succeeded");
! 712: return 0;
! 713: }
! 714:
! 715: static int
! 716: wakeup_connect_hanging(struct thread *t_wakeup)
! 717: {
! 718: struct daemon *dmn = THREAD_ARG(t_wakeup);
! 719: char why[100];
! 720:
! 721: dmn->t_wakeup = NULL;
! 722: snprintf(why,sizeof(why),"connection attempt timed out after %ld seconds",
! 723: gs.timeout);
! 724: daemon_down(dmn,why);
! 725: return 0;
! 726: }
! 727:
! 728: /* Making connection to protocol daemon. */
! 729: static int
! 730: try_connect(struct daemon *dmn)
! 731: {
! 732: int sock;
! 733: struct sockaddr_un addr;
! 734: socklen_t len;
! 735:
! 736: if (gs.loglevel > LOG_DEBUG+1)
! 737: zlog_debug("%s: attempting to connect",dmn->name);
! 738: dmn->connect_tries++;
! 739:
! 740: memset (&addr, 0, sizeof (struct sockaddr_un));
! 741: addr.sun_family = AF_UNIX;
! 742: snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty",
! 743: gs.vtydir,dmn->name);
! 744: #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
! 745: len = addr.sun_len = SUN_LEN(&addr);
! 746: #else
! 747: len = sizeof (addr.sun_family) + strlen (addr.sun_path);
! 748: #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
! 749:
! 750: /* Quick check to see if we might succeed before we go to the trouble
! 751: of creating a socket. */
! 752: if (access(addr.sun_path, W_OK) < 0)
! 753: {
! 754: if (errno != ENOENT)
! 755: zlog_err("%s: access to socket %s denied: %s",
! 756: dmn->name,addr.sun_path,safe_strerror(errno));
! 757: return -1;
! 758: }
! 759:
! 760: if ((sock = socket (AF_UNIX, SOCK_STREAM, 0)) < 0)
! 761: {
! 762: zlog_err("%s(%s): cannot make socket: %s",
! 763: __func__,addr.sun_path, safe_strerror(errno));
! 764: return -1;
! 765: }
! 766:
! 767: if (set_nonblocking(sock) < 0)
! 768: {
! 769: zlog_err("%s(%s): set_nonblocking(%d) failed",
! 770: __func__, addr.sun_path, sock);
! 771: close(sock);
! 772: return -1;
! 773: }
! 774:
! 775: if (connect (sock, (struct sockaddr *) &addr, len) < 0)
! 776: {
! 777: if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK))
! 778: {
! 779: if (gs.loglevel > LOG_DEBUG)
! 780: zlog_debug("%s(%s): connect failed: %s",
! 781: __func__,addr.sun_path, safe_strerror(errno));
! 782: close (sock);
! 783: return -1;
! 784: }
! 785: if (gs.loglevel > LOG_DEBUG)
! 786: zlog_debug("%s: connection in progress",dmn->name);
! 787: dmn->state = DAEMON_CONNECTING;
! 788: dmn->fd = sock;
! 789: dmn->t_write = thread_add_write(master,check_connect,dmn,dmn->fd);
! 790: dmn->t_wakeup = thread_add_timer(master,wakeup_connect_hanging,dmn,
! 791: gs.timeout);
! 792: SET_READ_HANDLER(dmn);
! 793: return 0;
! 794: }
! 795:
! 796: dmn->fd = sock;
! 797: SET_READ_HANDLER(dmn);
! 798: daemon_up(dmn,"connect succeeded");
! 799: return 1;
! 800: }
! 801:
! 802: static int
! 803: phase_hanging(struct thread *t_hanging)
! 804: {
! 805: gs.t_phase_hanging = NULL;
! 806: zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
! 807: phase_str[gs.phase],PHASE_TIMEOUT);
! 808: gs.phase = PHASE_NONE;
! 809: return 0;
! 810: }
! 811:
! 812: static void
! 813: set_phase(restart_phase_t new_phase)
! 814: {
! 815: gs.phase = new_phase;
! 816: if (gs.t_phase_hanging)
! 817: thread_cancel(gs.t_phase_hanging);
! 818: gs.t_phase_hanging = thread_add_timer(master,phase_hanging,NULL,
! 819: PHASE_TIMEOUT);
! 820: }
! 821:
! 822: static void
! 823: phase_check(void)
! 824: {
! 825: switch (gs.phase)
! 826: {
! 827: case PHASE_NONE:
! 828: break;
! 829: case PHASE_STOPS_PENDING:
! 830: if (gs.numpids)
! 831: break;
! 832: zlog_info("Phased restart: all routing daemon stop jobs have completed.");
! 833: set_phase(PHASE_WAITING_DOWN);
! 834: /*FALLTHRU*/
! 835: case PHASE_WAITING_DOWN:
! 836: if (gs.numdown+IS_UP(gs.special) < gs.numdaemons)
! 837: break;
! 838: zlog_info("Phased restart: all routing daemons now down.");
! 839: run_job(&gs.special->restart,"restart",gs.restart_command,1,1);
! 840: set_phase(PHASE_ZEBRA_RESTART_PENDING);
! 841: /*FALLTHRU*/
! 842: case PHASE_ZEBRA_RESTART_PENDING:
! 843: if (gs.special->restart.pid)
! 844: break;
! 845: zlog_info("Phased restart: %s restart job completed.",gs.special->name);
! 846: set_phase(PHASE_WAITING_ZEBRA_UP);
! 847: /*FALLTHRU*/
! 848: case PHASE_WAITING_ZEBRA_UP:
! 849: if (!IS_UP(gs.special))
! 850: break;
! 851: zlog_info("Phased restart: %s is now up.",gs.special->name);
! 852: {
! 853: struct daemon *dmn;
! 854: for (dmn = gs.daemons; dmn; dmn = dmn->next)
! 855: {
! 856: if (dmn != gs.special)
! 857: run_job(&dmn->restart,"start",gs.start_command,1,0);
! 858: }
! 859: }
! 860: gs.phase = PHASE_NONE;
! 861: THREAD_OFF(gs.t_phase_hanging);
! 862: zlog_notice("Phased global restart has completed.");
! 863: break;
! 864: }
! 865: }
! 866:
! 867: static void
! 868: try_restart(struct daemon *dmn)
! 869: {
! 870: switch (gs.mode)
! 871: {
! 872: case MODE_MONITOR:
! 873: return;
! 874: case MODE_GLOBAL_RESTART:
! 875: run_job(&gs.restart,"restart",gs.restart_command,0,1);
! 876: break;
! 877: case MODE_SEPARATE_RESTART:
! 878: run_job(&dmn->restart,"restart",gs.restart_command,0,1);
! 879: break;
! 880: case MODE_PHASED_ZEBRA_RESTART:
! 881: if (dmn != gs.special)
! 882: {
! 883: if ((gs.special->state == DAEMON_UP) && (gs.phase == PHASE_NONE))
! 884: run_job(&dmn->restart,"restart",gs.restart_command,0,1);
! 885: else
! 886: zlog_debug("%s: postponing restart attempt because master %s daemon "
! 887: "not up [%s], or phased restart in progress",
! 888: dmn->name,gs.special->name,state_str[gs.special->state]);
! 889: break;
! 890: }
! 891: /*FALLTHRU*/
! 892: case MODE_PHASED_ALL_RESTART:
! 893: if ((gs.phase != PHASE_NONE) || gs.numpids)
! 894: {
! 895: if (gs.loglevel > LOG_DEBUG+1)
! 896: zlog_debug("postponing phased global restart: restart already in "
! 897: "progress [%s], or outstanding child processes [%d]",
! 898: phase_str[gs.phase],gs.numpids);
! 899: break;
! 900: }
! 901: /* Is it too soon for a restart? */
! 902: {
! 903: struct timeval delay;
! 904: if (time_elapsed(&delay,&gs.special->restart.time)->tv_sec <
! 905: gs.special->restart.interval)
! 906: {
! 907: if (gs.loglevel > LOG_DEBUG+1)
! 908: zlog_debug("postponing phased global restart: "
! 909: "elapsed time %ld < retry interval %ld",
! 910: (long)delay.tv_sec,gs.special->restart.interval);
! 911: break;
! 912: }
! 913: }
! 914: zlog_info("Phased restart: stopping all routing daemons.");
! 915: /* First step: stop all other daemons. */
! 916: for (dmn = gs.daemons; dmn; dmn = dmn->next)
! 917: {
! 918: if (dmn != gs.special)
! 919: run_job(&dmn->restart,"stop",gs.stop_command,1,1);
! 920: }
! 921: set_phase(PHASE_STOPS_PENDING);
! 922: break;
! 923: default:
! 924: zlog_err("error: unknown restart mode %d",gs.mode);
! 925: break;
! 926: }
! 927: }
! 928:
! 929: static int
! 930: wakeup_unresponsive(struct thread *t_wakeup)
! 931: {
! 932: struct daemon *dmn = THREAD_ARG(t_wakeup);
! 933:
! 934: dmn->t_wakeup = NULL;
! 935: if (dmn->state != DAEMON_UNRESPONSIVE)
! 936: zlog_err("%s: no longer unresponsive (now %s), "
! 937: "wakeup should have been cancelled!",
! 938: dmn->name,state_str[dmn->state]);
! 939: else
! 940: {
! 941: SET_WAKEUP_UNRESPONSIVE(dmn);
! 942: try_restart(dmn);
! 943: }
! 944: return 0;
! 945: }
! 946:
! 947: static int
! 948: wakeup_no_answer(struct thread *t_wakeup)
! 949: {
! 950: struct daemon *dmn = THREAD_ARG(t_wakeup);
! 951:
! 952: dmn->t_wakeup = NULL;
! 953: dmn->state = DAEMON_UNRESPONSIVE;
! 954: zlog_err("%s state -> unresponsive : no response yet to ping "
! 955: "sent %ld seconds ago",dmn->name,gs.timeout);
! 956: if (gs.unresponsive_restart)
! 957: {
! 958: SET_WAKEUP_UNRESPONSIVE(dmn);
! 959: try_restart(dmn);
! 960: }
! 961: return 0;
! 962: }
! 963:
! 964: static int
! 965: wakeup_send_echo(struct thread *t_wakeup)
! 966: {
! 967: static const char echocmd[] = "echo " PING_TOKEN;
! 968: ssize_t rc;
! 969: struct daemon *dmn = THREAD_ARG(t_wakeup);
! 970:
! 971: dmn->t_wakeup = NULL;
! 972: if (((rc = write(dmn->fd,echocmd,sizeof(echocmd))) < 0) ||
! 973: ((size_t)rc != sizeof(echocmd)))
! 974: {
! 975: char why[100+sizeof(echocmd)];
! 976: snprintf(why,sizeof(why),"write '%s' returned %d instead of %u",
! 977: echocmd,(int)rc,(u_int)sizeof(echocmd));
! 978: daemon_down(dmn,why);
! 979: }
! 980: else
! 981: {
! 982: gettimeofday(&dmn->echo_sent,NULL);
! 983: dmn->t_wakeup = thread_add_timer(master,wakeup_no_answer,dmn,gs.timeout);
! 984: }
! 985: return 0;
! 986: }
! 987:
/*
 * Termination handler: main() installs it for both SIGINT and SIGTERM.
 * Logs a notice and ends the process with a zero (success) exit status.
 */
static void
sigint(void)
{
  zlog_notice("Terminating on signal");
  exit(EXIT_SUCCESS);
}
! 994:
/*
 * Check that a command template contains exactly one '%' character and
 * that it is immediately followed by 's' (i.e. a single "%s" placeholder).
 *
 * Returns 1 when the template is acceptable, 0 otherwise.
 */
static int
valid_command(const char *cmd)
{
  const char *first = strchr(cmd,'%');

  /* No placeholder at all. */
  if (first == NULL)
    return 0;

  /* The one permitted conversion is "%s". */
  if (first[1] != 's')
    return 0;

  /* Any further '%' makes the template invalid. */
  return strchr(first+1,'%') == NULL;
}
! 1002:
/* This is an ugly hack to circumvent problems with passing command-line
   arguments that contain spaces.  The fix is to use a configuration file.

   Returns a freshly allocated copy of cmd in which every occurrence of
   blankstr has been replaced by a single space character.  The caller owns
   the returned string.  If the copy cannot be allocated, an error is
   printed and the process exits with status 1.

   An empty blankstr leaves the command unchanged.  The scan resumes just
   after each inserted space, so the result is produced in one pass and a
   blankstr that itself contains a space cannot be matched over and over. */
static char *
translate_blanks(const char *cmd, const char *blankstr)
{
  char *res;
  char *p;
  size_t bslen = strlen(blankstr);

  if (!(res = strdup(cmd)))
    {
      perror("strdup");
      exit(1);
    }

  /* strstr() reports a match at offset 0 for an empty needle; replacing
     that "match" would grow the string past its allocation and never
     terminate, so there is nothing to translate in that case. */
  if (bslen == 0)
    return res;

  p = res;
  while ((p = strstr(p,blankstr)) != NULL)
    {
      *p = ' ';
      /* Close the gap left by the rest of the marker (terminator included). */
      if (bslen != 1)
        memmove(p+1,p+bslen,strlen(p+bslen)+1);
      /* Continue after the blank just written. */
      p++;
    }
  return res;
}
! 1025:
! 1026: int
! 1027: main(int argc, char **argv)
! 1028: {
! 1029: const char *progname;
! 1030: int opt;
! 1031: int daemon_mode = 0;
! 1032: const char *pidfile = DEFAULT_PIDFILE;
! 1033: const char *special = "zebra";
! 1034: const char *blankstr = NULL;
! 1035: static struct quagga_signal_t my_signals[] =
! 1036: {
! 1037: {
! 1038: .signal = SIGINT,
! 1039: .handler = sigint,
! 1040: },
! 1041: {
! 1042: .signal = SIGTERM,
! 1043: .handler = sigint,
! 1044: },
! 1045: {
! 1046: .signal = SIGCHLD,
! 1047: .handler = sigchild,
! 1048: },
! 1049: };
! 1050:
! 1051: if ((progname = strrchr (argv[0], '/')) != NULL)
! 1052: progname++;
! 1053: else
! 1054: progname = argv[0];
! 1055:
! 1056: gs.restart.name = "all";
! 1057: while ((opt = getopt_long(argc, argv, "aAb:dek:l:m:M:i:p:r:R:S:s:t:T:zvh",
! 1058: longopts, 0)) != EOF)
! 1059: {
! 1060: switch (opt)
! 1061: {
! 1062: case 0:
! 1063: break;
! 1064: case 'a':
! 1065: if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
! 1066: {
! 1067: fputs("Ambiguous operating mode selected.\n",stderr);
! 1068: return usage(progname,1);
! 1069: }
! 1070: gs.mode = MODE_PHASED_ZEBRA_RESTART;
! 1071: break;
! 1072: case 'A':
! 1073: if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
! 1074: {
! 1075: fputs("Ambiguous operating mode selected.\n",stderr);
! 1076: return usage(progname,1);
! 1077: }
! 1078: gs.mode = MODE_PHASED_ALL_RESTART;
! 1079: break;
! 1080: case 'b':
! 1081: blankstr = optarg;
! 1082: break;
! 1083: case 'd':
! 1084: daemon_mode = 1;
! 1085: break;
! 1086: case 'e':
! 1087: gs.do_ping = 0;
! 1088: break;
! 1089: case 'k':
! 1090: if (!valid_command(optarg))
! 1091: {
! 1092: fprintf(stderr,"Invalid kill command, must contain '%%s': %s\n",
! 1093: optarg);
! 1094: return usage(progname,1);
! 1095: }
! 1096: gs.stop_command = optarg;
! 1097: break;
! 1098: case 'l':
! 1099: {
! 1100: char garbage[3];
! 1101: if ((sscanf(optarg,"%d%1s",&gs.loglevel,garbage) != 1) ||
! 1102: (gs.loglevel < LOG_EMERG))
! 1103: {
! 1104: fprintf(stderr,"Invalid loglevel argument: %s\n",optarg);
! 1105: return usage(progname,1);
! 1106: }
! 1107: }
! 1108: break;
! 1109: case 'm':
! 1110: {
! 1111: char garbage[3];
! 1112: if ((sscanf(optarg,"%ld%1s",
! 1113: &gs.min_restart_interval,garbage) != 1) ||
! 1114: (gs.min_restart_interval < 0))
! 1115: {
! 1116: fprintf(stderr,"Invalid min_restart_interval argument: %s\n",
! 1117: optarg);
! 1118: return usage(progname,1);
! 1119: }
! 1120: }
! 1121: break;
! 1122: case 'M':
! 1123: {
! 1124: char garbage[3];
! 1125: if ((sscanf(optarg,"%ld%1s",
! 1126: &gs.max_restart_interval,garbage) != 1) ||
! 1127: (gs.max_restart_interval < 0))
! 1128: {
! 1129: fprintf(stderr,"Invalid max_restart_interval argument: %s\n",
! 1130: optarg);
! 1131: return usage(progname,1);
! 1132: }
! 1133: }
! 1134: break;
! 1135: case 'i':
! 1136: {
! 1137: char garbage[3];
! 1138: int period;
! 1139: if ((sscanf(optarg,"%d%1s",&period,garbage) != 1) ||
! 1140: (gs.period < 1))
! 1141: {
! 1142: fprintf(stderr,"Invalid interval argument: %s\n",optarg);
! 1143: return usage(progname,1);
! 1144: }
! 1145: gs.period = 1000*period;
! 1146: }
! 1147: break;
! 1148: case 'p':
! 1149: pidfile = optarg;
! 1150: break;
! 1151: case 'r':
! 1152: if ((gs.mode == MODE_GLOBAL_RESTART) ||
! 1153: (gs.mode == MODE_SEPARATE_RESTART))
! 1154: {
! 1155: fputs("Ambiguous operating mode selected.\n",stderr);
! 1156: return usage(progname,1);
! 1157: }
! 1158: if (!valid_command(optarg))
! 1159: {
! 1160: fprintf(stderr,
! 1161: "Invalid restart command, must contain '%%s': %s\n",
! 1162: optarg);
! 1163: return usage(progname,1);
! 1164: }
! 1165: gs.restart_command = optarg;
! 1166: if (gs.mode == MODE_MONITOR)
! 1167: gs.mode = MODE_SEPARATE_RESTART;
! 1168: break;
! 1169: case 'R':
! 1170: if (gs.mode != MODE_MONITOR)
! 1171: {
! 1172: fputs("Ambiguous operating mode selected.\n",stderr);
! 1173: return usage(progname,1);
! 1174: }
! 1175: if (strchr(optarg,'%'))
! 1176: {
! 1177: fprintf(stderr,
! 1178: "Invalid restart-all arg, must not contain '%%s': %s\n",
! 1179: optarg);
! 1180: return usage(progname,1);
! 1181: }
! 1182: gs.restart_command = optarg;
! 1183: gs.mode = MODE_GLOBAL_RESTART;
! 1184: break;
! 1185: case 's':
! 1186: if (!valid_command(optarg))
! 1187: {
! 1188: fprintf(stderr,"Invalid start command, must contain '%%s': %s\n",
! 1189: optarg);
! 1190: return usage(progname,1);
! 1191: }
! 1192: gs.start_command = optarg;
! 1193: break;
! 1194: case 'S':
! 1195: gs.vtydir = optarg;
! 1196: break;
! 1197: case 't':
! 1198: {
! 1199: char garbage[3];
! 1200: if ((sscanf(optarg,"%ld%1s",&gs.timeout,garbage) != 1) ||
! 1201: (gs.timeout < 1))
! 1202: {
! 1203: fprintf(stderr,"Invalid timeout argument: %s\n",optarg);
! 1204: return usage(progname,1);
! 1205: }
! 1206: }
! 1207: break;
! 1208: case 'T':
! 1209: {
! 1210: char garbage[3];
! 1211: if ((sscanf(optarg,"%ld%1s",&gs.restart_timeout,garbage) != 1) ||
! 1212: (gs.restart_timeout < 1))
! 1213: {
! 1214: fprintf(stderr,"Invalid restart timeout argument: %s\n",optarg);
! 1215: return usage(progname,1);
! 1216: }
! 1217: }
! 1218: break;
! 1219: case 'z':
! 1220: gs.unresponsive_restart = 1;
! 1221: break;
! 1222: case 'v':
! 1223: printf ("%s version %s\n", progname, QUAGGA_VERSION);
! 1224: puts("Copyright 2004 Andrew J. Schorr");
! 1225: return 0;
! 1226: case 'h':
! 1227: return usage(progname,0);
! 1228: default:
! 1229: fputs("Invalid option.\n",stderr);
! 1230: return usage(progname,1);
! 1231: }
! 1232: }
! 1233:
! 1234: if (gs.unresponsive_restart && (gs.mode == MODE_MONITOR))
! 1235: {
! 1236: fputs("Option -z requires a -r or -R restart option.\n",stderr);
! 1237: return usage(progname,1);
! 1238: }
! 1239: switch (gs.mode)
! 1240: {
! 1241: case MODE_MONITOR:
! 1242: if (gs.restart_command || gs.start_command || gs.stop_command)
! 1243: {
! 1244: fprintf(stderr,"No kill/(re)start commands needed for %s mode.\n",
! 1245: mode_str[gs.mode]);
! 1246: return usage(progname,1);
! 1247: }
! 1248: break;
! 1249: case MODE_GLOBAL_RESTART:
! 1250: case MODE_SEPARATE_RESTART:
! 1251: if (!gs.restart_command || gs.start_command || gs.stop_command)
! 1252: {
! 1253: fprintf(stderr,"No start/kill commands needed in [%s] mode.\n",
! 1254: mode_str[gs.mode]);
! 1255: return usage(progname,1);
! 1256: }
! 1257: break;
! 1258: case MODE_PHASED_ZEBRA_RESTART:
! 1259: case MODE_PHASED_ALL_RESTART:
! 1260: if (!gs.restart_command || !gs.start_command || !gs.stop_command)
! 1261: {
! 1262: fprintf(stderr,
! 1263: "Need start, kill, and restart commands in [%s] mode.\n",
! 1264: mode_str[gs.mode]);
! 1265: return usage(progname,1);
! 1266: }
! 1267: break;
! 1268: }
! 1269:
! 1270: if (blankstr)
! 1271: {
! 1272: if (gs.restart_command)
! 1273: gs.restart_command = translate_blanks(gs.restart_command,blankstr);
! 1274: if (gs.start_command)
! 1275: gs.start_command = translate_blanks(gs.start_command,blankstr);
! 1276: if (gs.stop_command)
! 1277: gs.stop_command = translate_blanks(gs.stop_command,blankstr);
! 1278: }
! 1279:
! 1280: gs.restart.interval = gs.min_restart_interval;
! 1281: master = thread_master_create();
! 1282: signal_init (master, Q_SIGC(my_signals), my_signals);
! 1283: srandom(time(NULL));
! 1284:
! 1285: {
! 1286: int i;
! 1287: struct daemon *tail = NULL;
! 1288:
! 1289: for (i = optind; i < argc; i++)
! 1290: {
! 1291: struct daemon *dmn;
! 1292:
! 1293: if (!(dmn = (struct daemon *)calloc(1,sizeof(*dmn))))
! 1294: {
! 1295: fprintf(stderr,"calloc(1,%u) failed: %s\n",
! 1296: (u_int)sizeof(*dmn), safe_strerror(errno));
! 1297: return 1;
! 1298: }
! 1299: dmn->name = dmn->restart.name = argv[i];
! 1300: dmn->state = DAEMON_INIT;
! 1301: gs.numdaemons++;
! 1302: gs.numdown++;
! 1303: dmn->fd = -1;
! 1304: dmn->t_wakeup = thread_add_timer_msec(master,wakeup_init,dmn,
! 1305: 100+(random() % 900));
! 1306: dmn->restart.interval = gs.min_restart_interval;
! 1307: if (tail)
! 1308: tail->next = dmn;
! 1309: else
! 1310: gs.daemons = dmn;
! 1311: tail = dmn;
! 1312:
! 1313: if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
! 1314: (gs.mode == MODE_PHASED_ALL_RESTART)) &&
! 1315: !strcmp(dmn->name,special))
! 1316: gs.special = dmn;
! 1317: }
! 1318: }
! 1319: if (!gs.daemons)
! 1320: {
! 1321: fputs("Must specify one or more daemons to monitor.\n",stderr);
! 1322: return usage(progname,1);
! 1323: }
! 1324: if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
! 1325: (gs.mode == MODE_PHASED_ALL_RESTART)) && !gs.special)
! 1326: {
! 1327: fprintf(stderr,"In mode [%s], but cannot find master daemon %s\n",
! 1328: mode_str[gs.mode],special);
! 1329: return usage(progname,1);
! 1330: }
! 1331: if (gs.special && (gs.numdaemons < 2))
! 1332: {
! 1333: fprintf(stderr,"Mode [%s] does not make sense with only 1 daemon "
! 1334: "to watch.\n",mode_str[gs.mode]);
! 1335: return usage(progname,1);
! 1336: }
! 1337:
! 1338: zlog_default = openzlog(progname, ZLOG_NONE,
! 1339: LOG_CONS|LOG_NDELAY|LOG_PID, LOG_DAEMON);
! 1340: zlog_set_level(NULL, ZLOG_DEST_MONITOR, ZLOG_DISABLED);
! 1341: if (daemon_mode)
! 1342: {
! 1343: zlog_set_level(NULL, ZLOG_DEST_SYSLOG, MIN(gs.loglevel,LOG_DEBUG));
! 1344: if (daemon (0, 0) < 0)
! 1345: {
! 1346: fprintf(stderr, "Watchquagga daemon failed: %s", strerror(errno));
! 1347: exit (1);
! 1348: }
! 1349: }
! 1350: else
! 1351: zlog_set_level(NULL, ZLOG_DEST_STDOUT, MIN(gs.loglevel,LOG_DEBUG));
! 1352:
! 1353: /* Make sure we're not already running. */
! 1354: pid_output (pidfile);
! 1355:
! 1356: /* Announce which daemons are being monitored. */
! 1357: {
! 1358: struct daemon *dmn;
! 1359: size_t len = 0;
! 1360:
! 1361: for (dmn = gs.daemons; dmn; dmn = dmn->next)
! 1362: len += strlen(dmn->name)+1;
! 1363:
! 1364: {
! 1365: char buf[len+1];
! 1366: char *p = buf;
! 1367:
! 1368: for (dmn = gs.daemons; dmn; dmn = dmn->next)
! 1369: {
! 1370: if (p != buf)
! 1371: *p++ = ' ';
! 1372: strcpy(p,dmn->name);
! 1373: p += strlen(p);
! 1374: }
! 1375: zlog_notice("%s %s watching [%s], mode [%s]",
! 1376: progname, QUAGGA_VERSION, buf, mode_str[gs.mode]);
! 1377: }
! 1378: }
! 1379:
! 1380: {
! 1381: struct thread thread;
! 1382:
! 1383: while (thread_fetch (master, &thread))
! 1384: thread_call (&thread);
! 1385: }
! 1386:
! 1387: /* Not reached. */
! 1388: return 0;
! 1389: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>