File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / quagga / watchquagga / watchquagga.c
Revision 1.1.1.3 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Wed Nov 2 10:09:12 2016 UTC (7 years, 8 months ago) by misho
Branches: quagga, MAIN
CVS tags: v1_0_20160315, HEAD
quagga 1.0.20160315

    1: /*
    2:     Monitor status of quagga daemons and restart if necessary.
    3: 
    4:     Copyright (C) 2004  Andrew J. Schorr
    5: 
    6:     This program is free software; you can redistribute it and/or modify
    7:     it under the terms of the GNU General Public License as published by
    8:     the Free Software Foundation; either version 2 of the License, or
    9:     (at your option) any later version.
   10: 
   11:     This program is distributed in the hope that it will be useful,
   12:     but WITHOUT ANY WARRANTY; without even the implied warranty of
   13:     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   14:     GNU General Public License for more details.
   15: 
   16:     You should have received a copy of the GNU General Public License
   17:     along with this program; if not, write to the Free Software
   18:     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   19:  */
   20: 
   21: #include <zebra.h>
   22: #include <thread.h>
   23: #include <log.h>
   24: #include <network.h>
   25: #include <sigevent.h>
   26: #include <lib/version.h>
   27: #include <getopt.h>
   28: #include <sys/un.h>
   29: #include <sys/wait.h>
   30: #include <memory.h>
   31: 
   32: #ifndef MIN
   33: #define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
   34: #endif
   35: 
   36: /* Macros to help randomize timers. */
   37: #define JITTER(X) ((random() % ((X)+1))-((X)/2))
   38: #define FUZZY(X) ((X)+JITTER((X)/20))
   39: 
   40: #define DEFAULT_PERIOD		5
   41: #define DEFAULT_TIMEOUT		10
   42: #define DEFAULT_RESTART_TIMEOUT	20
   43: #define DEFAULT_LOGLEVEL	LOG_INFO
   44: #define DEFAULT_MIN_RESTART	60
   45: #define DEFAULT_MAX_RESTART	600
   46: #ifdef PATH_WATCHQUAGGA_PID
   47: #define DEFAULT_PIDFILE		PATH_WATCHQUAGGA_PID
   48: #else
   49: #define DEFAULT_PIDFILE		STATEDIR "/watchquagga.pid"
   50: #endif
   51: #ifdef DAEMON_VTY_DIR
   52: #define VTYDIR			DAEMON_VTY_DIR
   53: #else
   54: #define VTYDIR			STATEDIR
   55: #endif
   56: 
   57: #define PING_TOKEN	"PING"
   58: 
   59: /* Needs to be global, referenced somewhere inside libzebra. */
   60: struct thread_master *master;
   61: 
   62: typedef enum
   63: {
   64:   MODE_MONITOR = 0,
   65:   MODE_GLOBAL_RESTART,
   66:   MODE_SEPARATE_RESTART,
   67:   MODE_PHASED_ZEBRA_RESTART,
   68:   MODE_PHASED_ALL_RESTART
   69: } watch_mode_t;
   70: 
   71: static const char *mode_str[] =
   72: {
   73:   "monitor",
   74:   "global restart",
   75:   "individual daemon restart",
   76:   "phased zebra restart",
   77:   "phased global restart for any failure",
   78: };
   79: 
   80: typedef enum
   81: {
   82:   PHASE_NONE = 0,
   83:   PHASE_STOPS_PENDING,
   84:   PHASE_WAITING_DOWN,
   85:   PHASE_ZEBRA_RESTART_PENDING,
   86:   PHASE_WAITING_ZEBRA_UP
   87: } restart_phase_t;
   88: 
   89: static const char *phase_str[] =
   90: {
   91:   "None",
   92:   "Stop jobs running",
   93:   "Waiting for other daemons to come down",
   94:   "Zebra restart job running",
   95:   "Waiting for zebra to come up",
   96:   "Start jobs running",
   97: };
   98: 
   99: #define PHASE_TIMEOUT (3*gs.restart_timeout)
  100: 
  101: struct restart_info
  102: {
  103:   const char *name;
  104:   const char *what;
  105:   pid_t pid;
  106:   struct timeval time;
  107:   long interval;
  108:   struct thread *t_kill;
  109:   int kills;
  110: };
  111: 
  112: static struct global_state
  113: {
  114:   watch_mode_t mode;
  115:   restart_phase_t phase;
  116:   struct thread *t_phase_hanging;
  117:   const char *vtydir;
  118:   long period;
  119:   long timeout;
  120:   long restart_timeout;
  121:   long min_restart_interval;
  122:   long max_restart_interval;
  123:   int do_ping;
  124:   struct daemon *daemons;
  125:   const char *restart_command;
  126:   const char *start_command;
  127:   const char *stop_command;
  128:   struct restart_info restart;
  129:   int unresponsive_restart;
  130:   int loglevel;
  131:   struct daemon *special;	/* points to zebra when doing phased restart */
  132:   int numdaemons;
  133:   int numpids;
  134:   int numdown;		/* # of daemons that are not UP or UNRESPONSIVE */
  135: } gs = {
  136:   .mode = MODE_MONITOR,
  137:   .phase = PHASE_NONE,
  138:   .vtydir = VTYDIR,
  139:   .period = 1000*DEFAULT_PERIOD,
  140:   .timeout = DEFAULT_TIMEOUT,
  141:   .restart_timeout = DEFAULT_RESTART_TIMEOUT,
  142:   .loglevel = DEFAULT_LOGLEVEL,
  143:   .min_restart_interval = DEFAULT_MIN_RESTART,
  144:   .max_restart_interval = DEFAULT_MAX_RESTART,
  145:   .do_ping = 1,
  146: };
  147: 
  148: typedef enum
  149: {
  150:   DAEMON_INIT,
  151:   DAEMON_DOWN,
  152:   DAEMON_CONNECTING,
  153:   DAEMON_UP,
  154:   DAEMON_UNRESPONSIVE
  155: } daemon_state_t;
  156: 
  157: #define IS_UP(DMN) \
  158:   (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
  159: 
  160: static const char *state_str[] =
  161: {
  162:   "Init",
  163:   "Down",
  164:   "Connecting",
  165:   "Up",
  166:   "Unresponsive",
  167: };
  168: 
  169: struct daemon {
  170:   const char *name;
  171:   daemon_state_t state;
  172:   int fd;
  173:   struct timeval echo_sent;
  174:   u_int connect_tries;
  175:   struct thread *t_wakeup;
  176:   struct thread *t_read;
  177:   struct thread *t_write;
  178:   struct daemon *next;
  179:   struct restart_info restart;
  180: };
  181: 
  182: static const struct option longopts[] = 
  183: {
  184:   { "daemon", no_argument, NULL, 'd'},
  185:   { "statedir", required_argument, NULL, 'S'},
  186:   { "no-echo", no_argument, NULL, 'e'},
  187:   { "loglevel", required_argument, NULL, 'l'},
  188:   { "interval", required_argument, NULL, 'i'},
  189:   { "timeout", required_argument, NULL, 't'},
  190:   { "restart-timeout", required_argument, NULL, 'T'},
  191:   { "restart", required_argument, NULL, 'r'},
  192:   { "start-command", required_argument, NULL, 's'},
  193:   { "kill-command", required_argument, NULL, 'k'},
  194:   { "restart-all", required_argument, NULL, 'R'},
  195:   { "all-restart", no_argument, NULL, 'a'},
  196:   { "always-all-restart", no_argument, NULL, 'A'},
  197:   { "unresponsive-restart", no_argument, NULL, 'z'},
  198:   { "min-restart-interval", required_argument, NULL, 'm'},
  199:   { "max-restart-interval", required_argument, NULL, 'M'},
  200:   { "pid-file", required_argument, NULL, 'p'},
  201:   { "blank-string", required_argument, NULL, 'b'},
  202:   { "help", no_argument, NULL, 'h'},
  203:   { "version", no_argument, NULL, 'v'},
  204:   { NULL, 0, NULL, 0 }
  205: };
  206: 
  207: static int try_connect(struct daemon *dmn);
  208: static int wakeup_send_echo(struct thread *t_wakeup);
  209: static void try_restart(struct daemon *dmn);
  210: static void phase_check(void);
  211: 
  212: static int
  213: usage(const char *progname, int status)
  214: {
  215:   if (status != 0)
  216:     fprintf(stderr, "Try `%s --help' for more information.\n", progname);
  217:   else
  218:     {
  219:       printf("Usage : %s [OPTION...] <daemon name> ...\n\n\
  220: Watchdog program to monitor status of quagga daemons and try to restart\n\
  221: them if they are down or unresponsive.  It determines whether a daemon is\n\
  222: up based on whether it can connect to the daemon's vty unix stream socket.\n\
  223: It then repeatedly sends echo commands over that socket to determine whether\n\
  224: the daemon is responsive.  If the daemon crashes, we will receive an EOF\n\
  225: on the socket connection and know immediately that the daemon is down.\n\n\
  226: The daemons to be monitored should be listed on the command line.\n\n\
  227: This program can run in one of 5 modes:\n\n\
  228: 0. Mode: %s.\n\
  229:   Just monitor and report on status changes.  Example:\n\
  230:     %s -d zebra ospfd bgpd\n\n\
  231: 1. Mode: %s.\n\
  232:   Whenever any daemon hangs or crashes, use the given command to restart\n\
  233:   them all.  Example:\n\
  234:     %s -dz \\\n\
  235:       -R '/sbin/service zebra restart; /sbin/service ospfd restart' \\\n\
  236:       zebra ospfd\n\n\
  237: 2. Mode: %s.\n\
  238:   When any single daemon hangs or crashes, restart only the daemon that's\n\
  239:   in trouble using the supplied restart command.  Example:\n\
  240:     %s -dz -r '/sbin/service %%s restart' zebra ospfd bgpd\n\n\
  241: 3. Mode: %s.\n\
  242:   The same as the previous mode, except that there is special treatment when\n\
  243:   the zebra daemon is in trouble.  In that case, a phased restart approach\n\
  244:   is used: 1. stop all other daemons; 2. restart zebra; 3. start the other\n\
  245:   daemons.  Example:\n\
  246:     %s -adz -r '/sbin/service %%s restart' \\\n\
  247:       -s '/sbin/service %%s start' \\\n\
  248:       -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
  249: 4. Mode: %s.\n\
  250:   This is the same as the previous mode, except that the phased restart\n\
  251:   procedure is used whenever any of the daemons hangs or crashes.  Example:\n\
  252:     %s -Adz -r '/sbin/service %%s restart' \\\n\
  253:       -s '/sbin/service %%s start' \\\n\
  254:       -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
  255: As of this writing, it is believed that mode 2 [%s]\n\
  256: is not safe, and mode 3 [%s] may not be safe with some of the\n\
  257: routing daemons.\n\n\
  258: In order to avoid attempting to restart the daemons in a fast loop,\n\
  259: the -m and -M options allow you to control the minimum delay between\n\
  260: restart commands.  The minimum restart delay is recalculated each time\n\
  261: a restart is attempted: if the time since the last restart attempt exceeds\n\
  262: twice the -M value, then the restart delay is set to the -m value.\n\
  263: Otherwise, the interval is doubled (but capped at the -M value).\n\n",
  264:         progname,mode_str[0],progname,mode_str[1],progname,mode_str[2],
  265:         progname,mode_str[3],progname,mode_str[4],progname,mode_str[2],
  266:         mode_str[3]);
  267: 
  268:       printf("Options:\n\
  269: -d, --daemon	Run in daemon mode.  In this mode, error messages are sent\n\
  270: 		to syslog instead of stdout.\n\
  271: -S, --statedir	Set the vty socket directory (default is %s)\n\
  272: -e, --no-echo	Do not ping the daemons to test responsiveness (this\n\
  273: 		option is necessary if the daemons do not support the\n\
  274: 		echo command)\n\
  275: -l, --loglevel	Set the logging level (default is %d).\n\
  276: 		The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
  277: 		but it can be set higher than %d if extra-verbose debugging\n\
  278: 		messages are desired.\n\
  279: -m, --min-restart-interval\n\
  280: 		Set the minimum seconds to wait between invocations of daemon\n\
  281: 		restart commands (default is %d).\n\
  282: -M, --max-restart-interval\n\
  283: 		Set the maximum seconds to wait between invocations of daemon\n\
  284: 		restart commands (default is %d).\n\
  285: -i, --interval	Set the status polling interval in seconds (default is %d)\n\
  286: -t, --timeout	Set the unresponsiveness timeout in seconds (default is %d)\n\
  287: -T, --restart-timeout\n\
  288: 		Set the restart (kill) timeout in seconds (default is %d).\n\
  289: 		If any background jobs are still running after this much\n\
  290: 		time has elapsed, they will be killed.\n\
  291: -r, --restart	Supply a Bourne shell command to use to restart a single\n\
  292: 		daemon.  The command string should include '%%s' where the\n\
  293: 		name of the daemon should be substituted.\n\
  294: 		Note that -r and -R are incompatible.\n\
  295: -s, --start-command\n\
  296: 		Supply a Bourne shell to command to use to start a single\n\
  297: 		daemon.  The command string should include '%%s' where the\n\
  298: 		name of the daemon should be substituted.\n\
  299: -k, --kill-command\n\
  300: 		Supply a Bourne shell to command to use to stop a single\n\
  301: 		daemon.  The command string should include '%%s' where the\n\
  302: 		name of the daemon should be substituted.\n\
  303: -R, --restart-all\n\
  304: 		When one or more daemons is down, try to restart everything\n\
  305: 		using the Bourne shell command supplied as the argument.\n\
  306: 		Note that -r and -R are incompatible.\n\
  307: -z, --unresponsive-restart\n\
  308: 		When a daemon is unresponsive, treat it as being down for\n\
  309: 		restart purposes.\n\
  310: -a, --all-restart\n\
  311: 		When zebra hangs or crashes, restart all daemons using\n\
  312: 		this phased approach: 1. stop all other daemons; 2. restart\n\
  313: 		zebra; 3. start other daemons.  Requires -r, -s, and -k.\n\
  314: -A, --always-all-restart\n\
  315: 		When any daemon (not just zebra) hangs or crashes, use the\n\
  316: 		same phased restart mechanism described above for -a.\n\
  317: 		Requires -r, -s, and -k.\n\
  318: -p, --pid-file	Set process identifier file name\n\
  319: 		(default is %s).\n\
  320: -b, --blank-string\n\
  321: 		When the supplied argument string is found in any of the\n\
  322: 		various shell command arguments (-r, -s, -k, or -R), replace\n\
  323: 		it with a space.  This is an ugly hack to circumvent problems\n\
  324: 		passing command-line arguments with embedded spaces.\n\
  325: -v, --version	Print program version\n\
  326: -h, --help	Display this help and exit\n",
  327:         VTYDIR,DEFAULT_LOGLEVEL,LOG_EMERG,LOG_DEBUG,LOG_DEBUG,
  328:         DEFAULT_MIN_RESTART,DEFAULT_MAX_RESTART,
  329:         DEFAULT_PERIOD,DEFAULT_TIMEOUT,DEFAULT_RESTART_TIMEOUT,
  330:         DEFAULT_PIDFILE);
  331:     }
  332: 
  333:   return status;
  334: }
  335: 
  336: static pid_t
  337: run_background(char *shell_cmd)
  338: {
  339:   pid_t child;
  340: 
  341:   switch (child = fork())
  342:     {
  343:     case -1:
  344:       zlog_err("fork failed, cannot run command [%s]: %s",
  345: 	       shell_cmd,safe_strerror(errno));
  346:       return -1;
  347:     case 0:
  348:       /* Child process. */
  349:       /* Use separate process group so child processes can be killed easily. */
  350:       if (setpgid(0,0) < 0)
  351:         zlog_warn("warning: setpgid(0,0) failed: %s",safe_strerror(errno));
  352:       {
  353: 	char shell[] = "sh";
  354: 	char dashc[] = "-c";
  355:         char *const argv[4] = { shell, dashc, shell_cmd, NULL};
  356: 	execv("/bin/sh", argv);
  357: 	zlog_err("execv(/bin/sh -c '%s') failed: %s",
  358: 		 shell_cmd,safe_strerror(errno));
  359: 	_exit(127);
  360:       }
  361:     default:
  362:       /* Parent process: we will reap the child later. */
  363:       zlog_err("Forked background command [pid %d]: %s",(int)child,shell_cmd);
  364:       return child;
  365:     }
  366: }
  367: 
  368: static struct timeval *
  369: time_elapsed(struct timeval *result, const struct timeval *start_time)
  370: {
  371:   gettimeofday(result,NULL);
  372:   result->tv_sec -= start_time->tv_sec;
  373:   result->tv_usec -= start_time->tv_usec;
  374:   while (result->tv_usec < 0)
  375:     {
  376:       result->tv_usec += 1000000L;
  377:       result->tv_sec--;
  378:     }
  379:   return result;
  380: }
  381: 
  382: static int
  383: restart_kill(struct thread *t_kill)
  384: {
  385:   struct restart_info *restart = THREAD_ARG(t_kill);
  386:   struct timeval delay;
  387: 
  388:   time_elapsed(&delay,&restart->time);
  389:   zlog_warn("Warning: %s %s child process %d still running after "
  390: 	    "%ld seconds, sending signal %d",
  391: 	    restart->what,restart->name,(int)restart->pid, (long)delay.tv_sec,
  392: 	    (restart->kills ? SIGKILL : SIGTERM));
  393:   kill(-restart->pid,(restart->kills ? SIGKILL : SIGTERM));
  394:   restart->kills++;
  395:   restart->t_kill = thread_add_timer(master,restart_kill,restart,
  396: 				     gs.restart_timeout);
  397:   return 0;
  398: }
  399: 
  400: static struct restart_info *
  401: find_child(pid_t child)
  402: {
  403:   if (gs.mode == MODE_GLOBAL_RESTART)
  404:     {
  405:       if (gs.restart.pid == child)
  406:         return &gs.restart;
  407:     }
  408:   else
  409:     {
  410:       struct daemon *dmn;
  411:       for (dmn = gs.daemons; dmn; dmn = dmn->next)
  412:         {
  413: 	  if (dmn->restart.pid == child)
  414: 	    return &dmn->restart;
  415:         }
  416:     }
  417:   return NULL;
  418: }
  419: 
  420: static void
  421: sigchild(void)
  422: {
  423:   pid_t child;
  424:   int status;
  425:   const char *name;
  426:   const char *what;
  427:   struct restart_info *restart;
  428: 
  429:   switch (child = waitpid(-1,&status,WNOHANG)) 
  430:     {
  431:     case -1:
  432:       zlog_err("waitpid failed: %s",safe_strerror(errno));
  433:       return;
  434:     case 0:
  435:       zlog_warn("SIGCHLD received, but waitpid did not reap a child");
  436:       return;
  437:     }
  438: 
  439:   if ((restart = find_child(child)) != NULL)
  440:     {
  441:       name = restart->name;
  442:       what = restart->what;
  443:       restart->pid = 0;
  444:       gs.numpids--;
  445:       thread_cancel(restart->t_kill);
  446:       restart->t_kill = NULL;
  447:       /* Update restart time to reflect the time the command completed. */
  448:       gettimeofday(&restart->time,NULL);
  449:     }
  450:   else
  451:     {
  452:       zlog_err("waitpid returned status for an unknown child process %d",
  453: 	       (int)child);
  454:       name = "(unknown)";
  455:       what = "background";
  456:     }
  457:   if (WIFSTOPPED(status))
  458:       zlog_warn("warning: %s %s process %d is stopped",
  459: 		what,name,(int)child);
  460:   else if (WIFSIGNALED(status))
  461:     zlog_warn("%s %s process %d terminated due to signal %d",
  462: 	      what,name,(int)child,WTERMSIG(status));
  463:   else if (WIFEXITED(status))
  464:     {
  465:       if (WEXITSTATUS(status) != 0)
  466: 	zlog_warn("%s %s process %d exited with non-zero status %d",
  467: 		  what,name,(int)child,WEXITSTATUS(status));
  468:       else
  469: 	zlog_debug("%s %s process %d exited normally",what,name,(int)child);
  470:     }
  471:   else
  472:     zlog_err("cannot interpret %s %s process %d wait status 0x%x",
  473: 	     what,name,(int)child,status);
  474:   phase_check();
  475: }
  476: 
  477: static int
  478: run_job(struct restart_info *restart, const char *cmdtype, const char *command,
  479: 	int force, int update_interval)
  480: {
  481:   struct timeval delay;
  482: 
  483:   if (gs.loglevel > LOG_DEBUG+1)
  484:     zlog_debug("attempting to %s %s",cmdtype,restart->name);
  485: 
  486:   if (restart->pid)
  487:     {
  488:       if (gs.loglevel > LOG_DEBUG+1)
  489:         zlog_debug("cannot %s %s, previous pid %d still running",
  490: 		   cmdtype,restart->name,(int)restart->pid);
  491:       return -1;
  492:     }
  493: 
  494:   /* Note: time_elapsed test must come before the force test, since we need
  495:      to make sure that delay is initialized for use below in updating the
  496:      restart interval. */
  497:   if ((time_elapsed(&delay,&restart->time)->tv_sec < restart->interval) &&
  498:       !force)
  499:     {
  500:       if (gs.loglevel > LOG_DEBUG+1)
  501:         zlog_debug("postponing %s %s: "
  502: 		   "elapsed time %ld < retry interval %ld",
  503: 		   cmdtype,restart->name,(long)delay.tv_sec,restart->interval);
  504:       return -1;
  505:     }
  506: 
  507:   gettimeofday(&restart->time,NULL);
  508:   restart->kills = 0;
  509:   {
  510:     char cmd[strlen(command)+strlen(restart->name)+1];
  511:     snprintf(cmd,sizeof(cmd),command,restart->name);
  512:     if ((restart->pid = run_background(cmd)) > 0)
  513:       {
  514: 	restart->t_kill = thread_add_timer(master,restart_kill,restart,
  515: 					   gs.restart_timeout);
  516: 	restart->what = cmdtype;
  517: 	gs.numpids++;
  518:       }
  519:     else
  520:       restart->pid = 0;
  521:   }
  522: 
  523:   /* Calculate the new restart interval. */
  524:   if (update_interval)
  525:     {
  526:       if (delay.tv_sec > 2*gs.max_restart_interval)
  527: 	restart->interval = gs.min_restart_interval;
  528:       else if ((restart->interval *= 2) > gs.max_restart_interval)
  529: 	restart->interval = gs.max_restart_interval;
  530:       if (gs.loglevel > LOG_DEBUG+1)
  531: 	zlog_debug("restart %s interval is now %ld",
  532: 		   restart->name,restart->interval);
  533:     }
  534:   return restart->pid;
  535: }
  536: 
  537: #define SET_READ_HANDLER(DMN) \
  538:   (DMN)->t_read = thread_add_read(master,handle_read,(DMN),(DMN)->fd)
  539: 
  540: #define SET_WAKEUP_DOWN(DMN)	\
  541:   (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_down,(DMN),	\
  542:     					  FUZZY(gs.period))
  543: 
  544: #define SET_WAKEUP_UNRESPONSIVE(DMN)	\
  545:   (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_unresponsive,(DMN), \
  546:     					  FUZZY(gs.period))
  547: 
  548: #define SET_WAKEUP_ECHO(DMN) \
  549:   (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_send_echo,(DMN), \
  550: 					  FUZZY(gs.period))
  551: 
  552: static int
  553: wakeup_down(struct thread *t_wakeup)
  554: {
  555:   struct daemon *dmn = THREAD_ARG(t_wakeup);
  556: 
  557:   dmn->t_wakeup = NULL;
  558:   if (try_connect(dmn) < 0)
  559:     SET_WAKEUP_DOWN(dmn);
  560:   if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
  561:     try_restart(dmn);
  562:   return 0;
  563: }
  564: 
  565: static int
  566: wakeup_init(struct thread *t_wakeup)
  567: {
  568:   struct daemon *dmn = THREAD_ARG(t_wakeup);
  569: 
  570:   dmn->t_wakeup = NULL;
  571:   if (try_connect(dmn) < 0)
  572:     {
  573:       SET_WAKEUP_DOWN(dmn);
  574:       zlog_err("%s state -> down : initial connection attempt failed",
  575: 	       dmn->name);
  576:       dmn->state = DAEMON_DOWN;
  577:     }
  578:   return 0;
  579: }
  580: 
  581: static void
  582: daemon_down(struct daemon *dmn, const char *why)
  583: {
  584:   if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
  585:     zlog_err("%s state -> down : %s",dmn->name,why);
  586:   else if (gs.loglevel > LOG_DEBUG)
  587:     zlog_debug("%s still down : %s",dmn->name,why);
  588:   if (IS_UP(dmn))
  589:     gs.numdown++;
  590:   dmn->state = DAEMON_DOWN;
  591:   if (dmn->fd >= 0)
  592:     {
  593:       close(dmn->fd);
  594:       dmn->fd = -1;
  595:     }
  596:   THREAD_OFF(dmn->t_read);
  597:   THREAD_OFF(dmn->t_write);
  598:   THREAD_OFF(dmn->t_wakeup);
  599:   if (try_connect(dmn) < 0)
  600:     SET_WAKEUP_DOWN(dmn);
  601:   phase_check();
  602: }
  603: 
  604: static int
  605: handle_read(struct thread *t_read)
  606: {
  607:   struct daemon *dmn = THREAD_ARG(t_read);
  608:   static const char resp[sizeof(PING_TOKEN)+4] = PING_TOKEN "\n";
  609:   char buf[sizeof(resp)+100];
  610:   ssize_t rc;
  611:   struct timeval delay;
  612: 
  613:   dmn->t_read = NULL;
  614:   if ((rc = read(dmn->fd,buf,sizeof(buf))) < 0)
  615:     {
  616:       char why[100];
  617: 
  618:       if (ERRNO_IO_RETRY(errno))
  619: 	{
  620: 	  /* Pretend it never happened. */
  621: 	  SET_READ_HANDLER(dmn);
  622: 	  return 0;
  623: 	}
  624:       snprintf(why,sizeof(why),"unexpected read error: %s",
  625: 	       safe_strerror(errno));
  626:       daemon_down(dmn,why);
  627:       return 0;
  628:     }
  629:   if (rc == 0)
  630:     {
  631:       daemon_down(dmn,"read returned EOF");
  632:       return 0;
  633:     }
  634:   if (!dmn->echo_sent.tv_sec)
  635:     {
  636:       char why[sizeof(buf)+100];
  637:       snprintf(why,sizeof(why),"unexpected read returns %d bytes: %.*s",
  638: 	       (int)rc,(int)rc,buf);
  639:       daemon_down(dmn,why);
  640:       return 0;
  641:     }
  642: 
  643:   /* We are expecting an echo response: is there any chance that the
  644:      response would not be returned entirely in the first read?  That
  645:      seems inconceivable... */
  646:   if ((rc != sizeof(resp)) || memcmp(buf,resp,sizeof(resp)))
  647:     {
  648:       char why[100+sizeof(buf)];
  649:       snprintf(why,sizeof(why),"read returned bad echo response of %d bytes "
  650: 			       "(expecting %u): %.*s",
  651: 	       (int)rc,(u_int)sizeof(resp),(int)rc,buf);
  652:       daemon_down(dmn,why);
  653:       return 0;
  654:     }
  655: 
  656:   time_elapsed(&delay,&dmn->echo_sent);
  657:   dmn->echo_sent.tv_sec = 0;
  658:   if (dmn->state == DAEMON_UNRESPONSIVE)
  659:     {
  660:       if (delay.tv_sec < gs.timeout)
  661: 	{
  662: 	  dmn->state = DAEMON_UP;
  663: 	  zlog_warn("%s state -> up : echo response received after %ld.%06ld "
  664: 		    "seconds", dmn->name,
  665: 		    (long)delay.tv_sec, (long)delay.tv_usec);
  666: 	}
  667:       else
  668: 	zlog_warn("%s: slow echo response finally received after %ld.%06ld "
  669: 		  "seconds", dmn->name,
  670: 		  (long)delay.tv_sec, (long)delay.tv_usec);
  671:     }
  672:   else if (gs.loglevel > LOG_DEBUG+1)
  673:     zlog_debug("%s: echo response received after %ld.%06ld seconds",
  674: 	       dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
  675: 
  676:   SET_READ_HANDLER(dmn);
  677:   if (dmn->t_wakeup)
  678:     thread_cancel(dmn->t_wakeup);
  679:   SET_WAKEUP_ECHO(dmn);
  680: 
  681:   return 0;
  682: }
  683: 
  684: static void
  685: daemon_up(struct daemon *dmn, const char *why)
  686: {
  687:   dmn->state = DAEMON_UP;
  688:   gs.numdown--;
  689:   dmn->connect_tries = 0;
  690:   zlog_notice("%s state -> up : %s",dmn->name,why);
  691:   if (gs.do_ping)
  692:     SET_WAKEUP_ECHO(dmn);
  693:   phase_check();
  694: }
  695: 
  696: static int
  697: check_connect(struct thread *t_write)
  698: {
  699:   struct daemon *dmn = THREAD_ARG(t_write);
  700:   int sockerr;
  701:   socklen_t reslen = sizeof(sockerr);
  702: 
  703:   dmn->t_write = NULL;
  704:   if (getsockopt(dmn->fd,SOL_SOCKET,SO_ERROR,(char *)&sockerr,&reslen) < 0)
  705:     {
  706:       zlog_warn("%s: check_connect: getsockopt failed: %s",
  707: 	        dmn->name,safe_strerror(errno));
  708:       daemon_down(dmn,"getsockopt failed checking connection success");
  709:       return 0;
  710:     }
  711:   if ((reslen == sizeof(sockerr)) && sockerr)
  712:     {
  713:       char why[100];
  714:       snprintf(why,sizeof(why),
  715: 	       "getsockopt reports that connection attempt failed: %s",
  716: 	       safe_strerror(sockerr));
  717:       daemon_down(dmn,why);
  718:       return 0;
  719:     }
  720: 
  721:   daemon_up(dmn,"delayed connect succeeded");
  722:   return 0;
  723: }
  724: 
  725: static int
  726: wakeup_connect_hanging(struct thread *t_wakeup)
  727: {
  728:   struct daemon *dmn = THREAD_ARG(t_wakeup);
  729:   char why[100];
  730: 
  731:   dmn->t_wakeup = NULL;
  732:   snprintf(why,sizeof(why),"connection attempt timed out after %ld seconds",
  733: 	   gs.timeout);
  734:   daemon_down(dmn,why);
  735:   return 0;
  736: }
  737: 
  738: /* Making connection to protocol daemon. */
  739: static int
  740: try_connect(struct daemon *dmn)
  741: {
  742:   int sock;
  743:   struct sockaddr_un addr;
  744:   socklen_t len;
  745: 
  746:   if (gs.loglevel > LOG_DEBUG+1)
  747:     zlog_debug("%s: attempting to connect",dmn->name);
  748:   dmn->connect_tries++;
  749: 
  750:   memset (&addr, 0, sizeof (struct sockaddr_un));
  751:   addr.sun_family = AF_UNIX;
  752:   snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty",
  753: 	   gs.vtydir,dmn->name);
  754: #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
  755:   len = addr.sun_len = SUN_LEN(&addr);
  756: #else
  757:   len = sizeof (addr.sun_family) + strlen (addr.sun_path);
  758: #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
  759: 
  760:   /* Quick check to see if we might succeed before we go to the trouble
  761:      of creating a socket. */
  762:   if (access(addr.sun_path, W_OK) < 0)
  763:     {
  764:       if (errno != ENOENT)
  765:         zlog_err("%s: access to socket %s denied: %s",
  766: 		dmn->name,addr.sun_path,safe_strerror(errno));
  767:       return -1;
  768:     }
  769: 
  770:   if ((sock = socket (AF_UNIX, SOCK_STREAM, 0)) < 0)
  771:     {
  772:       zlog_err("%s(%s): cannot make socket: %s",
  773: 	       __func__,addr.sun_path, safe_strerror(errno));
  774:       return -1;
  775:     }
  776: 
  777:   if (set_nonblocking(sock) < 0)
  778:     {
  779:       zlog_err("%s(%s): set_nonblocking(%d) failed",
  780: 	       __func__, addr.sun_path, sock);
  781:       close(sock);
  782:       return -1;
  783:     }
  784: 
  785:   if (connect (sock, (struct sockaddr *) &addr, len) < 0)
  786:     {
  787:       if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK))
  788: 	{
  789: 	  if (gs.loglevel > LOG_DEBUG)
  790: 	    zlog_debug("%s(%s): connect failed: %s",
  791: 		       __func__,addr.sun_path, safe_strerror(errno));
  792: 	  close (sock);
  793: 	  return -1;
  794: 	}
  795:       if (gs.loglevel > LOG_DEBUG)
  796: 	zlog_debug("%s: connection in progress",dmn->name);
  797:       dmn->state = DAEMON_CONNECTING;
  798:       dmn->fd = sock;
  799:       dmn->t_write = thread_add_write(master,check_connect,dmn,dmn->fd);
  800:       dmn->t_wakeup = thread_add_timer(master,wakeup_connect_hanging,dmn,
  801: 				       gs.timeout);
  802:       SET_READ_HANDLER(dmn);
  803:       return 0;
  804:     }
  805: 
  806:   dmn->fd = sock;
  807:   SET_READ_HANDLER(dmn);
  808:   daemon_up(dmn,"connect succeeded");
  809:   return 1;
  810: }
  811: 
  812: static int
  813: phase_hanging(struct thread *t_hanging)
  814: {
  815:   gs.t_phase_hanging = NULL;
  816:   zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
  817:            phase_str[gs.phase],PHASE_TIMEOUT);
  818:   gs.phase = PHASE_NONE;
  819:   return 0;
  820: }
  821: 
  822: static void
  823: set_phase(restart_phase_t new_phase)
  824: {
  825:   gs.phase = new_phase;
  826:   if (gs.t_phase_hanging)
  827:     thread_cancel(gs.t_phase_hanging);
  828:   gs.t_phase_hanging = thread_add_timer(master,phase_hanging,NULL,
  829:   					PHASE_TIMEOUT);
  830: }
  831: 
  832: static void
  833: phase_check(void)
  834: {
  835:   switch (gs.phase)
  836:     {
  837:     case PHASE_NONE:
  838:       break;
  839:     case PHASE_STOPS_PENDING:
  840:       if (gs.numpids)
  841: 	break;
  842:       zlog_info("Phased restart: all routing daemon stop jobs have completed.");
  843:       set_phase(PHASE_WAITING_DOWN);
  844:       /*FALLTHRU*/
  845:     case PHASE_WAITING_DOWN:
  846:       if (gs.numdown+IS_UP(gs.special) < gs.numdaemons)
  847:         break;
  848:       zlog_info("Phased restart: all routing daemons now down.");
  849:       run_job(&gs.special->restart,"restart",gs.restart_command,1,1);
  850:       set_phase(PHASE_ZEBRA_RESTART_PENDING);
  851:       /*FALLTHRU*/
  852:     case PHASE_ZEBRA_RESTART_PENDING:
  853:       if (gs.special->restart.pid)
  854: 	break;
  855:       zlog_info("Phased restart: %s restart job completed.",gs.special->name);
  856:       set_phase(PHASE_WAITING_ZEBRA_UP);
  857:       /*FALLTHRU*/
  858:     case PHASE_WAITING_ZEBRA_UP:
  859:       if (!IS_UP(gs.special))
  860:         break;
  861:       zlog_info("Phased restart: %s is now up.",gs.special->name);
  862:       {
  863:         struct daemon *dmn;
  864: 	for (dmn = gs.daemons; dmn; dmn = dmn->next)
  865: 	  {
  866: 	    if (dmn != gs.special)
  867: 	      run_job(&dmn->restart,"start",gs.start_command,1,0);
  868: 	  }
  869:       }
  870:       gs.phase = PHASE_NONE;
  871:       THREAD_OFF(gs.t_phase_hanging);
  872:       zlog_notice("Phased global restart has completed.");
  873:       break;
  874:     }
  875: }
  876: 
  877: static void
  878: try_restart(struct daemon *dmn)
  879: {
  880:   switch (gs.mode)
  881:   {
  882:   case MODE_MONITOR:
  883:     return;
  884:   case MODE_GLOBAL_RESTART:
  885:     run_job(&gs.restart,"restart",gs.restart_command,0,1);
  886:     break;
  887:   case MODE_SEPARATE_RESTART:
  888:     run_job(&dmn->restart,"restart",gs.restart_command,0,1);
  889:     break;
  890:   case MODE_PHASED_ZEBRA_RESTART:
  891:     if (dmn != gs.special)
  892:       {
  893:         if ((gs.special->state == DAEMON_UP) && (gs.phase == PHASE_NONE))
  894: 	  run_job(&dmn->restart,"restart",gs.restart_command,0,1);
  895: 	else
  896: 	  zlog_debug("%s: postponing restart attempt because master %s daemon "
  897: 		     "not up [%s], or phased restart in progress",
  898: 		     dmn->name,gs.special->name,state_str[gs.special->state]);
  899: 	break;
  900:       }
  901:     /*FALLTHRU*/
  902:   case MODE_PHASED_ALL_RESTART:
  903:     if ((gs.phase != PHASE_NONE) || gs.numpids)
  904:       {
  905: 	if (gs.loglevel > LOG_DEBUG+1)
  906: 	  zlog_debug("postponing phased global restart: restart already in "
  907: 		     "progress [%s], or outstanding child processes [%d]",
  908: 		     phase_str[gs.phase],gs.numpids);
  909:         break;
  910:       }
  911:     /* Is it too soon for a restart? */
  912:     {
  913:       struct timeval delay;
  914:       if (time_elapsed(&delay,&gs.special->restart.time)->tv_sec <
  915:       	  gs.special->restart.interval)
  916: 	{
  917: 	  if (gs.loglevel > LOG_DEBUG+1)
  918: 	    zlog_debug("postponing phased global restart: "
  919: 		       "elapsed time %ld < retry interval %ld",
  920: 		       (long)delay.tv_sec,gs.special->restart.interval);
  921: 	  break;
  922: 	}
  923:     }
  924:     zlog_info("Phased restart: stopping all routing daemons.");
  925:     /* First step: stop all other daemons. */
  926:     for (dmn = gs.daemons; dmn; dmn = dmn->next)
  927:       {
  928:         if (dmn != gs.special)
  929: 	  run_job(&dmn->restart,"stop",gs.stop_command,1,1);
  930:       }
  931:     set_phase(PHASE_STOPS_PENDING);
  932:     break;
  933:   default:
  934:     zlog_err("error: unknown restart mode %d",gs.mode);
  935:     break;
  936:   }
  937: }
  938: 
  939: static int
  940: wakeup_unresponsive(struct thread *t_wakeup)
  941: {
  942:   struct daemon *dmn = THREAD_ARG(t_wakeup);
  943: 
  944:   dmn->t_wakeup = NULL;
  945:   if (dmn->state != DAEMON_UNRESPONSIVE)
  946:     zlog_err("%s: no longer unresponsive (now %s), "
  947: 	     "wakeup should have been cancelled!",
  948: 	     dmn->name,state_str[dmn->state]);
  949:   else
  950:     {
  951:       SET_WAKEUP_UNRESPONSIVE(dmn);
  952:       try_restart(dmn);
  953:     }
  954:   return 0;
  955: }
  956: 
  957: static int
  958: wakeup_no_answer(struct thread *t_wakeup)
  959: {
  960:   struct daemon *dmn = THREAD_ARG(t_wakeup);
  961: 
  962:   dmn->t_wakeup = NULL;
  963:   dmn->state = DAEMON_UNRESPONSIVE;
  964:   zlog_err("%s state -> unresponsive : no response yet to ping "
  965: 	   "sent %ld seconds ago",dmn->name,gs.timeout);
  966:   if (gs.unresponsive_restart)
  967:     {
  968:       SET_WAKEUP_UNRESPONSIVE(dmn);
  969:       try_restart(dmn);
  970:     }
  971:   return 0;
  972: }
  973: 
  974: static int
  975: wakeup_send_echo(struct thread *t_wakeup)
  976: {
  977:   static const char echocmd[] = "echo " PING_TOKEN;
  978:   ssize_t rc;
  979:   struct daemon *dmn = THREAD_ARG(t_wakeup);
  980: 
  981:   dmn->t_wakeup = NULL;
  982:   if (((rc = write(dmn->fd,echocmd,sizeof(echocmd))) < 0) ||
  983:       ((size_t)rc != sizeof(echocmd)))
  984:     {
  985:       char why[100+sizeof(echocmd)];
  986:       snprintf(why,sizeof(why),"write '%s' returned %d instead of %u",
  987:                echocmd,(int)rc,(u_int)sizeof(echocmd));
  988:       daemon_down(dmn,why);
  989:     }
  990:   else
  991:     {
  992:       gettimeofday(&dmn->echo_sent,NULL);
  993:       dmn->t_wakeup = thread_add_timer(master,wakeup_no_answer,dmn,gs.timeout);
  994:     }
  995:   return 0;
  996: }
  997: 
  998: static void
  999: sigint(void)
 1000: {
 1001:   zlog_notice("Terminating on signal");
 1002:   exit(0);
 1003: }
 1004: 
 1005: static int
 1006: valid_command(const char *cmd)
 1007: {
 1008:   char *p;
 1009: 
 1010:   return ((p = strchr(cmd,'%')) != NULL) && (*(p+1) == 's') && !strchr(p+1,'%');
 1011: }
 1012: 
 1013: /* This is an ugly hack to circumvent problems with passing command-line
 1014:    arguments that contain spaces.  The fix is to use a configuration file. */
 1015: static char *
 1016: translate_blanks(const char *cmd, const char *blankstr)
 1017: {
 1018:   char *res;
 1019:   char *p;
 1020:   size_t bslen = strlen(blankstr);
 1021: 
 1022:   if (!(res = strdup(cmd)))
 1023:     {
 1024:       perror("strdup");
 1025:       exit(1);
 1026:     }
 1027:   while ((p = strstr(res,blankstr)) != NULL)
 1028:     {
 1029:       *p = ' ';
 1030:       if (bslen != 1)
 1031: 	 memmove(p+1,p+bslen,strlen(p+bslen)+1);
 1032:     }
 1033:   return res;
 1034: }
 1035: 
 1036: int
 1037: main(int argc, char **argv)
 1038: {
 1039:   const char *progname;
 1040:   int opt;
 1041:   int daemon_mode = 0;
 1042:   const char *pidfile = DEFAULT_PIDFILE;
 1043:   const char *special = "zebra";
 1044:   const char *blankstr = NULL;
 1045:   static struct quagga_signal_t my_signals[] =
 1046:   {
 1047:     {
 1048:       .signal = SIGINT,
 1049:       .handler = sigint,
 1050:     },
 1051:     {
 1052:       .signal = SIGTERM,
 1053:       .handler = sigint,
 1054:     },
 1055:     {
 1056:       .signal = SIGCHLD,
 1057:       .handler = sigchild,
 1058:     },
 1059:   };
 1060: 
 1061:   if ((progname = strrchr (argv[0], '/')) != NULL)
 1062:     progname++;
 1063:   else
 1064:     progname = argv[0];
 1065: 
 1066:   gs.restart.name = "all";
 1067:   while ((opt = getopt_long(argc, argv, "aAb:dek:l:m:M:i:p:r:R:S:s:t:T:zvh",
 1068: 			    longopts, 0)) != EOF)
 1069:     {
 1070:       switch (opt)
 1071:         {
 1072: 	case 0:
 1073: 	  break;
 1074:         case 'a':
 1075: 	  if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
 1076: 	    {
 1077: 	      fputs("Ambiguous operating mode selected.\n",stderr);
 1078: 	      return usage(progname,1);
 1079: 	    }
 1080: 	  gs.mode = MODE_PHASED_ZEBRA_RESTART;
 1081: 	  break;
 1082:         case 'A':
 1083: 	  if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
 1084: 	    {
 1085: 	      fputs("Ambiguous operating mode selected.\n",stderr);
 1086: 	      return usage(progname,1);
 1087: 	    }
 1088: 	  gs.mode = MODE_PHASED_ALL_RESTART;
 1089: 	  break;
 1090: 	case 'b':
 1091: 	  blankstr = optarg;
 1092: 	  break;
 1093:         case 'd':
 1094: 	  daemon_mode = 1;
 1095: 	  break;
 1096:         case 'e':
 1097: 	  gs.do_ping = 0;
 1098: 	  break;
 1099:         case 'k':
 1100: 	  if (!valid_command(optarg))
 1101: 	  {
 1102: 	    fprintf(stderr,"Invalid kill command, must contain '%%s': %s\n",
 1103: 		    optarg);
 1104: 	    return usage(progname,1);
 1105: 	  }
 1106: 	  gs.stop_command = optarg;
 1107: 	  break;
 1108: 	case 'l':
 1109: 	  {
 1110: 	    char garbage[3];
 1111: 	    if ((sscanf(optarg,"%d%1s",&gs.loglevel,garbage) != 1) ||
 1112: 	        (gs.loglevel < LOG_EMERG))
 1113: 	      {
 1114: 	        fprintf(stderr,"Invalid loglevel argument: %s\n",optarg);
 1115: 		return usage(progname,1);
 1116: 	      }
 1117: 	  }
 1118: 	  break;
 1119: 	case 'm':
 1120: 	  {
 1121: 	    char garbage[3];
 1122: 	    if ((sscanf(optarg,"%ld%1s",
 1123: 	    		&gs.min_restart_interval,garbage) != 1) ||
 1124: 	        (gs.min_restart_interval < 0))
 1125: 	      {
 1126: 	        fprintf(stderr,"Invalid min_restart_interval argument: %s\n",
 1127: 		        optarg);
 1128: 		return usage(progname,1);
 1129: 	      }
 1130: 	  }
 1131: 	  break;
 1132: 	case 'M':
 1133: 	  {
 1134: 	    char garbage[3];
 1135: 	    if ((sscanf(optarg,"%ld%1s",
 1136: 	    		&gs.max_restart_interval,garbage) != 1) ||
 1137: 	        (gs.max_restart_interval < 0))
 1138: 	      {
 1139: 	        fprintf(stderr,"Invalid max_restart_interval argument: %s\n",
 1140: 		        optarg);
 1141: 		return usage(progname,1);
 1142: 	      }
 1143: 	  }
 1144: 	  break;
 1145: 	case 'i':
 1146: 	  {
 1147: 	    char garbage[3];
 1148: 	    int period;
 1149: 	    if ((sscanf(optarg,"%d%1s",&period,garbage) != 1) ||
 1150: 	        (gs.period < 1))
 1151: 	      {
 1152: 	        fprintf(stderr,"Invalid interval argument: %s\n",optarg);
 1153: 		return usage(progname,1);
 1154: 	      }
 1155: 	    gs.period = 1000*period;
 1156: 	  }
 1157: 	  break;
 1158:         case 'p':
 1159: 	  pidfile = optarg;
 1160: 	  break;
 1161:         case 'r':
 1162: 	  if ((gs.mode == MODE_GLOBAL_RESTART) ||
 1163: 	      (gs.mode == MODE_SEPARATE_RESTART))
 1164: 	    {
 1165: 	      fputs("Ambiguous operating mode selected.\n",stderr);
 1166: 	      return usage(progname,1);
 1167: 	    }
 1168: 	  if (!valid_command(optarg))
 1169: 	  {
 1170: 	    fprintf(stderr,
 1171: 		    "Invalid restart command, must contain '%%s': %s\n",
 1172: 		    optarg);
 1173: 	    return usage(progname,1);
 1174: 	  }
 1175: 	  gs.restart_command = optarg;
 1176: 	  if (gs.mode == MODE_MONITOR)
 1177: 	    gs.mode = MODE_SEPARATE_RESTART;
 1178: 	  break;
 1179:         case 'R':
 1180: 	  if (gs.mode != MODE_MONITOR)
 1181: 	    {
 1182: 	      fputs("Ambiguous operating mode selected.\n",stderr);
 1183: 	      return usage(progname,1);
 1184: 	    }
 1185: 	  if (strchr(optarg,'%'))
 1186: 	    {
 1187: 	      fprintf(stderr,
 1188: 		      "Invalid restart-all arg, must not contain '%%s': %s\n",
 1189: 		      optarg);
 1190: 	      return usage(progname,1);
 1191: 	    }
 1192: 	  gs.restart_command = optarg;
 1193: 	  gs.mode = MODE_GLOBAL_RESTART;
 1194: 	  break;
 1195:         case 's':
 1196: 	  if (!valid_command(optarg))
 1197: 	  {
 1198: 	    fprintf(stderr,"Invalid start command, must contain '%%s': %s\n",
 1199: 		    optarg);
 1200: 	    return usage(progname,1);
 1201: 	  }
 1202: 	  gs.start_command = optarg;
 1203: 	  break;
 1204: 	case 'S':
 1205: 	  gs.vtydir = optarg;
 1206: 	  break;
 1207: 	case 't':
 1208: 	  {
 1209: 	    char garbage[3];
 1210: 	    if ((sscanf(optarg,"%ld%1s",&gs.timeout,garbage) != 1) ||
 1211: 	        (gs.timeout < 1))
 1212: 	      {
 1213: 	        fprintf(stderr,"Invalid timeout argument: %s\n",optarg);
 1214: 		return usage(progname,1);
 1215: 	      }
 1216: 	  }
 1217: 	  break;
 1218: 	case 'T':
 1219: 	  {
 1220: 	    char garbage[3];
 1221: 	    if ((sscanf(optarg,"%ld%1s",&gs.restart_timeout,garbage) != 1) ||
 1222: 	        (gs.restart_timeout < 1))
 1223: 	      {
 1224: 	        fprintf(stderr,"Invalid restart timeout argument: %s\n",optarg);
 1225: 		return usage(progname,1);
 1226: 	      }
 1227: 	  }
 1228: 	  break;
 1229:         case 'z':
 1230: 	  gs.unresponsive_restart = 1;
 1231: 	  break;
 1232: 	case 'v':
 1233: 	  printf ("%s version %s\n", progname, QUAGGA_VERSION);
 1234: 	  puts("Copyright 2004 Andrew J. Schorr");
 1235: 	  return 0;
 1236:         case 'h':
 1237: 	  return usage(progname,0);
 1238:         default:
 1239: 	  fputs("Invalid option.\n",stderr);
 1240: 	  return usage(progname,1);
 1241:         }
 1242:     }
 1243:   
 1244:   if (gs.unresponsive_restart && (gs.mode == MODE_MONITOR))
 1245:     {
 1246:       fputs("Option -z requires a -r or -R restart option.\n",stderr);
 1247:       return usage(progname,1);
 1248:     }
 1249:   switch (gs.mode)
 1250:     {
 1251:     case MODE_MONITOR:
 1252:       if (gs.restart_command || gs.start_command || gs.stop_command)
 1253:         {
 1254: 	  fprintf(stderr,"No kill/(re)start commands needed for %s mode.\n",
 1255: 		  mode_str[gs.mode]);
 1256: 	  return usage(progname,1);
 1257: 	}
 1258:       break;
 1259:     case MODE_GLOBAL_RESTART:
 1260:     case MODE_SEPARATE_RESTART:
 1261:       if (!gs.restart_command || gs.start_command || gs.stop_command)
 1262:         {
 1263: 	  fprintf(stderr,"No start/kill commands needed in [%s] mode.\n",
 1264: 		  mode_str[gs.mode]);
 1265: 	  return usage(progname,1);
 1266: 	}
 1267:       break;
 1268:     case MODE_PHASED_ZEBRA_RESTART:
 1269:     case MODE_PHASED_ALL_RESTART:
 1270:       if (!gs.restart_command || !gs.start_command || !gs.stop_command)
 1271:         {
 1272: 	  fprintf(stderr,
 1273: 	  	  "Need start, kill, and restart commands in [%s] mode.\n",
 1274: 		  mode_str[gs.mode]);
 1275: 	  return usage(progname,1);
 1276: 	}
 1277:       break;
 1278:     }
 1279: 
 1280:   if (blankstr)
 1281:     {
 1282:       if (gs.restart_command)
 1283:         gs.restart_command = translate_blanks(gs.restart_command,blankstr);
 1284:       if (gs.start_command)
 1285:         gs.start_command = translate_blanks(gs.start_command,blankstr);
 1286:       if (gs.stop_command)
 1287:         gs.stop_command = translate_blanks(gs.stop_command,blankstr);
 1288:     }
 1289:       
 1290:   gs.restart.interval = gs.min_restart_interval;
 1291:   master = thread_master_create();
 1292:   signal_init (master, array_size(my_signals), my_signals);
 1293:   srandom(time(NULL));
 1294: 
 1295:   {
 1296:     int i;
 1297:     struct daemon *tail = NULL;
 1298: 
 1299:     for (i = optind; i < argc; i++)
 1300:       {
 1301: 	struct daemon *dmn;
 1302: 
 1303: 	if (!(dmn = (struct daemon *)calloc(1,sizeof(*dmn))))
 1304: 	  {
 1305: 	    fprintf(stderr,"calloc(1,%u) failed: %s\n",
 1306: 		    (u_int)sizeof(*dmn), safe_strerror(errno));
 1307: 	    return 1;
 1308: 	  }
 1309: 	dmn->name = dmn->restart.name = argv[i];
 1310: 	dmn->state = DAEMON_INIT;
 1311: 	gs.numdaemons++;
 1312: 	gs.numdown++;
 1313: 	dmn->fd = -1;
 1314: 	dmn->t_wakeup = thread_add_timer_msec(master,wakeup_init,dmn,
 1315: 					      100+(random() % 900));
 1316: 	dmn->restart.interval = gs.min_restart_interval;
 1317: 	if (tail)
 1318: 	  tail->next = dmn;
 1319: 	else
 1320: 	  gs.daemons = dmn;
 1321: 	tail = dmn;
 1322: 
 1323: 	if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
 1324: 	     (gs.mode == MODE_PHASED_ALL_RESTART)) &&
 1325: 	    !strcmp(dmn->name,special))
 1326: 	  gs.special = dmn;
 1327:       }
 1328:   }
 1329:   if (!gs.daemons)
 1330:     {
 1331:       fputs("Must specify one or more daemons to monitor.\n",stderr);
 1332:       return usage(progname,1);
 1333:     }
 1334:   if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
 1335:       (gs.mode == MODE_PHASED_ALL_RESTART)) && !gs.special)
 1336:     {
 1337:       fprintf(stderr,"In mode [%s], but cannot find master daemon %s\n",
 1338: 	      mode_str[gs.mode],special);
 1339:       return usage(progname,1);
 1340:     }
 1341:   if (gs.special && (gs.numdaemons < 2))
 1342:     {
 1343:       fprintf(stderr,"Mode [%s] does not make sense with only 1 daemon "
 1344: 		     "to watch.\n",mode_str[gs.mode]);
 1345:       return usage(progname,1);
 1346:     }
 1347: 
 1348:   zlog_default = openzlog(progname, ZLOG_NONE,
 1349: 			  LOG_CONS|LOG_NDELAY|LOG_PID, LOG_DAEMON);
 1350:   zlog_set_level(NULL, ZLOG_DEST_MONITOR, ZLOG_DISABLED);
 1351:   if (daemon_mode)
 1352:     {
 1353:       zlog_set_level(NULL, ZLOG_DEST_SYSLOG, MIN(gs.loglevel,LOG_DEBUG));
 1354:       if (daemon (0, 0) < 0)
 1355: 	{
 1356: 	  fprintf(stderr, "Watchquagga daemon failed: %s", strerror(errno));
 1357: 	  exit (1);
 1358: 	}
 1359:     }
 1360:   else
 1361:     zlog_set_level(NULL, ZLOG_DEST_STDOUT, MIN(gs.loglevel,LOG_DEBUG));
 1362: 
 1363:   /* Make sure we're not already running. */
 1364:   pid_output (pidfile);
 1365: 
 1366:   /* Announce which daemons are being monitored. */
 1367:   {
 1368:     struct daemon *dmn;
 1369:     size_t len = 0;
 1370: 
 1371:     for (dmn = gs.daemons; dmn; dmn = dmn->next)
 1372:       len += strlen(dmn->name)+1;
 1373: 
 1374:     {
 1375:       char buf[len+1];
 1376:       char *p = buf;
 1377: 
 1378:       for (dmn = gs.daemons; dmn; dmn = dmn->next)
 1379: 	{
 1380: 	  if (p != buf)
 1381: 	    *p++ = ' ';
 1382: 	  strcpy(p,dmn->name);
 1383: 	  p += strlen(p);
 1384: 	}
 1385:       zlog_notice("%s %s watching [%s], mode [%s]",
 1386:       		  progname, QUAGGA_VERSION, buf, mode_str[gs.mode]);
 1387:     }
 1388:   }
 1389: 
 1390:   {
 1391:     struct thread thread;
 1392: 
 1393:     while (thread_fetch (master, &thread))
 1394:       thread_call (&thread);
 1395:   }
 1396: 
 1397:   /* Not reached. */
 1398:   return 0;
 1399: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>