mirror of
				https://gitlab.labs.nic.cz/labs/bird.git
				synced 2024-05-11 16:54:54 +00:00 
			
		
		
		
	Implement latency tracking, internal event log and watchdog
This commit is contained in:
		@@ -41,6 +41,10 @@ struct config {
 | 
			
		||||
  u32 gr_wait;				/* Graceful restart wait timeout */
 | 
			
		||||
 | 
			
		||||
  int cli_debug;			/* Tracing of CLI connections and commands */
 | 
			
		||||
  int latency_debug;			/* I/O loop tracks duration of each event */
 | 
			
		||||
  u32 latency_limit;			/* Events with longer duration are logged (us) */
 | 
			
		||||
  u32 watchdog_warning;			/* I/O loop watchdog limit for warning (us) */
 | 
			
		||||
  u32 watchdog_timeout;			/* Watchdog timeout (in seconds, 0 = disabled) */
 | 
			
		||||
  char *err_msg;			/* Parser error message */
 | 
			
		||||
  int err_lino;				/* Line containing error */
 | 
			
		||||
  char *err_file_name;			/* File name containing error */
 | 
			
		||||
 
 | 
			
		||||
@@ -33,6 +33,10 @@
 | 
			
		||||
# Turn on global debugging of all protocols
 | 
			
		||||
#debug protocols all;
 | 
			
		||||
 | 
			
		||||
# Turn on internal watchdog
 | 
			
		||||
#watchdog warning 5 s;
 | 
			
		||||
#watchdog timeout 30 s;
 | 
			
		||||
 | 
			
		||||
# The direct protocol automatically generates device routes to
 | 
			
		||||
# all network interfaces. Can exist in as many instances as you wish
 | 
			
		||||
# if you want to populate multiple routing tables with device routes.
 | 
			
		||||
@@ -162,7 +166,7 @@ protocol static {
 | 
			
		||||
#               };
 | 
			
		||||
#	};
 | 
			
		||||
#}
 | 
			
		||||
		
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#protocol bgp {
 | 
			
		||||
#	disabled;
 | 
			
		||||
@@ -186,7 +190,7 @@ protocol static {
 | 
			
		||||
#	source address 198.51.100.14;	# What local address we use for the TCP connection
 | 
			
		||||
#	password "secret";	# Password used for MD5 authentication
 | 
			
		||||
#	rr client;		# I am a route reflector and the neighor is my client
 | 
			
		||||
#	rr cluster id 1.0.0.1;	# Use this value for cluster id instead of my router id 
 | 
			
		||||
#	rr cluster id 1.0.0.1;	# Use this value for cluster id instead of my router id
 | 
			
		||||
#	export where source=RTS_STATIC;
 | 
			
		||||
#	export filter {
 | 
			
		||||
#		if source = RTS_STATIC then {
 | 
			
		||||
@@ -202,7 +206,7 @@ protocol static {
 | 
			
		||||
#		reject;
 | 
			
		||||
#	};
 | 
			
		||||
#}
 | 
			
		||||
# 
 | 
			
		||||
#
 | 
			
		||||
# Template usage example
 | 
			
		||||
#template bgp rr_client {
 | 
			
		||||
#	disabled;
 | 
			
		||||
 
 | 
			
		||||
@@ -344,6 +344,23 @@ protocol rip {
 | 
			
		||||
	of connects and disconnects, 2 and higher for logging of all client
 | 
			
		||||
	commands). Default: 0.
 | 
			
		||||
 | 
			
		||||
	<tag>debug latency <m/switch/</tag>
 | 
			
		||||
	Activate tracking of elapsed time for internal events. Recent events
 | 
			
		||||
	could be examined using <cf/dump events/ command. Default: off.
 | 
			
		||||
 | 
			
		||||
	<tag>debug latency limit <m/time/</tag>
 | 
			
		||||
	If <cf/debug latency/ is enabled, this option allows to specify a limit
 | 
			
		||||
	for elapsed time. Events exceeding the limit are logged. Default: 1 s.
 | 
			
		||||
 | 
			
		||||
	<tag>watchdog warning <m/time/</tag>
 | 
			
		||||
	Set time limit for I/O loop cycle. If one iteration took more time to
 | 
			
		||||
	complete, a warning is logged. Default: 5 s.
 | 
			
		||||
 | 
			
		||||
	<tag>watchdog timeout <m/time/</tag>
 | 
			
		||||
	Set time limit for I/O loop cycle. If the limit is breached, BIRD is
 | 
			
		||||
	killed by abort signal. The timeout has effective granularity of
 | 
			
		||||
	seconds, zero means disabled. Default: disabled (0).
 | 
			
		||||
 | 
			
		||||
	<tag>mrtdump "<m/filename/"</tag>
 | 
			
		||||
	Set MRTdump file name. This option must be specified to allow MRTdump
 | 
			
		||||
	feature. Default: no dump file.
 | 
			
		||||
 
 | 
			
		||||
@@ -114,6 +114,8 @@ ev_schedule(event *e)
 | 
			
		||||
  ev_enqueue(&global_event_list, e);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void io_log_event(void *hook, void *data);
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * ev_run_list - run an event list
 | 
			
		||||
 * @l: an event list
 | 
			
		||||
@@ -132,6 +134,11 @@ ev_run_list(event_list *l)
 | 
			
		||||
  WALK_LIST_FIRST(n, tmp_list)
 | 
			
		||||
    {
 | 
			
		||||
      event *e = SKIP_BACK(event, n, n);
 | 
			
		||||
 | 
			
		||||
      /* This is ugly hack, we want to log just events executed from the main I/O loop */
 | 
			
		||||
      if (l == &global_event_list)
 | 
			
		||||
	io_log_event(e->hook, e->data);
 | 
			
		||||
 | 
			
		||||
      ev_run(e);
 | 
			
		||||
    }
 | 
			
		||||
  return !EMPTY_LIST(*l);
 | 
			
		||||
 
 | 
			
		||||
@@ -621,6 +621,8 @@ CF_CLI(DUMP RESOURCES,,, [[Dump all allocated resource]])
 | 
			
		||||
{ rdump(&root_pool); cli_msg(0, ""); } ;
 | 
			
		||||
CF_CLI(DUMP SOCKETS,,, [[Dump open sockets]])
 | 
			
		||||
{ sk_dump_all(); cli_msg(0, ""); } ;
 | 
			
		||||
CF_CLI(DUMP EVENTS,,, [[Dump event log]])
 | 
			
		||||
{ io_log_dump(); cli_msg(0, ""); } ;
 | 
			
		||||
CF_CLI(DUMP INTERFACES,,, [[Dump interface information]])
 | 
			
		||||
{ if_dump_all(); cli_msg(0, ""); } ;
 | 
			
		||||
CF_CLI(DUMP NEIGHBORS,,, [[Dump neighbor cache]])
 | 
			
		||||
 
 | 
			
		||||
@@ -15,6 +15,7 @@ CF_DECLS
 | 
			
		||||
 | 
			
		||||
CF_KEYWORDS(LOG, SYSLOG, ALL, DEBUG, TRACE, INFO, REMOTE, WARNING, ERROR, AUTH, FATAL, BUG, STDERR, SOFT)
 | 
			
		||||
CF_KEYWORDS(TIMEFORMAT, ISO, OLD, SHORT, LONG, BASE, NAME, CONFIRM, UNDO, CHECK, TIMEOUT)
 | 
			
		||||
CF_KEYWORDS(DEBUG, LATENCY, LIMIT, WATCHDOG, WARNING, TIMEOUT)
 | 
			
		||||
 | 
			
		||||
%type <i> log_mask log_mask_list log_cat cfg_timeout
 | 
			
		||||
%type <g> log_file
 | 
			
		||||
@@ -83,6 +84,7 @@ mrtdump_base:
 | 
			
		||||
   }
 | 
			
		||||
 ;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
CF_ADDTO(conf, timeformat_base)
 | 
			
		||||
 | 
			
		||||
timeformat_which:
 | 
			
		||||
@@ -104,6 +106,17 @@ timeformat_base:
 | 
			
		||||
   TIMEFORMAT timeformat_spec ';'
 | 
			
		||||
 ;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
CF_ADDTO(conf, debug_unix)
 | 
			
		||||
 | 
			
		||||
debug_unix:
 | 
			
		||||
   DEBUG LATENCY bool { new_config->latency_debug = $3; }
 | 
			
		||||
 | DEBUG LATENCY LIMIT expr_us { new_config->latency_limit = $4; }
 | 
			
		||||
 | WATCHDOG WARNING expr_us { new_config->watchdog_warning = $3; }
 | 
			
		||||
 | WATCHDOG TIMEOUT expr_us { new_config->watchdog_timeout = ($3 + 999999) TO_S; }
 | 
			
		||||
 ;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/* Unix specific commands */
 | 
			
		||||
 | 
			
		||||
CF_CLI_HELP(CONFIGURE, ..., [[Reload configuration]])
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										170
									
								
								sysdep/unix/io.c
									
									
									
									
									
								
							
							
						
						
									
										170
									
								
								sysdep/unix/io.c
									
									
									
									
									
								
							@@ -332,6 +332,8 @@ tm_first_shot(void)
 | 
			
		||||
  return x;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void io_log_event(void *hook, void *data);
 | 
			
		||||
 | 
			
		||||
static void
 | 
			
		||||
tm_shot(void)
 | 
			
		||||
{
 | 
			
		||||
@@ -372,6 +374,7 @@ tm_shot(void)
 | 
			
		||||
	    i = 0;
 | 
			
		||||
	  tm_start(t, i);
 | 
			
		||||
	}
 | 
			
		||||
      io_log_event(t->hook, t->data);
 | 
			
		||||
      t->hook(t);
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
@@ -1839,6 +1842,162 @@ sk_dump_all(void)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 *	Internal event log and watchdog
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
#define EVENT_LOG_LENGTH 32
 | 
			
		||||
 | 
			
		||||
struct event_log_entry
 | 
			
		||||
{
 | 
			
		||||
  void *hook;
 | 
			
		||||
  void *data;
 | 
			
		||||
  btime timestamp;
 | 
			
		||||
  btime duration;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
static struct event_log_entry event_log[EVENT_LOG_LENGTH];
 | 
			
		||||
static struct event_log_entry *event_open;
 | 
			
		||||
static int event_log_pos, event_log_num, watchdog_active;
 | 
			
		||||
static btime last_time;
 | 
			
		||||
static btime loop_time;
 | 
			
		||||
 | 
			
		||||
static void
 | 
			
		||||
io_update_time(void)
 | 
			
		||||
{
 | 
			
		||||
  struct timespec ts;
 | 
			
		||||
  int rv;
 | 
			
		||||
 | 
			
		||||
  if (!clock_monotonic_available)
 | 
			
		||||
    return;
 | 
			
		||||
 | 
			
		||||
  /*
 | 
			
		||||
   * This is third time-tracking procedure (after update_times() above and
 | 
			
		||||
   * times_update() in BFD), dedicated to internal event log and latency
 | 
			
		||||
   * tracking. Hopefully, we consolidate these sometimes.
 | 
			
		||||
   */
 | 
			
		||||
 | 
			
		||||
  rv = clock_gettime(CLOCK_MONOTONIC, &ts);
 | 
			
		||||
  if (rv < 0)
 | 
			
		||||
    die("clock_gettime: %m");
 | 
			
		||||
 | 
			
		||||
  last_time = ((s64) ts.tv_sec S) + (ts.tv_nsec / 1000);
 | 
			
		||||
 | 
			
		||||
  if (event_open)
 | 
			
		||||
  {
 | 
			
		||||
    event_open->duration = last_time - event_open->timestamp;
 | 
			
		||||
 | 
			
		||||
    if (event_open->duration > config->latency_limit)
 | 
			
		||||
      log(L_WARN "Event 0x%p 0x%p took %d ms",
 | 
			
		||||
	  event_open->hook, event_open->data, (int) (event_open->duration TO_MS));
 | 
			
		||||
 | 
			
		||||
    event_open = NULL;
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * io_log_event - mark approaching event into event log
 | 
			
		||||
 * @hook: event hook address
 | 
			
		||||
 * @data: event data address
 | 
			
		||||
 *
 | 
			
		||||
 * Store info (hook, data, timestamp) about the following internal event into
 | 
			
		||||
 * a circular event log (@event_log). When latency tracking is enabled, the log
 | 
			
		||||
 * entry is kept open (in @event_open) so the duration can be filled later.
 | 
			
		||||
 */
 | 
			
		||||
void
 | 
			
		||||
io_log_event(void *hook, void *data)
 | 
			
		||||
{
 | 
			
		||||
  if (config->latency_debug)
 | 
			
		||||
    io_update_time();
 | 
			
		||||
 | 
			
		||||
  struct event_log_entry *en = event_log + event_log_pos;
 | 
			
		||||
 | 
			
		||||
  en->hook = hook;
 | 
			
		||||
  en->data = data;
 | 
			
		||||
  en->timestamp = last_time;
 | 
			
		||||
  en->duration = 0;
 | 
			
		||||
 | 
			
		||||
  event_log_num++;
 | 
			
		||||
  event_log_pos++;
 | 
			
		||||
  event_log_pos %= EVENT_LOG_LENGTH;
 | 
			
		||||
 | 
			
		||||
  event_open = config->latency_debug ? en : NULL;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static inline void
 | 
			
		||||
io_close_event(void)
 | 
			
		||||
{
 | 
			
		||||
  if (event_open)
 | 
			
		||||
    io_update_time();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void
 | 
			
		||||
io_log_dump(void)
 | 
			
		||||
{
 | 
			
		||||
  int i;
 | 
			
		||||
 | 
			
		||||
  log(L_DEBUG "Event log:");
 | 
			
		||||
  for (i = 0; i < EVENT_LOG_LENGTH; i++)
 | 
			
		||||
  {
 | 
			
		||||
    struct event_log_entry *en = event_log + (event_log_pos + i) % EVENT_LOG_LENGTH;
 | 
			
		||||
    if (en->hook)
 | 
			
		||||
      log(L_DEBUG "  Event 0x%p 0x%p at %8d for %d ms", en->hook, en->data,
 | 
			
		||||
	  (int) ((last_time - en->timestamp) TO_MS), (int) (en->duration TO_MS));
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void
 | 
			
		||||
watchdog_sigalrm(int sig UNUSED)
 | 
			
		||||
{
 | 
			
		||||
  /* Update last_time and duration, but skip latency check */
 | 
			
		||||
  config->latency_limit = 0xffffffff;
 | 
			
		||||
  io_update_time();
 | 
			
		||||
 | 
			
		||||
  /* We want core dump */
 | 
			
		||||
  abort();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static inline void
 | 
			
		||||
watchdog_start1(void)
 | 
			
		||||
{
 | 
			
		||||
  io_update_time();
 | 
			
		||||
 | 
			
		||||
  loop_time = last_time;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static inline void
 | 
			
		||||
watchdog_start(void)
 | 
			
		||||
{
 | 
			
		||||
  io_update_time();
 | 
			
		||||
 | 
			
		||||
  loop_time = last_time;
 | 
			
		||||
  event_log_num = 0;
 | 
			
		||||
 | 
			
		||||
  if (config->watchdog_timeout)
 | 
			
		||||
  {
 | 
			
		||||
    alarm(config->watchdog_timeout);
 | 
			
		||||
    watchdog_active = 1;
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static inline void
 | 
			
		||||
watchdog_stop(void)
 | 
			
		||||
{
 | 
			
		||||
  io_update_time();
 | 
			
		||||
 | 
			
		||||
  if (watchdog_active)
 | 
			
		||||
  {
 | 
			
		||||
    alarm(0);
 | 
			
		||||
    watchdog_active = 0;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  btime duration = last_time - loop_time;
 | 
			
		||||
  if (duration > config->watchdog_warning)
 | 
			
		||||
    log(L_WARN "I/O loop cycle took %d ms for %d events",
 | 
			
		||||
	(int) (duration TO_MS), event_log_num);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 *	Main I/O Loop
 | 
			
		||||
 */
 | 
			
		||||
@@ -1873,6 +2032,7 @@ io_loop(void)
 | 
			
		||||
  sock *s;
 | 
			
		||||
  node *n;
 | 
			
		||||
 | 
			
		||||
  watchdog_start1();
 | 
			
		||||
  sock_recalc_fdsets_p = 1;
 | 
			
		||||
  for(;;)
 | 
			
		||||
    {
 | 
			
		||||
@@ -1887,6 +2047,8 @@ io_loop(void)
 | 
			
		||||
      timo.tv_sec = events ? 0 : MIN(tout - now, 3);
 | 
			
		||||
      timo.tv_usec = 0;
 | 
			
		||||
 | 
			
		||||
      io_close_event();
 | 
			
		||||
 | 
			
		||||
      if (sock_recalc_fdsets_p)
 | 
			
		||||
	{
 | 
			
		||||
	  sock_recalc_fdsets_p = 0;
 | 
			
		||||
@@ -1923,25 +2085,30 @@ io_loop(void)
 | 
			
		||||
 | 
			
		||||
      if (async_config_flag)
 | 
			
		||||
	{
 | 
			
		||||
	  io_log_event(async_config, NULL);
 | 
			
		||||
	  async_config();
 | 
			
		||||
	  async_config_flag = 0;
 | 
			
		||||
	  continue;
 | 
			
		||||
	}
 | 
			
		||||
      if (async_dump_flag)
 | 
			
		||||
	{
 | 
			
		||||
	  io_log_event(async_dump, NULL);
 | 
			
		||||
	  async_dump();
 | 
			
		||||
	  async_dump_flag = 0;
 | 
			
		||||
	  continue;
 | 
			
		||||
	}
 | 
			
		||||
      if (async_shutdown_flag)
 | 
			
		||||
	{
 | 
			
		||||
	  io_log_event(async_shutdown, NULL);
 | 
			
		||||
	  async_shutdown();
 | 
			
		||||
	  async_shutdown_flag = 0;
 | 
			
		||||
	  continue;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
      /* And finally enter select() to find active sockets */
 | 
			
		||||
      watchdog_stop();
 | 
			
		||||
      hi = select(hi+1, &rd, &wr, NULL, &timo);
 | 
			
		||||
      watchdog_start();
 | 
			
		||||
 | 
			
		||||
      if (hi < 0)
 | 
			
		||||
	{
 | 
			
		||||
@@ -1965,6 +2132,7 @@ io_loop(void)
 | 
			
		||||
		do
 | 
			
		||||
		  {
 | 
			
		||||
		    steps--;
 | 
			
		||||
		    io_log_event(s->rx_hook, s->data);
 | 
			
		||||
		    e = sk_read(s);
 | 
			
		||||
		    if (s != current_sock)
 | 
			
		||||
		      goto next;
 | 
			
		||||
@@ -1976,6 +2144,7 @@ io_loop(void)
 | 
			
		||||
		do
 | 
			
		||||
		  {
 | 
			
		||||
		    steps--;
 | 
			
		||||
		    io_log_event(s->tx_hook, s->data);
 | 
			
		||||
		    e = sk_write(s);
 | 
			
		||||
		    if (s != current_sock)
 | 
			
		||||
		      goto next;
 | 
			
		||||
@@ -2003,6 +2172,7 @@ io_loop(void)
 | 
			
		||||
	      if ((s->type < SK_MAGIC) && FD_ISSET(s->fd, &rd) && s->rx_hook)
 | 
			
		||||
		{
 | 
			
		||||
		  count++;
 | 
			
		||||
		  io_log_event(s->rx_hook, s->data);
 | 
			
		||||
		  e = sk_read(s);
 | 
			
		||||
		  if (s != current_sock)
 | 
			
		||||
		      goto next2;
 | 
			
		||||
 
 | 
			
		||||
@@ -168,6 +168,9 @@ sysdep_preconfig(struct config *c)
 | 
			
		||||
{
 | 
			
		||||
  init_list(&c->logfiles);
 | 
			
		||||
 | 
			
		||||
  c->latency_limit = UNIX_DEFAULT_LATENCY_LIMIT;
 | 
			
		||||
  c->watchdog_warning = UNIX_DEFAULT_WATCHDOG_WARNING;
 | 
			
		||||
 | 
			
		||||
#ifdef PATH_IPROUTE_DIR
 | 
			
		||||
  read_iproute_table(PATH_IPROUTE_DIR "/rt_protos", "ipp_", 256);
 | 
			
		||||
  read_iproute_table(PATH_IPROUTE_DIR "/rt_realms", "ipr_", 256);
 | 
			
		||||
@@ -585,6 +588,8 @@ handle_sigterm(int sig UNUSED)
 | 
			
		||||
  async_shutdown_flag = 1;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void watchdog_sigalrm(int sig UNUSED);
 | 
			
		||||
 | 
			
		||||
static void
 | 
			
		||||
signal_init(void)
 | 
			
		||||
{
 | 
			
		||||
@@ -600,6 +605,9 @@ signal_init(void)
 | 
			
		||||
  sa.sa_handler = handle_sigterm;
 | 
			
		||||
  sa.sa_flags = SA_RESTART;
 | 
			
		||||
  sigaction(SIGTERM, &sa, NULL);
 | 
			
		||||
  sa.sa_handler = watchdog_sigalrm;
 | 
			
		||||
  sa.sa_flags = 0;
 | 
			
		||||
  sigaction(SIGALRM, &sa, NULL);
 | 
			
		||||
  signal(SIGPIPE, SIG_IGN);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -27,8 +27,10 @@ void cmd_reconfig_confirm(void);
 | 
			
		||||
void cmd_reconfig_undo(void);
 | 
			
		||||
void cmd_shutdown(void);
 | 
			
		||||
 | 
			
		||||
#define UNIX_DEFAULT_CONFIGURE_TIMEOUT 300
 | 
			
		||||
#define UNIX_DEFAULT_CONFIGURE_TIMEOUT	300
 | 
			
		||||
 | 
			
		||||
#define UNIX_DEFAULT_LATENCY_LIMIT	(1 S_)
 | 
			
		||||
#define UNIX_DEFAULT_WATCHDOG_WARNING	(5 S_)
 | 
			
		||||
 | 
			
		||||
/* io.c */
 | 
			
		||||
 | 
			
		||||
@@ -99,6 +101,7 @@ volatile int async_shutdown_flag;
 | 
			
		||||
 | 
			
		||||
void io_init(void);
 | 
			
		||||
void io_loop(void);
 | 
			
		||||
void io_log_dump(void);
 | 
			
		||||
int sk_open_unix(struct birdsock *s, char *name);
 | 
			
		||||
void *tracked_fopen(struct pool *, char *name, char *mode);
 | 
			
		||||
void test_old_bird(char *path);
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user