implement watchdog to librenms-service (#11353)

* add watchdog to librenms-service to check log file
add Redis timeout to librenms-service

* updated docs

* fixed logfile_watchdog() indentation in service.py

* indentation fix

* code climate patch

* updated default redis timeout if alerting frequency is 0
This commit is contained in:
Hayden
2020-03-31 21:10:45 -07:00
committed by GitHub
parent daf451cef7
commit cdb6a74dc8
4 changed files with 53 additions and 4 deletions

View File

@ -236,13 +236,14 @@ class RedisLock(Lock):
if redis_kwargs.get('sentinel') and redis_kwargs.get('sentinel_service'):
sentinels = [tuple(l.split(':')) for l in redis_kwargs.pop('sentinel').split(',')]
sentinel_service = redis_kwargs.pop('sentinel_service')
kwargs = {k: v for k, v in redis_kwargs.items() if k in ["decode_responses", "password", "db"]}
kwargs = {k: v for k, v in redis_kwargs.items() if k in ["decode_responses", "password", "db", "socket_timeout"]}
self._redis = Sentinel(sentinels, **kwargs).master_for(sentinel_service)
else:
kwargs = {k: v for k, v in redis_kwargs.items() if "sentinel" not in k}
self._redis = redis.Redis(**kwargs)
self._redis.ping()
self._namespace = namespace
info("Created redis lock manager with socket_timeout of {}s".format(redis_kwargs['socket_timeout']))
def __key(self, name):
return "{}:{}".format(self._namespace, name)
@ -297,13 +298,14 @@ class RedisUniqueQueue(object):
if redis_kwargs.get('sentinel') and redis_kwargs.get('sentinel_service'):
sentinels = [tuple(l.split(':')) for l in redis_kwargs.pop('sentinel').split(',')]
sentinel_service = redis_kwargs.pop('sentinel_service')
kwargs = {k: v for k, v in redis_kwargs.items() if k in ["decode_responses", "password", "db"]}
kwargs = {k: v for k, v in redis_kwargs.items() if k in ["decode_responses", "password", "db", "socket_timeout"]}
self._redis = Sentinel(sentinels, **kwargs).master_for(sentinel_service)
else:
kwargs = {k: v for k, v in redis_kwargs.items() if "sentinel" not in k}
self._redis = redis.Redis(**kwargs)
self._redis.ping()
self.key = "{}:{}".format(namespace, name)
info("Created redis queue with socket_timeout of {}s".format(redis_kwargs['socket_timeout']))
# clean up from previous implementations
if self._redis.type(self.key) != 'zset':

View File

@ -167,7 +167,8 @@ class QueueManager:
password=self.config.redis_pass,
unix_socket_path=self.config.redis_socket,
sentinel=self.config.redis_sentinel,
sentinel_service=self.config.redis_sentinel_service)
sentinel_service=self.config.redis_sentinel_service,
socket_timeout=self.config.redis_timeout)
except ImportError:
if self.config.distributed:

View File

@ -10,6 +10,7 @@ import sys
import time
from datetime import timedelta
from datetime import datetime
from logging import debug, info, warning, error, critical, exception
from platform import python_version
from time import sleep
@ -73,6 +74,7 @@ class ServiceConfig:
redis_socket = None
redis_sentinel = None
redis_sentinel_service = None
redis_timeout = 60
db_host = 'localhost'
db_port = 0
@ -81,6 +83,9 @@ class ServiceConfig:
db_pass = ''
db_name = 'librenms'
watchdog_enabled = False
watchdog_logfile = 'logs/librenms.log'
def populate(self):
config = self._get_config_data()
@ -128,6 +133,7 @@ class ServiceConfig:
self.redis_sentinel_service = os.getenv('REDIS_SENTINEL_SERVICE',
config.get('redis_sentinel_service',
ServiceConfig.redis_sentinel_service))
self.redis_timeout = os.getenv('REDIS_TIMEOUT', self.alerting.frequency if self.alerting.frequency != 0 else self.redis_timeout)
self.db_host = os.getenv('DB_HOST', config.get('db_host', ServiceConfig.db_host))
self.db_name = os.getenv('DB_DATABASE', config.get('db_name', ServiceConfig.db_name))
@ -136,6 +142,9 @@ class ServiceConfig:
self.db_socket = os.getenv('DB_SOCKET', config.get('db_socket', ServiceConfig.db_socket))
self.db_user = os.getenv('DB_USERNAME', config.get('db_user', ServiceConfig.db_user))
self.watchdog_enabled = config.get('service_watchdog_enabled', ServiceConfig.watchdog_enabled)
self.watchdog_logfile = config.get('log_file', ServiceConfig.watchdog_logfile)
# set convenient debug variable
self.debug = logging.getLogger().isEnabledFor(logging.DEBUG)
@ -205,6 +214,11 @@ class Service:
self._lm = self.create_lock_manager()
self.daily_timer = LibreNMS.RecurringTimer(self.config.update_frequency, self.run_maintenance, 'maintenance')
self.stats_timer = LibreNMS.RecurringTimer(self.config.poller.frequency, self.log_performance_stats, 'performance')
if self.config.watchdog_enabled:
info("Starting watchdog timer for log file: {}".format(self.config.watchdog_logfile))
self.watchdog_timer = LibreNMS.RecurringTimer(self.config.poller.frequency, self.logfile_watchdog, 'watchdog')
else:
info("Watchdog is disabled.")
self.is_master = False
def attach_signals(self):
@ -239,6 +253,8 @@ class Service:
if self.config.update_enabled:
self.daily_timer.start()
self.stats_timer.start()
if self.config.watchdog_enabled:
self.watchdog_timer.start()
info("LibreNMS Service: {} started!".format(self.config.unique_name))
info("Poller group {}. Using Python {} and {} locks and queues"
@ -371,7 +387,8 @@ class Service:
password=self.config.redis_pass,
unix_socket_path=self.config.redis_socket,
sentinel=self.config.redis_sentinel,
sentinel_service=self.config.redis_sentinel_service)
sentinel_service=self.config.redis_sentinel_service,
socket_timeout=self.config.redis_timeout)
except ImportError:
if self.config.distributed:
critical("ERROR: Redis connection required for distributed polling")
@ -424,6 +441,8 @@ class Service:
self.daily_timer.stop()
self.stats_timer.stop()
if self.config.watchdog_enabled:
self.watchdog_timer.stop()
self._stop_managers_and_wait()
@ -509,3 +528,19 @@ class Service:
)
except pymysql.err.Error:
exception("Unable to log performance statistics - is the database still online?")
def logfile_watchdog(self):
    """Restart the service when the watched log file has gone stale.

    Compares the modification time of ``self.config.watchdog_logfile`` with
    the current time. If the file was last modified more than
    ``self.config.poller.frequency`` seconds ago, ``self.restart()`` is
    called; otherwise the age of the file is logged. If the modification
    time cannot be read, the problem is logged and this check is skipped.
    """
    logfile = self.config.watchdog_logfile
    try:
        # check that logfile has been written to within last poll period
        logfile_mdiff = time.time() - os.path.getmtime(logfile)
    except FileNotFoundError as e:
        error("Log file not found! {}".format(e))
        return
    except OSError as e:
        # e.g. permission denied or a non-directory path component: log it
        # and skip this check instead of letting the exception escape the
        # watchdog callback
        error("Unable to read log file modification time! {}".format(e))
        return

    if logfile_mdiff > self.config.poller.frequency:
        critical("BARK! Log file older than {}s, restarting service!".format(self.config.poller.frequency))
        self.restart()
    else:
        info("Log file updated {}s ago".format(int(logfile_mdiff)))

View File

@ -95,6 +95,7 @@ REDIS_SENTINEL_SERVICE=myservice
REDIS_DB=0
#REDIS_PASSWORD=
#REDIS_TIMEOUT=60
```
## Basic Configuration
@ -154,6 +155,16 @@ You can enable it by setting the following:
$config['service_ping_enabled'] = true;
```
# Watchdog
The watchdog scheduler is disabled by default. You can enable it by setting the following:
```php
$config['service_watchdog_enabled'] = true;
```
The watchdog scheduler checks that the poller log file has been written to within the last poll period. If the log file has not changed in that time, the watchdog restarts the polling service. The poller log file is set by `$config['log_file']` and defaults to `./logs/librenms.log`.
# Cron Scripts
Once the LibreNMS service is installed, the cron scripts used by