mirror of
https://github.com/librenms/librenms.git
synced 2024-10-07 16:52:45 +00:00
Merge pull request #1838 from clinta/poller-service2
poller-service with all locking in Python
This commit is contained in:
@@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
The Poller service is an alternative to polling and discovery cron jobs and provides support for distributed polling without memcache. It is multi-threaded and runs continuously discovering and polling devices with the oldest data attempting to honor the polling frequency configured in `config.php`. This service replaces all the required cron jobs except for `/opt/librenms/daily.sh` and `/opt/librenms/alerts.php`.
|
The Poller service is an alternative to polling and discovery cron jobs and provides support for distributed polling without memcache. It is multi-threaded and runs continuously discovering and polling devices with the oldest data attempting to honor the polling frequency configured in `config.php`. This service replaces all the required cron jobs except for `/opt/librenms/daily.sh` and `/opt/librenms/alerts.php`.
|
||||||
|
|
||||||
Configure the maximum number of threads for the service in `$config['poller_service_workers']`. Configure the minimum desired polling frequency in `$config['poller_service_poll_frequency']` and the minimum desired discovery frequency in `$config['poller_service_discover_frequency']`. The service will not poll or discover devices which have data newer than this this configured age in seconds. Configure how frequently the service will attempt to poll devices which are down in `$config['poller_service_down_retry']`.
|
Configure the maximum number of threads for the service in `$config['poller_service_workers']`. Configure the minimum desired polling frequency in `$config['poller_service_poll_frequency']` and the minimum desired discovery frequency in `$config['poller_service_discover_frequency']`. The service will not poll or discover devices which have data newer than this this configured age in seconds. Configure how frequently the service will attempt to poll devices which are down in `$config['poller_service_down_retry']`. If you have enough pollers that the worker threads run out of work, the service will query looking for devices every `$config['poller_service_retry_query']` seconds.
|
||||||
|
|
||||||
The poller service is designed to gracefully degrade. If not all devices can be polled within the configured frequency, the service will continuously poll devices refreshing as frequently as possible using the configured number of threads.
|
The poller service is designed to gracefully degrade. If not all devices can be polled within the configured frequency, the service will continuously poll devices refreshing as frequently as possible using the configured number of threads.
|
||||||
|
|
||||||
@@ -18,6 +18,8 @@ $config['poller_service_workers'] = 16;
|
|||||||
$config['poller_service_poll_frequency'] = 300;
|
$config['poller_service_poll_frequency'] = 300;
|
||||||
$config['poller_service_discover_frequency'] = 21600;
|
$config['poller_service_discover_frequency'] = 21600;
|
||||||
$config['poller_service_down_retry'] = 60;
|
$config['poller_service_down_retry'] = 60;
|
||||||
|
$config['poller_service_retry_query'] = 1;
|
||||||
|
$config['poller_service_single_connection'] = false;
|
||||||
```
|
```
|
||||||
|
|
||||||
## Distributed Polling
|
## Distributed Polling
|
||||||
@@ -26,6 +28,9 @@ Distributed polling is possible, and uses the same configuration options as are
|
|||||||
## Multi-Master MySQL considerations
|
## Multi-Master MySQL considerations
|
||||||
Because locks are not replicated in Multi-Master MySQL configurations, if you are using such a configuration, you will need to make sure that all pollers are using the same MySQL server.
|
Because locks are not replicated in Multi-Master MySQL configurations, if you are using such a configuration, you will need to make sure that all pollers are using the same MySQL server.
|
||||||
|
|
||||||
|
## Single Connection
|
||||||
|
If you are running MariaDB 10.2 or newer, you can tell poller-service to use a single mysql connectino for managing locks by setting `$config['poller_service_single_connection']` to `true`. *DO NOT* configure this for any version of MariaDB less than 10.2 or any version of MySQL.
|
||||||
|
|
||||||
## Service Installation
|
## Service Installation
|
||||||
An upstart configuration `poller-service.conf` is provided. To install run `ln -s /opt/librenms/poller-service.conf /etc/init/poller-service.conf`. The service will start on boot and can be started manually by running `start poller-service`. If you recieve an error that the service does not exist, run `initctl reload-configuration`. The service is configured to run as the user `librenms` and will fail if that user does not exist.
|
An upstart configuration `poller-service.conf` is provided. To install run `ln -s /opt/librenms/poller-service.conf /etc/init/poller-service.conf`. The service will start on boot and can be started manually by running `start poller-service`. If you recieve an error that the service does not exist, run `initctl reload-configuration`. The service is configured to run as the user `librenms` and will fail if that user does not exist.
|
||||||
|
|
||||||
|
@@ -33,6 +33,7 @@ import MySQLdb
|
|||||||
import logging
|
import logging
|
||||||
import logging.handlers
|
import logging.handlers
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
from collections import namedtuple
|
||||||
|
|
||||||
log = logging.getLogger('poller-service')
|
log = logging.getLogger('poller-service')
|
||||||
log.setLevel(logging.DEBUG)
|
log.setLevel(logging.DEBUG)
|
||||||
@@ -48,6 +49,54 @@ config_file = install_dir + '/config.php'
|
|||||||
log.info('INFO: Starting poller-service')
|
log.info('INFO: Starting poller-service')
|
||||||
|
|
||||||
|
|
||||||
|
class DB:
|
||||||
|
conn = None
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.in_use = threading.Lock()
|
||||||
|
self.connect()
|
||||||
|
|
||||||
|
def connect(self):
|
||||||
|
self.in_use.acquire(True)
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
self.conn.close()
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
if db_port == 0:
|
||||||
|
self.conn = MySQLdb.connect(host=db_server, user=db_username, passwd=db_password, db=db_dbname)
|
||||||
|
else:
|
||||||
|
self.conn = MySQLdb.connect(host=db_server, port=db_port, user=db_username, passwd=db_password, db=db_dbname)
|
||||||
|
break
|
||||||
|
except (AttributeError, MySQLdb.OperationalError):
|
||||||
|
log.warning('WARNING: MySQL Error, reconnecting.')
|
||||||
|
time.sleep(.5)
|
||||||
|
|
||||||
|
self.conn.autocommit(True)
|
||||||
|
self.conn.ping(True)
|
||||||
|
self.in_use.release()
|
||||||
|
|
||||||
|
def query(self, sql):
|
||||||
|
self.in_use.acquire(True)
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
cursor = self.conn.cursor()
|
||||||
|
cursor.execute(sql)
|
||||||
|
ret = cursor.fetchall()
|
||||||
|
cursor.close()
|
||||||
|
self.in_use.release()
|
||||||
|
return ret
|
||||||
|
except (AttributeError, MySQLdb.OperationalError):
|
||||||
|
log.warning('WARNING: MySQL Operational Error during query, reconnecting.')
|
||||||
|
self.in_use.release()
|
||||||
|
self.connect()
|
||||||
|
except (AttributeError, MySQLdb.ProgrammingError):
|
||||||
|
log.warning('WARNING: MySQL Programming Error during query, attempting query again.')
|
||||||
|
cursor.close()
|
||||||
|
|
||||||
|
|
||||||
def get_config_data():
|
def get_config_data():
|
||||||
config_cmd = ['/usr/bin/env', 'php', '%s/config_to_json.php' % install_dir]
|
config_cmd = ['/usr/bin/env', 'php', '%s/config_to_json.php' % install_dir]
|
||||||
try:
|
try:
|
||||||
@@ -130,55 +179,34 @@ except KeyError:
|
|||||||
down_retry = 60
|
down_retry = 60
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if db_port == 0:
|
retry_query = int(config['poller_service_retry_query'])
|
||||||
db = MySQLdb.connect(host=db_server, user=db_username, passwd=db_password, db=db_dbname)
|
if retry_query == 0:
|
||||||
else:
|
retry_query = 1
|
||||||
db = MySQLdb.connect(host=db_server, port=db_port, user=db_username, passwd=db_password, db=db_dbname)
|
except KeyError:
|
||||||
db.autocommit(True)
|
retry_query = 1
|
||||||
cursor = db.cursor()
|
|
||||||
except:
|
try:
|
||||||
log.critical("ERROR: Could not connect to MySQL database!")
|
single_connection = bool(config['poller_service_single_connection'])
|
||||||
sys.exit(2)
|
except KeyError:
|
||||||
|
single_connection = False
|
||||||
|
|
||||||
|
db = DB()
|
||||||
|
|
||||||
|
|
||||||
def poll_worker(device_id, action):
|
def lockFree(lock, db=db):
|
||||||
try:
|
|
||||||
start_time = time.time()
|
|
||||||
path = poller_path
|
|
||||||
if action == 'discovery':
|
|
||||||
path = discover_path
|
|
||||||
command = "/usr/bin/env php %s -h %s >> /dev/null 2>&1" % (path, device_id)
|
|
||||||
subprocess.check_call(command, shell=True)
|
|
||||||
elapsed_time = int(time.time() - start_time)
|
|
||||||
if elapsed_time < 300:
|
|
||||||
log.debug("DEBUG: worker finished %s of device %s in %s seconds" % (action, device_id, elapsed_time))
|
|
||||||
else:
|
|
||||||
log.warning("WARNING: worker finished %s of device %s in %s seconds" % (action, device_id, elapsed_time))
|
|
||||||
except (KeyboardInterrupt, SystemExit):
|
|
||||||
raise
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def lockFree(lock):
|
|
||||||
global cursor
|
|
||||||
query = "SELECT IS_FREE_LOCK('{0}')".format(lock)
|
query = "SELECT IS_FREE_LOCK('{0}')".format(lock)
|
||||||
cursor.execute(query)
|
return db.query(query)[0][0] == 1
|
||||||
return cursor.fetchall()[0][0] == 1
|
|
||||||
|
|
||||||
|
|
||||||
def getLock(lock):
|
def getLock(lock, db=db):
|
||||||
global cursor
|
|
||||||
query = "SELECT GET_LOCK('{0}', 0)".format(lock)
|
query = "SELECT GET_LOCK('{0}', 0)".format(lock)
|
||||||
cursor.execute(query)
|
return db.query(query)[0][0] == 1
|
||||||
return cursor.fetchall()[0][0] == 1
|
|
||||||
|
|
||||||
|
|
||||||
def releaseLock(lock):
|
def releaseLock(lock, db=db):
|
||||||
global cursor
|
|
||||||
query = "SELECT RELEASE_LOCK('{0}')".format(lock)
|
query = "SELECT RELEASE_LOCK('{0}')".format(lock)
|
||||||
cursor.execute(query)
|
cursor = db.query(query)
|
||||||
return cursor.fetchall()[0][0] == 1
|
return db.query(query)[0][0] == 1
|
||||||
|
|
||||||
|
|
||||||
def sleep_until(timestamp):
|
def sleep_until(timestamp):
|
||||||
@@ -202,7 +230,7 @@ dev_query = ('SELECT device_id, status,
|
|||||||
' INTERVAL last_polled_timetaken SECOND '
|
' INTERVAL last_polled_timetaken SECOND '
|
||||||
' ), '
|
' ), '
|
||||||
' INTERVAL {0} SECOND) '
|
' INTERVAL {0} SECOND) '
|
||||||
' AS DATETIME(0) '
|
' AS DATETIME '
|
||||||
') AS next_poll, '
|
') AS next_poll, '
|
||||||
'CAST( '
|
'CAST( '
|
||||||
' DATE_ADD( '
|
' DATE_ADD( '
|
||||||
@@ -211,96 +239,122 @@ dev_query = ('SELECT device_id, status,
|
|||||||
' INTERVAL last_discovered_timetaken SECOND '
|
' INTERVAL last_discovered_timetaken SECOND '
|
||||||
' ), '
|
' ), '
|
||||||
' INTERVAL {1} SECOND) '
|
' INTERVAL {1} SECOND) '
|
||||||
' AS DATETIME(0) '
|
' AS DATETIME '
|
||||||
') as next_discovery '
|
') as next_discovery '
|
||||||
'FROM devices WHERE '
|
'FROM devices WHERE '
|
||||||
'disabled = 0 '
|
'disabled = 0 '
|
||||||
'AND IS_FREE_LOCK(CONCAT("polling.", device_id)) '
|
'AND IS_FREE_LOCK(CONCAT("poll.", device_id)) '
|
||||||
'AND IS_FREE_LOCK(CONCAT("queued.", device_id)) '
|
'AND IS_FREE_LOCK(CONCAT("discovery.", device_id)) '
|
||||||
|
'AND IS_FREE_LOCK(CONCAT("queue.", device_id)) '
|
||||||
'AND ( last_poll_attempted < DATE_SUB(NOW(), INTERVAL {2} SECOND ) '
|
'AND ( last_poll_attempted < DATE_SUB(NOW(), INTERVAL {2} SECOND ) '
|
||||||
' OR last_poll_attempted IS NULL ) '
|
' OR last_poll_attempted IS NULL ) '
|
||||||
'{3} '
|
'{3} '
|
||||||
'ORDER BY next_poll asc '
|
'ORDER BY next_poll asc '
|
||||||
'LIMIT 5 ').format(poll_frequency,
|
'LIMIT 1 ').format(poll_frequency,
|
||||||
discover_frequency,
|
discover_frequency,
|
||||||
down_retry,
|
down_retry,
|
||||||
poller_group)
|
poller_group)
|
||||||
|
|
||||||
threads = 0
|
|
||||||
next_update = datetime.now() + timedelta(minutes=1)
|
next_update = datetime.now() + timedelta(minutes=1)
|
||||||
devices_scanned = 0
|
devices_scanned = 0
|
||||||
|
|
||||||
while True:
|
dont_query_until = datetime.fromtimestamp(0)
|
||||||
cur_threads = threading.active_count()
|
|
||||||
if cur_threads != threads:
|
|
||||||
threads = cur_threads
|
|
||||||
log.debug('DEBUG: {0} threads currently active'.format(threads))
|
|
||||||
|
|
||||||
if next_update < datetime.now():
|
def poll_worker():
|
||||||
seconds_taken = (datetime.now() - (next_update - timedelta(minutes=1))).seconds
|
global dev_query
|
||||||
update_query = ('INSERT INTO pollers(poller_name, '
|
global devices_scanned
|
||||||
' last_polled, '
|
global dont_query_until
|
||||||
' devices, '
|
global single_connection
|
||||||
' time_taken) '
|
thread_id = threading.current_thread().name
|
||||||
' values("{0}", NOW(), "{1}", "{2}") '
|
|
||||||
'ON DUPLICATE KEY UPDATE '
|
|
||||||
' last_polled=values(last_polled), '
|
|
||||||
' devices=values(devices), '
|
|
||||||
' time_taken=values(time_taken) ').format(config['distributed_poller_name'].strip(),
|
|
||||||
devices_scanned,
|
|
||||||
seconds_taken)
|
|
||||||
try:
|
|
||||||
cursor.execute(update_query)
|
|
||||||
except:
|
|
||||||
log.critical('ERROR: MySQL query error. Is your schema up to date?')
|
|
||||||
sys.exit(2)
|
|
||||||
cursor.fetchall()
|
|
||||||
log.info('INFO: {0} devices scanned in the last minute'.format(devices_scanned))
|
|
||||||
devices_scanned = 0
|
|
||||||
next_update = datetime.now() + timedelta(minutes=1)
|
|
||||||
|
|
||||||
while threading.active_count() >= amount_of_workers or not lockFree('schema_update'):
|
if single_connection:
|
||||||
time.sleep(.5)
|
global db
|
||||||
|
else:
|
||||||
|
db = DB()
|
||||||
|
|
||||||
try:
|
while True:
|
||||||
cursor.execute(dev_query)
|
if datetime.now() < dont_query_until:
|
||||||
except:
|
time.sleep(1)
|
||||||
log.critical('ERROR: MySQL query error. Is your schema up to date?')
|
|
||||||
sys.exit(2)
|
|
||||||
|
|
||||||
devices = cursor.fetchall()
|
|
||||||
for device_id, status, next_poll, next_discovery in devices:
|
|
||||||
# add queue lock, so we lock the next device against any other pollers
|
|
||||||
# if this fails, the device is locked by another poller already
|
|
||||||
if not getLock('queued.{0}'.format(device_id)):
|
|
||||||
continue
|
continue
|
||||||
if not lockFree('polling.{0}'.format(device_id)):
|
|
||||||
releaseLock('queued.{0}'.format(device_id))
|
dev_row = db.query(dev_query)
|
||||||
|
if len(dev_row) < 1:
|
||||||
|
dont_query_until = datetime.now() + timedelta(seconds=retry_query)
|
||||||
|
time.sleep(1)
|
||||||
|
continue
|
||||||
|
|
||||||
|
device_id, status, next_poll, next_discovery = dev_row[0]
|
||||||
|
|
||||||
|
if not getLock('queue.{0}'.format(device_id), db):
|
||||||
|
releaseLock('queue.{0}'.format(device_id), db)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if next_poll and next_poll > datetime.now():
|
if next_poll and next_poll > datetime.now():
|
||||||
log.debug('DEBUG: Sleeping until {0} before polling {1}'.format(next_poll, device_id))
|
log.debug('DEBUG: Thread {0} Sleeping until {1} before polling {2}'.format(thread_id, next_poll, device_id))
|
||||||
sleep_until(next_poll)
|
sleep_until(next_poll)
|
||||||
|
|
||||||
action = 'poll'
|
action = 'poll'
|
||||||
if (not next_discovery or next_discovery < datetime.now()) and status == 1:
|
if (not next_discovery or next_discovery < datetime.now()) and status == 1:
|
||||||
action = 'discovery'
|
action = 'discovery'
|
||||||
|
|
||||||
log.debug('DEBUG: Starting {0} of device {1}'.format(action, device_id))
|
log.debug('DEBUG: Thread {0} Starting {1} of device {2}'.format(thread_id, action, device_id))
|
||||||
devices_scanned += 1
|
devices_scanned += 1
|
||||||
|
|
||||||
cursor.execute('UPDATE devices SET last_poll_attempted = NOW() WHERE device_id = {0}'.format(device_id))
|
db.query('UPDATE devices SET last_poll_attempted = NOW() WHERE device_id = {0}'.format(device_id))
|
||||||
cursor.fetchall()
|
|
||||||
|
|
||||||
t = threading.Thread(target=poll_worker, args=[device_id, action])
|
if not getLock('{0}.{1}'.format(action, device_id), db):
|
||||||
t.start()
|
releaseLock('{0}.{1}'.format(action, device_id), db)
|
||||||
|
releaseLock('queue.{0}'.format(device_id), db)
|
||||||
|
continue
|
||||||
|
|
||||||
# If we made it this far, break out of the loop and query again.
|
releaseLock('queue.{0}'.format(device_id), db)
|
||||||
break
|
try:
|
||||||
|
start_time = time.time()
|
||||||
|
path = poller_path
|
||||||
|
if action == 'discovery':
|
||||||
|
path = discover_path
|
||||||
|
command = "/usr/bin/env php %s -h %s >> /dev/null 2>&1" % (path, device_id)
|
||||||
|
subprocess.check_call(command, shell=True)
|
||||||
|
elapsed_time = int(time.time() - start_time)
|
||||||
|
if elapsed_time < 300:
|
||||||
|
log.debug("DEBUG: Thread {0} finished {1} of device {2} in {3} seconds".format(thread_id, action, device_id, elapsed_time))
|
||||||
|
else:
|
||||||
|
log.warning("WARNING: Thread {0} finished {1} of device {2} in {3} seconds".format(thread_id, action, device_id, elapsed_time))
|
||||||
|
except (KeyboardInterrupt, SystemExit):
|
||||||
|
raise
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
finally:
|
||||||
|
releaseLock('{0}.{1}'.format(action, device_id), db)
|
||||||
|
|
||||||
|
|
||||||
# This point is only reached if the query is empty, so sleep half a second before querying again.
|
for i in range(0, amount_of_workers):
|
||||||
time.sleep(.5)
|
t = threading.Thread(target=poll_worker)
|
||||||
|
t.name = i
|
||||||
|
t.daemon = True
|
||||||
|
t.start()
|
||||||
|
|
||||||
# Make sure we're not holding any device queue locks in this connection before querying again
|
|
||||||
# by locking a different string.
|
while True:
|
||||||
getLock('unlock.{0}'.format(config['distributed_poller_name']))
|
sleep_until(next_update)
|
||||||
|
|
||||||
|
seconds_taken = (datetime.now() - (next_update - timedelta(minutes=1))).seconds
|
||||||
|
update_query = ('INSERT INTO pollers(poller_name, '
|
||||||
|
' last_polled, '
|
||||||
|
' devices, '
|
||||||
|
' time_taken) '
|
||||||
|
' values("{0}", NOW(), "{1}", "{2}") '
|
||||||
|
'ON DUPLICATE KEY UPDATE '
|
||||||
|
' last_polled=values(last_polled), '
|
||||||
|
' devices=values(devices), '
|
||||||
|
' time_taken=values(time_taken) ').format(config['distributed_poller_name'].strip(),
|
||||||
|
devices_scanned,
|
||||||
|
seconds_taken)
|
||||||
|
try:
|
||||||
|
db.query(update_query)
|
||||||
|
except:
|
||||||
|
log.critical('ERROR: MySQL query error. Is your schema up to date?')
|
||||||
|
sys.exit(2)
|
||||||
|
log.info('INFO: {0} devices scanned in the last minute'.format(devices_scanned))
|
||||||
|
devices_scanned = 0
|
||||||
|
next_update = datetime.now() + timedelta(minutes=1)
|
||||||
|
Reference in New Issue
Block a user