mirror of
https://github.com/librenms/librenms.git
synced 2024-10-07 16:52:45 +00:00
Adds Distributed Polling via Memcached.
This commit is contained in:
25
doc/Extensions/Distributed-Poller.md
Normal file
25
doc/Extensions/Distributed-Poller.md
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
# Distributed Poller
|
||||||
|
LibreNMS has the ability to distribute polling of devices to other machines.
|
||||||
|
|
||||||
|
These machines can be in a different physical location and therefore minimize network latencies for colocations.
|
||||||
|
|
||||||
|
Devices can also be grouped together into a `poller_group` to pin these devices to a single poller or to a group of designated pollers.
|
||||||
|
|
||||||
|
All pollers need to share their RRD-folder, for example via NFS or a combination of NFS and rrdcached.
|
||||||
|
It is also required that all pollers can access the central memcached to communicate with each other.
|
||||||
|
|
||||||
|
In order to enable distributed polling, set `$config['distributed_poller'] = true` and put your memcached details into `$config['distributed_poller_memcached_host']` and `$config['distributed_poller_memcached_port']`.
|
||||||
|
By default, all hosts are shared and have the `poller_group = 0`. To pin a device to a poller, set it to a value greater than 0 and set the same value in the poller's config with `$config['distributed_poller_group']`.
|
||||||
|
Usually the poller's name is equal to the machine's hostname; if you want to change it, set `$config['distributed_poller_name']`.
|
||||||
|
|
||||||
|
__Note__: Even though you pin devices to pollers, these pollers will still poll devices with `poller_group = 0`. If you do not want this, consistently define groups for all your devices!
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
```php
|
||||||
|
// Distributed Poller-Settings
|
||||||
|
$config['distributed_poller'] = false;
|
||||||
|
$config['distributed_poller_name'] = file_get_contents('/proc/sys/kernel/hostname');
|
||||||
|
$config['distributed_poller_group'] = 0;
|
||||||
|
$config['distributed_poller_memcached_host'] = 'example.net';
|
||||||
|
$config['distributed_poller_memcached_port'] = '11211';
|
||||||
|
```
|
||||||
@@ -589,4 +589,11 @@ $config['enable_clear_discovery'] = 1;// Set this to 0 if
|
|||||||
$config['enable_footer'] = 1;// Set this to 0 if you want to disable the footer copyright in the web interface
|
$config['enable_footer'] = 1;// Set this to 0 if you want to disable the footer copyright in the web interface
|
||||||
$config['api_demo'] = 0;// Set this to 1 if you want to disable some untrusting features for the API
|
$config['api_demo'] = 0;// Set this to 1 if you want to disable some untrusting features for the API
|
||||||
|
|
||||||
|
// Distributed Poller-Settings
|
||||||
|
$config['distributed_poller'] = false;
|
||||||
|
$config['distributed_poller_name'] = file_get_contents('/proc/sys/kernel/hostname');
|
||||||
|
$config['distributed_poller_group'] = 0;
|
||||||
|
$config['distributed_poller_memcached_host'] = 'example.net';
|
||||||
|
$config['distributed_poller_memcached_port'] = '11211';
|
||||||
|
|
||||||
?>
|
?>
|
||||||
|
|||||||
@@ -74,9 +74,66 @@ db_password = config['db_pass']
|
|||||||
db_server = config['db_host']
|
db_server = config['db_host']
|
||||||
db_dbname = config['db_name']
|
db_dbname = config['db_name']
|
||||||
|
|
||||||
|
# (c) 2015, GPLv3, Daniel Preussker <f0o@devilcode.org> <<<EOC1
|
||||||
|
if 'distributed_poller_group' in config:
|
||||||
|
poller_group = str(config['distributed_poller_group'])
|
||||||
|
else:
|
||||||
|
poller_group = False
|
||||||
|
|
||||||
|
def memc_alive():
    """Report whether the shared memcached instance is reachable and usable.

    Writes a throw-away ``poller.ping.<uuid>`` key with a 60 second expiry,
    reads it back, and deletes it again when the round trip succeeded.

    Returns:
        True if the value read back equals the value written; False if it
        differs or if anything goes wrong along the way. That includes the
        module-level ``memc`` / ``uuid`` names not existing, which is the case
        when the distributed poller set-up block did not run.
    """
    try:
        token = str(uuid.uuid4())
        ping_key = 'poller.ping.' + token
        memc.set(ping_key, token, 60)
        if memc.get(ping_key) != token:
            return False
        memc.delete(ping_key)
        return True
    except Exception:
        # Best-effort probe: any failure simply means "not alive".
        # Catching Exception (rather than a bare except) lets
        # KeyboardInterrupt / SystemExit propagate, matching the explicit
        # re-raise used by the poll worker in this script.
        return False
|
||||||
|
|
||||||
|
def memc_touch(key, time):
    """Extend the expiry of an existing memcached key (best-effort).

    Args:
        key: memcached key whose expiry should be refreshed.
        time: new expiry, handed unchanged to the memcached client.

    Returns:
        None. Errors are deliberately swallowed; callers in this script use
        this purely as a keep-alive and check liveness separately.
    """
    try:
        # Prefer the client's atomic touch when it offers one. A separate
        # get followed by set can overwrite an incr/decr done by another
        # poller in between (e.g. on the shared 'poller.nodes' counter).
        touch = getattr(memc, 'touch', None)
        if touch is not None and touch(key, time):
            return
        # Fallback for clients/servers without touch support.
        val = memc.get(key)
        if val is not None:
            # Only refresh keys that still exist; never re-create an expired
            # key holding None.
            memc.set(key, val, time)
    except Exception:
        # Keep-alive only: a failed refresh must not abort polling, but
        # KeyboardInterrupt / SystemExit are allowed to propagate.
        pass
|
||||||
|
|
||||||
|
if 'distributed_poller' in config and 'distributed_poller_memcached_host' in config and 'distributed_poller_memcached_port' in config and config['distributed_poller'] == True:
|
||||||
|
try:
|
||||||
|
import memcache, uuid
|
||||||
|
memc = memcache.Client([config['distributed_poller_memcached_host']+':'+str(config['distributed_poller_memcached_port'])])
|
||||||
|
if memc_alive() == True:
|
||||||
|
if memc.get("poller.master") == None:
|
||||||
|
print "Registered as Master"
|
||||||
|
memc.set("poller.master",config['distributed_poller_name'],10)
|
||||||
|
memc.set("poller.nodes",0,300)
|
||||||
|
IsNode = False
|
||||||
|
else:
|
||||||
|
print "Registered as Node joining Master %s" % memc.get("poller.master")
|
||||||
|
IsNode = True
|
||||||
|
memc.incr("poller.nodes")
|
||||||
|
distpoll = True
|
||||||
|
else:
|
||||||
|
print "Could not connect to memcached, disabling distributed poller."
|
||||||
|
distpoll = False
|
||||||
|
IsNode = False
|
||||||
|
except:
|
||||||
|
print "ERROR: missing memcache python module:"
|
||||||
|
print "On deb systems: apt-get install python-memcache"
|
||||||
|
print "On other systems: easy_install python-memcached"
|
||||||
|
print "Disabling distributed poller."
|
||||||
|
distpoll = False
|
||||||
|
else:
|
||||||
|
distpoll = False
|
||||||
|
# EOC1
|
||||||
|
|
||||||
s_time = time.time()
|
s_time = time.time()
|
||||||
real_duration = 0
|
real_duration = 0
|
||||||
per_device_duration = {}
|
per_device_duration = {}
|
||||||
|
polled_devices = 0
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Take the amount of threads we want to run in parallel from the commandline
|
Take the amount of threads we want to run in parallel from the commandline
|
||||||
@@ -105,13 +162,25 @@ except:
|
|||||||
thus greatening our chances of completing _all_ the work in exactly the time it takes to
|
thus greatening our chances of completing _all_ the work in exactly the time it takes to
|
||||||
poll the slowest device! cool stuff he
|
poll the slowest device! cool stuff he
|
||||||
"""
|
"""
|
||||||
query = "select device_id from devices where disabled = 0 order by last_polled_timetaken desc"
|
# (c) 2015, GPLv3, Daniel Preussker <f0o@devilcode.org> <<<EOC2
|
||||||
|
if poller_group is not False:
|
||||||
|
query = "select device_id from devices where (poller_group = " + poller_group + " or poller_group = 0) and disabled = 0 order by last_polled_timetaken desc"
|
||||||
|
else:
|
||||||
|
query = "select device_id from devices where disabled = 0 order by last_polled_timetaken desc"
|
||||||
|
# EOC2
|
||||||
|
|
||||||
cursor.execute(query)
|
cursor.execute(query)
|
||||||
devices = cursor.fetchall()
|
devices = cursor.fetchall()
|
||||||
for row in devices:
|
for row in devices:
|
||||||
devices_list.append(int(row[0]))
|
devices_list.append(int(row[0]))
|
||||||
db.close()
|
# (c) 2015, GPLv3, Daniel Preussker <f0o@devilcode.org> <<<EOC3
|
||||||
|
if distpoll is True and IsNode is False:
|
||||||
|
query = "select max(device_id),min(device_id) from devices"
|
||||||
|
cursor.execute(query)
|
||||||
|
devices = cursor.fetchall()
|
||||||
|
maxlocks = devices[0][0]
|
||||||
|
minlocks = devices[0][1]
|
||||||
|
# EOC3
|
||||||
|
|
||||||
"""
|
"""
|
||||||
A seperate queue and a single worker for printing information to the screen prevents
|
A seperate queue and a single worker for printing information to the screen prevents
|
||||||
@@ -122,12 +191,42 @@ db.close()
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def printworker():
|
def printworker():
|
||||||
|
nodeso = 0
|
||||||
while True:
|
while True:
|
||||||
worker_id, device_id, elapsed_time = print_queue.get()
|
# (c) 2015, GPLv3, Daniel Preussker <f0o@devilcode.org> <<<EOC4
|
||||||
|
global IsNode
|
||||||
|
global distpoll
|
||||||
|
if distpoll is True:
|
||||||
|
if IsNode is False:
|
||||||
|
memc_touch('poller.master',10)
|
||||||
|
nodes = memc.get('poller.nodes')
|
||||||
|
if nodes is None and memc_alive() == False:
|
||||||
|
print "WARNING: Lost Memcached. Taking over all devices. Nodes will quit shortly."
|
||||||
|
distpoll = False
|
||||||
|
nodes = nodeso
|
||||||
|
if nodes is not nodeso:
|
||||||
|
print "INFO: %s Node(s) Total" % (nodes)
|
||||||
|
nodeso = nodes
|
||||||
|
else:
|
||||||
|
memc_touch('poller.nodes',10)
|
||||||
|
try:
|
||||||
|
worker_id, device_id, elapsed_time = print_queue.get(False)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
time.sleep(1)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
worker_id, device_id, elapsed_time = print_queue.get()
|
||||||
|
# EOC4
|
||||||
global real_duration
|
global real_duration
|
||||||
global per_device_duration
|
global per_device_duration
|
||||||
|
global polled_devices
|
||||||
real_duration += elapsed_time
|
real_duration += elapsed_time
|
||||||
per_device_duration[device_id] = elapsed_time
|
per_device_duration[device_id] = elapsed_time
|
||||||
|
polled_devices += 1
|
||||||
if elapsed_time < 300:
|
if elapsed_time < 300:
|
||||||
print "INFO: worker %s finished device %s in %s seconds" % (worker_id, device_id, elapsed_time)
|
print "INFO: worker %s finished device %s in %s seconds" % (worker_id, device_id, elapsed_time)
|
||||||
else:
|
else:
|
||||||
@@ -142,16 +241,29 @@ def printworker():
|
|||||||
def poll_worker():
|
def poll_worker():
|
||||||
while True:
|
while True:
|
||||||
device_id = poll_queue.get()
|
device_id = poll_queue.get()
|
||||||
try:
|
# (c) 2015, GPLv3, Daniel Preussker <f0o@devilcode.org> <<<EOC5
|
||||||
start_time = time.time()
|
if distpoll == False or memc.get('poller.device.'+str(device_id)) == None:
|
||||||
command = "/usr/bin/env php %s -h %s >> /dev/null 2>&1" % (poller_path, device_id)
|
if distpoll == True:
|
||||||
subprocess.check_call(command, shell=True)
|
result = memc.add('poller.device.'+str(device_id),config['distributed_poller_name'],300)
|
||||||
elapsed_time = int(time.time() - start_time)
|
if result == False:
|
||||||
print_queue.put([threading.current_thread().name, device_id, elapsed_time])
|
print "This device (%s) appears to be being polled by another poller" % (device_id)
|
||||||
except (KeyboardInterrupt, SystemExit):
|
poll_queue.task_done()
|
||||||
raise
|
continue
|
||||||
except:
|
if memc_alive() == False and IsNode is True:
|
||||||
pass
|
print "Lost Memcached, Not polling Device %s as Node. Master will poll it." % device_id
|
||||||
|
poll_queue.task_done()
|
||||||
|
continue
|
||||||
|
# EOC5
|
||||||
|
try:
|
||||||
|
start_time = time.time()
|
||||||
|
command = "/usr/bin/env php %s -h %s >> /dev/null 2>&1" % (poller_path, device_id)
|
||||||
|
subprocess.check_call(command, shell=True)
|
||||||
|
elapsed_time = int(time.time() - start_time)
|
||||||
|
print_queue.put([threading.current_thread().name, device_id, elapsed_time])
|
||||||
|
except (KeyboardInterrupt, SystemExit):
|
||||||
|
raise
|
||||||
|
except:
|
||||||
|
pass
|
||||||
poll_queue.task_done()
|
poll_queue.task_done()
|
||||||
|
|
||||||
poll_queue = Queue.Queue()
|
poll_queue = Queue.Queue()
|
||||||
@@ -179,10 +291,47 @@ except (KeyboardInterrupt, SystemExit):
|
|||||||
|
|
||||||
total_time = int(time.time() - s_time)
|
total_time = int(time.time() - s_time)
|
||||||
|
|
||||||
print "INFO: poller-wrapper polled %s devices in %s seconds with %s workers" % (len(devices_list), total_time, amount_of_workers)
|
print "INFO: poller-wrapper polled %s devices in %s seconds with %s workers" % (polled_devices, total_time, amount_of_workers)
|
||||||
|
|
||||||
|
# (c) 2015, GPLv3, Daniel Preussker <f0o@devilcode.org> <<<EOC6
|
||||||
|
if distpoll == True or memc_alive() is True:
|
||||||
|
master = memc.get("poller.master")
|
||||||
|
if master == config['distributed_poller_name'] and IsNode == False:
|
||||||
|
print "Wait for all poller-nodes to finish"
|
||||||
|
nodes = memc.get("poller.nodes")
|
||||||
|
while nodes > 0 and nodes is not None:
|
||||||
|
try:
|
||||||
|
time.sleep(1)
|
||||||
|
nodes = memc.get("poller.nodes")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
print "Clearing Locks"
|
||||||
|
x = minlocks
|
||||||
|
while x <= maxlocks:
|
||||||
|
memc.delete('poller.device.'+str(x))
|
||||||
|
x = x+1
|
||||||
|
print "%s Locks Cleared" % x
|
||||||
|
print "Clearing Nodes"
|
||||||
|
memc.delete("poller.master")
|
||||||
|
memc.delete("poller.nodes")
|
||||||
|
else:
|
||||||
|
memc.decr("poller.nodes")
|
||||||
|
print "Finished %s." % time.time()
|
||||||
|
# EOC6
|
||||||
|
|
||||||
show_stopper = False
|
show_stopper = False
|
||||||
|
|
||||||
|
query = "update pollers set last_polled=NOW(), devices='%d', time_taken='%d' where poller_name='%s'" % (polled_devices, total_time, config['distributed_poller_name'])
|
||||||
|
response = cursor.execute(query)
|
||||||
|
if response == 1:
|
||||||
|
db.commit()
|
||||||
|
else:
|
||||||
|
query = "insert into pollers set poller_name='%s', last_polled=NOW(), devices='%d', time_taken='%d'" % (config['distributed_poller_name'],polled_devices, total_time)
|
||||||
|
cursor.execute(query)
|
||||||
|
db.commit()
|
||||||
|
db.close()
|
||||||
|
|
||||||
|
|
||||||
if total_time > 300:
|
if total_time > 300:
|
||||||
recommend = int(total_time / 300.0 * amount_of_workers + 1)
|
recommend = int(total_time / 300.0 * amount_of_workers + 1)
|
||||||
print "WARNING: the process took more than 5 minutes to finish, you need faster hardware or more threads"
|
print "WARNING: the process took more than 5 minutes to finish, you need faster hardware or more threads"
|
||||||
@@ -195,4 +344,5 @@ if total_time > 300:
|
|||||||
print "ERROR: Some devices are taking more than 300 seconds, the script cannot recommend you what to do."
|
print "ERROR: Some devices are taking more than 300 seconds, the script cannot recommend you what to do."
|
||||||
if show_stopper == False:
|
if show_stopper == False:
|
||||||
print "WARNING: Consider setting a minimum of %d threads. (This does not constitute professional advice!)" % recommend
|
print "WARNING: Consider setting a minimum of %d threads. (This does not constitute professional advice!)" % recommend
|
||||||
|
|
||||||
sys.exit(2)
|
sys.exit(2)
|
||||||
|
|||||||
@@ -118,7 +118,7 @@ $poller_end = utime(); $poller_run = $poller_end - $poller_start; $poller_time =
|
|||||||
|
|
||||||
if ($polled_devices)
|
if ($polled_devices)
|
||||||
{
|
{
|
||||||
dbInsert(array('type' => 'poll', 'doing' => $doing, 'start' => $poller_start, 'duration' => $poller_time, 'devices' => $polled_devices ), 'perf_times');
|
dbInsert(array('type' => 'poll', 'doing' => $doing, 'start' => $poller_start, 'duration' => $poller_time, 'devices' => $polled_devices, 'poller' => $config['distributed_poller_name'] ), 'perf_times');
|
||||||
}
|
}
|
||||||
|
|
||||||
$string = $argv[0] . " $doing " . date($config['dateformat']['compact']) . " - $polled_devices devices polled in $poller_time secs";
|
$string = $argv[0] . " $doing " . date($config['dateformat']['compact']) . " - $polled_devices devices polled in $poller_time secs";
|
||||||
|
|||||||
2
sql-schema/042.sql
Normal file
2
sql-schema/042.sql
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
CREATE TABLE `pollers` (`id` int(11) NOT NULL AUTO_INCREMENT, `poller_name` varchar(255) NOT NULL, `last_polled` datetime NOT NULL, `devices` int(11) NOT NULL, `time_taken` double NOT NULL, KEY `id` (`id`)) ENGINE=InnoDB AUTO_INCREMENT=8 DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
|
||||||
|
ALTER TABLE `devices` ADD `poller_group` INT(11) NOT NULL DEFAULT '0';
|
||||||
Reference in New Issue
Block a user