From 671654d15548ddff1dd41fbdc9d9082afe4bae6a Mon Sep 17 00:00:00 2001 From: VVelox Date: Fri, 3 Mar 2017 14:41:20 -0600 Subject: [PATCH] feature: Added Nvidia GPU application support (#6024) --- doc/Extensions/Applications.md | 28 ++++++++ .../graphs/application/nvidia-common.inc.php | 33 ++++++++++ .../graphs/application/nvidia_bar1.inc.php | 9 +++ .../graphs/application/nvidia_dbecc.inc.php | 9 +++ .../graphs/application/nvidia_dec.inc.php | 9 +++ .../graphs/application/nvidia_enc.inc.php | 9 +++ .../graphs/application/nvidia_fb.inc.php | 9 +++ .../graphs/application/nvidia_mclk.inc.php | 9 +++ .../graphs/application/nvidia_mem.inc.php | 9 +++ .../graphs/application/nvidia_pclk.inc.php | 9 +++ .../graphs/application/nvidia_pviol.inc.php | 9 +++ .../graphs/application/nvidia_pwr.inc.php | 9 +++ .../graphs/application/nvidia_rxpci.inc.php | 9 +++ .../graphs/application/nvidia_sbecc.inc.php | 9 +++ .../graphs/application/nvidia_sm.inc.php | 9 +++ .../graphs/application/nvidia_temp.inc.php | 9 +++ .../graphs/application/nvidia_tviol.inc.php | 9 +++ .../graphs/application/nvidia_txpci.inc.php | 9 +++ html/pages/apps.inc.php | 21 +++++- html/pages/device/apps/nvidia.inc.php | 42 ++++++++++++ includes/polling/applications/nvidia.inc.php | 65 +++++++++++++++++++ 21 files changed, 332 insertions(+), 1 deletion(-) create mode 100644 html/includes/graphs/application/nvidia-common.inc.php create mode 100644 html/includes/graphs/application/nvidia_bar1.inc.php create mode 100644 html/includes/graphs/application/nvidia_dbecc.inc.php create mode 100644 html/includes/graphs/application/nvidia_dec.inc.php create mode 100644 html/includes/graphs/application/nvidia_enc.inc.php create mode 100644 html/includes/graphs/application/nvidia_fb.inc.php create mode 100644 html/includes/graphs/application/nvidia_mclk.inc.php create mode 100644 html/includes/graphs/application/nvidia_mem.inc.php create mode 100644 html/includes/graphs/application/nvidia_pclk.inc.php create mode 100644 
html/includes/graphs/application/nvidia_pviol.inc.php create mode 100644 html/includes/graphs/application/nvidia_pwr.inc.php create mode 100644 html/includes/graphs/application/nvidia_rxpci.inc.php create mode 100644 html/includes/graphs/application/nvidia_sbecc.inc.php create mode 100644 html/includes/graphs/application/nvidia_sm.inc.php create mode 100644 html/includes/graphs/application/nvidia_temp.inc.php create mode 100644 html/includes/graphs/application/nvidia_tviol.inc.php create mode 100644 html/includes/graphs/application/nvidia_txpci.inc.php create mode 100644 html/pages/device/apps/nvidia.inc.php create mode 100644 includes/polling/applications/nvidia.inc.php diff --git a/doc/Extensions/Applications.md b/doc/Extensions/Applications.md index 1f52f8d2b4..f2420f3137 100644 --- a/doc/Extensions/Applications.md +++ b/doc/Extensions/Applications.md @@ -29,12 +29,14 @@ Different applications support a variety of ways collect data: by direct connect 1. [Munin](#munin) - Agent 1. [PHP-FPM](#php-fpm) - SNMP extend 1. [Fail2ban](#fail2ban) - SNMP extend +1. [Nvidia GPU](#nvidia-gpu) - SNMP extend 1. [Squid](#squid) - SNMP proxy 1. [FreeBSD NFS Server](#freebsd-nfs-server) - SNMP extend 1. [FreeBSD NFS Client](#freebsd-nfs-client) - SNMP extend 1. [Postgres](#postgres) - SNMP extend 1. [Postfix](#postfix) - SNMP extend + ### Apache Either use SNMP extend or use the agent. ##### SNMP Extend @@ -524,6 +526,31 @@ In regards to the totals graphed there are two variables banned and firewalled. If you have more than a few jails configured, you may need to use caching as each jail needs to be polled and fail2ban-client can't do so in a timely manner for than a few. This can result in failure of other SNMP information being polled. 
+### Nvidia GPU + +##### SNMP Extend + +1: Copy the shell script, nvidia, to the desired host (the host must be added to LibreNMS devices) (wget https://github.com/librenms/librenms-agent/raw/master/snmp/nvidia -O /etc/snmp/nvidia) + +2: Make the script executable (chmod +x /etc/snmp/nvidia) + +3: Edit your snmpd.conf file and add: +``` +extend nvidia /etc/snmp/nvidia +``` + +4: Restart snmpd on your host. + +5: Verify you have nvidia-smi installed, which generally should be the case if you have the driver from Nvidia installed. + +6: On the device page in Librenms, edit your host and check `Nvidia` under the Applications tab. + +The GPU numbering on the graphs will correspond to the order in which nvidia-smi enumerates them. + +For questions about what the various values are/mean, please see the nvidia-smi man page under the section covering dmon. + +Please be aware that if you have more than 35 GPUs, you will need to add more colors to the config entry $config['graph_colours']['manycolours']. + #### Squid ##### SNMP Proxy @@ -633,3 +660,4 @@ extend postfixdetailed /etc/snmp/postfixdetailed 7: On the device page in Librenms, edit your host and check `Postfix` under the Applications tab. Before doing this, run /etc/snmp/postfixdetailed to create the initial cache file so you don't end up with some crazy initial starting value. Please note that each time /etc/snmp/postfixdetailed is ran, the cache file is updated, so if this happens in between LibreNMS doing it then the values will be thrown off for that polling period. 
+ diff --git a/html/includes/graphs/application/nvidia-common.inc.php b/html/includes/graphs/application/nvidia-common.inc.php new file mode 100644 index 0000000000..54bf15563a --- /dev/null +++ b/html/includes/graphs/application/nvidia-common.inc.php @@ -0,0 +1,33 @@ + $rrd_filename, + 'descr' => 'GPU '.$int, + 'ds' => $rrdVar, + ); + + $int++; + $rrd_filename=rrd_name($device['hostname'], array('app', $app['app_type'], $app['app_id'], $int)); +} + +require 'includes/graphs/generic_multi_line_exact_numbers.inc.php'; diff --git a/html/includes/graphs/application/nvidia_bar1.inc.php b/html/includes/graphs/application/nvidia_bar1.inc.php new file mode 100644 index 0000000000..afed281f01 --- /dev/null +++ b/html/includes/graphs/application/nvidia_bar1.inc.php @@ -0,0 +1,9 @@ + 'GPU Utilization', + 'nvidia_mem' => 'Memory Utilization', + 'nvidia_enc' => 'Encoder Utilization', + 'nvidia_dec' => 'Decoder Utilization', + 'nvidia_fb' => 'Frame Buffer Memory Usage', + 'nvidia_bar1' => 'Bar1 Memory Usage', + 'nvidia_rxpci' => 'PCIe RX', + 'nvidia_txpci' => 'PCIe TX', + 'nvidia_pwr' => 'Power Usage', + 'nvidia_temp' => 'Temperature', + 'nvidia_mclk' => 'Memory Clock', + 'nvidia_pclk' => 'GPU Clock', + 'nvidia_pviol' => 'Thermal Violation Percentage', + 'nvidia_tviol' => 'Thermal Violation Boolean', + 'nvidia_sbecc' => 'Single Bit ECC Errors', + 'nvidia_dbecc' => 'Double Bit ECC Errors', +); + +foreach ($graphs as $key => $text) { + $graph_type = $key; + $graph_array['height'] = '100'; + $graph_array['width'] = '215'; + $graph_array['to'] = $config['time']['now']; + $graph_array['id'] = $app['app_id']; + $graph_array['type'] = 'application_'.$key; + + echo '
+
+

'.$text.'

+
+
+
'; + include 'includes/print-graphrow.inc.php'; + echo '
'; + echo '
'; + echo '
'; +} diff --git a/includes/polling/applications/nvidia.inc.php b/includes/polling/applications/nvidia.inc.php new file mode 100644 index 0000000000..747973b862 --- /dev/null +++ b/includes/polling/applications/nvidia.inc.php @@ -0,0 +1,65 @@ +addDataset('pwr', 'GAUGE', 0) + ->addDataset('temp', 'GAUGE', 0) + ->addDataset('sm', 'GAUGE', 0) + ->addDataset('mem', 'GAUGE', 0) + ->addDataset('enc', 'GAUGE', 0) + ->addDataset('dec', 'GAUGE', 0) + ->addDataset('mclk', 'GAUGE', 0) + ->addDataset('pclk', 'GAUGE', 0) + ->addDataset('pviol', 'GAUGE', 0) + ->addDataset('tviol', 'GAUGE', 0) + ->addDataset('fb', 'GAUGE', 0) + ->addDataset('bar1', 'GAUGE', 0) + ->addDataset('sbecc', 'GAUGE', 0) + ->addDataset('dbecc', 'GAUGE', 0) + ->addDataset('pci', 'GAUGE', 0) + ->addDataset('rxpci', 'GAUGE', 0) + ->addDataset('txpci', 'GAUGE', 0); + +$int=0; +while (isset($gpuArray[$int])) { + list($gpu, $pwr, $temp, $sm, $mem, $enc, $dec, $mclk, $pclk, $pviol, $tviol, + $fb, $bar1, $sbecc, $dbecc, $pci, $rxpci, $txpci)=explode(",", $gpuArray[$int]); + + $rrd_name = array('app', $name, $app_id, $int); + + $fields = array( + 'pwr' => $pwr, + 'temp' => $temp, + 'sm' => $sm, + 'mem' => $mem, + 'enc' => $enc, + 'dec' => $dec, + 'mclk' => $mclk, + 'pclk' => $pclk, + 'pviol' => $pviol, + 'tviol' => $tviol, + 'fb' => $fb, + 'bar1' => $bar1, + 'sbecc' => $sbecc, + 'dbecc' => $dbecc, + 'pci' => $pci, + 'rxpci' => $rxpci, + 'txpci' => $txpci + ); + + $tags = array('name' => $name, 'app_id' => $app_id, 'rrd_def' => $rrd_def, 'rrd_name' => $rrd_name); + data_update($device, 'app', $tags, $fields); + + $int++; +}