diff --git a/doc/Extensions/Applications.md b/doc/Extensions/Applications.md index bf5b4f9aee..e8fa0847fc 100644 --- a/doc/Extensions/Applications.md +++ b/doc/Extensions/Applications.md @@ -2547,20 +2547,29 @@ hide_monitoring_account = With this Boolean you can hide the Account which you 1. Copy the Perl script, smart, to the desired host. ``` -wget https://github.com/librenms/librenms-agent/raw/master/snmp/smart -O /etc/snmp/smart +wget https://github.com/librenms/librenms-agent/raw/master/snmp/smart-v1 -O /etc/snmp/smart ``` -2. Make the script executable +2. Install the depends. +``` +# FreeBSD +pkg install p5-JSON p5-MIME-Base64 smartmontools +# Debian +apt-get install cpanminus smartmontools +cpanm MIME::Base64 JSON +``` + +3. Make the script executable ``` chmod +x /etc/snmp/smart ``` -3. Edit your snmpd.conf file and add: +4. Edit your snmpd.conf file and add: ``` extend smart /etc/snmp/smart ``` -4. You will also need to create the config file, which defaults to the same path as the script, +5. You will also need to create the config file, which defaults to the same path as the script, but with .config appended. So if the script is located at /etc/snmp/smart, the config file will be `/etc/snmp/smart.config`. Alternatively you can also specific a config via `-c`. @@ -2598,23 +2607,24 @@ used for reporting and everything after that is used as the argument to be passe If you want to guess at the configuration, call it with -g and it will print out what it thinks it should be. -5. Restart snmpd on your host +6. Restart snmpd on your host If you have a large number of more than one or two disks on a system, you should consider adding this to cron. Also make sure the cache file is some place it can be written to. ``` - */3 * * * * /etc/snmp/smart -u + */5 * * * * /etc/snmp/smart -u ``` -6. If your snmp agent runs as user "snmp", edit your sudo users +7. 
If your snmp agent runs as user "snmp", edit your sudo users (usually `visudo`) and add at the bottom: ``` snmp ALL=(ALL) NOPASSWD: /etc/snmp/smart, /usr/bin/env smartctl ``` -and modify your snmpd.conf file accordingly: +and modify your snmpd.conf file accordingly, sudo can be excluded if +running it via cron: ``` extend smart /usr/bin/sudo /etc/snmp/smart @@ -2624,15 +2634,13 @@ The application should be auto-discovered as described at the top of the page. If it is not, please follow the steps set out under `SNMP Extend` heading top of page. -If you set useSN to 1, it is worth noting that you will loose -history(not able to access it from the web interface) for that device -each time you change it. You will also need to run camcontrol or the -like on said server to figure out what device actually corresponds -with that serial number. +8. Optionally set up nightly self tests for the disks. The extend will + run the specified test on all configured disks if called with the + -t flag and the name of the SMART test to run. -Also if the system you are using uses non-static device naming based -on bus information, it may be worthwhile just using the SN as the -device ID is going to be irrelevant in that case. 
+``` + 0 0 * * * /etc/snmp/smart -t long +``` ## Sneck diff --git a/includes/html/graphs/application/smart-common.inc.php b/includes/html/graphs/application/smart-common.inc.php index 841898bef3..35b8113375 100644 --- a/includes/html/graphs/application/smart-common.inc.php +++ b/includes/html/graphs/application/smart-common.inc.php @@ -11,10 +11,10 @@ $scale_min = 0; if (isset($vars['disk'])) { $disks = [$vars['disk']]; } else { - $disks = Rrd::getRrdApplicationArrays($device, $app->app_id, $name); + $disks = array_keys($app->data['disks']); } -$smart_enhancements = ['id9']; +$smart_enhancements = ['id9', 'maxtemp', 'id232']; $int = 0; $rrd_list = []; diff --git a/includes/html/graphs/application/smart_id232.inc.php b/includes/html/graphs/application/smart_id232.inc.php new file mode 100644 index 0000000000..e3aeac42d1 --- /dev/null +++ b/includes/html/graphs/application/smart_id232.inc.php @@ -0,0 +1,10 @@ +app_id, $vars['disk']]); +$rrd_filename_232 = Rrd::name($device['hostname'], ['app', $name . 
'_id232', $app->app_id, $vars['disk']]); if (Rrd::checkRrdExists($rrd_filename)) { $rrd_list[] = [ @@ -34,6 +35,11 @@ if (Rrd::checkRrdExists($rrd_filename)) { 'descr' => 'Media_Wearout_Indicator', 'ds' => 'id233', ]; + $rrd_list[] = [ + 'filename' => $rrd_filename_232, + 'descr' => 'Available_Reservd_Space', + 'ds' => 'id232', + ]; } require 'includes/html/graphs/generic_multi_line_exact_numbers.inc.php'; diff --git a/includes/html/pages/apps.inc.php b/includes/html/pages/apps.inc.php index f6590ac8f2..75b283dc72 100644 --- a/includes/html/pages/apps.inc.php +++ b/includes/html/pages/apps.inc.php @@ -274,8 +274,7 @@ $graphs['smart'] = [ 'id184', 'id187', 'id188', - 'id190', - 'id194', + 'maxtemp', 'id196', 'id197', 'id198', diff --git a/includes/html/pages/device/apps/smart.inc.php b/includes/html/pages/device/apps/smart.inc.php index 67ce029c5b..ee1530a18e 100644 --- a/includes/html/pages/device/apps/smart.inc.php +++ b/includes/html/pages/device/apps/smart.inc.php @@ -1,7 +1,5 @@ data; + +foreach ($app_data['disks'] as $label => $disk_data) { $disk = $label; if ($vars['disk'] == $disk) { $label = '' . $label . ''; } - array_push($drives, generate_link($label, $link_array, ['disk'=>$disk])); + if (isset($app_data['disks'][$disk]['health_pass'])) { + if ($app_data['disks'][$disk]['health_pass'] == 1) { + $health_status = '(OK)'; + } else { + $health_status = '(FAIL)'; + } + } + + array_push($drives, generate_link($label, $link_array, ['disk'=>$disk]) . $health_status); } printf('%s | drives: %s', generate_link('All Drives', $link_array), implode(', ', $drives)); @@ -28,6 +36,40 @@ printf('%s | drives: %s', generate_link('All Drives', $link_array), implode(', ' print_optionbar_end(); if (isset($vars['disk'])) { + if (! isset($app_data['legacy'])) { + print_optionbar_start(); + if (isset($app_data['disks'][$vars['disk']]['disk'])) { + echo 'Disk: ' . $app_data['disks'][$vars['disk']]['disk'] . "
\n"; + } + if (isset($app_data['disks'][$vars['disk']]['serial'])) { + echo 'Serial: ' . $app_data['disks'][$vars['disk']]['serial'] . "
\n"; + } + if (isset($app_data['disks'][$vars['disk']]['vendor'])) { + echo 'Vendor: ' . $app_data['disks'][$vars['disk']]['vendor'] . "
\n"; + } + if (isset($app_data['disks'][$vars['disk']]['product'])) { + echo 'Product: ' . $app_data['disks'][$vars['disk']]['product'] . "
\n"; + } + if (isset($app_data['disks'][$vars['disk']]['model_family'])) { + echo 'Model Family: ' . $app_data['disks'][$vars['disk']]['model_family'] . "
\n"; + } + if (isset($app_data['disks'][$vars['disk']]['model_number'])) { + echo 'Model Number: ' . $app_data['disks'][$vars['disk']]['model_number'] . "
\n"; + } + if (isset($app_data['disks'][$vars['disk']]['device_model'])) { + echo 'Device Model: ' . $app_data['disks'][$vars['disk']]['device_model'] . "
\n"; + } + if (isset($app_data['disks'][$vars['disk']]['revision'])) { + echo 'Revision: ' . $app_data['disks'][$vars['disk']]['revision'] . "
\n"; + } + if (isset($app_data['disks'][$vars['disk']]['fw_version'])) { + echo 'FW Version: ' . $app_data['disks'][$vars['disk']]['fw_version'] . "
\n"; + } + if (isset($app_data['disks'][$vars['disk']]['selftest_log'])) { + echo '
' . str_replace('n#', "\n#", $app_data['disks'][$vars['disk']]['selftest_log']) . "

\n"; + } + } + print_optionbar_end(); $graphs = [ 'smart_big5' => 'Reliability / Age', 'smart_temp' => 'Temperature', @@ -37,26 +79,87 @@ if (isset($vars['disk'])) { 'smart_tests_ran' => 'S.M.A.R.T self-tests run count', 'smart_runtime' => 'Power On Hours', ]; + if ($app_data['disks'][$vars['disk']]['is_ssd'] != 1) { + unset($graphs['smart_ssd']); + } } else { - $graphs = [ - 'smart_id5'=>'ID# 5, Reallocated Sectors Count', - 'smart_id9'=>'ID# 9, Power On Hours', - 'smart_id10'=>'ID# 10, Spin Retry Count', - 'smart_id173'=>'ID# 173, SSD Wear Leveller Worst Case Erase Count', - 'smart_id177'=>'ID# 177, SSD Wear Leveling Count', - 'smart_id183'=>'ID# 183, Detected Uncorrectable Bad Blocks', - 'smart_id184'=>'ID# 184, End-to-End error / IOEDC', - 'smart_id187'=>'ID# 187, Reported Uncorrectable Errors', - 'smart_id188'=>'ID# 188, Command Timeout', - 'smart_id190'=>'ID# 190, Airflow Temperature (C)', - 'smart_id194'=>'ID# 194, Temperature (C)', - 'smart_id196'=>'ID# 196, Reallocation Event Count', - 'smart_id197'=>'ID# 197, Current Pending Sector Count', - 'smart_id198'=>'ID# 198, Uncorrectable Sector Count / Offline Uncorrectable / Off-Line Scan Uncorrectable Sector Count', - 'smart_id199'=>'ID# 199, UltraDMA CRC Error Count', - 'smart_id231'=>'ID# 231, SSD Life Left', - 'smart_id233'=>'ID# 233, Media Wearout Indicator', - ]; + $graphs = []; + + if ($app_data['has']['id5'] == 1) { + $graphs['smart_id5'] = 'ID# 5, Reallocated Sectors Count'; + } + + if ($app_data['has']['id9'] == 1) { + $graphs['smart_id9'] = 'ID# 9, Power On Hours'; + } + + if ($app_data['has']['id10'] == 1) { + $graphs['smart_id10'] = 'ID# 10, Spin Retry Count'; + } + + if ($app_data['has']['id173'] == 1) { + $graphs['smart_id173'] = 'ID# 173, SSD Wear Leveller Worst Case Erase Count'; + } + + if ($app_data['has']['id177'] == 1) { + $graphs['smart_id177'] = 'ID# 177, SSD Wear Leveling Count'; + } + + if ($app_data['has']['id183'] == 1) { + $graphs['smart_id183'] = 'ID# 183, Detected Uncorrectable Bad 
Blocks'; + } + + if ($app_data['has']['id184'] == 1) { + $graphs['smart_id184'] = 'ID# 184, End-to-End error / IOEDC'; + } + + if ($app_data['has']['id187'] == 1) { + $graphs['smart_id187'] = 'ID# 187, Reported Uncorrectable Errors'; + } + + if ($app_data['has']['id188'] == 1) { + $graphs['smart_id188'] = 'ID# 188, Command Timeout'; + } + + if ($app_data['has']['id190'] == 1 || $app_data['has']['id194'] == 1) { + $graphs['smart_maxtemp'] = 'Max Temp(C), Airflow Temperature or Device'; + } + + if ($app_data['has']['id190'] == 1) { + $graphs['smart_id190'] = 'ID# 190, Airflow Temperature (C)'; + } + + if ($app_data['has']['id194'] == 1) { + $graphs['smart_id194'] = 'ID# 194, Temperature (C)'; + } + + if ($app_data['has']['id196'] == 1) { + $graphs['smart_id196'] = 'ID# 196, Reallocation Event Count'; + } + + if ($app_data['has']['id197'] == 1) { + $graphs['smart_id197'] = 'ID# 197, Current Pending Sector Count'; + } + + if ($app_data['has']['id198'] == 1) { + $graphs['smart_id198'] = 'ID# 198, Uncorrectable Sector Count / Offline Uncorrectable / Off-Line Scan Uncorrectable Sector Count'; + } + + if ($app_data['has']['id199'] == 1) { + $graphs['smart_id199'] = 'ID# 199, UltraDMA CRC Error Count'; + } + + if ($app_data['has']['id231'] == 1) { + $graphs['smart_id231'] = 'ID# 231, SSD Life Left'; + } + + if ($app_data['has']['id232'] == 1) { + $graphs['smart_id232'] = 'ID# 232, Available Reservd Space'; + } + + if ($app_data['has']['id233'] == 1) { + $graphs['smart_id233'] = 'ID# 233, Media Wearout Indicator'; + } } foreach ($graphs as $key => $text) { diff --git a/includes/polling/applications/smart.inc.php b/includes/polling/applications/smart.inc.php index 8570f7b87d..21657e6228 100644 --- a/includes/polling/applications/smart.inc.php +++ b/includes/polling/applications/smart.inc.php @@ -1,13 +1,73 @@ getOutput(); + $lines = explode("\n", $legacy); + + $data = ['disks'=>[], 'legacy'=>1]; + + $int = 0; + while (isset($lines[$int])) { + [$disk, $id5, $id10, $id173, 
$id177, $id183, $id184, $id187, $id188, $id190, $id194, + $id196, $id197, $id198, $id199, $id231, $id233, $completed, $interrupted, $read_failure, + $unknown_failure, $extended, $short, $conveyance, $selective] = explode(',', $lines[$int]); + $int++; + + // could really be any of these, but make sure we have something defined, + // otherwise there is something wrong with the line + if (isset($id173)) { + $data['disks'][$disk] = [ + '10' => $id10, + '173' => $id173, + '177' => $id177, + '183' => $id183, + '184' => $id184, + '187' => $id187, + '188' => $id188, + '190' => $id190, + '194' => $id194, + '196' => $id196, + '197' => $id197, + '198' => $id198, + '199' => $id199, + '231' => $id231, + '233' => $id233, + '5' => $id5, + '9' => $id9, + 'completed' => $completed, + 'interrupted' => $interrupted, + 'read_failure' => $read_failure, + 'unknown_failure' => $unknown_failure, + 'extended' => $extended, + 'short' => $short, + 'conveyance' => $conveyance, + 'selective' => 'selective', + ]; + } + } +} catch (JsonAppException $e) { + echo PHP_EOL . $name . ':' . $e->getCode() . ':' . $e->getMessage() . PHP_EOL; + update_application($app, $e->getCode() . ':' . $e->getMessage(), []); // Set empty metrics and error message + + return; +} + +$old_data = $app->data; +if (! isset($old_data['disks_with_failed_tests'])) { + $old_data['disks_with_failed_tests'] = []; +} +if (! 
isset($old_data['disks_with_failed_health'])) { + $old_data['disks_with_failed_health'] = []; +} $rrd_name = ['app', $name, $app->app_id]; $rrd_def = RrdDefinition::make() @@ -36,67 +96,194 @@ $rrd_def = RrdDefinition::make() ->addDataset('conveyance', 'GAUGE', 0) ->addDataset('selective', 'GAUGE', 0); -$int = 0; -$metrics = []; -while (isset($lines[$int])) { - [$disk, $id5, $id10, $id173, $id177, $id183, $id184, $id187, $id188, $id190, $id194, - $id196, $id197, $id198, $id199, $id231, $id233, $completed, $interrupted, $read_failure, - $unknown_failure, $extended, $short, $conveyance, $selective] = explode(',', $lines[$int]); - - $rrd_name = ['app', $name, $app->app_id, $disk]; - - $fields = [ - 'id5' => is_numeric($id5) ? $id5 : null, - 'id10' => is_numeric($id10) ? $id10 : null, - 'id173' => is_numeric($id173) ? $id173 : null, - 'id177' => is_numeric($id177) ? $id177 : null, - 'id183' => is_numeric($id183) ? $id183 : null, - 'id184' => is_numeric($id184) ? $id184 : null, - 'id187' => is_numeric($id187) ? $id187 : null, - 'id188' => is_numeric($id188) ? $id188 : null, - 'id190' => is_numeric($id190) ? $id190 : null, - 'id194' => is_numeric($id194) ? $id194 : null, - 'id196' => is_numeric($id196) ? $id196 : null, - 'id197' => is_numeric($id197) ? $id197 : null, - 'id198' => is_numeric($id198) ? $id198 : null, - 'id199' => is_numeric($id199) ? $id199 : null, - 'id231' => is_numeric($id231) ? $id231 : null, - 'id233' => is_numeric($id233) ? $id233 : null, - 'completed' => is_numeric($completed) ? $completed : null, - 'interrupted' => is_numeric($interrupted) ? $interrupted : null, - 'readfailure' => is_numeric($read_failure) ? $read_failure : null, - 'unknownfail' => is_numeric($unknown_failure) ? $unknown_failure : null, - 'extended' => is_numeric($extended) ? $extended : null, - 'short' => is_numeric($short) ? $short : null, - 'conveyance' => is_numeric($conveyance) ? $conveyance : null, - 'selective' => is_numeric($selective) ? 
$selective : null, - ]; - - $metrics[$disk] = $fields; - $tags = ['name' => $name, 'app_id' => $app->app_id, 'rrd_def' => $rrd_def, 'rrd_name' => $rrd_name]; - data_update($device, 'app', $tags, $fields); - - $int++; -} - -// smart enhancement id9 -$rrd_name = ['app', $name, $app->app_id]; -$rrd_def = RrdDefinition::make() +$rrd_def_id9 = RrdDefinition::make() ->addDataset('id9', 'GAUGE', 0); -$int = 0; -while (isset($lines[$int])) { - [$disk, , , , , , , , , , , , , , , , , , , , , , , , , $id9] = explode(',', $lines[$int]); +$rrd_def_id232 = RrdDefinition::make() + ->addDataset('id232', 'GAUGE', 0); - $rrd_name = ['app', $name . '_id9', $app->app_id, $disk]; +$rrd_def_maxtemp = RrdDefinition::make() + ->addDataset('maxtemp', 'GAUGE', 0); - $fields = ['id9' => $id9]; - $metrics[$disk]['id9'] = $id9; +$new_disks_with_failed_tests = []; +$new_disks_with_failed_health = []; +$data['disks_with_failed_tests'] = []; +$data['disks_with_failed_health'] = []; +$data['has'] = [ + 'id5'=>0, + 'id9'=>0, + 'id10'=>0, + 'id173'=>0, + 'id177'=>0, + 'id183'=>0, + 'id184'=>0, + 'id187'=>0, + 'id188'=>0, + 'id190'=>0, + 'id194'=>0, + 'id196'=>0, + 'id197'=>0, + 'id198'=>0, + 'id199'=>0, + 'id231'=>0, + 'id232'=>0, + 'id233'=>0, +]; +$metrics = [ + 'disks_with_failed_tests_count' => 0, + 'disks_with_failed_health_count' => 0, + 'new_disks_with_failed_tests_count' => 0, + 'new_disks_with_failed_health_count' => 0, + 'exit_nonzero' => $data['exit_nonzero'], + 'unhealthy' => $data['unhealthy'], +]; +foreach ($data['disks'] as $disk_id => $disk) { + $rrd_name = ['app', $name, $app->app_id, $disk_id]; + + $fields = [ + 'id5' => is_numeric($disk['5']) ? $disk['5'] : null, + 'id10' => is_numeric($disk['10']) ? $disk['10'] : null, + 'id173' => is_numeric($disk['173']) ? $disk['173'] : null, + 'id177' => is_numeric($disk['177']) ? $disk['177'] : null, + 'id183' => is_numeric($disk['183']) ? $disk['183'] : null, + 'id184' => is_numeric($disk['184']) ? 
$disk['184'] : null, + 'id187' => is_numeric($disk['187']) ? $disk['187'] : null, + 'id188' => is_numeric($disk['188']) ? $disk['188'] : null, + 'id190' => is_numeric($disk['190']) ? $disk['190'] : null, + 'id194' => is_numeric($disk['194']) ? $disk['194'] : null, + 'id196' => is_numeric($disk['196']) ? $disk['196'] : null, + 'id197' => is_numeric($disk['197']) ? $disk['197'] : null, + 'id198' => is_numeric($disk['198']) ? $disk['198'] : null, + 'id199' => is_numeric($disk['199']) ? $disk['199'] : null, + 'id231' => is_numeric($disk['231']) ? $disk['231'] : null, + 'id233' => is_numeric($disk['233']) ? $disk['233'] : null, + 'completed' => is_numeric($disk['completed']) ? $disk['completed'] : null, + 'interrupted' => is_numeric($disk['interrupted']) ? $disk['interrupted'] : null, + 'readfailure' => is_numeric($disk['read_failure']) ? $disk['read_failure'] : null, + 'unknownfail' => is_numeric($disk['unknown_failure']) ? $disk['unknown_failure'] : null, + 'extended' => is_numeric($disk['extended']) ? $disk['extended'] : null, + 'short' => is_numeric($disk['short']) ? $disk['short'] : null, + 'conveyance' => is_numeric($disk['conveyance']) ? $disk['conveyance'] : null, + 'selective' => is_numeric($disk['selective']) ? $disk['selective'] : null, + ]; $tags = ['name' => $name, 'app_id' => $app->app_id, 'rrd_def' => $rrd_def, 'rrd_name' => $rrd_name]; data_update($device, 'app', $tags, $fields); - $int++; + $metrics['disk_' . $disk_id . '_id5'] = $fields['id5']; + $metrics['disk_' . $disk_id . '_id10'] = $fields['id10']; + $metrics['disk_' . $disk_id . '_id173'] = $fields['id173']; + $metrics['disk_' . $disk_id . '_id177'] = $fields['id177']; + $metrics['disk_' . $disk_id . '_id183'] = $fields['id183']; + $metrics['disk_' . $disk_id . '_id184'] = $fields['id184']; + $metrics['disk_' . $disk_id . '_id187'] = $fields['id187']; + $metrics['disk_' . $disk_id . '_id188'] = $fields['id188']; + $metrics['disk_' . $disk_id . '_id190'] = $fields['id190']; + $metrics['disk_' . 
$disk_id . '_id194'] = $fields['id194']; + $metrics['disk_' . $disk_id . '_id196'] = $fields['id196']; + $metrics['disk_' . $disk_id . '_id197'] = $fields['id197']; + $metrics['disk_' . $disk_id . '_id198'] = $fields['id198']; + $metrics['disk_' . $disk_id . '_id199'] = $fields['id199']; + $metrics['disk_' . $disk_id . '_id231'] = $fields['id231']; + $metrics['disk_' . $disk_id . '_id233'] = $fields['id233']; + $metrics['disk_' . $disk_id . '_completed'] = $fields['completed']; + $metrics['disk_' . $disk_id . '_interrupted'] = $fields['interrupted']; + $metrics['disk_' . $disk_id . '_readfailure'] = $fields['readfailure']; + $metrics['disk_' . $disk_id . '_unknownfail'] = $fields['unknownfail']; + $metrics['disk_' . $disk_id . '_extended'] = $fields['extended']; + $metrics['disk_' . $disk_id . '_short'] = $fields['short']; + $metrics['disk_' . $disk_id . '_conveyance'] = $fields['conveyance']; + $metrics['disk_' . $disk_id . '_selective'] = $fields['selective']; + + $rrd_name_id9 = ['app', $name . '_id9', $app->app_id, $disk_id]; + $fields_id9 = ['id9' => $disk['9']]; + $tags_id9 = ['name' => $name, 'app_id' => $app->app_id, 'rrd_def' => $rrd_def_id9, 'rrd_name' => $rrd_name_id9]; + data_update($device, 'app', $tags_id9, $fields_id9); + + $metrics['disk_' . $disk_id . '_id9'] = $disk['9']; + + $rrd_name_id232 = ['app', $name . '_id232', $app->app_id, $disk_id]; + $fields_id232 = ['id232' => $disk['232']]; + $tags_id232 = ['name' => $name, 'app_id' => $app->app_id, 'rrd_def' => $rrd_def_id232, 'rrd_name' => $rrd_name_id232]; + data_update($device, 'app', $tags_id232, $fields_id232); + + $metrics['disk_' . $disk_id . '_id232'] = $disk['232']; + + $rrd_name_maxtemp = ['app', $name . 
'_maxtemp', $app->app_id, $disk_id]; + $fields_maxtemp = ['maxtemp' => $disk['max_temp']]; + $tags_maxtemp = ['name' => $name, 'app_id' => $app->app_id, 'rrd_def' => $rrd_def_maxtemp, 'rrd_name' => $rrd_name_maxtemp]; + data_update($device, 'app', $tags_maxtemp, $fields_maxtemp); + + $metrics['disk_' . $disk_id . '_max_temp'] = $disk['max_temp']; + + // check if it has any failed tests + // only counting failures, ignoring ones that have been interrupted + if ((is_numeric($disk['read_failure']) && $disk['read_failure'] > 0) || + (is_numeric($disk['unknown_failure']) && $disk['unknown_failure'] > 0)) { + $data['disks_with_failed_tests'][$disk_id] = 1; + $metrics['disks_with_failed_tests']++; + // add it to the list to alert on if it is a new failure + if (! isset($old_data['disks_with_failed_tests'])) { + $new_disks_with_failed_tests[] = $disk_id; + $metrics['new_disks_with_failed_tests']++; + } + } + + // check for what IDs we actually got + foreach (['5', '9', '10', '173', '177', '183', '184', '187', '188', '190', '194', '196', '197', '198', '199', '231', '232', '233'] as $id_check) { + if (is_numeric($disk[$id_check])) { + $data['has']['id' . $id_check] = 1; + } + } + + // figure out if this disk is a SSD or not + if (is_numeric($disk['173']) || is_numeric($disk['177']) || is_numeric($disk['231']) || is_numeric($disk['232']) || is_numeric($disk['233'])) { + $data['disks'][$disk_id]['is_ssd'] = 1; + $metrics['disk_' . $disk_id]['is_ssd'] = 1; + } else { + $data['disks'][$disk_id]['is_ssd'] = 0; + $metrics['disk_' . $disk_id]['is_ssd'] = 0; + } + + // checks if the health has failed + if (isset($disk['health_pass']) && is_numeric($disk['health_pass']) && $disk['health_pass'] < 1) { + $data['disks_with_failed_health'][$disk_id] = 1; + $metrics['disks_with_failed_health_count']++; + // add it to the list to alert on if it is a new failure + if (! 
isset($old_data['disks_with_failed_health'])) { + $new_disks_with_failed_health[] = $disk_id; + $metrics['new_disks_with_failed_health_count']++; + } + } + + $metrics['disk_' . $disk_id . '_health'] = $disk['health_pass']; + $metrics['disk_' . $disk_id . '_exit'] = $disk['exit']; } -update_application($app, $output, $metrics); +// log any disks with failed tests found +if (sizeof($new_disks_with_failed_tests) > 0) { + $log_message = 'SMART found new disks with failed tests: ' . json_encode($new_disks_with_failed_tests); + log_event($log_message, $device, 'application', 5); +} + +// log when there when we go to having no failed disks from having them previously +if (sizeof($data['disks_with_failed_tests']) == 0 && sizeof($old_data['disks_with_failed_tests']) > 0) { + $log_message = 'SMART is no longer finding any disks with failed tests'; + log_event($log_message, $device, 'application', 1); +} + +// log any disks with failed tests found +if (sizeof($new_disks_with_failed_health) > 0) { + $log_message = 'SMART found new disks with failed health checks: ' . 
json_encode($new_disks_with_failed_health); + log_event($log_message, $device, 'application', 5); +} + +// log when there when we go to having no failed disks from having them previously +if (sizeof($data['disks_with_failed_health']) == 0 && sizeof($old_data['disks_with_failed_health']) > 0) { + $log_message = 'SMART is no longer finding any disks with failed health checks'; + log_event($log_message, $device, 'application', 1); +} + +$app->data = $data; + +update_application($app, 'OK', $metrics); diff --git a/misc/alert_rules.json b/misc/alert_rules.json index 110ec2cc78..cb56d8e360 100644 --- a/misc/alert_rules.json +++ b/misc/alert_rules.json @@ -743,6 +743,16 @@ "name": "linux_config_files Configuration Files Out-Of-Sync > 0", "severity": "warning" }, + { + "rule": "applications.app_type = \"smart\" && application_metrics.metric = \"unhealthy\" && application_metrics.value > \"0\"", + "name": "SMART: one or more disk is unhealthy", + "severity": "critical" + }, + { + "rule": "applications.app_type = \"smart\" && application_metrics.metric = \"exit_nonzero\" && application_metrics.value > \"0\"", + "name": "SMART: one or more disk could not be polled", + "severity": "critical" + }, { "rule": "applications.app_type = \"suricata_extract\" && application_metrics.metric = \"errors\" && application_metrics.value > \"0\"", "name": "Suricata Extract Submit errors found > 0", diff --git a/tests/data/linux_smart-legacy.json b/tests/data/linux_smart-legacy.json new file mode 100644 index 0000000000..02a54e7b0c --- /dev/null +++ b/tests/data/linux_smart-legacy.json @@ -0,0 +1,250 @@ +{ + "applications": { + "discovery": { + "applications": [ + { + "app_type": "smart", + "app_state": "UNKNOWN", + "discovered": 1, + "app_state_prev": null, + "app_status": "", + "app_instance": "", + "data": null + } + ] + }, + "poller": { + "applications": [ + { + "app_type": "smart", + "app_state": "OK", + "discovered": 1, + "app_state_prev": "UNKNOWN", + "app_status": "", + "app_instance": 
"", + "data": "{\"disks\":{\"Z304VCFY\":{\"10\":\"0\",\"173\":\"null\",\"177\":\"null\",\"183\":\"0\",\"184\":\"0\",\"187\":\"0\",\"188\":\"0\",\"190\":\"35\",\"194\":\"35\",\"196\":\"null\",\"197\":\"0\",\"198\":\"0\",\"199\":\"0\",\"231\":\"null\",\"233\":\"null\",\"5\":\"0\",\"9\":null,\"completed\":\"5\",\"interrupted\":\"1\",\"read_failure\":\"0\",\"unknown_failure\":\"0\",\"extended\":\"6\",\"short\":\"0\",\"conveyance\":\"0\",\"selective\":\"selective\",\"is_ssd\":0}},\"legacy\":1,\"disks_with_failed_tests\":[],\"disks_with_failed_health\":[],\"has\":{\"id5\":1,\"id9\":0,\"id10\":1,\"id173\":0,\"id177\":0,\"id183\":1,\"id184\":1,\"id187\":1,\"id188\":1,\"id190\":1,\"id194\":1,\"id196\":0,\"id197\":1,\"id198\":1,\"id199\":1,\"id231\":0,\"id232\":0,\"id233\":0}}" + } + ], + "application_metrics": [ + { + "metric": "disk_Z304VCFY", + "value": 1, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_completed", + "value": 5, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_conveyance", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_exit", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_extended", + "value": 6, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_health", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_id10", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_id173", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_id177", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_id183", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_id184", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_id187", + "value": 0, + 
"value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_id188", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_id190", + "value": 35, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_id194", + "value": 35, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_id196", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_id197", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_id198", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_id199", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_id231", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_id232", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_id233", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_id5", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_id9", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_interrupted", + "value": 1, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_max_temp", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_readfailure", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_selective", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + + + { + "metric": "disk_Z304VCFY_short", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_Z304VCFY_unknownfail", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disks_with_failed_health_count", + "value": 0, + "value_prev": null, + 
"app_type": "smart" + }, + { + "metric": "disks_with_failed_tests_count", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "exit_nonzero", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "new_disks_with_failed_health_count", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "new_disks_with_failed_tests_count", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "unhealthy", + "value": 0, + "value_prev": null, + "app_type": "smart" + } + ] + } + } +} diff --git a/tests/data/linux_smart-v1.json b/tests/data/linux_smart-v1.json new file mode 100644 index 0000000000..82db184288 --- /dev/null +++ b/tests/data/linux_smart-v1.json @@ -0,0 +1,248 @@ +{ + "applications": { + "discovery": { + "applications": [ + { + "app_type": "smart", + "app_state": "UNKNOWN", + "discovered": 1, + "app_state_prev": null, + "app_status": "", + "app_instance": "", + "data": null + } + ] + }, + "poller": { + "applications": [ + { + "app_type": "smart", + "app_state": "OK", + "discovered": 1, + "app_state_prev": "UNKNOWN", + "app_status": "", + "app_instance": "", + "data": "{\"disks\":{\"da0\":{\"10\":\"0\",\"173\":\"null\",\"177\":\"null\",\"183\":\"0\",\"184\":\"0\",\"187\":\"0\",\"188\":0,\"190\":\"34\",\"194\":\"34\",\"196\":\"null\",\"197\":\"0\",\"198\":\"0\",\"199\":\"0\",\"231\":\"null\",\"232\":\"null\",\"233\":\"null\",\"5\":\"0\",\"9\":\"63417\",\"completed\":5,\"conveyance\":\"0\",\"device_model\":\"ST4000DM000-1F2168\",\"disk\":\"da0 -d sat\",\"exit\":0,\"extended\":6,\"fw_version\":\"CC54\",\"health_pass\":1,\"interrupted\":1,\"max_temp\":\"34\",\"model_family\":\"Seagate Desktop HDD.15\",\"offline\":\"0\",\"read_failure\":\"0\",\"selective\":\"0\",\"selftest_log\":\"Num Test_Description Status Remaining LifeTime(hours) LBA_of_first_errorn# 1 Extended offline Completed without error 00% 63322 -n# 2 Extended offline Completed without error 00% 32177 -n# 3 Extended 
offline Completed without error 00% 9042 -n# 4 Extended offline Completed without error 00% 8432 -n# 5 Extended offline Completed without error 00% 29 -n# 6 Extended offline Interrupted (host reset) 00% 0 -\",\"serial\":\"Z304VCFY\",\"short\":\"0\",\"unknown_failure\":\"0\",\"is_ssd\":0}},\"exit_nonzero\":0,\"unhealthy\":0,\"disks_with_failed_tests\":[],\"disks_with_failed_health\":[],\"has\":{\"id5\":1,\"id9\":1,\"id10\":1,\"id173\":0,\"id177\":0,\"id183\":1,\"id184\":1,\"id187\":1,\"id188\":1,\"id190\":1,\"id194\":1,\"id196\":0,\"id197\":1,\"id198\":1,\"id199\":1,\"id231\":0,\"id232\":0,\"id233\":0}}" + } + ], + "application_metrics": [ + { + "metric": "disk_da0", + "value": 1, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_completed", + "value": 5, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_conveyance", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_exit", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_extended", + "value": 6, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_health", + "value": 1, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_id10", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_id173", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_id177", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_id183", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_id184", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_id187", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_id188", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_id190", + "value": 34, + "value_prev": null, + "app_type": 
"smart" + }, + { + "metric": "disk_da0_id194", + "value": "34", + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_id196", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_id197", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_id198", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_id199", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_id231", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_id232", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_id233", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_id5", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_id9", + "value": 63417, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_interrupted", + "value": 1, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_max_temp", + "value": 34, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_readfailure", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_selective", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_short", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disk_da0_unknownfail", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disks_with_failed_health_count", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "disks_with_failed_tests_count", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "exit_nonzero", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "new_disks_with_failed_health_count", + "value": 0, + "value_prev": 
null, + "app_type": "smart" + }, + { + "metric": "new_disks_with_failed_tests_count", + "value": 0, + "value_prev": null, + "app_type": "smart" + }, + { + "metric": "unhealthy", + "value": 0, + "value_prev": null, + "app_type": "smart" + } + ] + } + } +} diff --git a/tests/snmpsim/linux_smart-legacy.snmprec b/tests/snmpsim/linux_smart-legacy.snmprec new file mode 100644 index 0000000000..2adbb02a64 --- /dev/null +++ b/tests/snmpsim/linux_smart-legacy.snmprec @@ -0,0 +1,10 @@ +1.3.6.1.2.1.1.1.0|4|Linux server 3.10.0-693.5.2.el7.x86_64 #1 SMP Fri Oct 20 20:32:50 UTC 2017 x86_64 +1.3.6.1.2.1.1.2.0|6|1.3.6.1.4.1.8072.3.2.10 +1.3.6.1.2.1.1.3.0|67|77550514 +1.3.6.1.2.1.1.4.0|4| +1.3.6.1.2.1.1.5.0|4| +1.3.6.1.2.1.1.6.0|4| +1.3.6.1.2.1.25.1.1.0|67|77552962 +1.3.6.1.4.1.8072.1.3.2.2.1.21.6.100.105.115.116.114.111|2|1 +1.3.6.1.4.1.8072.1.3.2.2.1.21.5.115.109.97.114.116|2|1 +1.3.6.1.4.1.8072.1.3.2.3.1.2.5.115.109.97.114.116|4x|5a333034564346592c302c302c6e756c6c2c6e756c6c2c302c302c302c302c33352c33352c6e756c6c2c302c302c302c6e756c6c2c6e756c6c2c352c312c302c302c362c302c302c302c3633343338 diff --git a/tests/snmpsim/linux_smart-v1.snmprec b/tests/snmpsim/linux_smart-v1.snmprec new file mode 100644 index 0000000000..f436cf1fb3 --- /dev/null +++ b/tests/snmpsim/linux_smart-v1.snmprec @@ -0,0 +1,10 @@ +1.3.6.1.2.1.1.1.0|4|Linux server 3.10.0-693.5.2.el7.x86_64 #1 SMP Fri Oct 20 20:32:50 UTC 2017 x86_64 +1.3.6.1.2.1.1.2.0|6|1.3.6.1.4.1.8072.3.2.10 +1.3.6.1.2.1.1.3.0|67|77550514 +1.3.6.1.2.1.1.4.0|4| +1.3.6.1.2.1.1.5.0|4| +1.3.6.1.2.1.1.6.0|4| +1.3.6.1.2.1.25.1.1.0|67|77552962 +1.3.6.1.4.1.8072.1.3.2.2.1.21.6.100.105.115.116.114.111|2|1 +1.3.6.1.4.1.8072.1.3.2.2.1.21.5.115.109.97.114.116|2|1 
+1.3.6.1.4.1.8072.1.3.2.3.1.2.5.115.109.97.114.116|4x|7b2264617461223a7b226469736b73223a7b22646130223a7b223130223a2230222c22313733223a226e756c6c222c22313737223a226e756c6c222c22313833223a2230222c22313834223a2230222c22313837223a2230222c22313838223a302c22313930223a223334222c22313934223a223334222c22313936223a226e756c6c222c22313937223a2230222c22313938223a2230222c22313939223a2230222c22323331223a226e756c6c222c22323332223a226e756c6c222c22323333223a226e756c6c222c2235223a2230222c2239223a223633343137222c22636f6d706c65746564223a352c22636f6e766579616e6365223a2230222c226465766963655f6d6f64656c223a22535434303030444d3030302d314632313638222c226469736b223a22646130202d6420736174222c2265786974223a302c22657874656e646564223a362c2266775f76657273696f6e223a2243433534222c226865616c74685f70617373223a312c22696e746572727570746564223a312c226d61785f74656d70223a223334222c226d6f64656c5f66616d696c79223a2253656167617465204465736b746f70204844442e3135222c226f66666c696e65223a2230222c22726561645f6661696c757265223a2230222c2273656c656374697665223a2230222c2273656c66746573745f6c6f67223a224e756d2020546573745f4465736372697074696f6e2020202053746174757320202020202020202020202020202020202052656d61696e696e6720204c69666554696d6528686f7572732920204c42415f6f665f66697273745f6572726f725c6e2320312020457874656e646564206f66666c696e6520202020436f6d706c6574656420776974686f7574206572726f7220202020202020303025202020202036333332322020202020202020202d5c6e2320322020457874656e646564206f66666c696e6520202020436f6d706c6574656420776974686f7574206572726f7220202020202020303025202020202033323137372020202020202020202d5c6e2320332020457874656e646564206f66666c696e6520202020436f6d706c6574656420776974686f7574206572726f7220202020202020303025202020202020393034322020202020202020202d5c6e2320342020457874656e646564206f66666c696e6520202020436f6d706c6574656420776974686f7574206572726f7220202020202020303025202020202020383433322020202020202020202d5c6e2320352020457874656e646564206f66666c696e6520202020436f6d706c6574656420776974686f7574206572726f7220202020
202020303025202020202020202032392020202020202020202d5c6e2320362020457874656e646564206f66666c696e6520202020496e7465727275707465642028686f737420726573657429202020202020303025202020202020202020302020202020202020202d222c2273657269616c223a225a33303456434659222c2273686f7274223a2230222c22756e6b6e6f776e5f6661696c757265223a2230227d7d2c22657869745f6e6f6e7a65726f223a302c22756e6865616c746879223a307d2c226572726f72223a302c226572726f72537472696e67223a22222c2276657273696f6e223a317d0a