1
0
mirror of https://github.com/dennypage/dpinger.git synced 2024-05-19 06:50:01 +00:00

16 Commits
v1.8 ... v3.0

Author SHA1 Message Date
Denny Page
2b032751e5 Enhance pid file support to detect running processes 2017-09-29 16:04:13 -07:00
Denny Page
84ee15b155 Clean up loss accuracy description 2017-09-29 15:13:48 -07:00
Denny Page
e10c51ad95 Move check for zero intervals back to caller. Prior commit broke disable of report interval. 2017-09-29 00:23:16 -07:00
Denny Page
579ae3d66b Detect (and reject) negative numbers in paramaters 2017-09-28 15:53:20 -07:00
Denny Page
64e644e7be Don't wait for send interval before sending first echo request 2017-09-28 14:20:21 -07:00
Denny Page
a18d82ab6e Update copyright 2017-09-28 14:00:40 -07:00
Denny Page
34b0bb924e Use accept4() 2017-09-28 13:04:16 -07:00
dennypage
4173834bbe Create NOTES.md 2017-09-27 13:36:52 -07:00
dennypage
2a8eaa0c8f Merge pull request #23 from joemiller/openbsd
problem: cannot build on openbsd
2017-08-22 16:57:09 -07:00
joe miller
edb883498d problem: cannot build on openbsd
solution: include socket.h before if.h since if.h relies on types
defined in socket.h
2017-03-15 08:16:00 -07:00
Denny Page
c276feb339 Confirm that the rrd file is writable before starting 2016-03-01 22:14:36 -08:00
Denny Page
ef21655e77 Change title 2016-03-01 22:14:13 -08:00
Denny Page
2d2d21892a Change the default time period and send interval from 30s/250m to 60s/500m
Check time period vs send interval and loss interval to ensure there is always one resolved slot
2016-03-01 22:11:03 -08:00
Denny Page
a24c0cd0d0 Add option to set receive thread scheduling class
Don't call fatal with a NULL format
2016-03-01 21:58:10 -08:00
Denny Page
fdbd4a1d96 Add a safety cast for 32 bit systems 2016-02-27 21:29:33 -08:00
Denny Page
6796fa0752 Fix integer overflow on 32 bit 2016-02-27 19:34:47 -08:00
6 changed files with 191 additions and 84 deletions

View File

@@ -1,4 +1,4 @@
Copyright (c) 2015-2016, Denny Page
Copyright (c) 2015-2017, Denny Page
All rights reserved.
Redistribution and use in source and binary forms, with or without

View File

@@ -2,7 +2,7 @@
#WARNINGS=-Wall -Wextra -Wformat=2 -Wno-unused-result
CC=clang
WARNINGS=-Weverything -Wno-padded -Wno-disabled-macro-expansion
WARNINGS=-Weverything -Wno-padded -Wno-disabled-macro-expansion -Wno-reserved-id-macro
CFLAGS=${WARNINGS} -pthread -g -O2

13
NOTES.md Normal file
View File

@@ -0,0 +1,13 @@
<b>Loss accuracy</b>
In general, dpinger works a bit differently than other latency monitors. Rather than a "probe" that fires off and processes a handful of echo request/replies all at once, dpinger maintains a rolling array of echo requests spaced on the send interval. In other words, instead of waking up every second and sending 4 echo requests at once, dpinger sends an echo request every 250 milliseconds. When dpinger receives an echo reply, the time difference between the request packet and reply packet (latency) is recorded. There is nothing that times out an echo request/reply and records it as permanently lost.
When the alert check is made, or a report is generated, dpinger goes through the array and examines each echo request. If a reply has been received, it is used as part of the overall latency calculation. If a reply has not yet been received, the amount of time since the request is compared against the loss interval. If it is greater than the loss interval, the request/reply is counted as lost in the current report. However, the concept of the request/reply being lost is not a permanent decision. In subsequent reports, if the missing reply has been received, its latency will be used instead of being counted as lost.
It's important to keep in mind that latency and loss are reported as averages across the entire request set. The default time period for dpinger is 60 seconds, with an echo request being sent every 500 milliseconds. This means that the latency and loss will be reported as averages across 115-120 samples. The alert check runs every second by default, so each time, the 2 oldest entries in the set have been replaced by the 2 newest ones.
Note that if you want accurate loss reporting, it is important that the number of samples be sufficient. In order to achieve 1% loss resolution, you need more than 100 samples in the set. The calculation for loss resolution is:
100 * send_interval / (time_period - loss_interval)
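With the default settings (60 second time period, 500 millisecond send interval, and a loss interval of 4x the send interval), dpinger reports loss with an accuracy of 0.86%.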

251
dpinger.c
View File

@@ -1,6 +1,6 @@
//
// Copyright (c) 2015-2016, Denny Page
// Copyright (c) 2015-2017, Denny Page
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
@@ -27,6 +27,11 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Silly that this is required for accept4 on Linux
#define _GNU_SOURCE
#include <stdio.h>
#include <errno.h>
#include <string.h>
@@ -39,10 +44,11 @@
#include <signal.h>
#include <netdb.h>
#include <net/if.h>
#include <sys/socket.h>
#include <net/if.h>
#include <sys/un.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
@@ -52,17 +58,6 @@
#include <pthread.h>
#include <syslog.h>
// TODO:
//
// After December 31st, 2016, review use of fcntl() for setting non blocking
// and close on exec. It would be preferable to use accept4(), SOCK_CLOEXEC
// and SOCK_NONBLOCK. These are currently avoided to allow use on older
// systems such as FreeBSD 9.3, Linux 2.6.26.
// For Linux accept4() currently requires defining _GNU_SOURCE which we would
// like to avoid.
// For FreeBSD, these definitions were introduced with FreeBSD 10.0 and are
// not present in 9.3 which is supported through 2016.
// Who we are
static const char * progname;
@@ -74,16 +69,17 @@ static const char * pidfile_name = NULL;
// Flags
static unsigned int flag_rewind = 0;
static unsigned int flag_syslog = 0;
static unsigned int flag_priority = 0;
// String representation of target
#define ADDR_STR_MAX (INET6_ADDRSTRLEN + IF_NAMESIZE + 1)
static char dest_str[ADDR_STR_MAX];
// Time period over which we are averaging results in ms
static unsigned long time_period_msec = 30000;
static unsigned long time_period_msec = 60000;
// Interval between sends in ms
static unsigned long send_interval_msec = 250;
static unsigned long send_interval_msec = 500;
// Interval before a sequence is initially treated as lost
// Input from command line in ms and used in us
@@ -191,6 +187,8 @@ static uint16_t echo_id;
static uint16_t next_sequence = 0;
static uint16_t sequence_limit;
// Receive thread ready
static unsigned int recv_ready = 0;
//
// Termination handler
@@ -286,7 +284,7 @@ llsqrt(
}
}
return s;
return (unsigned long) s;
}
@@ -333,18 +331,23 @@ send_thread(
echo_request->code = 0;
echo_request->id = echo_id;
// Give the recv thread a moment to initialize
sleeptime.tv_sec = 0;
sleeptime.tv_nsec = 10000; // 10us
do {
r = nanosleep(&sleeptime, NULL);
if (r == -1)
{
logger("nanosleep error in send thread waiting for recv thread: %d\n", errno);
}
} while (recv_ready == 0);
// Set up the timespec for nanosleep
sleeptime.tv_sec = send_interval_msec / 1000;
sleeptime.tv_nsec = (send_interval_msec % 1000) * 1000000;
while (1)
{
r = nanosleep(&sleeptime, NULL);
if (r == -1)
{
logger("nanosleep error in send thread: %d\n", errno);
}
// Set sequence number and checksum
echo_request->sequence = htons(next_sequence);
echo_request->cksum = 0;
@@ -352,8 +355,8 @@ send_thread(
array[next_slot].status = PACKET_STATUS_EMPTY;
sched_yield();
clock_gettime(CLOCK_MONOTONIC, &array[next_slot].time_sent);
clock_gettime(CLOCK_MONOTONIC, &array[next_slot].time_sent);
array[next_slot].status = PACKET_STATUS_SENT;
len = sendto(send_sock, echo_request, echo_request_len, 0, (struct sockaddr *) &dest_addr, dest_addr_len);
if (len == -1)
@@ -363,6 +366,12 @@ send_thread(
next_slot = (next_slot + 1) % array_size;
next_sequence = (next_sequence + 1) % sequence_limit;
r = nanosleep(&sleeptime, NULL);
if (r == -1)
{
logger("nanosleep error in send thread: %d\n", errno);
}
}
}
@@ -383,6 +392,9 @@ recv_thread(
struct timespec now;
unsigned int array_slot;
// Thread startup complete
recv_ready = 1;
while (1)
{
src_addr_len = sizeof(src_addr);
@@ -471,7 +483,7 @@ report(
packets_received++;
latency_usec = array[slot].latency_usec;
total_latency_usec += latency_usec;
total_latency_usec2 += latency_usec * latency_usec;
total_latency_usec2 += (unsigned long long) latency_usec * latency_usec;
}
else if (array[slot].status == PACKET_STATUS_SENT &&
ts_elapsed_usec(&array[slot].time_sent, &now) > loss_interval_usec)
@@ -489,7 +501,7 @@ report(
// stddev = sqrt((sum(rtt^2) / packets) - (sum(rtt) / packets)^2)
*average_latency_usec = avg;
*latency_deviation = llsqrt(avg2 - (avg * avg));
*latency_deviation = llsqrt(avg2 - ((unsigned long long) avg * avg));
}
else
{
@@ -687,9 +699,14 @@ usocket_thread(
while (1)
{
#if defined(DISABLE_ACCEPT4)
// Legacy
sock_fd = accept(usocket_fd, NULL, NULL);
(void) fcntl(sock_fd, F_SETFL, FD_CLOEXEC);
(void) fcntl(sock_fd, F_SETFL, fcntl(sock_fd, F_GETFL, 0) | O_NONBLOCK);
#else
sock_fd = accept4(usocket_fd, NULL, NULL, SOCK_NONBLOCK | SOCK_CLOEXEC);
#endif
report(&average_latency_usec, &latency_deviation, &average_loss_percent);
@@ -727,10 +744,10 @@ get_time_arg_msec(
const char * arg,
unsigned long * value)
{
unsigned long t;
long t;
char * suffix;
t = strtoul(arg, &suffix, 10);
t = strtol(arg, &suffix, 10);
if (*suffix == 'm')
{
// Milliseconds
@@ -743,13 +760,13 @@ get_time_arg_msec(
suffix++;
}
// Garbage in the number?
if (*suffix != 0)
// Invalid specification?
if (t < 0 || *suffix != 0)
{
return 1;
}
*value = t;
*value = (unsigned long) t;
return 0;
}
@@ -762,22 +779,22 @@ get_percent_arg(
const char * arg,
unsigned long * value)
{
unsigned long t;
long t;
char * suffix;
t = strtoul(arg, &suffix, 10);
t = strtol(arg, &suffix, 10);
if (*suffix == '%')
{
suffix++;
}
// Garbage in the number?
if (*suffix != 0 || t > 100)
// Invalid specification?
if (t < 0 || t > 100 || *suffix != 0)
{
return 1;
}
*value = t;
*value = (unsigned long) t;
return 0;
}
@@ -790,10 +807,10 @@ get_length_arg(
const char * arg,
unsigned long * value)
{
unsigned long t;
long t;
char * suffix;
t = strtoul(arg, &suffix, 10);
t = strtol(arg, &suffix, 10);
if (*suffix == 'b')
{
// Bytes
@@ -806,13 +823,13 @@ get_length_arg(
suffix++;
}
// Garbage in the number?
if (*suffix != 0)
// Invalid specification?
if (t < 0 || *suffix != 0)
{
return 1;
}
*value = t;
*value = (unsigned long) t;
return 0;
}
@@ -824,15 +841,16 @@ static void
usage(void)
{
fprintf(stderr, "Usage:\n");
fprintf(stderr, " %s [-f] [-R] [-S] [-B bind_addr] [-s send_interval] [-l loss_interval] [-t time_period] [-r report_interval] [-d data_length] [-o output_file] [-A alert_interval] [-D latency_alarm] [-L loss_alarm] [-C alert_cmd] [-i identifier] [-u usocket] [-p pidfile] dest_addr\n\n", progname);
fprintf(stderr, " %s [-f] [-R] [-S] [-P] [-B bind_addr] [-s send_interval] [-l loss_interval] [-t time_period] [-r report_interval] [-d data_length] [-o output_file] [-A alert_interval] [-D latency_alarm] [-L loss_alarm] [-C alert_cmd] [-i identifier] [-u usocket] [-p pidfile] dest_addr\n\n", progname);
fprintf(stderr, " options:\n");
fprintf(stderr, " -f run in foreground\n");
fprintf(stderr, " -R rewind output file between reports\n");
fprintf(stderr, " -S log warnings via syslog\n");
fprintf(stderr, " -P priority scheduling for receive thread (requires root)\n");
fprintf(stderr, " -B bind (source) address\n");
fprintf(stderr, " -s time interval between echo requests (default 250ms)\n");
fprintf(stderr, " -l time interval before packets are treated as lost (default 5x send interval)\n");
fprintf(stderr, " -t time period over which results are averaged (default 30s)\n");
fprintf(stderr, " -s time interval between echo requests (default 500ms)\n");
fprintf(stderr, " -l time interval before packets are treated as lost (default 4x send interval)\n");
fprintf(stderr, " -t time period over which results are averaged (default 60s)\n");
fprintf(stderr, " -r time interval between reports (default 1s)\n");
fprintf(stderr, " -d data length (default 0)\n");
fprintf(stderr, " -o output file for reports (default stdout)\n");
@@ -866,14 +884,11 @@ fatal(
const char * format,
...)
{
if (format)
{
va_list args;
va_list args;
va_start(args, format);
vfprintf(stderr, format, args);
va_end(args);
}
va_start(args, format);
vfprintf(stderr, format, args);
va_end(args);
exit(EXIT_FAILURE);
}
@@ -897,7 +912,7 @@ parse_args(
progname = argv[0];
while((opt = getopt(argc, argv, "fRSB:s:l:t:r:d:o:A:D:L:C:i:u:p:")) != -1)
while((opt = getopt(argc, argv, "fRSPB:s:l:t:r:d:o:A:D:L:C:i:u:p:")) != -1)
{
switch (opt)
{
@@ -913,6 +928,10 @@ parse_args(
flag_syslog = 1;
break;
case 'P':
flag_priority = 1;
break;
case 'B':
bind_arg = optarg;
break;
@@ -971,7 +990,7 @@ parse_args(
case 'D':
r = get_time_arg_msec(optarg, &latency_alarm_threshold_msec);
if (r || latency_alarm_threshold_msec == 0)
if (r)
{
fatal("invalid latency alarm threshold %s\n", optarg);
}
@@ -980,7 +999,7 @@ parse_args(
case 'L':
r = get_percent_arg(optarg, &loss_alarm_threshold_percent);
if (r || loss_alarm_threshold_percent == 0)
if (r)
{
fatal("invalid loss alarm threshold %s\n", optarg);
}
@@ -1018,7 +1037,7 @@ parse_args(
default:
usage();
fatal(NULL);
exit(EXIT_FAILURE);
}
}
@@ -1026,7 +1045,7 @@ parse_args(
if (argc != optind + 1)
{
usage();
fatal(NULL);
exit(EXIT_FAILURE);
}
dest_arg = argv[optind];
@@ -1036,17 +1055,17 @@ parse_args(
fatal("no activity enabled\n");
}
// Ensure we have something to average over
if (time_period_msec < send_interval_msec)
// Ensure there is a minimum of one resolved slot at all times
if (time_period_msec <= send_interval_msec * 2 + loss_interval_msec)
{
fatal("time period cannot be less than send interval\n");
fatal("the time period must be greater than twice the send interval plus the loss interval\n");
}
// Ensure we don't have sequence space issues. This really should only be hit by
// complete accident. Even a ratio of 16384:1 would be excessive.
if (time_period_msec / send_interval_msec > 65536)
{
fatal("ratio of time period to send interval cannot exceed 65536:1\n");
fatal("the ratio of time period to send interval cannot exceed 65536:1\n");
}
// Check destination address
@@ -1127,10 +1146,13 @@ main(
char *argv[])
{
char bind_str[ADDR_STR_MAX] = "(none)";
char pidbuf[64];
int pidfile_fd = -1;
pid_t pid;
pthread_t thread;
struct sigaction act;
int buflen = PACKET_BUFLEN;
ssize_t len;
ssize_t rs;
int r;
@@ -1177,6 +1199,66 @@ main(
(void) setgid(getgid());
(void) setuid(getuid());
// Create pid file
if (pidfile_name)
{
pidfile_fd = open(pidfile_name, O_WRONLY | O_CREAT | O_EXCL | O_CLOEXEC, 0644);
if (pidfile_fd != -1)
{
// Lock the pid file
r = flock(pidfile_fd, LOCK_EX | LOCK_NB);
if (r == -1)
{
perror("flock");
fatal("error locking pid file\n");
}
}
else
{
// Pid file already exists?
pidfile_fd = open(pidfile_name, O_RDWR | O_CREAT | O_CLOEXEC, 0644);
if (pidfile_fd == -1)
{
perror("open");
fatal("cannot create/open pid file %s\n", pidfile_name);
}
// Lock the pid file
r = flock(pidfile_fd, LOCK_EX | LOCK_NB);
if (r == -1)
{
fatal("pid file %s is in use by another process\n", pidfile_name);
}
// Check for existing pid
rs = read(pidfile_fd, pidbuf, sizeof(pidbuf) - 1);
if (rs > 0)
{
pidbuf[rs] = 0;
pid = (pid_t) strtol(pidbuf, NULL, 10);
if (pid > 0)
{
// Is the pid still alive?
r = kill(pid, 0);
if (r == 0)
{
fatal("pid file %s is in use by process %u\n", pidfile_name, (unsigned int) pid);
}
}
}
// Reset the pid file
(void) lseek(pidfile_fd, 0, 0);
r = ftruncate(pidfile_fd, 0);
if (r == -1)
{
perror("ftruncate");
fatal("cannot write pid file %s\n", pidfile_name);
}
}
}
// Create report file
if (report_name)
{
@@ -1236,31 +1318,20 @@ main(
}
}
// Create pid file
if (pidfile_name)
{
pidfile_fd = open(pidfile_name, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0644);
if (pidfile_fd == -1)
{
perror("open");
fatal("cannot open/create pid file %s\n", pidfile_name);
}
}
// End of general errors from command line options
// Self background
if (foreground == 0)
{
r = fork();
pid = fork();
if (r == -1)
if (pid == -1)
{
perror("fork");
fatal("cannot background\n");
}
if (r)
if (pid)
{
_exit(EXIT_SUCCESS);
}
@@ -1277,16 +1348,13 @@ main(
// Write pid file
if (pidfile_fd != -1)
{
char buf[64];
ssize_t len;
len = snprintf(buf, sizeof(buf), "%u\n", (unsigned) getpid());
if (len < 0 || (size_t) len > sizeof(buf))
len = snprintf(pidbuf, sizeof(pidbuf), "%u\n", (unsigned) getpid());
if (len < 0 || (size_t) len > sizeof(pidbuf))
{
fatal("error formatting pidfile\n");
}
rs = write(pidfile_fd, buf, (size_t) len);
rs = write(pidfile_fd, pidbuf, (size_t) len);
if (rs == -1)
{
perror("write");
@@ -1320,7 +1388,7 @@ main(
// Set the default loss interval
if (loss_interval_msec == 0)
{
loss_interval_msec = send_interval_msec * 5;
loss_interval_msec = send_interval_msec * 4;
}
loss_interval_usec = loss_interval_msec * 1000;
@@ -1363,6 +1431,27 @@ main(
fatal("cannot create recv thread\n");
}
// Set priority on recv thread if requested
if (flag_priority)
{
struct sched_param thread_sched_param;
r = sched_get_priority_min(SCHED_RR);
if (r == -1)
{
perror("sched_get_priority_min");
fatal("cannot determin minimum shceduling priority for SCHED_RR\n");
}
thread_sched_param.sched_priority = r;
r = pthread_setschedparam(thread, SCHED_RR, &thread_sched_param);
if (r != 0)
{
perror("pthread_setschedparam");
fatal("cannot set receive thread priority\n");
}
}
// Create send thread
r = pthread_create(&thread, NULL, &send_thread, NULL);
if (r != 0)

View File

@@ -15,6 +15,11 @@ dpinger=/usr/local/bin/dpinger
rrdfile="${name}.rrd"
if [ \! -w ${rrdfile} ]
then
echo "$0: file \"${rrdfile}\" does not exist or is not writable"
exit 1
fi
${dpinger} -f ${options} -s 500m -t 60s -r 60s ${targetip} |
while read -r latency stddev loss; do

View File

@@ -1,5 +1,5 @@
<html>
<head><title>WAN Statistics</title></head>
<head><title>Latency Statistics for WAN</title></head>
<body>
<img src="/tmp/wan-1.png" alt="wan-1">
<p>