Enhance pid file support to detect running processes

Clean up loss accuracy description
Move check for zero intervals back to caller. Prior commit broke disable of report interval.
2024-05-19 06:50:01 +00:00 · 2017-09-29 16:04:13 -07:00 · 2017-09-29 15:13:48 -07:00 · 2017-09-29 00:23:16 -07:00 · 2017-09-28 15:53:20 -07:00 · 2017-09-28 14:20:21 -07:00
4 changed files with 136 additions and 58 deletions
--- a/2
+++ b/2
@@ -1,4 +1,4 @@
-Copyright (c) 2015-2016, Denny Page
+Copyright (c) 2015-2017, Denny Page
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
--- a/2
+++ b/2
@@ -2,7 +2,7 @@
 #WARNINGS=-Wall -Wextra -Wformat=2 -Wno-unused-result

 CC=clang
-WARNINGS=-Weverything -Wno-padded -Wno-disabled-macro-expansion
+WARNINGS=-Weverything -Wno-padded -Wno-disabled-macro-expansion -Wno-reserved-id-macro

 CFLAGS=${WARNINGS} -pthread -g -O2

--- a/NOTES.md
+++ b/NOTES.md
@@ -0,0 +1,13 @@
+<b>Loss accuracy</b>
+
+In general, dpinger works a bit differently than other latency monitors. Rather than a "probe" that fires off and processes a handful of echo request/replies all at once, dpinger maintains a rolling array of echo requests spaced on the send interval. In other words, instead of waking up every second and sending 4 echo requests at once, dpinger sends an echo request every 250 milliseconds. When dpinger receives an echo reply, the time difference between the request packet and reply packet (latency) is recorded. There is nothing that times out an echo request/reply and records it as permanently lost.
+
+When the alert check is made, or a report is generated, dpinger goes through the array and examines each echo request. If a reply has been received, it is used as part of the overall latency calculation. If a reply has not yet been received, the amount of time since the request is compared against the loss interval. If it is greater than the loss interval, the request/reply is counted as lost in the current report. However, the concept of the request/reply being lost is not a permanent decision. In subsequent reports, if the missing reply has been received, its latency will be used instead of being counted as lost.
+
+It's important to keep in mind that latency and loss are reported as averages across the entire request set. The default time period for dpinger is 30 seconds, with an echo request being sent every 250 milliseconds. This means that the latency and loss will be reported as averages across 115-120 samples. The alert check runs every second by default. So each time, the 4 oldest entries in the set have been replaced by the 4 newest ones.
+
+Note that if you want accurate loss reporting, it is important that the number of samples be sufficient. In order to achieve 1% loss resolution, you need more than 100 samples in the set. The calculation for loss resolution is:
+
+  100 * send_interval / (time_period - loss_interval)
+
+The default settings for dpinger report loss with an accuracy of 0.87%.
--- a/dpinger.c
+++ b/dpinger.c
@@ -1,6 +1,6 @@

 //
-// Copyright (c) 2015-2016, Denny Page
+// Copyright (c) 2015-2017, Denny Page
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@@ -27,6 +27,11 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //

+
+// Silly that this is required for accept4 on Linux
+#define _GNU_SOURCE
+
+
 #include <stdio.h>
 #include <errno.h>
 #include <string.h>
@@ -39,10 +44,11 @@
 #include <signal.h>

 #include <netdb.h>
-#include <net/if.h>
 #include <sys/socket.h>
+#include <net/if.h>
 #include <sys/un.h>
 #include <sys/stat.h>
+#include <sys/file.h>
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>
@@ -52,17 +58,6 @@
 #include <pthread.h>
 #include <syslog.h>

-// TODO:
-//
-// After December 31st, 2016, review use of fcntl() for setting non blocking
-// and close on exec. It would be preferable to use accept4(), SOCK_CLOEXEC
-// and SOCK_NONBLOCK. These are currently avoided to allow use on older
-// systems such as FreeBSD 9.3, Linux 2.6.26.
-// For Linux accept4() currently requires defining _GNU_SOURCE which we would
-// like to avoid.
-// For FreeBSD, these definitions were introduced with FreeBSD 10.0 and are
-// not present in 9.3 which is supported through 2016.
-

 // Who we are
 static const char *             progname;
@@ -192,6 +187,8 @@ static uint16_t                 echo_id;
 static uint16_t                 next_sequence = 0;
 static uint16_t                 sequence_limit;

+// Receive thread ready
+static unsigned int             recv_ready = 0;

 //
 // Termination handler
@@ -334,18 +331,23 @@ send_thread(
    echo_request->code = 0;
    echo_request->id = echo_id;

+    // Give the recv thread a moment to initialize
+    sleeptime.tv_sec = 0;
+    sleeptime.tv_nsec = 10000; // 10us
+    do {
+        r = nanosleep(&sleeptime, NULL);
+        if (r == -1)
+        {
+            logger("nanosleep error in send thread waiting for recv thread: %d\n", errno);
+        }
+    } while (recv_ready == 0);
+
    // Set up the timespec for nanosleep
    sleeptime.tv_sec = send_interval_msec / 1000;
    sleeptime.tv_nsec = (send_interval_msec % 1000) * 1000000;

    while (1)
    {
-        r = nanosleep(&sleeptime, NULL);
-        if (r == -1)
-        {
-            logger("nanosleep error in send thread: %d\n", errno);
-        }
-
        // Set sequence number and checksum
        echo_request->sequence = htons(next_sequence);
        echo_request->cksum = 0;
@@ -364,6 +366,12 @@ send_thread(

        next_slot = (next_slot + 1) % array_size;
        next_sequence = (next_sequence + 1) % sequence_limit;
+
+        r = nanosleep(&sleeptime, NULL);
+        if (r == -1)
+        {
+            logger("nanosleep error in send thread: %d\n", errno);
+        }
    }
 }

@@ -384,6 +392,9 @@ recv_thread(
    struct timespec             now;
    unsigned int                array_slot;

+    // Thread startup complete
+    recv_ready = 1;
+
    while (1)
    {
        src_addr_len = sizeof(src_addr);
@@ -688,9 +699,14 @@ usocket_thread(

    while (1)
    {
+#if defined(DISABLE_ACCEPT4)
+        // Legacy
        sock_fd = accept(usocket_fd, NULL, NULL);
        (void) fcntl(sock_fd, F_SETFL, FD_CLOEXEC);
        (void) fcntl(sock_fd, F_SETFL, fcntl(sock_fd, F_GETFL, 0) | O_NONBLOCK);
+#else
+        sock_fd = accept4(usocket_fd, NULL, NULL, SOCK_NONBLOCK | SOCK_CLOEXEC);
+#endif

        report(&average_latency_usec, &latency_deviation, &average_loss_percent);

@@ -728,10 +744,10 @@ get_time_arg_msec(
    const char *                arg,
    unsigned long *             value)
 {
-    unsigned long               t;
+    long                        t;
    char *                      suffix;

-    t = strtoul(arg, &suffix, 10);
+    t = strtol(arg, &suffix, 10);
    if (*suffix == 'm')
    {
        // Milliseconds
@@ -744,13 +760,13 @@ get_time_arg_msec(
        suffix++;
    }

-    // Garbage in the number?
-    if (*suffix != 0)
+    // Invalid specification?
+    if (t < 0 || *suffix != 0)
    {
        return 1;
    }

-    *value = t;
+    *value = (unsigned long) t;
    return 0;
 }

@@ -763,22 +779,22 @@ get_percent_arg(
    const char *                arg,
    unsigned long *             value)
 {
-    unsigned long               t;
+    long                        t;
    char *                      suffix;

-    t = strtoul(arg, &suffix, 10);
+    t = strtol(arg, &suffix, 10);
    if (*suffix == '%')
    {
        suffix++;
    }

-    // Garbage in the number?
-    if (*suffix != 0 || t > 100)
+    // Invalid specification?
+    if (t < 0 || t > 100 || *suffix != 0)
    {
        return 1;
    }

-    *value = t;
+    *value = (unsigned long) t;
    return 0;
 }

@@ -791,10 +807,10 @@ get_length_arg(
    const char *                arg,
    unsigned long *             value)
 {
-    unsigned long               t;
+    long                        t;
    char *                      suffix;

-    t = strtoul(arg, &suffix, 10);
+    t = strtol(arg, &suffix, 10);
    if (*suffix == 'b')
    {
        // Bytes
@@ -807,13 +823,13 @@ get_length_arg(
        suffix++;
    }

-    // Garbage in the number?
-    if (*suffix != 0)
+    // Invalid specification?
+    if (t < 0 || *suffix != 0)
    {
        return 1;
    }

-    *value = t;
+    *value = (unsigned long) t;
    return 0;
 }

@@ -974,7 +990,7 @@ parse_args(

        case 'D':
            r = get_time_arg_msec(optarg, &latency_alarm_threshold_msec);
-            if (r || latency_alarm_threshold_msec == 0)
+            if (r)
            {
                fatal("invalid latency alarm threshold %s\n", optarg);
            }
@@ -983,7 +999,7 @@ parse_args(

        case 'L':
            r = get_percent_arg(optarg, &loss_alarm_threshold_percent);
-            if (r || loss_alarm_threshold_percent == 0)
+            if (r)
            {
                fatal("invalid loss alarm threshold %s\n", optarg);
            }
@@ -1130,10 +1146,13 @@ main(
    char                        *argv[])
 {
    char                        bind_str[ADDR_STR_MAX] = "(none)";
+    char                        pidbuf[64];
    int                         pidfile_fd = -1;
+    pid_t                       pid;
    pthread_t                   thread;
    struct                      sigaction act;
    int                         buflen = PACKET_BUFLEN;
+    ssize_t                     len;
    ssize_t                     rs;
    int                         r;

@@ -1180,6 +1199,66 @@ main(
    (void) setgid(getgid());
    (void) setuid(getuid());

+    // Create pid file
+    if (pidfile_name)
+    {
+        pidfile_fd = open(pidfile_name, O_WRONLY | O_CREAT | O_EXCL | O_CLOEXEC, 0644);
+        if (pidfile_fd != -1)
+        {
+            // Lock the pid file
+            r = flock(pidfile_fd, LOCK_EX | LOCK_NB);
+            if (r == -1)
+            {
+                perror("flock");
+                fatal("error locking pid file\n");
+            }
+        }
+        else
+        {
+            // Pid file already exists?
+            pidfile_fd = open(pidfile_name, O_RDWR | O_CREAT | O_CLOEXEC, 0644);
+            if (pidfile_fd == -1)
+            {
+                perror("open");
+                fatal("cannot create/open pid file %s\n", pidfile_name);
+            }
+
+            // Lock the pid file
+            r = flock(pidfile_fd, LOCK_EX | LOCK_NB);
+            if (r == -1)
+            {
+                fatal("pid file %s is in use by another process\n", pidfile_name);
+            }
+
+            // Check for existing pid
+            rs = read(pidfile_fd, pidbuf, sizeof(pidbuf) - 1);
+            if (rs > 0)
+            {
+                pidbuf[rs] = 0;
+
+                pid = (pid_t) strtol(pidbuf, NULL, 10);
+                if (pid > 0)
+                {
+                    // Is the pid still alive?
+                    r = kill(pid, 0);
+                    if (r == 0)
+                    {
+                        fatal("pid file %s is in use by process %u\n", pidfile_name, (unsigned int) pid);
+                    }
+                }
+            }
+
+            // Reset the pid file
+            (void) lseek(pidfile_fd, 0, 0);
+            r = ftruncate(pidfile_fd, 0);
+            if (r == -1)
+            {
+                perror("ftruncate");
+                fatal("cannot write pid file %s\n", pidfile_name);
+            }
+        }
+    }
+
    // Create report file
    if (report_name)
    {
@@ -1239,31 +1318,20 @@ main(
        }
    }

-    // Create pid file
-    if (pidfile_name)
-    {
-        pidfile_fd = open(pidfile_name, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0644);
-        if (pidfile_fd == -1)
-        {
-            perror("open");
-            fatal("cannot open/create pid file %s\n", pidfile_name);
-        }
-    }
-
    // End of general errors from command line options

    // Self background
    if (foreground == 0)
    {
-        r = fork();
+        pid = fork();

-        if (r == -1)
+        if (pid == -1)
        {
            perror("fork");
            fatal("cannot background\n");
        }

-        if (r)
+        if (pid)
        {
            _exit(EXIT_SUCCESS);
        }
@@ -1280,16 +1348,13 @@ main(
    // Write pid file
    if (pidfile_fd != -1)
    {
-        char                    buf[64];
-        ssize_t                 len;
-
-        len = snprintf(buf, sizeof(buf), "%u\n", (unsigned) getpid());
-        if (len < 0 || (size_t) len > sizeof(buf))
+        len = snprintf(pidbuf, sizeof(pidbuf), "%u\n", (unsigned) getpid());
+        if (len < 0 || (size_t) len > sizeof(pidbuf))
        {
            fatal("error formatting pidfile\n");
        }

-        rs = write(pidfile_fd, buf, (size_t) len);
+        rs = write(pidfile_fd, pidbuf, (size_t) len);
        if (rs == -1)
        {
            perror("write");
Author	SHA1	Message	Date
Denny Page	2b032751e5	Enhance pid file support to detect running processes	2017-09-29 16:04:13 -07:00
Denny Page	84ee15b155	Clean up loss accuracy description	2017-09-29 15:13:48 -07:00
Denny Page	e10c51ad95	Move check for zero intervals back to caller. Prior commit broke disable of report interval.	2017-09-29 00:23:16 -07:00
Denny Page	579ae3d66b	Detect (and reject) negative numbers in paramaters	2017-09-28 15:53:20 -07:00
Denny Page	64e644e7be	Don't wait for send interval before sending first echo request	2017-09-28 14:20:21 -07:00
Denny Page	a18d82ab6e	Update copyright	2017-09-28 14:00:40 -07:00
Denny Page	34b0bb924e	Use accept4()	2017-09-28 13:04:16 -07:00
dennypage	4173834bbe	Create NOTES.md	2017-09-27 13:36:52 -07:00
dennypage	2a8eaa0c8f	Merge pull request #23 from joemiller/openbsd problem: cannot build on openbsd	2017-08-22 16:57:09 -07:00
joe miller	edb883498d	problem: cannot build on openbsd solution: include socket.h before if.h since if.h relies on types defined in socket.h	2017-03-15 08:16:00 -07:00