pping: Various minor fixes

Perform various fixes and tweaks:
- Rename several defines to make them more informative
- Remove unrolling of loop in BPF programs
- Reuse defines for program sections between userspace and kernel
  space programs
- Perform fork+exec to run bpf_egress_loader script instead of
  system()
- Add comment to copied scripts indicating I've modified them
- Add pping.h and pping_helpers.h as dependencies in Makefile

Also, add a brief description of what PPing is and how it works to
README

Signed-off-by: Simon Sundberg <simon.sundberg@kau.se>
This commit is contained in:
Simon Sundberg
2021-01-26 18:34:23 +01:00
parent 71c6458712
commit 7410d5cc2c
10 changed files with 184 additions and 107 deletions

View File

@@ -6,7 +6,7 @@ BPF_TARGETS := pping_kern_xdp
BPF_TARGETS += $(TC_BPF_TARGETS)
LDFLAGS += -pthread
EXTRA_DEPS += config.mk
EXTRA_DEPS += config.mk pping.h pping_helpers.h
LIB_DIR = ../lib

View File

@@ -1,5 +1,19 @@
# PPing using XDP and TC-BPF
An implementation of the passive ping ([pping](https://github.com/pollere/pping)) utility based on XDP (for ingress) and TC-BPF (for outgress)
An implementation of the passive ping ([pping](https://github.com/pollere/pping)) utility based on XDP (for ingress) and TC-BPF (for egress)
## Simple description
Passive Ping (PPing) makes use of the TCP Timestamp option to calculate the RTT for TCP traffic passing through.
PPing can be used to measure RTTs on end hosts or any device which sees both directions of the TCP flow.
For outgoing packets, it checks for TCP timestamp TSval in the TCP header. If it finds one it creates a timestamp
for when it saw that TSval in a particular flow. On incoming packets it parses the TCP timestamp TSecr (which
is the TSval echoed by the receiving host) and checks whether it has seen any previous outgoing packets with that TCP
timestamp. If it has, an RTT is calculated as the difference in time between when it saw an outgoing packet
with a TSval, and when it received an incoming packet from the reverse flow with a matching TSecr.
Note that TCP timestamps may not be unique for every packet in a flow, therefore it only matches the first
outgoing packet with a particular TSval with the first incoming packet with a matching TSecr. Duplicate
TSval/TSecr are ignored.
## Planned design
![Design of eBPF pping](./eBPF_pping_design.png)

View File

@@ -5,7 +5,7 @@
- [x] Add SPDX-license-identifier tags
- [x] Format C-code in kernel style
- [x] Use existing functionality to reuse maps by using BTF-defined maps
- [ ] Use BTF-defined maps for TC-BPF as well if iproute has libbpf support
- [x] Use BTF-defined maps for TC-BPF as well if iproute has libbpf support
## Future
- [ ] Use libxdp to load XDP program

View File

@@ -3,8 +3,8 @@
# Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
# License: GPLv2
#
# Extended by Simon Sundberg <simon.sundberg@kau.se> to add support
# of optional section (--sec) option
# Modified by Simon Sundberg <simon.sundberg@kau.se> to add support
# of optional section (--sec) option and changed default BPF_OBJ
#
basedir=`dirname $0`
source ${basedir}/functions.sh

View File

@@ -4,6 +4,10 @@
#
# Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
# License: GPLv2
#
# Modified by Simon Sundberg <simon.sundberg@kau.se> to add support
# of optional section (--sec) option
#
function usage() {
echo ""

View File

@@ -3,7 +3,6 @@
#include <bpf/libbpf.h>
#include <linux/if_link.h>
#include <net/if.h> // For if_nametoindex
//#include <linux/err.h> // For IS_ERR_OR_NULL macro // use libbpf_get_error instead
#include <arpa/inet.h> // For inet_ntoa and ntohs
#include <stdio.h>
@@ -14,28 +13,31 @@
#include <stdbool.h>
#include <signal.h> // For detecting Ctrl-C
#include <sys/resource.h> // For setting rlmit
#include <sys/wait.h>
#include <time.h>
#include <pthread.h>
#include "pping.h" //key and value structs for the ts_start map
#define BILLION 1000000000UL
#define MILLION 1000000UL
#define NS_PER_SECOND 1000000000UL
#define NS_PER_MS 1000000UL
#define TCBPF_LOADER_SCRIPT "./bpf_egress_loader.sh"
#define PINNED_DIR "/sys/fs/bpf/tc/globals"
#define PPING_XDP_OBJ "pping_kern_xdp.o"
#define XDP_PROG_SEC "xdp"
#define PPING_TCBPF_OBJ "pping_kern_tc.o"
#define TCBPF_PROG_SEC "pping_egress"
#define XDP_FLAGS XDP_FLAGS_UPDATE_IF_NOEXIST
#define MAP_NAME "ts_start"
#define MAP_CLEANUP_INTERVAL 1 * BILLION // Clean timestamp map once per second
#define PERF_BUFFER_NAME "rtt_events"
#define TS_MAP "ts_start"
#define MAP_CLEANUP_INTERVAL 1 * NS_PER_SECOND // Clean timestamp map once per second
#define TIMESTAMP_LIFETIME 10 * NS_PER_SECOND // Clear out entries from ts_start if they're over 10 seconds
#define PERF_BUFFER "rtt_events"
#define PERF_BUFFER_PAGES 64 // Related to the perf-buffer size?
#define PERF_POLL_TIMEOUT_MS 100
#define RMEMLIM 512UL << 20 /* 512 MBs */
#define MAX_COMMAND_LEN 1024
#define MAX_PATH_LEN 1024
#define TIMESTAMP_LIFETIME 10 * BILLION // Clear out entries from ts_start if they're over 10 seconds
/*
* BPF implementation of pping using libbpf
@@ -50,6 +52,7 @@
* (together with the related flow) and printed out
*/
// Structure to contain arguments for clean_map (for passing to pthread_create)
struct map_cleanup_args {
int map_fd;
__u64 max_age_ns;
@@ -97,6 +100,7 @@ static int xdp_attach(struct bpf_object *obj, const char *sec, int ifindex,
prog = bpf_object__find_program_by_title(obj, sec);
else
prog = bpf_program__next(NULL, obj);
prog_fd = bpf_program__fd(prog);
if (prog_fd < 0) {
fprintf(stderr, "Could not find program to attach\n");
@@ -105,6 +109,7 @@ static int xdp_attach(struct bpf_object *obj, const char *sec, int ifindex,
if (force) // detach current (if any) xdp-program first
xdp_detach(ifindex, xdp_flags);
err = bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags);
if (err < 0) {
fprintf(stderr, "Failed loading xdp-program on interface %d\n",
@@ -114,21 +119,76 @@ static int xdp_attach(struct bpf_object *obj, const char *sec, int ifindex,
return 0;
}
/*
 * Runs TCBPF_LOADER_SCRIPT in a child process, passing it
 * "--dev <interface> --obj <bpf_object> --sec <section>".
 *
 * Returns the exit status of the script (0 on success) if it exited
 * normally, -1 if it terminated abnormally (e.g. killed by a signal), and a
 * negative errno value if fork() or waitpid() failed.
 */
static int tc_bpf_load(const char *bpf_object, const char *section,
		       const char *interface)
{
	int status;
	pid_t pid = fork();

	if (pid < 0)
		return -errno;

	if (pid == 0) {
		// Child: replace the process image with the loader script.
		// The argument list of execl must end with a null *pointer*.
		execl(TCBPF_LOADER_SCRIPT, TCBPF_LOADER_SCRIPT, "--dev",
		      interface, "--obj", bpf_object, "--sec", section,
		      (char *)NULL);

		// execl only returns on failure. The child must terminate
		// here; returning would let a second copy of the program
		// continue executing the caller's code (including cleanup).
		fprintf(stderr, "Could not execute %s: %s\n",
			TCBPF_LOADER_SCRIPT, strerror(errno));
		_exit(127); // Shell convention for "command could not be run"
	}

	// Parent: wait for the script, retrying if interrupted by a signal
	while (waitpid(pid, &status, 0) < 0) {
		if (errno != EINTR)
			return -errno;
	}

	return WIFEXITED(status) ? WEXITSTATUS(status) : -1;
}
/*
 * Runs TCBPF_LOADER_SCRIPT in a child process, passing it
 * "--dev <interface> --remove".
 *
 * Returns the exit status of the script (0 on success) if it exited
 * normally, -1 if it terminated abnormally (e.g. killed by a signal), and a
 * negative errno value if fork() or waitpid() failed.
 */
static int tc_bpf_clear(const char *interface)
{
	int status;
	pid_t pid = fork();

	if (pid < 0)
		return -errno;

	if (pid == 0) {
		// Child: replace the process image with the loader script.
		// The argument list of execl must end with a null *pointer*.
		execl(TCBPF_LOADER_SCRIPT, TCBPF_LOADER_SCRIPT, "--dev",
		      interface, "--remove", (char *)NULL);

		// execl only returns on failure. The child must terminate
		// here; returning would let a second copy of the program
		// continue executing the caller's code (including cleanup).
		fprintf(stderr, "Could not execute %s: %s\n",
			TCBPF_LOADER_SCRIPT, strerror(errno));
		_exit(127); // Shell convention for "command could not be run"
	}

	// Parent: wait for the script, retrying if interrupted by a signal
	while (waitpid(pid, &status, 0) < 0) {
		if (errno != EINTR)
			return -errno;
	}

	return WIFEXITED(status) ? WEXITSTATUS(status) : -1;
}
/*
 * Returns the current time of the clock identified by clockid as
 * nanoseconds in a single __u64 (seconds and nanoseconds of the timespec
 * are combined into one value).
 * On failure, the value 0 is returned (and errno will be set).
 */
static __u64 get_time_ns(clockid_t clockid)
{
struct timespec t;
if (clock_gettime(clockid, &t) != 0) // CLOCK_BOOTTIME if using bpf_get_ktime_boot_ns
return 0;
// NOTE(review): two consecutive return statements follow, differing only in
// the name of the nanoseconds-per-second constant; as written only the first
// is reachable. Presumably the old and new line of a rename - confirm.
return (__u64)t.tv_sec * BILLION + (__u64)t.tv_nsec;
return (__u64)t.tv_sec * NS_PER_SECOND + (__u64)t.tv_nsec;
}
static int remove_old_entries_from_map(int map_fd, __u64 max_age)
static int clean_map(int map_fd, __u64 max_age)
{
int removed = 0, entries = 0;
int removed = 0;
struct ts_key key, prev_key = { 0 };
struct ts_timestamp value;
bool delete_prev = false;
__u64 now_nsec = get_time_ns(CLOCK_MONOTONIC);
int entries = 0; // Just for debug
__u64 duration; // Just for debug
if (now_nsec == 0)
return -errno;
@@ -153,9 +213,9 @@ static int remove_old_entries_from_map(int map_fd, __u64 max_age)
bpf_map_delete_elem(map_fd, &prev_key);
removed++;
}
__u64 duration = get_time_ns(CLOCK_MONOTONIC) - now_nsec;
duration = get_time_ns(CLOCK_MONOTONIC) - now_nsec;
printf("Gone through %d entries and removed %d of them in %llu.%09llu s\n",
entries, removed, duration / BILLION, duration % BILLION);
entries, removed, duration / NS_PER_SECOND, duration % NS_PER_SECOND);
return removed;
}
@@ -163,10 +223,11 @@ static void *periodic_map_cleanup(void *args)
{
struct map_cleanup_args *argp = args;
struct timespec interval;
interval.tv_sec = MAP_CLEANUP_INTERVAL / BILLION;
interval.tv_nsec = MAP_CLEANUP_INTERVAL % BILLION;
interval.tv_sec = MAP_CLEANUP_INTERVAL / NS_PER_SECOND;
interval.tv_nsec = MAP_CLEANUP_INTERVAL % NS_PER_SECOND;
while (keep_running) {
remove_old_entries_from_map(argp->map_fd, argp->max_age_ns);
clean_map(argp->map_fd, argp->max_age_ns);
nanosleep(&interval, NULL);
}
pthread_exit(NULL);
@@ -178,9 +239,10 @@ static void handle_rtt_event(void *ctx, int cpu, void *data, __u32 data_size)
struct in_addr saddr, daddr;
saddr.s_addr = e->flow.saddr;
daddr.s_addr = e->flow.daddr;
// inet_ntoa is deprecated, will switch to inet_ntop when adding IPv6 support
printf("%llu.%06llu ms %s:%d+", e->rtt / MILLION,
e->rtt % MILLION, inet_ntoa(daddr), ntohs(e->flow.dport));
printf("%llu.%06llu ms %s:%d+", e->rtt / NS_PER_MS,
e->rtt % NS_PER_MS, inet_ntoa(daddr), ntohs(e->flow.dport));
printf("%s:%d\n", inet_ntoa(saddr), ntohs(e->flow.sport));
}
@@ -191,17 +253,10 @@ static void handle_missed_rtt_event(void *ctx, int cpu, __u64 lost_cnt)
int main(int argc, char *argv[])
{
if (argc < 2) {
printf("Usage: ./pping_user <dev>\n");
return EXIT_FAILURE;
}
int err = 0;
int ifindex = 0;
bool xdp_attached = false;
bool tc_attached = false;
char tc_cmd[MAX_COMMAND_LEN];
char map_path[MAX_PATH_LEN];
struct bpf_object *obj = NULL;
@@ -213,12 +268,17 @@ int main(int argc, char *argv[])
struct perf_buffer *pb = NULL;
struct perf_buffer_opts pb_opts;
// TODO - better argument parsing (more relevant as featureas are added)
if (argc < 2) {
printf("Usage: ./pping_user <dev>\n");
return EXIT_FAILURE;
}
// Increase rlimit
err = set_rlimit(RMEMLIM);
err = set_rlimit(RLIM_INFINITY);
if (err) {
fprintf(stderr, "Could not set rlimit to %ld bytes: %s\n",
RMEMLIM, strerror(-err));
fprintf(stderr, "Could not set rlimit to infinity: %s\n",
strerror(-err));
goto cleanup;
}
@@ -240,15 +300,14 @@ int main(int argc, char *argv[])
}
// Get map here to allow for unpinning at cleanup
map = bpf_object__find_map_by_name(obj, MAP_NAME);
map = bpf_object__find_map_by_name(obj, TS_MAP);
err = libbpf_get_error(map);
if (err) {
fprintf(stderr, "Could not find map %s in %s: %s\n",
MAP_NAME, PPING_XDP_OBJ, strerror(err));
TS_MAP, PPING_XDP_OBJ, strerror(err));
map = NULL;
}
err = bpf_object__load(obj);
if (err) {
fprintf(stderr, "Failed loading XDP program: %s\n",
@@ -264,27 +323,26 @@ int main(int argc, char *argv[])
}
xdp_attached = true;
//Load tc-bpf section on interface egress
snprintf(tc_cmd, MAX_COMMAND_LEN, "%s --dev %s --obj %s --sec %s",
TCBPF_LOADER_SCRIPT, argv[1], PPING_TCBPF_OBJ, TCBPF_PROG_SEC);
err = system(tc_cmd);
// Load tc-bpf section on interface egress
err = tc_bpf_load(PPING_TCBPF_OBJ, TCBPF_PROG_SEC, argv[1]);
if (err) {
fprintf(stderr,
"Could not load section %s of %s on interface %s: %s\n",
TCBPF_PROG_SEC, PPING_TCBPF_OBJ, argv[1],
strerror(err));
strerror(-err));
goto cleanup;
}
tc_attached = true;
// Set up the periodical map cleaning
clean_args.max_age_ns = TIMESTAMP_LIFETIME;
clean_args.map_fd = bpf_map__fd(map);
if (clean_args.map_fd < 0) {
fprintf(stderr, "Could not get file descriptor of map %s in object %s: %s\n",
MAP_NAME, PPING_XDP_OBJ, strerror(-clean_args.map_fd));
TS_MAP, PPING_XDP_OBJ, strerror(-clean_args.map_fd));
goto cleanup;
}
clean_args.max_age_ns = TIMESTAMP_LIFETIME;
err = pthread_create(&tid, NULL, periodic_map_cleanup, &clean_args);
if (err) {
fprintf(stderr,
@@ -298,13 +356,13 @@ int main(int argc, char *argv[])
pb_opts.lost_cb = handle_missed_rtt_event;
pb = perf_buffer__new(bpf_object__find_map_fd_by_name(obj,
PERF_BUFFER_NAME),
PERF_BUFFER),
PERF_BUFFER_PAGES, &pb_opts);
err = libbpf_get_error(pb);
if (err) {
pb = NULL;
fprintf(stderr, "Failed to open perf buffer %s: %s\n",
PERF_BUFFER_NAME, strerror(err));
PERF_BUFFER, strerror(err));
goto cleanup;
}
@@ -326,7 +384,7 @@ cleanup:
perf_buffer__free(pb);
if (map && bpf_map__is_pinned(map)) {
snprintf(map_path, sizeof(map_path), "%s/%s",
PINNED_DIR, MAP_NAME);
PINNED_DIR, TS_MAP);
err = bpf_map__unpin(map, map_path);
if (err) {
fprintf(stderr,
@@ -343,13 +401,11 @@ cleanup:
}
}
if (tc_attached) {
snprintf(tc_cmd, MAX_COMMAND_LEN, "%s --dev %s --remove",
TCBPF_LOADER_SCRIPT, argv[1]);
err = system(tc_cmd);
err = tc_bpf_clear(argv[1]); //system(tc_cmd);
if (err) {
fprintf(stderr,
"Failed removing tc-bpf program from interface %s: %s\n",
argv[1], strerror(err));
argv[1], strerror(-err));
}
}

View File

@@ -1,8 +1,13 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef TIMESTAMP_MAP_H
#define TIMESTAMP_MAP_H
#ifndef PPING_H
#define PPING_H
#include <linux/types.h>
#define XDP_PROG_SEC "xdp"
#define TCBPF_PROG_SEC "pping_egress"
// TODO - change to support both IPv4 and IPv6 (IPv4 addresses can be mapped to IPv6 addresses)
struct ipv4_flow {
__u32 saddr;
__u32 daddr;
@@ -17,9 +22,7 @@ struct ts_key {
struct ts_timestamp {
__u64 timestamp;
//__u64 ttl; // Delete entry after ttl, allows more dynamic map cleaning where entries for flows with short RTTs can be removed earlier
__u8 used;
// __u8 pad[7]; // Need to pad it due to compiler optimization, see "Remove struct padding with aligning members by using #pragma pack." at https://docs.cilium.io/en/v1.9/bpf/
};
struct rtt_event {

View File

@@ -3,6 +3,8 @@
#define PPING_HELPERS_H
#include "pping.h"
#include <linux/tcp.h>
#define MAX_TCP_OPTIONS 10
static __always_inline int fill_ipv4_flow(struct ipv4_flow *flow, __u32 saddr,
@@ -14,53 +16,57 @@ static __always_inline int fill_ipv4_flow(struct ipv4_flow *flow, __u32 saddr,
flow->dport = dport;
return 0;
}
/*
 * Parses the TSval and TSecr values from the TCP options field. If successful
 * the TSval and TSecr values will be stored at tsval and tsecr (in network
 * Parses the TSval and TSecr values from the TCP options field. If successful
 * the TSval and TSecr values will be stored at tsval and tsecr (in network
 * byte order).
 * Returns 0 if successful and -1 on failure
 */
static __always_inline int parse_tcp_ts(struct tcphdr *tcph, void *data_end,
__u32 *tsval, __u32 *tsecr)
__u32 *tsval, __u32 *tsecr)
{
if (tcph + 1 > data_end)
return -1;
int len = tcph->doff << 2;
if (len <= sizeof(struct tcphdr)) // No TCP options
return -1;
void *pos = (void *)(tcph + 1);
void *opt_end = ((void *)tcph + len);
int len = tcph->doff << 2;
void *opt_end = (void *)tcph + len;
__u8 *pos = (__u8 *)(tcph + 1); //Current pos in TCP options
__u8 i, opt, opt_size;
#pragma unroll
for (i = 0; i < MAX_TCP_OPTIONS; i++) {
if (pos + 1 > opt_end || pos + 1 > data_end)
return -1;
opt = *(__u8 *)pos; // Save value to avoid future data_end comparisons
if (opt == 0) // Reached end of TCP options
return -1;
if (opt == 1) { // TCP NOP option - advance one byte
pos++;
continue;
}
// Option > 1, should have option size
if (pos + 2 > opt_end || pos + 2 > data_end)
return -1;
opt_size = *(__u8 *)(pos + 1); // Save value to avoid future data_end comparisons
// Option-kind is TCP timestap (yey!)
if (opt == 8 && opt_size == 10) {
if (pos + opt_size > opt_end ||
pos + opt_size > data_end)
return -1;
*tsval = *(__u32 *)(pos + 2);
*tsecr = *(__u32 *)(pos + 6);
return 0;
}
if (tcph + 1 > data_end || len <= sizeof(struct tcphdr))
return -1;
// Some other TCP option - advance option-length bytes
pos += opt_size;
}
return -1;
for (i = 0; i < MAX_TCP_OPTIONS; i++) {
if (pos + 1 > opt_end || pos + 1 > data_end)
return -1;
opt = *pos;
if (opt == 0) // Reached end of TCP options
return -1;
if (opt == 1) { // TCP NOP option - advance one byte
pos++;
continue;
}
// Option > 1, should have option size
if (pos + 2 > opt_end || pos + 2 > data_end)
return -1;
opt_size = *(pos + 1);
// Option-kind is TCP timestap (yey!)
if (opt == 8 && opt_size == 10) {
if (pos + opt_size > opt_end ||
pos + opt_size > data_end)
return -1;
*tsval = *(__u32 *)(pos + 2);
*tsecr = *(__u32 *)(pos + 6);
return 0;
}
// Some other TCP option - advance option-length bytes
pos += opt_size;
}
return -1;
}
#endif

View File

@@ -36,14 +36,12 @@ struct bpf_elf_map SEC("maps") ts_start = {
#endif
// TC-BPF program for parsing TSval from egress traffic and adding it to the map
SEC("pping_egress")
SEC(TCBPF_PROG_SEC)
int tc_bpf_prog_egress(struct __sk_buff *skb)
{
void *data = (void *)(long)skb->data;
void *data_end = (void *)(long)skb->data_end;
//bpf_printk("Sent packet of size %d bytes\n", data_end - data);
int proto = -1;
struct hdr_cursor nh = { .pos = data };
struct ethhdr *eth;
@@ -60,13 +58,11 @@ int tc_bpf_prog_egress(struct __sk_buff *skb)
if (proto < 0)
goto end;
//bpf_printk("TCP-packet with %d byte header and %lu bytes of data\n", proto, data_end - nh.pos);
__u32 tsval, tsecr;
if (parse_tcp_ts(tcph, data_end, &tsval, &tsecr) < 0)
goto end;
// We have a TCP timestamp, try adding it to the map
//bpf_printk("TCP-packet with timestap. TSval: %u, TSecr: %u\n", bpf_ntohl(tsval), bpf_ntohl(tsecr));
struct ts_key key;
fill_ipv4_flow(&(key.flow), iph->saddr, iph->daddr,
tcph->source, tcph->dest);

View File

@@ -30,18 +30,18 @@ struct {
} rtt_events SEC(".maps");
// XDP program for parsing TSECR-val from ingress traffic and check for match in map
SEC("xdp")
SEC(XDP_PROG_SEC)
int xdp_prog_ingress(struct xdp_md *ctx)
{
void *data = (void *)(long)ctx->data;
void *data_end = (void *)(long)ctx->data_end;
int proto = -1;
struct hdr_cursor nh = { .pos = data };
struct ethhdr *eth;
struct iphdr *iph;
struct tcphdr *tcph;
//bpf_printk("Received packet of length %d\n", (int)(data_end - data));
proto = parse_ethhdr(&nh, data_end, &eth);
if (bpf_ntohs(proto) != ETH_P_IP)
goto end;
@@ -52,20 +52,18 @@ int xdp_prog_ingress(struct xdp_md *ctx)
if (proto < 0)
goto end;
//bpf_printk("TCP-packet with %d byte header and %lu bytes of data\n", proto, data_end - nh.pos);
__u32 tsval, tsecr;
if (parse_tcp_ts(tcph, data_end, &tsval, &tsecr) < 0)
goto end;
// We have a TCP-timestamp - now we can check if it's in the map
//bpf_printk("TCP-packet with timestap. TSval: %u, TSecr: %u\n", bpf_ntohl(tsval), bpf_ntohl(tsecr));
struct ts_key key;
// Fill in reverse order of egress (dest <--> source)
fill_ipv4_flow(&(key.flow), iph->daddr, iph->saddr,
tcph->dest, tcph->source);
key.tsval = tsecr;
struct ts_timestamp *ts = bpf_map_lookup_elem(&ts_start, &key);
// Only calculate RTT for first packet with matching TSecr
if (ts && ts->used == 0) {
/*
@@ -81,8 +79,8 @@ int xdp_prog_ingress(struct xdp_md *ctx)
event.rtt = bpf_ktime_get_ns() - ts->timestamp;
bpf_perf_event_output(ctx, &rtt_events, BPF_F_CURRENT_CPU,
&event, sizeof(event));
//bpf_printk("Pushed rtt event with RTT: %llu\n", event.rtt);
}
end:
return XDP_PASS;
}