pping: Add timestamp and min-RTT to output

To add a timestamp to the output, push the time at which the packet was
processed in the kernel as part of the RTT-event. Also keep track of the
minimum encountered RTT for each flow in the kernel, and push that as
part of the RTT-event as well.

Additionally, avoid pushing RTT messages at all if no flow-state
information can be found (e.g. because it has been deleted from the
egress side), as no valid min-RTT can then be given. Furthermore, no
longer delete flow-information upon seeing the FIN-flag on egress, in
order to keep useful flow-state around for RTT-messages longer. Due to
the FIN-handshake process, it is sufficient if the ingress program
deletes the flow-state upon seeing FIN. However, still delete flow-state
from either ingress or egress upon seeing the RST flag, as RST does not
have a handshake process allowing for delayed deletion.

While the minimum RTT could also be tracked from the userspace process,
userspace is not aware of when a flow is closed, so it would need
additional logic to keep track of the minimum RTT for each flow and to
periodically clean up that per-flow state. Furthermore, keeping RTT
statistics in the flow-state map is useful for implementing future
features, such as an RTT-based sampling interval. It would also be
useful in case pping is changed to no longer have a long-running
userspace process printing out all the calculated RTTs, but instead
simply looks up the RTT from the flow-state map occasionally.

Signed-off-by: Simon Sundberg <simon.sundberg@kau.se>
This commit is contained in:
Simon Sundberg
2021-04-29 18:55:06 +02:00
parent b0536ce4ec
commit b4a810b09b
4 changed files with 65 additions and 21 deletions

View File

@@ -25,7 +25,7 @@
- [ ] Could potentially include keeping track of average RTT, which
may be useful for some decisions (ex. how often to sample,
when entry can be removed etc)
- [ ] Could potentially include keeping track of minimum RTT (as
- [x] Could potentially include keeping track of minimum RTT (as
done by the original pping), ex. to track bufferbloat
- [ ] Could potentially include keeping track of if flow is
bi-directional
@@ -42,7 +42,6 @@
- It may be a good idea to keep the same format as original pping,
so that tools such as [ppviz](https://github.com/pollere/ppviz)
works for both pping implementations.
- [ ] Add timestamps to output (as original pping)
- [ ] Add support for other hooks
- Ex TC-BPF on ingress instead of XDP?
@@ -59,3 +58,4 @@
- [x] Add IPv6 support
- [x] Refactor to support easy addition of other protocols
- [x] Load tc-bpf program with libbpf (only attach it with tc)
- [x] Add timestamps to output (as original pping)

View File

@@ -40,6 +40,9 @@ static const char *__doc__ =
#define MAX_PATH_LEN 1024
#define MON_TO_REAL_UPDATE_FREQ \
(1 * NS_PER_SECOND) // Update offset between CLOCK_MONOTONIC and CLOCK_REALTIME once per second
/*
* BPF implementation of pping using libbpf
* Uses TC-BPF for egress and XDP for ingress
@@ -336,10 +339,10 @@ static int tc_bpf_clear(char *interface)
* Returns the time of the clock given by clockid as nanoseconds in a single __u64.
* On failure, the value 0 is returned (and errno will be set).
*/
static __u64 get_time_ns(void)
static __u64 get_time_ns(clockid_t clockid)
{
struct timespec t;
if (clock_gettime(CLOCK_MONOTONIC, &t) != 0)
if (clock_gettime(clockid, &t) != 0)
return 0;
return (__u64)t.tv_sec * NS_PER_SECOND + (__u64)t.tv_nsec;
@@ -374,7 +377,7 @@ static int clean_map(int map_fd, size_t key_size, size_t value_size,
int removed = 0;
void *key, *prev_key, *value;
bool delete_prev = false;
__u64 now_nsec = get_time_ns();
__u64 now_nsec = get_time_ns(CLOCK_MONOTONIC);
#ifdef DEBUG
int entries = 0;
@@ -412,7 +415,7 @@ static int clean_map(int map_fd, size_t key_size, size_t value_size,
removed++;
}
#ifdef DEBUG
duration = get_time_ns() - now_nsec;
duration = get_time_ns(CLOCK_MONOTONIC) - now_nsec;
printf("%d: Gone through %d entries and removed %d of them in %llu.%09llu s\n",
map_fd, entries, removed, duration / NS_PER_SECOND,
duration % NS_PER_SECOND);
@@ -441,6 +444,28 @@ static void *periodic_map_cleanup(void *args)
pthread_exit(NULL);
}
/*
 * Converts a CLOCK_MONOTONIC timestamp (in ns) to the corresponding
 * CLOCK_REALTIME timestamp (in ns) by adding a cached offset between the
 * two clocks.
 *
 * The offset is (re)computed when no offset has been stored yet, or when
 * more than MON_TO_REAL_UPDATE_FREQ ns of monotonic time have passed since
 * it was last stored. The cache lives in function-local static variables.
 *
 * Returns 0 if a refresh was attempted and the realtime reading was
 * smaller than the monotonic reading (the cached offset is then left
 * untouched).
 */
static __u64 convert_monotonic_to_realtime(__u64 monotonic_time)
{
	static __u64 cached_offset = 0;
	static __u64 cached_at = 0;
	__u64 mono_now, real_now;

	mono_now = get_time_ns(CLOCK_MONOTONIC);

	/* Fast path: an offset is cached and it is still fresh enough */
	if (cached_offset != 0 &&
	    (mono_now <= cached_at ||
	     mono_now - cached_at <= MON_TO_REAL_UPDATE_FREQ))
		return monotonic_time + cached_offset;

	/* Sample both clocks back to back so the two readings are close */
	mono_now = get_time_ns(CLOCK_MONOTONIC);
	real_now = get_time_ns(CLOCK_REALTIME);
	if (real_now < mono_now)
		return 0;

	cached_offset = real_now - mono_now;
	cached_at = mono_now;

	return monotonic_time + cached_offset;
}
/*
* Wrapper around inet_ntop designed to handle the "bug" that mapped IPv4
* addresses are formatted as IPv6 addresses for AF_INET6
@@ -461,13 +486,18 @@ static void handle_rtt_event(void *ctx, int cpu, void *data, __u32 data_size)
const struct rtt_event *e = data;
char saddr[INET6_ADDRSTRLEN];
char daddr[INET6_ADDRSTRLEN];
char timestr[9];
__u64 ts = convert_monotonic_to_realtime(e->timestamp);
time_t ts_s = ts / NS_PER_SECOND;
format_ip_address(e->flow.ipv, &e->flow.saddr.ip, saddr, sizeof(saddr));
format_ip_address(e->flow.ipv, &e->flow.daddr.ip, daddr, sizeof(daddr));
strftime(timestr, sizeof(timestr), "%H:%M:%S", localtime(&ts_s));
printf("%llu.%06llu ms %s:%d+%s:%d\n", e->rtt / NS_PER_MS,
e->rtt % NS_PER_MS, saddr, ntohs(e->flow.saddr.port), daddr,
ntohs(e->flow.daddr.port));
printf("%s.%09llu %llu.%06llu ms %llu.%06llu ms %s:%d+%s:%d\n", timestr,
ts % NS_PER_SECOND, e->rtt / NS_PER_MS, e->rtt % NS_PER_MS,
e->min_rtt / NS_PER_MS, e->min_rtt % NS_PER_MS, saddr,
ntohs(e->flow.saddr.port), daddr, ntohs(e->flow.daddr.port));
}
static void handle_missed_rtt_event(void *ctx, int cpu, __u64 lost_cnt)

View File

@@ -39,6 +39,7 @@ struct network_tuple {
};
struct flow_state {
__u64 min_rtt;
__u64 last_timestamp;
__u32 last_id;
__u32 reserved;
@@ -51,6 +52,8 @@ struct packet_id {
struct rtt_event {
__u64 rtt;
__u64 min_rtt;
__u64 timestamp;
struct network_tuple flow;
__u32 reserved;
};

View File

@@ -147,11 +147,7 @@ static int parse_tcp_identifier(struct parsing_context *ctx, __be16 *sport,
return -1;
// Check if connection is closing
if (tcph->fin || tcph->rst) {
*flow_closing = true;
/* bpf_printk("Detected connection closing on %d\n", */
/* ctx->is_egress); //Upsets verifier? */
}
*flow_closing = tcph->rst || (!ctx->is_egress && tcph->fin);
// Do not timestamp pure ACKs
if (ctx->is_egress && ctx->nh.pos - ctx->data >= ctx->pkt_len &&
@@ -324,6 +320,7 @@ int pping_ingress(struct xdp_md *ctx)
struct packet_id p_id = { 0 };
__u64 *p_ts;
struct rtt_event event = { 0 };
struct flow_state *f_state;
struct parsing_context pctx = {
.data = (void *)(long)ctx->data,
.data_end = (void *)(long)ctx->data_end,
@@ -332,19 +329,18 @@ int pping_ingress(struct xdp_md *ctx)
.is_egress = false,
};
bool flow_closing = false;
__u64 now;
if (parse_packet_identifier(&pctx, &p_id, &flow_closing) < 0)
goto out;
// Delete flow, but allow final attempt at RTT calculation
if (flow_closing)
bpf_map_delete_elem(&flow_state, &p_id.flow);
now = bpf_ktime_get_ns();
p_ts = bpf_map_lookup_elem(&packet_ts, &p_id);
if (!p_ts)
goto out;
if (!p_ts || now < *p_ts)
goto validflow_out;
event.rtt = bpf_ktime_get_ns() - *p_ts;
event.rtt = now - *p_ts;
event.timestamp = now;
/*
* Attempt to delete timestamp entry as soon as RTT is calculated.
* But could have potential concurrency issue where multiple packets
@@ -352,10 +348,25 @@ int pping_ingress(struct xdp_md *ctx)
*/
bpf_map_delete_elem(&packet_ts, &p_id);
// Update flow's min-RTT, may have concurrency issues
f_state = bpf_map_lookup_elem(&flow_state, &p_id.flow);
if (!f_state)
goto validflow_out;
if (f_state->min_rtt == 0 || event.rtt < f_state->min_rtt)
f_state->min_rtt = event.rtt;
event.min_rtt = f_state->min_rtt;
__builtin_memcpy(&event.flow, &p_id.flow, sizeof(struct network_tuple));
bpf_perf_event_output(ctx, &rtt_events, BPF_F_CURRENT_CPU, &event,
sizeof(event));
validflow_out:
// Wait with deleting flow until having pushed final RTT message
if (flow_closing)
bpf_map_delete_elem(&flow_state, &p_id.flow);
out:
return XDP_PASS;
}