diff --git a/pping/pping.c b/pping/pping.c index 0df6d3b..8745104 100644 --- a/pping/pping.c +++ b/pping/pping.c @@ -188,7 +188,7 @@ static int clean_map(int map_fd, __u64 max_age) { int removed = 0; struct packet_id key, prev_key = { 0 }; - struct packet_timestamp value; + __u64 value; bool delete_prev = false; __u64 now_nsec = get_time_ns(); @@ -207,8 +207,8 @@ static int clean_map(int map_fd, __u64 max_age) } if (bpf_map_lookup_elem(map_fd, &key, &value) == 0) { - if (now_nsec > value.timestamp && - now_nsec - value.timestamp > max_age) { + if (now_nsec > value && + now_nsec - value > max_age) { delete_prev = true; } } diff --git a/pping/pping.h b/pping/pping.h index 755bc11..b5edb0e 100644 --- a/pping/pping.h +++ b/pping/pping.h @@ -34,17 +34,17 @@ struct network_tuple { __u8 reserved; }; +struct flow_state { + __u64 last_timestamp; + __u32 last_id; + __u32 reserved; +}; + struct packet_id { struct network_tuple flow; __u32 identifier; //tsval for TCP packets }; -struct packet_timestamp { - __u64 timestamp; - __u8 used; - __u8 reserved[7]; -}; - struct rtt_event { __u64 rtt; struct network_tuple flow; diff --git a/pping/pping_kern_tc.c b/pping/pping_kern_tc.c index c750ea3..a72b675 100644 --- a/pping/pping_kern_tc.c +++ b/pping/pping_kern_tc.c @@ -6,25 +6,46 @@ #include "pping.h" #include "pping_helpers.h" +#define RATE_LIMIT \ + 100000000UL // 100ms. Temporary solution, should be set by userspace +#define BURST_DURATION \ + 4000000 // 4ms, duration for when it may burst packet timestamps +#define BURST_SIZE \ + 3 // Number of packets it may create timestamps for in a burst + char _license[] SEC("license") = "GPL"; #ifdef HAVE_TC_LIBBPF /* detected by configure script in config.mk */ struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(key_size, sizeof(struct packet_id)); - __uint(value_size, sizeof(struct packet_timestamp)); + __uint(value_size, sizeof(__u64)); __uint(max_entries, 16384); __uint(pinning, LIBBPF_PIN_BY_NAME); } ts_start SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(struct network_tuple)); + __uint(value_size, sizeof(struct flow_state)); + __uint(max_entries, 16384); +} flow_state SEC(".maps"); + #else struct bpf_elf_map SEC("maps") ts_start = { .type = BPF_MAP_TYPE_HASH, .size_key = sizeof(struct packet_id), - .size_value = sizeof(struct packet_timestamp), + .size_value = sizeof(__u64), .max_elem = 16384, .pinning = PIN_GLOBAL_NS, }; + +struct bpf_elf_map SEC("maps") flow_state = { + .type = BPF_MAP_TYPE_HASH, + .size_key = sizeof(struct network_tuple), + .size_value = sizeof(struct flow_state), + .max_elem = 16384, +}; #endif // TC-BFP for parsing packet identifier from egress traffic and add to map @@ -32,18 +53,72 @@ SEC(TCBPF_PROG_SEC) int tc_bpf_prog_egress(struct __sk_buff *skb) { struct packet_id p_id = { 0 }; - struct packet_timestamp p_ts = { 0 }; + __u64 p_ts; struct parsing_context pctx = { .data = (void *)(long)skb->data, .data_end = (void *)(long)skb->data_end, .pkt_len = skb->len, .nh = { .pos = pctx.data }, }; + struct flow_state *f_state; + struct flow_state new_state = { 0 }; // Rarely-ish used, but can't really dynamically allocate memory or? if (parse_packet_identifier(&pctx, true, &p_id) < 0) goto end; - p_ts.timestamp = bpf_ktime_get_ns(); // or bpf_ktime_get_boot_ns + // Check flow state + f_state = bpf_map_lookup_elem(&flow_state, &p_id.flow); + if (!f_state) { // No previous state - attempt to create it + bpf_map_update_elem(&flow_state, &p_id.flow, &new_state, + BPF_NOEXIST); + f_state = bpf_map_lookup_elem(&flow_state, &p_id.flow); + if (!f_state) + goto end; + } + + // Check if identfier is new + /* The gap between checking and updating last_id may cause concurrency + * issues where multiple packets may simultaneously think they are the + * first with a new identifier. As long as all of the identifiers are + * the same though, only one should be able to create a timestamp entry. + + * A bigger issue is that older identifiers (for example due to + * out-of-order packets) may pass this check and update the current + * identifier to an old one. This means that both the packet with the + * old identifier itself, as well the next packet with the current + * identifier, may be considered packets with new identifiers (even if + * both have been seen before). For TCP timestamps this could be + * prevented by changing the check to '>=' instead, but it may not be + * suitable for other protocols, such as QUIC and its spinbit. + * + * For now, just hope that the rate limit saves us from creating an + * incorrect timestamp. That may however also fail, either due to the + * to it happening in a time it's not limited by rate sampling, or + * because of rate check failing due to concurrency issues. + */ + if (f_state->last_id == p_id.identifier) + goto end; + f_state->last_id = p_id.identifier; + + // Check rate-limit + /* + * The window between checking and updating last_timestamp may cause + * concurrency issues, where multiple packets simultaneously pass the + * rate limit. However, as long as they have the same identifier, only + * a single timestamp entry should successfully be created. + */ + p_ts = bpf_ktime_get_ns(); // or bpf_ktime_get_boot_ns + if (p_ts < f_state->last_timestamp || + p_ts - f_state->last_timestamp < RATE_LIMIT) + goto end; + + /* + * Updates attempt at creating timestamp, even if creation of timestamp + * fails (due to map being full). This should make the competition for + * the next available map slot somewhat fairer between heavy and sparse + * flows. + */ + f_state->last_timestamp = p_ts; bpf_map_update_elem(&ts_start, &p_id, &p_ts, BPF_NOEXIST); end: diff --git a/pping/pping_kern_xdp.c b/pping/pping_kern_xdp.c index f0a95aa..4cdf068 100644 --- a/pping/pping_kern_xdp.c +++ b/pping/pping_kern_xdp.c @@ -10,7 +10,7 @@ char _license[] SEC("license") = "GPL"; struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(key_size, sizeof(struct packet_id)); - __uint(value_size, sizeof(struct packet_timestamp)); + __uint(value_size, sizeof(__u64)); __uint(max_entries, 16384); __uint(pinning, LIBBPF_PIN_BY_NAME); } ts_start SEC(".maps"); @@ -26,7 +26,7 @@ SEC(XDP_PROG_SEC) int xdp_prog_ingress(struct xdp_md *ctx) { struct packet_id p_id = { 0 }; - struct packet_timestamp *p_ts; + __u64 *p_ts; struct rtt_event event = { 0 }; struct parsing_context pctx = { .data = (void *)(long)ctx->data, @@ -39,24 +39,20 @@ int xdp_prog_ingress(struct xdp_md *ctx) goto end; p_ts = bpf_map_lookup_elem(&ts_start, &p_id); + if (!p_ts) + goto end; - // Only calculate RTT for first packet with matching identifer - if (p_ts && p_ts->used == 0) { - /* - * As used is not set atomically with the lookup, could - * potentially have multiple "first" packets (on different - * CPUs), but all those should then also have very similar RTT, - * so don't consider it a significant issue - */ - p_ts->used = 1; - // TODO - Optional delete of entry (if identifier is garantued unique) + event.rtt = bpf_ktime_get_ns() - *p_ts; + /* + * Attempt to delete timestamp entry as soon as RTT is calculated. + * But could have potential concurrency issue where multiple packets + * manage to match against the identifier before it can be deleted. + */ + bpf_map_delete_elem(&ts_start, &p_id); - __builtin_memcpy(&event.flow, &p_id.flow, - sizeof(struct network_tuple)); - event.rtt = bpf_ktime_get_ns() - p_ts->timestamp; - bpf_perf_event_output(ctx, &rtt_events, BPF_F_CURRENT_CPU, - &event, sizeof(event)); - } + __builtin_memcpy(&event.flow, &p_id.flow, sizeof(struct network_tuple)); + bpf_perf_event_output(ctx, &rtt_events, BPF_F_CURRENT_CPU, &event, + sizeof(event)); end: return XDP_PASS;