From cecd6b54f2db0c6c9f10bd3633aca0f8430e66e2 Mon Sep 17 00:00:00 2001 From: Simon Sundberg Date: Mon, 1 Mar 2021 18:16:48 +0100 Subject: [PATCH] pping: Inital rate limit implementation Add a per-flow rate limit, limiting how often new timestamp entries can be created. As part of this, add per-flow state keeping track of when last timestamp was created and last seen identifier for each flow. Additionally, remove timestamp entry as soon as RTT is calculated, as last seen identifier is used to find first unique value instead. Furthermore, remove packet_timestamp struct and only use __u64 as timestamp, as used memeber is no longer needed. This initial commit lacks cleanup of flow-state, user-configuration of rate limit, mechanism to handle bursts etc. Signed-off-by: Simon Sundberg --- pping/pping.c | 6 +-- pping/pping.h | 12 +++--- pping/pping_kern_tc.c | 83 ++++++++++++++++++++++++++++++++++++++++-- pping/pping_kern_xdp.c | 32 +++++++--------- 4 files changed, 102 insertions(+), 31 deletions(-) diff --git a/pping/pping.c b/pping/pping.c index 0df6d3b..8745104 100644 --- a/pping/pping.c +++ b/pping/pping.c @@ -188,7 +188,7 @@ static int clean_map(int map_fd, __u64 max_age) { int removed = 0; struct packet_id key, prev_key = { 0 }; - struct packet_timestamp value; + __u64 value; bool delete_prev = false; __u64 now_nsec = get_time_ns(); @@ -207,8 +207,8 @@ static int clean_map(int map_fd, __u64 max_age) } if (bpf_map_lookup_elem(map_fd, &key, &value) == 0) { - if (now_nsec > value.timestamp && - now_nsec - value.timestamp > max_age) { + if (now_nsec > value && + now_nsec - value > max_age) { delete_prev = true; } } diff --git a/pping/pping.h b/pping/pping.h index 755bc11..b5edb0e 100644 --- a/pping/pping.h +++ b/pping/pping.h @@ -34,17 +34,17 @@ struct network_tuple { __u8 reserved; }; +struct flow_state { + __u64 last_timestamp; + __u32 last_id; + __u32 reserved; +}; + struct packet_id { struct network_tuple flow; __u32 identifier; //tsval for TCP packets }; -struct packet_timestamp { - __u64 timestamp; - __u8 used; - __u8 reserved[7]; -}; - struct rtt_event { __u64 rtt; struct network_tuple flow; diff --git a/pping/pping_kern_tc.c b/pping/pping_kern_tc.c index c750ea3..a72b675 100644 --- a/pping/pping_kern_tc.c +++ b/pping/pping_kern_tc.c @@ -6,25 +6,46 @@ #include "pping.h" #include "pping_helpers.h" +#define RATE_LIMIT \ + 100000000UL // 100ms. Temporary solution, should be set by userspace +#define BURST_DURATION \ + 4000000 // 4ms, duration for when it may burst packet timestamps +#define BURST_SIZE \ + 3 // Number of packets it may create timestamps for in a burst + char _license[] SEC("license") = "GPL"; #ifdef HAVE_TC_LIBBPF /* detected by configure script in config.mk */ struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(key_size, sizeof(struct packet_id)); - __uint(value_size, sizeof(struct packet_timestamp)); + __uint(value_size, sizeof(__u64)); __uint(max_entries, 16384); __uint(pinning, LIBBPF_PIN_BY_NAME); } ts_start SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(struct network_tuple)); + __uint(value_size, sizeof(struct flow_state)); + __uint(max_entries, 16384); +} flow_state SEC(".maps"); + #else struct bpf_elf_map SEC("maps") ts_start = { .type = BPF_MAP_TYPE_HASH, .size_key = sizeof(struct packet_id), - .size_value = sizeof(struct packet_timestamp), + .size_value = sizeof(__u64), .max_elem = 16384, .pinning = PIN_GLOBAL_NS, }; + +struct bpf_elf_map SEC("maps") flow_state = { + .type = BPF_MAP_TYPE_HASH, + .size_key = sizeof(struct network_tuple), + .size_value = sizeof(struct flow_state), + .max_elem = 16384, +}; #endif // TC-BFP for parsing packet identifier from egress traffic and add to map @@ -32,18 +53,72 @@ SEC(TCBPF_PROG_SEC) int tc_bpf_prog_egress(struct __sk_buff *skb) { struct packet_id p_id = { 0 }; - struct packet_timestamp p_ts = { 0 }; + __u64 p_ts; struct parsing_context pctx = { .data = (void *)(long)skb->data, .data_end = (void *)(long)skb->data_end, .pkt_len = skb->len, .nh = { .pos = pctx.data }, }; + struct flow_state *f_state; + struct flow_state new_state = { 0 }; // Rarely-ish used, but can't really dynamically allocate memory or? if (parse_packet_identifier(&pctx, true, &p_id) < 0) goto end; - p_ts.timestamp = bpf_ktime_get_ns(); // or bpf_ktime_get_boot_ns + // Check flow state + f_state = bpf_map_lookup_elem(&flow_state, &p_id.flow); + if (!f_state) { // No previous state - attempt to create it + bpf_map_update_elem(&flow_state, &p_id.flow, &new_state, + BPF_NOEXIST); + f_state = bpf_map_lookup_elem(&flow_state, &p_id.flow); + if (!f_state) + goto end; + } + + // Check if identfier is new + /* The gap between checking and updating last_id may cause concurrency + * issues where multiple packets may simultaneously think they are the + * first with a new identifier. As long as all of the identifiers are + * the same though, only one should be able to create a timestamp entry. + + * A bigger issue is that older identifiers (for example due to + * out-of-order packets) may pass this check and update the current + * identifier to an old one. This means that both the packet with the + * old identifier itself, as well the next packet with the current + * identifier, may be considered packets with new identifiers (even if + * both have been seen before). For TCP timestamps this could be + * prevented by changing the check to '>=' instead, but it may not be + * suitable for other protocols, such as QUIC and its spinbit. + * + * For now, just hope that the rate limit saves us from creating an + * incorrect timestamp. That may however also fail, either due to the + * to it happening in a time it's not limited by rate sampling, or + * because of rate check failing due to concurrency issues. + */ + if (f_state->last_id == p_id.identifier) + goto end; + f_state->last_id = p_id.identifier; + + // Check rate-limit + /* + * The window between checking and updating last_timestamp may cause + * concurrency issues, where multiple packets simultaneously pass the + * rate limit. However, as long as they have the same identifier, only + * a single timestamp entry should successfully be created. + */ + p_ts = bpf_ktime_get_ns(); // or bpf_ktime_get_boot_ns + if (p_ts < f_state->last_timestamp || + p_ts - f_state->last_timestamp < RATE_LIMIT) + goto end; + + /* + * Updates attempt at creating timestamp, even if creation of timestamp + * fails (due to map being full). This should make the competition for + * the next available map slot somewhat fairer between heavy and sparse + * flows. + */ + f_state->last_timestamp = p_ts; bpf_map_update_elem(&ts_start, &p_id, &p_ts, BPF_NOEXIST); end: diff --git a/pping/pping_kern_xdp.c b/pping/pping_kern_xdp.c index f0a95aa..4cdf068 100644 --- a/pping/pping_kern_xdp.c +++ b/pping/pping_kern_xdp.c @@ -10,7 +10,7 @@ char _license[] SEC("license") = "GPL"; struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(key_size, sizeof(struct packet_id)); - __uint(value_size, sizeof(struct packet_timestamp)); + __uint(value_size, sizeof(__u64)); __uint(max_entries, 16384); __uint(pinning, LIBBPF_PIN_BY_NAME); } ts_start SEC(".maps"); @@ -26,7 +26,7 @@ SEC(XDP_PROG_SEC) int xdp_prog_ingress(struct xdp_md *ctx) { struct packet_id p_id = { 0 }; - struct packet_timestamp *p_ts; + __u64 *p_ts; struct rtt_event event = { 0 }; struct parsing_context pctx = { .data = (void *)(long)ctx->data, @@ -39,24 +39,20 @@ int xdp_prog_ingress(struct xdp_md *ctx) goto end; p_ts = bpf_map_lookup_elem(&ts_start, &p_id); + if (!p_ts) + goto end; - // Only calculate RTT for first packet with matching identifer - if (p_ts && p_ts->used == 0) { - /* - * As used is not set atomically with the lookup, could - * potentially have multiple "first" packets (on different - * CPUs), but all those should then also have very similar RTT, - * so don't consider it a significant issue - */ - p_ts->used = 1; - // TODO - Optional delete of entry (if identifier is garantued unique) + event.rtt = bpf_ktime_get_ns() - *p_ts; + /* + * Attempt to delete timestamp entry as soon as RTT is calculated. + * But could have potential concurrency issue where multiple packets + * manage to match against the identifier before it can be deleted. + */ + bpf_map_delete_elem(&ts_start, &p_id); - __builtin_memcpy(&event.flow, &p_id.flow, - sizeof(struct network_tuple)); - event.rtt = bpf_ktime_get_ns() - p_ts->timestamp; - bpf_perf_event_output(ctx, &rtt_events, BPF_F_CURRENT_CPU, - &event, sizeof(event)); - } + __builtin_memcpy(&event.flow, &p_id.flow, sizeof(struct network_tuple)); + bpf_perf_event_output(ctx, &rtt_events, BPF_F_CURRENT_CPU, &event, + sizeof(event)); end: return XDP_PASS;