From 3268ba87bb478dd009cb9273db43d6c3ec38442d Mon Sep 17 00:00:00 2001 From: Simon Sundberg Date: Tue, 9 Feb 2021 18:09:30 +0100 Subject: [PATCH] pping: Refactor TC and XDP programs Refactor TC and XDP programs to reuse common logic for parsing packets. Add functions for parsing packets for an identifier to pping_helpers.h which both TC and XDP parts use. Also make it easier to extend pping with support for new protocols, as only new parsing functions have to be added and inserted into a single place. Also add reserved members to end of structs in pping.h to indicate padding. Signed-off-by: Simon Sundberg --- pping/pping.c | 10 ++--- pping/pping.h | 32 +++++++++----- pping/pping_helpers.h | 96 +++++++++++++++++++++++++++++++++++++++--- pping/pping_kern_tc.c | 57 +++---------------------- pping/pping_kern_xdp.c | 65 ++++------------------------ 5 files changed, 129 insertions(+), 131 deletions(-) diff --git a/pping/pping.c b/pping/pping.c index e6eb841..0df6d3b 100644 --- a/pping/pping.c +++ b/pping/pping.c @@ -81,7 +81,7 @@ static int set_rlimit(long int lim) static int mkdir_if_noexist(const char *path) { int ret; - struct stat st = {0}; + struct stat st = { 0 }; ret = stat(path, &st); if (ret) { @@ -261,12 +261,12 @@ static void handle_rtt_event(void *ctx, int cpu, void *data, __u32 data_size) char saddr[INET6_ADDRSTRLEN]; char daddr[INET6_ADDRSTRLEN]; - format_ip_address(e->flow.ipv, &e->flow.saddr, saddr, sizeof(saddr)); - format_ip_address(e->flow.ipv, &e->flow.daddr, daddr, sizeof(daddr)); + format_ip_address(e->flow.ipv, &e->flow.saddr.ip, saddr, sizeof(saddr)); + format_ip_address(e->flow.ipv, &e->flow.daddr.ip, daddr, sizeof(daddr)); printf("%llu.%06llu ms %s:%d+%s:%d\n", e->rtt / NS_PER_MS, - e->rtt % NS_PER_MS, saddr, ntohs(e->flow.sport), daddr, - ntohs(e->flow.dport)); + e->rtt % NS_PER_MS, saddr, ntohs(e->flow.saddr.port), daddr, + ntohs(e->flow.daddr.port)); } static void handle_missed_rtt_event(void *ctx, int cpu, __u64 lost_cnt) diff --git 
a/pping/pping.h b/pping/pping.h index 79c3191..755bc11 100644 --- a/pping/pping.h +++ b/pping/pping.h @@ -9,21 +9,29 @@ #define TCBPF_PROG_SEC "pping_egress" /* - * Struct to hold a full network tuple + * Struct that can hold the source or destination address for a flow (l3+l4). * Works for both IPv4 and IPv6, as IPv4 addresses can be mapped to IPv6 ones - * based on RFC 4291 Section 2.5.5.2. The ipv member is technically not - * necessary, but makes it easier to determine if it is an IPv4 or IPv6 address - * (don't need to look at the first 12 bytes of address). - * The proto memeber is not currently used, but could be useful once pping - * is extended to work for other protocols than TCP + * based on RFC 4291 Section 2.5.5.2. + */ +struct flow_address { + struct in6_addr ip; + __u16 port; + __u16 reserved; +}; + +/* + * Struct to hold a full network tuple + * The ipv member is technically not necessary, but makes it easier to + * determine if saddr/daddr are IPv4 or IPv6 address (don't need to look at the + * first 12 bytes of address). The proto memeber is not currently used, but + * could be useful once pping is extended to work for other protocols than TCP. 
*/ struct network_tuple { - struct in6_addr saddr; - struct in6_addr daddr; - __u16 sport; - __u16 dport; + struct flow_address saddr; + struct flow_address daddr; __u16 proto; //IPPROTO_TCP, IPPROTO_ICMP, QUIC etc - __u16 ipv; //AF_INET or AF_INET6 + __u8 ipv; //AF_INET or AF_INET6 + __u8 reserved; }; struct packet_id { @@ -34,11 +42,13 @@ struct packet_id { struct packet_timestamp { __u64 timestamp; __u8 used; + __u8 reserved[7]; }; struct rtt_event { __u64 rtt; struct network_tuple flow; + __u32 reserved; }; #endif diff --git a/pping/pping_helpers.h b/pping/pping_helpers.h index c12a823..b9aa2b1 100644 --- a/pping/pping_helpers.h +++ b/pping/pping_helpers.h @@ -2,9 +2,16 @@ #ifndef PPING_HELPERS_H #define PPING_HELPERS_H +#include +#include +#include #include +#include +#include +#include #include -#include + +#include #include "pping.h" #define AF_INET 2 @@ -14,12 +21,12 @@ /* * Maps and IPv4 address into an IPv6 address according to RFC 4291 sec 2.5.5.2 */ -static __always_inline void map_ipv4_to_ipv6(__be32 ipv4, struct in6_addr *ipv6) +static void map_ipv4_to_ipv6(__be32 ipv4, struct in6_addr *ipv6) { /* __u16 ipv4_prefix[6] = {0x0, 0x0, 0x0, 0x0, 0x0, 0xffff}; */ - /* memcpy(ipv6, ipv4_prefix, sizeof(ipv4_prefix)); // Won't load on TC */ - memset(&ipv6->in6_u.u6_addr8[0], 0x00, 10); - memset(&ipv6->in6_u.u6_addr8[10], 0xff, 2); + /* __builtin_memcpy(ipv6, ipv4_prefix, sizeof(ipv4_prefix)); */ + __builtin_memset(&ipv6->in6_u.u6_addr8[0], 0x00, 10); + __builtin_memset(&ipv6->in6_u.u6_addr8[10], 0xff, 2); ipv6->in6_u.u6_addr32[3] = ipv4; } @@ -29,8 +36,8 @@ static __always_inline void map_ipv4_to_ipv6(__be32 ipv4, struct in6_addr *ipv6) * byte order). 
* Returns 0 if sucessful and -1 on failure */ -static __always_inline int parse_tcp_ts(struct tcphdr *tcph, void *data_end, - __u32 *tsval, __u32 *tsecr) +static int parse_tcp_ts(struct tcphdr *tcph, void *data_end, __u32 *tsval, + __u32 *tsecr) { int len = tcph->doff << 2; void *opt_end = (void *)tcph + len; @@ -73,5 +80,80 @@ static __always_inline int parse_tcp_ts(struct tcphdr *tcph, void *data_end, } return -1; } +/* + * Attempts to fetch an identifier for TCP packets, based on the TCP timestamp + * option. If successful, identifier will be set to TSval if is_egress, TSecr + * otherwise, the port-members of saddr and daddr will be set to the TCP source + * and dest, respectively, and 0 will be returned. On failure, -1 will be + * returned. + */ +static int parse_tcp_identifier(struct hdr_cursor *nh, void *data_end, + bool is_egress, struct flow_address *saddr, + struct flow_address *daddr, __u32 *identifier) +{ + __u32 tsval, tsecr; + struct tcphdr *tcph; + + if (parse_tcphdr(nh, data_end, &tcph) < 0) + return -1; + if (parse_tcp_ts(tcph, data_end, &tsval, &tsecr) < 0) + return -1; //Possible TODO, fall back on seq/ack instead + + saddr->port = tcph->source; + daddr->port = tcph->dest; + *identifier = is_egress ? tsval : tsecr; + return 0; +} + +/* + * Attempts to parse the packet limited by the data and data_end pointers, + * to retrieve a protocol dependent packet identifier. If successful, the + * ipv and identifier of p_id will be set, saddr and daddr (which may be part + * of p_id) will be filled with the source and destination addresses of the + * packet, and 0 will be returned. On failure, -1 will be returned. 
+ */ +static int parse_packet_identifier(void *data, void *data_end, bool is_egress, + struct packet_id *p_id, + struct flow_address *saddr, + struct flow_address *daddr) +{ + struct hdr_cursor nh = { .pos = data }; + struct ethhdr *eth; + struct iphdr *iph; + struct ipv6hdr *ip6h; + int proto, err; + + proto = parse_ethhdr(&nh, data_end, &eth); + + // Parse IPv4/6 header + if (proto == bpf_htons(ETH_P_IP)) { + p_id->flow.ipv = AF_INET; + proto = parse_iphdr(&nh, data_end, &iph); + } else if (proto == bpf_htons(ETH_P_IPV6)) { + p_id->flow.ipv = AF_INET6; + proto = parse_ip6hdr(&nh, data_end, &ip6h); + } else + return -1; + + // Add new protocols here + if (proto == IPPROTO_TCP) + err = parse_tcp_identifier(&nh, data_end, is_egress, saddr, + daddr, &p_id->identifier); + else + return -1; + + if (err) + return -1; + + // Successfully parsed packet identifier - fill in IP-addresses and return + if (p_id->flow.ipv == AF_INET) { + map_ipv4_to_ipv6(iph->saddr, &saddr->ip); + map_ipv4_to_ipv6(iph->daddr, &daddr->ip); + } else { // IPv6 + saddr->ip = ip6h->saddr; + daddr->ip = ip6h->daddr; + } + return 0; +} #endif diff --git a/pping/pping_kern_tc.c b/pping/pping_kern_tc.c index bc26697..971910c 100644 --- a/pping/pping_kern_tc.c +++ b/pping/pping_kern_tc.c @@ -2,16 +2,6 @@ #include #include #include -#include - -#include -#include -#include -#include -#include -#include - -#include #include "pping.h" #include "pping_helpers.h" @@ -37,57 +27,20 @@ struct bpf_elf_map SEC("maps") ts_start = { }; #endif -// TC-BFP for parsing TSVAL from egress traffic and add to map +// TC-BPF for parsing packet identifier from egress traffic and add to map SEC(TCBPF_PROG_SEC) int tc_bpf_prog_egress(struct __sk_buff *skb) { - void *data = (void *)(long)skb->data; - void *data_end = (void *)(long)skb->data_end; - - int proto = -1; - __u32 tsval, tsecr; - - struct hdr_cursor nh = { .pos = data }; - struct ethhdr *eth; - struct iphdr *iph; - struct ipv6hdr *ip6h; - struct tcphdr *tcph; - struct 
packet_id p_id = { 0 }; struct packet_timestamp p_ts = { 0 }; - proto = parse_ethhdr(&nh, data_end, &eth); + void *data = (void *)(long)skb->data; + void *data_end = (void *)(long)skb->data_end; - // Parse IPv4/6 header - if (proto == bpf_htons(ETH_P_IP)) { - p_id.flow.ipv = AF_INET; - proto = parse_iphdr(&nh, data_end, &iph); - } else if (proto == bpf_htons(ETH_P_IPV6)) { - p_id.flow.ipv = AF_INET6; - proto = parse_ip6hdr(&nh, data_end, &ip6h); - } else + if (parse_packet_identifier(data, data_end, true, &p_id, + &p_id.flow.saddr, &p_id.flow.daddr) < 0) goto end; - // Parse TCP timestamp - if (proto != IPPROTO_TCP) - goto end; - if (parse_tcphdr(&nh, data_end, &tcph) < 0) - goto end; - if (parse_tcp_ts(tcph, data_end, &tsval, &tsecr) < 0) - goto end; - - // We have a TCP timestamp, try adding it to the map - p_id.identifier = tsval; - if (p_id.flow.ipv == AF_INET) { - map_ipv4_to_ipv6(iph->saddr, &p_id.flow.saddr); - map_ipv4_to_ipv6(iph->daddr, &p_id.flow.daddr); - } else { // IPv6 - p_id.flow.saddr = ip6h->saddr; - p_id.flow.daddr = ip6h->daddr; - } - p_id.flow.sport = tcph->source; - p_id.flow.dport = tcph->dest; - p_ts.timestamp = bpf_ktime_get_ns(); // or bpf_ktime_get_boot_ns bpf_map_update_elem(&ts_start, &p_id, &p_ts, BPF_NOEXIST); diff --git a/pping/pping_kern_xdp.c b/pping/pping_kern_xdp.c index 5d4fe5f..e43baca 100644 --- a/pping/pping_kern_xdp.c +++ b/pping/pping_kern_xdp.c @@ -1,16 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ #include #include -#include - -#include -#include -#include -#include -#include -#include - -#include #include "pping.h" #include "pping_helpers.h" @@ -31,63 +21,25 @@ struct { __uint(value_size, sizeof(__u32)); } rtt_events SEC(".maps"); -// XDP program for parsing TSECR-val from ingress traffic and check for match in map +// XDP program for parsing identifier in ingress traffic and check for match in map SEC(XDP_PROG_SEC) int xdp_prog_ingress(struct xdp_md *ctx) { - void *data = (void *)(long)ctx->data; - void *data_end 
= (void *)(long)ctx->data_end; - - int proto = -1; - __u32 tsval, tsecr; - - struct hdr_cursor nh = { .pos = data }; - struct ethhdr *eth; - struct iphdr *iph; - struct ipv6hdr *ip6h; - struct tcphdr *tcph; - struct packet_id p_id = { 0 }; struct packet_timestamp *p_ts; struct rtt_event event = { 0 }; - proto = bpf_ntohs(parse_ethhdr(&nh, data_end, &eth)); + void *data = (void *)(long)ctx->data; + void *data_end = (void *)(long)ctx->data_end; - // Parse IPv4/6 header - if (proto == ETH_P_IP) { - p_id.flow.ipv = AF_INET; - proto = parse_iphdr(&nh, data_end, &iph); - } else if (proto == ETH_P_IPV6) { - p_id.flow.ipv = AF_INET6; - proto = parse_ip6hdr(&nh, data_end, &ip6h); - } else + // saddr and daddr in reverse order of egress (source <--> dest) + if (parse_packet_identifier(data, data_end, false, &p_id, + &p_id.flow.daddr, &p_id.flow.saddr) < 0) goto end; - // Parse TCP timestamp - if (proto != IPPROTO_TCP) - goto end; - if (parse_tcphdr(&nh, data_end, &tcph) < 0) - goto end; - if (parse_tcp_ts(tcph, data_end, &tsval, &tsecr) < 0) - goto end; - - // We have a TCP-timestamp - now we can check if it's in the map - p_id.identifier = tsecr; - p_id.flow.proto == proto; - // Fill in reverse order of egress (dest <--> source) - if (p_id.flow.ipv == AF_INET) { - map_ipv4_to_ipv6(iph->daddr, &p_id.flow.saddr); - map_ipv4_to_ipv6(iph->saddr, &p_id.flow.daddr); - } else { // IPv6 - p_id.flow.saddr = ip6h->daddr; - p_id.flow.daddr = ip6h->saddr; - } - p_id.flow.sport = tcph->dest; - p_id.flow.dport = tcph->source; - p_ts = bpf_map_lookup_elem(&ts_start, &p_id); - // Only calculate RTT for first packet with matching TSecr + // Only calculate RTT for first packet with matching identifier if (p_ts && p_ts->used == 0) { /* * As used is not set atomically with the lookup, could @@ -98,7 +50,8 @@ int xdp_prog_ingress(struct xdp_md *ctx) p_ts->used = 1; // TODO - Optional delete of entry (if identifier is garantued unique) - memcpy(&event.flow, &p_id.flow, sizeof(struct network_tuple)); 
+ __builtin_memcpy(&event.flow, &p_id.flow, + sizeof(struct network_tuple)); event.rtt = bpf_ktime_get_ns() - p_ts->timestamp; bpf_perf_event_output(ctx, &rtt_events, BPF_F_CURRENT_CPU, &event, sizeof(event));