/* SPDX-License-Identifier: GPL-2.0-or-later */
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/pkt_cls.h>
#include <stdbool.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

// overwrite xdp/parsing_helpers.h value to avoid hitting verifier limit
#ifdef IPV6_EXT_MAX_CHAIN
#undef IPV6_EXT_MAX_CHAIN
#endif
#define IPV6_EXT_MAX_CHAIN 3

#include <xdp/parsing_helpers.h>
#include "pping.h"

#define AF_INET 2
#define AF_INET6 10
#define MAX_TCP_OPTIONS 10

/*
 * This struct keeps track of the data and data_end pointers from the xdp_md or
 * __sk_buff contexts, as well as the position parsed to so far, kept in nh.
 * Additionally, it also keeps the length of the entire packet, which together
 * with the other members can be used to determine e.g. how much data each
 * header encloses.
 */
struct parsing_context {
	void *data;            // Start of eth hdr
	void *data_end;        // End of safely accessible area
	struct hdr_cursor nh;  // Position to parse next
	__u32 pkt_len;         // Full packet length (headers + data)
	bool is_egress;        // Is packet on egress or ingress?
};

char _license[] SEC("license") = "GPL";

// Global config struct - set from userspace
static volatile const struct bpf_config config = {};

// Map definitions
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, struct packet_id);
	__type(value, __u64);
	__uint(max_entries, 16384);
} packet_ts SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, struct network_tuple);
	__type(value, struct flow_state);
	__uint(max_entries, 16384);
} flow_state SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u32));
} events SEC(".maps");

// Helper functions

/*
 * Maps an IPv4 address into an IPv6 address according to RFC 4291 sec 2.5.5.2
 */
static void map_ipv4_to_ipv6(__be32 ipv4, struct in6_addr *ipv6)
{
	__builtin_memset(&ipv6->in6_u.u6_addr8[0], 0x00, 10);
	__builtin_memset(&ipv6->in6_u.u6_addr8[10], 0xff, 2);
	ipv6->in6_u.u6_addr32[3] = ipv4;
}

/*
 * Parses the TSval and TSecr values from the TCP options field. If successful,
 * the TSval and TSecr values will be stored at tsval and tsecr (in network
 * byte order).
 * Returns 0 if successful and -1 on failure.
 */
static int parse_tcp_ts(struct tcphdr *tcph, void *data_end, __u32 *tsval,
			__u32 *tsecr)
{
	int len = tcph->doff << 2;
	void *opt_end = (void *)tcph + len;
	__u8 *pos = (__u8 *)(tcph + 1); // Current pos in TCP options
	__u8 i, opt;
	volatile __u8 opt_size; // Seems to ensure it's always read off the stack as a u8

	if (tcph + 1 > data_end || len <= sizeof(struct tcphdr))
		return -1;

#pragma unroll // Temporary solution until we can identify why the non-unrolled loop gets stuck in an infinite loop
	for (i = 0; i < MAX_TCP_OPTIONS; i++) {
		if (pos + 1 > opt_end || pos + 1 > data_end)
			return -1;
		opt = *pos;
		if (opt == 0) // Reached end of TCP options
			return -1;
		if (opt == 1) { // TCP NOP option - advance one byte
			pos++;
			continue;
		}

		// Option > 1, should have option size
		if (pos + 2 > opt_end || pos + 2 > data_end)
			return -1;
		opt_size = *(pos + 1);
		if (opt_size < 2) // Stop parsing options if opt_size has an invalid value
			return -1;

		// Option-kind is TCP timestamp (yey!)
		if (opt == 8 && opt_size == 10) {
			if (pos + 10 > opt_end || pos + 10 > data_end)
				return -1;
			*tsval = *(__u32 *)(pos + 2);
			*tsecr = *(__u32 *)(pos + 6);
			return 0;
		}

		// Some other TCP option - advance option-length bytes
		pos += opt_size;
	}
	return -1;
}
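
/*
 * For reference, the TCP Timestamps option that parse_tcp_ts() scans for has
 * the following on-wire layout (RFC 7323):
 *
 *   +--------+--------+----------------+----------------+
 *   | Kind=8 | Len=10 | TSval (4 byte) | TSecr (4 byte) |
 *   +--------+--------+----------------+----------------+
 *
 * which is why the loop only accepts opt == 8 with opt_size == 10, and reads
 * TSval from pos + 2 and TSecr from pos + 6.
 */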

/*
 * Attempts to fetch an identifier for TCP packets, based on the TCP timestamp
 * option.
 * If successful, identifier will be set to TSval if the packet is on egress,
 * or TSecr otherwise (ingress), sport and dport will be set to the TCP source
 * and destination ports, respectively, fei will be filled appropriately (based
 * on SYN/FIN/RST) and 0 will be returned.
 * On failure, -1 will be returned.
 */
static int parse_tcp_identifier(struct parsing_context *ctx, __be16 *sport,
				__be16 *dport, struct flow_event_info *fei,
				__u32 *identifier)
{
	__u32 tsval, tsecr;
	struct tcphdr *tcph;

	if (parse_tcphdr(&ctx->nh, ctx->data_end, &tcph) < 0)
		return -1;

	// Do not timestamp pure ACKs
	if (ctx->is_egress && ctx->nh.pos - ctx->data >= ctx->pkt_len &&
	    !tcph->syn)
		return -1;

	// Check if connection is opening/closing
	if (tcph->syn) {
		fei->event = FLOW_EVENT_OPENING;
		fei->reason = tcph->ack ? EVENT_REASON_SYN_ACK : EVENT_REASON_SYN;
	} else if (tcph->rst) {
		fei->event = FLOW_EVENT_CLOSING;
		fei->reason = EVENT_REASON_RST;
	} else if (!ctx->is_egress && tcph->fin) {
		fei->event = FLOW_EVENT_CLOSING;
		fei->reason = tcph->ack ? EVENT_REASON_FIN_ACK : EVENT_REASON_FIN;
	} else {
		fei->event = FLOW_EVENT_NONE;
	}

	if (parse_tcp_ts(tcph, ctx->data_end, &tsval, &tsecr) < 0)
		return -1; // Possible TODO, fall back on seq/ack instead

	*sport = tcph->source;
	*dport = tcph->dest;
	*identifier = ctx->is_egress ? tsval : tsecr;
	return 0;
}
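
/*
 * Note on the identifier choice above: on egress the outgoing TSval is used as
 * the identifier and gets timestamped in the packet_ts map, while the receiver
 * echoes that same value back in the TSecr field of its replies. On ingress
 * the TSecr is therefore used to look up the stored timestamp, and the
 * difference between now and that timestamp is the RTT reported below.
 */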

/*
 * Attempts to parse the packet limited by the data and data_end pointers,
 * to retrieve a protocol dependent packet identifier. If successful, the
 * pointed-to p_id and fei will be filled with parsed information from the
 * packet, and 0 will be returned. On failure, -1 will be returned.
 * If is_egress, saddr and daddr will match the source and destination of the
 * packet, respectively, and identifier will be set to the identifier of an
 * outgoing packet. Otherwise, saddr and daddr will be swapped (will match the
 * destination and source of the packet, respectively), and identifier will be
 * set to the identifier of a response.
 */
static int parse_packet_identifier(struct parsing_context *ctx,
				   struct packet_id *p_id,
				   struct flow_event_info *fei)
{
	int proto, err;
	struct ethhdr *eth;
	struct iphdr *iph;
	struct ipv6hdr *ip6h;
	struct flow_address *saddr, *daddr;

	// Switch saddr <--> daddr on ingress to match egress
	if (ctx->is_egress) {
		saddr = &p_id->flow.saddr;
		daddr = &p_id->flow.daddr;
	} else {
		saddr = &p_id->flow.daddr;
		daddr = &p_id->flow.saddr;
	}

	proto = parse_ethhdr(&ctx->nh, ctx->data_end, &eth);

	// Parse IPv4/6 header
	if (proto == bpf_htons(ETH_P_IP)) {
		p_id->flow.ipv = AF_INET;
		p_id->flow.proto = parse_iphdr(&ctx->nh, ctx->data_end, &iph);
	} else if (proto == bpf_htons(ETH_P_IPV6)) {
		p_id->flow.ipv = AF_INET6;
		p_id->flow.proto = parse_ip6hdr(&ctx->nh, ctx->data_end, &ip6h);
	} else {
		return -1;
	}

	// Add new protocols here
	if (p_id->flow.proto == IPPROTO_TCP) {
		err = parse_tcp_identifier(ctx, &saddr->port, &daddr->port,
					   fei, &p_id->identifier);
		if (err)
			return -1;
	} else {
		return -1;
	}

	// Successfully parsed packet identifier - fill in IP-addresses and return
	if (p_id->flow.ipv == AF_INET) {
		map_ipv4_to_ipv6(iph->saddr, &saddr->ip);
		map_ipv4_to_ipv6(iph->daddr, &daddr->ip);
	} else { // IPv6
		saddr->ip = ip6h->saddr;
		daddr->ip = ip6h->daddr;
	}

	return 0;
}

/*
 * Returns the number of unparsed bytes left in the packet (bytes after nh.pos)
 */
static __u32 remaining_pkt_payload(struct parsing_context *ctx)
{
	// pkt_len - (pos - data) fails because the compiler transforms it to
	// pkt_len - pos + data (pkt_len - pos not ok because value - pointer)
	// data + pkt_len - pos fails on (data + pkt_len) - pos due to math
	// between pkt_pointer and unbounded register
	__u32 parsed_bytes = ctx->nh.pos - ctx->data;

	return parsed_bytes < ctx->pkt_len ? ctx->pkt_len - parsed_bytes : 0;
}
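
/*
 * Once the packet identifier has been parsed, nh.pos points past the TCP
 * header, so remaining_pkt_payload() effectively returns the TCP payload
 * size. The programs below use it to accumulate sent_bytes/rec_bytes in the
 * per-flow state.
 */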

/*
 * Fills in event_type, timestamp, flow, source and reserved.
 * Does not fill in the flow_info.
 */
static void fill_flow_event(struct flow_event *fe, __u64 timestamp,
			    struct network_tuple *flow,
			    enum flow_event_source source)
{
	fe->event_type = EVENT_TYPE_FLOW;
	fe->timestamp = timestamp;
	__builtin_memcpy(&fe->flow, flow, sizeof(struct network_tuple));
	fe->source = source;
	fe->reserved = 0; // Make sure it's initialized
}

// Programs
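
/*
 * Rough division of work between the two programs: pping_egress (attached on
 * TC egress) creates/updates flow state and timestamps new outgoing
 * identifiers in packet_ts, while pping_ingress (attached as XDP) matches the
 * echoed identifier of incoming replies against packet_ts to compute RTTs and
 * tears down flow state once a flow closes. Both report their results to
 * userspace through the events perf buffer.
 */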

// TC-BPF program for parsing packet identifier from egress traffic and adding it to map
SEC(EGRESS_PROG_SEC)
int pping_egress(struct __sk_buff *skb)
{
	struct packet_id p_id = { 0 };
	struct flow_event fe;
	__u64 now;
	struct parsing_context pctx = {
		.data = (void *)(long)skb->data,
		.data_end = (void *)(long)skb->data_end,
		.pkt_len = skb->len,
		.nh = { .pos = pctx.data },
		.is_egress = true,
	};
	struct flow_state *f_state;
	struct flow_state new_state = { 0 };

	if (parse_packet_identifier(&pctx, &p_id, &fe.event_info) < 0)
		goto out;

	now = bpf_ktime_get_ns(); // or bpf_ktime_get_boot_ns
	f_state = bpf_map_lookup_elem(&flow_state, &p_id.flow);

	// Flow closing - try to delete flow state and push closing-event
	if (fe.event_info.event == FLOW_EVENT_CLOSING) {
		if (f_state) {
			bpf_map_delete_elem(&flow_state, &p_id.flow);
			fill_flow_event(&fe, now, &p_id.flow,
					EVENT_SOURCE_EGRESS);
			bpf_perf_event_output(skb, &events, BPF_F_CURRENT_CPU,
					      &fe, sizeof(fe));
		}
		goto out;
	}

	// No previous state - attempt to create it and push flow-opening event
	if (!f_state) {
		bpf_map_update_elem(&flow_state, &p_id.flow, &new_state,
				    BPF_NOEXIST);
		f_state = bpf_map_lookup_elem(&flow_state, &p_id.flow);
		if (!f_state) // Creation failed
			goto out;

		if (fe.event_info.event != FLOW_EVENT_OPENING) {
			fe.event_info.event = FLOW_EVENT_OPENING;
			fe.event_info.reason = EVENT_REASON_FIRST_OBS_PCKT;
		}
		fill_flow_event(&fe, now, &p_id.flow, EVENT_SOURCE_EGRESS);
		bpf_perf_event_output(skb, &events, BPF_F_CURRENT_CPU, &fe,
				      sizeof(fe));
	}

	f_state->sent_pkts++;
	f_state->sent_bytes += remaining_pkt_payload(&pctx);

	// Check if identifier is new
	if (f_state->last_id == p_id.identifier)
		goto out;
	f_state->last_id = p_id.identifier;

	// Check rate-limit
	if (now < f_state->last_timestamp ||
	    now - f_state->last_timestamp < config.rate_limit)
		goto out;

	/*
	 * Update the last timestamping attempt, even if creation of the
	 * timestamp entry fails (due to the map being full). This should make
	 * the competition for the next available map slot somewhat fairer
	 * between heavy and sparse flows.
	 */
	f_state->last_timestamp = now;
	bpf_map_update_elem(&packet_ts, &p_id, &now, BPF_NOEXIST);

out:
	return BPF_OK;
}

// XDP program for parsing identifier in ingress traffic and checking for a match in the map
SEC(INGRESS_PROG_SEC)
int pping_ingress(struct xdp_md *ctx)
{
	struct packet_id p_id = { 0 };
	__u64 *p_ts;
	struct flow_event fe;
	struct rtt_event re = { 0 };
	struct flow_state *f_state;
	struct parsing_context pctx = {
		.data = (void *)(long)ctx->data,
		.data_end = (void *)(long)ctx->data_end,
		.pkt_len = pctx.data_end - pctx.data,
		.nh = { .pos = pctx.data },
		.is_egress = false,
	};
	__u64 now;

	if (parse_packet_identifier(&pctx, &p_id, &fe.event_info) < 0)
		goto out;

	f_state = bpf_map_lookup_elem(&flow_state, &p_id.flow);
	if (!f_state)
		goto out;

	f_state->rec_pkts++;
	f_state->rec_bytes += remaining_pkt_payload(&pctx);

	now = bpf_ktime_get_ns();

	p_ts = bpf_map_lookup_elem(&packet_ts, &p_id);
	if (!p_ts || now < *p_ts)
		goto validflow_out;

	re.rtt = now - *p_ts;
	// Delete timestamp entry as soon as RTT is calculated
	bpf_map_delete_elem(&packet_ts, &p_id);

	if (f_state->min_rtt == 0 || re.rtt < f_state->min_rtt)
		f_state->min_rtt = re.rtt;

	re.event_type = EVENT_TYPE_RTT;
	re.timestamp = now;
	re.min_rtt = f_state->min_rtt;
	re.sent_pkts = f_state->sent_pkts;
	re.sent_bytes = f_state->sent_bytes;
	re.rec_pkts = f_state->rec_pkts;
	re.rec_bytes = f_state->rec_bytes;

	// Push event to perf-buffer
	__builtin_memcpy(&re.flow, &p_id.flow, sizeof(struct network_tuple));
	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &re, sizeof(re));

validflow_out:
	// Wait with deleting the flow until the final RTT message has been pushed
	if (fe.event_info.event == FLOW_EVENT_CLOSING && f_state) {
		bpf_map_delete_elem(&flow_state, &p_id.flow);
		fill_flow_event(&fe, now, &p_id.flow, EVENT_SOURCE_INGRESS);
		bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &fe,
				      sizeof(fe));
	}

out:
	return XDP_PASS;
}