2021-01-18 13:13:51 +01:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
2021-02-09 18:09:30 +01:00
|
|
|
#include <linux/bpf.h>
|
2021-04-15 14:13:54 +02:00
|
|
|
#include <bpf/bpf_helpers.h>
|
2021-02-09 18:09:30 +01:00
|
|
|
#include <linux/in.h>
|
2021-02-08 20:28:46 +01:00
|
|
|
#include <linux/in6.h>
|
2021-02-09 18:09:30 +01:00
|
|
|
#include <linux/if_ether.h>
|
|
|
|
#include <linux/ip.h>
|
|
|
|
#include <linux/ipv6.h>
|
2021-01-26 18:34:23 +01:00
|
|
|
#include <linux/tcp.h>
|
2021-02-09 18:09:30 +01:00
|
|
|
#include <stdbool.h>
|
2021-04-15 14:13:54 +02:00
|
|
|
|
2021-04-22 17:51:49 +02:00
|
|
|
// overwrite xdp/parsing_helpers.h value to avoid hitting verifier limit
|
|
|
|
#ifdef IPV6_EXT_MAX_CHAIN
|
|
|
|
#undef IPV6_EXT_MAX_CHAIN
|
|
|
|
#endif
|
|
|
|
#define IPV6_EXT_MAX_CHAIN 3
|
|
|
|
|
|
|
|
#include <xdp/parsing_helpers.h>
|
2021-02-08 20:28:46 +01:00
|
|
|
#include "pping.h"
|
2021-01-26 18:34:23 +01:00
|
|
|
|
2021-02-09 13:00:28 +01:00
|
|
|
#define AF_INET 2
|
|
|
|
#define AF_INET6 10
|
2021-01-07 18:30:53 +01:00
|
|
|
#define MAX_TCP_OPTIONS 10
|
|
|
|
|
2021-02-12 18:31:30 +01:00
|
|
|
/*
 * This struct keeps track of the data and data_end pointers from the xdp_md or
 * __skb_buff contexts, as well as a currently parsed to position kept in nh.
 * Additionally, it also keeps the length of the entire packet, which together
 * with the other members can be used to determine ex. how much data each
 * header encloses.
 */
struct parsing_context {
	void *data;           //Start of eth hdr
	void *data_end;       //End of safe accessible area
	struct hdr_cursor nh; //Position to parse next
	__u32 pkt_len;        //Full packet length (headers+data)
	bool is_egress;       //Is packet on egress or ingress?
};
|
|
|
|
|
2021-04-15 14:13:54 +02:00
|
|
|
char _license[] SEC("license") = "GPL";

// Global config struct - set from userspace before the programs are attached
static volatile const struct bpf_config config = {};

// Map definitions

// Timestamp map: packet identifier -> nanosecond timestamp taken when the
// packet was seen on egress. The ingress program looks the identifier up to
// compute the RTT, then deletes the entry.
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, struct packet_id);
	__type(value, __u64);
	__uint(max_entries, 16384);
} packet_ts SEC(".maps");

// Per-flow state (last timestamped identifier + last timestamping attempt),
// keyed by the flow's network tuple. Entries are deleted when a FIN/RST is
// observed for the flow.
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, struct network_tuple);
	__type(value, struct flow_state);
	__uint(max_entries, 16384);
} flow_state SEC(".maps");

// Perf-event channel used by the ingress program to push computed rtt_event
// records to userspace.
struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u32));
} rtt_events SEC(".maps");
|
|
|
|
|
|
|
|
// Help functions
|
|
|
|
|
2021-02-08 20:28:46 +01:00
|
|
|
/*
 * Maps an IPv4 address into an IPv6 address according to RFC 4291 sec 2.5.5.2
 * (the ::ffff:a.b.c.d form): 80 zero bits, 16 one bits, then the 32-bit IPv4
 * address in the low words. ipv4 is expected in network byte order.
 */
static void map_ipv4_to_ipv6(__be32 ipv4, struct in6_addr *ipv6)
{
	ipv6->in6_u.u6_addr32[0] = 0;
	ipv6->in6_u.u6_addr32[1] = 0;
	ipv6->in6_u.u6_addr16[4] = 0;
	ipv6->in6_u.u6_addr16[5] = 0xffff; // all-ones pattern, endian-agnostic
	ipv6->in6_u.u6_addr32[3] = ipv4;
}
|
2021-01-26 18:34:23 +01:00
|
|
|
|
2021-01-18 18:08:35 +01:00
|
|
|
/*
 * Parses the TSval and TSecr values from the TCP options field. If successful
 * the TSval and TSecr values will be stored at tsval and tsecr (in network
 * byte order).
 * Returns 0 if successful and -1 on failure.
 * Scans at most MAX_TCP_OPTIONS options and never reads past opt_end (end of
 * the TCP header as given by doff) or data_end (end of verifier-checked data).
 */
static int parse_tcp_ts(struct tcphdr *tcph, void *data_end, __u32 *tsval,
			__u32 *tsecr)
{
	int len = tcph->doff << 2; // Full TCP header length (incl. options), bytes
	void *opt_end = (void *)tcph + len;
	__u8 *pos = (__u8 *)(tcph + 1); //Current pos in TCP options
	__u8 i, opt;
	volatile __u8
		opt_size; // Seems to ensure it's always read off the stack as u8

	// Bail if the fixed TCP header isn't fully readable or there are no options
	if (tcph + 1 > data_end || len <= sizeof(struct tcphdr))
		return -1;
#pragma unroll //temporary solution until we can identify why the non-unrolled loop gets stuck in an infinite loop
	for (i = 0; i < MAX_TCP_OPTIONS; i++) {
		if (pos + 1 > opt_end || pos + 1 > data_end)
			return -1;

		opt = *pos;
		if (opt == 0) // Reached end of TCP options
			return -1;

		if (opt == 1) { // TCP NOP option - advance one byte
			pos++;
			continue;
		}

		// Option > 1, should have option size
		if (pos + 2 > opt_end || pos + 2 > data_end)
			return -1;
		opt_size = *(pos + 1);
		if (opt_size < 2) // Stop parsing options if opt_size has an invalid value
			return -1;

		// Option-kind is TCP timestamp (kind 8, length 10)
		if (opt == 8 && opt_size == 10) {
			if (pos + 10 > opt_end || pos + 10 > data_end)
				return -1;
			*tsval = *(__u32 *)(pos + 2); // TSval at offset 2, network order
			*tsecr = *(__u32 *)(pos + 6); // TSecr at offset 6, network order
			return 0;
		}

		// Some other TCP option - advance option-length bytes
		pos += opt_size;
	}
	return -1;
}
|
2021-03-09 19:58:42 +01:00
|
|
|
|
2021-02-09 18:09:30 +01:00
|
|
|
/*
 * Attempts to fetch an identifier for TCP packets, based on the TCP timestamp
 * option. If successful, identifier will be set to TSval on egress and TSecr
 * on ingress (note: code is `is_egress ? tsval : tsecr`), the port-members of
 * saddr and daddr will be set to the TCP source and dest, respectively, and 0
 * will be returned. On failure, -1 will be returned. Additionally, if the
 * connection is closing (FIN or RST flag), sets flow_closing to true.
 */
static int parse_tcp_identifier(struct parsing_context *ctx, __be16 *sport,
				__be16 *dport, bool *flow_closing,
				__u32 *identifier)
{
	__u32 tsval, tsecr;
	struct tcphdr *tcph;

	if (parse_tcphdr(&ctx->nh, ctx->data_end, &tcph) < 0)
		return -1;

	// Check if connection is closing
	if (tcph->fin || tcph->rst) {
		*flow_closing = true;
		/* bpf_printk("Detected connection closing on %d\n", */
		/* ctx->is_egress); //Upsets verifier? */
	}

	// Do not timestamp pure ACKs: on egress, skip packets that carry no
	// payload past the parsed TCP header, unless the SYN flag is set
	if (ctx->is_egress && ctx->nh.pos - ctx->data >= ctx->pkt_len &&
	    !tcph->syn)
		return -1;

	if (parse_tcp_ts(tcph, ctx->data_end, &tsval, &tsecr) < 0)
		return -1; //Possible TODO, fall back on seq/ack instead

	*sport = tcph->source;
	*dport = tcph->dest;
	*identifier = ctx->is_egress ? tsval : tsecr;
	return 0;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Attempts to parse the packet limited by the data and data_end pointers,
|
|
|
|
* to retrieve a protocol dependent packet identifier. If sucessful, the
|
2021-02-12 11:40:43 +01:00
|
|
|
* pointed to p_id will be filled with parsed information from the packet
|
2021-02-09 18:09:30 +01:00
|
|
|
* packet, and 0 will be returned. On failure, -1 will be returned.
|
2021-02-12 11:40:43 +01:00
|
|
|
* If is_egress saddr and daddr will match source and destination of packet,
|
|
|
|
* respectively, and identifier will be set to the identifer for an outgoing
|
|
|
|
* packet. Otherwise, saddr and daddr will be swapped (will match
|
|
|
|
* destination and source of packet, respectively), and identifier will be
|
|
|
|
* set to the identifier of a response.
|
2021-02-09 18:09:30 +01:00
|
|
|
*/
|
2021-03-09 19:58:42 +01:00
|
|
|
static int parse_packet_identifier(struct parsing_context *ctx,
|
|
|
|
struct packet_id *p_id, bool *flow_closing)
|
2021-02-09 18:09:30 +01:00
|
|
|
{
|
2021-02-12 11:40:43 +01:00
|
|
|
int proto, err;
|
2021-02-09 18:09:30 +01:00
|
|
|
struct ethhdr *eth;
|
|
|
|
struct iphdr *iph;
|
|
|
|
struct ipv6hdr *ip6h;
|
2021-02-12 11:40:43 +01:00
|
|
|
struct flow_address *saddr, *daddr;
|
|
|
|
|
|
|
|
// Switch saddr <--> daddr on ingress to match egress
|
2021-03-09 19:58:42 +01:00
|
|
|
if (ctx->is_egress) {
|
2021-02-12 11:40:43 +01:00
|
|
|
saddr = &p_id->flow.saddr;
|
|
|
|
daddr = &p_id->flow.daddr;
|
|
|
|
} else {
|
|
|
|
saddr = &p_id->flow.daddr;
|
|
|
|
daddr = &p_id->flow.saddr;
|
|
|
|
}
|
2021-02-09 18:09:30 +01:00
|
|
|
|
2021-02-12 18:31:30 +01:00
|
|
|
proto = parse_ethhdr(&ctx->nh, ctx->data_end, ð);
|
2021-02-09 18:09:30 +01:00
|
|
|
|
|
|
|
// Parse IPv4/6 header
|
|
|
|
if (proto == bpf_htons(ETH_P_IP)) {
|
|
|
|
p_id->flow.ipv = AF_INET;
|
2021-02-12 18:31:30 +01:00
|
|
|
proto = parse_iphdr(&ctx->nh, ctx->data_end, &iph);
|
2021-02-09 18:09:30 +01:00
|
|
|
} else if (proto == bpf_htons(ETH_P_IPV6)) {
|
|
|
|
p_id->flow.ipv = AF_INET6;
|
2021-02-12 18:31:30 +01:00
|
|
|
proto = parse_ip6hdr(&ctx->nh, ctx->data_end, &ip6h);
|
2021-02-12 11:40:43 +01:00
|
|
|
} else {
|
2021-02-09 18:09:30 +01:00
|
|
|
return -1;
|
2021-02-12 11:40:43 +01:00
|
|
|
}
|
2021-02-09 18:09:30 +01:00
|
|
|
|
|
|
|
// Add new protocols here
|
2021-02-12 11:40:43 +01:00
|
|
|
if (proto == IPPROTO_TCP) {
|
2021-03-09 19:58:42 +01:00
|
|
|
err = parse_tcp_identifier(ctx, &saddr->port, &daddr->port,
|
|
|
|
flow_closing, &p_id->identifier);
|
2021-02-12 11:40:43 +01:00
|
|
|
if (err)
|
|
|
|
return -1;
|
|
|
|
} else {
|
2021-02-09 18:09:30 +01:00
|
|
|
return -1;
|
2021-02-12 11:40:43 +01:00
|
|
|
}
|
2021-02-09 18:09:30 +01:00
|
|
|
|
|
|
|
// Sucessfully parsed packet identifier - fill in IP-addresses and return
|
|
|
|
if (p_id->flow.ipv == AF_INET) {
|
|
|
|
map_ipv4_to_ipv6(iph->saddr, &saddr->ip);
|
|
|
|
map_ipv4_to_ipv6(iph->daddr, &daddr->ip);
|
|
|
|
} else { // IPv6
|
|
|
|
saddr->ip = ip6h->saddr;
|
|
|
|
daddr->ip = ip6h->daddr;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
2021-01-07 18:30:53 +01:00
|
|
|
|
2021-04-15 14:13:54 +02:00
|
|
|
// Programs

// TC-BPF for parsing packet identifier from egress traffic and add to map
SEC(EGRESS_PROG_SEC)
int pping_egress(struct __sk_buff *skb)
{
	struct packet_id p_id = { 0 };
	__u64 p_ts;
	struct parsing_context pctx = {
		.data = (void *)(long)skb->data,
		.data_end = (void *)(long)skb->data_end,
		.pkt_len = skb->len,
		.nh = { .pos = pctx.data },
		.is_egress = true,
	};
	bool flow_closing = false;
	struct flow_state *f_state;
	struct flow_state new_state = { 0 };

	if (parse_packet_identifier(&pctx, &p_id, &flow_closing) < 0)
		goto out;

	// Delete flow and create no timestamp entry if flow is closing
	if (flow_closing) {
		bpf_map_delete_elem(&flow_state, &p_id.flow);
		goto out;
	}

	// Check flow state
	f_state = bpf_map_lookup_elem(&flow_state, &p_id.flow);
	if (!f_state) { // No previous state - attempt to create it
		bpf_map_update_elem(&flow_state, &p_id.flow, &new_state,
				    BPF_NOEXIST);
		f_state = bpf_map_lookup_elem(&flow_state, &p_id.flow);
		if (!f_state) // Map full (or concurrent delete) - give up
			goto out;
	}

	// Check if identifier is new
	/* The gap between checking and updating last_id may cause concurrency
	 * issues where multiple packets may simultaneously think they are the
	 * first with a new identifier. As long as all of the identifiers are
	 * the same though, only one should be able to create a timestamp entry.
	 *
	 * A bigger issue is that older identifiers (for example due to
	 * out-of-order packets) may pass this check and update the current
	 * identifier to an old one. This means that both the packet with the
	 * old identifier itself as well the next packet with the current
	 * identifier may be considered packets with new identifiers (even if
	 * both have been seen before). For TCP timestamps this could be
	 * prevented by changing the check to '>=' instead, but it may not be
	 * suitable for other protocols, such as QUIC and its spinbit.
	 *
	 * For now, just hope that the rate limit saves us from creating an
	 * incorrect timestamp. That may however also fail, either due to the
	 * to it happening in a time it's not limited by rate sampling, or
	 * because of rate check failing due to concurrency issues.
	 */
	if (f_state->last_id == p_id.identifier)
		goto out;
	f_state->last_id = p_id.identifier;

	// Check rate-limit
	/*
	 * The window between checking and updating last_timestamp may cause
	 * concurrency issues, where multiple packets simultaneously pass the
	 * rate limit. However, as long as they have the same identifier, only
	 * a single timestamp entry should successfully be created.
	 */
	p_ts = bpf_ktime_get_ns(); // or bpf_ktime_get_boot_ns
	if (p_ts < f_state->last_timestamp ||
	    p_ts - f_state->last_timestamp < config.rate_limit)
		goto out;

	/*
	 * Updates attempt at creating timestamp, even if creation of timestamp
	 * fails (due to map being full). This should make the competition for
	 * the next available map slot somewhat fairer between heavy and sparse
	 * flows.
	 */
	f_state->last_timestamp = p_ts;
	bpf_map_update_elem(&packet_ts, &p_id, &p_ts, BPF_NOEXIST);

out:
	return BPF_OK;
}
|
|
|
|
|
|
|
|
// XDP program for parsing identifier in ingress traffic and check for match in map
|
|
|
|
SEC(INGRESS_PROG_SEC)
|
|
|
|
int pping_ingress(struct xdp_md *ctx)
|
|
|
|
{
|
|
|
|
struct packet_id p_id = { 0 };
|
|
|
|
__u64 *p_ts;
|
|
|
|
struct rtt_event event = { 0 };
|
|
|
|
struct parsing_context pctx = {
|
|
|
|
.data = (void *)(long)ctx->data,
|
|
|
|
.data_end = (void *)(long)ctx->data_end,
|
|
|
|
.pkt_len = pctx.data_end - pctx.data,
|
|
|
|
.nh = { .pos = pctx.data },
|
|
|
|
.is_egress = false,
|
|
|
|
};
|
|
|
|
bool flow_closing = false;
|
|
|
|
|
|
|
|
if (parse_packet_identifier(&pctx, &p_id, &flow_closing) < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
// Delete flow, but allow final attempt at RTT calculation
|
|
|
|
if (flow_closing)
|
|
|
|
bpf_map_delete_elem(&flow_state, &p_id.flow);
|
|
|
|
|
|
|
|
p_ts = bpf_map_lookup_elem(&packet_ts, &p_id);
|
|
|
|
if (!p_ts)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
event.rtt = bpf_ktime_get_ns() - *p_ts;
|
|
|
|
/*
|
|
|
|
* Attempt to delete timestamp entry as soon as RTT is calculated.
|
|
|
|
* But could have potential concurrency issue where multiple packets
|
|
|
|
* manage to match against the identifier before it can be deleted.
|
|
|
|
*/
|
|
|
|
bpf_map_delete_elem(&packet_ts, &p_id);
|
|
|
|
|
|
|
|
__builtin_memcpy(&event.flow, &p_id.flow, sizeof(struct network_tuple));
|
|
|
|
bpf_perf_event_output(ctx, &rtt_events, BPF_F_CURRENT_CPU, &event,
|
|
|
|
sizeof(event));
|
|
|
|
|
|
|
|
out:
|
|
|
|
return XDP_PASS;
|
|
|
|
}
|