mirror of
https://github.com/xdp-project/bpf-examples.git
synced 2024-05-06 15:54:53 +00:00
Add a counter of outstanding (unmatched) timestamped entries in the flow state. Before a timestamp lookup is attempted, check whether there are any outstanding timestamps, and otherwise skip the unnecessary hash map lookup. Use a 32-bit counter for outstanding timestamps to allow atomic increments/decrements using __sync_fetch_and_add; this operation is not supported on smaller integers, which is why such a large counter is used. The atomicity is needed because the counter may be accessed concurrently by the ingress/egress hooks as well as the periodical map cleanup. Signed-off-by: Simon Sundberg <simon.sundberg@kau.se>
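In short, the pattern this commit introduces looks like this (condensed from pping_kern.c below; error handling and surrounding logic omitted):

	/* When creating a timestamp (ingress/egress hook) */
	if (bpf_map_update_elem(&packet_ts, &p_info->pid, &p_info->time,
				BPF_NOEXIST) == 0)
		__sync_fetch_and_add(&f_state->outstanding_timestamps, 1);

	/* When attempting a match - skip the hash lookup if nothing is outstanding */
	if (f_state->outstanding_timestamps == 0)
		return;
	...
	if (bpf_map_delete_elem(&packet_ts, &p_info->reply_pid) == 0)
		__sync_fetch_and_add(&f_state->outstanding_timestamps, -1);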
909 lines
26 KiB
C
/* SPDX-License-Identifier: GPL-2.0-or-later */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <linux/pkt_cls.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <stdbool.h>

// overwrite xdp/parsing_helpers.h value to avoid hitting verifier limit
#ifdef IPV6_EXT_MAX_CHAIN
#undef IPV6_EXT_MAX_CHAIN
#endif
#define IPV6_EXT_MAX_CHAIN 3

#include <xdp/parsing_helpers.h>
#include "pping.h"
#include "pping_debug_cleanup.h"

#define AF_INET 2
#define AF_INET6 10
#define MAX_TCP_OPTIONS 10

// Mask for IPv6 flowlabel + traffic class - used in fib lookup
#define IPV6_FLOWINFO_MASK __cpu_to_be32(0x0FFFFFFF)

// Emit a warning max once per second when failing to add entry to map
#define WARN_MAP_FULL_INTERVAL 1000000000UL

// Time before map entry is considered old and can safely be removed
#define TIMESTAMP_LIFETIME (10 * NS_PER_SECOND) // Clear any timestamp older than this
#define TIMESTAMP_RTT_LIFETIME 8 // Clear timestamp once it is this many times older than RTT
#define FLOW_LIFETIME (300 * NS_PER_SECOND) // Clear any flow that's been inactive this long
#define ICMP_FLOW_LIFETIME (30 * NS_PER_SECOND) // Clear any ICMP flows if they're inactive this long
#define UNOPENED_FLOW_LIFETIME (30 * NS_PER_SECOND) // Clear out flows that have not seen a response after this long

/*
 * Structs for map iteration programs
 * Copied from /tools/testing/selftest/bpf/progs/bpf_iter.h
 */
struct bpf_iter_meta {
	struct seq_file *seq;
	__u64 session_id;
	__u64 seq_num;
} __attribute__((preserve_access_index));

struct bpf_iter__bpf_map_elem {
	struct bpf_iter_meta *meta;
	struct bpf_map *map;
	void *key;
	void *value;
};

/*
 * This struct keeps track of the data and data_end pointers from the xdp_md or
 * __skb_buff contexts, as well as the current parsing position kept in nh.
 * Additionally, it also keeps the length of the entire packet, which together
 * with the other members can be used to determine e.g. how much data each
 * header encloses.
 */
struct parsing_context {
	void *data; // Start of eth hdr
	void *data_end; // End of safe accessible area
	struct hdr_cursor nh; // Position to parse next
	__u32 pkt_len; // Full packet length (headers+data)
	__u32 ingress_ifindex; // Interface packet arrived on
	bool is_egress; // Is packet on egress or ingress?
};

/*
 * Struct filled in by parse_packet_id.
 *
 * Note: As long as parse_packet_id is successful, the flow-parts of pid
 * and reply_pid should be valid, regardless of the values of pid_valid and
 * reply_pid_valid. The *pid_valid members are there to indicate that the
 * identifier part of *pid is valid and can be used for timestamping/lookup.
 * The reason for not keeping the flow parts as entirely separate members
 * is to save some performance by avoiding a copy for lookup/insertion
 * in the packet_ts map.
 */
struct packet_info {
	union {
		struct iphdr *iph;
		struct ipv6hdr *ip6h;
	};
	union {
		struct icmphdr *icmph;
		struct icmp6hdr *icmp6h;
		struct tcphdr *tcph;
	};
	__u64 time; // Arrival time of packet
	__u32 payload; // Size of packet data (excluding headers)
	struct packet_id pid; // identifier to timestamp (ex. TSval)
	struct packet_id reply_pid; // identifier to match against (ex. TSecr)
	bool pid_valid; // identifier can be used to timestamp packet
	bool reply_pid_valid; // reply_identifier can be used to match packet
	enum flow_event_type event_type; // flow event triggered by packet
	enum flow_event_reason event_reason; // reason for triggering flow event
};

char _license[] SEC("license") = "GPL";
// Global config struct - set from userspace
static volatile const struct bpf_config config = {};
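// Note: being 'volatile const', config lands in the BPF object's .rodata,
// which userspace is expected to fill in before load (e.g. via a libbpf
// skeleton's rodata pointer or bpf_map__set_initial_value()).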
static volatile __u64 last_warn_time[2] = { 0 };


// Map definitions
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, struct packet_id);
	__type(value, __u64);
	__uint(max_entries, 16384);
} packet_ts SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, struct network_tuple);
	__type(value, struct flow_state);
	__uint(max_entries, 16384);
} flow_state SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u32));
} events SEC(".maps");


// Help functions

/*
 * Maps an IPv4 address into an IPv6 address according to RFC 4291 sec 2.5.5.2
 */
static void map_ipv4_to_ipv6(struct in6_addr *ipv6, __be32 ipv4)
{
	__builtin_memset(&ipv6->in6_u.u6_addr8[0], 0x00, 10);
	__builtin_memset(&ipv6->in6_u.u6_addr8[10], 0xff, 2);
	ipv6->in6_u.u6_addr32[3] = ipv4;
}
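// Example: 192.0.2.1 becomes the IPv4-mapped IPv6 address ::ffff:192.0.2.1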

/*
 * Returns the number of unparsed bytes left in the packet (bytes after nh.pos)
 */
static __u32 remaining_pkt_payload(struct parsing_context *ctx)
{
	// pkt_len - (pos - data) fails because compiler transforms it to pkt_len - pos + data (pkt_len - pos not ok because value - pointer)
	// data + pkt_len - pos fails on (data+pkt_len) - pos due to math between pkt_pointer and unbounded register
	__u32 parsed_bytes = ctx->nh.pos - ctx->data;
	return parsed_bytes < ctx->pkt_len ? ctx->pkt_len - parsed_bytes : 0;
}

/*
 * Convenience function for getting the corresponding reverse flow.
 * PPing needs to keep track of flow in both directions, and sometimes
 * also needs to reverse the flow to report the "correct" (consistent
 * with Kathie's PPing) src and dest address.
 */
static void reverse_flow(struct network_tuple *dest, struct network_tuple *src)
{
	dest->ipv = src->ipv;
	dest->proto = src->proto;
	dest->saddr = src->daddr;
	dest->daddr = src->saddr;
	dest->reserved = 0;
}

/*
 * Parses the TSval and TSecr values from the TCP options field. If successful
 * the TSval and TSecr values will be stored at tsval and tsecr (in network
 * byte order).
 * Returns 0 if successful and -1 on failure
 */
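/*
 * TCP timestamp option layout (RFC 7323):
 *   kind (1 byte, = 8) | length (1 byte, = 10) | TSval (4 bytes) | TSecr (4 bytes)
 */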
static int parse_tcp_ts(struct tcphdr *tcph, void *data_end, __u32 *tsval,
			__u32 *tsecr)
{
	int len = tcph->doff << 2;
	void *opt_end = (void *)tcph + len;
	__u8 *pos = (__u8 *)(tcph + 1); // Current pos in TCP options
	__u8 i, opt;
	volatile __u8
		opt_size; // Seems to ensure it's always read off the stack as u8

	if (tcph + 1 > data_end || len <= sizeof(struct tcphdr))
		return -1;
#pragma unroll // temporary solution until we can identify why the non-unrolled loop gets stuck in an infinite loop
	for (i = 0; i < MAX_TCP_OPTIONS; i++) {
		if (pos + 1 > opt_end || pos + 1 > data_end)
			return -1;

		opt = *pos;
		if (opt == 0) // Reached end of TCP options
			return -1;

		if (opt == 1) { // TCP NOP option - advance one byte
			pos++;
			continue;
		}

		// Option > 1, should have option size
		if (pos + 2 > opt_end || pos + 2 > data_end)
			return -1;
		opt_size = *(pos + 1);
		if (opt_size < 2) // Stop parsing options if opt_size has an invalid value
			return -1;

		// Option-kind is TCP timestamp (yey!)
		if (opt == 8 && opt_size == 10) {
			if (pos + 10 > opt_end || pos + 10 > data_end)
				return -1;
			*tsval = *(__u32 *)(pos + 2);
			*tsecr = *(__u32 *)(pos + 6);
			return 0;
		}

		// Some other TCP option - advance option-length bytes
		pos += opt_size;
	}
	return -1;
}

/*
 * Attempts to fetch an identifier for TCP packets, based on the TCP timestamp
 * option.
 *
 * Will use the TSval as pid and TSecr as reply_pid, and the TCP source and dest
 * as port numbers.
 *
 * If successful, the pid (identifier + flow.port), reply_pid, pid_valid,
 * reply_pid_valid, event_type and event_reason members of p_info will be set
 * appropriately and 0 will be returned.
 * On failure -1 will be returned (no guarantees on values set in p_info).
 */
static int parse_tcp_identifier(struct parsing_context *pctx,
				struct packet_info *p_info)
{
	if (parse_tcphdr(&pctx->nh, pctx->data_end, &p_info->tcph) < 0)
		return -1;

	if (parse_tcp_ts(p_info->tcph, pctx->data_end, &p_info->pid.identifier,
			 &p_info->reply_pid.identifier) < 0)
		return -1; // Possible TODO, fall back on seq/ack instead

	p_info->pid.flow.saddr.port = p_info->tcph->source;
	p_info->pid.flow.daddr.port = p_info->tcph->dest;

	// Do not timestamp pure ACKs (no payload)
	p_info->pid_valid =
		pctx->nh.pos - pctx->data < pctx->pkt_len || p_info->tcph->syn;

	// Do not match on non-ACKs (TSecr not valid)
	p_info->reply_pid_valid = p_info->tcph->ack;

	// Check if connection is opening/closing
	if (p_info->tcph->rst) {
		p_info->event_type = FLOW_EVENT_CLOSING_BOTH;
		p_info->event_reason = EVENT_REASON_RST;
	} else if (p_info->tcph->fin) {
		p_info->event_type = FLOW_EVENT_CLOSING;
		p_info->event_reason = EVENT_REASON_FIN;
	} else if (p_info->tcph->syn) {
		p_info->event_type = FLOW_EVENT_OPENING;
		p_info->event_reason = p_info->tcph->ack ?
					       EVENT_REASON_SYN_ACK :
					       EVENT_REASON_SYN;
	} else {
		p_info->event_type = FLOW_EVENT_NONE;
	}

	return 0;
}

/*
 * Attempts to fetch an identifier for an ICMPv6 header, based on the echo
 * request/reply sequence number.
 *
 * Will use the echo sequence number as pid/reply_pid and the echo identifier
 * as port numbers. Echo requests will only generate a valid pid and echo
 * replies will only generate a valid reply_pid.
 *
 * If successful, the pid (identifier + flow.port), reply_pid, pid_valid,
 * reply_pid_valid and event_type of p_info will be set appropriately and 0
 * will be returned.
 * On failure, -1 will be returned (no guarantees on p_info members).
 *
 * Note: Will store the 16-bit sequence number in network byte order
 * in the 32-bit (reply_)pid.identifier.
 */
static int parse_icmp6_identifier(struct parsing_context *pctx,
				  struct packet_info *p_info)
{
	if (parse_icmp6hdr(&pctx->nh, pctx->data_end, &p_info->icmp6h) < 0)
		return -1;

	if (p_info->icmp6h->icmp6_code != 0)
		return -1;

	if (p_info->icmp6h->icmp6_type == ICMPV6_ECHO_REQUEST) {
		p_info->pid.identifier = p_info->icmp6h->icmp6_sequence;
		p_info->pid_valid = true;
		p_info->reply_pid_valid = false;
	} else if (p_info->icmp6h->icmp6_type == ICMPV6_ECHO_REPLY) {
		p_info->reply_pid.identifier = p_info->icmp6h->icmp6_sequence;
		p_info->reply_pid_valid = true;
		p_info->pid_valid = false;
	} else {
		return -1;
	}

	p_info->event_type = FLOW_EVENT_NONE;
	p_info->pid.flow.saddr.port = p_info->icmp6h->icmp6_identifier;
	p_info->pid.flow.daddr.port = p_info->pid.flow.saddr.port;
	return 0;
}

/*
 * Same as parse_icmp6_identifier, but for an ICMP(v4) header instead.
 */
static int parse_icmp_identifier(struct parsing_context *pctx,
				 struct packet_info *p_info)
{
	if (parse_icmphdr(&pctx->nh, pctx->data_end, &p_info->icmph) < 0)
		return -1;

	if (p_info->icmph->code != 0)
		return -1;

	if (p_info->icmph->type == ICMP_ECHO) {
		p_info->pid.identifier = p_info->icmph->un.echo.sequence;
		p_info->pid_valid = true;
		p_info->reply_pid_valid = false;
	} else if (p_info->icmph->type == ICMP_ECHOREPLY) {
		p_info->reply_pid.identifier = p_info->icmph->un.echo.sequence;
		p_info->reply_pid_valid = true;
		p_info->pid_valid = false;
	} else {
		return -1;
	}

	p_info->event_type = FLOW_EVENT_NONE;
	p_info->pid.flow.saddr.port = p_info->icmph->un.echo.id;
	p_info->pid.flow.daddr.port = p_info->pid.flow.saddr.port;
	return 0;
}

/*
 * Attempts to parse the packet defined by pctx for a valid packet identifier
 * and reply identifier, filling in p_info.
 *
 * If successful, all members of p_info will be set appropriately and 0 will
 * be returned.
 * On failure -1 will be returned (no guarantees on p_info members).
 */
static int parse_packet_identifier(struct parsing_context *pctx,
				   struct packet_info *p_info)
{
	int proto, err;
	struct ethhdr *eth;

	p_info->time = bpf_ktime_get_ns();
	proto = parse_ethhdr(&pctx->nh, pctx->data_end, &eth);

	// Parse IPv4/6 header
	if (proto == bpf_htons(ETH_P_IP)) {
		p_info->pid.flow.ipv = AF_INET;
		p_info->pid.flow.proto =
			parse_iphdr(&pctx->nh, pctx->data_end, &p_info->iph);
	} else if (proto == bpf_htons(ETH_P_IPV6)) {
		p_info->pid.flow.ipv = AF_INET6;
		p_info->pid.flow.proto =
			parse_ip6hdr(&pctx->nh, pctx->data_end, &p_info->ip6h);
	} else {
		return -1;
	}

	// Parse identifier from suitable protocol
	if (config.track_tcp && p_info->pid.flow.proto == IPPROTO_TCP)
		err = parse_tcp_identifier(pctx, p_info);
	else if (config.track_icmp &&
		 p_info->pid.flow.proto == IPPROTO_ICMPV6 &&
		 p_info->pid.flow.ipv == AF_INET6)
		err = parse_icmp6_identifier(pctx, p_info);
	else if (config.track_icmp && p_info->pid.flow.proto == IPPROTO_ICMP &&
		 p_info->pid.flow.ipv == AF_INET)
		err = parse_icmp_identifier(pctx, p_info);
	else
		return -1; // No matching protocol
	if (err)
		return -1; // Failed parsing protocol

	// Successfully parsed packet identifier - fill in IP-addresses and return
	if (p_info->pid.flow.ipv == AF_INET) {
		map_ipv4_to_ipv6(&p_info->pid.flow.saddr.ip,
				 p_info->iph->saddr);
		map_ipv4_to_ipv6(&p_info->pid.flow.daddr.ip,
				 p_info->iph->daddr);
	} else { // IPv6
		p_info->pid.flow.saddr.ip = p_info->ip6h->saddr;
		p_info->pid.flow.daddr.ip = p_info->ip6h->daddr;
	}

	reverse_flow(&p_info->reply_pid.flow, &p_info->pid.flow);
	p_info->payload = remaining_pkt_payload(pctx);

	return 0;
}

/*
 * Calculate a smoothed rtt similar to how TCP stack does it in
 * net/ipv4/tcp_input.c/tcp_rtt_estimator().
 *
 * NOTE: Will cause roundoff errors, but if RTTs > 1000ns errors should be small
 */
static __u64 calculate_srtt(__u64 prev_srtt, __u64 rtt)
{
	if (!prev_srtt)
		return rtt;
	// srtt = 7/8*prev_srtt + 1/8*rtt
	return prev_srtt - (prev_srtt >> 3) + (rtt >> 3);
}
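// Example: prev_srtt = 8000 ns, rtt = 4000 ns
// => srtt = 8000 - 8000/8 + 4000/8 = 8000 - 1000 + 500 = 7500 ns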

static bool is_rate_limited(__u64 now, __u64 last_ts, __u64 rtt)
{
	if (now < last_ts)
		return true;

	// RTT-based rate limit
	if (config.rtt_rate && rtt)
		return now - last_ts < FIXPOINT_TO_UINT(config.rtt_rate * rtt);

	// Static rate limit
	return now - last_ts < config.rate_limit;
}
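// Example (assuming config.rate_limit is in ns, like the timestamps): with
// rtt_rate unset and rate_limit = 100000000 (100 ms), each flow gets at most
// one new timestamp per 100 ms.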

/*
 * Send a flow opening event through the perf-buffer.
 * As these events are only sent upon receiving a reply, need to access state
 * of the reverse flow to get reason flow was opened and when the original
 * packet opening the flow was sent.
 */
static void send_flow_open_event(void *ctx, struct packet_info *p_info,
				 struct flow_state *rev_flow)
{
	struct flow_event fe = {
		.event_type = EVENT_TYPE_FLOW,
		.flow_event_type = FLOW_EVENT_OPENING,
		.source = EVENT_SOURCE_PKT_DEST,
		.flow = p_info->pid.flow,
		.reason = rev_flow->opening_reason,
		.timestamp = rev_flow->last_timestamp,
		.reserved = 0,
	};

	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &fe, sizeof(fe));
}

/*
 * Sends a flow-event message based on p_info.
 *
 * The rev_flow argument is used to inform if the message is for the flow
 * in the current direction or the reverse flow, and will adapt the flow and
 * source members accordingly.
 */
static void send_flow_event(void *ctx, struct packet_info *p_info,
			    bool rev_flow)
{
	struct flow_event fe = {
		.event_type = EVENT_TYPE_FLOW,
		.flow_event_type = p_info->event_type,
		.reason = p_info->event_reason,
		.timestamp = p_info->time,
		.reserved = 0, // Make sure it's initialized
	};

	if (rev_flow) {
		fe.flow = p_info->pid.flow;
		fe.source = EVENT_SOURCE_PKT_SRC;
	} else {
		fe.flow = p_info->reply_pid.flow;
		fe.source = EVENT_SOURCE_PKT_DEST;
	}

	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &fe, sizeof(fe));
}

/*
 * Send a map-full event for the map.
 * Will only trigger once every WARN_MAP_FULL_INTERVAL
 */
static void send_map_full_event(void *ctx, struct packet_info *p_info,
				enum pping_map map)
{
	struct map_full_event me;

	if (p_info->time < last_warn_time[map] ||
	    p_info->time - last_warn_time[map] < WARN_MAP_FULL_INTERVAL)
		return;

	last_warn_time[map] = p_info->time;

	__builtin_memset(&me, 0, sizeof(me));
	me.event_type = EVENT_TYPE_MAP_FULL;
	me.timestamp = p_info->time;
	me.flow = p_info->pid.flow;
	me.map = map;

	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &me, sizeof(me));
}

/*
 * Attempt to create a new flow-state.
 * Returns a pointer to the flow_state if successful, NULL otherwise
 */
static struct flow_state *create_flow(void *ctx, struct packet_info *p_info)
{
	struct flow_state new_state = { 0 };

	new_state.last_timestamp = p_info->time;
	new_state.opening_reason = p_info->event_type == FLOW_EVENT_OPENING ?
					   p_info->event_reason :
					   EVENT_REASON_FIRST_OBS_PCKT;

	if (bpf_map_update_elem(&flow_state, &p_info->pid.flow, &new_state,
				BPF_NOEXIST) != 0) {
		send_map_full_event(ctx, p_info, PPING_MAP_FLOWSTATE);
		return NULL;
	}

	return bpf_map_lookup_elem(&flow_state, &p_info->pid.flow);
}

static struct flow_state *update_flow(void *ctx, struct packet_info *p_info,
				      bool *new_flow)
{
	struct flow_state *f_state;
	*new_flow = false;

	f_state = bpf_map_lookup_elem(&flow_state, &p_info->pid.flow);

	// Attempt to create flow if it does not exist
	if (!f_state && p_info->pid_valid &&
	    !(p_info->event_type == FLOW_EVENT_CLOSING ||
	      p_info->event_type == FLOW_EVENT_CLOSING_BOTH)) {
		*new_flow = true;
		f_state = create_flow(ctx, p_info);
	}

	if (!f_state)
		return NULL;

	// Update flow state
	f_state->sent_pkts++;
	f_state->sent_bytes += p_info->payload;

	return f_state;
}

static struct flow_state *update_rev_flow(void *ctx, struct packet_info *p_info)
{
	struct flow_state *f_state;

	f_state = bpf_map_lookup_elem(&flow_state, &p_info->reply_pid.flow);
	if (!f_state)
		return NULL;

	// Is a new flow, push opening flow message
	if (!f_state->has_opened &&
	    p_info->event_type != FLOW_EVENT_CLOSING_BOTH) {
		f_state->has_opened = true;
		send_flow_open_event(ctx, p_info, f_state);
	}

	// Update flow state
	f_state->rec_pkts++;
	f_state->rec_bytes += p_info->payload;

	return f_state;
}

static void delete_closed_flows(void *ctx, struct packet_info *p_info,
				struct flow_state *flow,
				struct flow_state *rev_flow)
{
	bool has_opened;

	// Flow closing - try to delete flow state and push closing-event
	if (flow && (p_info->event_type == FLOW_EVENT_CLOSING ||
		     p_info->event_type == FLOW_EVENT_CLOSING_BOTH)) {
		has_opened = flow->has_opened;
		if (bpf_map_delete_elem(&flow_state, &p_info->pid.flow) == 0) {
			debug_increment_autodel(PPING_MAP_FLOWSTATE);
			if (has_opened)
				send_flow_event(ctx, p_info, false);
		}
	}

	// Also close reverse flow
	if (rev_flow && p_info->event_type == FLOW_EVENT_CLOSING_BOTH) {
		has_opened = rev_flow->has_opened;
		if (bpf_map_delete_elem(&flow_state, &p_info->reply_pid.flow) ==
		    0) {
			debug_increment_autodel(PPING_MAP_FLOWSTATE);
			if (has_opened)
				send_flow_event(ctx, p_info, true);
		}
	}
}

/*
 * Return true if p_info->pid.flow.daddr is a "local" address.
 *
 * Works by performing a fib lookup for p_info->pid.flow.
 * Lookup struct filled based on examples from
 * samples/bpf/xdp_fwd_kern.c/xdp_fwd_flags() and
 * tools/testing/selftests/bpf/progs/test_tc_neigh_fib.c
 */
static bool is_local_address(struct packet_info *p_info, void *ctx,
			     struct parsing_context *pctx)
{
	int ret;
	struct bpf_fib_lookup lookup;
	__builtin_memset(&lookup, 0, sizeof(lookup));

	lookup.ifindex = pctx->ingress_ifindex;
	lookup.family = p_info->pid.flow.ipv;

	if (lookup.family == AF_INET) {
		lookup.tos = p_info->iph->tos;
		lookup.tot_len = bpf_ntohs(p_info->iph->tot_len);
		lookup.ipv4_src = p_info->iph->saddr;
		lookup.ipv4_dst = p_info->iph->daddr;
	} else if (lookup.family == AF_INET6) {
		struct in6_addr *src = (struct in6_addr *)lookup.ipv6_src;
		struct in6_addr *dst = (struct in6_addr *)lookup.ipv6_dst;

		lookup.flowinfo = *(__be32 *)p_info->ip6h & IPV6_FLOWINFO_MASK;
		lookup.tot_len = bpf_ntohs(p_info->ip6h->payload_len);
		*src = p_info->pid.flow.saddr.ip; // verifier did not like ip6h->saddr
		*dst = p_info->pid.flow.daddr.ip;
	}

	lookup.l4_protocol = p_info->pid.flow.proto;
	lookup.sport = 0;
	lookup.dport = 0;

	ret = bpf_fib_lookup(ctx, &lookup, sizeof(lookup), 0);

	return ret == BPF_FIB_LKUP_RET_NOT_FWDED ||
	       ret == BPF_FIB_LKUP_RET_FWD_DISABLED;
}

/*
 * Attempt to create a timestamp-entry for packet p_info for flow in f_state
 */
static void pping_timestamp_packet(struct flow_state *f_state, void *ctx,
				   struct parsing_context *pctx,
				   struct packet_info *p_info, bool new_flow)
{
	if (!f_state || !p_info->pid_valid)
		return;

	if (config.localfilt && !pctx->is_egress &&
	    is_local_address(p_info, ctx, pctx))
		return;

	// Check if identifier is new
	if (!new_flow && f_state->last_id == p_info->pid.identifier)
		return;
	f_state->last_id = p_info->pid.identifier;

	// Check rate-limit
	if (!new_flow &&
	    is_rate_limited(p_info->time, f_state->last_timestamp,
			    config.use_srtt ? f_state->srtt : f_state->min_rtt))
		return;

	/*
	 * Updates attempt at creating timestamp, even if creation of timestamp
	 * fails (due to map being full). This should make the competition for
	 * the next available map slot somewhat fairer between heavy and sparse
	 * flows.
	 */
	f_state->last_timestamp = p_info->time;

	if (bpf_map_update_elem(&packet_ts, &p_info->pid, &p_info->time,
				BPF_NOEXIST) == 0)
		__sync_fetch_and_add(&f_state->outstanding_timestamps, 1);
	else
		send_map_full_event(ctx, p_info, PPING_MAP_PACKETTS);
}

/*
 * Attempt to match packet in p_info with a timestamp from flow in f_state
 */
static void pping_match_packet(struct flow_state *f_state, void *ctx,
			       struct parsing_context *pctx,
			       struct packet_info *p_info)
{
	struct rtt_event re = { 0 };
	__u64 *p_ts;

	if (!f_state || !p_info->reply_pid_valid)
		return;

	if (f_state->outstanding_timestamps == 0)
		return;

	p_ts = bpf_map_lookup_elem(&packet_ts, &p_info->reply_pid);
	if (!p_ts || p_info->time < *p_ts)
		return;

	re.rtt = p_info->time - *p_ts;

	// Delete timestamp entry as soon as RTT is calculated
	if (bpf_map_delete_elem(&packet_ts, &p_info->reply_pid) == 0) {
		__sync_fetch_and_add(&f_state->outstanding_timestamps, -1);
		debug_increment_autodel(PPING_MAP_PACKETTS);
	}

	if (f_state->min_rtt == 0 || re.rtt < f_state->min_rtt)
		f_state->min_rtt = re.rtt;
	f_state->srtt = calculate_srtt(f_state->srtt, re.rtt);

	// Fill event and push to perf-buffer
	re.event_type = EVENT_TYPE_RTT;
	re.timestamp = p_info->time;
	re.min_rtt = f_state->min_rtt;
	re.sent_pkts = f_state->sent_pkts;
	re.sent_bytes = f_state->sent_bytes;
	re.rec_pkts = f_state->rec_pkts;
	re.rec_bytes = f_state->rec_bytes;
	re.flow = p_info->pid.flow;
	re.match_on_egress = pctx->is_egress;
	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &re, sizeof(re));
}

/*
 * Will parse the ingress/egress packet in pctx and attempt to create a
 * timestamp for it and match it against the reverse flow.
 */
static void pping(void *ctx, struct parsing_context *pctx)
{
	struct packet_info p_info = { 0 };
	struct flow_state *flow, *rev_flow;
	bool new_flow;

	if (parse_packet_identifier(pctx, &p_info) < 0)
		return;

	flow = update_flow(ctx, &p_info, &new_flow);
	pping_timestamp_packet(flow, ctx, pctx, &p_info, new_flow);

	rev_flow = update_rev_flow(ctx, &p_info);
	pping_match_packet(rev_flow, ctx, pctx, &p_info);

	delete_closed_flows(ctx, &p_info, flow, rev_flow);
}

// Programs

// Egress path using TC-BPF
SEC("tc")
int pping_tc_egress(struct __sk_buff *skb)
{
	struct parsing_context pctx = {
		.data = (void *)(long)skb->data,
		.data_end = (void *)(long)skb->data_end,
		.pkt_len = skb->len,
		.nh = { .pos = pctx.data },
		.is_egress = true,
	};

	pping(skb, &pctx);

	return TC_ACT_UNSPEC;
}

// Ingress path using TC-BPF
SEC("tc")
int pping_tc_ingress(struct __sk_buff *skb)
{
	struct parsing_context pctx = {
		.data = (void *)(long)skb->data,
		.data_end = (void *)(long)skb->data_end,
		.pkt_len = skb->len,
		.nh = { .pos = pctx.data },
		.ingress_ifindex = skb->ingress_ifindex,
		.is_egress = false,
	};

	pping(skb, &pctx);

	return TC_ACT_UNSPEC;
}

// Ingress path using XDP
SEC("xdp")
int pping_xdp_ingress(struct xdp_md *ctx)
{
	struct parsing_context pctx = {
		.data = (void *)(long)ctx->data,
		.data_end = (void *)(long)ctx->data_end,
		.pkt_len = pctx.data_end - pctx.data,
		.nh = { .pos = pctx.data },
		.ingress_ifindex = ctx->ingress_ifindex,
		.is_egress = false,
	};

	pping(ctx, &pctx);

	return XDP_PASS;
}
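
/*
 * The programs above are normally loaded and attached by the accompanying
 * pping userspace loader. For manual testing, something along these lines
 * should work (illustrative iproute2 commands, not part of this project):
 *
 *   tc qdisc add dev eth0 clsact
 *   tc filter add dev eth0 egress bpf direct-action obj pping_kern.o sec tc
 *   ip link set dev eth0 xdp obj pping_kern.o sec xdp
 */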
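
/*
 * The cleanup programs below are bpf_map_elem iterators. Userspace would
 * typically attach them with bpf_program__attach_iter(), create an iterator
 * fd with bpf_iter_create() and trigger a cleanup pass by read()ing from
 * that fd (a sketch of the intended usage, not shown in this file).
 */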
SEC("iter/bpf_map_elem")
|
|
int tsmap_cleanup(struct bpf_iter__bpf_map_elem *ctx)
|
|
{
|
|
struct packet_id local_pid;
|
|
struct flow_state *f_state;
|
|
struct packet_id *pid = ctx->key;
|
|
__u64 *timestamp = ctx->value;
|
|
__u64 now = bpf_ktime_get_ns();
|
|
__u64 rtt;
|
|
|
|
debug_update_mapclean_stats(ctx, &events, !ctx->key || !ctx->value,
|
|
ctx->meta->seq_num, now,
|
|
PPING_MAP_PACKETTS);
|
|
|
|
if (!pid || !timestamp)
|
|
return 0;
|
|
if (now <= *timestamp)
|
|
return 0;
|
|
|
|
/* Seems like the key for map lookup operations must be
|
|
on the stack, so copy pid to local_pid. */
|
|
__builtin_memcpy(&local_pid, pid, sizeof(local_pid));
|
|
f_state = bpf_map_lookup_elem(&flow_state, &local_pid.flow);
|
|
rtt = f_state ? f_state->srtt : 0;
|
|
|
|
if ((rtt && now - *timestamp > rtt * TIMESTAMP_RTT_LIFETIME) ||
|
|
now - *timestamp > TIMESTAMP_LIFETIME) {
|
|
if (bpf_map_delete_elem(&packet_ts, &local_pid) == 0) {
|
|
debug_increment_timeoutdel(PPING_MAP_PACKETTS);
|
|
|
|
if (f_state)
|
|
__sync_fetch_and_add(
|
|
&f_state->outstanding_timestamps, -1);
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
SEC("iter/bpf_map_elem")
|
|
int flowmap_cleanup(struct bpf_iter__bpf_map_elem *ctx)
|
|
{
|
|
struct network_tuple local_flow;
|
|
struct network_tuple *flow = ctx->key;
|
|
struct flow_state *f_state = ctx->value;
|
|
struct flow_event fe;
|
|
__u64 now = bpf_ktime_get_ns();
|
|
bool has_opened;
|
|
__u64 age;
|
|
|
|
debug_update_mapclean_stats(ctx, &events, !ctx->key || !ctx->value,
|
|
ctx->meta->seq_num, now,
|
|
PPING_MAP_FLOWSTATE);
|
|
|
|
if (!flow || !f_state)
|
|
return 0;
|
|
if (now < f_state->last_timestamp)
|
|
return 0;
|
|
|
|
// Check if flow is too old for unopned/ICMP/other flow type
|
|
age = now - f_state->last_timestamp;
|
|
has_opened = f_state->has_opened;
|
|
|
|
if ((!has_opened && age > UNOPENED_FLOW_LIFETIME) ||
|
|
((flow->proto == IPPROTO_ICMP || flow->proto == IPPROTO_ICMPV6) &&
|
|
age > ICMP_FLOW_LIFETIME) ||
|
|
age > FLOW_LIFETIME) {
|
|
__builtin_memcpy(&local_flow, flow, sizeof(local_flow));
|
|
has_opened = f_state->has_opened;
|
|
|
|
if (bpf_map_delete_elem(&flow_state, &local_flow) == 0) {
|
|
debug_increment_timeoutdel(PPING_MAP_FLOWSTATE);
|
|
|
|
if (has_opened) {
|
|
reverse_flow(&fe.flow, &local_flow);
|
|
fe.event_type = EVENT_TYPE_FLOW;
|
|
fe.timestamp = now;
|
|
fe.flow_event_type = FLOW_EVENT_CLOSING;
|
|
fe.reason = EVENT_REASON_FLOW_TIMEOUT;
|
|
fe.source = EVENT_SOURCE_GC;
|
|
fe.reserved = 0;
|
|
bpf_perf_event_output(ctx, &events,
|
|
BPF_F_CURRENT_CPU, &fe,
|
|
sizeof(fe));
|
|
}
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|