Use global functions to make use of function-by-function verification. This allows the verifier to analyze parts of the program independently of each other, which should reduce the verification complexity (the number of instructions the verifier must go through to verify the program) and help prevent the complexity from growing exponentially with every loop or branch added to the code.

In this case, break out the packet parsing (parse_packet_identifier) into a global function, so that it can be handled separately from the logic after it (updating flow state, saving timestamps, matching replies to timestamps, calculating and pushing RTTs etc.). To do this, create small separate wrapper functions (parse_packet_identifier_tc() and parse_packet_identifier_xdp()) for tc/XDP, so that the verifier can correctly identify the arguments as pointers to context (PTR_TO_CTX) when evaluating the global functions. Also create small wrapper functions pping_tc() and pping_xdp() which call the corresponding parse_packet_identifier_tc/xdp function.

For this to work in XDP mode (which is the default), the kernel must have been patched with a fix that addresses an issue with how global functions are verified for XDP programs, see: https://lore.kernel.org/all/20220606075253.28422-1-toke@redhat.com/

Signed-off-by: Simon Sundberg <simon.sundberg@kau.se>
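As a rough illustration of the pattern this commit describes, here is a minimal sketch with hypothetical names (not code from this commit; the real functions in the listing below are parse_packet_identifier_tc/xdp and pping_tc/xdp):

struct my_info { __u64 ts; }; /* hypothetical result struct */

/* A global (non-static, __noinline) function is verified once, on its own,
 * instead of being re-analyzed at every call site. Its first argument must
 * be the program's context type so the verifier treats it as PTR_TO_CTX
 * and accepts packet-pointer accesses inside. */
__noinline int parse_pkt_xdp(struct xdp_md *ctx, struct my_info *info)
{
        void *data = (void *)(long)ctx->data;
        void *data_end = (void *)(long)ctx->data_end;

        if (data + sizeof(struct ethhdr) > data_end)
                return -1;
        /* ...fill in *info from the packet... */
        return 0;
}

SEC("xdp")
int my_xdp_prog(struct xdp_md *ctx)
{
        struct my_info info = {};

        if (parse_pkt_xdp(ctx, &info) < 0)
                return XDP_PASS;
        /* ...the rest of the logic, verified separately... */
        return XDP_PASS;
}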
/* SPDX-License-Identifier: GPL-2.0-or-later */

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <linux/pkt_cls.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <stdbool.h>

// overwrite xdp/parsing_helpers.h value to avoid hitting verifier limit
#ifdef IPV6_EXT_MAX_CHAIN
#undef IPV6_EXT_MAX_CHAIN
#endif
#define IPV6_EXT_MAX_CHAIN 3

#include <xdp/parsing_helpers.h>
#include "pping.h"
#include "pping_debug_cleanup.h"

#ifndef AF_INET
#define AF_INET 2
#endif
#ifndef AF_INET6
#define AF_INET6 10
#endif

#define MAX_TCP_OPTIONS 10

// Mask for IPv6 flowlabel + traffic class - used in fib lookup
#define IPV6_FLOWINFO_MASK __cpu_to_be32(0x0FFFFFFF)

// Emit a warning at most once per second when failing to add an entry to a map
#define WARN_MAP_FULL_INTERVAL 1000000000UL

// Time before a map entry is considered old and can safely be removed
#define TIMESTAMP_LIFETIME (10 * NS_PER_SECOND) // Clear any timestamp older than this
#define TIMESTAMP_RTT_LIFETIME 8 // Clear timestamp once it is this many times older than RTT
#define FLOW_LIFETIME (300 * NS_PER_SECOND) // Clear any flow that's been inactive this long
#define ICMP_FLOW_LIFETIME (30 * NS_PER_SECOND) // Clear any ICMP flows if they're inactive this long
#define UNOPENED_FLOW_LIFETIME (30 * NS_PER_SECOND) // Clear out flows that have not seen a response after this long

#define MAX_MEMCMP_SIZE 128

/*
 * Structs for map iteration programs
 * Copied from /tools/testing/selftests/bpf/progs/bpf_iter.h
 */
struct bpf_iter_meta {
        struct seq_file *seq;
        __u64 session_id;
        __u64 seq_num;
} __attribute__((preserve_access_index));

struct bpf_iter__bpf_map_elem {
        struct bpf_iter_meta *meta;
        struct bpf_map *map;
        void *key;
        void *value;
};

/*
 * This struct keeps track of the data and data_end pointers from the xdp_md
 * or __sk_buff contexts, as well as the current parsing position kept in nh.
 * Additionally, it also keeps the length of the entire packet, which together
 * with the other members can be used to determine e.g. how much data each
 * header encloses.
 */
struct parsing_context {
        void *data;           // Start of eth hdr
        void *data_end;       // End of safe accessible area
        struct hdr_cursor nh; // Position to parse next
        __u32 pkt_len;        // Full packet length (headers + data)
};

/*
 * Struct filled in by parse_packet_identifier.
 *
 * Note: As long as parse_packet_identifier is successful, the flow parts of
 * pid and reply_pid are valid, regardless of the values of pid_valid and
 * reply_pid_valid. The *pid_valid members are there to indicate that the
 * identifier part of *pid is valid and can be used for timestamping/lookup.
 * The reason for not keeping the flow parts as entirely separate members
 * is to save some performance by avoiding a copy for lookup/insertion
 * in the packet_ts map.
 */
struct packet_info {
        __u64 time;                 // Arrival time of packet
        __u32 payload;              // Size of packet data (excluding headers)
        struct packet_id pid;       // flow + identifier to timestamp (ex. TSval)
        struct packet_id reply_pid; // rev. flow + identifier to match against (ex. TSecr)
        __u32 ingress_ifindex;      // Interface packet arrived on (if is_ingress, otherwise not valid)
        union {                     // The IP-level "type of service" (DSCP for IPv4, traffic class + flow label for IPv6)
                __u8 ipv4_tos;
                __be32 ipv6_tos;
        } ip_tos;
        __u16 ip_len;               // The IPv4 total length or IPv6 payload length
        bool is_ingress;            // Packet on egress or ingress?
        bool pid_flow_is_dfkey;     // Used to determine which member of dualflow state to use for forward direction
        bool pid_valid;             // identifier can be used to timestamp packet
        bool reply_pid_valid;       // reply_identifier can be used to match packet
        enum flow_event_type event_type;     // flow event triggered by packet
        enum flow_event_reason event_reason; // reason for triggering flow event
};

/*
 * Struct filled in by protocol id parsers (ex. parse_tcp_identifier)
 */
struct protocol_info {
        __u32 pid;
        __u32 reply_pid;
        bool pid_valid;
        bool reply_pid_valid;
        enum flow_event_type event_type;
        enum flow_event_reason event_reason;
};

char _license[] SEC("license") = "GPL";

// Global config struct - set from userspace
static volatile const struct bpf_config config = {};
static volatile __u64 last_warn_time[2] = { 0 };

// Map definitions
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __type(key, struct packet_id);
        __type(value, __u64);
        __uint(max_entries, 16384);
} packet_ts SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __type(key, struct network_tuple);
        __type(value, struct dual_flow_state);
        __uint(max_entries, 16384);
} flow_state SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(__u32));
} events SEC(".maps");
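/*
 * The events map is the kernel->userspace channel for RTT/flow/map-full
 * events. A minimal sketch of how a userspace loader could drain it with
 * libbpf (hypothetical names; pping's real consumer lives in its userspace
 * code):
 *
 *     static void handle_event(void *ctx, int cpu, void *data, __u32 size)
 *     {
 *             // data points to a struct rtt_event, flow_event or
 *             // map_full_event, discriminated by its event_type member
 *     }
 *
 *     struct perf_buffer *pb = perf_buffer__new(
 *             bpf_map__fd(events_map), 8, // 8 pages per CPU
 *             handle_event, NULL, NULL, NULL);
 *     while (!exiting)
 *             perf_buffer__poll(pb, 100); // timeout in ms
 */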

// Helper functions

/*
 * Maps an IPv4 address into an IPv6 address according to RFC 4291 sec 2.5.5.2
 */
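/*
 * For example, 192.0.2.33 maps to the IPv4-mapped IPv6 address
 * ::ffff:192.0.2.33: bytes 0-9 are zero, bytes 10-11 are 0xff and
 * bytes 12-15 hold the IPv4 address.
 */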
static void map_ipv4_to_ipv6(struct in6_addr *ipv6, __be32 ipv4)
{
        __builtin_memset(&ipv6->in6_u.u6_addr8[0], 0x00, 10);
        __builtin_memset(&ipv6->in6_u.u6_addr8[10], 0xff, 2);
        ipv6->in6_u.u6_addr32[3] = ipv4;
}

static __be32 ipv4_from_ipv6(struct in6_addr *ipv6)
{
        return ipv6->in6_u.u6_addr32[3];
}

/*
 * Returns the number of unparsed bytes left in the packet (bytes after nh.pos)
 */
static __u32 remaining_pkt_payload(struct parsing_context *ctx)
{
        // pkt_len - (pos - data) fails because compiler transforms it to pkt_len - pos + data (pkt_len - pos not ok because value - pointer)
        // data + pkt_len - pos fails on (data+pkt_len) - pos due to math between pkt_pointer and unbounded register
        __u32 parsed_bytes = ctx->nh.pos - ctx->data;

        return parsed_bytes < ctx->pkt_len ? ctx->pkt_len - parsed_bytes : 0;
}

/*
 * Convenience function for getting the corresponding reverse flow.
 * PPing needs to keep track of flow in both directions, and sometimes
 * also needs to reverse the flow to report the "correct" (consistent
 * with Kathie's PPing) src and dest address.
 */
static void reverse_flow(struct network_tuple *dest, struct network_tuple *src)
{
        dest->ipv = src->ipv;
        dest->proto = src->proto;
        dest->saddr = src->daddr;
        dest->daddr = src->saddr;
        dest->reserved = 0;
}

/*
 * Can't seem to get __builtin_memcmp to work, so hacking my own
 *
 * Based on https://githubhot.com/repo/iovisor/bcc/issues/3559,
 * __builtin_memcmp should work for constant sizes, but I still get the
 * "failed to find BTF for extern" error.
 */
static int my_memcmp(const void *s1_, const void *s2_, __u32 size)
{
        const __u8 *s1 = s1_, *s2 = s2_;
        int i;

        for (i = 0; i < MAX_MEMCMP_SIZE && i < size; i++) {
                if (s1[i] != s2[i])
                        return s1[i] > s2[i] ? 1 : -1;
        }

        return 0;
}
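/*
 * Both directions of a connection share a single dual-flow map entry. The
 * canonical ("dualflow key") direction is the one whose source tuple
 * compares less than or equal to its destination tuple, so e.g. A->B and
 * B->A both end up keyed on the same network_tuple.
 */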
static bool is_dualflow_key(struct network_tuple *flow)
{
        return my_memcmp(&flow->saddr, &flow->daddr, sizeof(flow->saddr)) <= 0;
}

static void make_dualflow_key(struct network_tuple *key,
                              struct network_tuple *flow)
{
        if (is_dualflow_key(flow))
                *key = *flow;
        else
                reverse_flow(key, flow);
}

static struct flow_state *fstate_from_dfkey(struct dual_flow_state *df_state,
                                            bool is_dfkey)
{
        if (!df_state)
                return NULL;

        return is_dfkey ? &df_state->dir1 : &df_state->dir2;
}

/*
 * Get the flow state for flow-direction from df_state
 *
 * Note: Does not validate that any of the entries in df_state actually matches
 * flow, just selects the direction in df_state that best fits the flow.
 */
static struct flow_state *
get_flowstate_from_dualflow(struct dual_flow_state *df_state,
                            struct network_tuple *flow)
{
        return fstate_from_dfkey(df_state, is_dualflow_key(flow));
}

static struct flow_state *
get_flowstate_from_packet(struct dual_flow_state *df_state,
                          struct packet_info *p_info)
{
        return fstate_from_dfkey(df_state, p_info->pid_flow_is_dfkey);
}

static struct flow_state *
get_reverse_flowstate_from_packet(struct dual_flow_state *df_state,
                                  struct packet_info *p_info)
{
        return fstate_from_dfkey(df_state, !p_info->pid_flow_is_dfkey);
}

static struct network_tuple *
get_dualflow_key_from_packet(struct packet_info *p_info)
{
        return p_info->pid_flow_is_dfkey ? &p_info->pid.flow :
                                           &p_info->reply_pid.flow;
}

/*
 * Parses the TSval and TSecr values from the TCP options field. If successful,
 * the TSval and TSecr values will be stored at tsval and tsecr (in network
 * byte order).
 * Returns 0 if successful and -1 on failure
 */
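/*
 * TCP timestamp option layout (RFC 7323): kind (8), length (10), followed by
 * a 4-byte TSval and a 4-byte TSecr. The parsing loop below walks the option
 * list one option at a time until it finds kind 8 with length 10.
 */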
static int parse_tcp_ts(struct tcphdr *tcph, void *data_end, __u32 *tsval,
                        __u32 *tsecr)
{
        int len = tcph->doff << 2;
        void *opt_end = (void *)tcph + len;
        __u8 *pos = (__u8 *)(tcph + 1); // Current pos in TCP options
        __u8 i, opt;
        volatile __u8
                opt_size; // Seems to ensure it's always read off the stack as a u8

        if (tcph + 1 > data_end || len <= sizeof(struct tcphdr))
                return -1;

#pragma unroll // temporary solution until we can identify why the non-unrolled loop gets stuck in an infinite loop
        for (i = 0; i < MAX_TCP_OPTIONS; i++) {
                if (pos + 1 > opt_end || pos + 1 > data_end)
                        return -1;

                opt = *pos;
                if (opt == 0) // Reached end of TCP options
                        return -1;

                if (opt == 1) { // TCP NOP option - advance one byte
                        pos++;
                        continue;
                }

                // Option > 1, should have option size
                if (pos + 2 > opt_end || pos + 2 > data_end)
                        return -1;
                opt_size = *(pos + 1);
                if (opt_size < 2) // Stop parsing options if opt_size has an invalid value
                        return -1;

                // Option kind is TCP timestamp (yey!)
                if (opt == 8 && opt_size == 10) {
                        if (pos + 10 > opt_end || pos + 10 > data_end)
                                return -1;
                        *tsval = *(__u32 *)(pos + 2);
                        *tsecr = *(__u32 *)(pos + 6);
                        return 0;
                }

                // Some other TCP option - advance option-length bytes
                pos += opt_size;
        }
        return -1;
}

/*
 * Attempts to fetch an identifier for TCP packets, based on the TCP timestamp
 * option.
 *
 * Will use the TSval as pid and TSecr as reply_pid, and the TCP source and dest
 * as port numbers.
 *
 * If successful, tcph, sport, dport and proto_info will be set
 * appropriately and 0 will be returned.
 * On failure -1 will be returned (and arguments will not be set).
 */
static int parse_tcp_identifier(struct parsing_context *pctx,
                                struct tcphdr **tcph, __u16 *sport,
                                __u16 *dport, struct protocol_info *proto_info)
{
        struct tcphdr *hdr;

        if (parse_tcphdr(&pctx->nh, pctx->data_end, &hdr) < 0)
                return -1;

        if (parse_tcp_ts(hdr, pctx->data_end, &proto_info->pid,
                         &proto_info->reply_pid) < 0)
                return -1; // Possible TODO: fall back on seq/ack instead

        // Do not timestamp pure ACKs (no payload)
        proto_info->pid_valid =
                pctx->nh.pos - pctx->data < pctx->pkt_len || hdr->syn;

        // Do not match on non-ACKs (TSecr not valid)
        proto_info->reply_pid_valid = hdr->ack;

        // Check if connection is opening/closing
        if (hdr->rst) {
                proto_info->event_type = FLOW_EVENT_CLOSING_BOTH;
                proto_info->event_reason = EVENT_REASON_RST;
        } else if (hdr->fin) {
                proto_info->event_type = FLOW_EVENT_CLOSING;
                proto_info->event_reason = EVENT_REASON_FIN;
        } else if (hdr->syn) {
                proto_info->event_type = FLOW_EVENT_OPENING;
                proto_info->event_reason =
                        hdr->ack ? EVENT_REASON_SYN_ACK : EVENT_REASON_SYN;
        } else {
                proto_info->event_type = FLOW_EVENT_NONE;
                proto_info->event_reason = EVENT_REASON_NONE;
        }

        *sport = hdr->source;
        *dport = hdr->dest;
        *tcph = hdr;

        return 0;
}

/*
 * Attempts to fetch an identifier for an ICMPv6 header, based on the echo
 * request/reply sequence number.
 *
 * Will use the echo sequence number as pid/reply_pid and the echo identifier
 * as both src and dst port numbers. Echo requests will only generate a valid
 * pid and echo replies will only generate a valid reply_pid.
 *
 * If successful, icmp6h, sport, dport and proto_info will be set appropriately
 * and 0 will be returned.
 * On failure, -1 will be returned (and arguments will not be set).
 *
 * Note: Will store the 16-bit sequence number in network byte order
 * in the 32-bit proto_info->(reply_)pid.
 */
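/*
 * For example, an echo request with sequence number 42 yields a valid
 * pid.identifier of 42 (and can be timestamped), while the matching echo
 * reply yields reply_pid.identifier 42, which is later matched against that
 * timestamp to produce an RTT sample.
 */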
static int parse_icmp6_identifier(struct parsing_context *pctx,
                                  struct icmp6hdr **icmp6h, __u16 *sport,
                                  __u16 *dport,
                                  struct protocol_info *proto_info)
{
        struct icmp6hdr *hdr;

        if (parse_icmp6hdr(&pctx->nh, pctx->data_end, &hdr) < 0)
                return -1;

        if (hdr->icmp6_code != 0)
                return -1;

        if (hdr->icmp6_type == ICMPV6_ECHO_REQUEST) {
                proto_info->pid = hdr->icmp6_sequence;
                proto_info->pid_valid = true;
                proto_info->reply_pid = 0;
                proto_info->reply_pid_valid = false;
        } else if (hdr->icmp6_type == ICMPV6_ECHO_REPLY) {
                proto_info->reply_pid = hdr->icmp6_sequence;
                proto_info->reply_pid_valid = true;
                proto_info->pid = 0;
                proto_info->pid_valid = false;
        } else {
                return -1;
        }

        proto_info->event_type = FLOW_EVENT_NONE;
        proto_info->event_reason = EVENT_REASON_NONE;
        *sport = hdr->icmp6_identifier;
        *dport = hdr->icmp6_identifier;
        *icmp6h = hdr;

        return 0;
}

/*
 * Same as parse_icmp6_identifier, but for an ICMP(v4) header instead.
 */
static int parse_icmp_identifier(struct parsing_context *pctx,
                                 struct icmphdr **icmph, __u16 *sport,
                                 __u16 *dport, struct protocol_info *proto_info)
{
        struct icmphdr *hdr;

        if (parse_icmphdr(&pctx->nh, pctx->data_end, &hdr) < 0)
                return -1;

        if (hdr->code != 0)
                return -1;

        if (hdr->type == ICMP_ECHO) {
                proto_info->pid = hdr->un.echo.sequence;
                proto_info->pid_valid = true;
                proto_info->reply_pid = 0;
                proto_info->reply_pid_valid = false;
        } else if (hdr->type == ICMP_ECHOREPLY) {
                proto_info->reply_pid = hdr->un.echo.sequence;
                proto_info->reply_pid_valid = true;
                proto_info->pid = 0;
                proto_info->pid_valid = false;
        } else {
                return -1;
        }

        proto_info->event_type = FLOW_EVENT_NONE;
        proto_info->event_reason = EVENT_REASON_NONE;
        *sport = hdr->un.echo.id;
        *dport = hdr->un.echo.id;
        *icmph = hdr;

        return 0;
}

/*
 * Attempts to parse the packet defined by pctx for a valid packet identifier
 * and reply identifier, filling in p_info.
 *
 * If successful, all members of p_info will be set appropriately and 0 will
 * be returned.
 * On failure -1 will be returned (no guarantees about p_info members).
 */
static int parse_packet_identifier(struct parsing_context *pctx,
                                   struct packet_info *p_info)
{
        int proto, err;
        struct ethhdr *eth;
        struct protocol_info proto_info;
        union {
                struct iphdr *iph;
                struct ipv6hdr *ip6h;
        } iph_ptr;
        union {
                struct tcphdr *tcph;
                struct icmphdr *icmph;
                struct icmp6hdr *icmp6h;
        } transporth_ptr;

        p_info->time = bpf_ktime_get_ns();
        proto = parse_ethhdr(&pctx->nh, pctx->data_end, &eth);

        // Parse IPv4/6 header
        if (proto == bpf_htons(ETH_P_IP)) {
                p_info->pid.flow.ipv = AF_INET;
                p_info->pid.flow.proto =
                        parse_iphdr(&pctx->nh, pctx->data_end, &iph_ptr.iph);
        } else if (proto == bpf_htons(ETH_P_IPV6)) {
                p_info->pid.flow.ipv = AF_INET6;
                p_info->pid.flow.proto =
                        parse_ip6hdr(&pctx->nh, pctx->data_end, &iph_ptr.ip6h);
        } else {
                return -1;
        }

        // Parse identifier from suitable protocol
        if (config.track_tcp && p_info->pid.flow.proto == IPPROTO_TCP)
                err = parse_tcp_identifier(pctx, &transporth_ptr.tcph,
                                           &p_info->pid.flow.saddr.port,
                                           &p_info->pid.flow.daddr.port,
                                           &proto_info);
        else if (config.track_icmp &&
                 p_info->pid.flow.proto == IPPROTO_ICMPV6 &&
                 p_info->pid.flow.ipv == AF_INET6)
                err = parse_icmp6_identifier(pctx, &transporth_ptr.icmp6h,
                                             &p_info->pid.flow.saddr.port,
                                             &p_info->pid.flow.daddr.port,
                                             &proto_info);
        else if (config.track_icmp && p_info->pid.flow.proto == IPPROTO_ICMP &&
                 p_info->pid.flow.ipv == AF_INET)
                err = parse_icmp_identifier(pctx, &transporth_ptr.icmph,
                                            &p_info->pid.flow.saddr.port,
                                            &p_info->pid.flow.daddr.port,
                                            &proto_info);
        else
                return -1; // No matching protocol
        if (err)
                return -1; // Failed parsing protocol

        // Successfully parsed packet identifier - fill in remaining members and return
        p_info->pid.identifier = proto_info.pid;
        p_info->pid_valid = proto_info.pid_valid;
        p_info->reply_pid.identifier = proto_info.reply_pid;
        p_info->reply_pid_valid = proto_info.reply_pid_valid;
        p_info->event_type = proto_info.event_type;
        p_info->event_reason = proto_info.event_reason;

        if (p_info->pid.flow.ipv == AF_INET) {
                map_ipv4_to_ipv6(&p_info->pid.flow.saddr.ip,
                                 iph_ptr.iph->saddr);
                map_ipv4_to_ipv6(&p_info->pid.flow.daddr.ip,
                                 iph_ptr.iph->daddr);
                p_info->ip_len = bpf_ntohs(iph_ptr.iph->tot_len);
                p_info->ip_tos.ipv4_tos = iph_ptr.iph->tos;
        } else { // IPv6
                p_info->pid.flow.saddr.ip = iph_ptr.ip6h->saddr;
                p_info->pid.flow.daddr.ip = iph_ptr.ip6h->daddr;
                p_info->ip_len = bpf_ntohs(iph_ptr.ip6h->payload_len);
                p_info->ip_tos.ipv6_tos =
                        *(__be32 *)iph_ptr.ip6h & IPV6_FLOWINFO_MASK;
        }

        p_info->pid_flow_is_dfkey = is_dualflow_key(&p_info->pid.flow);

        reverse_flow(&p_info->reply_pid.flow, &p_info->pid.flow);
        p_info->payload = remaining_pkt_payload(pctx);

        return 0;
}

/*
 * Global versions of parse_packet_identifier that should allow for
 * function-by-function verification, and reduce the overall complexity.
 * Need separate versions for tc and XDP so that the verifier understands that
 * the first argument is PTR_TO_CTX, and therefore that their data and
 * data_end pointers are valid packet pointers.
 */
__noinline int parse_packet_identifier_tc(struct __sk_buff *ctx,
                                          struct packet_info *p_info)
{
        if (!p_info)
                return -1;

        struct parsing_context pctx = {
                .data = (void *)(long)ctx->data,
                .data_end = (void *)(long)ctx->data_end,
                .nh = { .pos = (void *)(long)ctx->data },
                .pkt_len = ctx->len,
        };

        return parse_packet_identifier(&pctx, p_info);
}

__noinline int parse_packet_identifier_xdp(struct xdp_md *ctx,
                                           struct packet_info *p_info)
{
        if (!p_info)
                return -1;

        struct parsing_context pctx = {
                .data = (void *)(long)ctx->data,
                .data_end = (void *)(long)ctx->data_end,
                .nh = { .pos = (void *)(long)ctx->data },
                .pkt_len = (void *)(long)ctx->data_end - (void *)(long)ctx->data,
        };

        return parse_packet_identifier(&pctx, p_info);
}

/*
 * Calculate a smoothed rtt similar to how the TCP stack does it in
 * net/ipv4/tcp_input.c/tcp_rtt_estimator().
 *
 * NOTE: Will cause roundoff errors, but if RTTs > 1000ns errors should be small
 */
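/*
 * E.g. prev_srtt = 8000 ns and rtt = 4000 ns give
 * 8000 - (8000 >> 3) + (4000 >> 3) = 7500 ns.
 */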
static __u64 calculate_srtt(__u64 prev_srtt, __u64 rtt)
{
        if (!prev_srtt)
                return rtt;
        // srtt = 7/8*prev_srtt + 1/8*rtt
        return prev_srtt - (prev_srtt >> 3) + (rtt >> 3);
}

static bool is_rate_limited(__u64 now, __u64 last_ts, __u64 rtt)
{
        if (now < last_ts)
                return true;

        // RTT-based rate limit
        if (config.rtt_rate && rtt)
                return now - last_ts < FIXPOINT_TO_UINT(config.rtt_rate * rtt);

        // Static rate limit
        return now - last_ts < config.rate_limit;
}

/*
 * Send a flow opening event through the perf-buffer.
 * As these events are only sent upon receiving a reply, we need to access the
 * state of the reverse flow to get the reason the flow was opened and when
 * the original packet opening the flow was sent.
 */
static void send_flow_open_event(void *ctx, struct packet_info *p_info,
                                 struct flow_state *rev_flow)
{
        struct flow_event fe = {
                .event_type = EVENT_TYPE_FLOW,
                .flow_event_type = FLOW_EVENT_OPENING,
                .source = EVENT_SOURCE_PKT_DEST,
                .flow = p_info->pid.flow,
                .reason = rev_flow->opening_reason,
                .timestamp = rev_flow->last_timestamp,
                .reserved = 0,
        };

        bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &fe, sizeof(fe));
}

/*
 * Sends a flow-event message based on p_info.
 *
 * The rev_flow argument is used to inform if the message is for the flow
 * in the current direction or the reverse flow, and will adapt the flow and
 * source members accordingly.
 */
static void send_flow_event(void *ctx, struct packet_info *p_info,
                            bool rev_flow)
{
        struct flow_event fe = {
                .event_type = EVENT_TYPE_FLOW,
                .flow_event_type = p_info->event_type,
                .reason = p_info->event_reason,
                .timestamp = p_info->time,
                .reserved = 0, // Make sure it's initialized
        };

        if (rev_flow) {
                fe.flow = p_info->pid.flow;
                fe.source = EVENT_SOURCE_PKT_SRC;
        } else {
                fe.flow = p_info->reply_pid.flow;
                fe.source = EVENT_SOURCE_PKT_DEST;
        }

        bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &fe, sizeof(fe));
}

/*
 * Send a map-full event for the map.
 * Will only trigger once every WARN_MAP_FULL_INTERVAL
 */
static void send_map_full_event(void *ctx, struct packet_info *p_info,
                                enum pping_map map)
{
        struct map_full_event me;

        if (p_info->time < last_warn_time[map] ||
            p_info->time - last_warn_time[map] < WARN_MAP_FULL_INTERVAL)
                return;

        last_warn_time[map] = p_info->time;

        __builtin_memset(&me, 0, sizeof(me));
        me.event_type = EVENT_TYPE_MAP_FULL;
        me.timestamp = p_info->time;
        me.flow = p_info->pid.flow;
        me.map = map;

        bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &me, sizeof(me));
}

/*
 * Initializes an "empty" flow state based on the forward direction of the
 * current packet
 */
static void init_flowstate(struct flow_state *f_state,
                           struct packet_info *p_info)
{
        f_state->conn_state = CONNECTION_STATE_WAITOPEN;
        f_state->last_timestamp = p_info->time;
        f_state->opening_reason = p_info->event_type == FLOW_EVENT_OPENING ?
                                          p_info->event_reason :
                                          EVENT_REASON_FIRST_OBS_PCKT;
}

static void init_empty_flowstate(struct flow_state *f_state)
{
        f_state->conn_state = CONNECTION_STATE_EMPTY;
}

/*
 * Initialize a new (assumed zero-initialized) dual flow state based on the
 * current packet.
 */
static void init_dualflow_state(struct dual_flow_state *df_state,
                                struct packet_info *p_info)
{
        struct flow_state *fw_state =
                get_flowstate_from_packet(df_state, p_info);
        struct flow_state *rev_state =
                get_reverse_flowstate_from_packet(df_state, p_info);

        init_flowstate(fw_state, p_info);
        init_empty_flowstate(rev_state);
}

static struct dual_flow_state *
create_dualflow_state(void *ctx, struct packet_info *p_info, bool *new_flow)
{
        struct network_tuple *key = get_dualflow_key_from_packet(p_info);
        struct dual_flow_state new_state = { 0 };

        init_dualflow_state(&new_state, p_info);

        if (bpf_map_update_elem(&flow_state, key, &new_state, BPF_NOEXIST) ==
            0) {
                if (new_flow)
                        *new_flow = true;
        } else {
                send_map_full_event(ctx, p_info, PPING_MAP_FLOWSTATE);
                return NULL;
        }

        return bpf_map_lookup_elem(&flow_state, key);
}

static struct dual_flow_state *
lookup_or_create_dualflow_state(void *ctx, struct packet_info *p_info,
                                bool *new_flow)
{
        struct dual_flow_state *df_state;

        df_state = bpf_map_lookup_elem(&flow_state,
                                       get_dualflow_key_from_packet(p_info));

        if (df_state)
                return df_state;

        // Only try to create new state if we have a valid pid
        if (!p_info->pid_valid || p_info->event_type == FLOW_EVENT_CLOSING ||
            p_info->event_type == FLOW_EVENT_CLOSING_BOTH)
                return NULL;

        return create_dualflow_state(ctx, p_info, new_flow);
}

static bool is_flowstate_active(struct flow_state *f_state)
{
        return f_state->conn_state != CONNECTION_STATE_EMPTY &&
               f_state->conn_state != CONNECTION_STATE_CLOSED;
}

static void update_forward_flowstate(struct packet_info *p_info,
                                     struct flow_state *f_state,
                                     bool *new_flow)
{
        // "Create" flowstate if it's empty
        if (f_state->conn_state == CONNECTION_STATE_EMPTY &&
            p_info->pid_valid) {
                init_flowstate(f_state, p_info);
                if (new_flow)
                        *new_flow = true;
        }

        if (is_flowstate_active(f_state)) {
                f_state->sent_pkts++;
                f_state->sent_bytes += p_info->payload;
        }
}

static void update_reverse_flowstate(void *ctx, struct packet_info *p_info,
                                     struct flow_state *f_state)
{
        if (!is_flowstate_active(f_state))
                return;

        // First time we see reply for flow?
        if (f_state->conn_state == CONNECTION_STATE_WAITOPEN &&
            p_info->event_type != FLOW_EVENT_CLOSING_BOTH) {
                f_state->conn_state = CONNECTION_STATE_OPEN;
                send_flow_open_event(ctx, p_info, f_state);
        }

        f_state->rec_pkts++;
        f_state->rec_bytes += p_info->payload;
}

static bool should_notify_closing(struct flow_state *f_state)
{
        return f_state->conn_state == CONNECTION_STATE_OPEN;
}

static void close_and_delete_flows(void *ctx, struct packet_info *p_info,
                                   struct flow_state *fw_flow,
                                   struct flow_state *rev_flow)
{
        // Forward flow closing
        if (p_info->event_type == FLOW_EVENT_CLOSING ||
            p_info->event_type == FLOW_EVENT_CLOSING_BOTH) {
                if (should_notify_closing(fw_flow))
                        send_flow_event(ctx, p_info, false);
                fw_flow->conn_state = CONNECTION_STATE_CLOSED;
        }

        // Reverse flow closing
        if (p_info->event_type == FLOW_EVENT_CLOSING_BOTH) {
                if (should_notify_closing(rev_flow))
                        send_flow_event(ctx, p_info, true);
                rev_flow->conn_state = CONNECTION_STATE_CLOSED;
        }

        // Delete flowstate entry if neither flow is open anymore
        if (!is_flowstate_active(fw_flow) && !is_flowstate_active(rev_flow)) {
                if (bpf_map_delete_elem(&flow_state,
                                        get_dualflow_key_from_packet(p_info)) == 0)
                        debug_increment_autodel(PPING_MAP_FLOWSTATE);
        }
}

/*
 * Return true if p_info->pid.flow.daddr is a "local" address.
 *
 * Works by performing a fib lookup for p_info->pid.flow.
 * Lookup struct filled based on examples from
 * samples/bpf/xdp_fwd_kern.c/xdp_fwd_flags() and
 * tools/testing/selftests/bpf/progs/test_tc_neigh_fib.c
 */
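/*
 * Note: bpf_fib_lookup() returning BPF_FIB_LKUP_RET_NOT_FWDED indicates the
 * destination is delivered locally rather than forwarded, while
 * BPF_FIB_LKUP_RET_FWD_DISABLED indicates forwarding is disabled; both are
 * treated as "local" here.
 */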
static bool is_local_address(struct packet_info *p_info, void *ctx)
{
        int ret;
        struct bpf_fib_lookup lookup;

        __builtin_memset(&lookup, 0, sizeof(lookup));

        lookup.ifindex = p_info->ingress_ifindex;
        lookup.family = p_info->pid.flow.ipv;
        lookup.tot_len = p_info->ip_len;

        if (lookup.family == AF_INET) {
                lookup.tos = p_info->ip_tos.ipv4_tos;
                lookup.ipv4_src = ipv4_from_ipv6(&p_info->pid.flow.saddr.ip);
                lookup.ipv4_dst = ipv4_from_ipv6(&p_info->pid.flow.daddr.ip);
        } else if (lookup.family == AF_INET6) {
                struct in6_addr *src = (struct in6_addr *)lookup.ipv6_src;
                struct in6_addr *dst = (struct in6_addr *)lookup.ipv6_dst;

                lookup.flowinfo = p_info->ip_tos.ipv6_tos;
                *src = p_info->pid.flow.saddr.ip;
                *dst = p_info->pid.flow.daddr.ip;
        }

        lookup.l4_protocol = p_info->pid.flow.proto;
        lookup.sport = 0;
        lookup.dport = 0;

        ret = bpf_fib_lookup(ctx, &lookup, sizeof(lookup), 0);

        return ret == BPF_FIB_LKUP_RET_NOT_FWDED ||
               ret == BPF_FIB_LKUP_RET_FWD_DISABLED;
}

/*
 * Attempt to create a timestamp-entry for packet p_info for flow in f_state
 */
static void pping_timestamp_packet(struct flow_state *f_state, void *ctx,
                                   struct packet_info *p_info, bool new_flow)
{
        if (!is_flowstate_active(f_state) || !p_info->pid_valid)
                return;

        if (config.localfilt && p_info->is_ingress &&
            is_local_address(p_info, ctx))
                return;

        // Check if identifier is new
        if (!new_flow && f_state->last_id == p_info->pid.identifier)
                return;
        f_state->last_id = p_info->pid.identifier;

        // Check rate-limit
        if (!new_flow &&
            is_rate_limited(p_info->time, f_state->last_timestamp,
                            config.use_srtt ? f_state->srtt : f_state->min_rtt))
                return;

        /*
         * Update the timestamp-creation attempt, even if creating the
         * timestamp fails (due to the map being full). This should make the
         * competition for the next available map slot somewhat fairer
         * between heavy and sparse flows.
         */
        f_state->last_timestamp = p_info->time;

        if (bpf_map_update_elem(&packet_ts, &p_info->pid, &p_info->time,
                                BPF_NOEXIST) == 0)
                __sync_fetch_and_add(&f_state->outstanding_timestamps, 1);
        else
                send_map_full_event(ctx, p_info, PPING_MAP_PACKETTS);
}

/*
 * Attempt to match packet in p_info with a timestamp from flow in f_state
 */
static void pping_match_packet(struct flow_state *f_state, void *ctx,
                               struct packet_info *p_info)
{
        struct rtt_event re = { 0 };
        __u64 *p_ts;

        if (!is_flowstate_active(f_state) || !p_info->reply_pid_valid)
                return;

        if (f_state->outstanding_timestamps == 0)
                return;

        p_ts = bpf_map_lookup_elem(&packet_ts, &p_info->reply_pid);
        if (!p_ts || p_info->time < *p_ts)
                return;

        re.rtt = p_info->time - *p_ts;

        // Delete timestamp entry as soon as RTT is calculated
        if (bpf_map_delete_elem(&packet_ts, &p_info->reply_pid) == 0) {
                __sync_fetch_and_add(&f_state->outstanding_timestamps, -1);
                debug_increment_autodel(PPING_MAP_PACKETTS);
        }

        if (f_state->min_rtt == 0 || re.rtt < f_state->min_rtt)
                f_state->min_rtt = re.rtt;
        f_state->srtt = calculate_srtt(f_state->srtt, re.rtt);

        // Fill event and push to perf-buffer
        re.event_type = EVENT_TYPE_RTT;
        re.timestamp = p_info->time;
        re.min_rtt = f_state->min_rtt;
        re.sent_pkts = f_state->sent_pkts;
        re.sent_bytes = f_state->sent_bytes;
        re.rec_pkts = f_state->rec_pkts;
        re.rec_bytes = f_state->rec_bytes;
        re.flow = p_info->pid.flow;
        re.match_on_egress = !p_info->is_ingress;
        bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &re, sizeof(re));
}

/*
 * Contains the actual pping logic that is applied after a packet has been
 * parsed and deemed to contain some valid identifier.
 *
 * Looks up and updates flowstate (in both directions), tries to save a
 * timestamp of the packet, tries to match packet against previous timestamps,
 * calculates RTTs and pushes messages to userspace as appropriate.
 */
static void pping_parsed_packet(void *ctx, struct packet_info *p_info)
{
        struct dual_flow_state *df_state;
        struct flow_state *fw_flow, *rev_flow;
        bool new_flow = false;

        df_state = lookup_or_create_dualflow_state(ctx, p_info, &new_flow);
        if (!df_state)
                return;

        fw_flow = get_flowstate_from_packet(df_state, p_info);
        update_forward_flowstate(p_info, fw_flow, &new_flow);
        pping_timestamp_packet(fw_flow, ctx, p_info, new_flow);

        rev_flow = get_reverse_flowstate_from_packet(df_state, p_info);
        update_reverse_flowstate(ctx, p_info, rev_flow);
        pping_match_packet(rev_flow, ctx, p_info);

        close_and_delete_flows(ctx, p_info, fw_flow, rev_flow);
}

/*
 * Main function which contains all the pping logic (parse packet, attempt to
 * create timestamp for it, try to match against previous timestamps, update
 * flowstate etc.).
 *
 * Has separate tc and XDP versions so that the verifier sees the global
 * functions for parsing packets in the right context, but most of the
 * work is done in common functions (parse_packet_identifier and
 * pping_parsed_packet)
 */
static void pping_tc(struct __sk_buff *ctx, bool is_ingress)
{
        struct packet_info p_info = { 0 };

        if (parse_packet_identifier_tc(ctx, &p_info) < 0)
                return;

        p_info.is_ingress = is_ingress;
        p_info.ingress_ifindex = is_ingress ? ctx->ingress_ifindex : 0;

        pping_parsed_packet(ctx, &p_info);
}

static void pping_xdp(struct xdp_md *ctx)
{
        struct packet_info p_info = { 0 };

        if (parse_packet_identifier_xdp(ctx, &p_info) < 0)
                return;

        p_info.is_ingress = true;
        p_info.ingress_ifindex = ctx->ingress_ifindex;

        pping_parsed_packet(ctx, &p_info);
}

static bool is_flow_old(struct network_tuple *flow, struct flow_state *f_state,
                        __u64 time)
{
        __u64 age;
        __u64 ts;

        if (!f_state || !is_flowstate_active(f_state))
                return false;

        ts = f_state->last_timestamp; // To avoid concurrency issue between check and age calculation
        if (ts > time)
                return false;
        age = time - ts;

        return (f_state->conn_state == CONNECTION_STATE_WAITOPEN &&
                age > UNOPENED_FLOW_LIFETIME) ||
               ((flow->proto == IPPROTO_ICMP ||
                 flow->proto == IPPROTO_ICMPV6) &&
                age > ICMP_FLOW_LIFETIME) ||
               age > FLOW_LIFETIME;
}

static void send_flow_timeout_message(void *ctx, struct network_tuple *flow,
                                      __u64 time)
{
        struct flow_event fe = {
                .event_type = EVENT_TYPE_FLOW,
                .flow_event_type = FLOW_EVENT_CLOSING,
                .reason = EVENT_REASON_FLOW_TIMEOUT,
                .source = EVENT_SOURCE_GC,
                .timestamp = time,
                .reserved = 0,
        };

        // To be consistent with Kathie's pping we report the flow "backwards"
        reverse_flow(&fe.flow, flow);

        bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &fe, sizeof(fe));
}

// Programs

// Egress path using TC-BPF
SEC("tc")
int pping_tc_egress(struct __sk_buff *skb)
{
        pping_tc(skb, false);

        return TC_ACT_UNSPEC;
}

// Ingress path using TC-BPF
SEC("tc")
int pping_tc_ingress(struct __sk_buff *skb)
{
        pping_tc(skb, true);

        return TC_ACT_UNSPEC;
}

// Ingress path using XDP
SEC("xdp")
int pping_xdp_ingress(struct xdp_md *ctx)
{
        pping_xdp(ctx);

        return XDP_PASS;
}
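/*
 * Map cleanup programs. These are bpf iterator programs: userspace attaches
 * them as bpf_links and periodically reads the resulting iterator fds, which
 * runs the program once per map element so stale timestamp and flow entries
 * can be deleted (a garbage-collection pass, cf. EVENT_SOURCE_GC).
 */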
SEC("iter/bpf_map_elem")
int tsmap_cleanup(struct bpf_iter__bpf_map_elem *ctx)
{
        struct packet_id local_pid;
        struct flow_state *f_state;
        struct dual_flow_state *df_state;
        struct network_tuple df_key;
        struct packet_id *pid = ctx->key;
        __u64 *timestamp = ctx->value;
        __u64 now = bpf_ktime_get_ns();
        __u64 rtt;

        debug_update_mapclean_stats(ctx, &events, !ctx->key || !ctx->value,
                                    ctx->meta->seq_num, now,
                                    PPING_MAP_PACKETTS);

        if (!pid || !timestamp)
                return 0;
        if (now <= *timestamp)
                return 0;

        make_dualflow_key(&df_key, &pid->flow);
        df_state = bpf_map_lookup_elem(&flow_state, &df_key);
        f_state = get_flowstate_from_dualflow(df_state, &pid->flow);
        rtt = f_state ? f_state->srtt : 0;

        if ((rtt && now - *timestamp > rtt * TIMESTAMP_RTT_LIFETIME) ||
            now - *timestamp > TIMESTAMP_LIFETIME) {
                /* Seems like the key for map lookup operations must be
                   on the stack, so copy pid to local_pid. */
                __builtin_memcpy(&local_pid, pid, sizeof(local_pid));
                if (bpf_map_delete_elem(&packet_ts, &local_pid) == 0) {
                        debug_increment_timeoutdel(PPING_MAP_PACKETTS);

                        if (f_state)
                                __sync_fetch_and_add(
                                        &f_state->outstanding_timestamps, -1);
                }
        }

        return 0;
}

SEC("iter/bpf_map_elem")
int flowmap_cleanup(struct bpf_iter__bpf_map_elem *ctx)
{
        struct network_tuple flow1, flow2;
        struct flow_state *f_state1, *f_state2;
        struct dual_flow_state *df_state;
        __u64 now = bpf_ktime_get_ns();
        bool notify1, notify2, timeout1, timeout2;

        debug_update_mapclean_stats(ctx, &events, !ctx->key || !ctx->value,
                                    ctx->meta->seq_num, now,
                                    PPING_MAP_FLOWSTATE);

        if (!ctx->key || !ctx->value)
                return 0;

        flow1 = *(struct network_tuple *)ctx->key;
        reverse_flow(&flow2, &flow1);

        df_state = ctx->value;
        f_state1 = get_flowstate_from_dualflow(df_state, &flow1);
        f_state2 = get_flowstate_from_dualflow(df_state, &flow2);

        timeout1 = is_flow_old(&flow1, f_state1, now);
        timeout2 = is_flow_old(&flow2, f_state2, now);

        if ((!is_flowstate_active(f_state1) || timeout1) &&
            (!is_flowstate_active(f_state2) || timeout2)) {
                // Entry should be deleted
                notify1 = should_notify_closing(f_state1) && timeout1;
                notify2 = should_notify_closing(f_state2) && timeout2;
                if (bpf_map_delete_elem(&flow_state, &flow1) == 0) {
                        if (notify1)
                                send_flow_timeout_message(ctx, &flow1, now);
                        if (notify2)
                                send_flow_timeout_message(ctx, &flow2, now);
                }
        }

        return 0;
}