Merge pull request #47 from simosund/pping-reduce-verification-complexity

pping - Reduce verification complexity
This commit is contained in:
Toke Høiland-Jørgensen
2022-06-21 21:17:25 +02:00
committed by GitHub

View File

@@ -74,8 +74,6 @@ struct parsing_context {
void *data_end; // End of safe acessible area void *data_end; // End of safe acessible area
struct hdr_cursor nh; // Position to parse next struct hdr_cursor nh; // Position to parse next
__u32 pkt_len; // Full packet length (headers+data) __u32 pkt_len; // Full packet length (headers+data)
__u32 ingress_ifindex; // Interface packet arrived on
bool is_egress; // Is packet on egress or ingress?
}; };
/* /*
@@ -90,19 +88,17 @@ struct parsing_context {
* in the packet_ts map. * in the packet_ts map.
*/ */
struct packet_info { struct packet_info {
union {
struct iphdr *iph;
struct ipv6hdr *ip6h;
};
union {
struct icmphdr *icmph;
struct icmp6hdr *icmp6h;
struct tcphdr *tcph;
};
__u64 time; // Arrival time of packet __u64 time; // Arrival time of packet
__u32 payload; // Size of packet data (excluding headers) __u32 payload; // Size of packet data (excluding headers)
struct packet_id pid; // flow + identifier to timestamp (ex. TSval) struct packet_id pid; // flow + identifier to timestamp (ex. TSval)
struct packet_id reply_pid; // rev. flow + identifier to match against (ex. TSecr) struct packet_id reply_pid; // rev. flow + identifier to match against (ex. TSecr)
__u32 ingress_ifindex; // Interface packet arrived on (if is_ingress, otherwise not valid)
union { // The IP-level "type of service" (DSCP for IPv4, traffic class + flow label for IPv6)
__u8 ipv4_tos;
__be32 ipv6_tos;
} ip_tos;
__u16 ip_len; // The IPv4 total length or IPv6 payload length
bool is_ingress; // Packet on egress or ingress?
bool pid_flow_is_dfkey; // Used to determine which member of dualflow state to use for forward direction bool pid_flow_is_dfkey; // Used to determine which member of dualflow state to use for forward direction
bool pid_valid; // identifier can be used to timestamp packet bool pid_valid; // identifier can be used to timestamp packet
bool reply_pid_valid; // reply_identifier can be used to match packet bool reply_pid_valid; // reply_identifier can be used to match packet
@@ -162,6 +158,11 @@ static void map_ipv4_to_ipv6(struct in6_addr *ipv6, __be32 ipv4)
ipv6->in6_u.u6_addr32[3] = ipv4; ipv6->in6_u.u6_addr32[3] = ipv4;
} }
static __be32 ipv4_from_ipv6(struct in6_addr *ipv6)
{
return ipv6->in6_u.u6_addr32[3];
}
/* /*
* Returns the number of unparsed bytes left in the packet (bytes after nh.pos) * Returns the number of unparsed bytes left in the packet (bytes after nh.pos)
*/ */
@@ -473,6 +474,15 @@ static int parse_packet_identifier(struct parsing_context *pctx,
int proto, err; int proto, err;
struct ethhdr *eth; struct ethhdr *eth;
struct protocol_info proto_info; struct protocol_info proto_info;
union {
struct iphdr *iph;
struct ipv6hdr *ip6h;
} iph_ptr;
union {
struct tcphdr *tcph;
struct icmphdr *icmph;
struct icmp6hdr *icmp6h;
} transporth_ptr;
p_info->time = bpf_ktime_get_ns(); p_info->time = bpf_ktime_get_ns();
proto = parse_ethhdr(&pctx->nh, pctx->data_end, &eth); proto = parse_ethhdr(&pctx->nh, pctx->data_end, &eth);
@@ -481,31 +491,31 @@ static int parse_packet_identifier(struct parsing_context *pctx,
if (proto == bpf_htons(ETH_P_IP)) { if (proto == bpf_htons(ETH_P_IP)) {
p_info->pid.flow.ipv = AF_INET; p_info->pid.flow.ipv = AF_INET;
p_info->pid.flow.proto = p_info->pid.flow.proto =
parse_iphdr(&pctx->nh, pctx->data_end, &p_info->iph); parse_iphdr(&pctx->nh, pctx->data_end, &iph_ptr.iph);
} else if (proto == bpf_htons(ETH_P_IPV6)) { } else if (proto == bpf_htons(ETH_P_IPV6)) {
p_info->pid.flow.ipv = AF_INET6; p_info->pid.flow.ipv = AF_INET6;
p_info->pid.flow.proto = p_info->pid.flow.proto =
parse_ip6hdr(&pctx->nh, pctx->data_end, &p_info->ip6h); parse_ip6hdr(&pctx->nh, pctx->data_end, &iph_ptr.ip6h);
} else { } else {
return -1; return -1;
} }
// Parse identifer from suitable protocol // Parse identifer from suitable protocol
if (config.track_tcp && p_info->pid.flow.proto == IPPROTO_TCP) if (config.track_tcp && p_info->pid.flow.proto == IPPROTO_TCP)
err = parse_tcp_identifier(pctx, &p_info->tcph, err = parse_tcp_identifier(pctx, &transporth_ptr.tcph,
&p_info->pid.flow.saddr.port, &p_info->pid.flow.saddr.port,
&p_info->pid.flow.daddr.port, &p_info->pid.flow.daddr.port,
&proto_info); &proto_info);
else if (config.track_icmp && else if (config.track_icmp &&
p_info->pid.flow.proto == IPPROTO_ICMPV6 && p_info->pid.flow.proto == IPPROTO_ICMPV6 &&
p_info->pid.flow.ipv == AF_INET6) p_info->pid.flow.ipv == AF_INET6)
err = parse_icmp6_identifier(pctx, &p_info->icmp6h, err = parse_icmp6_identifier(pctx, &transporth_ptr.icmp6h,
&p_info->pid.flow.saddr.port, &p_info->pid.flow.saddr.port,
&p_info->pid.flow.daddr.port, &p_info->pid.flow.daddr.port,
&proto_info); &proto_info);
else if (config.track_icmp && p_info->pid.flow.proto == IPPROTO_ICMP && else if (config.track_icmp && p_info->pid.flow.proto == IPPROTO_ICMP &&
p_info->pid.flow.ipv == AF_INET) p_info->pid.flow.ipv == AF_INET)
err = parse_icmp_identifier(pctx, &p_info->icmph, err = parse_icmp_identifier(pctx, &transporth_ptr.icmph,
&p_info->pid.flow.saddr.port, &p_info->pid.flow.saddr.port,
&p_info->pid.flow.daddr.port, &p_info->pid.flow.daddr.port,
&proto_info); &proto_info);
@@ -524,12 +534,17 @@ static int parse_packet_identifier(struct parsing_context *pctx,
if (p_info->pid.flow.ipv == AF_INET) { if (p_info->pid.flow.ipv == AF_INET) {
map_ipv4_to_ipv6(&p_info->pid.flow.saddr.ip, map_ipv4_to_ipv6(&p_info->pid.flow.saddr.ip,
p_info->iph->saddr); iph_ptr.iph->saddr);
map_ipv4_to_ipv6(&p_info->pid.flow.daddr.ip, map_ipv4_to_ipv6(&p_info->pid.flow.daddr.ip,
p_info->iph->daddr); iph_ptr.iph->daddr);
p_info->ip_len = bpf_ntohs(iph_ptr.iph->tot_len);
p_info->ip_tos.ipv4_tos = iph_ptr.iph->tos;
} else { // IPv6 } else { // IPv6
p_info->pid.flow.saddr.ip = p_info->ip6h->saddr; p_info->pid.flow.saddr.ip = iph_ptr.ip6h->saddr;
p_info->pid.flow.daddr.ip = p_info->ip6h->daddr; p_info->pid.flow.daddr.ip = iph_ptr.ip6h->daddr;
p_info->ip_len = bpf_ntohs(iph_ptr.ip6h->payload_len);
p_info->ip_tos.ipv6_tos =
*(__be32 *)iph_ptr.ip6h & IPV6_FLOWINFO_MASK;
} }
p_info->pid_flow_is_dfkey = is_dualflow_key(&p_info->pid.flow); p_info->pid_flow_is_dfkey = is_dualflow_key(&p_info->pid.flow);
@@ -540,6 +555,45 @@ static int parse_packet_identifier(struct parsing_context *pctx,
return 0; return 0;
} }
/*
* Global versions of parse_packet_identifer that should allow for
* function-by-function verification, and reduce the overall complexity.
* Need separate versions for tc and XDP so that verifier understands that the
* first argument is PTR_TO_CTX, and therefore their data and data_end pointers
* are valid packet pointers.
*/
__noinline int parse_packet_identifer_tc(struct __sk_buff *ctx,
struct packet_info *p_info)
{
if (!p_info)
return -1;
struct parsing_context pctx = {
.data = (void *)(long)ctx->data,
.data_end = (void *)(long)ctx->data_end,
.nh = { .pos = pctx.data },
.pkt_len = ctx->len,
};
return parse_packet_identifier(&pctx, p_info);
}
__noinline int parse_packet_identifer_xdp(struct xdp_md *ctx,
struct packet_info *p_info)
{
if (!p_info)
return -1;
struct parsing_context pctx = {
.data = (void *)(long)ctx->data,
.data_end = (void *)(long)ctx->data_end,
.nh = { .pos = pctx.data },
.pkt_len = pctx.data_end - pctx.data,
};
return parse_packet_identifier(&pctx, p_info);
}
/* /*
* Calculate a smoothed rtt similar to how TCP stack does it in * Calculate a smoothed rtt similar to how TCP stack does it in
* net/ipv4/tcp_input.c/tcp_rtt_estimator(). * net/ipv4/tcp_input.c/tcp_rtt_estimator().
@@ -798,28 +852,26 @@ static void close_and_delete_flows(void *ctx, struct packet_info *p_info,
* samples/bpf/xdp_fwd_kern.c/xdp_fwd_flags() and * samples/bpf/xdp_fwd_kern.c/xdp_fwd_flags() and
* tools/testing/selftests/bpf/progs/test_tc_neigh_fib.c * tools/testing/selftests/bpf/progs/test_tc_neigh_fib.c
*/ */
static bool is_local_address(struct packet_info *p_info, void *ctx, static bool is_local_address(struct packet_info *p_info, void *ctx)
struct parsing_context *pctx)
{ {
int ret; int ret;
struct bpf_fib_lookup lookup; struct bpf_fib_lookup lookup;
__builtin_memset(&lookup, 0, sizeof(lookup)); __builtin_memset(&lookup, 0, sizeof(lookup));
lookup.ifindex = pctx->ingress_ifindex; lookup.ifindex = p_info->ingress_ifindex;
lookup.family = p_info->pid.flow.ipv; lookup.family = p_info->pid.flow.ipv;
lookup.tot_len = p_info->ip_len;
if (lookup.family == AF_INET) { if (lookup.family == AF_INET) {
lookup.tos = p_info->iph->tos; lookup.tos = p_info->ip_tos.ipv4_tos;
lookup.tot_len = bpf_ntohs(p_info->iph->tot_len); lookup.ipv4_src = ipv4_from_ipv6(&p_info->pid.flow.saddr.ip);
lookup.ipv4_src = p_info->iph->saddr; lookup.ipv4_dst = ipv4_from_ipv6(&p_info->pid.flow.daddr.ip);
lookup.ipv4_dst = p_info->iph->daddr;
} else if (lookup.family == AF_INET6) { } else if (lookup.family == AF_INET6) {
struct in6_addr *src = (struct in6_addr *)lookup.ipv6_src; struct in6_addr *src = (struct in6_addr *)lookup.ipv6_src;
struct in6_addr *dst = (struct in6_addr *)lookup.ipv6_dst; struct in6_addr *dst = (struct in6_addr *)lookup.ipv6_dst;
lookup.flowinfo = *(__be32 *)p_info->ip6h & IPV6_FLOWINFO_MASK; lookup.flowinfo = p_info->ip_tos.ipv6_tos;
lookup.tot_len = bpf_ntohs(p_info->ip6h->payload_len); *src = p_info->pid.flow.saddr.ip;
*src = p_info->pid.flow.saddr.ip; //verifier did not like ip6h->saddr
*dst = p_info->pid.flow.daddr.ip; *dst = p_info->pid.flow.daddr.ip;
} }
@@ -837,14 +889,13 @@ static bool is_local_address(struct packet_info *p_info, void *ctx,
* Attempt to create a timestamp-entry for packet p_info for flow in f_state * Attempt to create a timestamp-entry for packet p_info for flow in f_state
*/ */
static void pping_timestamp_packet(struct flow_state *f_state, void *ctx, static void pping_timestamp_packet(struct flow_state *f_state, void *ctx,
struct parsing_context *pctx,
struct packet_info *p_info, bool new_flow) struct packet_info *p_info, bool new_flow)
{ {
if (!is_flowstate_active(f_state) || !p_info->pid_valid) if (!is_flowstate_active(f_state) || !p_info->pid_valid)
return; return;
if (config.localfilt && !pctx->is_egress && if (config.localfilt && p_info->is_ingress &&
is_local_address(p_info, ctx, pctx)) is_local_address(p_info, ctx))
return; return;
// Check if identfier is new // Check if identfier is new
@@ -877,7 +928,6 @@ static void pping_timestamp_packet(struct flow_state *f_state, void *ctx,
* Attempt to match packet in p_info with a timestamp from flow in f_state * Attempt to match packet in p_info with a timestamp from flow in f_state
*/ */
static void pping_match_packet(struct flow_state *f_state, void *ctx, static void pping_match_packet(struct flow_state *f_state, void *ctx,
struct parsing_context *pctx,
struct packet_info *p_info) struct packet_info *p_info)
{ {
struct rtt_event re = { 0 }; struct rtt_event re = { 0 };
@@ -914,37 +964,73 @@ static void pping_match_packet(struct flow_state *f_state, void *ctx,
re.rec_pkts = f_state->rec_pkts; re.rec_pkts = f_state->rec_pkts;
re.rec_bytes = f_state->rec_bytes; re.rec_bytes = f_state->rec_bytes;
re.flow = p_info->pid.flow; re.flow = p_info->pid.flow;
re.match_on_egress = pctx->is_egress; re.match_on_egress = !p_info->is_ingress;
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &re, sizeof(re)); bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &re, sizeof(re));
} }
/* /*
* Will parse the ingress/egress packet in pctx and attempt to create a * Contains the actual pping logic that is applied after a packet has been
* timestamp for it and match it against the reverse flow. * parsed and deemed to contain some valid identifier.
* Looks up and updates flowstate (in both directions), tries to save a
* timestamp of the packet, tries to match packet against previous timestamps,
* calculates RTTs and pushes messages to userspace as appropriate.
*/ */
static void pping(void *ctx, struct parsing_context *pctx) static void pping_parsed_packet(void *ctx, struct packet_info *p_info)
{ {
struct packet_info p_info = { 0 };
struct dual_flow_state *df_state; struct dual_flow_state *df_state;
struct flow_state *fw_flow, *rev_flow; struct flow_state *fw_flow, *rev_flow;
bool new_flow = false; bool new_flow = false;
if (parse_packet_identifier(pctx, &p_info) < 0) df_state = lookup_or_create_dualflow_state(ctx, p_info, &new_flow);
return;
df_state = lookup_or_create_dualflow_state(ctx, &p_info, &new_flow);
if (!df_state) if (!df_state)
return; return;
fw_flow = get_flowstate_from_packet(df_state, &p_info); fw_flow = get_flowstate_from_packet(df_state, p_info);
update_forward_flowstate(&p_info, fw_flow, &new_flow); update_forward_flowstate(p_info, fw_flow, &new_flow);
pping_timestamp_packet(fw_flow, ctx, pctx, &p_info, new_flow); pping_timestamp_packet(fw_flow, ctx, p_info, new_flow);
rev_flow = get_reverse_flowstate_from_packet(df_state, &p_info); rev_flow = get_reverse_flowstate_from_packet(df_state, p_info);
update_reverse_flowstate(ctx, &p_info, rev_flow); update_reverse_flowstate(ctx, p_info, rev_flow);
pping_match_packet(rev_flow, ctx, pctx, &p_info); pping_match_packet(rev_flow, ctx, p_info);
close_and_delete_flows(ctx, &p_info, fw_flow, rev_flow); close_and_delete_flows(ctx, p_info, fw_flow, rev_flow);
}
/*
* Main function which contains all the pping logic (parse packet, attempt to
* create timestamp for it, try match against previous timestamps, update
* flowstate etc.).
*
* Has a separate tc and xdp version so that verifier sees the global
* functions for parsing packets in the right context, but most of the
* work is done in common functions (parse_packet_identifier and
* pping_parsed_packet)
*/
static void pping_tc(struct __sk_buff *ctx, bool is_ingress)
{
struct packet_info p_info = { 0 };
if (parse_packet_identifer_tc(ctx, &p_info) < 0)
return;
p_info.is_ingress = is_ingress;
p_info.ingress_ifindex = is_ingress ? ctx->ingress_ifindex : 0;
pping_parsed_packet(ctx, &p_info);
}
static void pping_xdp(struct xdp_md *ctx)
{
struct packet_info p_info = { 0 };
if (parse_packet_identifer_xdp(ctx, &p_info) < 0)
return;
p_info.is_ingress = true;
p_info.ingress_ifindex = ctx->rx_queue_index;
pping_parsed_packet(ctx, &p_info);
} }
static bool is_flow_old(struct network_tuple *flow, struct flow_state *f_state, static bool is_flow_old(struct network_tuple *flow, struct flow_state *f_state,
@@ -993,15 +1079,7 @@ static void send_flow_timeout_message(void *ctx, struct network_tuple *flow,
SEC("tc") SEC("tc")
int pping_tc_egress(struct __sk_buff *skb) int pping_tc_egress(struct __sk_buff *skb)
{ {
struct parsing_context pctx = { pping_tc(skb, false);
.data = (void *)(long)skb->data,
.data_end = (void *)(long)skb->data_end,
.pkt_len = skb->len,
.nh = { .pos = pctx.data },
.is_egress = true,
};
pping(skb, &pctx);
return TC_ACT_UNSPEC; return TC_ACT_UNSPEC;
} }
@@ -1010,16 +1088,7 @@ int pping_tc_egress(struct __sk_buff *skb)
SEC("tc") SEC("tc")
int pping_tc_ingress(struct __sk_buff *skb) int pping_tc_ingress(struct __sk_buff *skb)
{ {
struct parsing_context pctx = { pping_tc(skb, true);
.data = (void *)(long)skb->data,
.data_end = (void *)(long)skb->data_end,
.pkt_len = skb->len,
.nh = { .pos = pctx.data },
.ingress_ifindex = skb->ingress_ifindex,
.is_egress = false,
};
pping(skb, &pctx);
return TC_ACT_UNSPEC; return TC_ACT_UNSPEC;
} }
@@ -1028,16 +1097,7 @@ int pping_tc_ingress(struct __sk_buff *skb)
SEC("xdp") SEC("xdp")
int pping_xdp_ingress(struct xdp_md *ctx) int pping_xdp_ingress(struct xdp_md *ctx)
{ {
struct parsing_context pctx = { pping_xdp(ctx);
.data = (void *)(long)ctx->data,
.data_end = (void *)(long)ctx->data_end,
.pkt_len = pctx.data_end - pctx.data,
.nh = { .pos = pctx.data },
.ingress_ifindex = ctx->ingress_ifindex,
.is_egress = false,
};
pping(ctx, &pctx);
return XDP_PASS; return XDP_PASS;
} }