mirror of
https://github.com/xdp-project/bpf-examples.git
synced 2024-05-06 15:54:53 +00:00
Add preserve-dscp example for preserving a DSCP mark over encapsulation
This is a fun example showing how to use BPF to preserve DSCP values across an encapsulating interface, such as Wireguard. It relies on the encapsulation layer preserving the skb->hash value across the encapsulation, which is commonly the case on kernel encapsulation protocols (including Wireguard), and uses a pair of TC BPF programs and a map to re-match the packets after encapsulation and add back the DSCP value. Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
This commit is contained in:
@@ -0,0 +1 @@
|
||||
preserve-dscp
|
||||
@@ -0,0 +1,8 @@
|
||||
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)

# Userspace loader binary and the BPF object file it loads at runtime.
USER_TARGETS += preserve-dscp
BPF_TARGETS += preserve_dscp_kern

# Location of the shared build rules (presumably consumes the *_TARGETS
# variables set above -- confirm against common.mk).
LIB_DIR = ../lib

include $(LIB_DIR)/common.mk
|
||||
@@ -0,0 +1,47 @@
|
||||
* DSCP-preserving TC filters
|
||||
|
||||
This example shows how to use BPF to preserve DSCP values across an
|
||||
encapsulating interface such as Wireguard. It relies on the encapsulation layer
|
||||
preserving the skb->hash value across the encapsulation, which is commonly the
|
||||
case on kernel encapsulation protocols (including Wireguard).
|
||||
|
||||
The example contains two filters. The first parses the packet header, reads
the DSCP value out of the packet, and stores that value in a map keyed on the
skb->hash value (which is calculated if it isn't already set). The second TC
filter reads back the skb->hash value, looks it up in the map and, if an entry
is found, rewrites the packet DSCP based on that value.
|
||||
|
||||
The idea is that the first filter is run on the internal (encapsulating)
|
||||
interface, and the second is run on the physical interface that transmits the
|
||||
encapsulated packet. To install the filters, run the userspace component like:
|
||||
|
||||
=sudo ./preserve-dscp <ifname pre> <ifname post>=
|
||||
|
||||
To unload the filters again, run:
|
||||
=sudo ./preserve-dscp <ifname pre> <ifname post> --unload=
|
||||
|
||||
Note that unloading will remove the clsact qdisc from the interfaces entirely,
|
||||
so don't run this if you want to preserve that; instead manually remove the
|
||||
filters using =tc=.
|
||||
|
||||
** Caveats
|
||||
There are a couple of caveats to this approach:
|
||||
|
||||
- As mentioned above, this only works for encapsulation protocols that preserve
|
||||
the SKB hash in the first place.
|
||||
|
||||
- The userspace program tries to detect whether the =pre= interface carries an
  Ethernet header by checking the interface type: if the type is
  =ARPHRD_NONE=, it assumes packets start directly with the IP header;
  otherwise it assumes an Ethernet header is present. If this heuristic turns
  out to be wrong, the filter will fail.
|
||||
|
||||
- There is no sanity checking on the outer filter that the packets actually
|
||||
come from the interface that we ran the =pre= filter on in the first place;
|
||||
there is no general way to check this from BPF, but the =write_dscp= filter can
|
||||
be amended to do some other sanity checks on the packet before modifying it
|
||||
(such as checking port numbers).
|
||||
|
||||
- Since this relies on =skb->hash=, it is flow-based; if individual packets in
  the same flow carry different marks, which of those marks ends up on a given
  encapsulated packet is subject to a race.
|
||||
|
||||
|
||||
@@ -0,0 +1,143 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <net/if.h>
|
||||
#include <linux/if_arp.h>
|
||||
|
||||
#include <bpf/libbpf.h>
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
const char *filename = "preserve_dscp_kern.o";
|
||||
char *ifname_pre, *ifname_post;
|
||||
int ifindex_pre, ifindex_post;
|
||||
struct bpf_map *map = NULL;
|
||||
int err = 0, fd, iftype;
|
||||
struct bpf_object *obj;
|
||||
char buf[100];
|
||||
ssize_t len;
|
||||
DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, .attach_point = BPF_TC_EGRESS);
|
||||
DECLARE_LIBBPF_OPTS(bpf_tc_opts, attach_pre);
|
||||
DECLARE_LIBBPF_OPTS(bpf_tc_opts, attach_post);
|
||||
|
||||
|
||||
if (argc < 3) {
|
||||
fprintf(stderr, "Usage: %s <if pre> <if post> [--unload]\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
ifname_pre = argv[1];
|
||||
ifname_post = argv[2];
|
||||
|
||||
ifindex_pre = if_nametoindex(ifname_pre);
|
||||
if (!ifindex_pre) {
|
||||
fprintf(stderr, "Couldn't find interface '%s'\n", ifname_pre);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Get type of interface to know if it has ethernet headers */
|
||||
snprintf(buf, sizeof(buf)-1, "/sys/class/net/%s/type", ifname_pre);
|
||||
buf[sizeof(buf)-1] = '\0';
|
||||
fd = open(buf, 0);
|
||||
if (fd < 0 || (len = read(fd, buf, sizeof(buf))) == -1) {
|
||||
fprintf(stderr, "Couldn't get interface type for '%s'\n", ifname_pre);
|
||||
return 1;
|
||||
}
|
||||
buf[len] = '\0';
|
||||
close(fd);
|
||||
iftype = atoi(buf);
|
||||
|
||||
ifindex_post = if_nametoindex(ifname_post);
|
||||
if (!ifindex_post) {
|
||||
fprintf(stderr, "Couldn't find interface '%s'\n", ifname_post);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (argc == 4 && strcmp(argv[3], "--unload") == 0) {
|
||||
int _err;
|
||||
hook.attach_point = BPF_TC_INGRESS | BPF_TC_EGRESS;
|
||||
hook.ifindex = ifindex_pre;
|
||||
_err = bpf_tc_hook_destroy(&hook);
|
||||
if (_err)
|
||||
fprintf(stderr, "Couldn't remove clsact qdisc on %s\n", ifname_pre);
|
||||
|
||||
hook.ifindex = ifindex_post;
|
||||
err = bpf_tc_hook_destroy(&hook);
|
||||
if (err)
|
||||
fprintf(stderr, "Couldn't remove clsact qdisc on %s\n", ifname_post);
|
||||
else
|
||||
err = _err;
|
||||
return err;
|
||||
} else if (argc > 3) {
|
||||
fprintf(stderr, "Usage: %s <if pre> <if post> [--unload]\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
obj = bpf_object__open(filename);
|
||||
err = libbpf_get_error(obj);
|
||||
if (err) {
|
||||
fprintf(stderr, "Couldn't open file: %s\n", filename);
|
||||
return err;
|
||||
}
|
||||
|
||||
while ((map = bpf_map__next(map, obj))) {
|
||||
if (strstr(bpf_map__name(map), ".rodata")) {
|
||||
int ip_only = (iftype == ARPHRD_NONE);
|
||||
bpf_map__set_initial_value(map, &ip_only, sizeof(ip_only));
|
||||
}
|
||||
}
|
||||
|
||||
err = bpf_object__load(obj);
|
||||
if (err) {
|
||||
fprintf(stderr, "Failed to load object\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
attach_pre.prog_fd = bpf_program__fd(bpf_object__find_program_by_name(obj, "read_dscp"));
|
||||
if (attach_pre.prog_fd < 0) {
|
||||
fprintf(stderr, "Couldn't find program 'read_dscp'\n");
|
||||
err = -ENOENT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
attach_post.prog_fd = bpf_program__fd(bpf_object__find_program_by_name(obj, "write_dscp"));
|
||||
if (attach_post.prog_fd < 0) {
|
||||
fprintf(stderr, "Couldn't find program 'write_dscp'\n");
|
||||
err = -ENOENT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
hook.ifindex = ifindex_pre;
|
||||
err = bpf_tc_hook_create(&hook);
|
||||
if (err && err != -EEXIST) {
|
||||
fprintf(stderr, "Couldn't create hook for ifindex %d\n", ifindex_pre);
|
||||
goto out;
|
||||
}
|
||||
|
||||
err = bpf_tc_attach(&hook, &attach_pre);
|
||||
if (err) {
|
||||
fprintf(stderr, "Couldn't attach program to ifindex %d\n", hook.ifindex);
|
||||
goto out;
|
||||
}
|
||||
|
||||
hook.ifindex = ifindex_post;
|
||||
err = bpf_tc_hook_create(&hook);
|
||||
if (err && err != -EEXIST) {
|
||||
fprintf(stderr, "Couldn't create hook for ifindex %d\n", ifindex_post);
|
||||
goto out;
|
||||
}
|
||||
|
||||
err = bpf_tc_attach(&hook, &attach_post);
|
||||
if (err) {
|
||||
fprintf(stderr, "Couldn't attach program to ifindex %d\n", hook.ifindex);
|
||||
goto out;
|
||||
}
|
||||
|
||||
out:
|
||||
bpf_object__close(obj);
|
||||
return err;
|
||||
}
|
||||
@@ -0,0 +1,122 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
#include <linux/bpf.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <xdp/parsing_helpers.h>
|
||||
#include <linux/pkt_cls.h>
|
||||
|
||||
/* We use an LRU map to avoid having to do cleanup: We will remove the matching
|
||||
* entry in the map if a packet does not have a DSCP value, but we won't
|
||||
* otherwise clean up stale entries. Instead, we just rely on the LRU mechanism
|
||||
* to evict old entries as the map fills up.
|
||||
*/
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_LRU_HASH);
|
||||
__type(key, __u32);
|
||||
__type(value, __u8);
|
||||
__uint(max_entries, 16384);
|
||||
} flow_dscps SEC(".maps");
|
||||
|
||||
/* Non-zero when packets on the 'pre' interface start directly with the IP
 * header (no Ethernet header); get_dscp() skips Ethernet parsing in that case.
 */
static const volatile int ip_only = 0;
|
||||
|
||||
static __u8 get_dscp(struct __sk_buff *skb)
|
||||
{
|
||||
void *data_end = (void *)(unsigned long long)skb->data_end;
|
||||
void *data = (void *)(unsigned long long)skb->data;
|
||||
struct hdr_cursor nh = { .pos = data };
|
||||
|
||||
struct ipv6hdr *ipv6hdr;
|
||||
struct iphdr *iphdr;
|
||||
struct ethhdr *eth;
|
||||
int eth_type;
|
||||
|
||||
if (!ip_only) {
|
||||
|
||||
eth_type = parse_ethhdr(&nh, data_end, ð);
|
||||
if (eth_type != bpf_htons(ETH_P_IP) &&
|
||||
eth_type != bpf_htons(ETH_P_IPV6))
|
||||
return 0;
|
||||
}
|
||||
if (parse_iphdr(&nh, data_end, &iphdr) > 0)
|
||||
return iphdr->tos >> 2;
|
||||
|
||||
else if (parse_ip6hdr(&nh, data_end, &ipv6hdr) > 0)
|
||||
return bpf_ntohs(*(__u16 *)ipv6hdr) >> 4;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Rewrite the TOS byte of @iph and patch the header checksum to match.
 *
 * Bits set in @mask are kept from the current TOS byte; @value is OR'ed in.
 * The checksum is adjusted in place by adding the old TOS byte and
 * subtracting the new one instead of being recomputed over the header.
 */
static inline void ipv4_change_dsfield(struct iphdr *iph, __u8 mask, __u8 value)
{
	__u8 new_tos = (iph->tos & mask) | value;
	__u32 csum = bpf_ntohs(iph->check);

	csum += iph->tos;
	if ((csum + 1) >> 16)
		csum = (csum + 1) & 0xffff;
	csum -= new_tos;
	csum += csum >> 16; /* adjust carry */

	iph->check = bpf_htons(csum);
	iph->tos = new_tos;
}
|
||||
|
||||
/*
 * Rewrite the traffic class bits in the first 16 bits of @ipv6h.
 *
 * Bits set in @mask are kept from the current traffic class; @value is OR'ed
 * in. The 0xf00f part of the keep-mask preserves the other two nibbles that
 * share the same 16-bit word.
 */
static inline void ipv6_change_dsfield(struct ipv6hdr *ipv6h, __u8 mask,
				       __u8 value)
{
	__u16 *first_word = (__u16 *)ipv6h;
	__u16 keep_bits = bpf_htons(((__u16)mask << 4) | 0xf00f);
	__u16 new_bits = bpf_htons((__u16)value << 4);

	*first_word = (*first_word & keep_bits) | new_bits;
}
|
||||
|
||||
#define INET_ECN_MASK 3
|
||||
|
||||
static void set_dscp(struct __sk_buff *skb, __u8 dscp)
|
||||
{
|
||||
void *data_end = (void *)(unsigned long long)skb->data_end;
|
||||
void *data = (void *)(unsigned long long)skb->data;
|
||||
struct hdr_cursor nh = { .pos = data };
|
||||
|
||||
struct ipv6hdr *ipv6hdr;
|
||||
struct iphdr *iphdr;
|
||||
struct ethhdr *eth;
|
||||
int eth_type;
|
||||
|
||||
eth_type = parse_ethhdr(&nh, data_end, ð);
|
||||
|
||||
if (eth_type == bpf_htons(ETH_P_IP) &&
|
||||
parse_iphdr(&nh, data_end, &iphdr) > 0)
|
||||
ipv4_change_dsfield(iphdr, INET_ECN_MASK, dscp << 2);
|
||||
|
||||
else if (eth_type == bpf_htons(ETH_P_IPV6) &&
|
||||
parse_ip6hdr(&nh, data_end, &ipv6hdr) > 0)
|
||||
ipv6_change_dsfield(ipv6hdr, INET_ECN_MASK, dscp << 2);
|
||||
}
|
||||
|
||||
/*
 * TC filter for the 'pre' interface: remember the DSCP value of the packet
 * under its skb hash, or drop the stored entry when the packet has none.
 * The packet itself is always passed on unchanged.
 */
SEC("classifier/read")
int read_dscp(struct __sk_buff *skb)
{
	__u32 flow_hash = bpf_get_hash_recalc(skb);
	__u8 mark = get_dscp(skb);

	if (!mark) {
		bpf_map_delete_elem(&flow_dscps, &flow_hash);
		return TC_ACT_OK;
	}

	bpf_map_update_elem(&flow_dscps, &flow_hash, &mark, BPF_ANY);
	return TC_ACT_OK;
}
|
||||
|
||||
/*
 * TC filter for the 'post' interface: look up the skb hash in the flow map
 * and, if a DSCP value was stored for it, write that value into the packet.
 * The packet is always passed on.
 */
SEC("classifier/write")
int write_dscp(struct __sk_buff *skb)
{
	__u32 flow_hash = skb->hash;
	__u8 *mark = bpf_map_lookup_elem(&flow_dscps, &flow_hash);

	if (!mark)
		return TC_ACT_OK;

	set_dscp(skb, *mark);
	return TC_ACT_OK;
}
|
||||
|
||||
/* License string placed in the "license" section of the BPF object. */
char _license[] SEC("license") = "GPL";
|
||||
Reference in New Issue
Block a user