Add preserve-dscp example for preserving a DSCP mark over encapsulation

This is a fun example showing how to use BPF to preserve DSCP values across
an encapsulating interface, such as Wireguard. It relies on the
encapsulation layer preserving the skb->hash value across the
encapsulation, which is commonly the case on kernel encapsulation
protocols (including Wireguard), and uses a pair of TC BPF programs and a
map to re-match the packets after encapsulation and add back the DSCP
value.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
This commit is contained in:
Toke Høiland-Jørgensen
2021-06-17 17:02:08 +02:00
parent dabfe929ae
commit 7bb3c6ac91
5 changed files with 321 additions and 0 deletions
+1
View File
@@ -0,0 +1 @@
preserve-dscp
+8
View File
@@ -0,0 +1,8 @@
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
USER_TARGETS += preserve-dscp
BPF_TARGETS += preserve_dscp_kern
LIB_DIR = ../lib
include $(LIB_DIR)/common.mk
+47
View File
@@ -0,0 +1,47 @@
* DSCP-preserving TC filters
This example shows how to use BPF to preserve DSCP values across an
encapsulating interface such as Wireguard. It relies on the encapsulation layer
preserving the skb->hash value across the encapsulation, which is commonly the
case on kernel encapsulation protocols (including Wireguard).
The example contains two filters: one that parses the packet header and reads
the DSCP value out of the packet, then stores that value in a map keyed on the
skb->hash value (which is calculated if it isn't already set). And the second TC
filter reads back the skb->hash value, looks it up in the map, and if found
rewrites the packet DSCP based on that value.
The idea is that the first filter is run on the internal (encapsulating)
interface, and the second is run on the physical interface that transmits the
encapsulated packet. To install the filters, run the userspace component like:
=sudo ./preserve-dscp <ifname pre> <ifname post>=
To unload the filters again, run:
=sudo ./preserve-dscp <ifname pre> <ifname post> --unload=
Note that unloading will remove the clsact qdisc from the interfaces entirely,
so don't run this if you want to preserve that; instead manually remove the
filters using =tc=.
** Caveats
There are a couple of caveats to this approach:
- As mentioned above, this only works for encapsulation protocols that preserve
the SKB hash in the first place.
- The userspace program tries to detect whether the =pre= interface lacks an
Ethernet header by checking whether the interface type is =ARPHRD_NONE=; if
it is, the filter assumes that each packet starts directly with the IP header.
If this heuristic turns out to be wrong, the filter will fail.
- There is no sanity checking on the outer filter that the packets actually
come from the interface that we ran the =pre= filter on in the first place;
there is no general way to check this from BPF, but the =write_dscp= filter can
be amended to do some other sanity checks on the packet before modifying it
(such as checking port numbers).
- Since this relies on =skb->hash=, it is flow-based; if individual packets in
the same flow have different marks, which ones will be preserved is racy.
+143
View File
@@ -0,0 +1,143 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <net/if.h>
#include <linux/if_arp.h>
#include <bpf/libbpf.h>
/*
 * Install or remove the DSCP-preserving TC filters.
 *
 * Usage: preserve-dscp <if pre> <if post> [--unload]
 *
 * Without --unload, the BPF object file is opened, its read-only data section
 * is initialised with an "ip_only" flag (set when the type of <if pre> is
 * ARPHRD_NONE), and the "read_dscp" and "write_dscp" programs are attached to
 * the TC egress hook of <if pre> and <if post> respectively.
 *
 * With --unload, the TC hooks on both interfaces are destroyed.
 *
 * Returns 0 on success, 1 on usage or interface lookup errors, and otherwise
 * the error code produced by the failing libbpf call.
 */
int main(int argc, char *argv[])
{
	const char *filename = "preserve_dscp_kern.o";
	char *ifname_pre, *ifname_post;
	int ifindex_pre, ifindex_post;
	struct bpf_map *map = NULL;
	int err = 0, fd, iftype;
	struct bpf_object *obj;
	char buf[100];
	ssize_t len;

	DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, .attach_point = BPF_TC_EGRESS);
	DECLARE_LIBBPF_OPTS(bpf_tc_opts, attach_pre);
	DECLARE_LIBBPF_OPTS(bpf_tc_opts, attach_post);

	if (argc < 3) {
		fprintf(stderr, "Usage: %s <if pre> <if post> [--unload]\n", argv[0]);
		return 1;
	}

	ifname_pre = argv[1];
	ifname_post = argv[2];

	ifindex_pre = if_nametoindex(ifname_pre);
	if (!ifindex_pre) {
		fprintf(stderr, "Couldn't find interface '%s'\n", ifname_pre);
		return 1;
	}

	/* Get type of interface to know if it has ethernet headers */
	snprintf(buf, sizeof(buf), "/sys/class/net/%s/type", ifname_pre);
	fd = open(buf, O_RDONLY);
	if (fd < 0) {
		fprintf(stderr, "Couldn't get interface type for '%s'\n", ifname_pre);
		return 1;
	}

	/* Read at most sizeof(buf) - 1 bytes so the terminator written below
	 * always lands inside the buffer, and close the descriptor on every
	 * path, including a failed read.
	 */
	len = read(fd, buf, sizeof(buf) - 1);
	close(fd);
	if (len < 0) {
		fprintf(stderr, "Couldn't get interface type for '%s'\n", ifname_pre);
		return 1;
	}
	buf[len] = '\0';
	iftype = atoi(buf);

	ifindex_post = if_nametoindex(ifname_post);
	if (!ifindex_post) {
		fprintf(stderr, "Couldn't find interface '%s'\n", ifname_post);
		return 1;
	}

	if (argc == 4 && strcmp(argv[3], "--unload") == 0) {
		int _err;

		/* Destroy the hook on both interfaces even if the first
		 * removal fails; report the error of the second removal if
		 * there is one, otherwise that of the first.
		 */
		hook.attach_point = BPF_TC_INGRESS | BPF_TC_EGRESS;
		hook.ifindex = ifindex_pre;
		_err = bpf_tc_hook_destroy(&hook);
		if (_err)
			fprintf(stderr, "Couldn't remove clsact qdisc on %s\n", ifname_pre);

		hook.ifindex = ifindex_post;
		err = bpf_tc_hook_destroy(&hook);
		if (err)
			fprintf(stderr, "Couldn't remove clsact qdisc on %s\n", ifname_post);
		else
			err = _err;

		return err;
	} else if (argc > 3) {
		fprintf(stderr, "Usage: %s <if pre> <if post> [--unload]\n", argv[0]);
		return 1;
	}

	obj = bpf_object__open(filename);
	err = libbpf_get_error(obj);
	if (err) {
		fprintf(stderr, "Couldn't open file: %s\n", filename);
		return err;
	}

	/* Set the initial contents of the object's .rodata map(s) to the
	 * ip_only flag before loading, so the BPF side knows whether packets
	 * on the pre interface start with an IP header.
	 */
	while ((map = bpf_map__next(map, obj))) {
		if (strstr(bpf_map__name(map), ".rodata")) {
			int ip_only = (iftype == ARPHRD_NONE);

			bpf_map__set_initial_value(map, &ip_only, sizeof(ip_only));
		}
	}

	err = bpf_object__load(obj);
	if (err) {
		fprintf(stderr, "Failed to load object\n");
		goto out;
	}

	attach_pre.prog_fd = bpf_program__fd(bpf_object__find_program_by_name(obj, "read_dscp"));
	if (attach_pre.prog_fd < 0) {
		fprintf(stderr, "Couldn't find program 'read_dscp'\n");
		err = -ENOENT;
		goto out;
	}

	attach_post.prog_fd = bpf_program__fd(bpf_object__find_program_by_name(obj, "write_dscp"));
	if (attach_post.prog_fd < 0) {
		fprintf(stderr, "Couldn't find program 'write_dscp'\n");
		err = -ENOENT;
		goto out;
	}

	/* An already existing hook (-EEXIST) is not an error; just attach. */
	hook.ifindex = ifindex_pre;
	err = bpf_tc_hook_create(&hook);
	if (err && err != -EEXIST) {
		fprintf(stderr, "Couldn't create hook for ifindex %d\n", ifindex_pre);
		goto out;
	}

	err = bpf_tc_attach(&hook, &attach_pre);
	if (err) {
		fprintf(stderr, "Couldn't attach program to ifindex %d\n", hook.ifindex);
		goto out;
	}

	hook.ifindex = ifindex_post;
	err = bpf_tc_hook_create(&hook);
	if (err && err != -EEXIST) {
		fprintf(stderr, "Couldn't create hook for ifindex %d\n", ifindex_post);
		goto out;
	}

	err = bpf_tc_attach(&hook, &attach_post);
	if (err) {
		fprintf(stderr, "Couldn't attach program to ifindex %d\n", hook.ifindex);
		goto out;
	}

out:
	bpf_object__close(obj);
	return err;
}
+122
View File
@@ -0,0 +1,122 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <xdp/parsing_helpers.h>
#include <linux/pkt_cls.h>
/* Map from flow hash to DSCP value, shared between the two TC programs in this
 * file: read_dscp stores the value returned by get_dscp() under the packet's
 * skb hash, and write_dscp looks the hash up again to restore the mark.
 *
 * We use an LRU map to avoid having to do cleanup: We will remove the matching
 * entry in the map if a packet does not have a DSCP value, but we won't
 * otherwise clean up stale entries. Instead, we just rely on the LRU mechanism
 * to evict old entries as the map fills up.
 */
struct {
__uint(type, BPF_MAP_TYPE_LRU_HASH);
__type(key, __u32); /* skb hash of the flow */
__type(value, __u8); /* DSCP value recorded for that flow */
__uint(max_entries, 16384);
} flow_dscps SEC(".maps");
/* Non-zero when packets seen by get_dscp() start directly with the IP header
 * (no Ethernet header). Defaults to 0; the userspace loader overwrites the
 * initial value of the read-only data section before the object is loaded.
 */
static const volatile int ip_only = 0;
/* Extract the 6-bit DSCP codepoint from the IP header of the packet in skb.
 *
 * When ip_only is zero the packet is first parsed as Ethernet and anything
 * that is not IPv4 or IPv6 yields 0. Otherwise parsing starts directly at
 * the IP header.
 *
 * Returns the DSCP value (0-63), or 0 if no IPv4/IPv6 header could be parsed.
 */
static __u8 get_dscp(struct __sk_buff *skb)
{
	void *data_end = (void *)(unsigned long long)skb->data_end;
	void *data = (void *)(unsigned long long)skb->data;
	struct hdr_cursor nh = { .pos = data };
	struct ipv6hdr *ipv6hdr;
	struct iphdr *iphdr;
	struct ethhdr *eth;
	int eth_type;

	if (!ip_only) {
		eth_type = parse_ethhdr(&nh, data_end, &eth);
		if (eth_type != bpf_htons(ETH_P_IP) &&
		    eth_type != bpf_htons(ETH_P_IPV6))
			return 0;
	}

	/* Without an Ethernet header there is no ethertype to dispatch on, so
	 * simply try IPv4 first and fall back to IPv6.
	 */
	if (parse_iphdr(&nh, data_end, &iphdr) > 0)
		/* DSCP is the upper six bits of the TOS byte */
		return iphdr->tos >> 2;
	else if (parse_ip6hdr(&nh, data_end, &ipv6hdr) > 0)
		/* The first 16 bits of the IPv6 header are version (4 bits),
		 * traffic class (8 bits) and the top 4 bits of the flow label.
		 * Shifting right by 4 exposes the traffic class; a further
		 * shift by 2 drops the ECN bits so that only the DSCP is
		 * returned, matching the IPv4 branch above.
		 */
		return (bpf_ntohs(*(__u16 *)ipv6hdr) >> 6) & 0x3f;

	return 0;
}
/* Rewrite the TOS byte of an IPv4 header and patch the header checksum to
 * match.
 *
 * @iph:   IPv4 header to modify in place
 * @mask:  bits of the current TOS byte to keep
 * @value: bits to OR into the masked TOS byte
 */
static inline void ipv4_change_dsfield(struct iphdr *iph, __u8 mask, __u8 value)
{
	__u8 old_tos = iph->tos;
	__u8 new_tos = (old_tos & mask) | value;
	__u32 csum = bpf_ntohs(iph->check);

	/* Add the old TOS value to the stored checksum, folding the result
	 * back into 16 bits when it is about to overflow ...
	 */
	csum += old_tos;
	if ((csum + 1) >> 16)
		csum = (csum + 1) & 0xffff;

	/* ... then subtract the new TOS value and fold any carry/borrow. */
	csum -= new_tos;
	csum += csum >> 16;

	iph->check = bpf_htons(csum);
	iph->tos = new_tos;
}
/* Rewrite the traffic class bits in the first 16-bit word of an IPv6 header.
 *
 * @ipv6h: IPv6 header to modify in place
 * @mask:  bits of the current traffic class to keep
 * @value: bits to OR into the masked traffic class
 *
 * The traffic class sits in bits 4..11 of the first (network order) 16-bit
 * word; the 0xf00f part of the mask leaves the surrounding nibbles untouched.
 */
static inline void ipv6_change_dsfield(struct ipv6hdr *ipv6h, __u8 mask,
				       __u8 value)
{
	__u16 *first_word = (__u16 *)ipv6h;
	__u16 keep_bits = bpf_htons((((__u16)mask << 4) | 0xf00f));
	__u16 new_bits = bpf_htons((__u16)value << 4);

	*first_word = (*first_word & keep_bits) | new_bits;
}
/* Low two bits of the TOS / traffic class byte (the ECN field) */
#define INET_ECN_MASK 3

/* Write a DSCP value into the IP header of the packet in skb.
 *
 * The packet is parsed as Ethernet followed by IPv4 or IPv6. The bits selected
 * by INET_ECN_MASK are kept and the remaining bits of the DS field are
 * replaced with dscp << 2. Packets that do not parse as either protocol are
 * left untouched.
 */
static void set_dscp(struct __sk_buff *skb, __u8 dscp)
{
	void *data_end = (void *)(unsigned long long)skb->data_end;
	void *data = (void *)(unsigned long long)skb->data;
	struct hdr_cursor nh = { .pos = data };
	struct ipv6hdr *ipv6hdr;
	struct iphdr *iphdr;
	struct ethhdr *eth;
	int eth_type;

	eth_type = parse_ethhdr(&nh, data_end, &eth);

	if (eth_type == bpf_htons(ETH_P_IP)) {
		if (parse_iphdr(&nh, data_end, &iphdr) > 0)
			ipv4_change_dsfield(iphdr, INET_ECN_MASK, dscp << 2);
	} else if (eth_type == bpf_htons(ETH_P_IPV6)) {
		if (parse_ip6hdr(&nh, data_end, &ipv6hdr) > 0)
			ipv6_change_dsfield(ipv6hdr, INET_ECN_MASK, dscp << 2);
	}
}
/* TC program for the pre-encapsulation interface: record the DSCP value of the
 * packet in flow_dscps under its skb hash (computing the hash if needed), or
 * drop the map entry for that hash when the packet carries no DSCP value.
 * The packet itself is always passed on unmodified.
 */
SEC("classifier/read")
int read_dscp(struct __sk_buff *skb)
{
	__u32 flow_hash = bpf_get_hash_recalc(skb);
	__u8 mark = get_dscp(skb);

	if (!mark) {
		bpf_map_delete_elem(&flow_dscps, &flow_hash);
		return TC_ACT_OK;
	}

	bpf_map_update_elem(&flow_dscps, &flow_hash, &mark, BPF_ANY);
	return TC_ACT_OK;
}
/* TC program for the post-encapsulation interface: look up the packet's skb
 * hash in flow_dscps and, if an entry exists, write the stored DSCP value into
 * the packet's IP header. The packet is always passed on.
 */
SEC("classifier/write")
int write_dscp(struct __sk_buff *skb)
{
	__u32 flow_hash = skb->hash;
	__u8 *mark = bpf_map_lookup_elem(&flow_dscps, &flow_hash);

	if (!mark)
		return TC_ACT_OK;

	set_dscp(skb, *mark);
	return TC_ACT_OK;
}
/* License string for the BPF object, emitted into the "license" ELF section */
char _license[] SEC("license") = "GPL";