From 7bb3c6ac91d37fa09ecd4f7a20ee5835ccd2a789 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 17 Jun 2021 17:02:08 +0200 Subject: [PATCH] Add preserve-dscp example for preserving a DSCP mark over encapsulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a fun example showing how to use BPF to preserve DSCP values across an encapsulating interface, such as Wireguard. It relies on the encapsulation layer preserving the skb->hash value across the encapsulation, which is commonly the case on kernel encapsulation protocols (including Wireguard), and uses a pair of TC BPF programs and a map to re-match the packets after encapsulation and add back the DSCP value. Signed-off-by: Toke Høiland-Jørgensen --- preserve-dscp/.gitignore | 1 + preserve-dscp/Makefile | 8 ++ preserve-dscp/README.org | 47 ++++++++++ preserve-dscp/preserve-dscp.c | 143 +++++++++++++++++++++++++++++ preserve-dscp/preserve_dscp_kern.c | 122 ++++++++++++++++++++++++ 5 files changed, 321 insertions(+) create mode 100644 preserve-dscp/.gitignore create mode 100644 preserve-dscp/Makefile create mode 100644 preserve-dscp/README.org create mode 100644 preserve-dscp/preserve-dscp.c create mode 100644 preserve-dscp/preserve_dscp_kern.c diff --git a/preserve-dscp/.gitignore b/preserve-dscp/.gitignore new file mode 100644 index 0000000..fea5779 --- /dev/null +++ b/preserve-dscp/.gitignore @@ -0,0 +1 @@ +preserve-dscp diff --git a/preserve-dscp/Makefile b/preserve-dscp/Makefile new file mode 100644 index 0000000..7710dbe --- /dev/null +++ b/preserve-dscp/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) + +USER_TARGETS += preserve-dscp +BPF_TARGETS += preserve_dscp_kern + +LIB_DIR = ../lib + +include $(LIB_DIR)/common.mk diff --git a/preserve-dscp/README.org b/preserve-dscp/README.org new file mode 100644 index 0000000..36c6ba2 --- /dev/null +++ b/preserve-dscp/README.org @@ -0,0 +1,47 @@ +* 
DSCP-preserving TC filters + +This example shows how to use BPF to preserve DSCP values across an +encapsulating interface such as Wireguard. It relies on the encapsulation layer +preserving the skb->hash value across the encapsulation, which is commonly the +case on kernel encapsulation protocols (including Wireguard). + +The example contains two filters: one that parses the packet header and reads +the DSCP value out of the packet, then stores that value in a map keyed on the +skb->hash value (which is calculated if it isn't already set). And the second TC +filter reads back the skb->hash value, looks it up in the map, and if found +rewrites the packet DSCP based on that value. + +The idea is that the first filter is run on the internal (encapsulating) +interface, and the second is run on the physical interface that transmits the +encapsulated packet. To install the filters, run the userspace component like: + +=sudo ./preserve-dscp <pre-ifname> <post-ifname>= + +To unload the filters again, run: +=sudo ./preserve-dscp <pre-ifname> <post-ifname> --unload= + +Note that unloading will remove the clsact qdisc from the interfaces entirely, +so don't run this if you want to preserve that; instead manually remove the +filters using =tc=. + +** Caveats +There are a couple of caveats to this approach: + +- As mentioned above, this only works for encapsulation protocols that preserve + the SKB hash in the first place. + +- The userspace program will try to detect if the =pre= interface has an + Ethernet header by checking if the interface has a type of =ARPHRD_NONE=, and + if so will assume the packet starts with the IP header. If this heuristic + turns out to be wrong, the filter will fail. 
+
+- There is no sanity checking on the outer filter that the packets actually
+  come from the interface that we ran the =pre= filter on in the first place;
+  there is no general way to check this from BPF, but the =write_dscp= filter can
+  be amended to do some other sanity checks on the packet before modifying it
+  (such as checking port numbers).
+
+- Since this relies on =skb->hash=, it is flow-based; if individual packets in
+  the same flow have different marks, which ones will be preserved is racy.
+
+
diff --git a/preserve-dscp/preserve-dscp.c b/preserve-dscp/preserve-dscp.c
new file mode 100644
index 0000000..9192d12
--- /dev/null
+++ b/preserve-dscp/preserve-dscp.c
@@ -0,0 +1,164 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * NOTE(review): the header names were lost from this copy of the patch (only
+ * bare "#include" tokens remained); the list below is reconstructed from the
+ * identifiers the code uses and should be confirmed against the upstream tree.
+ */
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <net/if.h>
+#include <linux/if_arp.h>
+
+#include <bpf/libbpf.h>
+
+/*
+ * Attach the read_dscp and write_dscp programs from preserve_dscp_kern.o to the
+ * TC egress hooks of the "pre" (argv[1]) and "post" (argv[2]) interfaces, or,
+ * when argv[3] is "--unload", destroy the clsact hooks on both interfaces.
+ *
+ * Returns 0 on success, 1 on usage or interface lookup errors, and otherwise
+ * the error code of the step that failed.
+ */
+int main(int argc, char *argv[])
+{
+	const char *filename = "preserve_dscp_kern.o";
+	char *ifname_pre, *ifname_post;
+	int ifindex_pre, ifindex_post;
+	struct bpf_map *map = NULL;
+	int err = 0, fd, iftype;
+	struct bpf_object *obj;
+	char buf[100];
+	ssize_t len;
+	DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, .attach_point = BPF_TC_EGRESS);
+	DECLARE_LIBBPF_OPTS(bpf_tc_opts, attach_pre);
+	DECLARE_LIBBPF_OPTS(bpf_tc_opts, attach_post);
+
+	if (argc < 3) {
+		fprintf(stderr, "Usage: %s <pre-ifname> <post-ifname> [--unload]\n", argv[0]);
+		return 1;
+	}
+	ifname_pre = argv[1];
+	ifname_post = argv[2];
+
+	ifindex_pre = if_nametoindex(ifname_pre);
+	if (!ifindex_pre) {
+		fprintf(stderr, "Couldn't find interface '%s'\n", ifname_pre);
+		return 1;
+	}
+
+	/* Get type of interface to know if it has ethernet headers */
+	snprintf(buf, sizeof(buf), "/sys/class/net/%s/type", ifname_pre);
+	fd = open(buf, O_RDONLY);
+	if (fd < 0) {
+		fprintf(stderr, "Couldn't get interface type for '%s'\n", ifname_pre);
+		return 1;
+	}
+	/* Read at most sizeof(buf) - 1 bytes so the terminating NUL always fits */
+	len = read(fd, buf, sizeof(buf) - 1);
+	close(fd);
+	if (len < 0) {
+		fprintf(stderr, "Couldn't get interface type for '%s'\n", ifname_pre);
+		return 1;
+	}
+	buf[len] = '\0';
+	iftype = atoi(buf);
+
+	ifindex_post = if_nametoindex(ifname_post);
+	if (!ifindex_post) {
+		fprintf(stderr, "Couldn't find interface '%s'\n", ifname_post);
+		return 1;
+	}
+
+	if (argc == 4 && strcmp(argv[3], "--unload") == 0) {
+		int err_pre;
+
+		/* Request both attach points so the clsact qdisc itself is removed */
+		hook.attach_point = BPF_TC_INGRESS | BPF_TC_EGRESS;
+		hook.ifindex = ifindex_pre;
+		err_pre = bpf_tc_hook_destroy(&hook);
+		if (err_pre)
+			fprintf(stderr, "Couldn't remove clsact qdisc on %s\n", ifname_pre);
+
+		hook.ifindex = ifindex_post;
+		err = bpf_tc_hook_destroy(&hook);
+		if (err)
+			fprintf(stderr, "Couldn't remove clsact qdisc on %s\n", ifname_post);
+		else
+			err = err_pre;
+		return err;
+	} else if (argc > 3) {
+		fprintf(stderr, "Usage: %s <pre-ifname> <post-ifname> [--unload]\n", argv[0]);
+		return 1;
+	}
+
+	obj = bpf_object__open(filename);
+	err = libbpf_get_error(obj);
+	if (err) {
+		fprintf(stderr, "Couldn't open file: %s\n", filename);
+		return err;
+	}
+
+	/* Tell the BPF side (ip_only in .rodata) whether to skip Ethernet parsing */
+	while ((map = bpf_map__next(map, obj))) {
+		if (strstr(bpf_map__name(map), ".rodata")) {
+			int ip_only = (iftype == ARPHRD_NONE);
+
+			bpf_map__set_initial_value(map, &ip_only, sizeof(ip_only));
+		}
+	}
+
+	err = bpf_object__load(obj);
+	if (err) {
+		fprintf(stderr, "Failed to load object\n");
+		goto out;
+	}
+
+	attach_pre.prog_fd = bpf_program__fd(bpf_object__find_program_by_name(obj, "read_dscp"));
+	if (attach_pre.prog_fd < 0) {
+		fprintf(stderr, "Couldn't find program 'read_dscp'\n");
+		err = -ENOENT;
+		goto out;
+	}
+
+	attach_post.prog_fd = bpf_program__fd(bpf_object__find_program_by_name(obj, "write_dscp"));
+	if (attach_post.prog_fd < 0) {
+		fprintf(stderr, "Couldn't find program 'write_dscp'\n");
+		err = -ENOENT;
+		goto out;
+	}
+
+	hook.ifindex = ifindex_pre;
+	err = bpf_tc_hook_create(&hook);
+	if (err && err != -EEXIST) {
+		fprintf(stderr, "Couldn't create hook for ifindex %d\n", ifindex_pre);
+		goto out;
+	}
+
+	err = bpf_tc_attach(&hook, &attach_pre);
+	if (err) {
+		fprintf(stderr, "Couldn't attach program to ifindex %d\n", hook.ifindex);
+		goto out;
+	}
+
+	hook.ifindex = ifindex_post;
+	err = bpf_tc_hook_create(&hook);
+	if (err && err != -EEXIST) {
+		fprintf(stderr, "Couldn't create hook for ifindex %d\n", ifindex_post);
+		goto out;
+	}
+
+	err = bpf_tc_attach(&hook, &attach_post);
+	if (err) {
+		fprintf(stderr, "Couldn't attach program to ifindex %d\n", hook.ifindex);
+		goto out;
+	}
+
+out:
+	bpf_object__close(obj);
+	return err;
+}
diff --git a/preserve-dscp/preserve_dscp_kern.c b/preserve-dscp/preserve_dscp_kern.c
new file mode 100644
index 0000000..24120cb
--- /dev/null
+++ b/preserve-dscp/preserve_dscp_kern.c
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * NOTE(review): the header names were lost from this copy of the patch (only
+ * bare "#include" tokens remained); the list below is reconstructed from the
+ * identifiers the code uses and should be confirmed against the upstream tree.
+ */
+#include <linux/bpf.h>
+#include <linux/pkt_cls.h>
+#include <bpf/bpf_helpers.h>
+#include <xdp/parsing_helpers.h>
+
+/* We use an LRU map to avoid having to do cleanup: We will remove the matching
+ * entry in the map if a packet does not have a DSCP value, but we won't
+ * otherwise clean up stale entries. Instead, we just rely on the LRU mechanism
+ * to evict old entries as the map fills up.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_LRU_HASH);
+	__type(key, __u32);
+	__type(value, __u8);
+	__uint(max_entries, 16384);
+} flow_dscps SEC(".maps");
+
+/* Set by the loader before load: non-zero when packets on the "pre" interface
+ * start directly with the IP header (no Ethernet header).
+ */
+static const volatile int ip_only = 0;
+
+/* Return the DSCP codepoint (upper six bits of the IPv4 TOS / IPv6 traffic
+ * class byte) of the packet, or 0 if the packet could not be parsed as IP.
+ *
+ * NOTE(review): the IPv4 parser is always tried first, so this relies on
+ * parse_iphdr() rejecting IPv6 packets - confirm in the helper.
+ */
+static __u8 get_dscp(struct __sk_buff *skb)
+{
+	void *data_end = (void *)(unsigned long long)skb->data_end;
+	void *data = (void *)(unsigned long long)skb->data;
+	struct hdr_cursor nh = { .pos = data };
+
+	struct ipv6hdr *ipv6hdr;
+	struct iphdr *iphdr;
+	struct ethhdr *eth;
+	int eth_type;
+
+	if (!ip_only) {
+		eth_type = parse_ethhdr(&nh, data_end, &eth);
+		if (eth_type != bpf_htons(ETH_P_IP) &&
+		    eth_type != bpf_htons(ETH_P_IPV6))
+			return 0;
+	}
+
+	if (parse_iphdr(&nh, data_end, &iphdr) > 0)
+		return iphdr->tos >> 2;
+
+	/* The first 16 bits of an IPv6 header are version (4 bits), traffic
+	 * class (8 bits) and the top of the flow label (4 bits); drop the flow
+	 * label and ECN bits and mask off the version to get the DSCP.
+	 */
+	if (parse_ip6hdr(&nh, data_end, &ipv6hdr) > 0)
+		return (bpf_ntohs(*(__u16 *)ipv6hdr) >> 6) & 0x3f;
+
+	return 0;
+}
+
+/* Replace the bits of the IPv4 TOS byte not covered by @mask with @value and
+ * incrementally update the header checksum to match.
+ */
+static inline void ipv4_change_dsfield(struct iphdr *iph, __u8 mask, __u8 value)
+{
+	__u32 check = bpf_ntohs(iph->check);
+	__u8 dsfield;
+
+	dsfield = (iph->tos & mask) | value;
+	check += iph->tos;
+	if ((check+1) >> 16)
+		check = (check+1) & 0xffff;
+	check -= dsfield;
+	check += check >> 16; /* adjust carry */
+	iph->check = bpf_htons(check);
+	iph->tos = dsfield;
+}
+
+/* Replace the bits of the IPv6 traffic class not covered by @mask with @value;
+ * the version and flow label bits sharing the first 16 bits are preserved.
+ */
+static inline void ipv6_change_dsfield(struct ipv6hdr *ipv6h, __u8 mask,
+				       __u8 value)
+{
+	__u16 *p = (__u16 *)ipv6h;
+
+	*p = (*p & bpf_htons((((__u16)mask << 4) | 0xf00f))) | bpf_htons((__u16)value << 4);
+}
+
+#define INET_ECN_MASK 3
+
+/* Write @dscp into the IP header following the Ethernet header of the packet,
+ * leaving the two ECN bits untouched. Packets that are not IPv4 or IPv6 are
+ * left unmodified.
+ */
+static void set_dscp(struct __sk_buff *skb, __u8 dscp)
+{
+	void *data_end = (void *)(unsigned long long)skb->data_end;
+	void *data = (void *)(unsigned long long)skb->data;
+	struct hdr_cursor nh = { .pos = data };
+
+	struct ipv6hdr *ipv6hdr;
+	struct iphdr *iphdr;
+	struct ethhdr *eth;
+	int eth_type;
+
+	eth_type = parse_ethhdr(&nh, data_end, &eth);
+
+	if (eth_type == bpf_htons(ETH_P_IP) &&
+	    parse_iphdr(&nh, data_end, &iphdr) > 0)
+		ipv4_change_dsfield(iphdr, INET_ECN_MASK, dscp << 2);
+
+	else if (eth_type == bpf_htons(ETH_P_IPV6) &&
+		 parse_ip6hdr(&nh, data_end, &ipv6hdr) > 0)
+		ipv6_change_dsfield(ipv6hdr, INET_ECN_MASK, dscp << 2);
+}
+
+/* Egress filter for the inner interface: remember the packet's DSCP value
+ * under its skb hash, or forget the flow if the packet carries no DSCP.
+ */
+SEC("classifier/read")
+int read_dscp(struct __sk_buff *skb)
+{
+	__u32 key = bpf_get_hash_recalc(skb);
+	__u8 dscp;
+
+	dscp = get_dscp(skb);
+	if (dscp)
+		bpf_map_update_elem(&flow_dscps, &key, &dscp, BPF_ANY);
+	else
+		bpf_map_delete_elem(&flow_dscps, &key);
+
+	return TC_ACT_OK;
+}
+
+/* Egress filter for the outer interface: if the skb hash matches a stored
+ * flow, rewrite the packet's DSCP to the stored value.
+ */
+SEC("classifier/write")
+int write_dscp(struct __sk_buff *skb)
+{
+	__u32 key = skb->hash;
+	__u8 *dscp;
+
+	dscp = bpf_map_lookup_elem(&flow_dscps, &key);
+	if (dscp)
+		set_dscp(skb, *dscp);
+
+	return TC_ACT_OK;
+}
+
+char _license[] SEC("license") = "GPL";