From ede42703209e99d888d4660b8a1dc41d50094b9a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?=
Date: Wed, 29 Sep 2021 01:46:09 +0200
Subject: [PATCH] nat64-bpf: Initial version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds an initial version of a NAT64 translator in BPF. It compiles and
loads, but doesn't actually appear to work yet.

Signed-off-by: Toke Høiland-Jørgensen
---
 nat64-bpf/.gitignore   |   2 +
 nat64-bpf/Makefile     |  12 ++
 nat64-bpf/README.org   |  16 +++
 nat64-bpf/nat64.c      | 286 ++++++++++++++++++++++++++++++++++++++
 nat64-bpf/nat64.h      |  20 +++
 nat64-bpf/nat64_kern.c | 306 +++++++++++++++++++++++++++++++++++++++++
 6 files changed, 642 insertions(+)
 create mode 100644 nat64-bpf/.gitignore
 create mode 100644 nat64-bpf/Makefile
 create mode 100644 nat64-bpf/README.org
 create mode 100644 nat64-bpf/nat64.c
 create mode 100644 nat64-bpf/nat64.h
 create mode 100644 nat64-bpf/nat64_kern.c

diff --git a/nat64-bpf/.gitignore b/nat64-bpf/.gitignore
new file mode 100644
index 0000000..07ed9dd
--- /dev/null
+++ b/nat64-bpf/.gitignore
@@ -0,0 +1,2 @@
+nat64
+
diff --git a/nat64-bpf/Makefile b/nat64-bpf/Makefile
new file mode 100644
index 0000000..9fad6d3
--- /dev/null
+++ b/nat64-bpf/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+
+USER_TARGETS := nat64
+BPF_TARGETS := nat64_kern
+BPF_SKEL_OBJ := nat64_kern.o
+
+#LDLIBS += -pthread
+EXTRA_DEPS += nat64.h
+
+LIB_DIR = ../lib
+
+include $(LIB_DIR)/common.mk
diff --git a/nat64-bpf/README.org b/nat64-bpf/README.org
new file mode 100644
index 0000000..a8c8d86
--- /dev/null
+++ b/nat64-bpf/README.org
@@ -0,0 +1,16 @@
+* NAT64 BPF implementation
+
+This directory contains an implementation of stateless NAT64
+translation, like that performed by Tayga, but entirely in BPF.
+
+Design:
+
+- Global v6 /96 prefix defined as NAT64 prefix
+- Each interface is assigned a v4 prefix for mapping v6 addresses
+  - Install onlink v4 route for that prefix to make sure traffic goes out the interface
+
+- Attach ingress and egress BPF programs to each interface
+  - On ingress: match v6 packets with a NAT64 prefix destination; remap to v4
+  - On egress: lookup v4 destination address; if it's in the configured NAT64 prefix, remap back to v6
+
+- Some logic to dynamically assign v4 addresses each time a new v6 source is seen
diff --git a/nat64-bpf/nat64.c b/nat64-bpf/nat64.c
new file mode 100644
index 0000000..3c826bc
--- /dev/null
+++ b/nat64-bpf/nat64.c
@@ -0,0 +1,345 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * NOTE(review): the header names on the #include lines were lost when this
+ * patch was extracted; the list below is reconstructed from the identifiers
+ * this file uses - confirm against the original commit.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <errno.h>
+#include <getopt.h>
+#include <net/if.h>
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "nat64.h"
+#include "nat64_kern.skel.h"
+
+#define NS_PER_SECOND 1000000000UL
+#define NS_PER_MS 1000000UL
+
+static const struct option long_options[] = {
+	{ "help", no_argument, NULL, 'h' },
+	{ "unload", no_argument, NULL, 'u' },
+	{ "interface", required_argument, NULL, 'i' }, // Name of interface to run on
+	{ "allowed-src", required_argument, NULL, 'a' }, // v6 prefix to allow as source
+	{ "v6-prefix", required_argument, NULL, '6' }, // v6 prefix to use for nat64
+	{ "v4-prefix", required_argument, NULL, '4' }, // v4 prefix to use for nat64
+	{ "timeout", required_argument, NULL, 't' }, // Address mapping timeout interval in s
+	{ 0, 0, NULL, 0 }
+};
+
+/* Parsed command line: the config handed to the BPF program plus loader-only state */
+struct nat64_user_config {
+	struct nat64_config c;
+	int ifindex;
+	char ifname[IF_NAMESIZE+1];
+	struct in6_addr v6_allow;
+	__u32 v6_allow_pxlen;
+	bool unload;
+};
+
+/* Key layout used when inserting into the allowed_v6_src LPM trie map */
+struct v6_trie_key {
+	struct bpf_lpm_trie_key t;
+	struct in6_addr addr;
+};
+
+/* Parse a non-negative decimal integer no larger than max; -EINVAL on bad input */
+static int parse_uint(const char *str, int max)
+{
+	char *end;
+	long val;
+
+	errno = 0;
+	val = strtol(str, &end, 10);
+	if (errno || end == str || *end != '\0' || val < 0 || val > max)
+		return -EINVAL;
+	return (int)val;
+}
+
+/*
+ * Parse "addr/len" into v6addr. Returns the prefix length (0-128) or -EINVAL.
+ * The '/' in str is overwritten with a NUL terminator.
+ */
+static int parse_v6_prefix(char *str, struct in6_addr *v6addr)
+{
+	char *net;
+	int pxlen;
+
+	net = strchr(str, '/');
+	if (!net) {
+		fprintf(stderr, "Invalid v6 prefix: %s\n", str);
+		return -EINVAL;
+	}
+	pxlen = parse_uint(net + 1, 128);
+	if (pxlen < 0) {
+		fprintf(stderr, "Invalid v6 prefix length: %s\n", net + 1);
+		return -EINVAL;
+	}
+	*net = '\0';
+	if (inet_pton(AF_INET6, str, v6addr) != 1) {
+		fprintf(stderr, "Invalid v6 addr: %s\n", str);
+		return -EINVAL;
+	}
+	return pxlen;
+}
+
+static void usage(const char *prog)
+{
+	fprintf(stderr,
+		"Usage: %s -i <ifname> -4 <v4 prefix>/<len> [-6 <v6 prefix>/96]\n"
+		"       [-a <allowed v6 src>/<len>] [-t <timeout in s>] [-u] [-h]\n",
+		prog);
+}
+
+/*
+ * Fill *config from the command line. config must be zero-initialised by the
+ * caller. Returns 0 on success or a negative errno; exits directly on -h.
+ */
+static int parse_arguments(int argc, char *argv[], struct nat64_user_config *config)
+{
+	struct in6_addr v6addr;
+	struct in_addr v4addr;
+	int pxlen, seconds;
+	int err, opt;
+	char *net;
+
+	config->ifindex = 0;
+	config->c.timeout_ns = 7200 * NS_PER_SECOND;
+	config->c.next_addr = 1;
+
+	/* Default to special prefix 64:ff9b::/96, i.e. bytes 00 64 ff 9b */
+	config->c.v6_prefix.s6_addr[1] = 0x64;
+	config->c.v6_prefix.s6_addr[2] = 0xff;
+	config->c.v6_prefix.s6_addr[3] = 0x9b;
+
+	while ((opt = getopt_long(argc, argv, "i:6:4:t:a:hu", long_options,
+				  NULL)) != -1) {
+		switch (opt) {
+		case 'i':
+			/* IF_NAMESIZE counts the terminating NUL */
+			if (strlen(optarg) >= IF_NAMESIZE) {
+				fprintf(stderr, "interface name too long\n");
+				return -EINVAL;
+			}
+			snprintf(config->ifname, sizeof(config->ifname), "%s", optarg);
+
+			config->ifindex = if_nametoindex(config->ifname);
+			if (config->ifindex == 0) {
+				err = -errno;
+				/* strerror() wants the positive errno value */
+				fprintf(stderr,
+					"Could not get index of interface %s: %s\n",
+					config->ifname, strerror(-err));
+				return err;
+			}
+			break;
+		case 'a':
+			pxlen = parse_v6_prefix(optarg, &v6addr);
+			if (pxlen < 0)
+				return pxlen;
+			config->v6_allow = v6addr;
+			config->v6_allow_pxlen = pxlen;
+			break;
+		case '6':
+			pxlen = parse_v6_prefix(optarg, &v6addr);
+			if (pxlen < 0)
+				return pxlen;
+			if (pxlen != 96) {
+				fprintf(stderr, "v6 prefix must have pxlen 96\n");
+				return -EINVAL;
+			}
+			if (v6addr.s6_addr32[3]) {
+				fprintf(stderr, "Not a /96 network address: %s\n", optarg);
+				return -EINVAL;
+			}
+			config->c.v6_prefix = v6addr;
+			break;
+		case '4':
+			net = strchr(optarg, '/');
+			if (!net) {
+				fprintf(stderr, "Invalid v4 prefix: %s\n", optarg);
+				return -EINVAL;
+			}
+			/*
+			 * main() sizes the maps as (host mask - 2) entries,
+			 * which would wrap around for a /31, so /30 is the
+			 * longest usable prefix.
+			 */
+			pxlen = parse_uint(net + 1, 32);
+			if (pxlen < 1 || pxlen > 30) {
+				fprintf(stderr, "v4_pxlen must be between 1 and 30\n");
+				return -EINVAL;
+			}
+			*net = '\0';
+			if (inet_pton(AF_INET, optarg, &v4addr) != 1) {
+				fprintf(stderr, "Invalid v4 addr: %s\n", optarg);
+				return -EINVAL;
+			}
+			config->c.v4_mask = 0xFFFFFFFF << (32 - pxlen);
+			config->c.v4_prefix = ntohl(v4addr.s_addr);
+			if (config->c.v4_prefix & ~config->c.v4_mask) {
+				fprintf(stderr, "Not a network address: %s\n", optarg);
+				return -EINVAL;
+			}
+			break;
+		case 't':
+			seconds = parse_uint(optarg, 100000);
+			if (seconds < 1) {
+				fprintf(stderr, "Timeout must be in the interval between 1 and 100000 seconds\n");
+				return -EINVAL;
+			}
+
+			config->c.timeout_ns = (__u64)seconds * NS_PER_SECOND;
+			break;
+		case 'u':
+			config->unload = true;
+			break;
+		case 'h':
+			usage(argv[0]);
+			exit(EXIT_SUCCESS);
+		default:
+			/* getopt_long() has already reported the offending option */
+			usage(argv[0]);
+			return -EINVAL;
+		}
+	}
+
+	if (config->ifindex == 0) {
+		fprintf(stderr,
+			"An interface (-i or --interface) must be provided\n");
+		return -EINVAL;
+	}
+	/* Unloading only needs the interface, not an address pool */
+	if (!config->unload && !config->c.v4_prefix) {
+		fprintf(stderr,
+			"A v4 prefix (-4 or --v4-prefix) must be provided\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Load the NAT64 BPF programs and attach them to the TC ingress and egress
+ * hooks of the chosen interface, or tear the hook down again with -u.
+ */
+int main(int argc, char *argv[])
+{
+	struct v6_trie_key prefix_key = {};
+	struct nat64_user_config cfg = {};
+	struct nat64_kern *obj;
+	unsigned int num_addr;
+	char buf[100];
+	int err = 0;
+	DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, .attach_point = BPF_TC_INGRESS | BPF_TC_EGRESS);
+	DECLARE_LIBBPF_OPTS(bpf_tc_opts, attach_egress);
+	DECLARE_LIBBPF_OPTS(bpf_tc_opts, attach_ingress);
+
+	err = parse_arguments(argc, argv, &cfg);
+	if (err)
+		return EXIT_FAILURE;
+
+	hook.ifindex = cfg.ifindex;
+	if (cfg.unload) {
+		err = bpf_tc_hook_destroy(&hook);
+		if (err)
+			fprintf(stderr, "Couldn't remove clsact qdisc on %s\n", cfg.ifname);
+
+		return err ? EXIT_FAILURE : EXIT_SUCCESS;
+	}
+
+	obj = nat64_kern__open();
+	err = libbpf_get_error(obj);
+	if (err) {
+		libbpf_strerror(err, buf, sizeof(buf));
+		fprintf(stderr, "Couldn't open BPF skeleton: %s\n", buf);
+		return EXIT_FAILURE;
+	}
+
+	/* Pool size: the number of addresses in the v4 prefix minus three */
+	num_addr = (cfg.c.v4_prefix | ~cfg.c.v4_mask) - cfg.c.v4_prefix - 2;
+
+	printf("num addr: %u\n", num_addr);
+
+	obj->bss->config = cfg.c;
+	err = bpf_map__set_max_entries(obj->maps.v6_state_map, num_addr);
+	if (!err)
+		err = bpf_map__set_max_entries(obj->maps.v4_reversemap, num_addr);
+	if (!err)
+		err = bpf_map__set_max_entries(obj->maps.reclaimed_addrs, num_addr);
+	if (err) {
+		libbpf_strerror(err, buf, sizeof(buf));
+		fprintf(stderr, "Couldn't resize address maps: %s\n", buf);
+		goto out;
+	}
+
+	err = nat64_kern__load(obj);
+	if (err) {
+		libbpf_strerror(err, buf, sizeof(buf));
+		fprintf(stderr, "Couldn't load BPF skeleton: %s\n", buf);
+		goto out;
+	}
+
+	if (cfg.v6_allow_pxlen) {
+		__u32 value = 0;
+
+		prefix_key.t.prefixlen = cfg.v6_allow_pxlen;
+		prefix_key.addr = cfg.v6_allow;
+		err = bpf_map_update_elem(bpf_map__fd(obj->maps.allowed_v6_src),
+					  &prefix_key, &value, 0);
+		if (err) {
+			fprintf(stderr, "Couldn't insert allowed prefix\n");
+			goto out;
+		}
+	}
+
+	attach_ingress.prog_fd = bpf_program__fd(obj->progs.nat64_ingress);
+	if (attach_ingress.prog_fd < 0) {
+		fprintf(stderr, "Couldn't find ingress program\n");
+		err = -ENOENT;
+		goto out;
+	}
+
+	attach_egress.prog_fd = bpf_program__fd(obj->progs.nat64_egress);
+	if (attach_egress.prog_fd < 0) {
+		fprintf(stderr, "Couldn't find egress program\n");
+		err = -ENOENT;
+		goto out;
+	}
+
+	/* The clsact qdisc may already exist on the interface; that is fine */
+	err = bpf_tc_hook_create(&hook);
+	if (err && err != -EEXIST) {
+		fprintf(stderr, "Couldn't create ingress hook for ifindex %d\n", cfg.ifindex);
+		goto out;
+	}
+
+	hook.attach_point = BPF_TC_INGRESS;
+	err = bpf_tc_attach(&hook, &attach_ingress);
+	if (err) {
+		fprintf(stderr, "Couldn't attach ingress program to ifindex %d\n",
+			hook.ifindex);
+		goto out;
+	}
+
+	hook.attach_point = BPF_TC_EGRESS;
+	err = bpf_tc_attach(&hook, &attach_egress);
+	if (err) {
+		fprintf(stderr, "Couldn't attach egress program to ifindex %d\n",
+			hook.ifindex);
+		goto out;
+	}
+
+out:
+	nat64_kern__destroy(obj);
+	return err ? EXIT_FAILURE : EXIT_SUCCESS;
+}
diff --git a/nat64-bpf/nat64.h b/nat64-bpf/nat64.h
new file mode 100644
index 0000000..a994f17
--- /dev/null
+++ b/nat64-bpf/nat64.h
@@ -0,0 +1,20 @@
+#ifndef __NAT64_H__
+#define __NAT64_H__
+
+#include <linux/in6.h> /* NOTE(review): header name reconstructed - confirm */
+
+struct nat64_config {
+	struct in6_addr v6_prefix;	/* NAT64 /96 prefix */
+	__u64 timeout_ns;		/* idle time before a mapping may be reclaimed */
+	__u32 v4_prefix;		/* host byte order */
+	__u32 v4_mask;			/* host byte order */
+	__u32 next_addr;		/* offset into v4_prefix of the next address to hand out */
+};
+
+struct v6_addr_state {
+	__u64 last_seen;	/* bpf_ktime_get_ns() value when last refreshed */
+	__u32 v4_addr;		/* assigned v4 address, host byte order */
+	__u32 static_conf;	/* non-zero: mapping is never reclaimed */
+};
+
+#endif
diff --git a/nat64-bpf/nat64_kern.c b/nat64-bpf/nat64_kern.c
new file mode 100644
index 0000000..cd42f61
--- /dev/null
+++ b/nat64-bpf/nat64_kern.c
@@ -0,0 +1,367 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright 2021 Toke Høiland-Jørgensen */
+
+/*
+ * NOTE(review): the header names on the five #include lines below were lost
+ * when this patch was extracted; they are reconstructed from the identifiers
+ * this file uses - confirm against the original commit.
+ */
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <linux/pkt_cls.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include "../include/xdp/parsing_helpers.h"
+#include "nat64.h"
+
+struct nat64_config config;
+
+/* Lookup key for allowed_v6_src: LPM trie keys start with the prefix length */
+struct v6_lpm_key {
+	__u32 prefixlen;
+	struct in6_addr addr;
+};
+
+/* v6 source address -> assigned v4 address and last-seen timestamp */
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, struct in6_addr);
+	__type(value, struct v6_addr_state);
+	__uint(max_entries, 1);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+} v6_state_map SEC(".maps");
+
+/* Assigned v4 address (host byte order) -> v6 address it maps back to */
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, __u32);
+	__type(value, struct in6_addr);
+	__uint(max_entries, 1);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+} v4_reversemap SEC(".maps");
+
+/* v6 source prefixes that are allowed to use the translator */
+struct {
+	__uint(type, BPF_MAP_TYPE_LPM_TRIE);
+	__uint(key_size, sizeof(struct v6_lpm_key));
+	__uint(value_size, sizeof(__u32));
+	__uint(max_entries, 1);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+} allowed_v6_src SEC(".maps");
+
+/* v4 addresses (host byte order) released by expired mappings */
+struct {
+	__uint(type, BPF_MAP_TYPE_QUEUE);
+	__uint(key_size, 0);
+	__uint(value_size, sizeof(__u32));
+	__uint(max_entries, 1);
+} reclaimed_addrs SEC(".maps");
+
+/* Fold a 32-bit one's-complement sum into the inverted 16-bit checksum */
+static __always_inline __u16 csum_fold_helper(__u32 csum)
+{
+	__u32 sum;
+
+	sum = (csum >> 16) + (csum & 0xffff);
+	sum += (sum >> 16);
+	return ~sum;
+}
+
+/*
+ * Egress path: translate an IPv4 packet addressed to one of the assigned pool
+ * addresses back to IPv6. Packets outside the configured v4 prefix pass
+ * through untouched; packets inside it are dropped if they can't be rewritten.
+ *
+ * NOTE(review): only the IP header is rewritten. Transport checksums that
+ * cover a pseudo-header, and ICMP <-> ICMPv6, are not translated here.
+ */
+static int nat64_handle_v4(struct __sk_buff *skb, struct hdr_cursor *nh)
+{
+	void *data_end = (void *)(unsigned long long)skb->data_end;
+	void *data = (void *)(unsigned long long)skb->data;
+
+	int ip_type, iphdr_len, ip_offset;
+	struct in6_addr *dst_v6;
+	int ret = TC_ACT_OK;
+	struct iphdr *iph;
+	__u32 dst_v4;
+
+	struct ipv6hdr dst_hdr = {
+		.version = 6,
+		.saddr = config.v6_prefix,
+	};
+
+	ip_offset = (nh->pos - data) & 0x1fff;
+
+	ip_type = parse_iphdr(nh, data_end, &iph);
+	if (ip_type < 0)
+		goto out;
+
+	dst_v4 = bpf_ntohl(iph->daddr);
+	if ((dst_v4 & config.v4_mask) != config.v4_prefix)
+		goto out;
+
+	/* At this point we know the destination IP is within the configured
+	 * subnet, so if we can't rewrite the packet it should be dropped (so as
+	 * not to leak traffic in that subnet).
+	 */
+	ret = TC_ACT_SHOT;
+
+	iphdr_len = iph->ihl * 4;
+	/* drop packets with IP options */
+	if (iphdr_len != sizeof(struct iphdr))
+		goto out;
+
+	dst_v6 = bpf_map_lookup_elem(&v4_reversemap, &dst_v4);
+	if (!dst_v6)
+		goto out;
+
+	/* embed the v4 source in the last 32 bits of the NAT64 prefix */
+	dst_hdr.saddr.s6_addr32[3] = iph->saddr;
+	dst_hdr.nexthdr = iph->protocol;
+	dst_hdr.hop_limit = iph->ttl;
+	/* tot_len is big-endian on the wire: convert before doing arithmetic */
+	dst_hdr.payload_len = bpf_htons(bpf_ntohs(iph->tot_len) - iphdr_len);
+	__builtin_memcpy(&dst_hdr.daddr, dst_v6, sizeof(struct in6_addr));
+
+	if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0))
+		goto out;
+
+	/* If this fails we already mangled the packet, so need to drop it */
+	if (bpf_skb_store_bytes(skb, ip_offset,
+				&dst_hdr, sizeof(dst_hdr),
+				BPF_F_RECOMPUTE_CSUM))
+		goto out;
+
+	ret = TC_ACT_OK;
+out:
+	return ret;
+}
+
+/*
+ * bpf_for_each_map_elem() callback over v6_state_map: expire the first
+ * non-static mapping last seen before the cutoff that ctx points to, and queue
+ * its v4 address for reuse. Returns 1 to stop iterating, 0 to continue.
+ */
+static long check_item(struct bpf_map *map, const void *key, void *value, void *ctx)
+{
+	struct v6_addr_state *state = value;
+	__u64 timeout = *((__u64 *)ctx);
+
+	if (state->last_seen < timeout && !state->static_conf) {
+		__u32 v4_addr = state->v4_addr;
+		bpf_map_delete_elem(map, key);
+		bpf_map_delete_elem(&v4_reversemap, &v4_addr);
+		bpf_map_push_elem(&reclaimed_addrs, &v4_addr, 0);
+
+		/* only reclaim one address at a time, so mappings don't expire
+		 * until they absolutely have to
+		 */
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Return a free v4 address (host byte order) from the reclaim queue, expiring
+ * one timed-out mapping first if the queue is empty. Returns 0 if none is free.
+ */
+static __u32 reclaim_v4_addr(void)
+{
+	__u64 timeout = bpf_ktime_get_ns() - config.timeout_ns;
+	__u32 src_v4;
+
+	if (bpf_map_pop_elem(&reclaimed_addrs, &src_v4) == 0)
+		return src_v4;
+
+	bpf_for_each_map_elem(&v6_state_map, check_item, &timeout, 0);
+
+	return bpf_map_pop_elem(&reclaimed_addrs, &src_v4) ? 0 : src_v4;
+}
+
+/*
+ * Assign a v4 address to src_v6 and record the mapping in both directions.
+ * Returns the new v6_state_map entry, or NULL if no address is available or
+ * the maps could not be updated.
+ */
+static struct v6_addr_state *alloc_new_state(struct in6_addr *src_v6)
+{
+	struct v6_addr_state new_v6_state = { .last_seen = bpf_ktime_get_ns() };
+	__u32 max_v4 = (config.v4_prefix | ~config.v4_mask) - 1;
+	__u32 next_addr = config.next_addr;
+	__u32 src_v4 = config.v4_prefix + next_addr;
+
+	if (src_v4 >= max_v4) {
+		/* never-used addresses are exhausted; try to recycle one */
+		src_v4 = reclaim_v4_addr();
+	} else {
+		/* Hand out exactly one address per call.
+		 * NOTE(review): this read-modify-write is not atomic, so two
+		 * CPUs can race for the same address; the loser then fails the
+		 * BPF_NOEXIST insert into v4_reversemap below.
+		 */
+		config.next_addr = next_addr + 1;
+	}
+
+	/* If src_v4 is 0 here, we failed to find an available addr */
+	if (!src_v4)
+		return NULL;
+
+	new_v6_state.v4_addr = src_v4;
+	if (bpf_map_update_elem(&v6_state_map, src_v6, &new_v6_state, BPF_NOEXIST))
+		goto err;
+	if (bpf_map_update_elem(&v4_reversemap, &src_v4, src_v6, BPF_NOEXIST))
+		goto err_v4;
+
+	return bpf_map_lookup_elem(&v6_state_map, src_v6);
+
+err_v4:
+	bpf_map_delete_elem(&v6_state_map, src_v6);
+err:
+	/* failed to insert entry in maps, put the address back in the queue for
+	 * reclaiming
+	 */
+	bpf_map_push_elem(&reclaimed_addrs, &src_v4, 0);
+	return NULL;
+}
+
+/* Compare two v6 addresses 32 bits at a time; returns -1, 0 or 1 */
+static int cmp_v6addr(struct in6_addr *a, struct in6_addr *b)
+{
+	int i;
+	for (i = 0; i < 4; i++) {
+		if (a->s6_addr32[i] < b->s6_addr32[i])
+			return -1;
+		if (a->s6_addr32[i] > b->s6_addr32[i])
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Ingress path: translate an IPv6 packet addressed to the NAT64 prefix into
+ * IPv4, assigning a v4 source address to the v6 sender on first use. Packets
+ * outside the NAT64 prefix pass through untouched; packets inside it are
+ * dropped if they can't be rewritten.
+ *
+ * NOTE(review): only the IP header is rewritten. Transport checksums that
+ * cover a pseudo-header, and ICMPv6 <-> ICMP, are not translated here.
+ */
+static int nat64_handle_v6(struct __sk_buff *skb, struct hdr_cursor *nh)
+{
+	void *data_end = (void *)(unsigned long long)skb->data_end;
+	void *data = (void *)(unsigned long long)skb->data;
+
+	struct v6_lpm_key saddr_key = { .prefixlen = 128 };
+	struct in6_addr *dst_v6, src_v6, subnet_v6 = {};
+	int ip_type, ip_offset;
+	struct ipv6hdr *ip6h;
+	int ret = TC_ACT_OK;
+	__u32 *allowval;
+
+	struct v6_addr_state *v6_state;
+
+	struct iphdr dst_hdr = {
+		.version = 4,
+		.ihl = 5,
+	};
+
+	ip_offset = (nh->pos - data) & 0x1fff;
+
+	ip_type = parse_ip6hdr(nh, data_end, &ip6h);
+	if (ip_type < 0)
+		goto out;
+
+	dst_v6 = &ip6h->daddr;
+	subnet_v6 = *dst_v6;
+	/* v6 pxlen is always 96 */
+	subnet_v6.s6_addr32[3] = 0;
+	if (cmp_v6addr(&subnet_v6, &config.v6_prefix))
+		goto out;
+
+	/* At this point we know the destination IP is within the configured
+	 * subnet, so if we can't rewrite the packet it should be dropped (so as
+	 * not to leak traffic in that subnet).
+	 */
+	ret = TC_ACT_SHOT;
+
+	/* drop packets with IP options - parser skips options */
+	if (ip_type != ip6h->nexthdr)
+		goto out;
+
+	/* LPM trie lookups need the prefix length in front of the address */
+	saddr_key.addr = ip6h->saddr;
+	allowval = bpf_map_lookup_elem(&allowed_v6_src, &saddr_key);
+	if (!allowval)
+		goto out;
+
+	src_v6 = ip6h->saddr;
+	v6_state = bpf_map_lookup_elem(&v6_state_map, &src_v6);
+	if (!v6_state) {
+		v6_state = alloc_new_state(&src_v6);
+		if (!v6_state)
+			goto out;
+	} else {
+		/* the lookup returned a pointer into the map: update in place */
+		v6_state->last_seen = bpf_ktime_get_ns();
+	}
+
+	dst_hdr.daddr = ip6h->daddr.s6_addr32[3];
+	dst_hdr.saddr = bpf_htonl(v6_state->v4_addr);
+	dst_hdr.protocol = ip6h->nexthdr;
+	dst_hdr.ttl = ip6h->hop_limit;
+	/* payload_len is big-endian on the wire: convert before doing arithmetic */
+	dst_hdr.tot_len = bpf_htons(bpf_ntohs(ip6h->payload_len) + sizeof(dst_hdr));
+	/* with from_size 0, bpf_csum_diff() just sums the 'to' buffer */
+	dst_hdr.check = csum_fold_helper(bpf_csum_diff((__be32 *)&dst_hdr, 0,
+						       (__be32 *)&dst_hdr,
+						       sizeof(dst_hdr), 0));
+
+	if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IP), 0))
+		goto out;
+
+	/* If this fails we already mangled the packet, so need to drop it */
+	if (bpf_skb_store_bytes(skb, ip_offset,
+				&dst_hdr, sizeof(dst_hdr),
+				BPF_F_RECOMPUTE_CSUM))
+		goto out;
+
+	ret = TC_ACT_OK;
+out:
+	return ret;
+}
+
+/*
+ * Common entry point: v4 packets are only translated on egress and v6
+ * packets only on ingress; everything else is passed through.
+ */
+static int nat64_handler(struct __sk_buff *skb, bool egress)
+{
+	void *data_end = (void *)(unsigned long long)skb->data_end;
+	void *data = (void *)(unsigned long long)skb->data;
+	struct hdr_cursor nh = { .pos = data };
+	struct ethhdr *eth;
+	int eth_type;
+
+	/* Parse Ethernet and IP/IPv6 headers */
+	eth_type = parse_ethhdr(&nh, data_end, &eth);
+	if (eth_type == bpf_htons(ETH_P_IP) && egress)
+		return nat64_handle_v4(skb, &nh);
+	else if (eth_type == bpf_htons(ETH_P_IPV6) && !egress)
+		return nat64_handle_v6(skb, &nh);
+
+	return TC_ACT_OK;
+}
+
+SEC("classifier")
+int nat64_egress(struct __sk_buff *skb)
+{
+	return nat64_handler(skb, true);
+}
+
+SEC("classifier")
+int nat64_ingress(struct __sk_buff *skb)
+{
+	return nat64_handler(skb, false);
+}