nat64-bpf: Initial version

This adds an initial version of a NAT64 translator in BPF. It compiles and
loads, but doesn't actually appear to work yet.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
This commit is contained in:
Toke Høiland-Jørgensen
2021-09-29 01:46:09 +02:00
parent c7e3acf5d0
commit ede4270320
6 changed files with 642 additions and 0 deletions

2
nat64-bpf/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
nat64

12
nat64-bpf/Makefile Normal file
View File

@@ -0,0 +1,12 @@
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
# Build description for the NAT64 example. The variables below are presumably
# consumed by the shared rules in $(LIB_DIR)/common.mk, which is included at
# the end of this file (not visible here - confirm variable semantics there).

# Userspace loader binary built from nat64.c
USER_TARGETS := nat64
# BPF object built from nat64_kern.c
BPF_TARGETS := nat64_kern
# Object for which a skeleton header is generated; nat64.c includes
# nat64_kern.skel.h
BPF_SKEL_OBJ := nat64_kern.o
#LDLIBS += -pthread
# nat64.h is shared between the userspace loader and the BPF program
EXTRA_DEPS += nat64.h
LIB_DIR = ../lib
include $(LIB_DIR)/common.mk

16
nat64-bpf/README.org Normal file
View File

@@ -0,0 +1,16 @@
* NAT64 BPF implementation
This directory contains a stateless NAT64 translator, like the one
implemented by Tayga, but written entirely in BPF.
Design:
- Global v6 /96 prefix defined as NAT64 prefix
- Each interface is assigned a v4 prefix for mapping v6 addresses
- Install onlink v4 route for that prefix to make sure traffic goes out the interface
- Attach ingress and egress BPF programs to each interface
- On ingress: match v6 packets with a NAT64 prefix destination; remap to v4
- On egress: lookup v4 destination address; if it's in the configured NAT64 prefix, remap back to v6
- Some logic to dynamically assign v4 addresses each time a new v6 source is seen

286
nat64-bpf/nat64.c Normal file
View File

@@ -0,0 +1,286 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <net/if.h>
#include <linux/if_arp.h>
#include <getopt.h>
#include <linux/in6.h>
#include <arpa/inet.h>
#include <linux/bpf.h>
#include <bpf/libbpf.h>
#include <bpf/bpf.h>
#include "nat64.h"
#include "nat64_kern.skel.h"
/* Time unit conversion factors. They are unsigned long long so that
 * expressions such as 7200 * NS_PER_SECOND are evaluated in 64 bits even on
 * platforms where unsigned long is only 32 bits wide.
 */
#define NS_PER_SECOND 1000000000ULL
#define NS_PER_MS 1000000ULL
/* Long command-line options and the short option character each one maps to.
 * The table is terminated by an all-zero entry, as getopt_long() requires.
 */
static const struct option long_options[] = {
	{ .name = "help", .has_arg = no_argument, .flag = NULL, .val = 'h' },
	{ .name = "unload", .has_arg = no_argument, .flag = NULL, .val = 'u' },
	/* Name of interface to run on */
	{ .name = "interface", .has_arg = required_argument, .flag = NULL, .val = 'i' },
	/* v6 prefix to allow as source */
	{ .name = "allowed-src", .has_arg = required_argument, .flag = NULL, .val = 'a' },
	/* v6 prefix to use for nat64 */
	{ .name = "v6-prefix", .has_arg = required_argument, .flag = NULL, .val = '6' },
	/* v4 prefix to use for nat64 */
	{ .name = "v4-prefix", .has_arg = required_argument, .flag = NULL, .val = '4' },
	/* Address mapping timeout interval in s */
	{ .name = "timeout", .has_arg = required_argument, .flag = NULL, .val = 't' },
	{ .name = NULL, .has_arg = 0, .flag = NULL, .val = 0 }
};
/* Configuration gathered from the command line by parse_arguments(). */
struct nat64_user_config {
/* Part that is copied verbatim into the BPF program's global 'config' */
struct nat64_config c;
/* Index of the interface named by -i/--interface (0 means not set) */
int ifindex;
/* Name of that interface, as given on the command line */
char ifname[IF_NAMESIZE+1];
/* Address part of the -a/--allowed-src prefix */
struct in6_addr v6_allow;
/* Length of the allowed-src prefix; 0 means no prefix gets installed */
__u32 v6_allow_pxlen;
/* Set by -u/--unload: tear down the TC hook instead of loading */
bool unload;
};
/* Key used when inserting an IPv6 prefix into the allowed_v6_src map: the
 * generic LPM trie key header (which carries the prefix length) immediately
 * followed by the IPv6 address.
 */
struct v6_trie_key {
struct bpf_lpm_trie_key t;
struct in6_addr addr;
};
/*
 * Parse an IPv6 prefix in "address/length" notation.
 *
 * @str:    NUL-terminated prefix string; it is not modified
 * @v6addr: receives the parsed address part on success
 *
 * Returns the prefix length (0-128) on success, or -EINVAL if the separator
 * is missing, the length is not a plain decimal number in the range 0-128,
 * or the address part is not a valid IPv6 address. An error message is
 * printed to stderr on failure.
 */
static int parse_v6_prefix(char *str, struct in6_addr *v6addr)
{
	char addr[INET6_ADDRSTRLEN];
	char *slash, *end;
	size_t addr_len;
	long pxlen;

	slash = strchr(str, '/');
	if (!slash) {
		fprintf(stderr, "Invalid v6 prefix: %s\n", str);
		return -EINVAL;
	}

	/* strtol() lets us reject empty, non-numeric and trailing-garbage
	 * lengths, which atoi() would silently turn into a number
	 */
	errno = 0;
	pxlen = strtol(slash + 1, &end, 10);
	if (end == slash + 1 || *end != '\0' || errno ||
	    pxlen < 0 || pxlen > 128) {
		fprintf(stderr, "Invalid v6 prefix length: %s\n", slash + 1);
		return -EINVAL;
	}

	/* Copy the address part out so the caller's string stays intact and
	 * can still be printed in full by later error messages
	 */
	addr_len = (size_t)(slash - str);
	if (addr_len >= sizeof(addr)) {
		fprintf(stderr, "Invalid v6 addr: %s\n", str);
		return -EINVAL;
	}
	memcpy(addr, str, addr_len);
	addr[addr_len] = '\0';

	if (inet_pton(AF_INET6, addr, v6addr) != 1) {
		fprintf(stderr, "Invalid v6 addr: %s\n", addr);
		return -EINVAL;
	}

	return (int)pxlen;
}
/* Print a short option summary to stderr. */
static void print_usage(const char *prog)
{
	fprintf(stderr,
		"Usage: %s -i <ifname> -4 <v4 prefix> [options]\n"
		"  -i, --interface <ifname>   Name of interface to run on\n"
		"  -a, --allowed-src <prefix> v6 prefix to allow as source\n"
		"  -6, --v6-prefix <prefix>   v6 /96 prefix to use for nat64\n"
		"  -4, --v4-prefix <prefix>   v4 prefix to use for nat64\n"
		"  -t, --timeout <seconds>    Address mapping timeout interval in s\n"
		"  -u, --unload               Remove the clsact qdisc from the interface\n"
		"  -h, --help                 Show this help text\n",
		prog);
}

/*
 * Parse the command line into @config.
 *
 * Defaults: a mapping timeout of 7200 seconds, the first allocatable address
 * at offset 1 into the v4 prefix, and the well-known NAT64 prefix
 * 64:ff9b::/96. An interface is always required; a v4 prefix is required
 * unless --unload is given.
 *
 * Returns 0 on success and a negative errno value on failure (including when
 * only the help text was requested), after printing a message to stderr.
 */
static int parse_arguments(int argc, char *argv[], struct nat64_user_config *config)
{
	struct in6_addr v6addr;
	struct in_addr v4addr;
	int pxlen, seconds;
	int err, opt;
	char *net;

	config->ifindex = 0;
	config->c.timeout_ns = 7200ULL * NS_PER_SECOND;
	config->c.next_addr = 1;

	/* Default to special prefix 64:ff9b::/96 */
	memset(&config->c.v6_prefix, 0, sizeof(config->c.v6_prefix));
	config->c.v6_prefix.s6_addr[0] = 0x00;
	config->c.v6_prefix.s6_addr[1] = 0x64;
	config->c.v6_prefix.s6_addr[2] = 0xff;
	config->c.v6_prefix.s6_addr[3] = 0x9b;

	while ((opt = getopt_long(argc, argv, "i:6:4:t:a:hu", long_options,
				  NULL)) != -1) {
		switch (opt) {
		case 'i':
			if (strlen(optarg) > IF_NAMESIZE) {
				fprintf(stderr, "interface name too long\n");
				return -EINVAL;
			}
			/* snprintf() always NUL-terminates, unlike strncpy() */
			snprintf(config->ifname, sizeof(config->ifname), "%s", optarg);

			config->ifindex = if_nametoindex(config->ifname);
			if (config->ifindex == 0) {
				err = -errno;
				/* strerror() expects the positive errno value */
				fprintf(stderr,
					"Could not get index of interface %s: %s\n",
					config->ifname, strerror(-err));
				return err;
			}
			break;
		case 'a':
			pxlen = parse_v6_prefix(optarg, &v6addr);
			if (pxlen < 0)
				return pxlen;
			if (pxlen > 128) {
				fprintf(stderr, "v6 pxlen must be at most 128\n");
				return -EINVAL;
			}
			config->v6_allow = v6addr;
			config->v6_allow_pxlen = pxlen;
			break;
		case '6':
			pxlen = parse_v6_prefix(optarg, &v6addr);
			if (pxlen < 0)
				return pxlen;
			if (pxlen != 96) {
				fprintf(stderr, "v6 prefix must have pxlen 96\n");
				return -EINVAL;
			}
			if (v6addr.s6_addr32[3]) {
				fprintf(stderr, "Not a /96 network address: %s\n", optarg);
				return -EINVAL;
			}
			config->c.v6_prefix = v6addr;
			break;
		case '4':
			net = strstr(optarg, "/");
			if (!net) {
				fprintf(stderr, "Invalid v4 prefix: %s\n", optarg);
				return -EINVAL;
			}
			pxlen = atoi(net + 1);
			/* A /31 or /32 leaves no address to hand out once the
			 * first and the last two addresses of the prefix are
			 * excluded, as the allocator does
			 */
			if (pxlen < 1 || pxlen > 30) {
				fprintf(stderr, "v4_pxlen must be between 1 and 30\n");
				return -EINVAL;
			}

			/* Temporarily cut the string at the slash for
			 * inet_pton(), then restore it so error messages show
			 * the full prefix
			 */
			*net = '\0';
			err = inet_pton(AF_INET, optarg, &v4addr);
			*net = '/';
			if (err != 1) {
				fprintf(stderr, "Invalid v4 addr: %s\n", optarg);
				return -EINVAL;
			}

			/* Prefix and mask are kept in host byte order */
			config->c.v4_mask = 0xFFFFFFFFU << (32 - pxlen);
			config->c.v4_prefix = ntohl(v4addr.s_addr);
			if (config->c.v4_prefix & ~config->c.v4_mask) {
				fprintf(stderr, "Not a network address: %s\n", optarg);
				return -EINVAL;
			}
			break;
		case 't':
			seconds = atoi(optarg);
			if (seconds < 1 || seconds > 100000) {
				fprintf(stderr, "Timeout must be in the interval between 1 and 100000 seconds\n");
				return -EINVAL;
			}
			config->c.timeout_ns = (__u64)seconds * NS_PER_SECOND;
			break;
		case 'u':
			config->unload = true;
			break;
		case 'h':
			print_usage(argv[0]);
			return -EINVAL;
		default:
			/* getopt_long() has already reported the offending
			 * option; argv[optind] may be NULL here, so it must
			 * not be printed
			 */
			print_usage(argv[0]);
			return -EINVAL;
		}
	}

	if (config->ifindex == 0) {
		fprintf(stderr,
			"An interface (-i or --interface) must be provided\n");
		return -EINVAL;
	}

	/* Unloading only needs the interface, not an address pool */
	if (!config->unload && !config->c.v4_prefix) {
		fprintf(stderr,
			"A v4 prefix (-4 or --v4-prefix) must be provided\n");
		return -EINVAL;
	}

	return 0;
}
/*
 * Entry point of the NAT64 loader.
 *
 * With --unload, destroys the TC hook on the chosen interface. Otherwise it
 * opens the BPF skeleton, copies the configuration into the program's global
 * data, sizes the address maps to the number of allocatable v4 addresses,
 * loads the object, installs the allowed source prefix (if any) and attaches
 * the ingress and egress programs to the interface.
 *
 * Returns EXIT_SUCCESS on success and EXIT_FAILURE on any error.
 */
int main(int argc, char *argv[])
{
	struct v6_trie_key prefix_key = {};
	struct nat64_user_config cfg = {};
	struct nat64_kern *obj;
	unsigned int num_addr;
	__u32 last_addr;
	char buf[100];
	int err = 0;
	DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, .attach_point = BPF_TC_INGRESS | BPF_TC_EGRESS);
	DECLARE_LIBBPF_OPTS(bpf_tc_opts, attach_egress);
	DECLARE_LIBBPF_OPTS(bpf_tc_opts, attach_ingress);

	err = parse_arguments(argc, argv, &cfg);
	if (err)
		return EXIT_FAILURE;

	hook.ifindex = cfg.ifindex;
	if (cfg.unload) {
		err = bpf_tc_hook_destroy(&hook);
		if (err)
			fprintf(stderr, "Couldn't remove clsact qdisc on %s\n", cfg.ifname);
		return err ? EXIT_FAILURE : EXIT_SUCCESS;
	}

	/* The BPF allocator hands out prefix+1 up to (last address - 2), so
	 * the pool holds (last - prefix - 2) addresses. Guard against the
	 * unsigned subtraction wrapping around for very small prefixes.
	 */
	last_addr = cfg.c.v4_prefix | ~cfg.c.v4_mask;
	if (last_addr - cfg.c.v4_prefix < 3) {
		fprintf(stderr, "v4 prefix is too small to allocate addresses from\n");
		return EXIT_FAILURE;
	}
	num_addr = last_addr - cfg.c.v4_prefix - 2;
	printf("num addr: %u\n", num_addr);

	obj = nat64_kern__open();
	err = libbpf_get_error(obj);
	if (err) {
		libbpf_strerror(err, buf, sizeof(buf));
		fprintf(stderr, "Couldn't open BPF skeleton: %s\n", buf);
		return EXIT_FAILURE;
	}

	/* Configuration and map sizes must be set before the object is loaded */
	obj->bss->config = cfg.c;
	err = bpf_map__resize(obj->maps.v6_state_map, num_addr);
	if (!err)
		err = bpf_map__resize(obj->maps.v4_reversemap, num_addr);
	if (!err)
		err = bpf_map__resize(obj->maps.reclaimed_addrs, num_addr);
	if (err) {
		libbpf_strerror(err, buf, sizeof(buf));
		fprintf(stderr, "Couldn't resize address maps: %s\n", buf);
		goto out;
	}

	err = nat64_kern__load(obj);
	if (err) {
		libbpf_strerror(err, buf, sizeof(buf));
		fprintf(stderr, "Couldn't load BPF skeleton: %s\n", buf);
		goto out;
	}

	if (cfg.v6_allow_pxlen) {
		__u32 value = 0;

		prefix_key.t.prefixlen = cfg.v6_allow_pxlen;
		prefix_key.addr = cfg.v6_allow;
		err = bpf_map_update_elem(bpf_map__fd(obj->maps.allowed_v6_src),
					  &prefix_key, &value, 0);
		if (err) {
			fprintf(stderr, "Couldn't insert allowed prefix\n");
			goto out;
		}
	}

	attach_ingress.prog_fd = bpf_program__fd(obj->progs.nat64_ingress);
	if (attach_ingress.prog_fd < 0) {
		fprintf(stderr, "Couldn't find ingress program\n");
		err = -ENOENT;
		goto out;
	}

	attach_egress.prog_fd = bpf_program__fd(obj->progs.nat64_egress);
	if (attach_egress.prog_fd < 0) {
		fprintf(stderr, "Couldn't find egress program\n");
		err = -ENOENT;
		goto out;
	}

	/* An already existing hook on the interface is fine */
	err = bpf_tc_hook_create(&hook);
	if (err && err != -EEXIST) {
		fprintf(stderr, "Couldn't create ingress hook for ifindex %d\n", cfg.ifindex);
		goto out;
	}

	hook.attach_point = BPF_TC_INGRESS;
	err = bpf_tc_attach(&hook, &attach_ingress);
	if (err) {
		fprintf(stderr, "Couldn't attach ingress program to ifindex %d\n",
			hook.ifindex);
		goto out;
	}

	hook.attach_point = BPF_TC_EGRESS;
	err = bpf_tc_attach(&hook, &attach_egress);
	if (err) {
		fprintf(stderr, "Couldn't attach egress program to ifindex %d\n",
			hook.ifindex);
		goto out;
	}

out:
	nat64_kern__destroy(obj);
	/* Process exit codes are small non-negative numbers, so do not leak a
	 * negative errno value to the shell
	 */
	return err ? EXIT_FAILURE : EXIT_SUCCESS;
}

20
nat64-bpf/nat64.h Normal file
View File

@@ -0,0 +1,20 @@
#ifndef __NAT64_H__
#define __NAT64_H__
#include <linux/in6.h>
/* Translator configuration, filled in by the userspace loader and read by
 * the BPF program through its global 'config' variable.
 */
struct nat64_config {
/* NAT64 /96 prefix; the last 32 bits are zero and get replaced by the IPv4
 * address when translating
 */
struct in6_addr v6_prefix;
/* Idle time in nanoseconds after which a dynamic mapping may be reclaimed */
__u64 timeout_ns;
/* Network address of the IPv4 pool, in host byte order */
__u32 v4_prefix;
/* Netmask of the IPv4 pool, in host byte order */
__u32 v4_mask;
/* Offset from v4_prefix of the next never-used address to hand out */
__u32 next_addr;
};
/* Per-IPv6-source mapping state stored as the value of v6_state_map. */
struct v6_addr_state {
/* bpf_ktime_get_ns() timestamp of the last packet seen from this source */
__u64 last_seen;
/* IPv4 address assigned to this source, in host byte order */
__u32 v4_addr;
/* Non-zero entries are never reclaimed when they time out */
__u32 static_conf;
};
#endif

306
nat64-bpf/nat64_kern.c Normal file
View File

@@ -0,0 +1,306 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2021 Toke Høiland-Jørgensen <toke@toke.dk> */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <linux/pkt_sched.h>
#include <linux/pkt_cls.h>
#include <stdbool.h>
#include "../include/xdp/parsing_helpers.h"
#include "nat64.h"
/* Runtime configuration. The userspace loader fills it in through the
 * skeleton's .bss section before loading the object; next_addr is also
 * advanced by the BPF program itself as addresses are handed out.
 */
struct nat64_config config;
/* IPv6 source address -> mapping state (assigned v4 address and last-seen
 * time). max_entries is only a placeholder; the loader resizes the map to
 * the number of allocatable v4 addresses before load.
 */
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, struct in6_addr);
__type(value, struct v6_addr_state);
__uint(max_entries, 1);
__uint(map_flags, BPF_F_NO_PREALLOC);
} v6_state_map SEC(".maps");
/* Assigned IPv4 address (host byte order) -> IPv6 address it was assigned
 * to; used to translate return traffic. max_entries is a placeholder that
 * the loader resizes before load.
 */
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, __u32);
__type(value, struct in6_addr);
__uint(max_entries, 1);
__uint(map_flags, BPF_F_NO_PREALLOC);
} v4_reversemap SEC(".maps");
/* IPv6 source prefixes that are allowed to use the translator. An LPM trie
 * key consists of the struct bpf_lpm_trie_key header (holding the prefix
 * length) followed by the address bytes, so key_size has to cover both; this
 * matches the struct v6_trie_key the userspace loader inserts.
 */
struct {
	__uint(type, BPF_MAP_TYPE_LPM_TRIE);
	__uint(key_size, sizeof(struct bpf_lpm_trie_key) + sizeof(struct in6_addr));
	__uint(value_size, sizeof(__u32));
	__uint(max_entries, 1);
	__uint(map_flags, BPF_F_NO_PREALLOC);
} allowed_v6_src SEC(".maps");
/* FIFO of IPv4 addresses (host byte order) that were freed from expired or
 * failed mappings and can be handed out again. max_entries is a placeholder
 * that the loader resizes before load.
 */
struct {
__uint(type, BPF_MAP_TYPE_QUEUE);
__uint(key_size, 0);
__uint(value_size, sizeof(__u32));
__uint(max_entries, 1);
} reclaimed_addrs SEC(".maps");
static int nat64_handle_v4(struct __sk_buff *skb, struct hdr_cursor *nh)
{
void *data_end = (void *)(unsigned long long)skb->data_end;
void *data = (void *)(unsigned long long)skb->data;
int ip_type, iphdr_len, ip_offset;
struct in6_addr *dst_v6;
int ret = TC_ACT_OK;
struct iphdr *iph;
__u32 dst_v4;
struct ipv6hdr dst_hdr = {
.version = 6,
.saddr = config.v6_prefix,
};
ip_offset = (nh->pos - data) & 0x1fff;
ip_type = parse_iphdr(nh, data_end, &iph);
if (ip_type < 0)
goto out;
dst_v4 = bpf_ntohl(iph->daddr);
if ((dst_v4 & config.v4_mask) != config.v4_prefix)
goto out;
/* At this point we know the destination IP is within the configured
* subnet, so if we can't rewrite the packet it should be dropped (so as
* not to leak traffic in that subnet).
*/
ret = TC_ACT_SHOT;
iphdr_len = iph->ihl * 4;
/* drop packets with IP options */
if (iphdr_len != sizeof(struct iphdr))
goto out;
dst_v6 = bpf_map_lookup_elem(&v4_reversemap, &dst_v4);
if (!dst_v6)
goto out;
// src v4 as last octet of nat64 address
dst_hdr.saddr.s6_addr32[3] = iph->saddr;
dst_hdr.nexthdr = iph->protocol;
dst_hdr.hop_limit = iph->ttl;
dst_hdr.payload_len = iph->tot_len - iphdr_len;
__builtin_memcpy(&dst_hdr.daddr, dst_v6, sizeof(struct in6_addr));
if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0))
goto out;
/* If this fails we already mangled the packet, so need to drop it */
if (bpf_skb_store_bytes(skb, ip_offset,
&dst_hdr, sizeof(dst_hdr),
BPF_F_RECOMPUTE_CSUM))
goto out;
ret = TC_ACT_OK;
out:
return ret;
}
/*
 * bpf_for_each_map_elem() callback for v6_state_map.
 *
 * @ctx points to a __u64 cutoff timestamp. The first dynamic (non-static)
 * entry last seen before the cutoff is removed from both mapping tables and
 * its v4 address is queued for reuse.
 *
 * Returns 1 to stop the iteration after reclaiming an entry, 0 to continue.
 */
static long check_item(struct bpf_map *map, const void *key, void *value, void *ctx)
{
	__u64 cutoff = *((__u64 *)ctx);
	struct v6_addr_state *entry = value;
	__u32 freed_v4;

	/* keep static entries and entries that are still fresh */
	if (entry->last_seen >= cutoff || entry->static_conf)
		return 0;

	/* copy the address out before the entry goes away */
	freed_v4 = entry->v4_addr;

	bpf_map_delete_elem(map, key);
	bpf_map_delete_elem(&v4_reversemap, &freed_v4);
	bpf_map_push_elem(&reclaimed_addrs, &freed_v4, 0);

	/* only reclaim one address at a time, so mappings don't expire
	 * until they absolutely have to
	 */
	return 1;
}
/*
 * Obtain a previously used IPv4 address for a new mapping.
 *
 * Takes an address from the reclaimed_addrs queue if one is waiting there;
 * otherwise scans v6_state_map for one expired mapping to evict and retries.
 *
 * Returns the address in host byte order, or 0 if none is available.
 */
static __u32 reclaim_v4_addr(void)
{
	__u64 now = bpf_ktime_get_ns();
	__u64 cutoff;
	__u32 src_v4;

	if (bpf_map_pop_elem(&reclaimed_addrs, &src_v4) == 0)
		return src_v4;

	/* The clock counts from boot, so it can be smaller than the timeout.
	 * Clamp at 0 instead of letting the subtraction wrap around, which
	 * would make every mapping look expired.
	 */
	cutoff = now > config.timeout_ns ? now - config.timeout_ns : 0;

	bpf_for_each_map_elem(&v6_state_map, check_item, &cutoff, 0);

	return bpf_map_pop_elem(&reclaimed_addrs, &src_v4) ? 0 : src_v4;
}
/*
 * Create a mapping for a new IPv6 source address.
 *
 * Assigns the next never-used address of the v4 pool, or a reclaimed one
 * once the pool is exhausted, and records the mapping in both v6_state_map
 * and v4_reversemap.
 *
 * Returns a pointer to the new state inside v6_state_map, or NULL if no
 * address is available or the maps could not be updated.
 */
static struct v6_addr_state *alloc_new_state(struct in6_addr *src_v6)
{
	struct v6_addr_state new_v6_state = { .last_seen = bpf_ktime_get_ns() };
	__u32 max_v4 = (config.v4_prefix | ~config.v4_mask) - 1;
	__u32 next_addr, next_v4;
	__u32 src_v4 = 0;

	/* Take exactly one address per new mapping.
	 * NOTE(review): this read-modify-write of config.next_addr is not
	 * atomic; presumably it should become a compare-and-swap with a retry
	 * loop once that can be used here - confirm.
	 */
	next_addr = config.next_addr;
	next_v4 = config.v4_prefix + next_addr;
	if (next_v4 >= max_v4) {
		/* pool exhausted, fall back to reusing an old address */
		src_v4 = reclaim_v4_addr();
	} else {
		config.next_addr = next_addr + 1;
		src_v4 = next_v4;
	}

	/* If src_v4 is 0 here, we failed to find an available addr */
	if (!src_v4)
		return NULL;

	new_v6_state.v4_addr = src_v4;
	if (bpf_map_update_elem(&v6_state_map, src_v6, &new_v6_state, BPF_NOEXIST))
		goto err;
	if (bpf_map_update_elem(&v4_reversemap, &src_v4, src_v6, BPF_NOEXIST))
		goto err_v4;

	return bpf_map_lookup_elem(&v6_state_map, src_v6);

err_v4:
	bpf_map_delete_elem(&v6_state_map, src_v6);
err:
	/* failed to insert entry in maps, put the address back in the queue for
	 * reclaiming
	 */
	bpf_map_push_elem(&reclaimed_addrs, &src_v4, 0);
	return NULL;
}
/*
 * Compare two IPv6 addresses one 32-bit word at a time, starting with the
 * first word.
 *
 * Returns 0 if the addresses are equal, -1 if the first differing word of @a
 * is numerically smaller than that of @b, and 1 if it is larger. The words
 * are compared as stored, without byte-order conversion.
 */
static int cmp_v6addr(struct in6_addr *a, struct in6_addr *b)
{
	int word = 0;

	while (word < 4) {
		__u32 lhs = a->s6_addr32[word];
		__u32 rhs = b->s6_addr32[word];

		if (lhs != rhs)
			return lhs < rhs ? -1 : 1;
		word++;
	}

	return 0;
}
static int nat64_handle_v6(struct __sk_buff *skb, struct hdr_cursor *nh)
{
void *data_end = (void *)(unsigned long long)skb->data_end;
void *data = (void *)(unsigned long long)skb->data;
struct in6_addr *dst_v6, src_v6, subnet_v6 = {};
int ip_type, ip_offset;
struct ipv6hdr *ip6h;
int ret = TC_ACT_OK;
__u32 *allowval;
struct v6_addr_state *v6_state;
struct iphdr dst_hdr = {
.version = 4,
};
ip_offset = (nh->pos - data) & 0x1fff;
ip_type = parse_ip6hdr(nh, data_end, &ip6h);
if (ip_type < 0)
goto out;
dst_v6 = &ip6h->daddr;
subnet_v6 = *dst_v6;
/* v6 pxlen is always 96 */
subnet_v6.s6_addr32[3] = 0;
if (cmp_v6addr(&subnet_v6, &config.v6_prefix))
goto out;
/* At this point we know the destination IP is within the configured
* subnet, so if we can't rewrite the packet it should be dropped (so as
* not to leak traffic in that subnet).
*/
ret = TC_ACT_SHOT;
/* drop packets with IP options - parser skips options */
if (ip_type != ip6h->nexthdr)
goto out;
allowval = bpf_map_lookup_elem(&allowed_v6_src, &ip6h->saddr);
if (!allowval)
goto out;
src_v6 = ip6h->saddr;
v6_state = bpf_map_lookup_elem(&v6_state_map, &src_v6);
if (!v6_state) {
v6_state = alloc_new_state(&src_v6);
if (!v6_state)
goto out;
} else {
v6_state->last_seen = bpf_ktime_get_ns();
bpf_map_update_elem(&v6_state_map, &src_v6, v6_state, BPF_EXIST);
}
dst_hdr.daddr = ip6h->daddr.s6_addr32[3];
dst_hdr.saddr = bpf_htonl(v6_state->v4_addr);
dst_hdr.protocol = ip6h->nexthdr;
dst_hdr.ttl = ip6h->hop_limit;
dst_hdr.tot_len = ip6h->payload_len + sizeof(struct iphdr);
if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IP), 0))
goto out;
/* If this fails we already mangled the packet, so need to drop it */
if (bpf_skb_store_bytes(skb, ip_offset,
&dst_hdr, sizeof(dst_hdr),
BPF_F_RECOMPUTE_CSUM))
goto out;
ret = TC_ACT_OK;
out:
return ret;
}
/*
 * Common entry point for both TC programs.
 *
 * Parses the Ethernet header and dispatches IPv4 packets seen on egress to
 * nat64_handle_v4() and IPv6 packets seen on ingress to nat64_handle_v6().
 * Every other packet is passed on unchanged with TC_ACT_OK.
 */
static int nat64_handler(struct __sk_buff *skb, bool egress)
{
	void *pkt_end = (void *)(unsigned long long)skb->data_end;
	void *pkt_start = (void *)(unsigned long long)skb->data;
	struct hdr_cursor cursor = { .pos = pkt_start };
	struct ethhdr *eth;
	int proto;

	/* Parse the Ethernet header; the IP/IPv6 header is parsed by the
	 * per-family handlers
	 */
	proto = parse_ethhdr(&cursor, pkt_end, &eth);

	if (egress) {
		if (proto == bpf_htons(ETH_P_IP))
			return nat64_handle_v4(skb, &cursor);
	} else if (proto == bpf_htons(ETH_P_IPV6)) {
		return nat64_handle_v6(skb, &cursor);
	}

	return TC_ACT_OK;
}
/* TC classifier program that the loader attaches at the egress hook; runs
 * the shared handler in egress mode, where only IPv4 packets are considered.
 */
SEC("classifier")
int nat64_egress(struct __sk_buff *skb)
{
return nat64_handler(skb, true);
}
/* TC classifier program that the loader attaches at the ingress hook; runs
 * the shared handler in ingress mode, where only IPv6 packets are considered.
 */
SEC("classifier")
int nat64_ingress(struct __sk_buff *skb)
{
return nat64_handler(skb, false);
}